昨天,我们把建立决策树所需的终止条件都设定完了,今天我打算写建立决策树的后半部分:
有了这些条件后,就可以开始建立整棵决策树——做法是利用递回(递归),把切分后的左、右资料再次放入同一个函数。
# 建立决策树
def create_Tree(data):
    """Recursively build a decision tree as nested dictionaries.

    This is the list-based version of the builder. It relies on the helper
    functions ``no_data_check``, ``one_data_check``, ``one_feature_check``,
    ``Best_Feature`` and ``split_Data``, which must be defined before this
    function is called.

    Args:
        data: The samples to split.

    Returns:
        ``[None]`` when ``data`` is empty, a single class value when one of
        the stopping checks succeeds, otherwise a dict of the form
        ``{feature: {'<value': subtree, '>value': subtree}}``.
    """
    # Stopping conditions: no samples left, only one class left, or no
    # feature left to split on.
    if no_data_check(data):
        return [None]
    same_class = one_data_check(data)
    if same_class:
        return same_class[0]
    majority = one_feature_check(data)
    if majority:
        return majority[0]

    # Choose the best split and partition the samples around it.
    best_feature_col, best_split_value = Best_Feature(data)
    (new_data, leftData, leftDataindex, rightData, rightDataindex,
     best_feature_col) = split_Data(data, best_feature_col, best_split_value)

    # Build this node, then recurse into both partitions (left first).
    below = '<' + str(best_split_value)
    above = '>' + str(best_split_value)
    myTree = {best_feature_col: {below: {}, above: {}}}
    myTree[best_feature_col][below] = create_Tree(leftData)
    myTree[best_feature_col][above] = create_Tree(rightData)
    return myTree
# Build the tree and print the nested-dict result.
# NOTE(review): `data` is not defined in this snippet; it is assumed to be
# the sample data prepared earlier.
print(create_Tree(data))
这样子就会得到结果:
{2: {'<0.4495': {1: {'<2.51': 2, '>2.51': {1: {'<9.0': 1, '>9.0': 2}}}}, '>0.4495': 1}}
但这样的结果会有问题:列表在删除某一栏的元素后,后面栏位的位置会跟着往前移,所以树里记录的栏位编号就不再对应原始资料的栏位了。
因为列表没有栏位名称(我觉得这就是列表的极限),所以我现在要把资料放进 pandas.DataFrame 里,并为栏位命名:
# Wrap the five sample rows in a DataFrame so that every column gets a
# label (0-4 by default), then display it.
data=pd.DataFrame([[1,1.01,0.852,5,1.5],[2,2.01,0.31,8,8.1],[1,3.01,0.589,9,5.6],[1,3.01,0.01,8,2.3],[2,4.01,0.258,10,1.1]])
print(data)
执行後就会长这样(这样就有栏位名)
0 1 2 3 4
0 1 1.01 0.852 5 1.5
1 2 2.01 0.310 8 8.1
2 1 3.01 0.589 9 5.6
3 1 3.01 0.010 8 2.3
4 2 4.01 0.258 10 1.1
当然其他函数要进行改写:(像是基尼函数,split_data..)等,以下是整个修改完的程序:
import random as rd
import numpy as np
import pandas as pd
# Sample data set: 5 rows with 5 columns each.
# Column 0 is skipped by Best_Feature when it searches for a split, so it
# is treated as the class label rather than as a feature.
data=pd.DataFrame([[1,1.01,0.852,5,1.5],[2,2.01,0.31,8,8.1],[1,3.01,0.589,9,5.6],[1,3.01,0.01,8,2.3],[2,4.01,0.258,10,1.1]])
print(data)
#划分方式
def split_Data_Set(data, index, value):
    """Partition the values stored under ``data[index]`` around a threshold.

    Args:
        data: A mapping-like container (for example a DataFrame) whose
            ``data[index]`` entry is iterable.
        index: Key of the column to partition.
        value: Threshold to compare every item against.

    Returns:
        A ``(low, high)`` tuple of lists: ``low`` holds the items that are
        ``<= value`` and ``high`` holds all remaining items, both in their
        original order.
    """
    low, high = [], []
    for item in data[index]:
        # Send each item to the bucket selected by the threshold test.
        bucket = low if item <= value else high
        bucket.append(item)
    return low, high
def Best_Feature(data):
    """Search every feature column for the split with the lowest Gini value.

    Column ``0`` is skipped. For every other column the candidate split
    points are the midpoints between consecutive distinct values, rounded
    to 5 decimals. Each candidate is scored with ``Gini_cofe`` (defined
    elsewhere) on the two value lists produced by ``split_Data_Set``.

    Args:
        data: DataFrame holding the samples.

    Returns:
        ``(best_feature_col, best_split_value)``. When no candidate scores
        below the initial value of 1, the result is ``(-1, None)``.
    """
    # 1 is the starting (worst) score; -1 / None mean "nothing found yet".
    lowest_gini = 1
    chosen_col = -1
    chosen_value = None

    for col in data.columns:
        if col == 0:
            continue
        distinct = sorted(set(data[col]))
        # Midpoints between neighbouring distinct values.
        # NOTE(review): a column with a single distinct value produces no
        # candidate split at all - confirm whether that value itself was
        # meant to be tried as a split point.
        candidates = [
            np.round((lower + upper) / 2, 5)
            for lower, upper in zip(distinct, distinct[1:])
        ]
        for candidate in candidates:
            part_low, part_high = split_Data_Set(data, col, candidate)
            score = Gini_cofe(part_low, part_high)
            # A smaller Gini value means a better split.
            if score < lowest_gini:
                lowest_gini = score
                chosen_col = col
                chosen_value = candidate
    return chosen_col, chosen_value
# Find the best split for the full sample data and report it.
# These module-level names are reused by the split_Data demo further down.
best_feature_col, best_split_value=Best_Feature(data)
print("最佳分割特徵为: 第",best_feature_col,"特徵")
print("最佳分割特徵数值为:",best_split_value)
import copy
def split_Data(data, best_feature_col, best_split_value):
    """Drop the chosen feature column and split the rows around a threshold.

    Rows are addressed by position, so the function also works when
    ``data`` does not carry the default ``0..n-1`` index (for example when
    it is a slice of a larger frame).

    Args:
        data: DataFrame holding the samples.
        best_feature_col: Label of the column to split on; it is removed
            from all returned frames.
        best_split_value: Threshold; rows whose feature value is
            ``<= best_split_value`` go left, all others go right.

    Returns:
        A tuple ``(new_data, leftData, leftDataindex, rightData,
        rightDataindex, best_feature_col)`` where ``new_data`` is ``data``
        without the feature column, ``leftData`` / ``rightData`` are new
        DataFrames (with a fresh default index) holding the two partitions,
        and ``leftDataindex`` / ``rightDataindex`` list the index labels of
        those rows in ``data``.
    """
    # drop() already returns a new frame, so ``data`` itself is untouched.
    new_data = data.drop(columns=best_feature_col)
    feature_values = data[best_feature_col].tolist()

    leftData, rightData = [], []
    leftDataindex, rightDataindex = [], []
    for position, (label, feature_value) in enumerate(
            zip(data.index, feature_values)):
        # Each row is rebuilt from its list of values; a row that mixes
        # ints and floats therefore comes back as floats.
        row = new_data.iloc[position].tolist()
        if feature_value <= best_split_value:
            leftData.append(row)
            leftDataindex.append(label)
        else:
            rightData.append(row)
            rightDataindex.append(label)

    leftData = pd.DataFrame(leftData, columns=new_data.columns)
    rightData = pd.DataFrame(rightData, columns=new_data.columns)
    return (new_data, leftData, leftDataindex, rightData, rightDataindex,
            best_feature_col)
# Split the sample data on the best feature found above and print every
# piece of the result: the reduced data, both partitions, the original
# row labels of each partition, and the feature column that was removed.
new_data,leftData,leftDataindex,rightData,rightDataindex, best_feature_col=split_Data(data,best_feature_col,best_split_value)
print("去除特徵後资料:",new_data)
print("去除特徵後左资料:",leftData)
print("左资料在原始资料序列:",leftDataindex)
print("去除特徵後右资料:",rightData)
print("右资料在原始资料序列:",rightDataindex)
print("被去除特徵点:第",best_feature_col,"位")
#确认是否为同一类别
def one_data_check(data):
    """Check whether every sample carries the same class label.

    The class label is column ``0`` of a DataFrame, or the first element
    of each row when ``data`` is a list of rows. Labels are read by
    position, so a DataFrame with a non-default index is handled as well.

    Args:
        data: DataFrame with the class in column ``0``, or a list of rows.

    Returns:
        ``[label]`` (a one-element list, so that a label of ``0`` is still
        truthy) when all samples share one label, otherwise ``False``.
        Empty input also yields ``False``.
    """
    if len(data) == 0:
        return False
    if isinstance(data, pd.DataFrame):
        labels = data[0].tolist()
    else:
        labels = [row[0] for row in data]

    # The first label is the reference every other label must equal.
    reference = labels[0]
    if all(label == reference for label in labels):
        return [reference]
    return False
#确认是否为空
def no_data_check(data):
    """Report whether ``data`` holds no samples.

    ``len`` is used rather than truthiness so that containers without a
    plain boolean value (such as a DataFrame) are supported.

    Returns:
        ``True`` when ``len(data)`` is 0, otherwise ``False``.
    """
    return len(data) == 0
#类别都分类完
import random as rd
def one_feature_check(data):
    """Pick the majority class once only the class column is left.

    Works for both shapes used in this file: a list of rows (each row a
    list) and a DataFrame. The check succeeds when every sample has
    exactly one column, i.e. there is no feature left to split on.

    Args:
        data: List of rows or DataFrame.

    Returns:
        A one-element list holding the most frequent class; ties are
        broken at random. ``False`` when more than one column remains or
        when ``data`` is empty.
    """
    if len(data) == 0:
        return False

    if isinstance(data, pd.DataFrame):
        # len(data[0]) would be the number of rows here, so the column
        # count is checked through the frame's shape instead.
        if data.shape[1] != 1:
            return False
        labels = data.iloc[:, 0].tolist()
    else:
        if len(data[0]) != 1:
            return False
        labels = [row[0] for row in data]

    # Count how often each class occurs.
    count_use = {}
    for label in labels:
        count_use[label] = count_use.get(label, 0) + 1

    # Collect every class that reaches the highest count, then draw one.
    highest = max(count_use.values())
    the_same = [label for label, total in count_use.items()
                if total == highest]
    return rd.sample(the_same, 1)
# Demo: both classes occur twice, so the printed one-element list may
# contain either 0 or 1.
print(one_feature_check([[0],[0],[1],[1]]))
# 建立决策树
def create_Tree(data):
    """Recursively build a decision tree from a DataFrame.

    Column ``0`` of ``data`` is the class label; the remaining columns
    are features.

    Args:
        data: DataFrame holding the samples.

    Returns:
        ``[None]`` when ``data`` is empty, a single class value when a
        stopping condition is met, otherwise a nested dict of the form
        ``{feature_col: {'<value': subtree, '>value': subtree}}``.
    """
    # Stopping conditions. Each helper is evaluated once: the result is
    # needed twice, and one_feature_check draws at random on ties.
    if no_data_check(data):
        return [None]
    same_class = one_data_check(data)
    if same_class:
        return same_class[0]
    majority = one_feature_check(data)
    if majority:
        return majority[0]

    # Choose the best split point.
    best_feature_col, best_split_value = Best_Feature(data)
    if best_split_value is None:
        # Best_Feature found nothing to split on (it returns (-1, None)),
        # so splitting would fail; fall back to the most frequent class.
        return data[0].value_counts().idxmax()

    (new_data, leftData, leftDataindex, rightData, rightDataindex,
     best_feature_col) = split_Data(data, best_feature_col, best_split_value)

    # Build this node, then recurse into both partitions (left first).
    below = '<' + str(best_split_value)
    above = '>' + str(best_split_value)
    myTree = {best_feature_col: {below: {}, above: {}}}
    myTree[best_feature_col][below] = create_Tree(leftData)
    myTree[best_feature_col][above] = create_Tree(rightData)
    return myTree
# Build the tree from the sample DataFrame and print the nested-dict result.
print(create_Tree(data))
结果:
{3: {'<8.5': {1: {'<1.51': 1.0, '>1.51': {2: {'<0.16': 1.0, '>0.16': 2.0}}}}, '>8.5': {1: {'<3.51': 1.0, '>3.51': 2.0}}}}
好,今天实作部分就到这,明天就把资料去做带入动作
男孩沿着森林小径,朝着歌声方向前进,一路上除了歌声之外,森林原本的声音都不见了,没有鸟叫的声音,也没有树枝摩擦的声音,但男孩忽略了这种异常,继续朝着歌声前进,走了一段时间後,他看见一栋小木屋,歌声似乎是从里面传了出来,男孩想从窗外窥视里面,但里面被窗帘遮得死死的,於是男孩走到门前,敲了敲门,并说:有人在吗?
--|我看着你,你却看不到我|-- MS.CM
<<: [Day21] Flutter - Presentation AutoRouter(part5)
Stack 组件用於沿垂直或水平轴的布局 也是RWD应用的选项之一 复杂度跟所选参数都可以轻易使用 ...
中台作为一种生态系统层级的架构,倚赖业界主流的技术系统,包含开源技术平台与框架: 业务中台:微服务─...
今天来练习下面这个版面~ 运用到的观念: 使用float排版 :first-child ~选取器 相...
今天处里剩下的部分:checker 函式和它注入页面的辅助函式。 checker checker 函...
在上一章中介绍了 attribute directive 的用法,接着要来介绍另一种 Angular...