DAY07随机森林演算法(续5)

昨天,我们把建立决策树的终止条件设定完了,今天我打算写建立决策树的後半部分:
有了这些条件後,就可以开始建立整棵决策树-->利用递回的方式,把分割後的资料再放入同一个函数

# Build the decision tree
def create_Tree(data):
    """Recursively build a decision tree as nested dictionaries.

    The stop-condition helpers (``no_data_check``, ``one_data_check``,
    ``one_feature_check``) and the split helpers (``Best_Feature``,
    ``split_Data``) are the ones defined in the earlier parts of this series.

    Args:
        data: the samples to split.
            NOTE(review): presumably a list of rows with the class label
            first -- confirm against the helper definitions.

    Returns:
        ``[None]`` for empty data, a single class value for a leaf, or a
        dict of the form
        ``{feature: {'<' + str(value): subtree, '>' + str(value): subtree}}``.
    """
    # Stop conditions: evaluate each check once and reuse its result.
    if no_data_check(data):
        return [None]
    same_class = one_data_check(data)
    if same_class:
        return same_class[0]
    majority = one_feature_check(data)
    if majority:
        return majority[0]
    # Find the best split point.
    best_feature_col, best_split_value = Best_Feature(data)
    (new_data, leftData, leftDataindex, rightData, rightDataindex,
     best_feature_col) = split_Data(data, best_feature_col, best_split_value)
    # Build the node, then recurse into both halves. create_Tree takes a
    # single argument, so only the partitioned data is passed down.
    left_key = '<' + str(best_split_value)
    right_key = '>' + str(best_split_value)
    myTree = {best_feature_col: {left_key: {}, right_key: {}}}
    myTree[best_feature_col][left_key] = create_Tree(leftData)
    myTree[best_feature_col][right_key] = create_Tree(rightData)
    return myTree
# Build the tree from `data` and print the resulting nested structure.
print(create_Tree(data))

这样子就会得到结果:

{2: {'<0.4495': {1: {'<2.51': 2, '>2.51': {1: {'<9.0': 1, '>9.0': 2}}}}, '>0.4495': 1}}

但这样的结果会有问题:列表在删除元素後,後面的元素会往前递补,原本的栏位编号就跟着改变了。
因为列表没有栏位名(我觉得这就是列表的极限),所以我现在要把资料放进 pandas.DataFrame 里,让每个栏位都有固定的栏位名称:

# Load the five sample rows into a DataFrame and show it.
data=pd.DataFrame([[1,1.01,0.852,5,1.5],[2,2.01,0.31,8,8.1],[1,3.01,0.589,9,5.6],[1,3.01,0.01,8,2.3],[2,4.01,0.258,10,1.1]])
print(data)

执行後就会长这样(这样就有栏位名)

   0     1      2   3    4
0  1  1.01  0.852   5  1.5
1  2  2.01  0.310   8  8.1
2  1  3.01  0.589   9  5.6
3  1  3.01  0.010   8  2.3
4  2  4.01  0.258  10  1.1

当然,其他函数(像是基尼函数、split_Data 等)也要跟着改写,以下是整个修改完的程序:

import random as rd
import numpy as np
import pandas as pd
# A 5-dimensional dataset with 5 records.
# NOTE(review): column 0 looks like the class label (Best_Feature skips it
# and one_data_check compares its values) -- confirm.
data=pd.DataFrame([[1,1.01,0.852,5,1.5],[2,2.01,0.31,8,8.1],[1,3.01,0.589,9,5.6],[1,3.01,0.01,8,2.3],[2,4.01,0.258,10,1.1]])
print(data)
# How one column is partitioned around a threshold
def split_Data_Set(data, index, value):
    """Split the values stored under ``data[index]`` around ``value``.

    Args:
        data: container indexed by ``index`` (a DataFrame column lookup in
            this file).
        index: key of the column whose values are partitioned.
        value: threshold to compare each value against.

    Returns:
        ``(data1, data2)``: the values that are ``<= value`` and all the
        remaining values, each in their original order.
    """
    column_values = data[index]
    at_or_below = [item for item in column_values if item <= value]
    # "not <=" rather than ">" so values that fail the comparison (such as
    # NaN) still land in the second list, exactly like an else branch.
    the_rest = [item for item in column_values if not item <= value]
    return at_or_below, the_rest
def Best_Feature(data):
    """Search the feature columns for the split with the lowest Gini score.

    Column ``0`` is skipped (presumably the class label -- confirm). For
    every other column the candidate thresholds are the midpoints, rounded
    to 5 decimals, between consecutive distinct sorted values, so a column
    with a single distinct value contributes no candidates.

    Args:
        data: DataFrame-like object exposing ``columns`` and column lookup.

    Returns:
        ``(best_feature_col, best_split_value)``. Stays at ``(-1, None)``
        when no candidate scored below the starting value of 1.
    """
    # 1 is the starting score every candidate has to beat.
    lowest_gini = 1
    # Column positions start at 0, so -1 marks "nothing chosen yet".
    chosen_col = -1
    # Thresholds may be positive or negative, so None is the placeholder.
    chosen_value = None

    for column in data.columns:
        if column == 0:
            continue
        distinct_sorted = sorted(set(data[column]))
        midpoints = [
            np.round((lower + upper) / 2, 5)
            for lower, upper in zip(distinct_sorted, distinct_sorted[1:])
        ]
        # Try every candidate threshold of this column.
        for threshold in midpoints:
            below, above = split_Data_Set(data, column, threshold)
            # Gini_cofe is the scoring function from the earlier days.
            score = Gini_cofe(below, above)
            # A smaller Gini coefficient means a better split.
            if score < lowest_gini:
                lowest_gini, chosen_col, chosen_value = score, column, threshold

    return chosen_col, chosen_value
# Run the search on the sample data and report the chosen column and threshold.
best_feature_col, best_split_value=Best_Feature(data)    
print("最佳分割特徵为: 第",best_feature_col,"特徵")
print("最佳分割特徵数值为:",best_split_value) 
import copy
def split_Data(data, best_feature_col, best_split_value):
    """Drop the split feature and partition the rows around the threshold.

    Rows whose value in ``best_feature_col`` is ``<= best_split_value`` go
    to the left partition; every other row goes to the right one.

    Args:
        data: pandas DataFrame holding the samples. It is not modified.
        best_feature_col: label of the column to split on and then remove.
        best_split_value: threshold compared against that column.

    Returns:
        A 6-tuple ``(new_data, leftData, leftDataindex, rightData,
        rightDataindex, best_feature_col)``: ``new_data`` is ``data``
        without the split column, ``leftData``/``rightData`` are new
        DataFrames (fresh 0..n-1 index) with the remaining columns, and the
        ``*index`` lists hold the index labels of ``data`` for each side.

    Raises:
        KeyError: if ``best_feature_col`` is not a column of ``data``.
    """
    # drop() returns a new DataFrame, so no explicit copy is needed to keep
    # the caller's data untouched.
    new_data = data.drop(columns=best_feature_col)

    leftData, rightData = [], []
    leftDataindex, rightDataindex = [], []

    # Walk the rows by position: index labels are only valid positions when
    # the index happens to be 0..n-1, which is not true for filtered or
    # resampled frames. The labels are still what gets reported back.
    feature_values = data[best_feature_col].tolist()
    for position, (label, feature_value) in enumerate(zip(data.index, feature_values)):
        row = new_data.iloc[position].tolist()
        if feature_value <= best_split_value:
            leftData.append(row)
            leftDataindex.append(label)
        else:
            rightData.append(row)
            rightDataindex.append(label)

    leftData = pd.DataFrame(leftData, columns=new_data.columns)
    rightData = pd.DataFrame(rightData, columns=new_data.columns)
    return new_data, leftData, leftDataindex, rightData, rightDataindex, best_feature_col
# Split the sample data on the feature/threshold found above and print each
# piece of the returned tuple.
new_data,leftData,leftDataindex,rightData,rightDataindex, best_feature_col=split_Data(data,best_feature_col,best_split_value)
print("去除特徵後资料:",new_data)
print("去除特徵後左资料:",leftData)
print("左资料在原始资料序列:",leftDataindex)
print("去除特徵後右资料:",rightData)
print("右资料在原始资料序列:",rightDataindex)
print("被去除特徵点:第",best_feature_col,"位")

# Check whether every sample belongs to the same class
def one_data_check(data):
    """Report whether all values in column ``0`` of ``data`` are equal.

    Args:
        data: pandas DataFrame whose column labelled ``0`` is compared.

    Returns:
        ``[value]`` (a one-element list holding the shared value) when every
        row has the same value in column ``0``; ``False`` otherwise,
        including when ``data`` has no rows.
    """
    if len(data) == 0:
        return False
    labels = data[0]
    # Read by position: looking rows up by the labels 0..n-1 only works
    # when the frame still has its default index.
    check = labels.iloc[0]
    if (labels == check).all():
        return [check]
    return False
# Check whether the data is empty
def no_data_check(data):
    """Return ``True`` when ``data`` has length zero, ``False`` otherwise."""
    # len() rather than truthiness: a DataFrame has no unambiguous truth value.
    return len(data) == 0
#类别都分类完
import random as rd
def one_feature_check(data):
    """Pick the most frequent class once only one column is left.

    Accepts either a list of rows or a pandas DataFrame.

    Args:
        data: list of rows, or a DataFrame.

    Returns:
        A one-element list holding the most frequent value of the single
        remaining column; ties are broken at random. ``False`` when more
        than one column remains or when there are no rows.
    """
    if hasattr(data, "columns"):
        # DataFrame: len(data[0]) would be the number of rows, so the
        # "only one column left" test has to look at the frame's width.
        if data.shape[1] != 1:
            return False
        labels = data.iloc[:, 0].tolist()
    else:
        if len(data) == 0 or len(data[0]) != 1:
            return False
        labels = [row[0] for row in data]
    if not labels:
        return False

    # Count how often each class occurs.
    count_use = {}
    for label in labels:
        count_use[label] = count_use.get(label, 0) + 1
    # Collect every class that reaches the highest count.
    highest = max(count_use.values())
    the_same = [key for key, value in count_use.items() if value == highest]
    # Several classes may share the highest count: pick one at random.
    return rd.sample(the_same, 1)
# Demo: both classes occur twice, so the printed result may be [0] or [1].
print(one_feature_check([[0],[0],[1],[1]]))
# Build the decision tree
def create_Tree(data):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        data: pandas DataFrame of samples; column ``0`` is read as the class
            when no further split is possible.

    Returns:
        ``[None]`` for empty data, a single class value for a leaf, or a
        dict of the form
        ``{feature: {'<' + str(value): subtree, '>' + str(value): subtree}}``.
        The ``'<'`` branch holds the rows whose feature value is
        ``<= value`` (see ``split_Data``); the ``'>'`` branch holds the rest.
    """
    # Stop conditions: evaluate each check once and reuse its result
    # (one_feature_check is randomised, so calling it twice could disagree).
    if no_data_check(data):
        return [None]
    same_class = one_data_check(data)
    if same_class:
        return same_class[0]
    majority = one_feature_check(data)
    if majority:
        return majority[0]

    # Find the best split point.
    best_feature_col, best_split_value = Best_Feature(data)
    if best_split_value is None:
        # Best_Feature found nothing to split on (no feature columns left,
        # or every remaining feature has a single distinct value). Dropping
        # the placeholder column -1 would raise KeyError, so end the branch
        # with the most frequent class instead; ties are broken at random.
        return rd.choice(data[0].mode().tolist())

    new_data, leftData, leftDataindex, rightData, rightDataindex, best_feature_col = split_Data(
        data, best_feature_col, best_split_value
    )
    # Build the node, then recurse into both halves.
    left_key = '<' + str(best_split_value)
    right_key = '>' + str(best_split_value)
    myTree = {best_feature_col: {left_key: {}, right_key: {}}}
    myTree[best_feature_col][left_key] = create_Tree(leftData)
    myTree[best_feature_col][right_key] = create_Tree(rightData)
    return myTree
# Build the tree from the sample DataFrame and print the nested dictionary.
print(create_Tree(data))

结果:

{3: {'<8.5': {1: {'<1.51': 1.0, '>1.51': {2: {'<0.16': 1.0, '>0.16': 2.0}}}}, '>8.5': {1: {'<3.51': 1.0, '>3.51': 2.0}}}}

好,今天实作部分就到这,明天就把资料去做带入动作

男孩沿着森林小径,朝着歌声方向前进,一路上除了歌声之外,森林原本的声音都不见了,没有鸟叫的声音,也没有树枝摩擦的声音,但男孩忽略了这种异常,继续朝着歌声前进,走了一段时间後,他看见一栋小木屋,歌声似乎是从里面传了出来,男孩想从窗外窥视里面,但里面被窗帘遮得死死的,於是男孩走到门前,敲了敲门,并说:有人在吗?
				--|我看着你,你却看不到我|--     MS.CM

<<: [Day21] Flutter - Presentation AutoRouter(part5)

>>: Day13-pod服务处介绍service