手动实现——决策树ID3算法

思路：

第一步：定义函数——计算熵：

输入：某个样本集合的因变量数据，输出计算出的熵

第二步：定义函数——根据特征分割样本集合

输入：总样本集合df、某个特征，输出划分后的各个样本子集sub_df

第三步：定义函数——选择最佳特征

输入：样本集合、特征集合，选出能够最大化gain的最佳特征，输出最佳特征、gain值、用第二步得到的分割结果

第四步：定义class——递归生成决策树

核心函数：生成决策树
1.根据第三步函数进行
2.结束条件：输出最佳特征为None
3.递归部分：对每一个子集做递归

其他函数：定义节点——联系二叉树结构

1.读入数据

#### 1.读入数据
import os
import pandas as pd
import numpy as np
from math import log  # 计算信息熵要用

## 将代码块运行结果全部输出，而不是只输出最后的，适用于全文
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"    

## 分类变量——转化为虚拟变量
os.chdir("/Users/mac/Desktop/快乐研一/python/机器学习python实现")
df = pd.read_csv("example_data.csv")
df.head()

	humility	outlook	temp	windy	play
0	high	sunny	hot	False	no
1	high	sunny	hot	True	no
2	high	overcast	hot	False	yes
3	high	rainy	mild	False	yes
4	normal	rainy	cool	False	yes

2.定义计算熵的函数

#### 2.定义计算熵的函数
#  前置问题
tmp_y = df['play'].tolist()      # 转化为list，方便后面迭代
set(tmp_y)                         # 因变量的类别
[tmp_y.count(i) for i in set(tmp_y)]   # 各类别的样本比例

# 正式定义函数计算信息熵
def information(data_y):  
    # 信息熵 = -sum(p*log2p)
    data_y  = data_y.tolist()
    probs = [data_y.count(i)/len(data_y) for i in set(data_y)]
    entropy = [prob*log(prob,2) for prob in probs]
    sum_entropy = -sum(entropy)
    return sum_entropy

# 测试
information(df['play'])

{'no', 'yes'}






[5, 9]






0.9402859586706309

3.根据特征分割样本

#### 3.根据特征分割样本
# 前置问题1——怎么subset？
df[df['outlook']=="sunny"]    # 选择出outlook变量等于sunny的样本子集
# 前置问题2——怎么输出多个子集——字典{key:df}
pd.DataFrame   # 定义一个空的df
aa = {types: pd.DataFrame for types in set(df['outlook'])}    # 定义一个空的字典用于存放所有子集
aa
aa.keys()


# 正式定义函数
def split_df(data,col):
    '''
    输入：data——df数据集，col——列名，根据col这一列的取值来分割data
    输出：字典，每一个key是col取值，对应value是子集
    '''
    types = set(data[col].tolist())
    sub_df  = {i:pd.DataFrame for i in types}      # 定义一个存放子集集合的空字典
    for key in sub_df.keys():
        sub_df[key] = data[data[col]==key]         # 把对应的子集填入空字典
    return sub_df

# 测试
sub_df = split_df(df,'temp')
sub_df['mild']['play']

	humility	outlook	temp	windy	play
0	high	sunny	hot	False	no
1	high	sunny	hot	True	no
7	high	sunny	mild	False	no
8	normal	sunny	cool	False	yes
10	normal	sunny	mild	True	yes

pandas.core.frame.DataFrame






{'overcast': pandas.core.frame.DataFrame,
 'rainy': pandas.core.frame.DataFrame,
 'sunny': pandas.core.frame.DataFrame}






dict_keys(['overcast', 'rainy', 'sunny'])






3     yes
7      no
9     yes
10    yes
11    yes
13     no
Name: play, dtype: object

4.选择最佳特征

#### 选择最佳特征
# 前置知识——给出所有自变量的colname
[col for col in df.columns if col!='play']

# 正式定义函数
def choose_col(data,col_y):
    '''
    输入：样本集合、特征集合，从各个特征中选出能够最大化gain的最佳特征
    
    设输出的初始值max_gain = 0（gain 的取值范围为0~logn)
    for每一个特征：
        调用split_df分割数据，for 分割后的每一个子集，调用information 计算条件熵，再按照样本比例加权求和
        差值计算信息增益，如果大于前一个，更新输出值

    输出：最佳特征、gain值、用第二步得到的分割结果
    '''
    entroy_D = information(data[col_y])       # 计算全集上的信息熵
    colnames_x = [col for col in data.columns if col!=col_y]   # 自变量的列名
    max_gain = -1   # 设定gain初始值
    max_col = None
    max_df = None
    
    for col_x in colnames_x:                   # 对于所有特征
        sub_df = split_df(data,col_x)         # 按照某个特征分割
        entroy_sub = []                            # 用于保存各个子集信息熵*权重的list
        
        for key in sub_df.keys():                # 对于每一个子集
            sub_y = sub_df[key][col_y]
            entroy_sub.append(information(sub_y)*len(sub_y)/len(data[col_y]))   # 子集计算信息熵*权重
        entroy_x  = sum(entroy_sub)
        gain_x = entroy_D-entroy_x

        if gain_x> max_gain:         # 如果信息增益增大了，更新输出值
            max_gain = gain_x
            max_col = col_x
            max_df = sub_df
    return max_gain, max_col, max_df

# 测试
choose_col(df,'play')

['humility', 'outlook', 'temp', 'windy']






(0.246749819774439,
 'outlook',
 {'overcast':    humility   outlook  temp  windy play
  2      high  overcast   hot  False  yes
  6    normal  overcast  cool   True  yes
  11     high  overcast  mild   True  yes
  12   normal  overcast   hot  False  yes,
  'rainy':    humility outlook  temp  windy play
  3      high   rainy  mild  False  yes
  4    normal   rainy  cool  False  yes
  5    normal   rainy  cool   True   no
  9    normal   rainy  mild  False  yes
  13     high   rainy  mild   True   no,
  'sunny':    humility outlook  temp  windy play
  0      high   sunny   hot  False   no
  1      high   sunny   hot   True   no
  7      high   sunny  mild  False   no
  8    normal   sunny  cool  False  yes
  10   normal   sunny  mild   True  yes})

# 前置知识1，如何定义一个有更多叉的数结构的节点，name是他自己，connections存储链接的所有子节点
class Node:
    def __init__(self,name):
        self.name = name
        self.connections = {}    # 存储所有子节点的信息      
    def connect(self,label,node):  # 链接到子节点
        self.connections[label] = node
# 测试
parent = Node('root')
child1 = Node('child1')
child2 = Node('child2')
parent.connect('A',child1)
parent.connect('B',child2)
parent.connections


#  前置知识2 核心函数：生成决策树**
# 结束条件：输出最佳特征为None
# 递归部分：对每一个子集做递归
def build(data, col_x, col_y):
    # 结束条件
    if len(data)==0 or len(col_x)==0:
        return 
    # 递归
    current_gain, current_col, current_df = choose_col(data[col_x+[col_y]], col_y)   # 对剩余的特征空间寻找
    print(current_gain, current_col, current_df.keys())
    col_x_new = [_ for _ in col_x if _ not in [current_col]]    # 更新剩余的特征空间
    
    for i in current_df.keys():            # 对于每一个子集
        build(current_df[i], col_x_new, col_y)       # 递归调用

# 测试
build(df,['humility', 'outlook', 'temp', 'windy'],'play')

{'A': <__main__.Node at 0x116459a58>, 'B': <__main__.Node at 0x11645ceb8>}



0.246749819774439 outlook dict_keys(['overcast', 'rainy', 'sunny'])
-0.0 humility dict_keys(['high', 'normal'])
-0.0 temp dict_keys(['mild', 'hot'])
-0.0 windy dict_keys([True])
-0.0 windy dict_keys([False])
-0.0 temp dict_keys(['hot', 'cool'])
-0.0 windy dict_keys([False])
-0.0 windy dict_keys([True])
0.9709505944546686 windy dict_keys([False, True])
-0.0 humility dict_keys(['high', 'normal'])
-0.0 temp dict_keys(['mild'])
-0.0 temp dict_keys(['mild', 'cool'])
-0.0 humility dict_keys(['normal', 'high'])
-0.0 temp dict_keys(['cool'])
-0.0 temp dict_keys(['mild'])
0.9709505944546686 humility dict_keys(['high', 'normal'])
-0.0 temp dict_keys(['mild', 'hot'])
-0.0 windy dict_keys([False])
-0.0 windy dict_keys([False, True])
-0.0 temp dict_keys(['mild', 'cool'])
-0.0 windy dict_keys([True])
-0.0 windy dict_keys([False])

5.定义一个class，把build的结果存成一个tree

#  定义一个class，把build的结果存成一个tree
class ID3Tree:    

    class Node:       # 定义节点  
        def __init__(self, name):
            self.name = name
            self.connections = {}    
            
        def connect(self, label, node):   # 节点对应多个子节点
            self.connections[label] = node    
        
    def __init__(self, data, label):     # 需要给出data和因变量label
        self.columns = data.columns
        self.data = data
        self.label = label
        self.root = self.Node("Root")       # 初始化一个根节点
        
    def construct_tree(self):
        self.construct(self.root, "", self.data, self.columns)    # 输入父节点、父节点的取值、当前样本集合、剩余特征集合
    
    def construct(self, parent_node, parent_connection_label, input_data, columns):
        max_value, best_col, max_splited = choose_col(input_data[columns], self.label)
        if (not best_col) or max_value==0:    # 结束条件，1.没有剩余特征2.样本集合为空3.已经归于同一类
            node = self.Node(input_data[self.label].iloc[0])   
            parent_node.connect(parent_connection_label, node)   # 设为叶节点         
            return
        node = self.Node(best_col)    # 生成新节点
        parent_node.connect(parent_connection_label, node)
        
        new_columns = [col for col in columns if col != best_col]        # 更新剩余特征集合
        for splited_value, splited_data in max_splited.items():             # 对于新子集，递归
            self.construct(node, splited_value, splited_data, new_columns)
            
    def print_tree(self, node, tabs):
        print(tabs + node.name)        
        for connection, child_node in node.connections.items():
            print(tabs + "\t" + "(" + str(connection) + ")")
            self.print_tree(child_node, tabs + "\t\t") 

df
mytree = ID3Tree(df,'play')
mytree.construct_tree()
mytree.print_tree(mytree.root,"")

	humility	outlook	temp	windy	play
0	high	sunny	hot	False	no
1	high	sunny	hot	True	no
2	high	overcast	hot	False	yes
3	high	rainy	mild	False	yes
4	normal	rainy	cool	False	yes
5	normal	rainy	cool	True	no
6	normal	overcast	cool	True	yes
7	high	sunny	mild	False	no
8	normal	sunny	cool	False	yes
9	normal	rainy	mild	False	yes
10	normal	sunny	mild	True	yes
11	high	overcast	mild	True	yes
12	normal	overcast	hot	False	yes
13	high	rainy	mild	True	no

Root
	()
		outlook
			(overcast)
				yes
			(rainy)
				windy
					(False)
						yes
					(True)
						no
			(sunny)
				humility
					(high)
						no
					(normal)
						yes