当先锋百科网

首页 1 2 3 4 5 6 7

一. ID3算法原理

ID3算法通过计算每个属性的信息增益,认为信息增益越大属性越优,每次划分选取信息增益最大的属性为划分标准,重复这个过程,直到构成一棵决策树。

二. 关键概念

1. 信息熵

信息熵是描述事件给我们的惊讶程度,如果所有事件的概率均等,那熵值大,惊讶程度低。如果有一事件的概率极高而其他极低,熵值便低,惊讶程度大。其计算公式如下:

2. 信息增益

信息增益描述某个事件发生前后信息熵的变化。 在ID3里面可以理解为引入特征之后信息熵的变化(这句话纯属个人理解,如果不对请指正)。其计算公式如下:


其中n表示分类的数目。

三. 测试数据

data.txt
young	myope	no	reduced	no lenses
young	myope	no	normal	soft
young	myope	yes	reduced	no lenses
young	myope	yes	normal	hard
young	hyper	no	reduced	no lenses
young	hyper	no	normal	soft
young	hyper	yes	reduced	no lenses
young	hyper	yes	normal	hard
pre	myope	no	reduced	no lenses
pre	myope	no	normal	soft
pre	myope	yes	reduced	no lenses
pre	myope	yes	normal	hard
pre	hyper	no	reduced	no lenses
pre	hyper	no	normal	soft
pre	hyper	yes	reduced	no lenses
pre	hyper	yes	normal	no lenses
presbyopic	myope	no	reduced	no lenses
presbyopic	myope	no	normal	no lenses
presbyopic	myope	yes	reduced	no lenses
presbyopic	myope	yes	normal	hard
presbyopic	hyper	no	reduced	no lenses
presbyopic	hyper	no	normal	soft
presbyopic	hyper	yes	reduced	no lenses
presbyopic	hyper	yes	normal	no lenses
label.txt
age	prescript	astigmatic	tearRate

四. 实验代码

#encoding: utf-8
import math

def cal_shannon_ent(data):
	data_num = len(data)
	cnt = {}
	for feat_vec in data:
		tmp_class = feat_vec[-1]
		if tmp_class not in cnt.keys():
			cnt[tmp_class] = 0
		cnt[tmp_class] += 1
	shannon_ent = 0.0
	for key in cnt:
		p = float(cnt[key]) / data_num
		shannon_ent -= p * math.log(p, 2)
	return shannon_ent

def split_data(data, index, value):
	ret_data = []
	for feat_vec in data:
		if feat_vec[index] == value:
			remain_feat_vec = feat_vec[:index]
			remain_feat_vec.extend(feat_vec[index + 1:])
			ret_data.append(remain_feat_vec)
	return ret_data

def get_best_feat(data):
	base_ent = cal_shannon_ent(data)
	feat_num = len(data[0]) - 1
	best_info_gain = 0.0
	best_feat = -1
	for i in range(feat_num):
		values = [j[i] for j in data]
		unique_values = set(values)
		new_ent = 0.0
		for value in unique_values:
			sub_data = split_data(data, i, value)
			p = float(len(sub_data)) / len(data)
			new_ent += p * cal_shannon_ent(sub_data)	#
		info_gain = base_ent - new_ent
		if info_gain > best_info_gain:
			best_info_gain = info_gain
			best_feat = i
	return best_feat

def get_major_class(class_list):
	cnt = {}
	maxx = -1
	major_class = -1
	for _class in class_list:
		if _class not in cnt.keys():
			cnt[_class] = 0
		cnt[_class] += 1
		if(cnt[_class] > maxx):
			maxx = cnt[_class]
			major_class = _class
	return major_class

def create_tree_id3(data, label):
	class_list = [i[-1] for i in data]
	if class_list.count(class_list[0]) == len(class_list):
		return class_list[0]
	if len(data[0]) == 1:
		return get_major_class(class_list)
	best_feat = get_best_feat(data)
	best_feat_label = label[best_feat]
	ret_tree = {best_feat_label:{}}
	del(label[best_feat])
	values = [i[best_feat] for i in data]
	unique_values = set(values)
	for value in unique_values:
		sub_label = label[:]	#
		ret_tree[best_feat_label][value] = create_tree_id3(split_data(data, best_feat, value), sub_label)
	return ret_tree

if __name__=="__main__":
	# data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
	# label = ['no surfacing', 'flippers']
	fr = open('data.txt')
	data = [line.strip().split('\t') for line in fr.readlines()]
	fr = open('label.txt')
	label = [line.strip().split('\t') for line in fr.readlines()][0]
	print create_tree_id3(data, label)

五. 实验结果

{'tearRate': {'reduced': 'no lenses', 'normal': {'astigmatic': {'yes': {'prescript': {'hyper': {'age': {'pre': 'no lenses', 'presbyopic': 'no lenses', 'young': 'hard'}}, 'myope': 'hard'}}, 'no': {'age': {'pre': 'soft', 'presbyopic': {'prescript': {'hyper': 'soft', 'myope': 'no lenses'}}, 'young': 'soft'}}}}}}