数据挖掘概念与技术习题选做
第六章习题
(1) 用python简单实现Apriori算法
# -*- coding: utf-8 -*-
__author__ = "Yunfan Yang"
def gen_L1(TID):
"""从事务集中产生频繁1项集"""
initial_C1 = {} # 定义一个空字典用于统计初始项集信息,键值对形如{"M": 3}
for tid in TID:
for item in tid:
if item not in initial_C1.keys():
initial_C1[item] =
else:
initial_C1[item] +=
# 候选1项集
C1 = []
for key, value in initial_C1.items():
tmp_tuple = ([key], value)
C1.append(tmp_tuple)
# C1结果为[(['M'], 3), (['O'], 4), (['N'], 2), (['K'], 5), (['E'], 4), (['Y'], 3), (['D'], 1), (['A'], 1), (['U'], 1), (['C'], 2), (['I'], 1)]
# 频繁1项集
L1 = []
for item in C1:
if item[] / len(TID) >= min_sup:
L1.append(item)
# L1结果为[(['M'], 3), (['O'], 4), (['K'], 5), (['E'], 4), (['Y'], 3)]
return L1
def generateC_k(Lk_1):
"""产生候选k项集的函数"""
C_k = []
# 执行连接步骤
for i in range(len(Lk_1)-): # 遍历Lk_1中的项集
# print(C)
for j in range(i+, len(Lk_1)):
if len(Lk_1[i][]) <= : # 频繁项集只有1项,则直接连接,产生候选项
C = Lk_1[i][][:] # 复制Lk_1[i][0]给C
C.append(Lk_1[j][][-])
elif len(Lk_1[i][]) > :
if if_equal(Lk_1[i][][:-], Lk_1[j][][:-]) and Lk_1[i][][-] != Lk_1[j][][-]: # 连接条件
C = Lk_1[i][][:]
C.append(Lk_1[j][][-]) # 连接步
else:
break
if has_infrequent_subset(C, Lk_1): # 剪枝步:删除非频繁候选
pass
else:
C_k.append(C)
return C_k
# Lk_1 = [(['O', 'K'], 3), (['O', 'E'], 3)]
def if_equal(A, B):
"""判断两个长度相等的列表是否相等"""
for i in range(len(A)):
if A[i] == B[i]:
continue
else:
return False
return True
# L1 = [(['M'], 3), (['O'], 4), (['K'], 5), (['E'], 4), (['Y'], 3)]
# C = ['M', 'I']
def has_infrequent_subset(C, Lk_1):
"""利用先验知识,判断候选项的子集是否有子集不属于频繁集"""
L = [] # 创建一个空列表只包含频繁项集,形如[['M'], ['O']]
for i in range(len(Lk_1)):
L.append(Lk_1[i][])
# print(L)
for element in subset(C):
# print(element)
if element not in L: # 如果候选项的子集不在Lk_1中,返回True
return True
return False
def subset(C):
"""产生一个列表的长度减1项子集,比如[1,2,3]生成[[1,2],[1,3],[2,3]]"""
result = []
for i in range(len(C)):
copyC = C[:]
copyC.pop(i)
subsetC = copyC
result.append(subsetC)
return result
def AinB(A, B):
"""定义函数检测列表B是否全部包含列表A的元素"""
for a in A:
if a in B:
continue
else:
return False
return True
def main_apriori(Lk_1):
"""Apriori主程序"""
while Lk_1[-] != []:
Ck = generateC_k(Lk_1[-]) # 产生候选项集
# print(Ck)
Lk = [] # 初始化每次要生成的频繁项集
for i in range(len(Ck)):
count = # 初始化计数
for tid in TID: # 扫描事务库,进行计数
if AinB(Ck[i], tid): # 判断候选项集中的候选项是否出现在事务集的事务中
count +=
lk = (Ck[i], count)
if lk[] / len(TID) >= min_sup: # 判断是否满足最小支持度
Lk.append(lk)
Lk_1.append(Lk)
return Lk_1 # 得出每次生成的频繁项集列表
def gen_rule_set(Lk_1):
"""生成只包含关联规则项集的列表"""
# 因为最后的频繁集的所有非空子集及支持度计数都出现在列表Lk_1中,故可以跟据Lk_1的列表寻找关联规则
# 首先生成一个新列表只包含最终频繁集的及其非空子集
rule_set = []
for i in range(len(Lk_1) - ): # 遍历Lk_1的每个频繁项集
for L in Lk_1[i]: # 对于每个频繁项
if AinB(L[], Lk_1[-][][]): # 如果频繁项的所有元素都在最终频繁项中,则取出该列表至rule_set中
rule_set.append(L)
return rule_set
def gen_rule(rule_set):
"""生成规则"""
for i in range(len(rule_set)):
for j in range(len(rule_set)): # 遍历规则列表rule_set
if not AinB(rule_set[i][], rule_set[j][]) and not AinB(rule_set[j][], rule_set[i][]) and (
len(rule_set[i][]) + len(rule_set[j][])) <= len(rule_set[-][]): # 取出没有项本身的另一个项,生成规则
rule = [rule_set[i], rule_set[j]]
join = rule_set[i][] + rule_set[j][] # 将规则里的两个项集合并,即取并集
for k in range(len(rule_set)): # 再一次遍历关联规则的项集合
if len(join) == len(rule_set[k][]) and AinB(join, rule_set[k][]): # 取出并集的支持度计数
rule_union = rule_set[k]
# print(rule_union)
conf = rule_union[] / rule[][] # 计算置信度
if conf >= min_conf: # 判断是否满足最小置信度
print(rule[][], '-->', rule[][], '置信度为:{}%'.format(conf*))
if __name__ == "__main__":
# 原始事务集
t100 = ('M', 'O', 'N', 'K', 'E', 'Y')
t200 = ('D', 'O', 'N', 'K', 'E', 'Y')
t300 = ('M', 'A', 'K', 'E')
t400 = ('M', 'U', 'C', 'K', 'Y')
t500 = ('C', 'O', 'O', 'K', 'I', 'E')
TID = [t100, t200, t300, t400, t500]
# 设置最小支持度以及置信度
min_sup =
min_conf =
# # 原始事务集
# t100 = ('I1', 'I2', 'I5')
# t200 = ('I2', 'I4')
# t300 = ('I2', 'I3')
# t400 = ('I1', 'I2', 'I4')
# t500 = ('I1', 'I3')
# t600 = ('I2', 'I3')
# t700 = ('I1', 'I3')
# t800 = ('I1', 'I2', 'I3', 'I5')
# t900 = ('I1', 'I2', 'I3')
# TID = [t100, t200, t300, t400, t500, t600, t700, t800, t900]
#
# # 设置最小支持度以及置信度
# min_sup = 2/9
# min_conf = 0.7
Lk_1 = [gen_L1(TID)] # 将每个频繁项集作为一个子列表储存在Lk_1中
Lk_1 = main_apriori(Lk_1)
print("\n生成的频繁项集列表为:", Lk_1[:-])
Lk = Lk_1[-]
print("最后生成的频繁项集为:", Lk)
rule_set = gen_rule_set(Lk_1)
print("产生关联规则的项集为:", rule_set)
print("\n生成的强关联规则有:")
gen_rule(rule_set)
运行结果为:
生成的频繁项集列表为: [[(['M'], ), (['O'], ), (['K'], ), (['E'], ), (['Y'], )], [(['M', 'K'], ), (['O', 'K'], ), (['O', 'E'], ), (['K', 'E'], ), (['K', 'Y'], )], [(['O', 'K', 'E'], )]]
最后生成的频繁项集为: [(['O', 'K', 'E'], )]
产生关联规则的项集为: [(['O'], ), (['K'], ), (['E'], ), (['O', 'K'], ), (['O', 'E'], ), (['K', 'E'], ), (['O', 'K', 'E'], )]
生成的强关联规则有:
['K'] --> ['E'] 置信度为:%
['E'] --> ['K'] 置信度为:%
['O', 'K'] --> ['E'] 置信度为:%
['O', 'E'] --> ['K'] 置信度为:%
(2) FP-growth 算法
再说吧,先往下进行,有空再写。另外Apriori写的着实太烂,还是太菜了,继续加油把。