-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathnaive_apriori.py
161 lines (142 loc) · 4.43 KB
/
naive_apriori.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import itertools
import csv
import parameters
def load_data(filename, max_attr=100):
    """
    Load transactions from a comma-separated data file.

    Each row is expected as ``<transaction id>, <attr>, <attr>, ...``;
    the leading id column is skipped.

    :param filename: name of the CSV data source file
    :param max_attr: maximum attribute value (inclusive) to keep
    :return: list of transactions, each a non-empty list of attribute
             strings whose integer value is <= max_attr
    """
    with open(filename, 'r') as f:
        rows = list(csv.reader(f, delimiter=','))
    result = []
    # Iterate rows directly instead of Python-2-only xrange indexing.
    for row in rows:
        attrbs = []
        for raw in row[1:]:
            val = raw.strip()
            if int(val) <= max_attr:
                attrbs.append(val)
        # Drop transactions left empty after filtering.
        if attrbs:
            result.append(attrbs)
    return result
def solve(data, support, confidence):
    """
    Find the frequent 1-itemsets meeting the support threshold, then
    mine association rules from them.

    :param data: list of transactions (lists of attribute strings)
    :param support: minimum support threshold, in percent
    :param confidence: minimum confidence threshold, in percent
    :return: total number of rules generated (from find_association_rules)
    """
    transactions = len(data)
    counts = {}
    for row in data:
        for word in row:
            # Single dict lookup instead of `word not in hashmap.keys()`.
            counts[word] = counts.get(word, 0) + 1
    # Keep items whose support percentage reaches the threshold;
    # 100.0 forces true division even on Python 2.
    level = [[key] for key, cnt in counts.items()
             if 100.0 * cnt / transactions >= float(support)]
    return find_association_rules(level, support, confidence)
def find_subsets(S, m):
    """Return the set of all m-element subsets of S, each as a tuple."""
    subsets = itertools.combinations(S, m)
    return {combo for combo in subsets}
def has_infrequent_subset(dataset, prev_l, k):
    """
    Apriori pruning check.

    :param dataset: candidate itemset (list of items)
    :param prev_l: frequent k-itemsets, each stored as a sorted list
    :param k: subset size to test
    :return: True if some k-element subset of ``dataset`` is absent from
             ``prev_l`` (candidate must be pruned), False otherwise
    """
    # Every subset of a frequent itemset must itself be frequent, so one
    # missing subset is enough to reject the candidate.  (Avoids shadowing
    # the builtin `list` and the manual copy-and-sort of the original.)
    return any(sorted(subset) not in prev_l
               for subset in itertools.combinations(dataset, k))
def apply_support(singles, support):
    """
    Level-wise Apriori pass: starting from the frequent 1-itemsets in
    ``singles``, repeatedly join/prune candidates with ``apriori`` and
    keep those whose support percentage meets the threshold.

    NOTE(review): counts support against the module-level ``data``
    transaction list rather than a parameter.

    :param singles: frequent 1-itemsets (each a one-element list)
    :param support: minimum support threshold, in percent
    :return: list of levels; element i holds the frequent (i+2)-itemsets
    """
    trans = len(data)  # hoisted: invariant across all candidates
    k = 2
    prev_l = list(singles)
    L = []
    while prev_l:
        current = []
        for candidate in apriori(prev_l, k - 1):
            s = set(candidate)
            # Count transactions containing every item of the candidate.
            cnt = sum(1 for T in data if s.issubset(T))
            # 100.0 forces true division even on Python 2.
            if 100.0 * cnt / trans >= float(support):
                candidate.sort()
                current.append(candidate)
        prev_l = list(current)
        k += 1
        if current:
            L.append(current)
    return L
def find_association_rules(singles, support, confidence):
    """
    Mine and print association rules from the frequent itemsets.

    For every frequent itemset and every proper non-empty subset ``s``
    of it, the rule ``s ==> itemset - s`` is printed when its confidence
    percentage meets the threshold.

    NOTE(review): confidence is counted against the module-level ``data``
    transaction list rather than a parameter.

    :param singles: frequent 1-itemsets seeding the Apriori passes
    :param support: minimum support threshold, in percent
    :param confidence: minimum confidence threshold, in percent
    :return: total number of rules generated
    """
    L = apply_support(singles, support)
    result = 0
    for level in L:
        for itemset in level:
            # Every proper non-empty subset size of the itemset.
            for size in range(1, len(itemset)):
                for item in find_subsets(itemset, size):
                    s = list(item)
                    inc1 = 0  # transactions containing the antecedent s
                    inc2 = 0  # transactions containing the whole itemset
                    for T in data:
                        t = set(T)
                        if set(s).issubset(t):
                            inc1 += 1
                        if set(itemset).issubset(t):
                            inc2 += 1
                    # Guard inc1 == 0 (possible at support threshold 0);
                    # 100.0 forces true division even on Python 2.
                    if inc1 and 100.0 * inc2 / inc1 >= float(confidence):
                        left = ','.join(s)
                        right = ','.join(set(itemset) - set(s))
                        print(' ==> '.join([left, right]))
                        result += 1
    print('Total rules generated:', result)
    return result
def apriori(prev_l, k):
    """
    Apriori join step: build candidate (k+1)-itemsets from the frequent
    k-itemsets in ``prev_l`` (each a sorted list), then prune candidates
    that have an infrequent k-subset.

    :param prev_l: frequent k-itemsets, each a sorted list
    :param k: size of the itemsets in ``prev_l``
    :return: list of pruned candidate (k+1)-itemsets
    """
    candidates = []
    for a in prev_l:
        for b in prev_l:
            # Join two itemsets that share their first k-1 items and
            # differ only in the last, keeping the ordered pair so each
            # candidate is produced exactly once.
            if a[:k - 1] == b[:k - 1] and a[k - 1] < b[k - 1]:
                joined = a + [b[k - 1]]
                if not has_infrequent_subset(joined, prev_l, k):
                    candidates.append(joined)
    return candidates
# Module-level transaction list; read directly as a global by
# apply_support and find_association_rules, so it stays at module scope.
data = load_data('1000-out1.csv')

if __name__ == '__main__':
    # Guard the run so importing this module doesn't trigger the mining.
    solve(data, parameters.SUPPORT, parameters.CONFIDENCE)