@@ -61,40 +61,36 @@ def generate_new_combinations(old_combinations):
61
61
# early exit from for-loop skips else clause just below
62
62
break
63
63
else :
64
- yield from candidate
64
+ yield candidate
65
65
j = j + 1
66
66
67
67
68
- def compute_supports_low_memory (X , is_sparse , combin ):
69
- supports = np .zeros (combin .shape [0 ])
70
- ncomb , nitems = combin .shape
68
def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
    """Yield supports and items of the candidate itemsets that are frequent.

    The result is a flat stream of integers so that it can be consumed with
    ``np.fromiter``: for every itemset whose support reaches `min_support`,
    the support count is yielded first, followed by each item of the
    itemset. After all candidates have been processed, the total number of
    processed candidates is yielded as the very last element.

    Parameters
    -----------
    X : array-like
        Transaction matrix, indexed as ``X[:, item]`` in the dense case.
        When `is_sparse` is true, only ``X.indices`` and ``X.indptr`` are
        read, sliced per item (column), i.e. a compressed-sparse-column
        layout is assumed. Every stored entry is counted as present;
        explicitly stored zeros are not filtered out.
    is_sparse : bool
        Selects the sparse (``indices``/``indptr``) or dense code path.
    combin : iterable
        Iterable of candidate itemsets; each itemset is an iterable of
        column indices with a ``len``.
    min_support : int or float
        Minimum number of rows (an absolute count, not a fraction) that
        must contain all items of an itemset for it to be emitted.

    Yields
    -----------
    int
        ``support, item_1, ..., item_k`` for each frequent itemset, then
        the count of processed candidates as the final value.

    """
    counter = 0
    n_rows = X.shape[0]
    if is_sparse:
        # Per-row tally of how many items of the current itemset are present;
        # allocated once and reset for each candidate.
        count = np.empty(n_rows, dtype=int)
        for itemset in combin:
            counter += 1
            count[:] = 0
            for item in itemset:
                # much faster than X[:, item].toarray() or
                # X.getcol(item).indices
                count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1
            # A row supports the itemset only if it contains every item.
            support = np.count_nonzero(count == len(itemset))
            if support >= min_support:
                yield support
                yield from itemset
    else:
        # Reusable row mask; reset to all-True for each candidate.
        _bools = np.empty(n_rows, dtype=bool)
        for itemset in combin:
            counter += 1
            _bools[:] = True
            for item in itemset:
                # logical_and always produces booleans, so 0/1 integer
                # matrices work as well as boolean ones (an in-place ``&=``
                # into a bool buffer raises a casting error for int input).
                np.logical_and(_bools, X[:, item], out=_bools)
            support = np.count_nonzero(_bools)
            if support >= min_support:
                yield support
                yield from itemset
    # return the total of processed itemsets as last element
    yield counter
98
94
99
95
100
96
def apriori (df , min_support = 0.5 , use_colnames = False , max_len = None , verbose = 0 ,
@@ -223,38 +219,25 @@ def _support(_x, _n_rows, _is_sparse):
223
219
support_dict = {1 : support [support >= min_support ]}
224
220
itemset_dict = {1 : ary_col_idx [support >= min_support ].reshape (- 1 , 1 )}
225
221
max_itemset = 1
226
- rows_count = float (X .shape [0 ])
227
-
228
- all_ones = np .ones ((int (rows_count ), 1 ))
229
222
230
223
while max_itemset and max_itemset < (max_len or float ('inf' )):
231
224
next_max_itemset = max_itemset + 1
232
225
233
226
combin = generate_new_combinations (itemset_dict [max_itemset ])
234
- combin = np .fromiter (combin , dtype = int )
235
- combin = combin .reshape (- 1 , next_max_itemset )
227
+ gen_itemsets = generate_supports_and_itemsets (X , is_sparse , combin , int (min_support * X .shape [0 ]))
228
+
229
+ support_valid_itemsets = np .fromiter (gen_itemsets , dtype = int )
230
+ processed_itemsets = support_valid_itemsets [- 1 ]
236
231
237
- if combin .size == 0 :
238
- break
239
232
if verbose :
240
233
print (
241
- '\r Processing %d combinations | Sampling itemset size %d' %
242
- (combin .size , next_max_itemset ), end = "" )
243
-
244
- # With exceptionally large datasets, the matrix operations can use a
245
- # substantial amount of memory. For low memory applications or large
246
- # datasets, set `low_memory=True` to use a slower but more memory-
247
- # efficient implementation.
248
- if low_memory :
249
- support = compute_supports_low_memory (X , is_sparse , combin )
250
- else :
251
- support = compute_supports (X , is_sparse , combin )
252
- support /= rows_count
234
+ '\r Processed %d combinations | Sampling itemset size %d' %
235
+ (processed_itemsets , next_max_itemset ), end = "" )
253
236
254
- _mask = ( support >= min_support )
255
- if any ( _mask ) :
256
- itemset_dict [next_max_itemset ] = np . array ( combin [ _mask ])
257
- support_dict [next_max_itemset ] = np . array ( support [ _mask ])
237
+ support_valid_itemsets = support_valid_itemsets [: - 1 ]. reshape ( - 1 , 1 + next_max_itemset )
238
+ if support_valid_itemsets . size > 0 :
239
+ itemset_dict [next_max_itemset ] = support_valid_itemsets [:, 1 :]
240
+ support_dict [next_max_itemset ] = support_valid_itemsets [:, 0 ] / X . shape [ 0 ]
258
241
max_itemset = next_max_itemset
259
242
else :
260
243
# Exit condition
0 commit comments