Let apriori always use low_memory processing

dbarbier · dbarbier · commit 10847824b2f5 · 2019-12-28T19:44:24.000+01:00
Thanks to previous optimizations, processing with low_memory=True is now almost as efficient as with low_memory=False, and it allows much larger datasets to be processed. Removing the low_memory=False code path makes the code simpler and allows itemsets to be produced by a generator, which saves even more memory. The downside is that the number of itemsets to process is no longer known in advance, so it is displayed after processing instead. Note that commit 2f928cb introduced a bug: the reported number of combinations being processed was multiplied by the itemset's length, which explains why the output is different now.
diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py
@@ -61,40 +61,36 @@ def generate_new_combinations(old_combinations):
                     # early exit from for-loop skips else clause just below
                     break
             else:
-                yield from candidate
+                yield candidate
             j = j + 1
 
 
-def compute_supports_low_memory(X, is_sparse, combin):
-    supports = np.zeros(combin.shape[0])
-    ncomb, nitems = combin.shape
+def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
+    counter = 0
     if is_sparse:
-        _bools = X[:, 0].toarray()
-        for c in range(ncomb):
-            _bools[:] = X[:, combin[c, 0]].toarray()
-            for j in range(1, nitems):
-                _bools[:] &= X[:, combin[c, j]].toarray()
-            supports[c] = np.count_nonzero(_bools)
+        count = np.empty(X.shape[0], dtype=int)
+        for itemset in combin:
+            counter += 1
+            count[:] = 0
+            for item in itemset:
+                # much faster than X[:, item].toarray() or X.getcol(item).indices
+                count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1
+            support = np.count_nonzero(count == len(itemset))
+            if support >= min_support:
+                yield support
+                yield from itemset
     else:
-        _bools = np.copy(X[:, 0])
-        for c in range(ncomb):
-            _bools[:] = X[:, combin[c, 0]]
-            for j in range(1, nitems):
-                _bools[:] &= X[:, combin[c, j]]
-            supports[c] = np.count_nonzero(_bools)
-    return supports
-
-
-def compute_supports(X, is_sparse, combin):
-    all_ones = np.ones((X.shape[0], 1))
-    if is_sparse:
-        _bools = X[:, combin[:, 0]] == all_ones
-        for n in range(1, combin.shape[1]):
-            _bools = _bools & (X[:, combin[:, n]] == all_ones)
-    else:
-        _bools = np.all(X[:, combin], axis=2)
-
-    return np.sum(_bools, axis=0)
+        for itemset in combin:
+            counter += 1
+            _bools = np.ones(X.shape[0], dtype=bool)
+            for item in itemset:
+                _bools[:] &= X[:, item]
+            support = np.count_nonzero(_bools)
+            if support >= min_support:
+                yield support
+                yield from itemset
+    # return the total of processed itemsets as last element
+    yield counter
 
 
 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
@@ -223,38 +219,25 @@ def _support(_x, _n_rows, _is_sparse):
     support_dict = {1: support[support >= min_support]}
     itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))
 
     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1
 
         combin = generate_new_combinations(itemset_dict[max_itemset])
-        combin = np.fromiter(combin, dtype=int)
-        combin = combin.reshape(-1, next_max_itemset)
+        gen_itemsets = generate_supports_and_itemsets(X, is_sparse, combin, int(min_support * X.shape[0]))
+
+        support_valid_itemsets = np.fromiter(gen_itemsets, dtype=int)
+        processed_itemsets = support_valid_itemsets[-1]
 
-        if combin.size == 0:
-            break
         if verbose:
             print(
-                '\rProcessing %d combinations | Sampling itemset size %d' %
-                (combin.size, next_max_itemset), end="")
-
-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            support = compute_supports_low_memory(X, is_sparse, combin)
-        else:
-            support = compute_supports(X, is_sparse, combin)
-        support /= rows_count
+                '\rProcessed %d combinations | Sampling itemset size %d' %
+                (processed_itemsets, next_max_itemset), end="")
 
-        _mask = (support >= min_support)
-        if any(_mask):
-            itemset_dict[next_max_itemset] = np.array(combin[_mask])
-            support_dict[next_max_itemset] = np.array(support[_mask])
+        support_valid_itemsets = support_valid_itemsets[:-1].reshape(-1, 1 + next_max_itemset)
+        if support_valid_itemsets.size > 0:
+            itemset_dict[next_max_itemset] = support_valid_itemsets[:, 1:]
+            support_dict[next_max_itemset] = support_valid_itemsets[:, 0] / X.shape[0]
             max_itemset = next_max_itemset
         else:
             # Exit condition