Skip to content

Commit f228762

Browse files
committed
Add benchmark script
This is a work in progress.
1 parent eb80667 commit f228762

File tree

1 file changed

+105
-0
lines changed

1 file changed

+105
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Sebastian Raschka 2014-2019
2+
# mlxtend Machine Learning Library Extensions
3+
# Author: Sebastian Raschka <sebastianraschka.com>
4+
#
5+
# License: BSD 3 clause
6+
7+
from mlxtend.preprocessing import TransactionEncoder
8+
from mlxtend.frequent_patterns import apriori
9+
import pandas as pd
10+
import numpy as np
11+
import gzip
12+
import os
13+
import sys
14+
from time import time
15+
import signal
16+
from contextlib import contextmanager
17+
18+
19+
@contextmanager
def timeout(time):
    """Context manager that aborts its body after ``time`` seconds.

    Relies on SIGALRM, so it works only on Unix and only in the main
    thread.  The TimeoutError raised by the alarm is swallowed, so
    execution simply resumes after the ``with`` block once the deadline
    expires.

    NOTE: the ``time`` parameter shadows the module-level ``time()``
    imported from the time module; the name is kept for backward
    compatibility with keyword callers.
    """
    # Register a handler that turns SIGALRM into a TimeoutError.
    signal.signal(signal.SIGALRM, raise_timeout)
    # Schedule the signal to be sent after ``time`` seconds.
    signal.alarm(time)

    try:
        yield
    except TimeoutError:
        # Deadline reached: silently abandon the remaining work.
        pass
    finally:
        # Cancel any pending alarm.  The original code only set the
        # handler to SIG_IGN, which left the alarm scheduled; alarm(0)
        # actually clears it before we unregister the handler.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, signal.SIG_IGN)


def raise_timeout(signum, frame):
    """Signal handler: convert a delivered SIGALRM into a TimeoutError."""
    raise TimeoutError
38+
39+
40+
# Benchmark datasets.  Each entry lives under data/ and may be
# gzip-compressed; uncomment an entry to include it in a run.
files = [
    # "chess.dat.gz",
    # "connect.dat.gz",
    "mushroom.dat.gz",
    "pumsb.dat.gz",
    "pumsb_star.dat.gz",
    # "T10I4D100K.dat.gz",
    # "T40I10D100K.dat.gz",
    # "kosarak.dat.gz",  # this file is too large in sparse format
    # "kosarak-1k.dat.gz",
    # "kosarak-10k.dat.gz",
    # "kosarak-50k.dat.gz",
    # "kosarak-100k.dat.gz",
    # "kosarak-200k.dat.gz",
]


# Apriori option under test.
low_memory = True
# mlxtend commit being benchmarked; used to name the result files.
commit = "b731fd2"
# Minimum-support thresholds to benchmark, from cheapest to most expensive.
test_supports = [0.5, 0.3, 0.1, 0.05, 0.03, 0.01, 0.005, 0.003, 0.001]
60+
61+
# Benchmark apriori over each (sparse, column-major) layout combination.
# Output for each configuration goes to
# Results/<commit>-sparse<..>-col_major<..>.out via a redirected stdout.
orig_stdout = sys.stdout
for sparse, col_major in [[False, True], [False, False], [True, True]]:
    sys.stdout = open("Results/{}-sparse{}-col_major{}.out".format(
        commit, sparse, col_major), "w")
    for filename in files:
        # Transaction files hold one whitespace-separated list of integer
        # item ids per line, optionally gzip-compressed.
        path = os.path.join("data", filename)
        opener = gzip.open if filename.endswith(".gz") else open
        with opener(path) as f:
            data = f.readlines()

        dataset = [list(map(int, line.split())) for line in data]
        # The original used np.unique(dataset); on a ragged list of lists
        # that builds an object array, which raises on NumPy >= 1.24.
        # A set of all item ids gives the same distinct-item count.
        items = set()
        for transaction in dataset:
            items.update(transaction)
        print("{} contains {} transactions and {} items".format(
            filename, len(dataset), len(items)))

        # One-hot encode the transactions into a boolean item matrix.
        te = TransactionEncoder()
        te_ary = te.fit(dataset).transform(dataset, sparse=sparse)
        columns = ["c" + str(i) for i in te.columns_]
        if sparse:
            try:
                df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=columns)
            except AttributeError:
                # pandas < 0.25
                df = pd.SparseDataFrame(te_ary, columns=columns,
                                        default_fill_value=False)
        else:
            df = pd.DataFrame(te_ary, columns=columns)
        if col_major:
            # Rebuilding the frame column by column forces a column-major
            # (one block per column) memory layout.
            df = pd.DataFrame({col: df[col] for col in df.columns})
        np.info(df.values)

        kwds = {"use_colnames": False, "low_memory": low_memory}
        for min_support in test_supports:
            tick = time()
            # Abort any single run after two minutes.
            with timeout(120):
                print(apriori(df, min_support=min_support, verbose=1, **kwds))
            print("\nmin_support={} temps: {}\n".format(
                min_support, time() - tick))
            # Repeat fast runs a few times to get a stable timing estimate.
            if time() - tick < 10:
                times = []
                for _ in range(5):
                    tick = time()
                    apriori(df, min_support=min_support, verbose=0, **kwds)
                    times.append(time() - tick)
                print("Times:", times)
    # Close this configuration's result file and restore the real stdout
    # before opening the next one (the original reassigned sys.stdout
    # without closing the previous handle or ever restoring stdout).
    sys.stdout.close()
    sys.stdout = orig_stdout

0 commit comments

Comments
 (0)