Skip to content

Commit 610e63f

Browse files
authored
Merge pull request #40 from mahmudhera/main
added sketch sizes in compare output (resolves #39)
2 parents 19b7709 + 3de8eee commit 610e63f

22 files changed

+43
-36
lines changed

README.md

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,16 @@ All tool usages are available using `--help` flag.
1818

1919
## `compare` output format
2020
1. Index of query
21-
2. Name of query
22-
3. MD5 of query
23-
4. Index of match
24-
5. Name of match
25-
6. MD5 of match
26-
7. Jaccard
27-
8. Containment(query, target)
28-
9. Containment(target, query)
21+
1. Name of query
22+
1. MD5 of query
23+
1. Size of the query sketch
24+
1. Index of match
25+
1. Name of match
26+
1. MD5 of match
27+
1. Size of the matched reference sketch
28+
1. Jaccard
29+
1. Containment(query, target)
30+
1. Containment(target, query)
2931

3032

3133
# Testing

benchmark/plot_indexing_time_memory.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# col names: filesize,cpu_time,wall_clock_time,peak_memory_usage
1313

1414
# plot wall-clock time against file size
15-
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['wall_clock_time'], label='sourmash index', marker='o')
15+
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['wall_clock_time'], label='sourmash index (1 thread)', marker='o')
1616
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['wall_clock_time'], label='cpp index (1 thread)', marker='o')
1717
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['wall_clock_time'], label='cpp index (128 threads)', marker='o')
1818

@@ -26,7 +26,7 @@
2626

2727
# plot cpu time against file size
2828
plt.clf()
29-
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['cpu_time'], label='sourmash index', marker='o')
29+
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['cpu_time'], label='sourmash index (1 thread)', marker='o')
3030
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['cpu_time'], label='cpp index (1 thread)', marker='o')
3131
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['cpu_time'], label='cpp index (128 threads)', marker='o')
3232

@@ -40,7 +40,7 @@
4040

4141
# plot peak memory usage against file size
4242
plt.clf()
43-
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['peak_memory_usage'], label='sourmash index', marker='o')
43+
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['peak_memory_usage'], label='sourmash index (1 thread)', marker='o')
4444
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['peak_memory_usage'], label='cpp index (1 thread)', marker='o')
4545
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['peak_memory_usage'], label='cpp index (128 threads)', marker='o')
4646

benchmark/plot_many_v_many.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
df_compare_without_index_30000 = df_compare_without_index[df_compare_without_index['query_filesize'] == 30000]
2222
df_multisearch_30000 = df_multisearch[df_multisearch['query_filesize'] == 30000]
2323

24-
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp)', marker='o')
25-
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) without index', marker='o')
24+
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp) by building index', marker='o')
25+
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) by loading index', marker='o')
2626
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
2727

2828
plt.xlabel('Reference list size')
@@ -36,8 +36,8 @@
3636

3737
# plot CPU time for query file size = 30000, use different reference file sizes in x-axis
3838
plt.clf()
39-
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['cpu_time'], label='compare (cpp)', marker='o')
40-
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) without index', marker='o')
39+
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['cpu_time'], label='compare (cpp) by building index', marker='o')
40+
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) by loading index', marker='o')
4141
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['cpu_time'], label='multisearch (sourmash)', marker='o')
4242

4343
plt.xlabel('Reference list size')
@@ -51,8 +51,8 @@
5151

5252
# plot peak memory usage for query file size = 30000, use different reference file sizes in x-axis
5353
plt.clf()
54-
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp)', marker='o')
55-
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) without index', marker='o')
54+
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp) by building index', marker='o')
55+
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) by loading index', marker='o')
5656
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
5757

5858
plt.xlabel('Reference list size')
@@ -70,8 +70,8 @@
7070
df_compare_without_index_30000 = df_compare_without_index[df_compare_without_index['ref_filesize'] == 30000]
7171
df_multisearch_30000 = df_multisearch[df_multisearch['ref_filesize'] == 30000]
7272

73-
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp)', marker='o')
74-
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) without index', marker='o')
73+
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp) by building index', marker='o')
74+
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) by loading index', marker='o')
7575
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
7676

7777
plt.xlabel('Query list size')
@@ -85,8 +85,8 @@
8585

8686
# plot peak memory usage for ref file size = 30000, use different query file sizes in x-axis
8787
plt.clf()
88-
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp)', marker='o')
89-
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) without index', marker='o')
88+
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp) by building index', marker='o')
89+
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) by loading index', marker='o')
9090
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
9191

9292
plt.xlabel('Query list size')
@@ -100,8 +100,8 @@
100100

101101
# plot CPU time for ref file size = 30000, use different query file sizes in x-axis
102102
plt.clf()
103-
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['cpu_time'], label='compare (cpp)', marker='o')
104-
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) without index', marker='o')
103+
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['cpu_time'], label='compare (cpp) by building index', marker='o')
104+
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) by loading index', marker='o')
105105
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['cpu_time'], label='multisearch (sourmash)', marker='o')
106106

107107
plt.xlabel('Query list size')

benchmark/plot_one_v_many.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
# col names: ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
1313

1414
# plot wall-clock time against reference file size
15-
plt.plot(df_compare['ref_filesize'], df_compare['wall_clock_time'], label='compare (cpp)', marker='o')
16-
plt.plot(df_multisearch['ref_filesize'], df_multisearch['wall_clock_time'], label='multisearch (sourmash)', marker='o')
17-
plt.plot(df_prefetch_lm['ref_filesize'], df_prefetch_lm['wall_clock_time'], label='prefetch-lm (cpp)', marker='o')
15+
plt.plot(df_compare['ref_filesize'], df_compare['wall_clock_time'], label='compare (cpp) (1 thread)', marker='o')
16+
plt.plot(df_multisearch['ref_filesize'], df_multisearch['wall_clock_time'], label='multisearch (sourmash) (1 thread)', marker='o')
17+
plt.plot(df_prefetch_lm['ref_filesize'], df_prefetch_lm['wall_clock_time'], label='prefetch-lm (cpp) (1 thread)', marker='o')
1818

1919
plt.xlabel('Reference list size')
2020
plt.ylabel('Wall-clock time (s)')
@@ -26,9 +26,9 @@
2626

2727
# plot peak memory usage against reference file size
2828
plt.clf()
29-
plt.plot(df_compare['ref_filesize'], df_compare['peak_memory_usage'], label='compare (cpp)', marker='o')
30-
plt.plot(df_multisearch['ref_filesize'], df_multisearch['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
31-
plt.plot(df_prefetch_lm['ref_filesize'], df_prefetch_lm['peak_memory_usage'], label='prefetch-lm (cpp)', marker='o')
29+
plt.plot(df_compare['ref_filesize'], df_compare['peak_memory_usage'], label='compare (cpp) (1 thread)', marker='o')
30+
plt.plot(df_multisearch['ref_filesize'], df_multisearch['peak_memory_usage'], label='multisearch (sourmash) (1 thread)', marker='o')
31+
plt.plot(df_prefetch_lm['ref_filesize'], df_prefetch_lm['peak_memory_usage'], label='prefetch-lm (cpp) (1 thread)', marker='o')
3232

3333
plt.xlabel('Reference list size')
3434
plt.ylabel('Peak memory usage (GB)')

benchmark/plot_this_vs_sourmash_prefetch.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
# col names: ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
1212

1313
# plot wall-clock time against reference file size
14-
plt.plot(df_this_prefetch['ref_filesize'], df_this_prefetch['wall_clock_time'], label='cpp prefetch', marker='o')
15-
plt.plot(df_sourmash_prefetch['ref_filesize'], df_sourmash_prefetch['wall_clock_time'], label='sourmash prefetch', marker='o')
14+
plt.plot(df_this_prefetch['ref_filesize'], df_this_prefetch['wall_clock_time'], label='cpp prefetch (1 thread)', marker='o')
15+
plt.plot(df_sourmash_prefetch['ref_filesize'], df_sourmash_prefetch['wall_clock_time'], label='sourmash prefetch (1 thread)', marker='o')
1616

1717
plt.xlabel('Reference list size')
1818
plt.ylabel('Wall-clock time (s)')
@@ -25,8 +25,8 @@
2525

2626
# plot peak memory usage against reference file size
2727
plt.clf()
28-
plt.plot(df_this_prefetch['ref_filesize'], df_this_prefetch['peak_memory_usage'], label='cpp prefetch', marker='o')
29-
plt.plot(df_sourmash_prefetch['ref_filesize'], df_sourmash_prefetch['peak_memory_usage'], label='sourmash prefetch', marker='o')
28+
plt.plot(df_this_prefetch['ref_filesize'], df_this_prefetch['peak_memory_usage'], label='cpp prefetch (1 thread)', marker='o')
29+
plt.plot(df_sourmash_prefetch['ref_filesize'], df_sourmash_prefetch['peak_memory_usage'], label='sourmash prefetch (1 thread)', marker='o')
3030

3131
plt.xlabel('Reference list size')
3232
plt.ylabel('Peak memory usage (GB)')
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)