Skip to content

Commit dd5559c

Browse files
authored
Merge pull request #32 from mahmudhera/main
implementing low-memory prefetch for one-vs-many comparisons (resolves #27)
2 parents 20894a4 + 5fc7908 commit dd5559c

21 files changed

+484
-198
lines changed

.github/workflows/ci.yml

+16
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,19 @@ jobs:
8080
mv test/output/archived_index_100.tar.gz test/data/archived_index_100.tar.gz
8181
bin/compare test/data/filelist_100.txt test/data/archived_index_100.tar.gz test/output/working_directory test/output/compare_100_v_100_tar_moved -c 0.0 -t 2 -n 500 -k 51
8282
python test/compare_multisearch_results.py test/data/multisearch_100_v_100 test/output/compare_100_v_100_tar_moved
83+
84+
# 10. Run prefetch (threshold=0) and compare against sourmash prefetch
85+
- name: Run prefetch and compare against sourmash prefetch (threshold=0)
86+
run: |
87+
./bin/prefetch test/data/query_sketch.sig test/output/index_100 test/output/this_prefetch_against_100 -b 0
88+
./bin/prefetch test/data/query_sketch.sig test/output/index_50 test/output/this_prefetch_against_50 -b 0
89+
python test/compare_against_prefetch.py test/data/sourmash_prefetch_against_100 test/output/this_prefetch_against_100
90+
python test/compare_against_prefetch.py test/data/sourmash_prefetch_against_50 test/output/this_prefetch_against_50
91+
92+
# 11. Run prefetch (threshold=2) and compare against sourmash prefetch
93+
- name: Run prefetch and compare against sourmash prefetch (threshold=2)
94+
run: |
95+
./bin/prefetch test/data/query_sketch.sig test/output/index_100 test/output/this_prefetch_against_100_threshold_2 -b 2
96+
./bin/prefetch test/data/query_sketch.sig test/output/index_50 test/output/this_prefetch_against_50_threshold_2 -b 2
97+
python test/compare_against_prefetch.py test/data/sourmash_prefetch_against_100_threshold_2 test/output/this_prefetch_against_100_threshold_2
98+
python test/compare_against_prefetch.py test/data/sourmash_prefetch_against_50_threshold_2 test/output/this_prefetch_against_50_threshold_2

benchmark/comparison_multisearch_vs_compare_many_vs_many.csv

-12
This file was deleted.

benchmark/comparison_multisearch_vs_compare_one_vs_many.csv

-4
This file was deleted.

benchmark/plot_many_v_many.py

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import pandas as pd
2+
import matplotlib.pyplot as plt
3+
4+
# set arial as the default font
5+
plt.rcParams['font.family'] = 'Arial'
6+
7+
# Load the data
8+
df_compare = pd.read_csv('benchmark_results_compare_many_v_many.csv')
9+
df_multisearch = pd.read_csv('benchmark_results_multisearch_many_v_many.csv')
10+
11+
# col names: query_filesize,ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
12+
13+
# Plot wall-clock time for query file size = 50000, with the reference file size on the x-axis
14+
df_compare_50000 = df_compare[df_compare['query_filesize'] == 50000]
15+
df_multisearch_50000 = df_multisearch[df_multisearch['query_filesize'] == 50000]
16+
17+
plt.plot(df_compare_50000['ref_filesize'], df_compare_50000['wall_clock_time'], label='compare (cpp)', marker='o')
18+
plt.plot(df_multisearch_50000['ref_filesize'], df_multisearch_50000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
19+
20+
plt.xlabel('Reference list size')
21+
plt.ylabel('Wall-clock time (s)')
22+
plt.title('Wall-clock time for 50k queries')
23+
plt.xticks(df_compare_50000['ref_filesize'])
24+
plt.legend()
25+
plt.grid(linestyle='--', alpha=0.3)
26+
plt.savefig('plots/benchmark_results_compare_many_v_many_wall_clock_time.pdf')
27+
28+
29+
# Plot peak memory usage for query file size = 50000, with the reference file size on the x-axis
30+
plt.clf()
31+
plt.plot(df_compare_50000['ref_filesize'], df_compare_50000['peak_memory_usage'], label='compare (cpp)', marker='o')
32+
plt.plot(df_multisearch_50000['ref_filesize'], df_multisearch_50000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
33+
34+
plt.xlabel('Reference list size')
35+
plt.ylabel('Peak memory usage (GB)')
36+
plt.title('Peak memory usage for 50k queries')
37+
plt.xticks(df_compare_50000['ref_filesize'])
38+
plt.legend()
39+
plt.grid(linestyle='--', alpha=0.3)
40+
plt.savefig('plots/benchmark_results_compare_many_v_many_peak_memory_usage.pdf')
41+
42+
43+
# Plot wall-clock time for reference file size = 50000, with the query file size on the x-axis
44+
plt.clf()
45+
df_compare_50000 = df_compare[df_compare['ref_filesize'] == 50000]
46+
df_multisearch_50000 = df_multisearch[df_multisearch['ref_filesize'] == 50000]
47+
48+
plt.plot(df_compare_50000['query_filesize'], df_compare_50000['wall_clock_time'], label='compare (cpp)', marker='o')
49+
plt.plot(df_multisearch_50000['query_filesize'], df_multisearch_50000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
50+
51+
plt.xlabel('Query list size')
52+
plt.ylabel('Wall-clock time (s)')
53+
plt.title('Wall-clock time for 50k references')
54+
plt.xticks(df_compare_50000['query_filesize'])
55+
plt.legend()
56+
plt.grid(linestyle='--', alpha=0.3)
57+
plt.savefig('plots/benchmark_results_compare_many_v_many_wall_clock_time_query.pdf')
58+
59+
60+
# Plot peak memory usage for reference file size = 50000, with the query file size on the x-axis
61+
plt.clf()
62+
plt.plot(df_compare_50000['query_filesize'], df_compare_50000['peak_memory_usage'], label='compare (cpp)', marker='o')
63+
plt.plot(df_multisearch_50000['query_filesize'], df_multisearch_50000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
64+
65+
plt.xlabel('Query list size')
66+
plt.ylabel('Peak memory usage (GB)')
67+
plt.title('Peak memory usage for 50k references')
68+
plt.xticks(df_compare_50000['query_filesize'])
69+
plt.legend()
70+
plt.grid(linestyle='--', alpha=0.3)
71+
plt.savefig('plots/benchmark_results_compare_many_v_many_peak_memory_usage_query.pdf')

benchmark/plot_one_v_many.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import pandas as pd
2+
import matplotlib.pyplot as plt
3+
4+
# set arial as the default font
5+
plt.rcParams['font.family'] = 'Arial'
6+
7+
# Load the data
8+
df_compare = pd.read_csv('benchmark_results_compare_one_v_many.csv')
9+
df_multisearch = pd.read_csv('benchmark_results_multisearch_one_v_many.csv')
10+
11+
# col names: ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
12+
13+
# plot wall-clock time against reference file size
14+
plt.plot(df_compare['ref_filesize'], df_compare['wall_clock_time'], label='compare (cpp)', marker='o')
15+
plt.plot(df_multisearch['ref_filesize'], df_multisearch['wall_clock_time'], label='multisearch (sourmash)', marker='o')
16+
17+
plt.xlabel('Reference list size')
18+
plt.ylabel('Wall-clock time (s)')
19+
plt.title('Wall-clock time for one query')
20+
plt.xticks(df_compare['ref_filesize'])
21+
plt.legend()
22+
plt.grid(linestyle='--', alpha=0.3)
23+
plt.savefig('plots/benchmark_results_compare_one_v_many_wall_clock_time.pdf')
24+
25+
# plot peak memory usage against reference file size
26+
plt.clf()
27+
plt.plot(df_compare['ref_filesize'], df_compare['peak_memory_usage'], label='compare (cpp)', marker='o')
28+
plt.plot(df_multisearch['ref_filesize'], df_multisearch['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
29+
30+
plt.xlabel('Reference list size')
31+
plt.ylabel('Peak memory usage (GB)')
32+
plt.title('Peak memory usage for one query')
33+
plt.xticks(df_compare['ref_filesize'])
34+
plt.legend()
35+
plt.grid(linestyle='--', alpha=0.3)
36+
37+
plt.savefig('plots/benchmark_results_compare_one_v_many_peak_memory_usage.pdf')
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)