KoslickiLab
diff --git a/‎.github/workflows/ci.yml
+16 b/‎.github/workflows/ci.yml
+16
diff --git a/‎benchmark/comparison_multisearch_vs_compare_many_vs_many.csv
-12 b/‎benchmark/comparison_multisearch_vs_compare_many_vs_many.csv
-12
diff --git a/‎benchmark/comparison_multisearch_vs_compare_one_vs_many.csv
-4 b/‎benchmark/comparison_multisearch_vs_compare_one_vs_many.csv
-4
diff --git a/‎benchmark/plot_many_v_many.py
+71 b/‎benchmark/plot_many_v_many.py
+71
diff --git a/‎benchmark/plot_one_v_many.py
+37 b/‎benchmark/plot_one_v_many.py
+37
diff --git a/‎benchmark/plots/benchmark_results_compare_many_v_many_peak_memory_usage.pdf
16.6 KB b/‎benchmark/plots/benchmark_results_compare_many_v_many_peak_memory_usage.pdf
16.6 KB
diff --git a/‎benchmark/plots/benchmark_results_compare_many_v_many_peak_memory_usage_query.pdf
16.3 KB b/‎benchmark/plots/benchmark_results_compare_many_v_many_peak_memory_usage_query.pdf
16.3 KB
diff --git a/‎benchmark/plots/benchmark_results_compare_many_v_many_wall_clock_time.pdf
15 KB b/‎benchmark/plots/benchmark_results_compare_many_v_many_wall_clock_time.pdf
15 KB
diff --git a/‎benchmark/plots/benchmark_results_compare_many_v_many_wall_clock_time_query.pdf
15.1 KB b/‎benchmark/plots/benchmark_results_compare_many_v_many_wall_clock_time_query.pdf
15.1 KB
diff --git a/‎benchmark/plots/benchmark_results_compare_one_v_many_peak_memory_usage.pdf
16 KB b/‎benchmark/plots/benchmark_results_compare_one_v_many_peak_memory_usage.pdf
16 KB
diff --git a/‎benchmark/plots/benchmark_results_compare_one_v_many_wall_clock_time.pdf
15.4 KB b/‎benchmark/plots/benchmark_results_compare_one_v_many_wall_clock_time.pdf
15.4 KB
@@ -80,3 +80,19 @@ jobs:
         mv test/output/archived_index_100.tar.gz test/data/archived_index_100.tar.gz
         bin/compare test/data/filelist_100.txt test/data/archived_index_100.tar.gz test/output/working_directory test/output/compare_100_v_100_tar_moved -c 0.0 -t 2 -n 500 -k 51
         python test/compare_multisearch_results.py test/data/multisearch_100_v_100 test/output/compare_100_v_100_tar_moved
+
+    # 10. Run prefetch and compare against sourmash prefetch (threshold=0)
+    - name: Run prefetch and compare against sourmash prefetch (threshold=0)
+      run: |
+        ./bin/prefetch test/data/query_sketch.sig test/output/index_100 test/output/this_prefetch_against_100 -b 0
+        ./bin/prefetch test/data/query_sketch.sig test/output/index_50 test/output/this_prefetch_against_50 -b 0
+        python test/compare_against_prefetch.py test/data/sourmash_prefetch_against_100 test/output/this_prefetch_against_100
+        python test/compare_against_prefetch.py test/data/sourmash_prefetch_against_50 test/output/this_prefetch_against_50
+
+    # 11. Run prefetch and compare against sourmash prefetch (threshold=2)
+    - name: Run prefetch and compare against sourmash prefetch (threshold=2)
+      run: |
+        ./bin/prefetch test/data/query_sketch.sig test/output/index_100 test/output/this_prefetch_against_100_threshold_2 -b 2
+        ./bin/prefetch test/data/query_sketch.sig test/output/index_50 test/output/this_prefetch_against_50_threshold_2 -b 2
+        python test/compare_against_prefetch.py test/data/sourmash_prefetch_against_100_threshold_2 test/output/this_prefetch_against_100_threshold_2
+        python test/compare_against_prefetch.py test/data/sourmash_prefetch_against_50_threshold_2 test/output/this_prefetch_against_50_threshold_2
@@ -0,0 +1,71 @@
+"""Plot many-vs-many benchmarks: compare (cpp) vs sourmash multisearch."""
+
+import os
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# set arial as the default font
+plt.rcParams['font.family'] = 'Arial'
+
+# List size held fixed on one side while the other side is swept.
+FIXED_SIZE = 50000
+
+
+def plot_metric(df_compare, df_multisearch, x_col, y_col, xlabel, ylabel,
+                title, out_path):
+    """Plot one metric of both tools against a list size and save the figure.
+
+    Args:
+        df_compare: benchmark rows of the cpp compare tool.
+        df_multisearch: benchmark rows of sourmash multisearch.
+        x_col: column for the x-axis; its values also become the ticks.
+        y_col: column for the y-axis.
+        xlabel, ylabel, title: axis labels and figure title.
+        out_path: output file; its directory is created when missing.
+    """
+    # Sort by x so a line never doubles back when the CSV rows are unordered.
+    df_compare = df_compare.sort_values(x_col)
+    df_multisearch = df_multisearch.sort_values(x_col)
+
+    # A fresh figure per plot, closed after saving, so no state is shared.
+    fig, ax = plt.subplots()
+    ax.plot(df_compare[x_col], df_compare[y_col], label='compare (cpp)', marker='o')
+    ax.plot(df_multisearch[x_col], df_multisearch[y_col], label='multisearch (sourmash)', marker='o')
+    ax.set_xlabel(xlabel)
+    ax.set_ylabel(ylabel)
+    ax.set_title(title)
+    ax.set_xticks(df_compare[x_col])
+    ax.legend()
+    ax.grid(linestyle='--', alpha=0.3)
+
+    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
+    fig.savefig(out_path)
+    plt.close(fig)
+
+
+# Load the data; col names: query_filesize,ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
+df_compare = pd.read_csv('benchmark_results_compare_many_v_many.csv')
+df_multisearch = pd.read_csv('benchmark_results_multisearch_many_v_many.csv')
+
+# (y column, y-axis label, title prefix) for each plotted metric
+METRICS = [
+    ('wall_clock_time', 'Wall-clock time (s)', 'Wall-clock time'),
+    ('peak_memory_usage', 'Peak memory usage (GB)', 'Peak memory usage'),
+]
+
+# (fixed column, swept column, x-axis label, title noun, file-name suffix)
+SWEEPS = [
+    ('query_filesize', 'ref_filesize', 'Reference list size', 'queries', ''),
+    ('ref_filesize', 'query_filesize', 'Query list size', 'references', '_query'),
+]
+
+for fixed_col, x_col, xlabel, fixed_what, suffix in SWEEPS:
+    compare_rows = df_compare[df_compare[fixed_col] == FIXED_SIZE]
+    multisearch_rows = df_multisearch[df_multisearch[fixed_col] == FIXED_SIZE]
+    for y_col, ylabel, title_prefix in METRICS:
+        plot_metric(
+            compare_rows, multisearch_rows, x_col, y_col, xlabel, ylabel,
+            title=f'{title_prefix} for {FIXED_SIZE // 1000}k {fixed_what}',
+            out_path=f'plots/benchmark_results_compare_many_v_many_{y_col}{suffix}.pdf',
+        )
@@ -0,0 +1,37 @@
+"""Plot one-vs-many benchmarks: compare (cpp) vs sourmash multisearch."""
+
+import os
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# set arial as the default font
+plt.rcParams['font.family'] = 'Arial'
+
+# Load the data, sorted by x so a line never doubles back on unordered rows.
+# col names: ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
+df_compare = pd.read_csv('benchmark_results_compare_one_v_many.csv').sort_values('ref_filesize')
+df_multisearch = pd.read_csv('benchmark_results_multisearch_one_v_many.csv').sort_values('ref_filesize')
+
+# savefig does not create a missing output directory.
+os.makedirs('plots', exist_ok=True)
+
+# (y column, y-axis label, title prefix) for each plotted metric
+METRICS = [
+    ('wall_clock_time', 'Wall-clock time (s)', 'Wall-clock time'),
+    ('peak_memory_usage', 'Peak memory usage (GB)', 'Peak memory usage'),
+]
+
+for y_col, ylabel, title_prefix in METRICS:
+    # A fresh figure per metric, closed after saving, so no state is shared.
+    fig, ax = plt.subplots()
+    ax.plot(df_compare['ref_filesize'], df_compare[y_col], label='compare (cpp)', marker='o')
+    ax.plot(df_multisearch['ref_filesize'], df_multisearch[y_col], label='multisearch (sourmash)', marker='o')
+    ax.set_xlabel('Reference list size')
+    ax.set_ylabel(ylabel)
+    ax.set_title(f'{title_prefix} for one query')
+    ax.set_xticks(df_compare['ref_filesize'])
+    ax.legend()
+    ax.grid(linestyle='--', alpha=0.3)
+    fig.savefig(f'plots/benchmark_results_compare_one_v_many_{y_col}.pdf')
+    plt.close(fig)