6 changes: 3 additions & 3 deletions benchmark/benchmark_index.py
@@ -5,7 +5,7 @@
    data_dir = "/scratch/mbr5797/fast_index_compare_data"
    filelists = ["filelist_10k", "filelist_20k", "filelist_30k", "filelist_40k", "filelist_50k"]
    index_names = ["index_10k", "index_20k", "index_30k", "index_40k", "index_50k"]
    output_filename = "index_benchmark_results.csv"
    output_filename = "benchmark/benchmark_results_index_1_thread.csv"
    filesizes = [10000, 20000, 30000, 40000, 50000]

    filesize_to_metrics = {}
@@ -15,8 +15,8 @@
        filelist_path = os.path.join(data_dir, filelist)
        index_path = os.path.join(data_dir, index_name)

        # command = bin/index filelist index_10k -t 128 -f
        command = f"bin/index {filelist_path} {index_path} -t 128 -f"
        # command = bin/index filelist index_10k -t num_threads -f
        command = f"bin/index {filelist_path} {index_path} -t 1 -f"
        print(command)
        metrics = run_command_and_record_time_memory(command)
        if metrics:
10 changes: 10 additions & 0 deletions benchmark/benchmark_results_compare_many_v_many_no_index.csv
@@ -0,0 +1,10 @@
query_filesize,ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,10000,204.47,12.57,2.415107727
10000,20000,380.01,24.37,4.331439972
10000,30000,545.8,36.85,6.556518555
20000,10000,302.33,14.6,3.080425262
20000,20000,547.52,26.98,5.377666473
20000,30000,816.97,43.64,7.965732574
30000,10000,403.61,16.38,3.748374939
30000,20000,736.66,31.35,6.426616669
30000,30000,1043.56,47.25,9.372467041
10 changes: 5 additions & 5 deletions benchmark/benchmark_results_compare_one_v_many.csv
@@ -1,6 +1,6 @@
ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,6.05,4.88,1.4627342224121094
20000,12.87,8.21,2.6674842834472656
30000,19.36,11.51,3.665019989013672
40000,28.41,15.75,4.927120208740234
50000,32.76,18.09,5.807842254638672
10000,4.14,5.49,1.4588813781738281
20000,8.29,11.23,2.6649627685546875
30000,11.61,15.14,3.6682586669921875
40000,15.52,20.81,4.928775787353516
50000,18.86,24.97,5.815422058105469
6 changes: 6 additions & 0 deletions benchmark/benchmark_results_index_1_thread.csv
@@ -0,0 +1,6 @@
filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,33.17,39.2,1.7743682861328125
20000,78.25,85.62,3.2980804443359375
30000,108.39,120.13,4.630786895751953
40000,146.96,245.95,6.2045135498046875
50000,190.05,216.12,7.409183502197266
10 changes: 5 additions & 5 deletions benchmark/benchmark_results_multisearch_one_v_many.csv
@@ -1,6 +1,6 @@
ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,31.48,18.79,0.6686553955078125
20000,48.81,36.86,1.2401275634765625
30000,65.18,54.9,1.8105087280273438
40000,90.81,82.46000000000001,2.385051727294922
50000,127.87,120.25,2.9600563049316406
10000,37.02,24.78,0.6694526672363281
20000,53.23,49.53,1.2409553527832031
30000,82.19,73.13,1.8104515075683594
40000,106.16,98.03,2.3841896057128906
50000,127.68,121.55,2.960651397705078
6 changes: 6 additions & 0 deletions benchmark/benchmark_results_sourmash_index.csv
@@ -0,0 +1,6 @@
filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,68.28,68.57,1.3034210205078125
20000,134.9,133.14,1.8972129821777344
30000,195.83,205.06,2.7955970764160156
40000,269.48,276.15,3.690673828125
50000,348.36,360.26,4.592220306396484
6 changes: 6 additions & 0 deletions benchmark/benchmark_results_sourmash_prefetch.csv
@@ -0,0 +1,6 @@
ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,33.38,19.81,0.748443603515625
20000,62.05,48.86,1.2132568359375
30000,74.46,68.56,1.7856864929199219
40000,96.94,90.77,2.3553733825683594
50000,132.8,120.42,2.9264678955078125
6 changes: 6 additions & 0 deletions benchmark/benchmark_results_this_prefetch.csv
@@ -0,0 +1,6 @@
ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,2.28,2.47,0.010650634765625
20000,4.77,5.16,0.01788330078125
30000,6.69,7.25,0.025115966796875
40000,8.46,9.03,0.032421112060546875
50000,10.37,10.84,0.03954315185546875
36 changes: 36 additions & 0 deletions benchmark/benchmark_sourmash_index.py
@@ -0,0 +1,36 @@
import os
from utils import run_command_and_record_time_memory

if __name__ == "__main__":
    data_dir = "/scratch/mbr5797/fast_index_compare_data"
    filelists = ["filelist_10k", "filelist_20k", "filelist_30k", "filelist_40k", "filelist_50k"]
    index_names = ["sourmash_sbt_10k", "sourmash_sbt_20k", "sourmash_sbt_30k", "sourmash_sbt_40k", "sourmash_sbt_50k"]
    output_filename = "benchmark/benchmark_results_sourmash_index.csv"
    filesizes = [10000, 20000, 30000, 40000, 50000]

    filesize_to_metrics = {}

    for filelist, index_name, filesize in zip(filelists, index_names, filesizes):
        print(f"Processing {filelist}")
        filelist_path = os.path.join(data_dir, filelist)
        index_path = os.path.join(data_dir, index_name)

        # command = sourmash index /scratch/mbr5797/fast_index_compare_data/sourmash_sbt_10k --from-file /scratch/mbr5797/fast_index_compare_data/filelist_10k
        command = f"sourmash index {index_path} --from-file {filelist_path}"
        print(command)

        metrics = run_command_and_record_time_memory(command)
        if metrics:
            filesize_to_metrics[filesize] = metrics
        else:
            print(f"Error processing {filelist}")

    print(filesize_to_metrics)

    # Write results to CSV, headers should have no spaces or brackets
    with open(output_filename, 'w') as f:
        f.write("filesize,cpu_time,wall_clock_time,peak_memory_usage\n")
        for filesize, metrics in filesize_to_metrics.items():
            f.write(f"{filesize},{metrics['cpu_time']},{metrics['wall_clock_time']},{metrics['peak_memory_usage']}\n")


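Note: the benchmark scripts in this PR import run_command_and_record_time_memory from benchmark/utils.py, which is not part of this diff. A minimal sketch of what such a helper could look like, assuming it wraps the command in GNU /usr/bin/time -v and reports CPU and wall-clock time in seconds and peak memory in GB; the parsing details and the availability of GNU time are assumptions, not the repository's actual implementation:

import re
import subprocess


def run_command_and_record_time_memory(command):
    """Hypothetical sketch: run `command` under GNU /usr/bin/time -v and return
    a dict with cpu_time (s), wall_clock_time (s) and peak_memory_usage (GB),
    or None if the command exits non-zero."""
    result = subprocess.run(f"/usr/bin/time -v {command}", shell=True,
                            capture_output=True, text=True)
    if result.returncode != 0:
        return None
    err = result.stderr  # GNU time writes its report to stderr
    user = float(re.search(r"User time \(seconds\): ([\d.]+)", err).group(1))
    system = float(re.search(r"System time \(seconds\): ([\d.]+)", err).group(1))
    elapsed = re.search(r"Elapsed \(wall clock\) time.*: (.+)", err).group(1)
    parts = [float(x) for x in elapsed.split(":")]  # formatted as h:mm:ss or m:ss
    wall = sum(x * 60 ** i for i, x in enumerate(reversed(parts)))
    peak_kb = int(re.search(r"Maximum resident set size \(kbytes\): (\d+)", err).group(1))
    return {
        "cpu_time": round(user + system, 2),
        "wall_clock_time": round(wall, 2),
        "peak_memory_usage": peak_kb / 1024 ** 2,  # kB -> GB
    }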
38 changes: 38 additions & 0 deletions benchmark/benchmark_sourmash_prefetch.py
@@ -0,0 +1,38 @@
import os
from utils import run_command_and_record_time_memory

if __name__ == "__main__":
    data_dir = "/scratch/mbr5797/fast_index_compare_data"
    index_names = ["sourmash_sbt_10k.sbt.zip", "sourmash_sbt_20k.sbt.zip", "sourmash_sbt_30k.sbt.zip", "sourmash_sbt_40k.sbt.zip", "sourmash_sbt_50k.sbt.zip"]
    benchmark_filename = "benchmark/benchmark_results_sourmash_prefetch.csv"
    ref_sizes = [10000, 20000, 30000, 40000, 50000]

    # this is the query sketch in filelist_1, used in the other benchmarks
    query_sketch_name = "/scratch/mbr5797/fast_index_compare_data/signatures_gtdb_rs207_genomic/718882c2dd33e2711f95f8ed2a413fde.sig"

    ref_size_to_metrics = {}

    for index_name, ref_size in zip(index_names, ref_sizes):
        print(f"Processing {query_sketch_name} vs {index_name}")

        index_path = os.path.join(data_dir, index_name)
        output_filename = f"sourmash_prefetch_results_against_{ref_size}.csv"
        output_filename = os.path.join(data_dir, output_filename)

        # cmd: sourmash prefetch query_filename sbt_name --threshold_bp 0 -o output_filename
        command = f"sourmash prefetch {query_sketch_name} {index_path} --threshold-bp 0 -o {output_filename}"
        print(command)

        metrics = run_command_and_record_time_memory(command)
        if metrics:
            ref_size_to_metrics[ref_size] = metrics
        else:
            print(f"Error processing {query_sketch_name} vs {index_name}")

    print(ref_size_to_metrics)

    with open(benchmark_filename, "w") as f:
        print('Writing to: ' + benchmark_filename)
        f.write("ref_filesize,cpu_time,wall_clock_time,peak_memory_usage\n")
        for ref_size, metrics in ref_size_to_metrics.items():
            f.write(f"{ref_size},{metrics['cpu_time']},{metrics['wall_clock_time']},{metrics['peak_memory_usage']}\n")
38 changes: 38 additions & 0 deletions benchmark/benchmark_this_prefetch.py
@@ -0,0 +1,38 @@
import os
from utils import run_command_and_record_time_memory

if __name__ == "__main__":
    data_dir = "/scratch/mbr5797/fast_index_compare_data"
    index_names = ["index_10k", "index_20k", "index_30k", "index_40k", "index_50k"]
    benchmark_filename = "benchmark/benchmark_results_this_prefetch.csv"
    ref_sizes = [10000, 20000, 30000, 40000, 50000]

    # this is the query sketch in filelist_1, used in the other benchmarks
    query_sketch_name = "/scratch/mbr5797/fast_index_compare_data/signatures_gtdb_rs207_genomic/718882c2dd33e2711f95f8ed2a413fde.sig"

    ref_size_to_metrics = {}

    for index_name, ref_size in zip(index_names, ref_sizes):
        print(f"Processing {query_sketch_name} vs {index_name}")

        index_path = os.path.join(data_dir, index_name)
        output_filename = f"prefetch_results_against_{ref_size}.csv"
        output_filename = os.path.join(data_dir, output_filename)

        # cmd: bin/prefetch query_path ref_index_path output_filename -b 0
        command = f"bin/prefetch {query_sketch_name} {index_path} {output_filename} -b 0"
        print(command)

        metrics = run_command_and_record_time_memory(command)
        if metrics:
            ref_size_to_metrics[ref_size] = metrics
        else:
            print(f"Error processing {query_sketch_name} vs {index_name}")

    print(ref_size_to_metrics)

    with open(benchmark_filename, "w") as f:
        print('Writing to: ' + benchmark_filename)
        f.write("ref_filesize,cpu_time,wall_clock_time,peak_memory_usage\n")
        for ref_size, metrics in ref_size_to_metrics.items():
            f.write(f"{ref_size},{metrics['cpu_time']},{metrics['wall_clock_time']},{metrics['peak_memory_usage']}\n")
53 changes: 53 additions & 0 deletions benchmark/plot_indexing_time_memory.py
@@ -0,0 +1,53 @@
import pandas as pd
import matplotlib.pyplot as plt

# set arial as the default font
plt.rcParams['font.family'] = 'Arial'

# Load the data
df_sourmash_index = pd.read_csv('benchmark_results_sourmash_index.csv')
df_cpp_index_one_thread = pd.read_csv('benchmark_results_index_1_thread.csv')
df_cpp_index_128_threads = pd.read_csv('benchmark_results_index_128_threads.csv')

# col names: filesize,cpu_time,wall_clock_time,peak_memory_usage

# plot wall-clock time against file size
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['wall_clock_time'], label='sourmash index', marker='o')
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['wall_clock_time'], label='cpp index (1 thread)', marker='o')
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['wall_clock_time'], label='cpp index (128 threads)', marker='o')

plt.xlabel('Number of references')
plt.ylabel('Wall-clock time (s) to build index')
plt.title('Wall-clock time to build indices for different numbers of references')
plt.xticks(df_sourmash_index['filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_indexing_wall_clock_time.pdf')

# plot cpu time against file size
plt.clf()
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['cpu_time'], label='sourmash index', marker='o')
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['cpu_time'], label='cpp index (1 thread)', marker='o')
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['cpu_time'], label='cpp index (128 threads)', marker='o')

plt.xlabel('Number of references')
plt.ylabel('CPU time (s) to build index')
plt.title('CPU time to build indices for different numbers of references')
plt.xticks(df_sourmash_index['filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_indexing_cpu_time.pdf')

# plot peak memory usage against file size
plt.clf()
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['peak_memory_usage'], label='sourmash index', marker='o')
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['peak_memory_usage'], label='cpp index (1 thread)', marker='o')
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['peak_memory_usage'], label='cpp index (128 threads)', marker='o')

plt.xlabel('Number of references')
plt.ylabel('Peak memory usage (GB) to build index')
plt.title('Peak memory usage to build indices for different numbers of references')
plt.xticks(df_sourmash_index['filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_indexing_peak_memory_usage.pdf')
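The three plotting blocks above differ only in which metric column they draw and in the axis label; an equivalent, less repetitive sketch that reads the same CSVs and writes the same PDFs (an illustration, not the committed script):

import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'Arial'

df_sourmash = pd.read_csv('benchmark_results_sourmash_index.csv')
df_cpp_1 = pd.read_csv('benchmark_results_index_1_thread.csv')
df_cpp_128 = pd.read_csv('benchmark_results_index_128_threads.csv')

# (CSV column, y-axis label, title prefix, output suffix) for each figure
metrics = [
    ('wall_clock_time', 'Wall-clock time (s) to build index', 'Wall-clock time', 'wall_clock_time'),
    ('cpu_time', 'CPU time (s) to build index', 'CPU time', 'cpu_time'),
    ('peak_memory_usage', 'Peak memory usage (GB) to build index', 'Peak memory usage', 'peak_memory_usage'),
]

for column, ylabel, title, suffix in metrics:
    plt.clf()
    for df, label in [(df_sourmash, 'sourmash index'),
                      (df_cpp_1, 'cpp index (1 thread)'),
                      (df_cpp_128, 'cpp index (128 threads)')]:
        plt.plot(df['filesize'], df[column], label=label, marker='o')
    plt.xlabel('Number of references')
    plt.ylabel(ylabel)
    plt.title(f'{title} to build indices for different numbers of references')
    plt.xticks(df_sourmash['filesize'])
    plt.legend()
    plt.grid(linestyle='--', alpha=0.3)
    plt.savefig(f'plots/benchmark_results_indexing_{suffix}.pdf')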
90 changes: 66 additions & 24 deletions benchmark/plot_many_v_many.py
@@ -6,66 +6,108 @@

# Load the data
df_compare = pd.read_csv('benchmark_results_compare_many_v_many.csv')
df_compare_without_index = pd.read_csv('benchmark_results_compare_many_v_many_no_index.csv')
df_multisearch = pd.read_csv('benchmark_results_multisearch_many_v_many.csv')

# drop rows with query_filesize > 30000, ref_filesize > 30000
df_compare = df_compare[(df_compare['query_filesize'] <= 30000) & (df_compare['ref_filesize'] <= 30000)]
df_compare_without_index = df_compare_without_index[(df_compare_without_index['query_filesize'] <= 30000) & (df_compare_without_index['ref_filesize'] <= 30000)]
df_multisearch = df_multisearch[(df_multisearch['query_filesize'] <= 30000) & (df_multisearch['ref_filesize'] <= 30000)]

# col names: query_filesize,ref_filesize,cpu_time,wall_clock_time,peak_memory_usage

# plot wall-clock time for query file size = 50000, use different reference file sizes in x-axis
df_compare_50000 = df_compare[df_compare['query_filesize'] == 50000]
df_multisearch_50000 = df_multisearch[df_multisearch['query_filesize'] == 50000]
# plot wall-clock time for query file size = 30000, use different reference file sizes in x-axis
df_compare_30000 = df_compare[df_compare['query_filesize'] == 30000]
df_compare_without_index_30000 = df_compare_without_index[df_compare_without_index['query_filesize'] == 30000]
df_multisearch_30000 = df_multisearch[df_multisearch['query_filesize'] == 30000]

plt.plot(df_compare_50000['ref_filesize'], df_compare_50000['wall_clock_time'], label='compare (cpp)', marker='o')
plt.plot(df_multisearch_50000['ref_filesize'], df_multisearch_50000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['wall_clock_time'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Reference list size')
plt.ylabel('Wall-clock time (s)')
plt.title('Wall-clock time for 50k queries')
plt.xticks(df_compare_50000['ref_filesize'])
plt.title('Wall-clock time for 30k queries')
plt.xticks(df_compare_30000['ref_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_wall_clock_time.pdf')


# plot peak memory usage for query file size = 50000, use different reference file sizes in x-axis
# plot CPU time for query file size = 30000, use different reference file sizes in x-axis
plt.clf()
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['cpu_time'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['cpu_time'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Reference list size')
plt.ylabel('CPU time (s)')
plt.title('CPU time for 30k queries')
plt.xticks(df_compare_30000['ref_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_cpu_time.pdf')


# plot peak memory usage for query file size = 30000, use different reference file sizes in x-axis
plt.clf()
plt.plot(df_compare_50000['ref_filesize'], df_compare_50000['peak_memory_usage'], label='compare (cpp)', marker='o')
plt.plot(df_multisearch_50000['ref_filesize'], df_multisearch_50000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Reference list size')
plt.ylabel('Peak memory usage (GB)')
plt.title('Peak memory usage for 50k queries')
plt.xticks(df_compare_50000['ref_filesize'])
plt.title('Peak memory usage for 30k queries')
plt.xticks(df_compare_30000['ref_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_peak_memory_usage.pdf')


# plot wall-clock time for ref file size = 50000, use different query file sizes in x-axis
# plot wall-clock time for ref file size = 30000, use different query file sizes in x-axis
plt.clf()
df_compare_50000 = df_compare[df_compare['ref_filesize'] == 50000]
df_multisearch_50000 = df_multisearch[df_multisearch['ref_filesize'] == 50000]
df_compare_30000 = df_compare[df_compare['ref_filesize'] == 30000]
df_compare_without_index_30000 = df_compare_without_index[df_compare_without_index['ref_filesize'] == 30000]
df_multisearch_30000 = df_multisearch[df_multisearch['ref_filesize'] == 30000]

plt.plot(df_compare_50000['query_filesize'], df_compare_50000['wall_clock_time'], label='compare (cpp)', marker='o')
plt.plot(df_multisearch_50000['query_filesize'], df_multisearch_50000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['wall_clock_time'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Query list size')
plt.ylabel('Wall-clock time (s)')
plt.title('Wall-clock time for 50k references')
plt.xticks(df_compare_50000['query_filesize'])
plt.title('Wall-clock time for 30k references')
plt.xticks(df_compare_30000['query_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_wall_clock_time_query.pdf')


# plot peak memory usage for ref file size = 50000, use different query file sizes in x-axis
# plot peak memory usage for ref file size = 30000, use different query file sizes in x-axis
plt.clf()
plt.plot(df_compare_50000['query_filesize'], df_compare_50000['peak_memory_usage'], label='compare (cpp)', marker='o')
plt.plot(df_multisearch_50000['query_filesize'], df_multisearch_50000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Query list size')
plt.ylabel('Peak memory usage (GB)')
plt.title('Peak memory usage for 50k references')
plt.xticks(df_compare_50000['query_filesize'])
plt.title('Peak memory usage for 30k references')
plt.xticks(df_compare_30000['query_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_peak_memory_usage_query.pdf')


# plot CPU time for ref file size = 30000, use different query file sizes in x-axis
plt.clf()
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['cpu_time'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['cpu_time'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Query list size')
plt.ylabel('CPU time (s)')
plt.title('CPU time for 30k references')
plt.xticks(df_compare_30000['query_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_cpu_time_query.pdf')