6 changes: 3 additions & 3 deletions benchmark/benchmark_index.py
@@ -5,7 +5,7 @@
    data_dir = "/scratch/mbr5797/fast_index_compare_data"
    filelists = ["filelist_10k", "filelist_20k", "filelist_30k", "filelist_40k", "filelist_50k"]
    index_names = ["index_10k", "index_20k", "index_30k", "index_40k", "index_50k"]
    output_filename = "index_benchmark_results.csv"
    output_filename = "benchmark/benchmark_results_index_1_thread.csv"
    filesizes = [10000, 20000, 30000, 40000, 50000]

    filesize_to_metrics = {}
@@ -15,8 +15,8 @@
        filelist_path = os.path.join(data_dir, filelist)
        index_path = os.path.join(data_dir, index_name)

        # command = bin/index filelist index_10k -t 128 -f
        command = f"bin/index {filelist_path} {index_path} -t 128 -f"
        # command = bin/index filelist index_10k -t num_threads -f
        command = f"bin/index {filelist_path} {index_path} -t 1 -f"
        print(command)
        metrics = run_command_and_record_time_memory(command)
        if metrics:
10 changes: 10 additions & 0 deletions benchmark/benchmark_results_compare_many_v_many_no_index.csv
@@ -0,0 +1,10 @@
query_filesize,ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,10000,204.47,12.57,2.415107727
10000,20000,380.01,24.37,4.331439972
10000,30000,545.8,36.85,6.556518555
20000,10000,302.33,14.6,3.080425262
20000,20000,547.52,26.98,5.377666473
20000,30000,816.97,43.64,7.965732574
30000,10000,403.61,16.38,3.748374939
30000,20000,736.66,31.35,6.426616669
30000,30000,1043.56,47.25,9.372467041
10 changes: 5 additions & 5 deletions benchmark/benchmark_results_compare_one_v_many.csv
@@ -1,6 +1,6 @@
ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,6.05,4.88,1.4627342224121094
20000,12.87,8.21,2.6674842834472656
30000,19.36,11.51,3.665019989013672
40000,28.41,15.75,4.927120208740234
50000,32.76,18.09,5.807842254638672
10000,4.14,5.49,1.4588813781738281
20000,8.29,11.23,2.6649627685546875
30000,11.61,15.14,3.6682586669921875
40000,15.52,20.81,4.928775787353516
50000,18.86,24.97,5.815422058105469
6 changes: 6 additions & 0 deletions benchmark/benchmark_results_index_1_thread.csv
@@ -0,0 +1,6 @@
filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,33.17,39.2,1.7743682861328125
20000,78.25,85.62,3.2980804443359375
30000,108.39,120.13,4.630786895751953
40000,146.96,245.95,6.2045135498046875
50000,190.05,216.12,7.409183502197266
10 changes: 5 additions & 5 deletions benchmark/benchmark_results_multisearch_one_v_many.csv
@@ -1,6 +1,6 @@
ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,31.48,18.79,0.6686553955078125
20000,48.81,36.86,1.2401275634765625
30000,65.18,54.9,1.8105087280273438
40000,90.81,82.46000000000001,2.385051727294922
50000,127.87,120.25,2.9600563049316406
10000,37.02,24.78,0.6694526672363281
20000,53.23,49.53,1.2409553527832031
30000,82.19,73.13,1.8104515075683594
40000,106.16,98.03,2.3841896057128906
50000,127.68,121.55,2.960651397705078
6 changes: 6 additions & 0 deletions benchmark/benchmark_results_sourmash_index.csv
@@ -0,0 +1,6 @@
filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,68.28,68.57,1.3034210205078125
20000,134.9,133.14,1.8972129821777344
30000,195.83,205.06,2.7955970764160156
40000,269.48,276.15,3.690673828125
50000,348.36,360.26,4.592220306396484
6 changes: 6 additions & 0 deletions benchmark/benchmark_results_sourmash_prefetch.csv
@@ -0,0 +1,6 @@
ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,33.38,19.81,0.748443603515625
20000,62.05,48.86,1.2132568359375
30000,74.46,68.56,1.7856864929199219
40000,96.94,90.77,2.3553733825683594
50000,132.8,120.42,2.9264678955078125
6 changes: 6 additions & 0 deletions benchmark/benchmark_results_this_prefetch.csv
@@ -0,0 +1,6 @@
ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
10000,2.28,2.47,0.010650634765625
20000,4.77,5.16,0.01788330078125
30000,6.69,7.25,0.025115966796875
40000,8.46,9.03,0.032421112060546875
50000,10.37,10.84,0.03954315185546875
36 changes: 36 additions & 0 deletions benchmark/benchmark_sourmash_index.py
@@ -0,0 +1,36 @@
import os
from utils import run_command_and_record_time_memory

if __name__ == "__main__":
    data_dir = "/scratch/mbr5797/fast_index_compare_data"
    filelists = ["filelist_10k", "filelist_20k", "filelist_30k", "filelist_40k", "filelist_50k"]
    index_names = ["sourmash_sbt_10k", "sourmash_sbt_20k", "sourmash_sbt_30k", "sourmash_sbt_40k", "sourmash_sbt_50k"]
    output_filename = "benchmark/benchmark_results_sourmash_index.csv"
    filesizes = [10000, 20000, 30000, 40000, 50000]

    filesize_to_metrics = {}

    for filelist, index_name, filesize in zip(filelists, index_names, filesizes):
        print(f"Processing {filelist}")
        filelist_path = os.path.join(data_dir, filelist)
        index_path = os.path.join(data_dir, index_name)

        # command = sourmash index /scratch/mbr5797/fast_index_compare_data/sourmash_sbt_10k --from-file /scratch/mbr5797/fast_index_compare_data/filelist_10k
        command = f"sourmash index {index_path} --from-file {filelist_path}"
        print(command)

        metrics = run_command_and_record_time_memory(command)
        if metrics:
            filesize_to_metrics[filesize] = metrics
        else:
            print(f"Error processing {filelist}")

    print(filesize_to_metrics)

    # Write results to CSV, headers should have no spaces or brackets
    with open(output_filename, 'w') as f:
        f.write("filesize,cpu_time,wall_clock_time,peak_memory_usage\n")
        for filesize, metrics in filesize_to_metrics.items():
            f.write(f"{filesize},{metrics['cpu_time']},{metrics['wall_clock_time']},{metrics['peak_memory_usage']}\n")


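Note: the benchmark scripts in this PR import run_command_and_record_time_memory from benchmark/utils.py, which is not part of this diff. A minimal sketch of what such a helper could look like, assuming it wraps the command in GNU /usr/bin/time -v and reports CPU and wall-clock time in seconds and peak memory in GB; the parsing details and the availability of GNU time are assumptions, not the repository's actual implementation:

import re
import subprocess


def run_command_and_record_time_memory(command):
    """Hypothetical sketch: run `command` under GNU /usr/bin/time -v and return
    a dict with cpu_time (s), wall_clock_time (s) and peak_memory_usage (GB),
    or None if the command exits non-zero."""
    result = subprocess.run(f"/usr/bin/time -v {command}", shell=True,
                            capture_output=True, text=True)
    if result.returncode != 0:
        return None
    err = result.stderr  # GNU time writes its report to stderr
    user = float(re.search(r"User time \(seconds\): ([\d.]+)", err).group(1))
    system = float(re.search(r"System time \(seconds\): ([\d.]+)", err).group(1))
    elapsed = re.search(r"Elapsed \(wall clock\) time.*: (.+)", err).group(1)
    parts = [float(x) for x in elapsed.split(":")]  # formatted as h:mm:ss or m:ss
    wall = sum(x * 60 ** i for i, x in enumerate(reversed(parts)))
    peak_kb = int(re.search(r"Maximum resident set size \(kbytes\): (\d+)", err).group(1))
    return {
        "cpu_time": round(user + system, 2),
        "wall_clock_time": round(wall, 2),
        "peak_memory_usage": peak_kb / 1024 ** 2,  # kB -> GB
    }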
38 changes: 38 additions & 0 deletions benchmark/benchmark_sourmash_prefetch.py
@@ -0,0 +1,38 @@
import os
from utils import run_command_and_record_time_memory

if __name__ == "__main__":
    data_dir = "/scratch/mbr5797/fast_index_compare_data"
    index_names = ["sourmash_sbt_10k.sbt.zip", "sourmash_sbt_20k.sbt.zip", "sourmash_sbt_30k.sbt.zip", "sourmash_sbt_40k.sbt.zip", "sourmash_sbt_50k.sbt.zip"]
    benchmark_filename = "benchmark/benchmark_results_sourmash_prefetch.csv"
    ref_sizes = [10000, 20000, 30000, 40000, 50000]

    # this is the query sketch in filelist_1, used in the other benchmarks
    query_sketch_name = "/scratch/mbr5797/fast_index_compare_data/signatures_gtdb_rs207_genomic/718882c2dd33e2711f95f8ed2a413fde.sig"

    ref_size_to_metrics = {}

    for index_name, ref_size in zip(index_names, ref_sizes):
        print(f"Processing {query_sketch_name} vs {index_name}")

        index_path = os.path.join(data_dir, index_name)
        output_filename = f"sourmash_prefetch_results_against_{ref_size}.csv"
        output_filename = os.path.join(data_dir, output_filename)

        # cmd: sourmash prefetch query_filename sbt_name --threshold_bp 0 -o output_filename
        command = f"sourmash prefetch {query_sketch_name} {index_path} --threshold-bp 0 -o {output_filename}"
        print(command)

        metrics = run_command_and_record_time_memory(command)
        if metrics:
            ref_size_to_metrics[ref_size] = metrics
        else:
            print(f"Error processing {query_sketch_name} vs {index_name}")

    print(ref_size_to_metrics)

    with open(benchmark_filename, "w") as f:
        print('Writing to: ' + benchmark_filename)
        f.write("ref_filesize,cpu_time,wall_clock_time,peak_memory_usage\n")
        for ref_size, metrics in ref_size_to_metrics.items():
            f.write(f"{ref_size},{metrics['cpu_time']},{metrics['wall_clock_time']},{metrics['peak_memory_usage']}\n")
38 changes: 38 additions & 0 deletions benchmark/benchmark_this_prefetch.py
@@ -0,0 +1,38 @@
import os
from utils import run_command_and_record_time_memory

if __name__ == "__main__":
    data_dir = "/scratch/mbr5797/fast_index_compare_data"
    index_names = ["index_10k", "index_20k", "index_30k", "index_40k", "index_50k"]
    benchmark_filename = "benchmark/benchmark_results_this_prefetch.csv"
    ref_sizes = [10000, 20000, 30000, 40000, 50000]

    # this is the query sketch in filelist_1, used in the other benchmarks
    query_sketch_name = "/scratch/mbr5797/fast_index_compare_data/signatures_gtdb_rs207_genomic/718882c2dd33e2711f95f8ed2a413fde.sig"

    ref_size_to_metrics = {}

    for index_name, ref_size in zip(index_names, ref_sizes):
        print(f"Processing {query_sketch_name} vs {index_name}")

        index_path = os.path.join(data_dir, index_name)
        output_filename = f"prefetch_results_against_{ref_size}.csv"
        output_filename = os.path.join(data_dir, output_filename)

        # cmd: bin/prefetch query_path ref_index_path output_filename -b 0
        command = f"bin/prefetch {query_sketch_name} {index_path} {output_filename} -b 0"
        print(command)

        metrics = run_command_and_record_time_memory(command)
        if metrics:
            ref_size_to_metrics[ref_size] = metrics
        else:
            print(f"Error processing {query_sketch_name} vs {index_name}")

    print(ref_size_to_metrics)

    with open(benchmark_filename, "w") as f:
        print('Writing to: ' + benchmark_filename)
        f.write("ref_filesize,cpu_time,wall_clock_time,peak_memory_usage\n")
        for ref_size, metrics in ref_size_to_metrics.items():
            f.write(f"{ref_size},{metrics['cpu_time']},{metrics['wall_clock_time']},{metrics['peak_memory_usage']}\n")
53 changes: 53 additions & 0 deletions benchmark/plot_indexing_time_memory.py
@@ -0,0 +1,53 @@
import pandas as pd
import matplotlib.pyplot as plt

# set arial as the default font
plt.rcParams['font.family'] = 'Arial'

# Load the data
df_sourmash_index = pd.read_csv('benchmark_results_sourmash_index.csv')
df_cpp_index_one_thread = pd.read_csv('benchmark_results_index_1_thread.csv')
df_cpp_index_128_threads = pd.read_csv('benchmark_results_index_128_threads.csv')

# col names: filesize,cpu_time,wall_clock_time,peak_memory_usage

# plot wall-clock time against file size
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['wall_clock_time'], label='sourmash index', marker='o')
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['wall_clock_time'], label='cpp index (1 thread)', marker='o')
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['wall_clock_time'], label='cpp index (128 threads)', marker='o')

plt.xlabel('Number of references')
plt.ylabel('Wall-clock time (s) to build index')
plt.title('Wall-clock time to build indices for different numbers of references')
plt.xticks(df_sourmash_index['filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_indexing_wall_clock_time.pdf')

# plot cpu time against file size
plt.clf()
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['cpu_time'], label='sourmash index', marker='o')
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['cpu_time'], label='cpp index (1 thread)', marker='o')
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['cpu_time'], label='cpp index (128 threads)', marker='o')

plt.xlabel('Number of references')
plt.ylabel('CPU time (s) to build index')
plt.title('CPU time to build indices for different numbers of references')
plt.xticks(df_sourmash_index['filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_indexing_cpu_time.pdf')

# plot peak memory usage against file size
plt.clf()
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['peak_memory_usage'], label='sourmash index', marker='o')
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['peak_memory_usage'], label='cpp index (1 thread)', marker='o')
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['peak_memory_usage'], label='cpp index (128 threads)', marker='o')

plt.xlabel('Number of references')
plt.ylabel('Peak memory usage (GB) to build index')
plt.title('Peak memory usage to build indices for different numbers of references')
plt.xticks(df_sourmash_index['filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_indexing_peak_memory_usage.pdf')
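The three plotting blocks above differ only in which metric column they draw and in the axis label; an equivalent, less repetitive sketch that reads the same CSVs and writes the same PDFs (an illustration, not the committed script):

import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'Arial'

df_sourmash = pd.read_csv('benchmark_results_sourmash_index.csv')
df_cpp_1 = pd.read_csv('benchmark_results_index_1_thread.csv')
df_cpp_128 = pd.read_csv('benchmark_results_index_128_threads.csv')

# (CSV column, y-axis label, title prefix, output suffix) for each figure
metrics = [
    ('wall_clock_time', 'Wall-clock time (s) to build index', 'Wall-clock time', 'wall_clock_time'),
    ('cpu_time', 'CPU time (s) to build index', 'CPU time', 'cpu_time'),
    ('peak_memory_usage', 'Peak memory usage (GB) to build index', 'Peak memory usage', 'peak_memory_usage'),
]

for column, ylabel, title, suffix in metrics:
    plt.clf()
    for df, label in [(df_sourmash, 'sourmash index'),
                      (df_cpp_1, 'cpp index (1 thread)'),
                      (df_cpp_128, 'cpp index (128 threads)')]:
        plt.plot(df['filesize'], df[column], label=label, marker='o')
    plt.xlabel('Number of references')
    plt.ylabel(ylabel)
    plt.title(f'{title} to build indices for different numbers of references')
    plt.xticks(df_sourmash['filesize'])
    plt.legend()
    plt.grid(linestyle='--', alpha=0.3)
    plt.savefig(f'plots/benchmark_results_indexing_{suffix}.pdf')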
90 changes: 66 additions & 24 deletions benchmark/plot_many_v_many.py
@@ -6,66 +6,108 @@

# Load the data
df_compare = pd.read_csv('benchmark_results_compare_many_v_many.csv')
df_compare_without_index = pd.read_csv('benchmark_results_compare_many_v_many_no_index.csv')
df_multisearch = pd.read_csv('benchmark_results_multisearch_many_v_many.csv')

# drop rows with query_filesize > 30000, ref_filesize > 30000
df_compare = df_compare[(df_compare['query_filesize'] <= 30000) & (df_compare['ref_filesize'] <= 30000)]
df_compare_without_index = df_compare_without_index[(df_compare_without_index['query_filesize'] <= 30000) & (df_compare_without_index['ref_filesize'] <= 30000)]
df_multisearch = df_multisearch[(df_multisearch['query_filesize'] <= 30000) & (df_multisearch['ref_filesize'] <= 30000)]

# col names: query_filesize,ref_filesize,cpu_time,wall_clock_time,peak_memory_usage

# plot wall-clock time for query file size = 50000, use different reference file sizes in x-axis
df_compare_50000 = df_compare[df_compare['query_filesize'] == 50000]
df_multisearch_50000 = df_multisearch[df_multisearch['query_filesize'] == 50000]
# plot wall-clock time for query file size = 30000, use different reference file sizes in x-axis
df_compare_30000 = df_compare[df_compare['query_filesize'] == 30000]
df_compare_without_index_30000 = df_compare_without_index[df_compare_without_index['query_filesize'] == 30000]
df_multisearch_30000 = df_multisearch[df_multisearch['query_filesize'] == 30000]

plt.plot(df_compare_50000['ref_filesize'], df_compare_50000['wall_clock_time'], label='compare (cpp)', marker='o')
plt.plot(df_multisearch_50000['ref_filesize'], df_multisearch_50000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['wall_clock_time'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Reference list size')
plt.ylabel('Wall-clock time (s)')
plt.title('Wall-clock time for 50k queries')
plt.xticks(df_compare_50000['ref_filesize'])
plt.title('Wall-clock time for 30k queries')
plt.xticks(df_compare_30000['ref_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_wall_clock_time.pdf')


# plot peak memory usage for query file size = 50000, use different reference file sizes in x-axis
# plot CPU time for query file size = 30000, use different reference file sizes in x-axis
plt.clf()
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['cpu_time'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['cpu_time'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Reference list size')
plt.ylabel('CPU time (s)')
plt.title('CPU time for 30k queries')
plt.xticks(df_compare_30000['ref_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_cpu_time.pdf')


# plot peak memory usage for query file size = 30000, use different reference file sizes in x-axis
plt.clf()
plt.plot(df_compare_50000['ref_filesize'], df_compare_50000['peak_memory_usage'], label='compare (cpp)', marker='o')
plt.plot(df_multisearch_50000['ref_filesize'], df_multisearch_50000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Reference list size')
plt.ylabel('Peak memory usage (GB)')
plt.title('Peak memory usage for 50k queries')
plt.xticks(df_compare_50000['ref_filesize'])
plt.title('Peak memory usage for 30k queries')
plt.xticks(df_compare_30000['ref_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_peak_memory_usage.pdf')


# plot wall-clock time for ref file size = 50000, use different query file sizes in x-axis
# plot wall-clock time for ref file size = 30000, use different query file sizes in x-axis
plt.clf()
df_compare_50000 = df_compare[df_compare['ref_filesize'] == 50000]
df_multisearch_50000 = df_multisearch[df_multisearch['ref_filesize'] == 50000]
df_compare_30000 = df_compare[df_compare['ref_filesize'] == 30000]
df_compare_without_index_30000 = df_compare_without_index[df_compare_without_index['ref_filesize'] == 30000]
df_multisearch_30000 = df_multisearch[df_multisearch['ref_filesize'] == 30000]

plt.plot(df_compare_50000['query_filesize'], df_compare_50000['wall_clock_time'], label='compare (cpp)', marker='o')
plt.plot(df_multisearch_50000['query_filesize'], df_multisearch_50000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['wall_clock_time'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Query list size')
plt.ylabel('Wall-clock time (s)')
plt.title('Wall-clock time for 50k references')
plt.xticks(df_compare_50000['query_filesize'])
plt.title('Wall-clock time for 30k references')
plt.xticks(df_compare_30000['query_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_wall_clock_time_query.pdf')


# plot peak memory usage for ref file size = 50000, use different query file sizes in x-axis
# plot peak memory usage for ref file size = 30000, use different query file sizes in x-axis
plt.clf()
plt.plot(df_compare_50000['query_filesize'], df_compare_50000['peak_memory_usage'], label='compare (cpp)', marker='o')
plt.plot(df_multisearch_50000['query_filesize'], df_multisearch_50000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Query list size')
plt.ylabel('Peak memory usage (GB)')
plt.title('Peak memory usage for 50k references')
plt.xticks(df_compare_50000['query_filesize'])
plt.title('Peak memory usage for 30k references')
plt.xticks(df_compare_30000['query_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_peak_memory_usage_query.pdf')


# plot CPU time for ref file size = 30000, use different query file sizes in x-axis
plt.clf()
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['cpu_time'], label='compare (cpp)', marker='o')
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) without index', marker='o')
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['cpu_time'], label='multisearch (sourmash)', marker='o')

plt.xlabel('Query list size')
plt.ylabel('CPU time (s)')
plt.title('CPU time for 30k references')
plt.xticks(df_compare_30000['query_filesize'])
plt.legend()
plt.grid(linestyle='--', alpha=0.3)
plt.savefig('plots/benchmark_results_compare_many_v_many_cpu_time_query.pdf')