Skip to content

Commit 610e63f

Browse files
authored
Merge pull request #40 from mahmudhera/main
added sketch sizes in compare output (resolves #39)
2 parents 19b7709 + 3de8eee commit 610e63f

22 files changed

+43
-36
lines changed

README.md

+10-8
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,16 @@ All tool usages are available using `--help` flag.
1818

1919
## `compare` output format
2020
1. Index of query
21-
2. Name of query
22-
3. MD5 of query
23-
4. Index of match
24-
5. Name of match
25-
6. MD5 of match
26-
7. Jaccard
27-
8. Containment(query, target)
28-
9. Containment(target, query)
21+
1. Name of query
22+
1. MD5 of query
23+
1. Size of the query sketch
24+
1. Index of match
25+
1. Name of match
26+
1. MD5 of match
27+
1. Size of the matched reference sketch
28+
1. Jaccard
29+
1. Containment(query, target)
30+
1. Containment(target, query)
2931

3032

3133
# Testing

benchmark/plot_indexing_time_memory.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# col names: filesize,cpu_time,wall_clock_time,peak_memory_usage
1313

1414
# plot wall-clock time against file size
15-
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['wall_clock_time'], label='sourmash index', marker='o')
15+
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['wall_clock_time'], label='sourmash index (1 thread)', marker='o')
1616
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['wall_clock_time'], label='cpp index (1 thread)', marker='o')
1717
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['wall_clock_time'], label='cpp index (128 threads)', marker='o')
1818

@@ -26,7 +26,7 @@
2626

2727
# plot cpu time against file size
2828
plt.clf()
29-
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['cpu_time'], label='sourmash index', marker='o')
29+
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['cpu_time'], label='sourmash index (1 thread)', marker='o')
3030
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['cpu_time'], label='cpp index (1 thread)', marker='o')
3131
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['cpu_time'], label='cpp index (128 threads)', marker='o')
3232

@@ -40,7 +40,7 @@
4040

4141
# plot peak memory usage against file size
4242
plt.clf()
43-
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['peak_memory_usage'], label='sourmash index', marker='o')
43+
plt.plot(df_sourmash_index['filesize'], df_sourmash_index['peak_memory_usage'], label='sourmash index (1 thread)', marker='o')
4444
plt.plot(df_cpp_index_one_thread['filesize'], df_cpp_index_one_thread['peak_memory_usage'], label='cpp index (1 thread)', marker='o')
4545
plt.plot(df_cpp_index_128_threads['filesize'], df_cpp_index_128_threads['peak_memory_usage'], label='cpp index (128 threads)', marker='o')
4646

benchmark/plot_many_v_many.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
df_compare_without_index_30000 = df_compare_without_index[df_compare_without_index['query_filesize'] == 30000]
2222
df_multisearch_30000 = df_multisearch[df_multisearch['query_filesize'] == 30000]
2323

24-
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp)', marker='o')
25-
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) without index', marker='o')
24+
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp) by building index', marker='o')
25+
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) by loading index', marker='o')
2626
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
2727

2828
plt.xlabel('Reference list size')
@@ -36,8 +36,8 @@
3636

3737
# plot CPU time for query file size = 30000, use different reference file sizes in x-axis
3838
plt.clf()
39-
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['cpu_time'], label='compare (cpp)', marker='o')
40-
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) without index', marker='o')
39+
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['cpu_time'], label='compare (cpp) by building index', marker='o')
40+
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) by loading index', marker='o')
4141
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['cpu_time'], label='multisearch (sourmash)', marker='o')
4242

4343
plt.xlabel('Reference list size')
@@ -51,8 +51,8 @@
5151

5252
# plot peak memory usage for query file size = 30000, use different reference file sizes in x-axis
5353
plt.clf()
54-
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp)', marker='o')
55-
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) without index', marker='o')
54+
plt.plot(df_compare_30000['ref_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp) by building index', marker='o')
55+
plt.plot(df_compare_without_index_30000['ref_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) by loading index', marker='o')
5656
plt.plot(df_multisearch_30000['ref_filesize'], df_multisearch_30000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
5757

5858
plt.xlabel('Reference list size')
@@ -70,8 +70,8 @@
7070
df_compare_without_index_30000 = df_compare_without_index[df_compare_without_index['ref_filesize'] == 30000]
7171
df_multisearch_30000 = df_multisearch[df_multisearch['ref_filesize'] == 30000]
7272

73-
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp)', marker='o')
74-
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) without index', marker='o')
73+
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['wall_clock_time'], label='compare (cpp) by building index', marker='o')
74+
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['wall_clock_time'], label='compare (cpp) by loading index', marker='o')
7575
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['wall_clock_time'], label='multisearch (sourmash)', marker='o')
7676

7777
plt.xlabel('Query list size')
@@ -85,8 +85,8 @@
8585

8686
# plot peak memory usage for ref file size = 30000, use different query file sizes in x-axis
8787
plt.clf()
88-
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp)', marker='o')
89-
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) without index', marker='o')
88+
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['peak_memory_usage'], label='compare (cpp) by building index', marker='o')
89+
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['peak_memory_usage'], label='compare (cpp) by loading index', marker='o')
9090
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
9191

9292
plt.xlabel('Query list size')
@@ -100,8 +100,8 @@
100100

101101
# plot CPU time for ref file size = 30000, use different query file sizes in x-axis
102102
plt.clf()
103-
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['cpu_time'], label='compare (cpp)', marker='o')
104-
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) without index', marker='o')
103+
plt.plot(df_compare_30000['query_filesize'], df_compare_30000['cpu_time'], label='compare (cpp) by building index', marker='o')
104+
plt.plot(df_compare_without_index_30000['query_filesize'], df_compare_without_index_30000['cpu_time'], label='compare (cpp) by loading index', marker='o')
105105
plt.plot(df_multisearch_30000['query_filesize'], df_multisearch_30000['cpu_time'], label='multisearch (sourmash)', marker='o')
106106

107107
plt.xlabel('Query list size')

benchmark/plot_one_v_many.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
# col names: ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
1313

1414
# plot wall-clock time against reference file size
15-
plt.plot(df_compare['ref_filesize'], df_compare['wall_clock_time'], label='compare (cpp)', marker='o')
16-
plt.plot(df_multisearch['ref_filesize'], df_multisearch['wall_clock_time'], label='multisearch (sourmash)', marker='o')
17-
plt.plot(df_prefetch_lm['ref_filesize'], df_prefetch_lm['wall_clock_time'], label='prefetch-lm (cpp)', marker='o')
15+
plt.plot(df_compare['ref_filesize'], df_compare['wall_clock_time'], label='compare (cpp) (1 thread)', marker='o')
16+
plt.plot(df_multisearch['ref_filesize'], df_multisearch['wall_clock_time'], label='multisearch (sourmash) (1 thread)', marker='o')
17+
plt.plot(df_prefetch_lm['ref_filesize'], df_prefetch_lm['wall_clock_time'], label='prefetch-lm (cpp) (1 thread)', marker='o')
1818

1919
plt.xlabel('Reference list size')
2020
plt.ylabel('Wall-clock time (s)')
@@ -26,9 +26,9 @@
2626

2727
# plot peak memory usage against reference file size
2828
plt.clf()
29-
plt.plot(df_compare['ref_filesize'], df_compare['peak_memory_usage'], label='compare (cpp)', marker='o')
30-
plt.plot(df_multisearch['ref_filesize'], df_multisearch['peak_memory_usage'], label='multisearch (sourmash)', marker='o')
31-
plt.plot(df_prefetch_lm['ref_filesize'], df_prefetch_lm['peak_memory_usage'], label='prefetch-lm (cpp)', marker='o')
29+
plt.plot(df_compare['ref_filesize'], df_compare['peak_memory_usage'], label='compare (cpp) (1 thread)', marker='o')
30+
plt.plot(df_multisearch['ref_filesize'], df_multisearch['peak_memory_usage'], label='multisearch (sourmash) (1 thread)', marker='o')
31+
plt.plot(df_prefetch_lm['ref_filesize'], df_prefetch_lm['peak_memory_usage'], label='prefetch-lm (cpp) (1 thread)', marker='o')
3232

3333
plt.xlabel('Reference list size')
3434
plt.ylabel('Peak memory usage (GB)')

benchmark/plot_this_vs_sourmash_prefetch.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
# col names: ref_filesize,cpu_time,wall_clock_time,peak_memory_usage
1212

1313
# plot wall-clock time against reference file size
14-
plt.plot(df_this_prefetch['ref_filesize'], df_this_prefetch['wall_clock_time'], label='cpp prefetch', marker='o')
15-
plt.plot(df_sourmash_prefetch['ref_filesize'], df_sourmash_prefetch['wall_clock_time'], label='sourmash prefetch', marker='o')
14+
plt.plot(df_this_prefetch['ref_filesize'], df_this_prefetch['wall_clock_time'], label='cpp prefetch (1 thread)', marker='o')
15+
plt.plot(df_sourmash_prefetch['ref_filesize'], df_sourmash_prefetch['wall_clock_time'], label='sourmash prefetch (1 thread)', marker='o')
1616

1717
plt.xlabel('Reference list size')
1818
plt.ylabel('Wall-clock time (s)')
@@ -25,8 +25,8 @@
2525

2626
# plot peak memory usage against reference file size
2727
plt.clf()
28-
plt.plot(df_this_prefetch['ref_filesize'], df_this_prefetch['peak_memory_usage'], label='cpp prefetch', marker='o')
29-
plt.plot(df_sourmash_prefetch['ref_filesize'], df_sourmash_prefetch['peak_memory_usage'], label='sourmash prefetch', marker='o')
28+
plt.plot(df_this_prefetch['ref_filesize'], df_this_prefetch['peak_memory_usage'], label='cpp prefetch (1 thread)', marker='o')
29+
plt.plot(df_sourmash_prefetch['ref_filesize'], df_sourmash_prefetch['peak_memory_usage'], label='sourmash prefetch (1 thread)', marker='o')
3030

3131
plt.xlabel('Reference list size')
3232
plt.ylabel('Peak memory usage (GB)')
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

src/compare.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ void do_compare(Arguments& args) {
102102

103103
// write the header in the output file
104104
ofstream output_file(args.output_filename);
105-
output_file << "query_id, query_name, query_md5, match_id, match_name, match_md5, jaccard, containment_query_in_match, containment_match_in_query, max_containment, max_containment_ani" << endl;
105+
output_file << "query_id,query_name,query_md5,query_sketch_size,match_id,match_name,match_md5,match_sketch_size,jaccard,containment_query_in_match,containment_match_in_query,max_containment,max_containment_ani" << endl;
106106
output_file.close();
107107

108108
// combining command: cat

src/utils.cpp

+7-2
Original file line numberDiff line numberDiff line change
@@ -277,8 +277,13 @@ void compute_intersection_matrix_by_sketches(int query_sketch_start_index, int q
277277
double max_containment = std::max(containment_i_in_j, containment_j_in_i);
278278
double max_containment_ani = pow(max_containment, 1.0/ksize);
279279

280-
// write i, query_name, query_md5, j, ref_name, ref_md5, jaccard, containment_i_in_j, containment_j_in_i, max_containment, max_containment_ani
281-
outfile << i << ",\"" << query_name << "\"," << query_md5 << "," << j << ",\"" << ref_name << "\"," << ref_md5 << "," << jaccard << "," << containment_i_in_j << "," << containment_j_in_i << "," << max_containment << "," << max_containment_ani << std::endl;
280+
// write i, query_name, query_md5, query_sketch_size, j, ref_name, ref_md5, ref_sketch_size, jaccard, containment_i_in_j, containment_j_in_i, max_containment, max_containment_ani
281+
outfile << i << ",\"" << query_name << "\"," << query_md5 << "," <<
282+
sketches_query[i].size() << "," << j << ",\"" << ref_name << "\"," <<
283+
ref_md5 << "," << info_of_ref_sketches[j].size() << "," << jaccard << "," <<
284+
containment_i_in_j << "," << containment_j_in_i << "," << max_containment << "," <<
285+
max_containment_ani << std::endl;
286+
//outfile << i << ",\"" << query_name << "\"," << query_md5 << "," << j << ",\"" << ref_name << "\"," << ref_md5 << "," << jaccard << "," << containment_i_in_j << "," << containment_j_in_i << "," << max_containment << "," << max_containment_ani << std::endl;
282287

283288
similars[i].push_back(j);
284289
}

0 commit comments

Comments
 (0)