Skip to content

Commit a3c8435

Browse files
WIP: simplify RevIndex in Python & expand functionality (#3578)
Note: PR into #3545 This PR tackles #1939, fixing & customizing & simplifying the `index.RevIndex` Python container wrapping `MemRevIndex` on the Rust side. A few notes - * fixes ksize calculation for templating/selection for protein, hp, and dayhoff sketches; * fixes Jaccard calculation inside `MemRevIndex` search; * adds hooks in `sourmash gather` (CLI) and `Index.prefetch(...)` (Python layer) to test out `RevIndex` as a replacement index class & `CounterGather` implementation; * simplifies the Python layer and removes unused Rust code for `MemRevIndex`; Overall things seem to work well enough, with a few hiccups around `scaled` and sketch `location`. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent d55dfb6 commit a3c8435

File tree

9 files changed

+243
-187
lines changed

9 files changed

+243
-187
lines changed

include/sourmash.h

+1-12
Original file line numberDiff line numberDiff line change
@@ -388,20 +388,9 @@ const SourmashSearchResult *const *revindex_gather(const SourmashRevIndex *ptr,
388388

389389
uint64_t revindex_len(const SourmashRevIndex *ptr);
390390

391-
SourmashRevIndex *revindex_new_with_paths(const SourmashStr *const *search_sigs_ptr,
392-
uintptr_t insigs,
393-
const SourmashKmerMinHash *template_ptr,
394-
uintptr_t threshold,
395-
const SourmashKmerMinHash *const *queries_ptr,
396-
uintptr_t inqueries,
397-
bool keep_sigs);
398-
399391
SourmashRevIndex *revindex_new_with_sigs(const SourmashSignature *const *search_sigs_ptr,
400392
uintptr_t insigs,
401-
const SourmashKmerMinHash *template_ptr,
402-
uintptr_t threshold,
403-
const SourmashKmerMinHash *const *queries_ptr,
404-
uintptr_t inqueries);
393+
const SourmashKmerMinHash *template_ptr);
405394

406395
ScaledType revindex_scaled(const SourmashRevIndex *ptr);
407396

src/core/src/ffi/index/revindex.rs

+5-22
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
11
use std::slice;
22

3-
use camino::Utf8PathBuf as PathBuf;
4-
53
use crate::encodings::*;
64
use crate::ffi::index::SourmashSearchResult;
75
use crate::ffi::minhash::SourmashKmerMinHash;
86
use crate::ffi::signature::SourmashSignature;
9-
use crate::ffi::utils::{ForeignObject, SourmashStr};
7+
use crate::ffi::utils::ForeignObject;
108
use crate::index::revindex::mem_revindex;
119
use crate::index::Index;
1210
use crate::prelude::*;
1311
use crate::signature::{Signature, SigsTrait};
14-
use crate::sketch::minhash::KmerMinHash;
1512
use crate::sketch::Sketch;
1613
use crate::ScaledType;
1714

@@ -38,7 +35,7 @@ fn from_template(template: &Sketch) -> Selection {
3835
let adj_ksize: u32 = match moltype {
3936
HashFunctions::Murmur64Dna => ksize,
4037
HashFunctions::Murmur64Protein => ksize / 3,
41-
HashFunctions::Murmur64Dayhoff => ksize / 3,
38+
HashFunctions::Murmur64Dayhoff => ksize / 3,
4239
HashFunctions::Murmur64Hp => ksize / 3,
4340
HashFunctions::Murmur64Skipm1n3 => ksize,
4441
HashFunctions::Murmur64Skipm2n3 => ksize,
@@ -52,6 +49,7 @@ fn from_template(template: &Sketch) -> Selection {
5249
.build()
5350
}
5451

52+
/*
5553
ffi_fn! {
5654
unsafe fn revindex_new_with_paths(
5755
search_sigs_ptr: *const *const SourmashStr,
@@ -105,15 +103,13 @@ unsafe fn revindex_new_with_paths(
105103
Ok(SourmashRevIndex::from_rust(revindex))
106104
}
107105
}
106+
*/
108107

109108
ffi_fn! {
110109
unsafe fn revindex_new_with_sigs(
111110
search_sigs_ptr: *const *const SourmashSignature,
112111
insigs: usize,
113112
template_ptr: *const SourmashKmerMinHash,
114-
threshold: usize,
115-
queries_ptr: *const *const SourmashKmerMinHash,
116-
inqueries: usize,
117113
) -> Result<*mut SourmashRevIndex> {
118114
let search_sigs: Vec<Signature> = {
119115
assert!(!search_sigs_ptr.is_null());
@@ -130,21 +126,8 @@ unsafe fn revindex_new_with_sigs(
130126
Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone())
131127
};
132128

133-
let queries_vec: Vec<KmerMinHash>;
134-
let queries: Option<&[KmerMinHash]> = if queries_ptr.is_null() {
135-
None
136-
} else {
137-
queries_vec = slice::from_raw_parts(queries_ptr, inqueries)
138-
.iter()
139-
.map(|mh_ptr|
140-
// TODO: avoid this clone
141-
SourmashKmerMinHash::as_rust(*mh_ptr).clone())
142-
.collect();
143-
Some(queries_vec.as_ref())
144-
};
145-
146129
let selection = from_template(&template);
147-
let revindex = mem_revindex::RevIndex::new_with_sigs(search_sigs, &selection, threshold, queries)?;
130+
let revindex = mem_revindex::RevIndex::new_with_sigs(search_sigs, &selection, 0, None)?;
148131
Ok(SourmashRevIndex::from_rust(revindex))
149132
}
150133
}

src/core/src/index/revindex/mem_revindex.rs

+33-5
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use crate::signature::{Signature, SigsTrait};
1717
use crate::sketch::minhash::KmerMinHash;
1818
use crate::sketch::Sketch;
1919
use crate::Result;
20+
use crate::ScaledType;
2021

2122
pub struct RevIndex {
2223
linear: LinearIndex,
@@ -236,6 +237,14 @@ impl RevIndex {
236237
self.linear.template().clone()
237238
}
238239

240+
pub fn scaled(&self) -> ScaledType {
241+
if let Sketch::MinHash(mh) = self.linear.template() {
242+
mh.clone().scaled() // @CTB avoid clone
243+
} else {
244+
unimplemented!()
245+
}
246+
}
247+
239248
// TODO: mh should be a sketch, or even a sig...
240249
pub(crate) fn find_signatures(
241250
&self,
@@ -244,10 +253,24 @@ impl RevIndex {
244253
containment: bool,
245254
_ignore_scaled: bool,
246255
) -> Result<Vec<(f64, Signature, String)>> {
256+
let index_scaled = self.scaled();
257+
let query_scaled = mh.scaled();
258+
259+
// @CTB avoid clones?
260+
let query_mh = {
261+
if query_scaled < index_scaled {
262+
mh.clone()
263+
.downsample_scaled(index_scaled)
264+
.expect("cannot downsample query")
265+
} else {
266+
mh.clone()
267+
}
268+
};
269+
247270
// TODO: proper threshold calculation
248-
let threshold: usize = (threshold * (mh.size() as f64)) as _;
271+
let threshold: usize = (threshold * (query_mh.size() as f64)) as _;
249272

250-
let counter = self.counter_for_query(mh);
273+
let counter = self.counter_for_query(&query_mh);
251274

252275
debug!(
253276
"number of matching signatures for hashes: {}",
@@ -256,7 +279,9 @@ impl RevIndex {
256279

257280
let mut results = vec![];
258281
for (dataset_id, size) in counter.most_common() {
259-
let match_size = if size >= threshold { size } else { break };
282+
if size < threshold {
283+
break;
284+
};
260285

261286
let match_sig = self.linear.sig_for_dataset(dataset_id)?;
262287
let match_path = self
@@ -266,16 +291,19 @@ impl RevIndex {
266291
.internal_location();
267292

268293
let mut match_mh = None;
294+
//@CTB use something other than mh here?
269295
if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(self.linear.template()) {
270296
match_mh = Some(mh);
271297
}
272298
let match_mh = match_mh.unwrap();
273299

274300
if size >= threshold {
275301
let score = if containment {
276-
size as f64 / mh.size() as f64
302+
size as f64 / query_mh.size() as f64
277303
} else {
278-
mh.jaccard(match_mh).expect("cannot calculate Jaccard")
304+
query_mh
305+
.jaccard(match_mh)
306+
.expect("cannot calculate Jaccard")
279307
};
280308
let filename = match_path.to_string();
281309
let mut sig: Signature = match_sig.clone().into();

src/sourmash/commands.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -942,8 +942,9 @@ def gather(args):
942942

943943
if args.linear: # force linear traversal?
944944
databases = [LazyLinearIndex(db) for db in databases]
945-
else:
945+
elif 0:
946946
# @CTB foo revindex
947+
print("XXX NOTE: using RevIndex for ZipFileLinearIndex")
947948
from sourmash.index.revindex import RevIndex
948949
from sourmash.index import ZipFileLinearIndex
949950

src/sourmash/index/__init__.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -320,14 +320,23 @@ def counter_gather(self, query, threshold_bp, **kwargs):
320320
# tada!
321321
return counter
322322
else:
323-
from .revindex import RevIndex
323+
print("XXX NOTE: Using RevIndex CounterGather")
324+
from .revindex import RevIndex_CounterGather, RevIndex
324325

325326
revindex = RevIndex(template=prefetch_query.minhash)
327+
cg = RevIndex_CounterGather(
328+
prefetch_query, revindex, threshold_bp, allow_insert=True
329+
)
326330

331+
n_added = 0
327332
for result in self.prefetch(prefetch_query, threshold_bp, **kwargs):
328-
revindex.insert(result.signature)
333+
cg.add(result.signature, location=result.location)
334+
n_added += 1
335+
336+
if n_added == 0:
337+
raise ValueError("no signatures to count")
329338

330-
return revindex.counter_gather(prefetch_query, threshold_bp)
339+
return cg
331340

332341
@abstractmethod
333342
def select(

0 commit comments

Comments
 (0)