Skip to content

Commit 1fa528c

Browse files
committed
test: verify index serialization roundtrip and canonical k-mer deduplication
Add property-based tests verifying that: index save/load preserves all entries exactly; total k-mer counts don't exceed the valid window count; and a k-mer and its reverse complement map to the same canonical entry.
1 parent c2345d9 commit 1fa528c

2 files changed

Lines changed: 98 additions & 0 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [Unreleased]
9+
10+
### Internal
11+
12+
- Added property tests for index serialization roundtrip and canonical k-mer deduplication
13+
814
## [0.3.0] - 2026-01-27
915

1016
### Added

tests/property_tests.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@
44
//! catching edge cases that might be missed by example-based tests.
55
66
use bytes::Bytes;
7+
use kmerust::index::{load_index, save_index, KmerIndex};
78
use kmerust::kmer::{unpack_to_bytes, unpack_to_string, Kmer, KmerLength};
9+
use kmerust::streaming::count_kmers_from_sequences;
810
use proptest::prelude::*;
11+
use std::collections::HashMap;
12+
use tempfile::NamedTempFile;
913

1014
/// Strategy for generating valid DNA sequences with lengths between `min_len` and `max_len`.
1115
fn dna_sequence(min_len: usize, max_len: usize) -> impl Strategy<Value = String> {
@@ -229,4 +233,92 @@ proptest! {
229233
);
230234
}
231235
}
236+
237+
/// Index save/load roundtrip should preserve all entries exactly.
238+
///
239+
/// Property: load(save(index)) = index
240+
#[test]
241+
fn index_roundtrip_preserves_all_entries(
242+
k in 1usize..=32,
243+
entries in proptest::collection::hash_map(any::<u64>(), 1u64..1000, 0..100)
244+
) {
245+
let k_len = KmerLength::new(k).unwrap();
246+
let index = KmerIndex::new(k_len, entries.clone());
247+
248+
let tmp = NamedTempFile::with_suffix(".kmix").unwrap();
249+
save_index(&index, tmp.path()).unwrap();
250+
let loaded = load_index(tmp.path()).unwrap();
251+
252+
prop_assert_eq!(loaded.k(), k_len);
253+
prop_assert_eq!(loaded.counts(), &entries);
254+
}
255+
256+
/// Total k-mer count should not exceed the number of valid windows in the input.
257+
///
258+
/// Property: Σ(counts) ≤ (seq.len - k + 1) for a single sequence
259+
#[test]
260+
fn total_count_at_most_valid_windows(
261+
seq in dna_sequence(1, 100),
262+
k in 1usize..=32
263+
) {
264+
prop_assume!(seq.len() >= k);
265+
266+
let k_len = KmerLength::new(k).unwrap();
267+
let counts = count_kmers_from_sequences(
268+
vec![Bytes::from(seq.clone())].into_iter(),
269+
k_len
270+
);
271+
272+
let total: u64 = counts.values().sum();
273+
let max_windows = (seq.len() - k + 1) as u64;
274+
275+
prop_assert!(
276+
total <= max_windows,
277+
"Total count {total} exceeds max windows {max_windows}"
278+
);
279+
}
280+
281+
/// A k-mer and its reverse complement should be counted together under one canonical entry.
282+
///
283+
/// Property: When counting a k-mer and its reverse complement as separate sequences
284+
/// (each of exactly length k), they should produce exactly one canonical entry
285+
/// with count 2 (or count 2 for palindromes since they're the same sequence).
286+
#[test]
287+
fn kmer_and_rc_count_together(seq in dna_sequence(1, 32)) {
288+
// Compute reverse complement
289+
let rc: String = seq
290+
.chars()
291+
.rev()
292+
.map(|c| match c {
293+
'A' => 'T',
294+
'T' => 'A',
295+
'C' => 'G',
296+
'G' => 'C',
297+
_ => unreachable!(),
298+
})
299+
.collect();
300+
301+
let k = seq.len();
302+
let k_len = KmerLength::new(k).unwrap();
303+
304+
// Count the k-mer and its RC as separate sequences (each exactly k bases)
305+
// This ensures we get exactly one k-mer from each sequence
306+
let counts: HashMap<u64, u64> = count_kmers_from_sequences(
307+
vec![Bytes::from(seq.clone()), Bytes::from(rc.clone())].into_iter(),
308+
k_len
309+
);
310+
311+
// Both should map to the same canonical form
312+
prop_assert_eq!(
313+
counts.len(), 1,
314+
"K-mer and RC should produce exactly 1 canonical entry, got {}", counts.len()
315+
);
316+
317+
// Count should be 2 (once from original, once from RC)
318+
let kmer_count = *counts.values().next().unwrap();
319+
prop_assert_eq!(
320+
kmer_count, 2,
321+
"K-mer and RC should have combined count 2, got {}", kmer_count
322+
);
323+
}
232324
}

0 commit comments

Comments
 (0)