|
4 | 4 | //! catching edge cases that might be missed by example-based tests. |
5 | 5 |
|
6 | 6 | use bytes::Bytes; |
| 7 | +use kmerust::index::{load_index, save_index, KmerIndex}; |
7 | 8 | use kmerust::kmer::{unpack_to_bytes, unpack_to_string, Kmer, KmerLength}; |
| 9 | +use kmerust::streaming::count_kmers_from_sequences; |
8 | 10 | use proptest::prelude::*; |
| 11 | +use std::collections::HashMap; |
| 12 | +use tempfile::NamedTempFile; |
9 | 13 |
|
10 | 14 | /// Strategy for generating valid DNA sequences of length 1-32. |
11 | 15 | fn dna_sequence(min_len: usize, max_len: usize) -> impl Strategy<Value = String> { |
@@ -229,4 +233,92 @@ proptest! { |
229 | 233 | ); |
230 | 234 | } |
231 | 235 | } |
| 236 | + |
| 237 | + /// Index save/load roundtrip should preserve all entries exactly. |
| 238 | + /// |
| 239 | + /// Property: load(save(index)) = index |
| 240 | + #[test] |
| 241 | + fn index_roundtrip_preserves_all_entries( |
| 242 | + k in 1usize..=32, |
| 243 | + entries in proptest::collection::hash_map(any::<u64>(), 1u64..1000, 0..100) |
| 244 | + ) { |
| 245 | + let k_len = KmerLength::new(k).unwrap(); |
| 246 | + let index = KmerIndex::new(k_len, entries.clone()); |
| 247 | + |
| 248 | + let tmp = NamedTempFile::with_suffix(".kmix").unwrap(); |
| 249 | + save_index(&index, tmp.path()).unwrap(); |
| 250 | + let loaded = load_index(tmp.path()).unwrap(); |
| 251 | + |
| 252 | + prop_assert_eq!(loaded.k(), k_len); |
| 253 | + prop_assert_eq!(loaded.counts(), &entries); |
| 254 | + } |
| 255 | + |
| 256 | + /// Total k-mer count should not exceed the number of valid windows in the input. |
| 257 | + /// |
| 258 | + /// Property: Σ(counts) ≤ (seq.len - k + 1) for a single sequence |
| 259 | + #[test] |
| 260 | + fn total_count_at_most_valid_windows( |
| 261 | + seq in dna_sequence(1, 100), |
| 262 | + k in 1usize..=32 |
| 263 | + ) { |
| 264 | + prop_assume!(seq.len() >= k); |
| 265 | + |
| 266 | + let k_len = KmerLength::new(k).unwrap(); |
| 267 | + let counts = count_kmers_from_sequences( |
| 268 | + vec![Bytes::from(seq.clone())].into_iter(), |
| 269 | + k_len |
| 270 | + ); |
| 271 | + |
| 272 | + let total: u64 = counts.values().sum(); |
| 273 | + let max_windows = (seq.len() - k + 1) as u64; |
| 274 | + |
| 275 | + prop_assert!( |
| 276 | + total <= max_windows, |
| 277 | + "Total count {total} exceeds max windows {max_windows}" |
| 278 | + ); |
| 279 | + } |
| 280 | + |
| 281 | + /// A k-mer and its reverse complement should be counted together under one canonical entry. |
| 282 | + /// |
| 283 | + /// Property: When counting a k-mer and its reverse complement as separate sequences |
| 284 | + /// (each of exactly length k), they should produce exactly one canonical entry |
| 285 | + /// with count 2 (or count 2 for palindromes since they're the same sequence). |
| 286 | + #[test] |
| 287 | + fn kmer_and_rc_count_together(seq in dna_sequence(1, 32)) { |
| 288 | + // Compute reverse complement |
| 289 | + let rc: String = seq |
| 290 | + .chars() |
| 291 | + .rev() |
| 292 | + .map(|c| match c { |
| 293 | + 'A' => 'T', |
| 294 | + 'T' => 'A', |
| 295 | + 'C' => 'G', |
| 296 | + 'G' => 'C', |
| 297 | + _ => unreachable!(), |
| 298 | + }) |
| 299 | + .collect(); |
| 300 | + |
| 301 | + let k = seq.len(); |
| 302 | + let k_len = KmerLength::new(k).unwrap(); |
| 303 | + |
| 304 | + // Count the k-mer and its RC as separate sequences (each exactly k bases) |
| 305 | + // This ensures we get exactly one k-mer from each sequence |
| 306 | + let counts: HashMap<u64, u64> = count_kmers_from_sequences( |
| 307 | + vec![Bytes::from(seq.clone()), Bytes::from(rc.clone())].into_iter(), |
| 308 | + k_len |
| 309 | + ); |
| 310 | + |
| 311 | + // Both should map to the same canonical form |
| 312 | + prop_assert_eq!( |
| 313 | + counts.len(), 1, |
| 314 | + "K-mer and RC should produce exactly 1 canonical entry, got {}", counts.len() |
| 315 | + ); |
| 316 | + |
| 317 | + // Count should be 2 (once from original, once from RC) |
| 318 | + let kmer_count = *counts.values().next().unwrap(); |
| 319 | + prop_assert_eq!( |
| 320 | + kmer_count, 2, |
| 321 | + "K-mer and RC should have combined count 2, got {}", kmer_count |
| 322 | + ); |
| 323 | + } |
232 | 324 | } |
0 commit comments