1+ /// Tests for chaining stability across different gap values
2+ ///
3+ /// Key property: When gap threshold increases, chains should only grow or stay the same,
4+ /// never shrink. If two mappings are 50kb apart, they should chain with both -j 100k and -j 1m.
5+ use std:: collections:: HashMap ;
6+
7+ /// Parse chain membership from PAF output
8+ fn parse_chains ( output : & str ) -> HashMap < String , Vec < String > > {
9+ let mut chains: HashMap < String , Vec < String > > = HashMap :: new ( ) ;
10+
11+ for line in output. lines ( ) {
12+ if line. starts_with ( '[' ) || line. is_empty ( ) {
13+ continue ;
14+ }
15+
16+ let fields: Vec < & str > = line. split ( '\t' ) . collect ( ) ;
17+ if fields. len ( ) < 13 {
18+ continue ;
19+ }
20+
21+ // Find chain ID in tags
22+ let mut chain_id = None ;
23+ for field in fields. iter ( ) . skip ( 12 ) {
24+ if field. starts_with ( "ch:Z:" ) {
25+ chain_id = Some ( field[ 5 ..] . to_string ( ) ) ;
26+ break ;
27+ }
28+ }
29+
30+ if let Some ( cid) = chain_id {
31+ // Create a unique mapping ID from query and coordinates
32+ let mapping_id = format ! ( "{}:{}-{}" , fields[ 0 ] , fields[ 2 ] , fields[ 3 ] ) ;
33+ chains. entry ( cid) . or_insert_with ( Vec :: new) . push ( mapping_id) ;
34+ }
35+ }
36+
37+ chains
38+ }
39+
40+ #[ test]
41+ fn test_chaining_monotonicity ( ) {
42+ // Test that larger gap values create supersets of chains from smaller gaps
43+
44+ // Run with different gap values
45+ let gaps = vec ! [ 10_000 , 50_000 , 100_000 , 500_000 , 1_000_000 ] ;
46+ let mut all_chains = Vec :: new ( ) ;
47+
48+ for gap in & gaps {
49+ let output = std:: process:: Command :: new ( "./target/release/sweepga" )
50+ . arg ( "data/scerevisiae8.fa.gz" )
51+ . arg ( "-j" )
52+ . arg ( gap. to_string ( ) )
53+ . arg ( "-i" )
54+ . arg ( "0" ) // No identity filter for this test
55+ . output ( )
56+ . expect ( "Failed to run sweepga" ) ;
57+
58+ let stdout = String :: from_utf8_lossy ( & output. stdout ) ;
59+ let chains = parse_chains ( & stdout) ;
60+
61+ // Count total members across all chains
62+ let total_members: usize = chains. values ( ) . map ( |v| v. len ( ) ) . sum ( ) ;
63+ all_chains. push ( ( gap, chains, total_members) ) ;
64+ }
65+
66+ // Verify monotonicity: larger gaps should have same or more chain members
67+ for i in 1 ..all_chains. len ( ) {
68+ let ( gap1, _, count1) = & all_chains[ i-1 ] ;
69+ let ( gap2, _, count2) = & all_chains[ i] ;
70+
71+ assert ! (
72+ count2 >= count1,
73+ "Chain membership should not decrease with larger gaps: \
74+ -j {} has {} members, but -j {} has {} members",
75+ gap1, count1, gap2, count2
76+ ) ;
77+ }
78+ }
79+
80+ #[ test]
81+ fn test_chain_identity_stability ( ) {
82+ // Test that chain identities remain reasonable with large gaps
83+
84+ let gaps = vec ! [ 10_000 , 100_000 , 1_000_000 ] ;
85+
86+ for gap in gaps {
87+ let output = std:: process:: Command :: new ( "./target/release/sweepga" )
88+ . arg ( "data/scerevisiae8.fa.gz" )
89+ . arg ( "-j" )
90+ . arg ( gap. to_string ( ) )
91+ . arg ( "-i" )
92+ . arg ( "0.90" ) // 90% identity threshold
93+ . output ( )
94+ . expect ( "Failed to run sweepga" ) ;
95+
96+ let stderr = String :: from_utf8_lossy ( & output. stderr ) ;
97+
98+ // Extract coverage from output
99+ let mut coverage = None ;
100+ for line in stderr. lines ( ) {
101+ if line. contains ( "Output:" ) && line. contains ( "Mb total" ) {
102+ // Parse: "Output: 2778.5 Mb total, 94.8% avg identity"
103+ if let Some ( parts) = line. split ( "Output:" ) . nth ( 1 ) {
104+ if let Some ( mb_str) = parts. trim ( ) . split_whitespace ( ) . next ( ) {
105+ coverage = mb_str. parse :: < f64 > ( ) . ok ( ) ;
106+ }
107+ }
108+ }
109+ }
110+
111+ assert ! (
112+ coverage. is_some( ) ,
113+ "Failed to parse coverage for -j {}" ,
114+ gap
115+ ) ;
116+
117+ let cov = coverage. unwrap ( ) ;
118+
119+ // For 99% identical yeast genomes, we should get high coverage
120+ // even with large gap values
121+ // With 8 genomes (~12 Mb each) and 56 pairs (8×7, excluding self-mappings),
122+ // we expect ~672 Mb total. Getting >600 Mb means good coverage.
123+ assert ! (
124+ cov > 600.0 ,
125+ "Coverage with -j {} is too low: {} Mb (expected >600 Mb for 8 yeast genomes, 56 pairs)" ,
126+ gap, cov
127+ ) ;
128+ }
129+ }
130+
131+ #[ test]
132+ fn test_nearest_neighbor_chaining ( ) {
133+ // Test that mappings chain to their nearest neighbors, not distant ones
134+
135+ // Create a simple test case with three collinear mappings:
136+ // Mapping A: query 0-1000, target 0-1000
137+ // Mapping B: query 1100-2100, target 1100-2100 (100bp gap from A)
138+ // Mapping C: query 5000-6000, target 5000-6000 (3900bp gap from B)
139+
140+ // With -j 10000, all three should be chainable, but:
141+ // - A should chain to B (nearest at 100bp)
142+ // - B should chain to C (next nearest at 3900bp)
143+ // Result: Single chain A-B-C
144+
145+ // This is harder to test without creating custom PAF input
146+ // TODO: Create a test PAF file with known structure
147+ }
148+
149+ #[ test]
150+ fn test_overlap_penalty ( ) {
151+ // Test that overlapping mappings are penalized correctly
152+
153+ // Create test case with:
154+ // Mapping A: query 0-1000, target 0-1000
155+ // Mapping B: query 900-1900, target 900-1900 (100bp overlap with A)
156+ // Mapping C: query 1100-2100, target 1100-2100 (100bp gap from A)
157+
158+ // With overlap penalty, A should prefer to chain to C (gap) over B (overlap)
159+
160+ // TODO: Create test PAF file with overlapping mappings
161+ }
0 commit comments