pangenome
diff --git a/‎Cargo.toml‎
Lines changed: 15 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎docs/sgd_reverse_handle_bug.md‎
Lines changed: 68 additions & 0 deletions b/‎docs/sgd_reverse_handle_bug.md‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎src/bidirected_gfa_writer.rs‎
Lines changed: 111 additions & 11 deletions b/‎src/bidirected_gfa_writer.rs‎
Lines changed: 111 additions & 11 deletions
@@ -55,6 +55,21 @@ path = "src/bin/test_handlegraph_traits.rs"
 required-features = ["debug-bins"]
 test = false
 
+[[bin]]
+name = "measure_layout_quality"
+path = "src/bin/measure_layout_quality.rs"
+test = false
+
+[[bin]]
+name = "sgd_diagnostics"
+path = "src/bin/sgd_diagnostics.rs"
+test = false
+
+[[bin]]
+name = "debug_sgd_simple"
+path = "src/bin/debug_sgd_simple.rs"
+test = false
+
 [features]
 debug-bins = []
 default = ["use-allwave"]
 
@@ -0,0 +1,68 @@
+# SGD Path Position Calculation Bug - Reverse Handles
+
+## Problem
+
+Path-guided SGD produces suboptimal layouts containing "bubbles": nodes that are adjacent in a path end up far apart in the layout. A topological sort would eliminate these, which indicates that SGD is not calculating correct positions.
+
+## Suspected Root Cause
+
+**Location**: `src/path_sgd_exact.rs` lines 528-574
+
+The bug: When calculating `pos_in_path_b`, the code searches for `handle_b` by comparing full handles:
+
+```rust
+let pos_in_path_b = {
+    let mut pos = 0;
+    for (pid, h) in &path_index_steps {
+        if *pid == path_id {
+            if *h == handle_b {  // ← Compares node ID + orientation
+                break;
+            }
+            if let Some(node) = graph_clone.nodes.get(&h.node_id()) {
+                pos += node.sequence.len();
+            }
+        }
+    }
+    pos
+};
+```
+
+**The Problem**: The loop stops at the FIRST occurrence of `handle_b` in the path. If a node appears multiple times in the path (valid for structural variation) and the step we actually want is a later occurrence (a different rank), the accumulated offset is the position of the wrong step.
+
+**However**, an attempted fix (using `b_rank` instead of the first matching handle) made the layout WORSE, which suggests the current implementation is closer to correct and that the real problem may lie elsewhere.
+
+## Alternative Hypothesis
+
+The real issue may be that **path positions are calculated correctly**, but the SGD is not converging because:
+
+1. Initial positions (seeded by node ID order) create massive separations
+2. Not enough iterations to overcome poor initialization
+3. Conflicting constraints from paths with mixed orientations
+
+## Diagnostic Evidence
+
+Created `sgd_diagnostics` tool showing:
+- Nodes that are adjacent in a path (1-40 bp apart) are placed 100x-3976x farther apart in the final layout
+- Example: nodes 1 and 205 are 1 bp apart in their path, but 845 positions apart in the layout
+- ALL paths show this problem (not just reverse-complement (RC) paths)
+
+## Next Steps
+
+1. Create focused tests for simple cases (chains with RC edges)
+2. Verify SGD can achieve perfect layout on trivial graphs
+3. If these tests fail, the bug is confirmed to be in either the position calculation or SGD convergence
+4. Fix and verify
+
+## Test Cases Needed
+
+1. **Simple chain forward**: `1+ -> 2+ -> 3+` should place nodes sequentially
+2. **Simple chain reverse**: `1- -> 2- -> 3-` should place nodes sequentially
+3. **Mixed orientation chain**: `1+ -> 2- -> 3+` should place nodes sequentially
+4. **Simple bubble with RC** (reverse-complement traversal):
+   ```
+   Path A: 1+ -> 2+ -> 3+
+   Path B: 1+ -> 2- -> 3+  (traverses node 2 in reverse)
+   ```
+5. **Repeated node**: `1+ -> 2+ -> 1+` should handle revisiting correctly
+
+Each test should verify SGD produces perfect ordering (no displacement).
@@ -15,6 +15,7 @@ impl SeqRush {
         iterative_groom: Option<usize>,
         odgi_style_groom: bool,
         sgd_sort: bool,
+        sgd_iter_max: u64,
         threads: usize,
         verbose: bool,
     ) -> Result<(), Box<dyn std::error::Error>> {
@@ -34,10 +35,25 @@ impl SeqRush {
 
         if verbose {
             eprintln!("[bidirected_gfa] BidirectedGraph has {} nodes (sorted)", bi_graph.nodes.len());
-            eprintln!("[bidirected_gfa] After build - has node 1? {}, has node 2? {}", 
+            eprintln!("[bidirected_gfa] After build - has node 1? {}, has node 2? {}",
                 bi_graph.nodes.contains_key(&1), bi_graph.nodes.contains_key(&2));
         }
-        
+
+        // Compact BEFORE sorting if using SGD (to match ODGI/seqwish behavior)
+        if sgd_sort && !no_compact {
+            if verbose {
+                eprintln!("[bidirected_gfa] Compacting graph BEFORE SGD (like seqwish does)...");
+                let nodes_before = bi_graph.nodes.len();
+                bi_graph.compact();
+                let nodes_after = bi_graph.nodes.len();
+                eprintln!("[bidirected_gfa] Compacted from {} to {} nodes", nodes_before, nodes_after);
+                bi_graph.renumber_nodes_sequentially();
+            } else {
+                bi_graph.compact();
+                bi_graph.renumber_nodes_sequentially();
+            }
+        }
+
         // Apply grooming and/or sorting strategies
         if sgd_sort {
             // Path-guided SGD sorting (like odgi sort -p Ygs)
@@ -48,19 +64,67 @@ impl SeqRush {
 
             // Only apply SGD if we have nodes
             if !bi_graph.nodes.is_empty() {
-                // Run exact ODGI path-guided SGD with same parameters as odgi sort -p Ygs
+                // Calculate parameters from graph structure (like ODGI does)
+                use crate::path_sgd_exact::XPIndex;
+                let path_index = XPIndex::from_graph(&bi_graph);
+
+                let mut sum_path_step_count = 0u64;
+                let mut max_path_step_count = 0usize;
+                let mut max_path_length = 0usize;
+
+                for path_id in 0..bi_graph.paths.len() {
+                    let step_count = path_index.get_path_step_count(path_id);
+                    sum_path_step_count += step_count as u64;
+                    max_path_step_count = max_path_step_count.max(step_count);
+                    max_path_length = max_path_length.max(path_index.get_path_length(path_id));
+                }
+
+                // Run exact ODGI path-guided SGD with calculated parameters
                 let mut params = PathSGDParams::default();
+                params.iter_max = sgd_iter_max;
                 params.nthreads = threads;
                 params.progress = verbose;
+                params.min_term_updates = sum_path_step_count;
+                params.eta_max = (max_path_step_count * max_path_step_count) as f64;
+                params.space = max_path_length as u64;
+
+                // Calculate space_quantization_step dynamically like ODGI does
+                // This ensures we have approximately 100-102 zipf distributions instead of 488
+                const MAX_NUMBER_OF_ZIPF_DISTRIBUTIONS: u64 = 100;
+                let space_max = params.space_max;
+                // ODGI uses max(space_max + 1, MAX_NUMBER_OF_ZIPF_DISTRIBUTIONS)
+                let max_num_distributions = (space_max + 1).max(MAX_NUMBER_OF_ZIPF_DISTRIBUTIONS);
+
+                if params.space > space_max && max_num_distributions > space_max {
+                    // Dynamic calculation to achieve approximately MAX_NUMBER_OF_ZIPF_DISTRIBUTIONS
+                    params.space_quantization_step = 2u64.max(
+                        ((params.space - space_max) as f64 / (max_num_distributions - space_max) as f64).ceil() as u64
+                    );
+                } else {
+                    // Fallback to ODGI's default
+                    params.space_quantization_step = 100;
+                }
 
                 if verbose {
+                    eprintln!("[bidirected_gfa] Calculated SGD parameters:");
+                    eprintln!("  sum_path_step_count: {}", sum_path_step_count);
+                    eprintln!("  max_path_step_count: {}", max_path_step_count);
+                    eprintln!("  max_path_length: {}", max_path_length);
+                    eprintln!("  min_term_updates: {}", params.min_term_updates);
+                    eprintln!("  eta_max: {}", params.eta_max);
+                    eprintln!("  space: {}", params.space);
+                    eprintln!("  space_max: {}", params.space_max);
+                    eprintln!("  space_quantization_step: {} (dynamically calculated)", params.space_quantization_step);
                     eprintln!("[bidirected_gfa] Running exact ODGI path_linear_sgd with {} iterations", params.iter_max);
                 }
 
+                // Apply the Ygs pipeline stages: Y (SGD) + g (groom); the s (topological sort) stage is deliberately skipped (see NOTE below)
+
+                // Step 1: Y - Path-guided SGD
                 let sorted_handles = path_sgd_sort(&bi_graph, params);
 
                 if verbose {
-                    eprintln!("[bidirected_gfa] Path SGD produced ordering of {} nodes", sorted_handles.len());
+                    eprintln!("[bidirected_gfa] Step 1/3: Path SGD produced ordering of {} nodes", sorted_handles.len());
                 }
 
                 // Convert to forward handles only for node ordering
@@ -71,14 +135,49 @@ impl SeqRush {
                 // Apply the SGD ordering
                 bi_graph.apply_ordering(forward_order, verbose);
 
-                // Then apply grooming (degroom) - only flip orientations, don't reorder!
+                // Validate after SGD ordering
+                eprintln!("[VALIDATION] After SGD ordering:");
+                bi_graph.validate_paths("after SGD ordering");
+
+                // DEBUG: Write stage 1 output
+                if verbose {
+                    let stage1_file = std::fs::File::create("stage1_seqrush_Y.gfa").unwrap();
+                    let mut stage1_writer = std::io::BufWriter::new(stage1_file);
+                    bi_graph.write_gfa(&mut stage1_writer).unwrap();
+                    eprintln!("[bidirected_gfa] DEBUG: Written stage1_seqrush_Y.gfa");
+                }
+
+                // Step 2: g - Grooming
                 if verbose {
-                    eprintln!("[bidirected_gfa] Applying grooming (orientation flips only) after SGD...");
+                    eprintln!("[bidirected_gfa] Step 2/3: Applying grooming after SGD...");
                 }
-                let groomed_order = bi_graph.groom(true, false);
-                bi_graph.apply_grooming_with_reorder(groomed_order, false, false);
+                let groomed_order = bi_graph.groom(true, verbose);
+                bi_graph.apply_grooming_with_reorder(groomed_order, false, verbose);
+
+                // Validate after grooming
+                eprintln!("[VALIDATION] After grooming:");
+                bi_graph.validate_paths("after grooming");
+
+                // DEBUG: Write stage 2 output
+                if verbose {
+                    let stage2_file = std::fs::File::create("stage2_seqrush_Yg.gfa").unwrap();
+                    let mut stage2_writer = std::io::BufWriter::new(stage2_file);
+                    bi_graph.write_gfa(&mut stage2_writer).unwrap();
+                    eprintln!("[bidirected_gfa] DEBUG: Written stage2_seqrush_Yg.gfa");
+                }
+
+                // CRITICAL: Renumber nodes sequentially after grooming to preserve Y+g ordering
+                if verbose {
+                    eprintln!("[bidirected_gfa] Renumbering nodes sequentially to preserve Y+g order...");
+                }
+                bi_graph.renumber_nodes_sequentially();
 
-                // NO topological sort after SGD! That would destroy the SGD ordering
+                // NOTE: Skipping topological sort reordering because it would destroy the Y+g layout
+                // The topological sort was causing massive jumps in node connectivity because it
+                // traverses the graph topology (following edges) and then renumbers nodes based on
+                // that traversal, which doesn't respect the linear Y+g layout.
+                // TODO: Investigate if we need a different kind of topological adjustment that
+                // only fixes orientations or local ordering without global reordering
             } else if verbose {
                 eprintln!("[bidirected_gfa] Skipping SGD - no nodes in graph yet");
             }
@@ -114,8 +213,9 @@ impl SeqRush {
         }
         // Note: Regular sorting is already done in build_bidirected_graph
 
-        // Apply compaction if requested
-        if !no_compact {
+        // Apply compaction if requested (but skip if we already compacted before SGD)
+        let already_compacted = sgd_sort && !no_compact;
+        if !no_compact && !already_compacted {
             if verbose {
                 eprintln!("[bidirected_gfa] Applying compaction...");
                 let nodes_before = bi_graph.nodes.len();