cucapra · johnpalsberg · May 7, 2025 · Jun 24, 2025 · Jun 24, 2025 · Jun 24, 2025
diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
@@ -0,0 +1,103 @@
+name: performance
+
+
+on:
+  push:
+    branches:
+      - john-performance
+
+jobs:
+  test-py:
+    name: test Python tools
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      # Set up and use uv.
+      - uses: actions/cache@v4
+        id: cache-uv
+        with:
+          path: ~/.cache/uv
+          key: ${{ runner.os }}-python-3.12-uv  # no matrix in this workflow; keep in sync with setup-python
+      - name: uv sync and activate
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          uv sync
+          echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV
+          echo "$PWD/.venv/bin" >> $GITHUB_PATH
+
+      # Set up for tests.
+      - name: Problem matcher
+        run: echo '::add-matcher::.github/tap-matcher.json'
+      - name: Fetch test data
+        run: make fetch SMALL=1
+
+      - name: Pull odgi container
+        run: |
+          docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
+          docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
+      - name: Install odgi alias
+        run: |
+          mkdir -p $HOME/.local/bin
+          cp .github/odgi.sh $HOME/.local/bin/odgi
+          chmod a+x $HOME/.local/bin/odgi
+
+      # Test slow_odgi.
+      - name: Set up for slow_odgi tests
+        run: make -C slow_odgi setup oracles SMALL=1
+      - name: Test slow_odgi
+        run: make -C slow_odgi test SMALL=1
+
+  test-flatgfa:
+    name: test FlatGFA
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - run: rustup toolchain install stable --no-self-update
+
+      # Install slow-odgi.
+      - uses: actions/cache@v4
+        id: cache-uv
+        with:
+          path: ~/.cache/uv
+          key: ${{ runner.os }}-python-3.12-uv  # no matrix in this workflow; keep in sync with setup-python
+      - name: uv sync and activate
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          uv sync
+          echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV
+          echo "$PWD/.venv/bin" >> $GITHUB_PATH
+
+      # Install odgi
+      - name: Pull odgi container
+        run: |
+          docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
+          docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
+      - name: Install odgi alias
+        run: |
+          mkdir -p $HOME/.local/bin
+          cp .github/odgi.sh $HOME/.local/bin/odgi
+          chmod a+x $HOME/.local/bin/odgi
+
+      # Install Turnt.
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install Turnt
+        run: pip install turnt
+      - name: Problem matcher
+        run: echo '::add-matcher::.github/tap-matcher.json'
+
+      # We need the test data.
+      - name: Fetch test data
+        run: make fetch SMALL=1
+
+      # Build and test.
+      - run: cargo build
+        working-directory: ./flatgfa
+      - run: cargo test
+        working-directory: ./flatgfa
+      - run: make test-flatgfa
diff --git a/bench/__init__.py b/bench/__init__.py
diff --git a/flatgfa-py/src/lib.rs b/flatgfa-py/src/lib.rs
@@ -1,5 +1,6 @@
 use flatgfa::namemap::NameMap;
 use flatgfa::ops::gaf::{ChunkEvent, GAFParser};
+use flatgfa::packedseq::decompress_into_buffer;
 use flatgfa::pool::Id;
 use flatgfa::{self, file, memfile, print, FlatGFA, Handle, HeapGFAStore};
 use memmap::Mmap;
@@ -352,7 +353,9 @@ impl PySegment {
         let gfa = self.0.store.view();
         let seg = &gfa.segs[self.0.id()];
         let seq = gfa.get_seq(seg);
-        PyBytes::new(py, seq)
+        let mut buffer: Vec<u8> = Vec::new();
+        decompress_into_buffer(seq, &mut buffer);
+        PyBytes::new(py, &buffer) // Note: data is decompressed here
     }
 
     /// The segment's name as declared in the GFA file, an `int`.

diff --git a/flatgfa-py/test/test_flatgfa.py b/flatgfa-py/test/test_flatgfa.py
@@ -104,7 +104,7 @@ def test_read_write_gfa(gfa, tmp_path):
             assert orig_f.read() == written_f.read()
 
     # You can also parse GFA text files from the filesystem.
-    new_gfa = flatgfa.parse(gfa_path)
+    new_gfa = flatgfa.parse(gfa_path)  # type: ignore
     assert len(new_gfa.segments) == len(gfa.segments)
 
 
@@ -114,7 +114,7 @@ def test_read_write_flatgfa(gfa, tmp_path):
     gfa.write_flatgfa(flatgfa_path)
 
     # And read them back, which should be very fast indeed.
-    new_gfa = flatgfa.load(flatgfa_path)
+    new_gfa = flatgfa.load(flatgfa_path)  # type: ignore
     assert len(new_gfa.segments) == len(gfa.segments)
 
 

diff --git a/flatgfa/src/flatgfa.rs b/flatgfa/src/flatgfa.rs
@@ -3,7 +3,10 @@
 use std::ops::Range;
 use std::str::FromStr;
 
-use crate::pool::{self, Id, Pool, Span, Store};
+use crate::{
+    packedseq::{PackedSeqView, SeqSpan},
+    pool::{self, Id, Pool, Span, Store},
+};
 use bstr::BStr;
 use num_enum::{IntoPrimitive, TryFromPrimitive};
 use zerocopy::{FromBytes, Immutable, IntoBytes};
@@ -75,7 +78,7 @@ pub struct Segment {
     pub name: usize,
 
     /// The base-pair sequence for the segment. This is a range in the `seq_data` pool.
-    pub seq: Span<u8>,
+    pub seq: SeqSpan,
 
     /// Segments can have optional fields. This is a range in the `optional_data` pool.
     pub optional: Span<u8>,
@@ -274,7 +277,7 @@ pub enum LineKind {
 /// This is mostly a `&[u8]`, but it also has a flag to indicate that we're
 /// representing the reverse-complement of the underlying sequence data.
 pub struct Sequence<'a> {
-    data: &'a [u8],
+    data: PackedSeqView<'a>,
     revcmp: bool,
 }
 
@@ -284,7 +287,7 @@ impl<'a> Sequence<'a> {
     /// `data` should be the "forward" version of the sequence. Use `ori` to
     /// indicate whether this `Sequence` represents the forward or backward
     /// (reverse complement) of that data.
-    pub fn new(data: &'a [u8], ori: Orientation) -> Self {
+    pub fn new(data: PackedSeqView<'a>, ori: Orientation) -> Self {
         Self {
             data,
             revcmp: ori == Orientation::Backward,
@@ -294,9 +297,9 @@ impl<'a> Sequence<'a> {
     /// Look up a single base pair in the sequence.
     pub fn index(&self, idx: usize) -> u8 {
         if self.revcmp {
-            nucleotide_complement(self.data[self.data.len() - idx - 1])
+            self.data.get(self.data.len() - idx - 1).complement().into()
         } else {
-            self.data[idx]
+            self.data.get(idx).into()
         }
     }
 
@@ -305,9 +308,10 @@ impl<'a> Sequence<'a> {
         let data = if self.revcmp {
             // The range starts at the end of the buffer:
             // [-----<end<******<start<------]
-            &self.data[(self.data.len() - range.end)..(self.data.len() - range.start)]
+            self.data
+                .range_slice((self.data.len() - range.end)..(self.data.len() - range.start))
         } else {
-            &self.data[range]
+            self.data.range_slice(range)
         };
         Self {
             data,
@@ -321,29 +325,14 @@ impl<'a> Sequence<'a> {
             self.data
                 .iter()
                 .rev()
-                .map(|&c| nucleotide_complement(c))
+                .map(|c| c.complement().into())
                 .collect()
         } else {
-            self.data.to_vec()
+            self.data.iter().map(|c| c.into()).collect()
         }
     }
 }
 
-/// Given an ASCII character for a nucleotide, get its complement.
-fn nucleotide_complement(c: u8) -> u8 {
-    match c {
-        b'A' => b'T',
-        b'T' => b'A',
-        b'C' => b'G',
-        b'G' => b'C',
-        b'a' => b't',
-        b't' => b'a',
-        b'c' => b'g',
-        b'g' => b'c',
-        x => x,
-    }
-}
-
 impl std::fmt::Display for Sequence<'_> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         if self.revcmp {
@@ -354,16 +343,16 @@ impl std::fmt::Display for Sequence<'_> {
             let bytes = self.to_vec();
             write!(f, "{}", BStr::new(&bytes))?;
         } else {
-            write!(f, "{}", BStr::new(self.data))?;
+            write!(f, "{}", self.data)?;
         }
         Ok(())
     }
 }
 
 impl<'a> FlatGFA<'a> {
     /// Get the base-pair sequence for a segment.
-    pub fn get_seq(&self, seg: &Segment) -> &BStr {
-        self.seq_data[seg.seq].as_ref()
+    pub fn get_seq(&self, seg: &Segment) -> PackedSeqView<'_> {
+        PackedSeqView::from_pool(self.seq_data, seg.seq)
     }
 
     /// Get the sequence that a *handle* refers to.
@@ -372,7 +361,7 @@ impl<'a> FlatGFA<'a> {
     /// gets the sequence in the orientation specified by the handle.
     pub fn get_seq_oriented(&self, handle: Handle) -> Sequence<'_> {
         let seg = self.get_handle_seg(handle);
-        let seq_data = self.seq_data[seg.seq].as_ref();
+        let seq_data = PackedSeqView::from_pool(self.seq_data, seg.seq);
         Sequence::new(seq_data, handle.orient())
     }
 
@@ -446,11 +435,62 @@ impl<'a, P: StoreFamily<'a>> GFAStore<'a, P> {
         self.header.add_slice(version);
     }
 
-    /// Add a new segment to the GFA file.
-    pub fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Id<Segment> {
+    /// Add a new segment to the GFA file, compressing the data in `seq`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `seq` holds a byte other than `A`, `C`, `T`, `G`, or `N`,
+    /// or if the resulting span length does not fit in a `u16`.
+    pub fn compress_and_add_seg(
+        &mut self,
+        name: usize,
+        seq: &[u8],
+        optional: &[u8],
+    ) -> Id<Segment> {
+        // Map an uppercase ASCII nucleotide to its 4-bit code.
+        fn encode(base: u8) -> u8 {
+            match base {
+                b'A' => 0,
+                b'C' => 1,
+                b'T' => 2,
+                b'G' => 3,
+                b'N' => 4,
+                _ => panic!("Not a Nucleotide!"),
+            }
+        }
+        // One slot per base is an upper bound; packing stores two bases per byte.
+        self.seq_data.reserve(seq.len());
+        let start_id = self.seq_data.next_id();
+        for pair in seq.chunks(2) {
+            // Low nibble holds the earlier base; a lone last base leaves the high one zero.
+            let high = pair.get(1).map_or(0, |&base| encode(base));
+            self.seq_data.add(encode(pair[0]) | (high << 4));
+        }
+        let end_id = self.seq_data.next_id();
+        let start = SeqSpan::to_logical(start_id.index(), false);
+        // Guard the empty case: `end_id.index() - 1` underflows on an empty pool.
+        let end = if seq.is_empty() {
+            start
+        } else {
+            SeqSpan::to_logical(end_id.index() - 1, seq.len() % 2 == 0) + 1
+        };
+        // An `as u16` cast would silently truncate spans longer than `u16::MAX`.
+        let len = u16::try_from(end - start).expect("sequence too long for SeqSpan");
+        self.segs.add(Segment {
+            name,
+            seq: SeqSpan { start, len },
+            optional: self.optional_data.add_slice(optional),
+        })
+    }
+
+    /// Add a new segment from already-packed data. NOTE(review): assumes `seq.data` is non-empty.
+    pub fn add_seg(&mut self, name: usize, seq: PackedSeqView, optional: &[u8]) -> Id<Segment> {
+        let byte_span = self.seq_data.add_slice(seq.data);
+        let start = SeqSpan::to_logical(byte_span.start.index(), seq.high_nibble_begin);
+        let end = SeqSpan::to_logical(byte_span.end.index() - 1, seq.high_nibble_end) + 1;
         self.segs.add(Segment {
             name,
-            seq: self.seq_data.add_slice(seq),
+            seq: (start as usize..end as usize).into(),
             optional: self.optional_data.add_slice(optional),
         })
     }

diff --git a/flatgfa/src/ops/chop.rs b/flatgfa/src/ops/chop.rs
@@ -1,3 +1,5 @@
+use std::ops::Range;
+
 use crate::flatgfa::{self, Handle, Link, Orientation, Path, Segment};
 use crate::pool::{Id, Span, Store};
 use crate::{GFAStore, HeapFamily};
@@ -34,17 +36,22 @@ pub fn chop(gfa: &flatgfa::FlatGFA, max_size: usize, incl_links: bool) -> flatgf
             max_node_id += 1;
             seg_map.push(Span::new(id, flat.segs.next_id()));
         } else {
-            let seq_end = seg.seq.end;
-            let mut offset = seg.seq.start.index();
+            let seq_range: Range<usize> = seg.seq.into();
+            let seq_end = seq_range.end;
+            let mut offset = seq_range.start;
             let segs_start = flat.segs.next_id();
             // Could also generate end_id by setting it equal to the start_id and
             // updating it for each segment that is added - only benefits us if we
             // don't unroll the last iteration of this loop
-            while offset < seq_end.index() - max_size {
+            while offset < seq_end - max_size {
                 // Generate a new segment of length c
                 flat.segs.add(Segment {
                     name: max_node_id,
-                    seq: Span::new(Id::new(offset), Id::new(offset + max_size)),
+                    seq: std::ops::Range {
+                        start: offset,
+                        end: offset + max_size,
+                    }
+                    .into(),
                     optional: Span::new_empty(),
                 });
                 offset += max_size;
@@ -53,7 +60,11 @@ pub fn chop(gfa: &flatgfa::FlatGFA, max_size: usize, incl_links: bool) -> flatgf
             // Generate the last segment
             flat.segs.add(Segment {
                 name: max_node_id,
-                seq: Span::new(Id::new(offset), seq_end),
+                seq: std::ops::Range {
+                    start: offset,
+                    end: seq_end,
+                }
+                .into(),
                 optional: Span::new_empty(),
             });
             max_node_id += 1;

diff --git a/flatgfa/src/ops/extract.rs b/flatgfa/src/ops/extract.rs
@@ -37,6 +37,7 @@ impl<'a> SubgraphBuilder<'a> {
     fn include_seg(&mut self, seg_id: Id<Segment>) {
         let seg = &self.old.segs[seg_id];
         let new_seg_id = self.store.add_seg(
+            // Note for reviewer: change made here
             seg.name,
             self.old.get_seq(seg),
             self.old.get_optional_data(seg),

diff --git a/flatgfa/src/ops/gaf.rs b/flatgfa/src/ops/gaf.rs
@@ -168,7 +168,6 @@ impl ChunkEvent {
         let seg = gfa.segs[self.handle.segment()];
         let seg_name = seg.name;
         let mut result = String::new();
-
         match self.range {
             ChunkRange::Partial(start, end) => {
                 result.push_str(&format!(