diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml new file mode 100644 index 00000000..619afd5e --- /dev/null +++ b/.github/workflows/performance.yml @@ -0,0 +1,103 @@ +name: performance + + +on: + push: + branches: + - john-performance + +jobs: + test-py: + name: test Python tools + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + # Set up and use uv. + - uses: actions/cache@v4 + id: cache-uv + with: + path: ~/.cache/uv + key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv + - name: uv sync and activate + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + uv sync + echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV + echo "$PWD/.venv/bin" >> $GITHUB_PATH + + # Set up for tests. + - name: Problem matcher + run: echo '::add-matcher::.github/tap-matcher.json' + - name: Fetch test data + run: make fetch SMALL=1 + + - name: Pull odgi container + run: | + docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 + docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi + - name: Install odgi alias + run: | + mkdir -p $HOME/.local/bin + cp .github/odgi.sh $HOME/.local/bin/odgi + chmod a+x $HOME/.local/bin/odgi + + # Test slow_odgi. + - name: Set up for slow_odgi tests + run: make -C slow_odgi setup oracles SMALL=1 + - name: Test slow_odgi + run: make -C slow_odgi test SMALL=1 + + test-flatgfa: + name: test FlatGFA + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: rustup toolchain install stable --no-self-update + + # Install slow-odgi. 
+ - uses: actions/cache@v4 + id: cache-uv + with: + path: ~/.cache/uv + key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv + - name: uv sync and activate + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + uv sync + echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV + echo "$PWD/.venv/bin" >> $GITHUB_PATH + + # Install odgi + - name: Pull odgi container + run: | + docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 + docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi + - name: Install odgi alias + run: | + mkdir -p $HOME/.local/bin + cp .github/odgi.sh $HOME/.local/bin/odgi + chmod a+x $HOME/.local/bin/odgi + + # Install Turnt. + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install Turnt + run: pip install turnt + - name: Problem matcher + run: echo '::add-matcher::.github/tap-matcher.json' + + # We need the test data. + - name: Fetch test data + run: make fetch SMALL=1 + + # Build and test. + - run: cargo build + working-directory: ./flatgfa + - run: cargo test + working-directory: ./flatgfa + - run: make test-flatgfa diff --git a/bench/__init__.py b/bench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/flatgfa-py/src/lib.rs b/flatgfa-py/src/lib.rs index 17fc4dac..b6a93170 100644 --- a/flatgfa-py/src/lib.rs +++ b/flatgfa-py/src/lib.rs @@ -1,5 +1,6 @@ use flatgfa::namemap::NameMap; use flatgfa::ops::gaf::{ChunkEvent, GAFParser}; +use flatgfa::packedseq::decompress_into_buffer; use flatgfa::pool::Id; use flatgfa::{self, file, memfile, print, FlatGFA, Handle, HeapGFAStore}; use memmap::Mmap; @@ -352,7 +353,9 @@ impl PySegment { let gfa = self.0.store.view(); let seg = &gfa.segs[self.0.id()]; let seq = gfa.get_seq(seg); - PyBytes::new(py, seq) + let mut buffer: Vec = Vec::new(); + decompress_into_buffer(seq, &mut buffer); + PyBytes::new(py, &buffer) // Note: data is decompressed here } /// The segment's name as declared in the GFA file, an `int`. 
diff --git a/flatgfa-py/test/test_flatgfa.py b/flatgfa-py/test/test_flatgfa.py index 5d5ca4fd..044c46d6 100644 --- a/flatgfa-py/test/test_flatgfa.py +++ b/flatgfa-py/test/test_flatgfa.py @@ -104,7 +104,7 @@ def test_read_write_gfa(gfa, tmp_path): assert orig_f.read() == written_f.read() # You can also parse GFA text files from the filesystem. - new_gfa = flatgfa.parse(gfa_path) + new_gfa = flatgfa.parse(gfa_path) # type: ignore assert len(new_gfa.segments) == len(gfa.segments) @@ -114,7 +114,7 @@ def test_read_write_flatgfa(gfa, tmp_path): gfa.write_flatgfa(flatgfa_path) # And read them back, which should be very fast indeed. - new_gfa = flatgfa.load(flatgfa_path) + new_gfa = flatgfa.load(flatgfa_path) # type: ignore assert len(new_gfa.segments) == len(gfa.segments) diff --git a/flatgfa/src/flatgfa.rs b/flatgfa/src/flatgfa.rs index b27e0872..2cecb82b 100644 --- a/flatgfa/src/flatgfa.rs +++ b/flatgfa/src/flatgfa.rs @@ -3,7 +3,10 @@ use std::ops::Range; use std::str::FromStr; -use crate::pool::{self, Id, Pool, Span, Store}; +use crate::{ + packedseq::{PackedSeqView, SeqSpan}, + pool::{self, Id, Pool, Span, Store}, +}; use bstr::BStr; use num_enum::{IntoPrimitive, TryFromPrimitive}; use zerocopy::{FromBytes, Immutable, IntoBytes}; @@ -75,7 +78,7 @@ pub struct Segment { pub name: usize, /// The base-pair sequence for the segment. This is a range in the `seq_data` pool. - pub seq: Span, + pub seq: SeqSpan, /// Segments can have optional fields. This is a range in the `optional_data` pool. pub optional: Span, @@ -274,7 +277,7 @@ pub enum LineKind { /// This is mostly a `&[u8]`, but it also has a flag to indicate that we're /// representing the reverse-complement of the underlying sequence data. pub struct Sequence<'a> { - data: &'a [u8], + data: PackedSeqView<'a>, revcmp: bool, } @@ -284,7 +287,7 @@ impl<'a> Sequence<'a> { /// `data` should be the "forward" version of the sequence. 
Use `ori` to /// indicate whether this `Sequence` represents the forward or backward /// (reverse complement) of that data. - pub fn new(data: &'a [u8], ori: Orientation) -> Self { + pub fn new(data: PackedSeqView<'a>, ori: Orientation) -> Self { Self { data, revcmp: ori == Orientation::Backward, @@ -294,9 +297,9 @@ impl<'a> Sequence<'a> { /// Look up a single base pair in the sequence. pub fn index(&self, idx: usize) -> u8 { if self.revcmp { - nucleotide_complement(self.data[self.data.len() - idx - 1]) + self.data.get(self.data.len() - idx - 1).complement().into() } else { - self.data[idx] + self.data.get(idx).into() } } @@ -305,9 +308,10 @@ impl<'a> Sequence<'a> { let data = if self.revcmp { // The range starts at the end of the buffer: // [----- Sequence<'a> { self.data .iter() .rev() - .map(|&c| nucleotide_complement(c)) + .map(|c| c.complement().into()) .collect() } else { - self.data.to_vec() + self.data.iter().map(|c| c.into()).collect() } } } -/// Given an ASCII character for a nucleotide, get its complement. -fn nucleotide_complement(c: u8) -> u8 { - match c { - b'A' => b'T', - b'T' => b'A', - b'C' => b'G', - b'G' => b'C', - b'a' => b't', - b't' => b'a', - b'c' => b'g', - b'g' => b'c', - x => x, - } -} - impl std::fmt::Display for Sequence<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.revcmp { @@ -354,7 +343,7 @@ impl std::fmt::Display for Sequence<'_> { let bytes = self.to_vec(); write!(f, "{}", BStr::new(&bytes))?; } else { - write!(f, "{}", BStr::new(self.data))?; + write!(f, "{}", self.data)?; } Ok(()) } @@ -362,8 +351,8 @@ impl std::fmt::Display for Sequence<'_> { impl<'a> FlatGFA<'a> { /// Get the base-pair sequence for a segment. - pub fn get_seq(&self, seg: &Segment) -> &BStr { - self.seq_data[seg.seq].as_ref() + pub fn get_seq(&self, seg: &Segment) -> PackedSeqView<'_> { + PackedSeqView::from_pool(self.seq_data, seg.seq) } /// Get the sequence that a *handle* refers to. 
@@ -372,7 +361,7 @@ impl<'a> FlatGFA<'a> { /// gets the sequence in the orientation specified by the handle. pub fn get_seq_oriented(&self, handle: Handle) -> Sequence<'_> { let seg = self.get_handle_seg(handle); - let seq_data = self.seq_data[seg.seq].as_ref(); + let seq_data = PackedSeqView::from_pool(self.seq_data, seg.seq); Sequence::new(seq_data, handle.orient()) } @@ -446,11 +435,62 @@ impl<'a, P: StoreFamily<'a>> GFAStore<'a, P> { self.header.add_slice(version); } - /// Add a new segment to the GFA file. - pub fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Id { + /// Add a new segment to the GFA file, compressing the data in `seq` + pub fn compress_and_add_seg( + &mut self, + name: usize, + seq: &[u8], + optional: &[u8], + ) -> Id { + self.seq_data.reserve(seq.len()); + let mut high_nibble_end = true; + let mut combined_item = 0; + let start_id = self.seq_data.next_id(); + for i in 0..seq.len() { + let item = seq[i]; + let converted: u8 = match item { + 65 => 0, + 67 => 1, + 84 => 2, + 71 => 3, + 78 => 4, + _ => panic!("Not a Nucleotide!"), + }; + if high_nibble_end { + high_nibble_end = false; + if i == seq.len() - 1 { + self.seq_data.add(converted); + break; + } + combined_item = converted; + } else { + combined_item |= converted << 4; + self.seq_data.add(combined_item); + high_nibble_end = true; + } + } + let end_id = self.seq_data.next_id(); + let byte_span = Span::new(start_id, end_id); + let start = SeqSpan::to_logical(byte_span.start.index(), false); + let end = SeqSpan::to_logical(byte_span.end.index() - 1, high_nibble_end) + 1; + self.segs.add(Segment { + name, + seq: SeqSpan { + start, + len: (end - start) as u16, + }, + optional: self.optional_data.add_slice(optional), + }) + } + + /// Add a new segment with already compressed data + pub fn add_seg(&mut self, name: usize, seq: PackedSeqView, optional: &[u8]) -> Id { + let byte_span = self.seq_data.add_slice(seq.data); + let start = SeqSpan::to_logical(byte_span.start.index(), 
seq.high_nibble_begin); + let end = SeqSpan::to_logical(byte_span.end.index() - 1, seq.high_nibble_end) + 1; self.segs.add(Segment { name, - seq: self.seq_data.add_slice(seq), + seq: (start as usize..end as usize).into(), optional: self.optional_data.add_slice(optional), }) } diff --git a/flatgfa/src/ops/chop.rs b/flatgfa/src/ops/chop.rs index ce095421..1cf91b3f 100644 --- a/flatgfa/src/ops/chop.rs +++ b/flatgfa/src/ops/chop.rs @@ -1,3 +1,5 @@ +use std::ops::Range; + use crate::flatgfa::{self, Handle, Link, Orientation, Path, Segment}; use crate::pool::{Id, Span, Store}; use crate::{GFAStore, HeapFamily}; @@ -34,17 +36,22 @@ pub fn chop(gfa: &flatgfa::FlatGFA, max_size: usize, incl_links: bool) -> flatgf max_node_id += 1; seg_map.push(Span::new(id, flat.segs.next_id())); } else { - let seq_end = seg.seq.end; - let mut offset = seg.seq.start.index(); + let seq_range: Range = seg.seq.into(); + let seq_end = seq_range.end; + let mut offset = seq_range.start; let segs_start = flat.segs.next_id(); // Could also generate end_id by setting it equal to the start_id and // updating it for each segment that is added - only benefits us if we // don't unroll the last iteration of this loop - while offset < seq_end.index() - max_size { + while offset < seq_end - max_size { // Generate a new segment of length c flat.segs.add(Segment { name: max_node_id, - seq: Span::new(Id::new(offset), Id::new(offset + max_size)), + seq: std::ops::Range { + start: offset, + end: offset + max_size, + } + .into(), optional: Span::new_empty(), }); offset += max_size; @@ -53,7 +60,11 @@ pub fn chop(gfa: &flatgfa::FlatGFA, max_size: usize, incl_links: bool) -> flatgf // Generate the last segment flat.segs.add(Segment { name: max_node_id, - seq: Span::new(Id::new(offset), seq_end), + seq: std::ops::Range { + start: offset, + end: seq_end, + } + .into(), optional: Span::new_empty(), }); max_node_id += 1; diff --git a/flatgfa/src/ops/extract.rs b/flatgfa/src/ops/extract.rs index 3619d514..7c7cba7b 
100644 --- a/flatgfa/src/ops/extract.rs +++ b/flatgfa/src/ops/extract.rs @@ -37,6 +37,7 @@ impl<'a> SubgraphBuilder<'a> { fn include_seg(&mut self, seg_id: Id) { let seg = &self.old.segs[seg_id]; let new_seg_id = self.store.add_seg( + // Note for reviewer: `get_seq` now returns a `PackedSeqView`, matching the new `add_seg` signature seg.name, self.old.get_seq(seg), self.old.get_optional_data(seg), diff --git a/flatgfa/src/ops/gaf.rs b/flatgfa/src/ops/gaf.rs index cc7a6889..85b72fc4 100644 --- a/flatgfa/src/ops/gaf.rs +++ b/flatgfa/src/ops/gaf.rs @@ -168,7 +168,6 @@ impl ChunkEvent { let seg = gfa.segs[self.handle.segment()]; let seg_name = seg.name; let mut result = String::new(); - match self.range { ChunkRange::Partial(start, end) => { result.push_str(&format!( diff --git a/flatgfa/src/packedseq.rs b/flatgfa/src/packedseq.rs index a3b3d54e..a0877821 100644 --- a/flatgfa/src/packedseq.rs +++ b/flatgfa/src/packedseq.rs @@ -2,7 +2,9 @@ use crate::file::*; use crate::memfile::map_new_file; +use crate::pool::Pool; use std::fmt::{self, Write}; +use std::ops::Range; use zerocopy::*; const MAGIC_NUMBER: u64 = 0x12; @@ -13,6 +15,7 @@ pub enum Nucleotide { C, T, G, + N, } impl From for Nucleotide { @@ -22,6 +25,7 @@ impl From for Nucleotide { 'C' => Self::C, 'T' => Self::T, 'G' => Self::G, + 'N' => Self::N, _ => panic!("Not a Nucleotide!"), } } @@ -34,6 +38,7 @@ impl From for Nucleotide { 1 => Self::C, 2 => Self::T, 3 => Self::G, + 4 => Self::N, _ => panic!("Not a Nucleotide!"), } } @@ -46,6 +51,7 @@ impl From for u8 { Nucleotide::C => 1, Nucleotide::T => 2, Nucleotide::G => 3, + Nucleotide::N => 4, } } } @@ -57,6 +63,7 @@ impl From for char { Nucleotide::C => 'C', Nucleotide::G => 'G', Nucleotide::T => 'T', + Nucleotide::N => 'N', } } } @@ -70,6 +77,7 @@ impl Nucleotide { b'C' => Self::C, b'T' => Self::T, b'G' => Self::G, + b'N' => Self::N, _ => panic!("Not a Nucleotide!"), } } @@ -81,6 +89,17 @@ impl Nucleotide { Nucleotide::C => b'C', Nucleotide::T => b'T', Nucleotide::G => b'G', + Nucleotide::N => b'N', + } + } + + pub fn 
complement(&self) -> Nucleotide { + match self { + Nucleotide::A => Nucleotide::T, + Nucleotide::T => Nucleotide::A, + Nucleotide::C => Nucleotide::G, + Nucleotide::G => Nucleotide::C, + Nucleotide::N => Nucleotide::N, } } } @@ -90,19 +109,23 @@ impl Nucleotide { /// /// pub struct PackedSeqStore { /// A vector that stores a compressed encoding of this PackedSeqStore's sequence - data: Vec, + pub data: Vec, /// True if the final base pair in the sequence is stored at a /// high nibble - high_nibble_end: bool, + pub high_nibble_end: bool, } pub struct PackedSeqView<'a> { - data: &'a [u8], + pub data: &'a [u8], /// True if the final base pair in the sequence is stored at a /// high nibble - high_nibble_end: bool, + pub high_nibble_end: bool, + + /// True if the first base pair in the sequence is stored at a + /// high nibble + pub high_nibble_begin: bool, } #[derive(FromBytes, IntoBytes, Debug, KnownLayout, Immutable)] @@ -111,6 +134,7 @@ pub struct PackedToc { magic: u64, data: Size, high_nibble_end: u8, + high_nibble_begin: u8, } impl PackedToc { @@ -118,6 +142,7 @@ size_of::() + self.data.bytes::() } + /// Returns a PackedToc with data corresponding to `seq` fn full(seq: &PackedSeqView) -> Self { Self { magic: MAGIC_NUMBER, @@ -126,10 +151,12 @@ capacity: seq.data.len(), }, high_nibble_end: if seq.high_nibble_end { 1u8 } else { 0u8 }, + high_nibble_begin: if seq.high_nibble_begin { 1u8 } else { 0u8 }, } } - fn get_nibble_end(nibble: u8) -> bool { + /// Returns whether `nibble` represents a high or low nibble + fn get_nibble_bool(nibble: u8) -> bool { match nibble { 0u8 => false, 1u8 => true, @@ -137,6 +164,8 @@ } } + /// Given a reference to a memory-mapped file `data` containing a compressed + /// sequence of nucleotides, return a PackedToc along with the rest of the data fn read(data: &[u8]) -> (&Self, &[u8]) { let toc = PackedToc::ref_from_prefix(data).unwrap().0; let rest = &data[size_of::()..]; @@ -169,7 +198,8 
@@ impl<'a> PackedSeqView<'a> { let (data, _) = slice_prefix(rest, toc.data); Self { data, - high_nibble_end: PackedToc::get_nibble_end(toc.high_nibble_end), + high_nibble_end: PackedToc::get_nibble_bool(toc.high_nibble_end), + high_nibble_begin: PackedToc::get_nibble_bool(toc.high_nibble_begin), } } @@ -183,10 +213,12 @@ impl<'a> PackedSeqView<'a> { /// Returns the number of nucleotides in this PackedSeqView pub fn len(&self) -> usize { - if self.high_nibble_end { - self.data.len() * 2 + if self.data.is_empty() { + 0 } else { - self.data.len() * 2 - 1 + let begin = if self.high_nibble_begin { 1 } else { 0 }; + let end = if self.high_nibble_end { 0 } else { 1 }; + self.data.len() * 2 - begin - end } } @@ -197,8 +229,13 @@ impl<'a> PackedSeqView<'a> { /// Returns the element of this PackedSeqView at index `index` pub fn get(&self, index: usize) -> Nucleotide { - let i = index / 2; - if index % 2 == 1 { + let real_idx = if self.high_nibble_begin { + index + 1 + } else { + index + }; + let i = real_idx / 2; + if real_idx % 2 == 1 { ((self.data[i] & 0b11110000u8) >> 4).into() } else { (self.data[i] & 0b00001111u8).into() @@ -220,6 +257,41 @@ impl<'a> PackedSeqView<'a> { self.get_range(0..(self.len() - 1)) } + /// Creates a subslice of this PackedSeqView in the range of `span` + pub fn slice(&self, span: SeqSpan) -> Self { + let new_data = &self.data[span.byte_range()]; + + Self { + data: new_data, + high_nibble_begin: span.get_nibble_begin(), + high_nibble_end: span.get_nibble_end(), + } + } + + /// Creates a subslice of this PackedSeqView in the range of `range` + pub fn range_slice(&self, range: Range) -> Self { + let start = range.start; + let end = range.end; + let new_data = &self.data[range]; + + Self { + data: new_data, + high_nibble_begin: !start.is_multiple_of(2), + high_nibble_end: (end % 2) != 1, + } + } + + /// Given a pool of compressed data (`pool`), create a PackedSeqView in the range of `span` + /// + pub fn from_pool(pool: Pool<'a, u8>, span: SeqSpan) 
-> Self { + let slice = &pool.all()[span.byte_range()]; + Self { + data: slice, + high_nibble_begin: span.get_nibble_begin(), + high_nibble_end: span.get_nibble_end(), + } + } + /// Iterate over the nucleotides in this sequence. pub fn iter(&self) -> PackedSeqViewIterator<'_> { PackedSeqViewIterator::new(self) @@ -238,6 +310,7 @@ impl fmt::Display for PackedSeqView<'_> { pub struct PackedSeqViewIterator<'a> { data: &'a PackedSeqView<'a>, cur_index: usize, + back_index: usize, } impl<'a> PackedSeqViewIterator<'a> { @@ -245,6 +318,7 @@ impl<'a> PackedSeqViewIterator<'a> { Self { data: vec, cur_index: 0, + back_index: vec.len(), } } } @@ -253,7 +327,7 @@ impl Iterator for PackedSeqViewIterator<'_> { type Item = Nucleotide; fn next(&mut self) -> Option { - if self.cur_index < self.data.len() { + if self.cur_index < self.back_index { self.cur_index += 1; Some(self.data.get(self.cur_index - 1)) } else { @@ -262,6 +336,17 @@ impl Iterator for PackedSeqViewIterator<'_> { } } +impl<'a> DoubleEndedIterator for PackedSeqViewIterator<'a> { + fn next_back(&mut self) -> Option { + if self.cur_index < self.back_index && self.back_index > 0 { + self.back_index -= 1; + Some(self.data.get(self.back_index)) + } else { + None + } + } +} + impl PackedSeqStore { /// Creates a new empty PackedSeqStore pub fn new() -> Self { @@ -318,6 +403,7 @@ impl PackedSeqStore { PackedSeqView { data: &self.data, high_nibble_end: self.high_nibble_end, + high_nibble_begin: false, } } } @@ -340,6 +426,74 @@ impl Default for PackedSeqStore { } } +/// A Span of a packed nucleotide sequence. 
+#[derive(Debug, FromBytes, IntoBytes, Clone, Copy, PartialEq, Eq, Hash, Immutable)] +#[repr(packed)] +pub struct SeqSpan { + /// The logical index of the first element of the sequence + pub start: u32, + + /// The length of the sequence + pub len: u16, +} + +impl SeqSpan { + /// Returns true if this SeqSpan is empty, else return false + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + // Returns the logical index of the element given the byte index and nibble offset + pub fn to_logical(byte_index: usize, end_offset: bool) -> u32 { + (byte_index * 2 + end_offset as usize) as u32 + } + + // Returns a range of the bytes covered by this SeqSpan + pub fn byte_range(&self) -> Range { + Range { + start: (self.start / 2) as usize, + end: self.end().div_ceil(2), + } + } + + // Returns the nibble offset of the beginning of the sequence + pub fn get_nibble_begin(&self) -> bool { + !self.start.is_multiple_of(2) + } + + // Returns the nibble offset of the ending of the sequence + pub fn get_nibble_end(&self) -> bool { + (self.end() % 2) != 1 + } + pub fn len_from_end(&self, end: usize) -> u16 { + ((end as u32) - self.start) as u16 + } + pub fn end(&self) -> usize { + (self.start as usize) + self.len() + } + pub fn len(&self) -> usize { + self.len as usize + } +} + +impl From> for SeqSpan { + fn from(range: Range) -> Self { + Self { + start: range.start as u32, + len: (range.end - range.start) as u16, + } + } +} + +impl From for Range { + fn from(span: SeqSpan) -> Self { + Range { + start: span.start as usize, + end: span.end(), + } + } +} + /// A reference to a subsection of a nucleotide sequence stored in a PackedSeqStore pub struct PackedSlice<'a> { /// The underlying vector that stores the sequence referenced by this slice @@ -363,12 +517,44 @@ pub fn get_slice_seq(slice: PackedSlice<'_>) -> Vec { slice.vec_ref.as_ref().get_range(slice.span) } +/// Writes `seq` into a file with name `filename` pub fn export(seq: PackedSeqView, filename: &str) { let num_bytes = 
seq.file_size(); let mut mem = map_new_file(filename, num_bytes as u64); seq.write_file(&mut mem); } +/// Takes a slice of compressed base pairs, decompresses them and pushes them into `output` +pub fn decompress_into_buffer(input: PackedSeqView, output: &mut Vec) { + if !input.high_nibble_begin { + output.push(convert_to_ascii(input.data[0] & 0b00001111u8)); + } + output.push(convert_to_ascii((input.data[0] & 0b11110000u8) >> 4)); + for item in &input.data[1..input.data.len() - 1] { + output.push(convert_to_ascii(item & 0b00001111u8)); + output.push(convert_to_ascii((item & 0b11110000u8) >> 4)); + } + output.push(convert_to_ascii( + input.data[input.data.len() - 1] & 0b00001111u8, + )); + if input.high_nibble_end { + output.push(convert_to_ascii( + (input.data[input.data.len() - 1] & 0b11110000u8) >> 4, + )); + } +} + +fn convert_to_ascii(elem: u8) -> u8 { + match elem { + 0 => 65, + 1 => 67, + 2 => 84, + 3 => 71, + 4 => 78, + _ => panic!("Not a Nucleotide!"), + } +} + #[cfg(test)] mod tests { use super::*; @@ -536,4 +722,24 @@ mod tests { assert_eq!(vec, new_seq.get_elements()); } } + + /// Test the len() method for PackedSeqView + #[test] + fn test_len() { + let store = PackedSeqStore::from_slice(&[ + Nucleotide::A, + Nucleotide::A, + Nucleotide::T, + Nucleotide::C, + Nucleotide::G, + Nucleotide::C, + Nucleotide::C, + ]); + let view = store.as_ref(); + assert_eq!(7, view.len()); + + let store = PackedSeqStore::from_slice(&[]); + let view = store.as_ref(); + assert_eq!(0, view.len()); + } } diff --git a/flatgfa/src/parse.rs b/flatgfa/src/parse.rs index 64669518..e9bef3be 100644 --- a/flatgfa/src/parse.rs +++ b/flatgfa/src/parse.rs @@ -136,7 +136,7 @@ impl<'a, P: flatgfa::StoreFamily<'a>> Parser<'a, P> { } fn add_seg(&mut self, seg: gfaline::Segment) { - let seg_id = self.flat.add_seg(seg.name, seg.seq, seg.data); + let seg_id = self.flat.compress_and_add_seg(seg.name, seg.seq, seg.data); self.seg_ids.insert(seg.name, seg_id); } diff --git a/flatgfa/src/pool.rs 
b/flatgfa/src/pool.rs index a53c1309..bc40984a 100644 --- a/flatgfa/src/pool.rs +++ b/flatgfa/src/pool.rs @@ -147,6 +147,9 @@ pub trait Store { fn next_id(&self) -> Id { Id::new(self.len()) } + + /// Reserve space for directly adding elements + fn reserve(&mut self, num_elems: usize); } /// A store that uses a `Vec` to allocate objects on the heap. @@ -181,6 +184,10 @@ impl Store for HeapStore { fn len(&self) -> usize { self.0.len() } + + fn reserve(&mut self, num_elems: usize) { + self.0.reserve(num_elems); + } } impl Default for HeapStore { @@ -223,6 +230,11 @@ impl Store for FixedStore<'_, T> { fn len(&self) -> usize { self.0.len() } + + fn reserve(&mut self, num_elems: usize) { + let required_capacity = self.len() + num_elems; + assert!(self.capacity() >= required_capacity); + } } impl FixedStore<'_, T> { diff --git a/pollen_py/pollen/depth/python_depth.py b/pollen_py/pollen/depth/python_depth.py index ed1ad694..82e0cb19 100644 --- a/pollen_py/pollen/depth/python_depth.py +++ b/pollen_py/pollen/depth/python_depth.py @@ -1,4 +1,4 @@ -""" A node depth computation for .og files implemented using odgi's +"""A node depth computation for .og files implemented using odgi's Python bindings. While this implementation reuses odgi's data structures, it does not reuse its node depth computation algorithm and instead implements it from scratch. 
diff --git a/process.py b/process.py index 3d5c89ed..07b0deba 100644 --- a/process.py +++ b/process.py @@ -17,7 +17,7 @@ def format_json_data(node_depths, mem="segments0"): depths = node_depths["memories"][mem] print("#node.id\tdepth") for i in range(len(depths)): - print(f"{i+1}\t{depths[i]}") + print(f"{i + 1}\t{depths[i]}") if __name__ == "__main__": diff --git a/tests/performance/__init__.py b/tests/performance/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/performance/chop-performance.py b/tests/performance/chop-performance.py new file mode 100644 index 00000000..fe166531 --- /dev/null +++ b/tests/performance/chop-performance.py @@ -0,0 +1,27 @@ +import subprocess +import sys +import re +import tempfile +import json +from bench.bench import hyperfine, HyperfineResult + + +def main(): + with open("performance-output.txt", "w") as f: + main_runtime = 0.0 + for i, arg in enumerate(sys.argv, start = 0): + if i == 0: + arg = "main" + f.write(f"Testing chop: {arg}\n") + subprocess.run(["git", "switch", "-q", arg]) + results = hyperfine(["fgfa -i tests/DRB1-3123.flatgfa chop -l -c 3"]) + runtime = results[0].mean + f.write(f"Runtime: {runtime} ms\n\n") + if i == 0: + main_runtime = runtime + if not i == 0: + diff = runtime / main_runtime + f.write(f"{arg} ran for {round(diff, 3)} times as long as main") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/performance/compress-file_size.py b/tests/performance/compress-file_size.py new file mode 100644 index 00000000..0829d1ed --- /dev/null +++ b/tests/performance/compress-file_size.py @@ -0,0 +1,21 @@ +import subprocess +import sys + +def main(): + with open("performance-output.txt", "w") as f: + main_file_size = 0 + for i, arg in enumerate(sys.argv, start = 0): + if i == 0: + arg = "main" + f.write(f"Testing compression: {arg}\n") + command = f"git switch -q {arg} ; cargo build --release ; fgfa -I tests/DRB1-3123.gfa -o blarg ; ls -l blarg | awk '{{print 
$5}}'" + result = subprocess.run(command, shell=True, capture_output=True, text=True) + f.write("File size: " + result.stdout.strip() + " bytes\n\n") + if i == 0: + main_file_size = int(result.stdout.strip()) + if not i == 0: + diff = int(result.stdout.strip()) / main_file_size + f.write(f"{arg} produced a file {diff} times the size vs main") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/performance/compress-performance.py b/tests/performance/compress-performance.py new file mode 100644 index 00000000..6ee067b1 --- /dev/null +++ b/tests/performance/compress-performance.py @@ -0,0 +1,26 @@ +import subprocess +import sys +import re + +def main(): + with open("performance-output.txt", "w") as f: + main_runtime = 0.0 + for i, arg in enumerate(sys.argv, start = 0): + if i == 0: + arg = "main" + f.write(f"Testing compression: {arg}\n") + command = f"git switch -q {arg} ; hyperfine -w3 -N 'fgfa -I tests/DRB1-3123.gfa extract -n 3 -c 3'" + result = subprocess.run(command, shell=True, capture_output=True, text=True) + runtime = 0.0 + runtime_search = re.search(r"Time \(mean ± σ\):\s+([0-9.]+)", result.stdout) + if runtime_search: + runtime = float(runtime_search.group(1)) + f.write(f"Runtime: {runtime} ms\n\n") + if i == 0: + main_runtime = runtime + if not i == 0: + diff = runtime / main_runtime + f.write(f"{arg} ran for {round(diff, 3)} times as long as main") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/performance/query-performance.py b/tests/performance/query-performance.py new file mode 100644 index 00000000..57b1b508 --- /dev/null +++ b/tests/performance/query-performance.py @@ -0,0 +1,27 @@ +import subprocess +import sys +import re +import tempfile +import json +from bench.bench import hyperfine, HyperfineResult + + +def main(): + with open("performance-output.txt", "w") as f: + main_runtime = 0.0 + for i, arg in enumerate(sys.argv, start = 0): + if i == 0: + arg = "main" + 
f.write(f"Testing query: {arg}\n") + subprocess.run(["git", "switch", "-q", arg]) + results = hyperfine(["fgfa -i DRB1-3123.flatgfa paths | head "]) + runtime = results[0].mean + f.write(f"Runtime: {runtime} ms\n\n") + if i == 0: + main_runtime = runtime + if not i == 0: + diff = runtime / main_runtime + f.write(f"{arg} ran for {round(diff, 3)} times as long as main") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/uv.lock b/uv.lock index 1f559a10..1183e4a3 100644 --- a/uv.lock +++ b/uv.lock @@ -67,7 +67,6 @@ wheels = [ [[package]] name = "mygfa" -version = "0.1" source = { editable = "mygfa" } [[package]] @@ -90,12 +89,10 @@ wheels = [ [[package]] name = "pollen" -version = "1" source = { editable = "pollen_py" } [[package]] name = "pollen-data-gen" -version = "0.1" source = { editable = "pollen_data_gen" } dependencies = [ { name = "mygfa" }, @@ -144,7 +141,6 @@ wheels = [ [[package]] name = "slow-odgi" -version = "0.1" source = { editable = "slow_odgi" } dependencies = [ { name = "mygfa" },