Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
f8dadda
Add zerocopy interfacing
johnpalsberg May 7, 2025
fe3972b
Attempt to resolve merge conflicts
johnpalsberg Jun 24, 2025
137ed25
Add a rust and a bash test case for writing and reading compressed da…
johnpalsberg Jun 24, 2025
f394aad
Make progress on adding the command line interface
johnpalsberg Jun 24, 2025
06fef05
Clean up code for push
johnpalsberg Jun 24, 2025
2ffc343
Make rand a dev-dependency
johnpalsberg Jun 24, 2025
1f7a76e
Add a subslice method to PackedSeqView
johnpalsberg Jun 24, 2025
ef0b96f
Add SeqSpan to flatgfa.rs
johnpalsberg Jun 26, 2025
293a296
Add a from_pool method to PackedSeqView
johnpalsberg Jun 26, 2025
247847e
Continue progressing with integration
johnpalsberg Jun 26, 2025
e8a2b41
Add a turnt test directory to flatgfa
johnpalsberg Jun 26, 2025
1a876fc
Complete the compressed Sequence implementation
johnpalsberg Jun 26, 2025
28e6ab2
Add turnt test case for the cli
johnpalsberg Jun 27, 2025
7f91260
Compress sequences when adding them to a GFA file
johnpalsberg Jun 30, 2025
f954e34
Fix remaining compile errors across codebase and make all test cases …
johnpalsberg Jul 19, 2025
502d5ae
Address Clippy warnings
johnpalsberg Jul 19, 2025
d3f0266
Add changes from compress-cli-refine
johnpalsberg Jul 19, 2025
4d2cef1
Merge remote-tracking branch 'origin/main' into john-zerocopy
johnpalsberg Jul 20, 2025
d8cd462
Clean up after merge
johnpalsberg Jul 20, 2025
7341185
Update Turnt tests
johnpalsberg Jul 20, 2025
9dc3aa5
progress on debugging
johnpalsberg Jul 26, 2025
5931f1a
Convert ASCII to internal nucleotide encoding when adding segments
johnpalsberg Jul 27, 2025
de3acc4
Fix Clippy warnings
johnpalsberg Jul 27, 2025
e7dbb94
Merge branch 'main' into john-zerocopy
johnpalsberg Jul 28, 2025
b7b7822
Add a 'N' variant to the Nucleotide enum
johnpalsberg Jul 28, 2025
d46499e
Fix python CI failures attempt #1
johnpalsberg Jul 28, 2025
15c11ef
Fix index issue with slice() function
johnpalsberg Jul 28, 2025
185a1fb
More indexing fixes
johnpalsberg Jul 28, 2025
c74d916
Fix indexing errors attempt #2
johnpalsberg Jul 28, 2025
c0ca7fb
Fix indexing errors attempt #3
johnpalsberg Jul 28, 2025
e03ab24
Fix Python dependencies
johnpalsberg Jul 29, 2025
6d0994c
Change SeqSpan to be half-open (exclude end byte)
johnpalsberg Jul 29, 2025
8ca33dd
Fix the from_range() method in SeqSpan
johnpalsberg Jul 30, 2025
466f5a6
Fix turnt test in flatgfa
johnpalsberg Jul 30, 2025
2cbc7a3
Fix Python ruff formatting
johnpalsberg Jul 30, 2025
0cb4e6b
Fix the indices in chop
johnpalsberg Jul 31, 2025
1e8f0dc
Use slow_odgi instead of uv run slow_odgi
johnpalsberg Jul 31, 2025
b7a3e3d
Clean up code for review
johnpalsberg Jul 31, 2025
4083d99
Add performance and file size tests for compression
johnpalsberg Sep 30, 2025
5b27e63
Use the hyperfine() function, and implement a couple more tests
johnpalsberg Oct 6, 2025
2be9d63
sync
johnpalsberg Oct 17, 2025
edff087
Test-only commit
johnpalsberg Oct 18, 2025
c9febd5
Test commit #2
johnpalsberg Oct 19, 2025
1da2970
Change SeqSpan index types from usize to u32
johnpalsberg Oct 19, 2025
13324af
Add logical indexing for SeqSpan
johnpalsberg Oct 31, 2025
764b90e
Add base + offset indexing
johnpalsberg Nov 1, 2025
59d7b1b
Add new cli command for printing file size statistics
johnpalsberg Nov 10, 2025
96131f6
Enable CI tests for all pushes to john-performance
johnpalsberg Nov 10, 2025
cc0ed9c
Fix an import bug
johnpalsberg Nov 10, 2025
3c7c82c
Byte fix (in progress)
johnpalsberg Nov 17, 2025
38cb1d9
Checkpoint for base + offset version
johnpalsberg Dec 1, 2025
7e22822
Fix bugs for base + end version
johnpalsberg Dec 1, 2025
b853ffa
Merge branch 'main' into john-performance
johnpalsberg Dec 1, 2025
b72e8af
Fix additional inconsistencies with main
johnpalsberg Dec 1, 2025
576837c
additional clippy fixes
johnpalsberg Dec 1, 2025
b9b8867
delete unnecessary files
johnpalsberg Dec 1, 2025
d40db39
add base + offset
johnpalsberg Dec 8, 2025
5aa1abc
Fix Clippy warnings
johnpalsberg Dec 8, 2025
eeb964a
Address most PR comments (except extra-copy)
johnpalsberg Jan 26, 2026
a3a2b44
Resolve more PR change requests
johnpalsberg Feb 2, 2026
7e94285
Make small clean up
johnpalsberg Feb 2, 2026
ac0e40c
Fix a bug in the compression
johnpalsberg Feb 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions .github/workflows/performance.yml
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this belongs in a separate benchmarking PR.

Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
name: performance


on:
push:
branches:
- john-performance

jobs:
test-py:
name: test Python tools
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.12"

# Set up and use uv.
- uses: actions/cache@v4
id: cache-uv
with:
path: ~/.cache/uv
key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv
- name: uv sync and activate
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync
echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV
echo "$PWD/.venv/bin" >> $GITHUB_PATH

# Set up for tests.
- name: Problem matcher
run: echo '::add-matcher::.github/tap-matcher.json'
- name: Fetch test data
run: make fetch SMALL=1

- name: Pull odgi container
run: |
docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
- name: Install odgi alias
run: |
mkdir -p $HOME/.local/bin
cp .github/odgi.sh $HOME/.local/bin/odgi
chmod a+x $HOME/.local/bin/odgi

# Test slow_odgi.
- name: Set up for slow_odgi tests
run: make -C slow_odgi setup oracles SMALL=1
- name: Test slow_odgi
run: make -C slow_odgi test SMALL=1

test-flatgfa:
name: test FlatGFA
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: rustup toolchain install stable --no-self-update

# Install slow-odgi.
- uses: actions/cache@v4
id: cache-uv
with:
path: ~/.cache/uv
key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv
- name: uv sync and activate
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync
echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV
echo "$PWD/.venv/bin" >> $GITHUB_PATH

# Install odgi
- name: Pull odgi container
run: |
docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
- name: Install odgi alias
run: |
mkdir -p $HOME/.local/bin
cp .github/odgi.sh $HOME/.local/bin/odgi
chmod a+x $HOME/.local/bin/odgi

# Install Turnt.
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install Turnt
run: pip install turnt
- name: Problem matcher
run: echo '::add-matcher::.github/tap-matcher.json'

# We need the test data.
- name: Fetch test data
run: make fetch SMALL=1

# Build and test.
- run: cargo build
working-directory: ./flatgfa
- run: cargo test
working-directory: ./flatgfa
- run: make test-flatgfa
Empty file added bench/__init__.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like diff noise?

Empty file.
5 changes: 4 additions & 1 deletion flatgfa-py/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use flatgfa::namemap::NameMap;
use flatgfa::ops::gaf::{ChunkEvent, GAFParser};
use flatgfa::packedseq::decompress_into_buffer;
use flatgfa::pool::Id;
use flatgfa::{self, file, memfile, print, FlatGFA, Handle, HeapGFAStore};
use memmap::Mmap;
Expand Down Expand Up @@ -352,7 +353,9 @@ impl PySegment {
let gfa = self.0.store.view();
let seg = &gfa.segs[self.0.id()];
let seq = gfa.get_seq(seg);
PyBytes::new(py, seq)
let mut buffer: Vec<u8> = Vec::new();
decompress_into_buffer(seq, &mut buffer);
PyBytes::new(py, &buffer) // Note: data is decompressed here
}

/// The segment's name as declared in the GFA file, an `int`.
Expand Down
4 changes: 2 additions & 2 deletions flatgfa-py/test/test_flatgfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_read_write_gfa(gfa, tmp_path):
assert orig_f.read() == written_f.read()

# You can also parse GFA text files from the filesystem.
new_gfa = flatgfa.parse(gfa_path)
new_gfa = flatgfa.parse(gfa_path) # type: ignore
assert len(new_gfa.segments) == len(gfa.segments)


Expand All @@ -114,7 +114,7 @@ def test_read_write_flatgfa(gfa, tmp_path):
gfa.write_flatgfa(flatgfa_path)

# And read them back, which should be very fast indeed.
new_gfa = flatgfa.load(flatgfa_path)
new_gfa = flatgfa.load(flatgfa_path) # type: ignore
assert len(new_gfa.segments) == len(gfa.segments)


Expand Down
104 changes: 72 additions & 32 deletions flatgfa/src/flatgfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
use std::ops::Range;
use std::str::FromStr;

use crate::pool::{self, Id, Pool, Span, Store};
use crate::{
packedseq::{PackedSeqView, SeqSpan},
pool::{self, Id, Pool, Span, Store},
};
use bstr::BStr;
use num_enum::{IntoPrimitive, TryFromPrimitive};
use zerocopy::{FromBytes, Immutable, IntoBytes};
Expand Down Expand Up @@ -75,7 +78,7 @@ pub struct Segment {
pub name: usize,

/// The base-pair sequence for the segment. This is a range in the `seq_data` pool.
pub seq: Span<u8>,
pub seq: SeqSpan,

/// Segments can have optional fields. This is a range in the `optional_data` pool.
pub optional: Span<u8>,
Expand Down Expand Up @@ -274,7 +277,7 @@ pub enum LineKind {
/// This is mostly a `PackedSeqView`, but it also has a flag to indicate that we're
/// representing the reverse-complement of the underlying sequence data.
pub struct Sequence<'a> {
data: &'a [u8],
data: PackedSeqView<'a>,
revcmp: bool,
}

Expand All @@ -284,7 +287,7 @@ impl<'a> Sequence<'a> {
/// `data` should be the "forward" version of the sequence. Use `ori` to
/// indicate whether this `Sequence` represents the forward or backward
/// (reverse complement) of that data.
pub fn new(data: &'a [u8], ori: Orientation) -> Self {
pub fn new(data: PackedSeqView<'a>, ori: Orientation) -> Self {
Self {
data,
revcmp: ori == Orientation::Backward,
Expand All @@ -294,9 +297,9 @@ impl<'a> Sequence<'a> {
/// Look up a single base pair in the sequence.
pub fn index(&self, idx: usize) -> u8 {
if self.revcmp {
nucleotide_complement(self.data[self.data.len() - idx - 1])
self.data.get(self.data.len() - idx - 1).complement().into()
} else {
self.data[idx]
self.data.get(idx).into()
}
}

Expand All @@ -305,9 +308,10 @@ impl<'a> Sequence<'a> {
let data = if self.revcmp {
// The range starts at the end of the buffer:
// [-----<end<******<start<------]
&self.data[(self.data.len() - range.end)..(self.data.len() - range.start)]
self.data
.range_slice((self.data.len() - range.end)..(self.data.len() - range.start))
} else {
&self.data[range]
self.data.range_slice(range)
};
Self {
data,
Expand All @@ -321,29 +325,14 @@ impl<'a> Sequence<'a> {
self.data
.iter()
.rev()
.map(|&c| nucleotide_complement(c))
.map(|c| c.complement().into())
.collect()
} else {
self.data.to_vec()
self.data.iter().map(|c| c.into()).collect()
}
}
}

/// Given an ASCII character for a nucleotide, get its complement.
fn nucleotide_complement(c: u8) -> u8 {
match c {
b'A' => b'T',
b'T' => b'A',
b'C' => b'G',
b'G' => b'C',
b'a' => b't',
b't' => b'a',
b'c' => b'g',
b'g' => b'c',
x => x,
}
}

impl std::fmt::Display for Sequence<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.revcmp {
Expand All @@ -354,16 +343,16 @@ impl std::fmt::Display for Sequence<'_> {
let bytes = self.to_vec();
write!(f, "{}", BStr::new(&bytes))?;
} else {
write!(f, "{}", BStr::new(self.data))?;
write!(f, "{}", self.data)?;
}
Ok(())
}
}

impl<'a> FlatGFA<'a> {
/// Get the base-pair sequence for a segment.
pub fn get_seq(&self, seg: &Segment) -> &BStr {
self.seq_data[seg.seq].as_ref()
pub fn get_seq(&self, seg: &Segment) -> PackedSeqView<'_> {
PackedSeqView::from_pool(self.seq_data, seg.seq)
}

/// Get the sequence that a *handle* refers to.
Expand All @@ -372,7 +361,7 @@ impl<'a> FlatGFA<'a> {
/// gets the sequence in the orientation specified by the handle.
pub fn get_seq_oriented(&self, handle: Handle) -> Sequence<'_> {
let seg = self.get_handle_seg(handle);
let seq_data = self.seq_data[seg.seq].as_ref();
let seq_data = PackedSeqView::from_pool(self.seq_data, seg.seq);
Sequence::new(seq_data, handle.orient())
}

Expand Down Expand Up @@ -446,11 +435,62 @@ impl<'a, P: StoreFamily<'a>> GFAStore<'a, P> {
self.header.add_slice(version);
}

/// Add a new segment to the GFA file.
pub fn add_seg(&mut self, name: usize, seq: &[u8], optional: &[u8]) -> Id<Segment> {
/// Add a new segment to the GFA file, compressing the data in `seq`.
pub fn compress_and_add_seg(
&mut self,
name: usize,
seq: &[u8],
optional: &[u8],
) -> Id<Segment> {
self.seq_data.reserve(seq.len());
let mut high_nibble_end = true;
let mut combined_item = 0;
let start_id = self.seq_data.next_id();
for i in 0..seq.len() {
let item = seq[i];
let converted: u8 = match item {
65 => 0,
67 => 1,
84 => 2,
71 => 3,
78 => 4,
_ => panic!("Not a Nucleotide!"),
};
if high_nibble_end {
high_nibble_end = false;
if i == seq.len() - 1 {
self.seq_data.add(converted);
break;
}
combined_item = converted;
} else {
combined_item |= converted << 4;
self.seq_data.add(combined_item);
high_nibble_end = true;
}
}
let end_id = self.seq_data.next_id();
let byte_span = Span::new(start_id, end_id);
let start = SeqSpan::to_logical(byte_span.start.index(), false);
let end = SeqSpan::to_logical(byte_span.end.index() - 1, high_nibble_end) + 1;
self.segs.add(Segment {
name,
seq: SeqSpan {
start,
len: (end - start) as u16,
},
Comment on lines +478 to +481
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would probably be a good place for using that From<Range> impl, if we do it.

optional: self.optional_data.add_slice(optional),
})
}

/// Add a new segment whose sequence data is already compressed.
pub fn add_seg(&mut self, name: usize, seq: PackedSeqView, optional: &[u8]) -> Id<Segment> {
let byte_span = self.seq_data.add_slice(seq.data);
let start = SeqSpan::to_logical(byte_span.start.index(), seq.high_nibble_begin);
let end = SeqSpan::to_logical(byte_span.end.index() - 1, seq.high_nibble_end) + 1;
self.segs.add(Segment {
name,
seq: self.seq_data.add_slice(seq),
seq: (start as usize..end as usize).into(),
optional: self.optional_data.add_slice(optional),
})
}
Expand Down
21 changes: 16 additions & 5 deletions flatgfa/src/ops/chop.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::ops::Range;

use crate::flatgfa::{self, Handle, Link, Orientation, Path, Segment};
use crate::pool::{Id, Span, Store};
use crate::{GFAStore, HeapFamily};
Expand Down Expand Up @@ -34,17 +36,22 @@ pub fn chop(gfa: &flatgfa::FlatGFA, max_size: usize, incl_links: bool) -> flatgf
max_node_id += 1;
seg_map.push(Span::new(id, flat.segs.next_id()));
} else {
let seq_end = seg.seq.end;
let mut offset = seg.seq.start.index();
let seq_range: Range<usize> = seg.seq.into();
let seq_end = seq_range.end;
let mut offset = seq_range.start;
let segs_start = flat.segs.next_id();
// Could also generate end_id by setting it equal to the start_id and
// updating it for each segment that is added - only benefits us if we
// don't unroll the last iteration of this loop
while offset < seq_end.index() - max_size {
while offset < seq_end - max_size {
// Generate a new segment of length c
flat.segs.add(Segment {
name: max_node_id,
seq: Span::new(Id::new(offset), Id::new(offset + max_size)),
seq: std::ops::Range {
start: offset,
end: offset + max_size,
}
.into(),
optional: Span::new_empty(),
});
offset += max_size;
Expand All @@ -53,7 +60,11 @@ pub fn chop(gfa: &flatgfa::FlatGFA, max_size: usize, incl_links: bool) -> flatgf
// Generate the last segment
flat.segs.add(Segment {
name: max_node_id,
seq: Span::new(Id::new(offset), seq_end),
seq: std::ops::Range {
start: offset,
end: seq_end,
}
.into(),
optional: Span::new_empty(),
});
max_node_id += 1;
Expand Down
1 change: 1 addition & 0 deletions flatgfa/src/ops/extract.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ impl<'a> SubgraphBuilder<'a> {
fn include_seg(&mut self, seg_id: Id<Segment>) {
let seg = &self.old.segs[seg_id];
let new_seg_id = self.store.add_seg(
// Note for reviewer: change made here
seg.name,
self.old.get_seq(seg),
self.old.get_optional_data(seg),
Expand Down
1 change: 0 additions & 1 deletion flatgfa/src/ops/gaf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,6 @@ impl ChunkEvent {
let seg = gfa.segs[self.handle.segment()];
let seg_name = seg.name;
let mut result = String::new();

match self.range {
ChunkRange::Partial(start, end) => {
result.push_str(&format!(
Expand Down
Loading
Loading