diff --git a/flatgfa-py/examples/matrix.py b/flatgfa-py/examples/matrix.py new file mode 100644 index 00000000..91ae50e6 --- /dev/null +++ b/flatgfa-py/examples/matrix.py @@ -0,0 +1,61 @@ +import pathlib +from itertools import islice + +import flatgfa + + +def iter_bits_ones_dash(path, chunk_bytes: int = 64 * 1024): + ws = b" \t\r\n\v\f" + with open(path, "rb") as f: + while True: + chunk = f.read(chunk_bytes) + if not chunk: + break + for b in chunk: + if b in ws: + continue + if b == ord("1"): + yield True + elif b == ord("0"): + yield False + else: + raise ValueError( + f"Invalid character in file: {chr(b)!r} (expected '1' or '0')" + ) + + +def compare_row_to_file(row_bits, path) -> tuple[bool, int | None]: + n = len(row_bits) + i = 0 + for fb in iter_bits_ones_dash(path): + if i >= n: + # file has extra bits + raise ValueError("File contains more bits than the row length.") + if fb != bool(row_bits[i]): + return False, i + i += 1 + + if i != n: + # file ended early + raise ValueError(f"File ended early: got {i} bits, expected {n}.") + return True, None + + +TEST_DIR = pathlib.Path(__file__).parent +TEST_GFA = TEST_DIR / "./matrix_gaf_folder/Chr3.gfa" +GAF_DIR = (TEST_DIR / "./matrix_gaf_folder").resolve() +# MASTER_TXT = TEST_DIR / "./matrix_gaf_folder/Chr3-pangenotype" + +graph = flatgfa.parse(str(TEST_GFA)) +gaf = [str(p) for p in GAF_DIR.glob("*.gaf")] +pangenotype_matrix = graph.make_pangenotype_matrix(gaf) +# assert len(pangenotype_matrix) == len(gaf) +# first_row = list(pangenotype_matrix[0]) +# ok, where = compare_row_to_file(first_row, str(MASTER_TXT)) +# print("OK (all columns match)" if ok else f"Mismatch at column {where}") +FIRST_N = 100 +for gaf_path, row in zip(gaf, pangenotype_matrix): + row01 = [int(b) for b in row] + # print(pathlib.Path(gaf_path).name, *row01) + first_bits = islice(row, FIRST_N) + print(pathlib.Path(gaf_path).name, *map(int, first_bits)) diff --git a/flatgfa-py/examples/tiny.gaf b/flatgfa-py/examples/tiny.gaf new file mode 100644 index 00000000..979fd2ac --- /dev/null +++ b/flatgfa-py/examples/tiny.gaf @@ -0,0 +1,2 @@ +foo 12 0 12 + >1>2<4 38 5 17 12 12 0 cg:Z:150M +bar 20 0 20 + >1>2>3 30 7 27 20 20 0 cg:Z:150M diff --git a/flatgfa-py/examples/tiny.gfa b/flatgfa-py/examples/tiny.gfa new file mode 100644 index 00000000..0a9299a8 --- /dev/null +++ b/flatgfa-py/examples/tiny.gfa @@ -0,0 +1,11 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 AAATTTTCTGGAGTTCTAT +S 3 TTG +S 4 CCAACTCTCTG +P one 1+,2+,4- * +P two 1+,2+,3+,4- * +L 1 + 2 + 0M +L 2 + 4 - 0M +L 2 + 3 + 0M +L 3 + 4 - 0M diff --git a/flatgfa-py/flatgfa.pyi b/flatgfa-py/flatgfa.pyi index 16211d43..ffbb8eac 100644 --- a/flatgfa-py/flatgfa.pyi +++ b/flatgfa-py/flatgfa.pyi @@ -86,6 +86,7 @@ class FlatGFA: def write_gfa(self, filename: str) -> None: ... def all_reads(self, gaf: str) -> GAFParser: ... def print_gaf_lookup(self, gaf: str) -> None: ... + def make_pangenotype_matrix(self, gaf_files: list[str]) -> list[list[bool]]: ... def parse(filename: str) -> FlatGFA: ... def load(filename: str) -> FlatGFA: ... diff --git a/flatgfa-py/src/lib.rs b/flatgfa-py/src/lib.rs index 446eb6d5..68a85414 100644 --- a/flatgfa-py/src/lib.rs +++ b/flatgfa-py/src/lib.rs @@ -1,5 +1,6 @@ use flatgfa::namemap::NameMap; use flatgfa::ops::gaf::{ChunkEvent, GAFParser}; +use flatgfa::ops::pangenotype; use flatgfa::pool::Id; use flatgfa::{self, file, memfile, print, FlatGFA, Handle, HeapGFAStore}; use memmap::Mmap; @@ -167,6 +168,11 @@ impl PyFlatGFA { println!(); } } + + pub fn make_pangenotype_matrix(&self, gaf_files: Vec) -> Vec> { + let gfa = self.0.view(); + pangenotype::make_pangenotype_matrix(&gfa, gaf_files) + } } /// A reference to a list of *any* type within a FlatGFA. diff --git a/flatgfa-py/test/test_gaf.py b/flatgfa-py/test/test_gaf.py index 30d8fb9d..55a135bd 100644 --- a/flatgfa-py/test/test_gaf.py +++ b/flatgfa-py/test/test_gaf.py @@ -1,4 +1,7 @@ import pathlib + +import numpy as np + import flatgfa TEST_DIR = pathlib.Path(__file__).parent @@ -24,3 +27,37 @@ def test_gaf_ranges(): [(5, 8), (0, 9), (1, 0)], [(7, 8), (0, 18), (0, 0)], ] + + +def test_construct_pangenotype_matrix(): + gfa = flatgfa.parse_bytes(TEST_GFA.read_bytes()) + pangenotype_matrix = gfa.make_pangenotype_matrix([(str(TEST_GAF))]) + assert pangenotype_matrix == [[True, True, True, True]] + + +def test_pangenotype_regression(): + gfa = flatgfa.parse_bytes(TEST_GFA.read_bytes()) + pangenotype_matrix = gfa.make_pangenotype_matrix([str(TEST_GAF)]) + + # Optional: inspect structure + print("Matrix:", pangenotype_matrix) + + # Example nested loop: iterate over rows and values + for row in pangenotype_matrix: + for val in row: + print("Segment present:", val) + + # Convert nested list of bools -> NumPy array of floats + a = np.array(pangenotype_matrix, dtype=float) + b = np.random.rand(a.shape[0]) + result = np.linalg.lstsq(a, b, rcond=None) + x, residuals, rank, s = result + + # Print or assert just to verify + print("Coefficients:", x) + print("Residuals:", residuals) + print("Rank:", rank) + print("Singular values:", s) + + # Simple test condition: check the shapes + assert x.shape[0] == a.shape[1] diff --git a/flatgfa/Cargo.toml b/flatgfa/Cargo.toml index faa8d978..3141259a 100644 --- a/flatgfa/Cargo.toml +++ b/flatgfa/Cargo.toml @@ -12,6 +12,7 @@ argh = "0.1.13" atoi = "2.0.0" bit-vec = "0.8.0" bstr = "1.12.0" +flate2 = "1.0" memchr = "2.7.4" memmap = "0.7.0" num_enum = "0.7.3" diff --git a/flatgfa/src/cli/cmds.rs b/flatgfa/src/cli/cmds.rs index 61f8bb54..39952adc 100644 --- a/flatgfa/src/cli/cmds.rs +++ b/flatgfa/src/cli/cmds.rs @@ -338,3 +338,50 @@ pub fn bed_intersect(args: BEDIntersect) { } } } + +/// construct a pangenotype matrix +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "matrix")] +pub struct PangenotypeMatrix { + /// the GAF file to use for genotyping (use `-f `) + #[argh(option, short = 'f')] + file: Option, + + /// read from stdin + #[argh(switch)] + stdin: bool, +} + +pub fn pangenotype_matrix(gfa: &flatgfa::FlatGFA, args: PangenotypeMatrix) { + use std::io::{self, BufRead}; + + let matrix = if args.stdin { + // Read from stdin stream + let stdin = io::stdin(); + let reader: Box = Box::new(stdin.lock()); + match ops::pangenotype::make_pangenotype_matrix_from_stream(gfa, reader) { + Ok(row) => vec![row], + Err(e) => { + eprintln!("Error reading from stdin: {}", e); + return; + } + } + } else if let Some(file_path) = args.file { + // For files (plain or .gz) the existing file-based API handles mmap and gzip + ops::pangenotype::make_pangenotype_matrix(gfa, vec![file_path]) + } else { + eprintln!("Please provide a GAF file with -f or use --stdin"); + return; + }; + + for row in matrix { + for col in row { + if col { + print!("1"); + } else { + print!("0"); + } + } + println!(); + } +} diff --git a/flatgfa/src/cli/main.rs b/flatgfa/src/cli/main.rs index c2bb0715..c39bebd5 100644 --- a/flatgfa/src/cli/main.rs +++ b/flatgfa/src/cli/main.rs @@ -44,6 +44,7 @@ enum Command { GafLookup(cmds::GAFLookup), Bench(cmds::Bench), BedIntersect(cmds::BEDIntersect), + PangenotypeMatrix(cmds::PangenotypeMatrix), } fn main() -> Result<(), &'static str> { @@ -146,6 +147,9 @@ fn main() -> Result<(), &'static str> { Some(Command::BedIntersect(_sub_args)) => { panic!("Unreachable code"); } + Some(Command::PangenotypeMatrix(sub_args)) => { + cmds::pangenotype_matrix(&gfa, sub_args); + } None => { // Just emit the GFA or FlatGFA file. dump(&gfa, &args.output); diff --git a/flatgfa/src/ops/mod.rs b/flatgfa/src/ops/mod.rs index 5eaaf835..e6d4773f 100644 --- a/flatgfa/src/ops/mod.rs +++ b/flatgfa/src/ops/mod.rs @@ -3,4 +3,5 @@ pub mod chop; pub mod depth; pub mod extract; pub mod gaf; +pub mod pangenotype; pub mod position; diff --git a/flatgfa/src/ops/pangenotype.rs b/flatgfa/src/ops/pangenotype.rs new file mode 100644 index 00000000..97c9893d --- /dev/null +++ b/flatgfa/src/ops/pangenotype.rs @@ -0,0 +1,185 @@ +use crate::flatgfa::FlatGFA; +use crate::memfile; +use crate::namemap::NameMap; +use flate2::read::GzDecoder; +use memchr::memchr; +use std::fs::File; +use std::io::{self, BufRead, BufReader}; + +/// Open a GAF file reader for gzip-compressed files. +/// +/// This function opens a .gaf.gz file and returns a boxed BufRead trait object +/// for streaming decompression. Plain text files should use memory mapping instead. +/// +/// # Arguments +/// * `path` - The path to a .gaf.gz file +/// +/// # Returns +/// A Result containing a boxed trait object implementing BufRead +fn open_gzip_reader(path: &str) -> io::Result> { + let file = File::open(path)?; + let decoder = GzDecoder::new(file); + Ok(Box::new(BufReader::new(decoder))) +} + +/// Process a GAF stream (implementing BufRead) and update the pangenotype matrix. +/// +/// This is a generic function that works with any BufRead source (files, gzip streams, stdin, etc.) +/// +/// # Arguments +/// * `reader` - A boxed BufRead trait object +/// * `matrix` - The pangenotype matrix to update +/// * `file_idx` - The index of this file in the matrix +/// * `name_map` - The name map for segment lookup +fn process_gaf_stream( + reader: Box, + matrix: &mut Vec>, + file_idx: usize, + name_map: &crate::namemap::NameMap, +) -> io::Result<()> { + for line in reader.lines() { + let line = line?; + + if line.is_empty() || line.starts_with('#') { + continue; + } + + process_gaf_line(&line, matrix, file_idx, name_map); + } + Ok(()) +} + +pub fn make_pangenotype_matrix(gfa: &FlatGFA, gaf_files: Vec) -> Vec> { + let num_segments = gfa.segs.len(); + let mut matrix = vec![vec![false; num_segments]; gaf_files.len()]; + + let name_map = NameMap::build(&gfa); + + for (file_idx, gaf_path) in gaf_files.iter().enumerate() { + if gaf_path.ends_with(".gz") { + // Use BufRead stream for gzip files + match open_gzip_reader(gaf_path) { + Ok(reader) => { + if let Err(e) = process_gaf_stream(reader, &mut matrix, file_idx, &name_map) { + eprintln!("Error processing GAF stream {}: {}", gaf_path, e); + } + } + Err(e) => { + eprintln!("Error opening GAF file {}: {}", gaf_path, e); + } + } + } else { + // Use memory mapping for plain .gaf files (faster) + let mmap = memfile::map_file(gaf_path); + let mut start = 0; + + while let Some(pos) = memchr(b'\n', &mmap[start..]) { + let line_end = start + pos; + let line = &mmap[start..line_end]; + start = line_end + 1; + + if line.is_empty() || line[0] == b'#' { + continue; + } + + process_gaf_line_bytes(line, &mut matrix, file_idx, &name_map); + } + } + } + + matrix +} + +pub fn make_pangenotype_matrix_from_stream( + gfa: &FlatGFA, + reader: Box, +) -> io::Result> { + let num_segments = gfa.segs.len(); + let mut matrix = vec![vec![false; num_segments]]; + let name_map = NameMap::build(&gfa); + + process_gaf_stream(reader, &mut matrix, 0, &name_map)?; + + Ok(matrix.remove(0)) +} + +fn process_gaf_line( + line: &str, + matrix: &mut Vec>, + file_idx: usize, + name_map: &crate::namemap::NameMap, +) { + let bytes = line.as_bytes(); + let mut tab_count = 0; + let mut idx = 0; + while idx < bytes.len() && tab_count < 5 { + if bytes[idx] == b'\t' { + tab_count += 1; + } + idx += 1; + } + + if tab_count < 5 || idx >= bytes.len() { + return; + } + // The path data is in the 5th field (index 4, after 5 tabs) + let mut end_idx = idx; + while end_idx < bytes.len() && bytes[end_idx] != b'\t' { + end_idx += 1; + } + let path_field = &bytes[idx..end_idx]; + + parse_path_field(path_field, matrix, file_idx, name_map); +} + +fn process_gaf_line_bytes( + line: &[u8], + matrix: &mut Vec>, + file_idx: usize, + name_map: &crate::namemap::NameMap, +) { + let mut tab_count = 0; + let mut idx = 0; + while idx < line.len() && tab_count < 5 { + if line[idx] == b'\t' { + tab_count += 1; + } + idx += 1; + } + + if tab_count < 5 || idx >= line.len() { + return; + } + + let mut end_idx = idx; + while end_idx < line.len() && line[end_idx] != b'\t' { + end_idx += 1; + } + let path_field = &line[idx..end_idx]; + + parse_path_field(path_field, matrix, file_idx, name_map); +} + +fn parse_path_field( + path_field: &[u8], + matrix: &mut Vec>, + file_idx: usize, + name_map: &crate::namemap::NameMap, +) { + let mut p = 0; + while p < path_field.len() { + let byte = path_field[p]; + if byte == b'>' || byte == b'<' { + p += 1; + let mut num = 0usize; + while p < path_field.len() && path_field[p].is_ascii_digit() { + num = num * 10 + (path_field[p] - b'0') as usize; + p += 1; + } + let seg_id = name_map.get(num); + matrix[file_idx][u32::from(seg_id) as usize] = true; + } else { + p += 1; + } + } +}