Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions flatgfa-py/examples/matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pathlib
from itertools import islice

import flatgfa


def iter_bits_ones_dash(path, chunk_bytes: int = 64 * 1024):
ws = b" \t\r\n\v\f"
with open(path, "rb") as f:
while True:
chunk = f.read(chunk_bytes)
if not chunk:
break
for b in chunk:
if b in ws:
continue
if b == ord("1"):
yield True
elif b == ord("0"):
yield False
else:
raise ValueError(
f"Invalid character in file: {chr(b)!r} (expected '1' or '0')"
)


def compare_row_to_file(row_bits, path) -> tuple[bool, int | None]:
n = len(row_bits)
i = 0
for fb in iter_bits_ones_dash(path):
if i >= n:
# file has extra bits
raise ValueError("File contains more bits than the row length.")
if fb != bool(row_bits[i]):
return False, i
i += 1

if i != n:
# file ended early
raise ValueError(f"File ended early: got {i} bits, expected {n}.")
return True, None


TEST_DIR = pathlib.Path(__file__).parent
TEST_GFA = TEST_DIR / "./matrix_gaf_folder/Chr3.gfa"
GAF_DIR = (TEST_DIR / "./matrix_gaf_folder").resolve()
# MASTER_TXT = TEST_DIR / "./matrix_gaf_folder/Chr3-pangenotype"

graph = flatgfa.parse(str(TEST_GFA))
gaf = [str(p) for p in GAF_DIR.glob("*.gaf")]
pangenotype_matrix = graph.make_pangenotype_matrix(gaf)
# assert len(pangenotype_matrix) == len(gaf)
# first_row = list(pangenotype_matrix[0])
# ok, where = compare_row_to_file(first_row, str(MASTER_TXT))
# print("OK (all columns match)" if ok else f"Mismatch at column {where}")
FIRST_N = 100
for gaf_path, row in zip(gaf, pangenotype_matrix):
row01 = [int(b) for b in row]
# print(pathlib.Path(gaf_path).name, *row01)
first_bits = islice(row, FIRST_N)
print(pathlib.Path(gaf_path).name, *map(int, first_bits))
2 changes: 2 additions & 0 deletions flatgfa-py/examples/tiny.gaf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
foo 12 0 12 + >1>2<4 38 5 17 12 12 0 cg:Z:150M
bar 20 0 20 + >1>2>3 30 7 27 20 20 0 cg:Z:150M
11 changes: 11 additions & 0 deletions flatgfa-py/examples/tiny.gfa
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
H VN:Z:1.0
S 1 CAAATAAG
S 2 AAATTTTCTGGAGTTCTAT
S 3 TTG
S 4 CCAACTCTCTG
P one 1+,2+,4- *
P two 1+,2+,3+,4- *
L 1 + 2 + 0M
L 2 + 4 - 0M
L 2 + 3 + 0M
L 3 + 4 - 0M
1 change: 1 addition & 0 deletions flatgfa-py/flatgfa.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ class FlatGFA:
def write_gfa(self, filename: str) -> None: ...
def all_reads(self, gaf: str) -> GAFParser: ...
def print_gaf_lookup(self, gaf: str) -> None: ...
def make_pangenotype_matrix(self, gaf_files: list[str]) -> list[list[bool]]: ...

def parse(filename: str) -> FlatGFA: ...
def load(filename: str) -> FlatGFA: ...
Expand Down
6 changes: 6 additions & 0 deletions flatgfa-py/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use flatgfa::namemap::NameMap;
use flatgfa::ops::gaf::{ChunkEvent, GAFParser};
use flatgfa::ops::pangenotype;
use flatgfa::pool::Id;
use flatgfa::{self, file, memfile, print, FlatGFA, Handle, HeapGFAStore};
use memmap::Mmap;
Expand Down Expand Up @@ -167,6 +168,11 @@ impl PyFlatGFA {
println!();
}
}

pub fn make_pangenotype_matrix(&self, gaf_files: Vec<String>) -> Vec<Vec<bool>> {
let gfa = self.0.view();
pangenotype::make_pangenotype_matrix(&gfa, gaf_files)
}
}

/// A reference to a list of *any* type within a FlatGFA.
Expand Down
37 changes: 37 additions & 0 deletions flatgfa-py/test/test_gaf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import pathlib

import numpy as np

import flatgfa

TEST_DIR = pathlib.Path(__file__).parent
Expand All @@ -24,3 +27,37 @@ def test_gaf_ranges():
[(5, 8), (0, 9), (1, 0)],
[(7, 8), (0, 18), (0, 0)],
]


def test_construct_pangenotype_matrix():
gfa = flatgfa.parse_bytes(TEST_GFA.read_bytes())
pangenotype_matrix = gfa.make_pangenotype_matrix([(str(TEST_GAF))])
assert pangenotype_matrix == [[True, True, True, True]]


def test_pangenotype_regression():
gfa = flatgfa.parse_bytes(TEST_GFA.read_bytes())
pangenotype_matrix = gfa.make_pangenotype_matrix([str(TEST_GAF)])

# Optional: inspect structure
print("Matrix:", pangenotype_matrix)

# Example nested loop: iterate over rows and values
for row in pangenotype_matrix:
for val in row:
print("Segment present:", val)

# Convert nested list of bools -> NumPy array of floats
a = np.array(pangenotype_matrix, dtype=float)
b = np.random.rand(a.shape[0])
result = np.linalg.lstsq(a, b, rcond=None)
x, residuals, rank, s = result

# Print or assert just to verify
print("Coefficients:", x)
print("Residuals:", residuals)
print("Rank:", rank)
print("Singular values:", s)

# Simple test condition: check the shapes
assert x.shape[0] == a.shape[1]
1 change: 1 addition & 0 deletions flatgfa/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ argh = "0.1.13"
atoi = "2.0.0"
bit-vec = "0.8.0"
bstr = "1.12.0"
flate2 = "1.0"
memchr = "2.7.4"
memmap = "0.7.0"
num_enum = "0.7.3"
Expand Down
47 changes: 47 additions & 0 deletions flatgfa/src/cli/cmds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -338,3 +338,50 @@ pub fn bed_intersect(args: BEDIntersect) {
}
}
}

/// construct a pangenotype matrix
#[derive(FromArgs, PartialEq, Debug)]
#[argh(subcommand, name = "matrix")]
pub struct PangenotypeMatrix {
/// the GAF file to use for genotyping (use `-f <file>`)
#[argh(option, short = 'f')]
file: Option<String>,

/// read from stdin
#[argh(switch)]
stdin: bool,
}

pub fn pangenotype_matrix(gfa: &flatgfa::FlatGFA, args: PangenotypeMatrix) {
use std::io::{self, BufRead};

let matrix = if args.stdin {
// Read from stdin stream
let stdin = io::stdin();
let reader: Box<dyn BufRead> = Box::new(stdin.lock());
match ops::pangenotype::make_pangenotype_matrix_from_stream(gfa, reader) {
Ok(row) => vec![row],
Err(e) => {
eprintln!("Error reading from stdin: {}", e);
return;
}
}
} else if let Some(file_path) = args.file {
// For files (plain or .gz) the existing file-based API handles mmap and gzip
ops::pangenotype::make_pangenotype_matrix(gfa, vec![file_path])
} else {
eprintln!("Please provide a GAF file with -f <file> or use --stdin");
return;
};

for row in matrix {
for col in row {
if col {
print!("1");
} else {
print!("0");
}
}
println!();
}
}
4 changes: 4 additions & 0 deletions flatgfa/src/cli/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ enum Command {
GafLookup(cmds::GAFLookup),
Bench(cmds::Bench),
BedIntersect(cmds::BEDIntersect),
PangenotypeMatrix(cmds::PangenotypeMatrix),
}

fn main() -> Result<(), &'static str> {
Expand Down Expand Up @@ -146,6 +147,9 @@ fn main() -> Result<(), &'static str> {
Some(Command::BedIntersect(_sub_args)) => {
panic!("Unreachable code");
}
Some(Command::PangenotypeMatrix(sub_args)) => {
cmds::pangenotype_matrix(&gfa, sub_args);
}
None => {
// Just emit the GFA or FlatGFA file.
dump(&gfa, &args.output);
Expand Down
1 change: 1 addition & 0 deletions flatgfa/src/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ pub mod chop;
pub mod depth;
pub mod extract;
pub mod gaf;
pub mod pangenotype;
pub mod position;
Loading
Loading