Skip to content

Commit 7dd3314

Browse files
author
Hang Ji
committed
Feature: decompressing for gzip file, supporting flate2 library and shell pipeline approach
1 parent cc48cab commit 7dd3314

File tree

3 files changed

+178
-47
lines changed

3 files changed

+178
-47
lines changed

flatgfa/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ argh = "0.1.13"
1212
atoi = "2.0.0"
1313
bit-vec = "0.8.0"
1414
bstr = "1.12.0"
15+
flate2 = "1.0"
1516
memchr = "2.7.4"
1617
memmap = "0.7.0"
1718
num_enum = "0.7.3"

flatgfa/src/cli/cmds.rs

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -343,13 +343,37 @@ pub fn bed_intersect(args: BEDIntersect) {
343343
#[derive(FromArgs, PartialEq, Debug)]
344344
#[argh(subcommand, name = "matrix")]
345345
pub struct PangenotypeMatrix {
346-
/// the GAF file to use for genotyping
347-
#[argh(positional)]
348-
gaf_file: String,
346+
/// the GAF file to use for genotyping (use `-f <file>`)
347+
#[argh(option, short = 'f')]
348+
file: Option<String>,
349+
350+
/// read from stdin
351+
#[argh(switch)]
352+
stdin: bool,
349353
}
350354

351355
pub fn pangenotype_matrix(gfa: &flatgfa::FlatGFA, args: PangenotypeMatrix) {
352-
let matrix = ops::pangenotype::make_pangenotype_matrix(gfa, vec![args.gaf_file]);
356+
use std::io::{self, BufRead};
357+
358+
let matrix = if args.stdin {
359+
// Read from stdin stream
360+
let stdin = io::stdin();
361+
let reader: Box<dyn BufRead> = Box::new(stdin.lock());
362+
match ops::pangenotype::make_pangenotype_matrix_from_stream(gfa, reader) {
363+
Ok(row) => vec![row],
364+
Err(e) => {
365+
eprintln!("Error reading from stdin: {}", e);
366+
return;
367+
}
368+
}
369+
} else if let Some(file_path) = args.file {
370+
// For files (plain or .gz) the existing file-based API handles mmap and gzip
371+
ops::pangenotype::make_pangenotype_matrix(gfa, vec![file_path])
372+
} else {
373+
eprintln!("Please provide a GAF file with -f <file> or use --stdin");
374+
return;
375+
};
376+
353377
for row in matrix {
354378
for col in row {
355379
if col {

flatgfa/src/ops/pangenotype.rs

Lines changed: 149 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,53 @@
11
use crate::flatgfa::FlatGFA;
22
use crate::memfile;
33
use crate::namemap::NameMap;
4+
use flate2::read::GzDecoder;
45
use memchr::memchr;
6+
use std::fs::File;
7+
use std::io::{self, BufRead, BufReader, Read};
8+
9+
/// Open a GAF file reader for gzip-compressed files.
10+
///
11+
/// This function opens a .gaf.gz file and returns a boxed BufRead trait object
12+
/// for streaming decompression. Plain text files should use memory mapping instead.
13+
///
14+
/// # Arguments
15+
/// * `path` - The path to a .gaf.gz file
16+
///
17+
/// # Returns
18+
/// A Result containing a boxed trait object implementing BufRead
19+
fn open_gzip_reader(path: &str) -> io::Result<Box<dyn BufRead>> {
20+
let file = File::open(path)?;
21+
let decoder = GzDecoder::new(file);
22+
Ok(Box::new(BufReader::new(decoder)))
23+
}
24+
25+
/// Process a GAF stream (implementing BufRead) and update the pangenotype matrix.
26+
///
27+
/// This is a generic function that works with any BufRead source (files, gzip streams, stdin, etc.)
28+
///
29+
/// # Arguments
30+
/// * `reader` - A boxed BufRead trait object
31+
/// * `matrix` - The pangenotype matrix to update
32+
/// * `file_idx` - The index of this file in the matrix
33+
/// * `name_map` - The name map for segment lookup
34+
fn process_gaf_stream(
35+
reader: Box<dyn BufRead>,
36+
matrix: &mut Vec<Vec<bool>>,
37+
file_idx: usize,
38+
name_map: &crate::namemap::NameMap,
39+
) -> io::Result<()> {
40+
for line in reader.lines() {
41+
let line = line?;
42+
43+
if line.is_empty() || line.starts_with('#') {
44+
continue;
45+
}
46+
47+
process_gaf_line(&line, matrix, file_idx, name_map);
48+
}
49+
Ok(())
50+
}
551

652
pub fn make_pangenotype_matrix(gfa: &FlatGFA, gaf_files: Vec<String>) -> Vec<Vec<bool>> {
753
let num_segments = gfa.segs.len();
@@ -10,56 +56,116 @@ pub fn make_pangenotype_matrix(gfa: &FlatGFA, gaf_files: Vec<String>) -> Vec<Vec
1056
let name_map = NameMap::build(&gfa);
1157

1258
for (file_idx, gaf_path) in gaf_files.iter().enumerate() {
13-
let mmap = memfile::map_file(gaf_path);
14-
let mut start = 0;
15-
16-
while let Some(pos) = memchr(b'\n', &mmap[start..]) {
17-
let line_end = start + pos;
18-
let line = &mmap[start..line_end];
19-
start = line_end + 1;
20-
21-
if line.is_empty() || line[0] == b'#' {
22-
continue;
23-
}
24-
25-
let mut tab_count = 0;
26-
let mut idx = 0;
27-
while idx < line.len() && tab_count < 5 {
28-
if line[idx] == b'\t' {
29-
tab_count += 1;
59+
if gaf_path.ends_with(".gz") {
60+
// Use BufRead stream for gzip files
61+
match open_gzip_reader(gaf_path) {
62+
Ok(reader) => {
63+
if let Err(e) = process_gaf_stream(reader, &mut matrix, file_idx, &name_map) {
64+
eprintln!("Error processing GAF stream {}: {}", gaf_path, e);
65+
}
66+
}
67+
Err(e) => {
68+
eprintln!("Error opening GAF file {}: {}", gaf_path, e);
3069
}
31-
idx += 1;
3270
}
71+
} else {
72+
// Use memory mapping for plain .gaf files (faster)
73+
let mmap = memfile::map_file(gaf_path);
74+
let mut start = 0;
3375

34-
if tab_count < 5 || idx >= line.len() {
35-
continue;
36-
}
37-
//The path data is in the 5th field only.
38-
let mut end_idx = idx;
39-
while end_idx < line.len() && line[end_idx] != b'\t' {
40-
end_idx += 1;
41-
}
42-
let path_field = &line[idx..end_idx];
43-
44-
// === Parse path field like >12<34>56 ===
45-
let mut p = 0;
46-
while p < path_field.len() {
47-
let byte = path_field[p];
48-
if byte == b'>' || byte == b'<' {
49-
p += 1;
50-
let mut num = 0usize;
51-
while p < path_field.len() && path_field[p].is_ascii_digit() {
52-
num = num * 10 + (path_field[p] - b'0') as usize;
53-
p += 1;
54-
}
55-
let seg_id = name_map.get(num);
56-
matrix[file_idx][u32::from(seg_id) as usize] = true;
57-
} else {
58-
p += 1;
76+
while let Some(pos) = memchr(b'\n', &mmap[start..]) {
77+
let line_end = start + pos;
78+
let line = &mmap[start..line_end];
79+
start = line_end + 1;
80+
81+
if line.is_empty() || line[0] == b'#' {
82+
continue;
5983
}
84+
85+
process_gaf_line_bytes(line, &mut matrix, file_idx, &name_map);
6086
}
6187
}
6288
}
6389

6490
matrix
6591
}
92+
93+
pub fn make_pangenotype_matrix_from_stream(
94+
gfa: &FlatGFA,
95+
reader: Box<dyn BufRead>,
96+
) -> io::Result<Vec<bool>> {
97+
let num_segments = gfa.segs.len();
98+
let mut matrix = vec![vec![false; num_segments]];
99+
let name_map = NameMap::build(&gfa);
100+
101+
process_gaf_stream(reader, &mut matrix, 0, &name_map)?;
102+
103+
Ok(matrix.remove(0))
104+
}
105+
106+
fn process_gaf_line(line: &str, matrix: &mut Vec<Vec<bool>>, file_idx: usize, name_map: &crate::namemap::NameMap) {
107+
let bytes = line.as_bytes();
108+
let mut tab_count = 0;
109+
let mut idx = 0;
110+
while idx < bytes.len() && tab_count < 5 {
111+
if bytes[idx] == b'\t' {
112+
tab_count += 1;
113+
}
114+
idx += 1;
115+
}
116+
117+
if tab_count < 5 || idx >= bytes.len() {
118+
return;
119+
}
120+
// The path data is in the 5th field (index 4, after 5 tabs)
121+
let mut end_idx = idx;
122+
while end_idx < bytes.len() && bytes[end_idx] != b'\t' {
123+
end_idx += 1;
124+
}
125+
let path_field = &bytes[idx..end_idx];
126+
127+
parse_path_field(path_field, matrix, file_idx, name_map);
128+
}
129+
130+
fn process_gaf_line_bytes(line: &[u8], matrix: &mut Vec<Vec<bool>>, file_idx: usize, name_map: &crate::namemap::NameMap) {
131+
let mut tab_count = 0;
132+
let mut idx = 0;
133+
while idx < line.len() && tab_count < 5 {
134+
if line[idx] == b'\t' {
135+
tab_count += 1;
136+
}
137+
idx += 1;
138+
}
139+
140+
if tab_count < 5 || idx >= line.len() {
141+
return;
142+
}
143+
144+
let mut end_idx = idx;
145+
while end_idx < line.len() && line[end_idx] != b'\t' {
146+
end_idx += 1;
147+
}
148+
let path_field = &line[idx..end_idx];
149+
150+
parse_path_field(path_field, matrix, file_idx, name_map);
151+
}
152+
153+
154+
fn parse_path_field(path_field: &[u8], matrix: &mut Vec<Vec<bool>>, file_idx: usize, name_map: &crate::namemap::NameMap) {
155+
let mut p = 0;
156+
while p < path_field.len() {
157+
let byte = path_field[p];
158+
if byte == b'>' || byte == b'<' {
159+
p += 1;
160+
let mut num = 0usize;
161+
while p < path_field.len() && path_field[p].is_ascii_digit() {
162+
num = num * 10 + (path_field[p] - b'0') as usize;
163+
p += 1;
164+
}
165+
let seg_id = name_map.get(num);
166+
matrix[file_idx][u32::from(seg_id) as usize] = true;
167+
} else {
168+
p += 1;
169+
}
170+
}
171+
}

0 commit comments

Comments
 (0)