Skip to content

Commit ca90518

Browse files
author
Hang Ji
committed
Revert "Feature: decompressing for gzip file, supporting flate2 library and shell pipeline approach"
This reverts commit 7dd3314.
1 parent 7dd3314 commit ca90518

File tree

3 files changed

+47
-178
lines changed

3 files changed

+47
-178
lines changed

flatgfa/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ argh = "0.1.13"
1212
atoi = "2.0.0"
1313
bit-vec = "0.8.0"
1414
bstr = "1.12.0"
15-
flate2 = "1.0"
1615
memchr = "2.7.4"
1716
memmap = "0.7.0"
1817
num_enum = "0.7.3"

flatgfa/src/cli/cmds.rs

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -343,37 +343,13 @@ pub fn bed_intersect(args: BEDIntersect) {
343343
#[derive(FromArgs, PartialEq, Debug)]
344344
#[argh(subcommand, name = "matrix")]
345345
pub struct PangenotypeMatrix {
346-
/// the GAF file to use for genotyping (use `-f <file>`)
347-
#[argh(option, short = 'f')]
348-
file: Option<String>,
349-
350-
/// read from stdin
351-
#[argh(switch)]
352-
stdin: bool,
346+
/// the GAF file to use for genotyping
347+
#[argh(positional)]
348+
gaf_file: String,
353349
}
354350

355351
pub fn pangenotype_matrix(gfa: &flatgfa::FlatGFA, args: PangenotypeMatrix) {
356-
use std::io::{self, BufRead};
357-
358-
let matrix = if args.stdin {
359-
// Read from stdin stream
360-
let stdin = io::stdin();
361-
let reader: Box<dyn BufRead> = Box::new(stdin.lock());
362-
match ops::pangenotype::make_pangenotype_matrix_from_stream(gfa, reader) {
363-
Ok(row) => vec![row],
364-
Err(e) => {
365-
eprintln!("Error reading from stdin: {}", e);
366-
return;
367-
}
368-
}
369-
} else if let Some(file_path) = args.file {
370-
// For files (plain or .gz) the existing file-based API handles mmap and gzip
371-
ops::pangenotype::make_pangenotype_matrix(gfa, vec![file_path])
372-
} else {
373-
eprintln!("Please provide a GAF file with -f <file> or use --stdin");
374-
return;
375-
};
376-
352+
let matrix = ops::pangenotype::make_pangenotype_matrix(gfa, vec![args.gaf_file]);
377353
for row in matrix {
378354
for col in row {
379355
if col {

flatgfa/src/ops/pangenotype.rs

Lines changed: 43 additions & 149 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,7 @@
11
use crate::flatgfa::FlatGFA;
22
use crate::memfile;
33
use crate::namemap::NameMap;
4-
use flate2::read::GzDecoder;
54
use memchr::memchr;
6-
use std::fs::File;
7-
use std::io::{self, BufRead, BufReader, Read};
8-
9-
/// Open a GAF file reader for gzip-compressed files.
10-
///
11-
/// This function opens a .gaf.gz file and returns a boxed BufRead trait object
12-
/// for streaming decompression. Plain text files should use memory mapping instead.
13-
///
14-
/// # Arguments
15-
/// * `path` - The path to a .gaf.gz file
16-
///
17-
/// # Returns
18-
/// A Result containing a boxed trait object implementing BufRead
19-
fn open_gzip_reader(path: &str) -> io::Result<Box<dyn BufRead>> {
20-
let file = File::open(path)?;
21-
let decoder = GzDecoder::new(file);
22-
Ok(Box::new(BufReader::new(decoder)))
23-
}
24-
25-
/// Process a GAF stream (implementing BufRead) and update the pangenotype matrix.
26-
///
27-
/// This is a generic function that works with any BufRead source (files, gzip streams, stdin, etc.)
28-
///
29-
/// # Arguments
30-
/// * `reader` - A boxed BufRead trait object
31-
/// * `matrix` - The pangenotype matrix to update
32-
/// * `file_idx` - The index of this file in the matrix
33-
/// * `name_map` - The name map for segment lookup
34-
fn process_gaf_stream(
35-
reader: Box<dyn BufRead>,
36-
matrix: &mut Vec<Vec<bool>>,
37-
file_idx: usize,
38-
name_map: &crate::namemap::NameMap,
39-
) -> io::Result<()> {
40-
for line in reader.lines() {
41-
let line = line?;
42-
43-
if line.is_empty() || line.starts_with('#') {
44-
continue;
45-
}
46-
47-
process_gaf_line(&line, matrix, file_idx, name_map);
48-
}
49-
Ok(())
50-
}
515

526
pub fn make_pangenotype_matrix(gfa: &FlatGFA, gaf_files: Vec<String>) -> Vec<Vec<bool>> {
537
let num_segments = gfa.segs.len();
@@ -56,116 +10,56 @@ pub fn make_pangenotype_matrix(gfa: &FlatGFA, gaf_files: Vec<String>) -> Vec<Vec
5610
let name_map = NameMap::build(&gfa);
5711

5812
for (file_idx, gaf_path) in gaf_files.iter().enumerate() {
59-
if gaf_path.ends_with(".gz") {
60-
// Use BufRead stream for gzip files
61-
match open_gzip_reader(gaf_path) {
62-
Ok(reader) => {
63-
if let Err(e) = process_gaf_stream(reader, &mut matrix, file_idx, &name_map) {
64-
eprintln!("Error processing GAF stream {}: {}", gaf_path, e);
65-
}
66-
}
67-
Err(e) => {
68-
eprintln!("Error opening GAF file {}: {}", gaf_path, e);
69-
}
70-
}
71-
} else {
72-
// Use memory mapping for plain .gaf files (faster)
73-
let mmap = memfile::map_file(gaf_path);
74-
let mut start = 0;
13+
let mmap = memfile::map_file(gaf_path);
14+
let mut start = 0;
7515

76-
while let Some(pos) = memchr(b'\n', &mmap[start..]) {
77-
let line_end = start + pos;
78-
let line = &mmap[start..line_end];
79-
start = line_end + 1;
16+
while let Some(pos) = memchr(b'\n', &mmap[start..]) {
17+
let line_end = start + pos;
18+
let line = &mmap[start..line_end];
19+
start = line_end + 1;
8020

81-
if line.is_empty() || line[0] == b'#' {
82-
continue;
21+
if line.is_empty() || line[0] == b'#' {
22+
continue;
23+
}
24+
25+
let mut tab_count = 0;
26+
let mut idx = 0;
27+
while idx < line.len() && tab_count < 5 {
28+
if line[idx] == b'\t' {
29+
tab_count += 1;
8330
}
31+
idx += 1;
32+
}
8433

85-
process_gaf_line_bytes(line, &mut matrix, file_idx, &name_map);
34+
if tab_count < 5 || idx >= line.len() {
35+
continue;
36+
}
37+
//The path data is in the 5th field only.
38+
let mut end_idx = idx;
39+
while end_idx < line.len() && line[end_idx] != b'\t' {
40+
end_idx += 1;
41+
}
42+
let path_field = &line[idx..end_idx];
43+
44+
// === Parse path field like >12<34>56 ===
45+
let mut p = 0;
46+
while p < path_field.len() {
47+
let byte = path_field[p];
48+
if byte == b'>' || byte == b'<' {
49+
p += 1;
50+
let mut num = 0usize;
51+
while p < path_field.len() && path_field[p].is_ascii_digit() {
52+
num = num * 10 + (path_field[p] - b'0') as usize;
53+
p += 1;
54+
}
55+
let seg_id = name_map.get(num);
56+
matrix[file_idx][u32::from(seg_id) as usize] = true;
57+
} else {
58+
p += 1;
59+
}
8660
}
8761
}
8862
}
8963

9064
matrix
9165
}
92-
93-
pub fn make_pangenotype_matrix_from_stream(
94-
gfa: &FlatGFA,
95-
reader: Box<dyn BufRead>,
96-
) -> io::Result<Vec<bool>> {
97-
let num_segments = gfa.segs.len();
98-
let mut matrix = vec![vec![false; num_segments]];
99-
let name_map = NameMap::build(&gfa);
100-
101-
process_gaf_stream(reader, &mut matrix, 0, &name_map)?;
102-
103-
Ok(matrix.remove(0))
104-
}
105-
106-
fn process_gaf_line(line: &str, matrix: &mut Vec<Vec<bool>>, file_idx: usize, name_map: &crate::namemap::NameMap) {
107-
let bytes = line.as_bytes();
108-
let mut tab_count = 0;
109-
let mut idx = 0;
110-
while idx < bytes.len() && tab_count < 5 {
111-
if bytes[idx] == b'\t' {
112-
tab_count += 1;
113-
}
114-
idx += 1;
115-
}
116-
117-
if tab_count < 5 || idx >= bytes.len() {
118-
return;
119-
}
120-
// The path data is in the 5th field (index 4, after 5 tabs)
121-
let mut end_idx = idx;
122-
while end_idx < bytes.len() && bytes[end_idx] != b'\t' {
123-
end_idx += 1;
124-
}
125-
let path_field = &bytes[idx..end_idx];
126-
127-
parse_path_field(path_field, matrix, file_idx, name_map);
128-
}
129-
130-
fn process_gaf_line_bytes(line: &[u8], matrix: &mut Vec<Vec<bool>>, file_idx: usize, name_map: &crate::namemap::NameMap) {
131-
let mut tab_count = 0;
132-
let mut idx = 0;
133-
while idx < line.len() && tab_count < 5 {
134-
if line[idx] == b'\t' {
135-
tab_count += 1;
136-
}
137-
idx += 1;
138-
}
139-
140-
if tab_count < 5 || idx >= line.len() {
141-
return;
142-
}
143-
144-
let mut end_idx = idx;
145-
while end_idx < line.len() && line[end_idx] != b'\t' {
146-
end_idx += 1;
147-
}
148-
let path_field = &line[idx..end_idx];
149-
150-
parse_path_field(path_field, matrix, file_idx, name_map);
151-
}
152-
153-
154-
fn parse_path_field(path_field: &[u8], matrix: &mut Vec<Vec<bool>>, file_idx: usize, name_map: &crate::namemap::NameMap) {
155-
let mut p = 0;
156-
while p < path_field.len() {
157-
let byte = path_field[p];
158-
if byte == b'>' || byte == b'<' {
159-
p += 1;
160-
let mut num = 0usize;
161-
while p < path_field.len() && path_field[p].is_ascii_digit() {
162-
num = num * 10 + (path_field[p] - b'0') as usize;
163-
p += 1;
164-
}
165-
let seg_id = name_map.get(num);
166-
matrix[file_idx][u32::from(seg_id) as usize] = true;
167-
} else {
168-
p += 1;
169-
}
170-
}
171-
}

0 commit comments

Comments
 (0)