11use crate :: flatgfa:: FlatGFA ;
22use crate :: memfile;
33use crate :: namemap:: NameMap ;
4+ use flate2:: read:: GzDecoder ;
45use memchr:: memchr;
6+ use std:: fs:: File ;
7+ use std:: io:: { self , BufRead , BufReader , Read } ;
8+
9+ /// Open a GAF file reader for gzip-compressed files.
10+ ///
11+ /// This function opens a .gaf.gz file and returns a boxed BufRead trait object
12+ /// for streaming decompression. Plain text files should use memory mapping instead.
13+ ///
14+ /// # Arguments
15+ /// * `path` - The path to a .gaf.gz file
16+ ///
17+ /// # Returns
18+ /// A Result containing a boxed trait object implementing BufRead
19+ fn open_gzip_reader ( path : & str ) -> io:: Result < Box < dyn BufRead > > {
20+ let file = File :: open ( path) ?;
21+ let decoder = GzDecoder :: new ( file) ;
22+ Ok ( Box :: new ( BufReader :: new ( decoder) ) )
23+ }
24+
25+ /// Process a GAF stream (implementing BufRead) and update the pangenotype matrix.
26+ ///
27+ /// This is a generic function that works with any BufRead source (files, gzip streams, stdin, etc.)
28+ ///
29+ /// # Arguments
30+ /// * `reader` - A boxed BufRead trait object
31+ /// * `matrix` - The pangenotype matrix to update
32+ /// * `file_idx` - The index of this file in the matrix
33+ /// * `name_map` - The name map for segment lookup
34+ fn process_gaf_stream (
35+ reader : Box < dyn BufRead > ,
36+ matrix : & mut Vec < Vec < bool > > ,
37+ file_idx : usize ,
38+ name_map : & crate :: namemap:: NameMap ,
39+ ) -> io:: Result < ( ) > {
40+ for line in reader. lines ( ) {
41+ let line = line?;
42+
43+ if line. is_empty ( ) || line. starts_with ( '#' ) {
44+ continue ;
45+ }
46+
47+ process_gaf_line ( & line, matrix, file_idx, name_map) ;
48+ }
49+ Ok ( ( ) )
50+ }
551
652pub fn make_pangenotype_matrix ( gfa : & FlatGFA , gaf_files : Vec < String > ) -> Vec < Vec < bool > > {
753 let num_segments = gfa. segs . len ( ) ;
@@ -10,56 +56,116 @@ pub fn make_pangenotype_matrix(gfa: &FlatGFA, gaf_files: Vec<String>) -> Vec<Vec
1056 let name_map = NameMap :: build ( & gfa) ;
1157
1258 for ( file_idx, gaf_path) in gaf_files. iter ( ) . enumerate ( ) {
13- let mmap = memfile:: map_file ( gaf_path) ;
14- let mut start = 0 ;
15-
16- while let Some ( pos) = memchr ( b'\n' , & mmap[ start..] ) {
17- let line_end = start + pos;
18- let line = & mmap[ start..line_end] ;
19- start = line_end + 1 ;
20-
21- if line. is_empty ( ) || line[ 0 ] == b'#' {
22- continue ;
23- }
24-
25- let mut tab_count = 0 ;
26- let mut idx = 0 ;
27- while idx < line. len ( ) && tab_count < 5 {
28- if line[ idx] == b'\t' {
29- tab_count += 1 ;
59+ if gaf_path. ends_with ( ".gz" ) {
60+ // Use BufRead stream for gzip files
61+ match open_gzip_reader ( gaf_path) {
62+ Ok ( reader) => {
63+ if let Err ( e) = process_gaf_stream ( reader, & mut matrix, file_idx, & name_map) {
64+ eprintln ! ( "Error processing GAF stream {}: {}" , gaf_path, e) ;
65+ }
66+ }
67+ Err ( e) => {
68+ eprintln ! ( "Error opening GAF file {}: {}" , gaf_path, e) ;
3069 }
31- idx += 1 ;
3270 }
71+ } else {
72+ // Use memory mapping for plain .gaf files (faster)
73+ let mmap = memfile:: map_file ( gaf_path) ;
74+ let mut start = 0 ;
3375
34- if tab_count < 5 || idx >= line. len ( ) {
35- continue ;
36- }
37- //The path data is in the 5th field only.
38- let mut end_idx = idx;
39- while end_idx < line. len ( ) && line[ end_idx] != b'\t' {
40- end_idx += 1 ;
41- }
42- let path_field = & line[ idx..end_idx] ;
43-
44- // === Parse path field like >12<34>56 ===
45- let mut p = 0 ;
46- while p < path_field. len ( ) {
47- let byte = path_field[ p] ;
48- if byte == b'>' || byte == b'<' {
49- p += 1 ;
50- let mut num = 0usize ;
51- while p < path_field. len ( ) && path_field[ p] . is_ascii_digit ( ) {
52- num = num * 10 + ( path_field[ p] - b'0' ) as usize ;
53- p += 1 ;
54- }
55- let seg_id = name_map. get ( num) ;
56- matrix[ file_idx] [ u32:: from ( seg_id) as usize ] = true ;
57- } else {
58- p += 1 ;
76+ while let Some ( pos) = memchr ( b'\n' , & mmap[ start..] ) {
77+ let line_end = start + pos;
78+ let line = & mmap[ start..line_end] ;
79+ start = line_end + 1 ;
80+
81+ if line. is_empty ( ) || line[ 0 ] == b'#' {
82+ continue ;
5983 }
84+
85+ process_gaf_line_bytes ( line, & mut matrix, file_idx, & name_map) ;
6086 }
6187 }
6288 }
6389
6490 matrix
6591}
92+
93+ pub fn make_pangenotype_matrix_from_stream (
94+ gfa : & FlatGFA ,
95+ reader : Box < dyn BufRead > ,
96+ ) -> io:: Result < Vec < bool > > {
97+ let num_segments = gfa. segs . len ( ) ;
98+ let mut matrix = vec ! [ vec![ false ; num_segments] ] ;
99+ let name_map = NameMap :: build ( & gfa) ;
100+
101+ process_gaf_stream ( reader, & mut matrix, 0 , & name_map) ?;
102+
103+ Ok ( matrix. remove ( 0 ) )
104+ }
105+
106+ fn process_gaf_line ( line : & str , matrix : & mut Vec < Vec < bool > > , file_idx : usize , name_map : & crate :: namemap:: NameMap ) {
107+ let bytes = line. as_bytes ( ) ;
108+ let mut tab_count = 0 ;
109+ let mut idx = 0 ;
110+ while idx < bytes. len ( ) && tab_count < 5 {
111+ if bytes[ idx] == b'\t' {
112+ tab_count += 1 ;
113+ }
114+ idx += 1 ;
115+ }
116+
117+ if tab_count < 5 || idx >= bytes. len ( ) {
118+ return ;
119+ }
120+ // The path data is in the 5th field (index 4, after 5 tabs)
121+ let mut end_idx = idx;
122+ while end_idx < bytes. len ( ) && bytes[ end_idx] != b'\t' {
123+ end_idx += 1 ;
124+ }
125+ let path_field = & bytes[ idx..end_idx] ;
126+
127+ parse_path_field ( path_field, matrix, file_idx, name_map) ;
128+ }
129+
130+ fn process_gaf_line_bytes ( line : & [ u8 ] , matrix : & mut Vec < Vec < bool > > , file_idx : usize , name_map : & crate :: namemap:: NameMap ) {
131+ let mut tab_count = 0 ;
132+ let mut idx = 0 ;
133+ while idx < line. len ( ) && tab_count < 5 {
134+ if line[ idx] == b'\t' {
135+ tab_count += 1 ;
136+ }
137+ idx += 1 ;
138+ }
139+
140+ if tab_count < 5 || idx >= line. len ( ) {
141+ return ;
142+ }
143+
144+ let mut end_idx = idx;
145+ while end_idx < line. len ( ) && line[ end_idx] != b'\t' {
146+ end_idx += 1 ;
147+ }
148+ let path_field = & line[ idx..end_idx] ;
149+
150+ parse_path_field ( path_field, matrix, file_idx, name_map) ;
151+ }
152+
153+
154+ fn parse_path_field ( path_field : & [ u8 ] , matrix : & mut Vec < Vec < bool > > , file_idx : usize , name_map : & crate :: namemap:: NameMap ) {
155+ let mut p = 0 ;
156+ while p < path_field. len ( ) {
157+ let byte = path_field[ p] ;
158+ if byte == b'>' || byte == b'<' {
159+ p += 1 ;
160+ let mut num = 0usize ;
161+ while p < path_field. len ( ) && path_field[ p] . is_ascii_digit ( ) {
162+ num = num * 10 + ( path_field[ p] - b'0' ) as usize ;
163+ p += 1 ;
164+ }
165+ let seg_id = name_map. get ( num) ;
166+ matrix[ file_idx] [ u32:: from ( seg_id) as usize ] = true ;
167+ } else {
168+ p += 1 ;
169+ }
170+ }
171+ }
0 commit comments