11use crate :: flatgfa:: FlatGFA ;
22use crate :: memfile;
33use crate :: namemap:: NameMap ;
4- use flate2:: read:: GzDecoder ;
54use memchr:: memchr;
6- use std:: fs:: File ;
7- use std:: io:: { self , BufRead , BufReader , Read } ;
8-
9- /// Open a GAF file reader for gzip-compressed files.
10- ///
11- /// This function opens a .gaf.gz file and returns a boxed BufRead trait object
12- /// for streaming decompression. Plain text files should use memory mapping instead.
13- ///
14- /// # Arguments
15- /// * `path` - The path to a .gaf.gz file
16- ///
17- /// # Returns
18- /// A Result containing a boxed trait object implementing BufRead
19- fn open_gzip_reader ( path : & str ) -> io:: Result < Box < dyn BufRead > > {
20- let file = File :: open ( path) ?;
21- let decoder = GzDecoder :: new ( file) ;
22- Ok ( Box :: new ( BufReader :: new ( decoder) ) )
23- }
24-
25- /// Process a GAF stream (implementing BufRead) and update the pangenotype matrix.
26- ///
27- /// This is a generic function that works with any BufRead source (files, gzip streams, stdin, etc.)
28- ///
29- /// # Arguments
30- /// * `reader` - A boxed BufRead trait object
31- /// * `matrix` - The pangenotype matrix to update
32- /// * `file_idx` - The index of this file in the matrix
33- /// * `name_map` - The name map for segment lookup
34- fn process_gaf_stream (
35- reader : Box < dyn BufRead > ,
36- matrix : & mut Vec < Vec < bool > > ,
37- file_idx : usize ,
38- name_map : & crate :: namemap:: NameMap ,
39- ) -> io:: Result < ( ) > {
40- for line in reader. lines ( ) {
41- let line = line?;
42-
43- if line. is_empty ( ) || line. starts_with ( '#' ) {
44- continue ;
45- }
46-
47- process_gaf_line ( & line, matrix, file_idx, name_map) ;
48- }
49- Ok ( ( ) )
50- }
515
526pub fn make_pangenotype_matrix ( gfa : & FlatGFA , gaf_files : Vec < String > ) -> Vec < Vec < bool > > {
537 let num_segments = gfa. segs . len ( ) ;
@@ -56,116 +10,56 @@ pub fn make_pangenotype_matrix(gfa: &FlatGFA, gaf_files: Vec<String>) -> Vec<Vec
5610 let name_map = NameMap :: build ( & gfa) ;
5711
5812 for ( file_idx, gaf_path) in gaf_files. iter ( ) . enumerate ( ) {
59- if gaf_path. ends_with ( ".gz" ) {
60- // Use BufRead stream for gzip files
61- match open_gzip_reader ( gaf_path) {
62- Ok ( reader) => {
63- if let Err ( e) = process_gaf_stream ( reader, & mut matrix, file_idx, & name_map) {
64- eprintln ! ( "Error processing GAF stream {}: {}" , gaf_path, e) ;
65- }
66- }
67- Err ( e) => {
68- eprintln ! ( "Error opening GAF file {}: {}" , gaf_path, e) ;
69- }
70- }
71- } else {
72- // Use memory mapping for plain .gaf files (faster)
73- let mmap = memfile:: map_file ( gaf_path) ;
74- let mut start = 0 ;
13+ let mmap = memfile:: map_file ( gaf_path) ;
14+ let mut start = 0 ;
7515
76- while let Some ( pos) = memchr ( b'\n' , & mmap[ start..] ) {
77- let line_end = start + pos;
78- let line = & mmap[ start..line_end] ;
79- start = line_end + 1 ;
16+ while let Some ( pos) = memchr ( b'\n' , & mmap[ start..] ) {
17+ let line_end = start + pos;
18+ let line = & mmap[ start..line_end] ;
19+ start = line_end + 1 ;
8020
81- if line. is_empty ( ) || line[ 0 ] == b'#' {
82- continue ;
21+ if line. is_empty ( ) || line[ 0 ] == b'#' {
22+ continue ;
23+ }
24+
25+ let mut tab_count = 0 ;
26+ let mut idx = 0 ;
27+ while idx < line. len ( ) && tab_count < 5 {
28+ if line[ idx] == b'\t' {
29+ tab_count += 1 ;
8330 }
31+ idx += 1 ;
32+ }
8433
85- process_gaf_line_bytes ( line, & mut matrix, file_idx, & name_map) ;
34+ if tab_count < 5 || idx >= line. len ( ) {
35+ continue ;
36+ }
37+ //The path data is in the 5th field only.
38+ let mut end_idx = idx;
39+ while end_idx < line. len ( ) && line[ end_idx] != b'\t' {
40+ end_idx += 1 ;
41+ }
42+ let path_field = & line[ idx..end_idx] ;
43+
44+ // === Parse path field like >12<34>56 ===
45+ let mut p = 0 ;
46+ while p < path_field. len ( ) {
47+ let byte = path_field[ p] ;
48+ if byte == b'>' || byte == b'<' {
49+ p += 1 ;
50+ let mut num = 0usize ;
51+ while p < path_field. len ( ) && path_field[ p] . is_ascii_digit ( ) {
52+ num = num * 10 + ( path_field[ p] - b'0' ) as usize ;
53+ p += 1 ;
54+ }
55+ let seg_id = name_map. get ( num) ;
56+ matrix[ file_idx] [ u32:: from ( seg_id) as usize ] = true ;
57+ } else {
58+ p += 1 ;
59+ }
8660 }
8761 }
8862 }
8963
9064 matrix
9165}
92-
93- pub fn make_pangenotype_matrix_from_stream (
94- gfa : & FlatGFA ,
95- reader : Box < dyn BufRead > ,
96- ) -> io:: Result < Vec < bool > > {
97- let num_segments = gfa. segs . len ( ) ;
98- let mut matrix = vec ! [ vec![ false ; num_segments] ] ;
99- let name_map = NameMap :: build ( & gfa) ;
100-
101- process_gaf_stream ( reader, & mut matrix, 0 , & name_map) ?;
102-
103- Ok ( matrix. remove ( 0 ) )
104- }
105-
106- fn process_gaf_line ( line : & str , matrix : & mut Vec < Vec < bool > > , file_idx : usize , name_map : & crate :: namemap:: NameMap ) {
107- let bytes = line. as_bytes ( ) ;
108- let mut tab_count = 0 ;
109- let mut idx = 0 ;
110- while idx < bytes. len ( ) && tab_count < 5 {
111- if bytes[ idx] == b'\t' {
112- tab_count += 1 ;
113- }
114- idx += 1 ;
115- }
116-
117- if tab_count < 5 || idx >= bytes. len ( ) {
118- return ;
119- }
120- // The path data is in the 5th field (index 4, after 5 tabs)
121- let mut end_idx = idx;
122- while end_idx < bytes. len ( ) && bytes[ end_idx] != b'\t' {
123- end_idx += 1 ;
124- }
125- let path_field = & bytes[ idx..end_idx] ;
126-
127- parse_path_field ( path_field, matrix, file_idx, name_map) ;
128- }
129-
130- fn process_gaf_line_bytes ( line : & [ u8 ] , matrix : & mut Vec < Vec < bool > > , file_idx : usize , name_map : & crate :: namemap:: NameMap ) {
131- let mut tab_count = 0 ;
132- let mut idx = 0 ;
133- while idx < line. len ( ) && tab_count < 5 {
134- if line[ idx] == b'\t' {
135- tab_count += 1 ;
136- }
137- idx += 1 ;
138- }
139-
140- if tab_count < 5 || idx >= line. len ( ) {
141- return ;
142- }
143-
144- let mut end_idx = idx;
145- while end_idx < line. len ( ) && line[ end_idx] != b'\t' {
146- end_idx += 1 ;
147- }
148- let path_field = & line[ idx..end_idx] ;
149-
150- parse_path_field ( path_field, matrix, file_idx, name_map) ;
151- }
152-
153-
154- fn parse_path_field ( path_field : & [ u8 ] , matrix : & mut Vec < Vec < bool > > , file_idx : usize , name_map : & crate :: namemap:: NameMap ) {
155- let mut p = 0 ;
156- while p < path_field. len ( ) {
157- let byte = path_field[ p] ;
158- if byte == b'>' || byte == b'<' {
159- p += 1 ;
160- let mut num = 0usize ;
161- while p < path_field. len ( ) && path_field[ p] . is_ascii_digit ( ) {
162- num = num * 10 + ( path_field[ p] - b'0' ) as usize ;
163- p += 1 ;
164- }
165- let seg_id = name_map. get ( num) ;
166- matrix[ file_idx] [ u32:: from ( seg_id) as usize ] = true ;
167- } else {
168- p += 1 ;
169- }
170- }
171- }
0 commit comments