@@ -3,7 +3,7 @@ use either::Either;
33use glob:: glob;
44use serde:: { self , Deserialize , Serialize , de:: DeserializeOwned } ;
55use std:: {
6- collections:: HashMap ,
6+ collections:: { HashMap , HashSet } ,
77 error:: Error ,
88 fs:: { self , File , OpenOptions } ,
99 io:: { self , BufRead , BufReader , Read , Stdin , stdin} ,
@@ -142,12 +142,6 @@ pub struct SeqData {
142142 sequence : String ,
143143}
144144
145- #[ derive( Debug ) ]
146- pub struct RefLengthData {
147- name : String ,
148- length : usize ,
149- }
150-
151145/////////////// Structs to hold dais-ribosome data ///////////////
152146/// Insertion Data
153147#[ derive( Serialize , Deserialize , Debug ) ]
@@ -602,61 +596,88 @@ pub fn create_vtype_data(reads_data: &Vec<ReadsData>) -> Vec<ProcessedRecord> {
602596 processed_records
603597}
604598
605- /// Collect the reference lengths from the IRMA outputs
599+ // Function to collect reference lengths from IRMA outputs
606600pub fn get_reference_lens (
607601 irma_path : & PathBuf ,
608- ) -> Result < Vec < RefLengthData > , Box < dyn std:: error:: Error > > {
609- // Determine the glob pattern based on the organism
602+ ) -> Result < HashMap < String , usize > , Box < dyn std:: error:: Error > > {
610603 let pattern = format ! (
611604 "{}/*/IRMA/*/intermediate/0-ITERATIVE-REFERENCES/R0*ref" ,
612605 irma_path. to_string_lossy( )
613606 ) ;
614607
615- let mut ref_len_data : Vec < RefLengthData > = Vec :: new ( ) ;
608+ let mut ref_len_map : HashMap < String , usize > = HashMap :: new ( ) ;
616609
617- // Iterate over all files matching the pattern
618- for entry in glob ( & pattern) . expect ( "Failed to read glob pattern" ) {
610+ for entry in glob ( & pattern) ? {
619611 match entry {
620612 Ok ( path) => {
621613 let file = File :: open ( & path) ?;
622614 let reader = BufReader :: new ( file) ;
623615
624- // Parse the file line by line (assuming FASTA format)
625616 let mut ref_name = String :: new ( ) ;
626617 let mut current_sequence = String :: new ( ) ;
627618
628619 for line in reader. lines ( ) {
629620 let line = line?;
630621 if line. starts_with ( '>' ) {
631- // If there's an existing sequence, save its length
632622 if !ref_name. is_empty ( ) {
633- ref_len_data. push ( RefLengthData {
634- name : ref_name. clone ( ) ,
635- length : current_sequence. len ( ) ,
636- } ) ;
623+ ref_len_map. insert ( ref_name. clone ( ) , current_sequence. len ( ) ) ;
637624 }
638- // Start a new sequence
639- ref_name = line[ 1 ..] . to_string ( ) ; // Remove '>'
625+ ref_name = line[ 1 ..] . to_string ( ) ;
640626 current_sequence. clear ( ) ;
641627 } else {
642- // Append to the current sequence
643628 current_sequence. push_str ( & line) ;
644629 }
645630 }
646631
647- // Save the last sequence's length
648632 if !ref_name. is_empty ( ) {
649- ref_len_data. push ( RefLengthData {
650- name : ref_name,
651- length : current_sequence. len ( ) ,
652- } ) ;
633+ // Insert the last reference name and its length into the HashMap
634+ ref_len_map. insert ( ref_name, current_sequence. len ( ) ) ;
653635 }
654636 }
655637 Err ( e) => println ! ( "Error reading file: {e}" ) ,
656638 }
657639 }
658640
659- Ok ( ref_len_data)
641+ Ok ( ref_len_map)
642+ }
643+
/// Derives segment metadata from a list of reference names.
///
/// Returns the tuple `(segments, segset, segcolor)`:
///
/// * `segments` - the input names, sorted and de-duplicated.
/// * `segset` - for every entry of `segments`, the second `_`-separated field
///   of the name (or the whole name when it contains no `_`), sorted and
///   de-duplicated.
/// * `segcolor` - maps each `segset` entry to a colour from a fixed
///   ten-colour palette, assigned in `segset` order. Entries beyond the tenth
///   receive no colour.
///
/// The ordering of `segset` is deterministic, so a given input always yields
/// the same colour assignment.
pub fn return_seg_data(
    reference_names: Vec<String>,
) -> (Vec<String>, Vec<String>, HashMap<String, &'static str>) {
    const COLOR_PALETTE: [&str; 10] = [
        "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f",
        "#bcbd22", "#17becf",
    ];

    let mut segments = reference_names;
    segments.sort_unstable();
    segments.dedup();

    let mut segset: Vec<String> = segments
        .iter()
        .map(|segment| {
            segment
                .split('_')
                .nth(1)
                .unwrap_or(segment.as_str())
                .to_string()
        })
        .collect();
    // Sort + dedup rather than a HashSet round trip: a HashSet iterates in
    // arbitrary order, which made the colour assignment differ between runs.
    segset.sort_unstable();
    segset.dedup();

    // `zip` stops at the shorter side, so only the first ten entries get a colour.
    let segcolor: HashMap<String, &'static str> = segset
        .iter()
        .cloned()
        .zip(COLOR_PALETTE.iter().copied())
        .collect();

    (segments, segset, segcolor)
}
661682
662683/////////////// Data reading functions for DAIS-ribosome ///////////////
0 commit comments