@@ -165,67 +165,28 @@ pub struct IndelsData {
165165 pub instrument : Option < String > ,
166166}
167167
168+ /// Run Info struct
169+ #[ derive( Serialize , Deserialize , Debug , Clone ) ]
170+ pub struct RunInfo {
171+ #[ serde( rename = "program_name" ) ]
172+ pub program_name : Option < String > ,
173+ #[ serde( rename = "PROGRAM" ) ]
174+ pub program : Option < String > ,
175+ #[ serde( rename = "Iterative Refinement Meta-Assembler (IRMA)" ) ]
176+ pub irma : Option < String > ,
177+ #[ serde( rename = "Run_ID" ) ]
178+ pub run_id : Option < String > ,
179+ #[ serde( rename = "Instrument" ) ]
180+ pub instrument : Option < String > ,
181+ }
182+
168183#[ derive( Debug ) ]
169184pub struct SeqData {
170185 pub name : String ,
171186 pub sequence : String ,
172187}
173188
174189/////////////// Structs to hold dais-ribosome data ///////////////
175- /// Insertion Data
176- #[ derive( Serialize , Deserialize , Debug ) ]
177- pub struct InsertionData {
178- #[ serde( rename = "ID" ) ]
179- pub sample_id : Option < String > ,
180- #[ serde( rename = "C_type" ) ]
181- pub ctype : Option < String > ,
182- #[ serde( rename = "Ref_ID" ) ]
183- pub reference : String ,
184- #[ serde( rename = "Protein" ) ]
185- pub protein : String ,
186- #[ serde( rename = "Upstream_aa" ) ]
187- pub upstream_aa_position : String ,
188- #[ serde( rename = "Inserted_nucleotides" ) ]
189- pub inserted_nucleotides : String ,
190- #[ serde( rename = "Inserted_residues" ) ]
191- pub inserted_residues : String ,
192- #[ serde( rename = "Upstream_nt" ) ]
193- pub upstream_nt : String ,
194- #[ serde( rename = "Codon_shift" ) ]
195- pub in_frame : String ,
196- }
197-
198- /// Deletions Data
199- #[ derive( Serialize , Deserialize , Debug ) ]
200- pub struct DeletionsData {
201- #[ serde( rename = "ID" ) ]
202- pub sample_id : Option < String > ,
203- #[ serde( rename = "C_type" ) ]
204- pub ctype : Option < String > ,
205- #[ serde( rename = "Ref_ID" ) ]
206- pub reference : String ,
207- #[ serde( rename = "Protein" ) ]
208- pub protein : String ,
209- #[ serde( rename = "VH" ) ]
210- pub vh : Option < String > ,
211- #[ serde( rename = "Del_AA_start" ) ]
212- pub del_start_aa_position : Option < String > ,
213- #[ serde( rename = "Del_AA_end" ) ]
214- pub del_end_aa_position : Option < String > ,
215- #[ serde( rename = "Del_AA_len" ) ]
216- pub del_aa_length : String ,
217- #[ serde( rename = "In_frame" ) ]
218- pub in_frame : String ,
219- #[ serde( rename = "CDS_ID" ) ]
220- pub cds_id : Option < String > ,
221- #[ serde( rename = "Del_CDS_start" ) ]
222- pub del_start_cds_position : String ,
223- #[ serde( rename = "Del_CDS_end" ) ]
224- pub del_end_cds_position : String ,
225- #[ serde( rename = "Del_CDS_len" ) ]
226- pub del_cds_length : Option < String > ,
227- }
228-
229190/// Dais Sequence Data
230191#[ derive( Serialize , Deserialize , Debug ) ]
231192pub struct DaisSeqData {
@@ -374,6 +335,33 @@ where
374335 Ok ( records)
375336}
376337
338+ /// Read tab-delimited data without including the sample name
339+ fn process_txt_without_sample < R , T > ( reader : R , has_headers : bool ) -> std:: vec:: Vec < T >
340+ where
341+ R : Read ,
342+ T : for < ' de > Deserialize < ' de > ,
343+ {
344+ let mut rdr = ReaderBuilder :: new ( )
345+ . has_headers ( has_headers)
346+ . delimiter ( b'\t' )
347+ . from_reader ( reader) ;
348+
349+ let mut records: Vec < T > = Vec :: new ( ) ;
350+ for result in rdr. deserialize ( ) {
351+ match result {
352+ Ok ( record) => {
353+ records. push ( record) ;
354+ }
355+ Err ( e) => {
356+ // Log a warning and skip the invalid record
357+ eprintln ! ( "Warning: Failed to deserialize record: {e}" ) ;
358+ }
359+ }
360+ }
361+
362+ records
363+ }
364+
377365/// Read in the coverage files made by IRMA and save to a vector of `CoverageData`
378366pub fn coverage_data_collection (
379367 irma_path : impl AsRef < Path > ,
@@ -452,6 +440,8 @@ pub fn reads_data_collection(
452440/// Collecting allele data created by IRMA and save to a vector of `AllelesData`
453441pub fn allele_data_collection (
454442 irma_path : & Path ,
443+ platform : & str ,
444+ runid : & str ,
455445) -> Result < Vec < AllelesData > , Box < dyn std:: error:: Error > > {
456446 let pattern = format ! (
457447 "{}/*/IRMA/*/tables/*variants.txt" ,
@@ -471,6 +461,11 @@ pub fn allele_data_collection(
471461 // Read the data from the file and include the sample name
472462 let mut records: Vec < AllelesData > = process_txt_with_sample ( reader, true , & sample) ?;
473463 records. retain ( |record| record. minority_frequency >= 0.05 ) ;
464+ // Add platform and runid to each record
465+ for record in & mut records {
466+ record. instrument = Some ( platform. to_string ( ) ) ;
467+ record. run_id = Some ( runid. to_string ( ) ) ;
468+ }
474469 alleles_data. append ( & mut records) ;
475470 }
476471 Err ( e) => println ! ( "Error reading file: {e}" ) ,
@@ -483,6 +478,8 @@ pub fn allele_data_collection(
483478/// Note that insertions and deletions are being added to the same Vec<IndelsData>
484479pub fn indels_data_collection (
485480 irma_path : impl AsRef < Path > ,
481+ platform : & str ,
482+ runid : & str ,
486483) -> Result < Vec < IndelsData > , Box < dyn std:: error:: Error > > {
487484 let pattern1 = format ! (
488485 "{}/*/IRMA/*/tables/*insertions.txt" ,
@@ -505,6 +502,11 @@ pub fn indels_data_collection(
505502
506503 // Read the data from the file and include the sample name
507504 let mut records: Vec < IndelsData > = process_txt_with_sample ( reader, true , & sample) ?;
505+ // Add platform and runid to each record
506+ for record in & mut records {
507+ record. instrument = Some ( platform. to_string ( ) ) ;
508+ record. run_id = Some ( runid. to_string ( ) ) ;
509+ }
508510 indels_data. append ( & mut records) ;
509511 }
510512 Err ( e) => println ! ( "Error reading file: {e}" ) ,
@@ -646,6 +648,44 @@ pub fn get_reference_lens(
646648 Ok ( ref_len_map)
647649}
648650
651+ /// Collect run info created by IRMA and save to a vector of `RunInfo`
652+ pub fn run_info_collection (
653+ irma_path : impl AsRef < Path > ,
654+ platform : & str ,
655+ runid : & str ,
656+ ) -> Result < Vec < RunInfo > , Box < dyn std:: error:: Error > > {
657+ let pattern = format ! (
658+ "{}/*/IRMA/*/logs/run_info.txt" ,
659+ irma_path. as_ref( ) . display( )
660+ ) ;
661+
662+ let mut run_info: Vec < RunInfo > = Vec :: new ( ) ;
663+
664+ // Start to iterate over all files matching the pattern
665+ for entry in glob ( & pattern) . expect ( "Failed to read glob pattern" ) {
666+ match entry {
667+ Ok ( path) => {
668+ let file = File :: open ( & path) ?;
669+ let reader = BufReader :: new ( file) ;
670+
671+ // Read the data from the file
672+ let mut records: Vec < RunInfo > = process_txt_without_sample ( reader, true ) ;
673+ for line in & mut records {
674+ line. run_id = Some ( runid. to_string ( ) ) ;
675+ line. instrument = Some ( platform. to_string ( ) ) ;
676+ }
677+ run_info. extend ( records) ;
678+
679+ // Break after processing the first valid file
680+ break ;
681+ }
682+ Err ( e) => println ! ( "Error reading file: {e}" ) ,
683+ }
684+ }
685+
686+ Ok ( run_info)
687+ }
688+
649689/////////////// Data reading functions for DAIS-ribosome ///////////////
650690/// Read tab-delimited data without including the sample name
651691pub fn process_txt < R , T > ( reader : R , has_headers : bool ) -> Result < Vec < T > , Box < dyn std:: error:: Error > >
@@ -667,67 +707,6 @@ where
667707 Ok ( records)
668708}
669709
670- /// Read in dais-ribosome ins file fto `InsertionData` struct
671- pub fn dais_insertion_data_collection (
672- dais_path : impl AsRef < Path > ,
673- ) -> Result < Vec < InsertionData > , Box < dyn std:: error:: Error > > {
674- // Construct the glob pattern for matching files
675- //If using * situation, you will have to use glob
676- let pattern = format ! (
677- "{}/aggregate_outputs/dais-ribosome/DAIS_ribosome.ins" ,
678- dais_path. as_ref( ) . display( )
679- ) ;
680-
681- let mut dais_ins_data: Vec < InsertionData > = Vec :: new ( ) ;
682-
683- // Use the glob crate to find all matching files
684- for entry in glob ( & pattern) ? {
685- match entry {
686- Ok ( path) => {
687- let file = File :: open ( & path) ?;
688- let reader = BufReader :: new ( file) ;
689- let mut records: Vec < InsertionData > = process_txt ( reader, false ) ?;
690- dais_ins_data. append ( & mut records) ;
691- }
692- Err ( e) => {
693- eprintln ! ( "Error processing file: {e}" ) ;
694- }
695- }
696- }
697-
698- Ok ( dais_ins_data)
699- }
700-
701- /// Read in dais-ribosome ins file fto `DeletionsData` struct
702- pub fn dais_deletion_data_collection (
703- dais_path : impl AsRef < Path > ,
704- ) -> Result < Vec < DeletionsData > , Box < dyn std:: error:: Error > > {
705- // Construct the glob pattern for matching files
706- //If using * situation, you will have to use glob
707- let pattern = format ! (
708- "{}/aggregate_outputs/dais-ribosome/DAIS_ribosome.del" ,
709- dais_path. as_ref( ) . display( )
710- ) ;
711-
712- let mut dais_del_data: Vec < DeletionsData > = Vec :: new ( ) ;
713-
714- // Use the glob crate to find all matching files
715- for entry in glob ( & pattern) ? {
716- match entry {
717- Ok ( path) => {
718- let file = File :: open ( & path) ?;
719- let reader = BufReader :: new ( file) ;
720- let mut records: Vec < DeletionsData > = process_txt ( reader, false ) ?;
721- dais_del_data. append ( & mut records) ;
722- }
723- Err ( e) => {
724- eprintln ! ( "Error processing file: {e}" ) ;
725- }
726- }
727- }
728- Ok ( dais_del_data)
729- }
730-
731710/// Read in dais-ribosome ins file into `DaisSeqData` struct
732711pub fn dais_sequence_data_collection (
733712 dais_path : impl AsRef < Path > ,
0 commit comments