Skip to content

Commit 53ad54b

Browse files
committed
little restructure. get all csv files in order.
1 parent e52544d commit 53ad54b

File tree

10 files changed

+810
-552
lines changed

10 files changed

+810
-552
lines changed
Lines changed: 94 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -165,67 +165,28 @@ pub struct IndelsData {
165165
pub instrument: Option<String>,
166166
}
167167

168+
/// Run Info struct
169+
#[derive(Serialize, Deserialize, Debug, Clone)]
170+
pub struct RunInfo {
171+
#[serde(rename = "program_name")]
172+
pub program_name: Option<String>,
173+
#[serde(rename = "PROGRAM")]
174+
pub program: Option<String>,
175+
#[serde(rename = "Iterative Refinement Meta-Assembler (IRMA)")]
176+
pub irma: Option<String>,
177+
#[serde(rename = "Run_ID")]
178+
pub run_id: Option<String>,
179+
#[serde(rename = "Instrument")]
180+
pub instrument: Option<String>,
181+
}
182+
168183
#[derive(Debug)]
169184
pub struct SeqData {
170185
pub name: String,
171186
pub sequence: String,
172187
}
173188

174189
/////////////// Structs to hold dais-ribosome data ///////////////
175-
/// Insertion Data
176-
#[derive(Serialize, Deserialize, Debug)]
177-
pub struct InsertionData {
178-
#[serde(rename = "ID")]
179-
pub sample_id: Option<String>,
180-
#[serde(rename = "C_type")]
181-
pub ctype: Option<String>,
182-
#[serde(rename = "Ref_ID")]
183-
pub reference: String,
184-
#[serde(rename = "Protein")]
185-
pub protein: String,
186-
#[serde(rename = "Upstream_aa")]
187-
pub upstream_aa_position: String,
188-
#[serde(rename = "Inserted_nucleotides")]
189-
pub inserted_nucleotides: String,
190-
#[serde(rename = "Inserted_residues")]
191-
pub inserted_residues: String,
192-
#[serde(rename = "Upstream_nt")]
193-
pub upstream_nt: String,
194-
#[serde(rename = "Codon_shift")]
195-
pub in_frame: String,
196-
}
197-
198-
/// Deletions Data
199-
#[derive(Serialize, Deserialize, Debug)]
200-
pub struct DeletionsData {
201-
#[serde(rename = "ID")]
202-
pub sample_id: Option<String>,
203-
#[serde(rename = "C_type")]
204-
pub ctype: Option<String>,
205-
#[serde(rename = "Ref_ID")]
206-
pub reference: String,
207-
#[serde(rename = "Protein")]
208-
pub protein: String,
209-
#[serde(rename = "VH")]
210-
pub vh: Option<String>,
211-
#[serde(rename = "Del_AA_start")]
212-
pub del_start_aa_position: Option<String>,
213-
#[serde(rename = "Del_AA_end")]
214-
pub del_end_aa_position: Option<String>,
215-
#[serde(rename = "Del_AA_len")]
216-
pub del_aa_length: String,
217-
#[serde(rename = "In_frame")]
218-
pub in_frame: String,
219-
#[serde(rename = "CDS_ID")]
220-
pub cds_id: Option<String>,
221-
#[serde(rename = "Del_CDS_start")]
222-
pub del_start_cds_position: String,
223-
#[serde(rename = "Del_CDS_end")]
224-
pub del_end_cds_position: String,
225-
#[serde(rename = "Del_CDS_len")]
226-
pub del_cds_length: Option<String>,
227-
}
228-
229190
/// Dais Sequence Data
230191
#[derive(Serialize, Deserialize, Debug)]
231192
pub struct DaisSeqData {
@@ -374,6 +335,33 @@ where
374335
Ok(records)
375336
}
376337

338+
/// Read tab-delimited data without including the sample name
339+
fn process_txt_without_sample<R, T>(reader: R, has_headers: bool) -> std::vec::Vec<T>
340+
where
341+
R: Read,
342+
T: for<'de> Deserialize<'de>,
343+
{
344+
let mut rdr = ReaderBuilder::new()
345+
.has_headers(has_headers)
346+
.delimiter(b'\t')
347+
.from_reader(reader);
348+
349+
let mut records: Vec<T> = Vec::new();
350+
for result in rdr.deserialize() {
351+
match result {
352+
Ok(record) => {
353+
records.push(record);
354+
}
355+
Err(e) => {
356+
// Log a warning and skip the invalid record
357+
eprintln!("Warning: Failed to deserialize record: {e}");
358+
}
359+
}
360+
}
361+
362+
records
363+
}
364+
377365
/// Read in the coverage files made by IRMA and save to a vector of `CoverageData`
378366
pub fn coverage_data_collection(
379367
irma_path: impl AsRef<Path>,
@@ -452,6 +440,8 @@ pub fn reads_data_collection(
452440
/// Collect allele data created by IRMA and save to a vector of `AllelesData`
453441
pub fn allele_data_collection(
454442
irma_path: &Path,
443+
platform: &str,
444+
runid: &str,
455445
) -> Result<Vec<AllelesData>, Box<dyn std::error::Error>> {
456446
let pattern = format!(
457447
"{}/*/IRMA/*/tables/*variants.txt",
@@ -471,6 +461,11 @@ pub fn allele_data_collection(
471461
// Read the data from the file and include the sample name
472462
let mut records: Vec<AllelesData> = process_txt_with_sample(reader, true, &sample)?;
473463
records.retain(|record| record.minority_frequency >= 0.05);
464+
// Add platform and runid to each record
465+
for record in &mut records {
466+
record.instrument = Some(platform.to_string());
467+
record.run_id = Some(runid.to_string());
468+
}
474469
alleles_data.append(&mut records);
475470
}
476471
Err(e) => println!("Error reading file: {e}"),
@@ -483,6 +478,8 @@ pub fn allele_data_collection(
483478
/// Note that insertions and deletions are being added to the same Vec<IndelsData>
484479
pub fn indels_data_collection(
485480
irma_path: impl AsRef<Path>,
481+
platform: &str,
482+
runid: &str,
486483
) -> Result<Vec<IndelsData>, Box<dyn std::error::Error>> {
487484
let pattern1 = format!(
488485
"{}/*/IRMA/*/tables/*insertions.txt",
@@ -505,6 +502,11 @@ pub fn indels_data_collection(
505502

506503
// Read the data from the file and include the sample name
507504
let mut records: Vec<IndelsData> = process_txt_with_sample(reader, true, &sample)?;
505+
// Add platform and runid to each record
506+
for record in &mut records {
507+
record.instrument = Some(platform.to_string());
508+
record.run_id = Some(runid.to_string());
509+
}
508510
indels_data.append(&mut records);
509511
}
510512
Err(e) => println!("Error reading file: {e}"),
@@ -646,6 +648,44 @@ pub fn get_reference_lens(
646648
Ok(ref_len_map)
647649
}
648650

651+
/// Collect run info created by IRMA and save to a struct of `RunInfo`
652+
pub fn run_info_collection(
653+
irma_path: impl AsRef<Path>,
654+
platform: &str,
655+
runid: &str,
656+
) -> Result<Vec<RunInfo>, Box<dyn std::error::Error>> {
657+
let pattern = format!(
658+
"{}/*/IRMA/*/logs/run_info.txt",
659+
irma_path.as_ref().display()
660+
);
661+
662+
let mut run_info: Vec<RunInfo> = Vec::new();
663+
664+
// Start to iterate over all files matching the pattern
665+
for entry in glob(&pattern).expect("Failed to read glob pattern") {
666+
match entry {
667+
Ok(path) => {
668+
let file = File::open(&path)?;
669+
let reader = BufReader::new(file);
670+
671+
// Read the data from the file
672+
let mut records: Vec<RunInfo> = process_txt_without_sample(reader, true);
673+
for line in &mut records {
674+
line.run_id = Some(runid.to_string());
675+
line.instrument = Some(platform.to_string());
676+
}
677+
run_info.extend(records);
678+
679+
// Break after processing the first valid file
680+
break;
681+
}
682+
Err(e) => println!("Error reading file: {e}"),
683+
}
684+
}
685+
686+
Ok(run_info)
687+
}
688+
649689
/////////////// Data reading functions for DAIS-ribosome ///////////////
650690
/// Read tab-delimited data without including the sample name
651691
pub fn process_txt<R, T>(reader: R, has_headers: bool) -> Result<Vec<T>, Box<dyn std::error::Error>>
@@ -667,67 +707,6 @@ where
667707
Ok(records)
668708
}
669709

670-
/// Read in dais-ribosome ins file fto `InsertionData` struct
671-
pub fn dais_insertion_data_collection(
672-
dais_path: impl AsRef<Path>,
673-
) -> Result<Vec<InsertionData>, Box<dyn std::error::Error>> {
674-
// Construct the glob pattern for matching files
675-
//If using * situation, you will have to use glob
676-
let pattern = format!(
677-
"{}/aggregate_outputs/dais-ribosome/DAIS_ribosome.ins",
678-
dais_path.as_ref().display()
679-
);
680-
681-
let mut dais_ins_data: Vec<InsertionData> = Vec::new();
682-
683-
// Use the glob crate to find all matching files
684-
for entry in glob(&pattern)? {
685-
match entry {
686-
Ok(path) => {
687-
let file = File::open(&path)?;
688-
let reader = BufReader::new(file);
689-
let mut records: Vec<InsertionData> = process_txt(reader, false)?;
690-
dais_ins_data.append(&mut records);
691-
}
692-
Err(e) => {
693-
eprintln!("Error processing file: {e}");
694-
}
695-
}
696-
}
697-
698-
Ok(dais_ins_data)
699-
}
700-
701-
/// Read in dais-ribosome ins file fto `DeletionsData` struct
702-
pub fn dais_deletion_data_collection(
703-
dais_path: impl AsRef<Path>,
704-
) -> Result<Vec<DeletionsData>, Box<dyn std::error::Error>> {
705-
// Construct the glob pattern for matching files
706-
//If using * situation, you will have to use glob
707-
let pattern = format!(
708-
"{}/aggregate_outputs/dais-ribosome/DAIS_ribosome.del",
709-
dais_path.as_ref().display()
710-
);
711-
712-
let mut dais_del_data: Vec<DeletionsData> = Vec::new();
713-
714-
// Use the glob crate to find all matching files
715-
for entry in glob(&pattern)? {
716-
match entry {
717-
Ok(path) => {
718-
let file = File::open(&path)?;
719-
let reader = BufReader::new(file);
720-
let mut records: Vec<DeletionsData> = process_txt(reader, false)?;
721-
dais_del_data.append(&mut records);
722-
}
723-
Err(e) => {
724-
eprintln!("Error processing file: {e}");
725-
}
726-
}
727-
}
728-
Ok(dais_del_data)
729-
}
730-
731710
/// Read in dais-ribosome seq file to `DaisSeqData` struct
732711
pub fn dais_sequence_data_collection(
733712
dais_path: impl AsRef<Path>,

src/io/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
pub mod data_ingest;
2+
pub mod write_csv_files;
3+
pub mod write_fasta_files;

0 commit comments

Comments
 (0)