Skip to content

Commit 2997e95

Browse files
committed
aavar computing for flu. make dais_vars.json and aavars.csv. started data_processing.rs.
1 parent 1319d89 commit 2997e95

File tree

5 files changed

+128
-9
lines changed

5 files changed

+128
-9
lines changed

src/processes/prepare_mira_reports.rs

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#![allow(dead_code, unused_imports)]
2-
use crate::utils::{data_ingest::*, writing_outputs::*};
2+
use crate::utils::{data_ingest::*, data_processing::*, writing_outputs::*};
33
use clap::Parser;
44
use csv::ReaderBuilder;
55
use either::Either;
@@ -131,10 +131,10 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
131131
}));
132132

133133
//Read in DAIS-ribosome data
134-
let dais_ins_data = dais_insertion_data_collection(&args.irma_path);
135-
let dais_del_data = dias_deletion_data_collection(&args.irma_path);
136-
let dais_seq_data = dias_sequence_data_collection(&args.irma_path);
137-
let dais_ref_data = dais_ref_seq_data_collection(&args.workdir_path, "flu");
134+
let dais_ins_data = dais_insertion_data_collection(&args.irma_path)?;
135+
let dais_del_data = dias_deletion_data_collection(&args.irma_path)?;
136+
let dais_seq_data = dias_sequence_data_collection(&args.irma_path)?;
137+
let dais_ref_data = dais_ref_seq_data_collection(&args.workdir_path, "flu")?;
138138

139139
//TODO: remove these at end
140140
//println!("{samplesheet:?}");
@@ -149,6 +149,12 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
149149
//println!("dais ref data: {dais_ref_data:#?}");
150150
//println!("ref length data: {ref_lengths:#?}");
151151

152+
//Calculate AA variants for aavars.csv and dais_vars.json
153+
//TODO: circle back for the rsv and sc2 situations
154+
let dais_vars_data = compute_dais_variants(&dais_ref_data, &dais_seq_data)?;
155+
156+
println!("{dais_vars_data:?}");
157+
152158
/////////////// Write the structs to JSON files and CSV files ///////////////
153159
// Writing out coverage data
154160
let coverage_struct_values = vec![
@@ -227,7 +233,7 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
227233
&vtype_columns,
228234
)?;
229235

230-
// Writing out allele csv and josn file
236+
// Writing out allele csv and json file
231237
let allele_struct_values = vec![
232238
"Sample",
233239
"Upstream_Position",
@@ -301,6 +307,7 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
301307
&indels_struct_values,
302308
)?;
303309

310+
// Write out ref_data.json
304311
write_ref_data_json(
305312
"/home/xpa3/mira-oxide/test/ref_data.json",
306313
&ref_lengths,
@@ -309,6 +316,29 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
309316
&segcolor,
310317
)?;
311318

319+
// write out the dais_vars.json and the {runid}_aavars.csv
320+
let aavars_columns = vec![
321+
"sample_id",
322+
"reference_id",
323+
"protein",
324+
"aa_variant_count",
325+
"aa_variants",
326+
];
327+
write_structs_to_split_json_file(
328+
"/home/xpa3/mira-oxide/test/dais_vars.json",
329+
&dais_vars_data,
330+
&aavars_columns,
331+
&aavars_columns,
332+
)?;
333+
334+
write_structs_to_csv_file(
335+
&format!("/home/xpa3/mira-oxide/test/{}_aavars.csv", &args.runid),
336+
&dais_vars_data,
337+
&aavars_columns,
338+
&aavars_columns,
339+
)?;
340+
341+
/////////////// Write the structs to parquet files if flag invoked ///////////////
312342
// Write fields to parq if flag given
313343
write_reads_to_parquet(&read_data, "/home/xpa3/mira-oxide/test/read_data.parquet")?;
314344

src/utils/data_ingest.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ pub struct DeletionsData {
197197
pub del_cds_length: Option<String>,
198198
}
199199

200-
/// Sequence Data
200+
/// Dais Sequence Data
201201
#[derive(Serialize, Deserialize, Debug)]
202202
pub struct DaisSeqData {
203203
#[serde(rename = "ID")]

src/utils/data_processing.rs

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
use crate::utils::data_ingest;
2+
use either::Either;
3+
use glob::glob;
4+
use serde::{self, Deserialize, Serialize, de::DeserializeOwned};
5+
use std::{collections::HashSet, error::Error};
6+
7+
use super::data_ingest::DaisSeqData;
8+
9+
/// Dais Variants
10+
#[derive(Serialize, Deserialize, Debug)]
11+
pub struct DaisVarsData {
12+
pub sample_id: Option<String>,
13+
pub reference_id: String,
14+
pub protein: String,
15+
pub aa_variant_count: i32,
16+
pub aa_variants: String,
17+
}
18+
19+
// Function to append a new string with a comma
20+
pub fn append_with_comma(base: &mut String, new_entry: &str) {
21+
if !base.is_empty() {
22+
base.push(',');
23+
base.push_str(new_entry);
24+
} else {
25+
base.push_str(new_entry);
26+
}
27+
}
28+
29+
pub fn compute_dais_variants(
30+
ref_seqs_data: &Vec<DaisSeqData>,
31+
sample_seqs_data: &Vec<DaisSeqData>,
32+
) -> Result<Vec<DaisVarsData>, Box<dyn Error>> {
33+
let mut dais_vars_data: Vec<DaisVarsData> = Vec::new();
34+
35+
// Compute aa variants
36+
for sample_entry in sample_seqs_data {
37+
for ref_entry in ref_seqs_data {
38+
if sample_entry.reference == ref_entry.reference
39+
&& sample_entry.protein == ref_entry.protein
40+
{
41+
let sample_aa_seq = sample_entry.aa_aln.clone();
42+
let ref_aa_seq = ref_entry.aa_aln.clone();
43+
let mut var_aa_count = 0;
44+
45+
// Use .chars() to iterate over aa
46+
let mut aa_vars = String::new();
47+
for (index, (sample_aa, ref_aa)) in
48+
sample_aa_seq.chars().zip(ref_aa_seq.chars()).enumerate()
49+
{
50+
if sample_aa != ref_aa {
51+
let pos = index + 1;
52+
var_aa_count += 1;
53+
let variant = format!("{ref_aa}{pos}{sample_aa}");
54+
append_with_comma(&mut aa_vars, &variant);
55+
}
56+
}
57+
58+
let dais_vars_entry = DaisVarsData {
59+
sample_id: sample_entry.sample_id.clone(),
60+
reference_id: sample_entry.reference.clone(),
61+
protein: sample_entry.protein.clone(),
62+
aa_variant_count: var_aa_count,
63+
aa_variants: aa_vars,
64+
};
65+
dais_vars_data.push(dais_vars_entry);
66+
}
67+
}
68+
}
69+
70+
// Sort by protein, sample_id, and aa_variant_count
71+
dais_vars_data.sort_by(|a, b| {
72+
a.protein
73+
.cmp(&b.protein)
74+
.then(a.sample_id.cmp(&b.sample_id))
75+
.then(a.aa_variant_count.cmp(&b.aa_variant_count))
76+
});
77+
78+
// Remove duplicates based on sample_id and protein, keeping the first entry
79+
let mut unique_entries = Vec::new();
80+
let mut seen = HashSet::new();
81+
82+
for entry in dais_vars_data {
83+
let key = (entry.sample_id.clone(), entry.protein.clone());
84+
if seen.insert(key) {
85+
unique_entries.push(entry);
86+
}
87+
}
88+
89+
Ok(unique_entries)
90+
}

src/utils/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub mod data_ingest;
2+
pub mod data_processing;
23
pub mod writing_outputs;

src/utils/writing_outputs.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ use std::io::Write;
1111
use std::sync::Arc;
1212
use std::{error::Error, fs::File};
1313

14-
use super::data_ingest::CoverageData;
15-
1614
/////////////// Functions to write to json and csv files ///////////////
1715
/// Function to serialize a vector of structs into split-oriented JSON with precision and indexing
1816
pub fn write_structs_to_split_json_file<T: Serialize>(

0 commit comments

Comments
 (0)