Skip to content

Commit e5176f8

Browse files
committed
add in sc2 and rsv handling dais data processing
1 parent ddfc6f7 commit e5176f8

File tree

2 files changed

+141
-17
lines changed

2 files changed

+141
-17
lines changed

src/processes/prepare_mira_reports.rs

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,9 @@ fn read_yaml<R: std::io::Read>(reader: R) -> Result<QCConfig, Box<dyn std::error
112112
}
113113

114114
pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Error>> {
115-
/////////////// Read in all data ///////////////
115+
///////////////////////////////////////////////////////////////////////////
116+
/////////////// Read in and process data from IRMA and Dais ///////////////
117+
///////////////////////////////////////////////////////////////////////////
116118
// Read in samplesheet
117119
let samplesheet_path = create_reader(args.samplesheet)?;
118120
let samplesheet = if &args.platform == "illumina" {
@@ -134,13 +136,12 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
134136
let qc_yaml_path = create_reader(args.qc_yaml)?;
135137
let qc_config: QCConfig = read_yaml(qc_yaml_path)?;
136138

137-
//Read in IRMA data
139+
// Read in IRMA data
138140
let coverage_data = coverage_data_collection(&args.irma_path, &args.platform, &args.runid)?;
139141
let read_data = reads_data_collection(&args.irma_path, &args.platform, &args.runid)?;
140142
let vtype_data = create_vtype_data(&read_data);
141143
let allele_data = allele_data_collection(&args.irma_path)?;
142144
let indel_data = indels_data_collection(&args.irma_path)?;
143-
//TODO: feed in organism from argument
144145
//let seq_data = amended_consensus_data_collection(&args.irma_path, virus);
145146
let ref_lengths = match get_reference_lens(&args.irma_path) {
146147
Ok(data) => data,
@@ -158,10 +159,13 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
158159
let dais_ins_data = dais_insertion_data_collection(&args.irma_path)?;
159160
let dais_del_data = dias_deletion_data_collection(&args.irma_path)?;
160161
let dais_seq_data = dias_sequence_data_collection(&args.irma_path)?;
161-
//TODO: sc2 handling
162-
let dais_ref_data = dais_ref_seq_data_collection(&args.workdir_path, &args.virus)?;
163-
164-
//TODO: remove these at end
162+
let mut dais_ref_data: Vec<DaisSeqData> = Vec::new();
163+
if args.virus.to_lowercase() == "flu" || args.virus.to_lowercase() == "rsv" {
164+
dais_ref_data = dais_ref_seq_data_collection(&args.workdir_path, &args.virus)?;
165+
} else if args.virus.to_lowercase() == "sc2-wgs" {
166+
dais_ref_data = dais_ref_seq_data_collection(&args.workdir_path, "sc2")?;
167+
}
168+
//TODO: remove print statements at end
165169
//println!("{qc_config:?}")
166170
//println!("cov data: {coverage_data:?}");
167171
//println!("Allele data: {allele_data:?}");
@@ -171,22 +175,33 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
171175
//println!("dais del data: {dais_del_data:#?}");
172176
//println!("dais seq data: {dais_seq_data:#?}");
173177
//println!("dais ref data: {dais_ref_data:#?}");
174-
println!("ref length data: {ref_lengths:#?}");
178+
//println!("ref length data: {ref_lengths:#?}");
175179

176180
//Calculate AA variants for aavars.csv and dais_vars.json
177-
//TODO: circle back for the rsv and sc2 situations
178-
let dais_vars_data = compute_dais_variants(&dais_ref_data, &dais_seq_data)?;
181+
let mut dais_vars_data: Vec<DaisVarsData> = Vec::new();
182+
183+
if args.virus.to_lowercase() == "flu" {
184+
dais_vars_data = compute_dais_variants(&dais_ref_data, &dais_seq_data)?;
185+
} else if args.virus.to_lowercase() == "sc2-wgs"
186+
|| args.virus.to_lowercase() == "sc2"
187+
|| args.virus.to_lowercase() == "rsv"
188+
{
189+
println!("sc2");
190+
dais_vars_data = compute_cvv_dais_variants(&dais_ref_data, &dais_seq_data)?;
191+
}
192+
179193
negative_qc_statement(
180194
"/home/xpa3/mira-oxide/test/qc_statement.json",
181195
&read_data,
182196
&neg_control_list,
183197
)?;
184198

199+
// Calculating the % coverage and median coverage for summary
185200
let melted_reads_df = melt_reads_data(&read_data);
186201
let mut calculated_cov_df: Vec<ProcessedCoverage> = Vec::new();
187202
let mut calculated_position_cov_df: Vec<ProcessedCoverage> = Vec::new();
188-
// TODO: add in SC2 handling
189-
if args.virus.to_lowercase() == "flu" {
203+
204+
if args.virus.to_lowercase() == "flu" || args.virus.to_lowercase() == "rsv" {
190205
calculated_cov_df = process_wgs_coverage_data(&coverage_data, &ref_lengths);
191206
} else if args.virus.to_lowercase() == "sc2-spike" {
192207
calculated_cov_df =
@@ -199,10 +214,13 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
199214

200215
//println!("{dais_vars_data:?}");
201216
//println!("{melted_reads_df:?}");
202-
println!("{calculated_cov_df:?}");
203-
println!("{calculated_position_cov_df:?}");
204-
/*
217+
//println!("{calculated_cov_df:?}");
218+
//println!("{calculated_position_cov_df:?}");
219+
220+
/////////////////////////////////////////////////////////////////////////////
205221
/////////////// Write the structs to JSON files and CSV files ///////////////
222+
/////////////////////////////////////////////////////////////////////////////
223+
206224
// Writing out coverage data
207225
let coverage_struct_values = vec![
208226
"Sample",
@@ -384,7 +402,7 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
384402
&dais_vars_data,
385403
&aavars_columns,
386404
&aavars_columns,
387-
)?; */
405+
)?;
388406

389407
/////////////// Write the structs to parquet files if flag invoked ///////////////
390408
// Write fields to parq if flag given

src/utils/data_processing.rs

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ pub fn return_seg_data(
215215
(segments, segset, segcolor)
216216
}
217217

218+
//////////////// Functions used to process the variants found in dais outputs ///////////////
218219
// Function to calculate the aa variants - this is specifically for flu right now
219220
pub fn compute_dais_variants(
220221
ref_seqs_data: &Vec<DaisSeqData>,
@@ -279,7 +280,112 @@ pub fn compute_dais_variants(
279280
Ok(unique_entries)
280281
}
281282

282-
//////////////// Functions using to create irma_summary ///////////////
283+
/// Compute CVV DAIS Variants
284+
pub fn compute_cvv_dais_variants(
285+
ref_seqs_data: &Vec<DaisSeqData>,
286+
sample_seqs_data: &Vec<DaisSeqData>,
287+
) -> Result<Vec<DaisVarsData>, Box<dyn Error>> {
288+
let mut merged_data = merge_sequences(ref_seqs_data, sample_seqs_data)?;
289+
290+
// Compute AA Variants
291+
for entry in &mut merged_data {
292+
entry.insertion = compute_aa_variants(&entry.aa_aln, &entry.aa_seq);
293+
}
294+
295+
for entry in &mut merged_data {
296+
entry.insertions_shift_frame = if entry.insertion.is_empty() {
297+
"0".to_string()
298+
} else {
299+
entry.insertion.split(',').count().to_string()
300+
};
301+
}
302+
303+
// Filter and sort the data - keep first
304+
merged_data.sort_by(|a, b| {
305+
a.protein
306+
.cmp(&b.protein)
307+
.then(a.sample_id.cmp(&b.sample_id))
308+
.then(a.insertions_shift_frame.cmp(&b.insertions_shift_frame))
309+
});
310+
311+
let mut unique_data = HashMap::new();
312+
for entry in merged_data {
313+
let key = (entry.sample_id.clone(), entry.protein.clone());
314+
unique_data.entry(key).or_insert(entry);
315+
}
316+
317+
// Convert DaisSeqData to DaisVarsData and collect into a Vec
318+
let result: Vec<DaisVarsData> = unique_data
319+
.into_values()
320+
.map(|entry| DaisVarsData {
321+
sample_id: entry.sample_id,
322+
reference_id: entry.reference.clone(),
323+
protein: entry.protein.clone(),
324+
aa_variant_count: entry.insertions_shift_frame.parse::<i32>().unwrap_or(0),
325+
aa_variants: entry.insertion.clone(),
326+
})
327+
.collect();
328+
329+
Ok(result)
330+
}
331+
332+
/// Merge sequences based on Coordspace and Protein - used by compute_cvv_dais_variants fn
333+
fn merge_sequences(
334+
ref_seqs_data: &Vec<DaisSeqData>,
335+
sample_seqs_data: &Vec<DaisSeqData>,
336+
) -> Result<Vec<DaisSeqData>, Box<dyn Error>> {
337+
let mut merged_data = Vec::new();
338+
339+
for sample_entry in sample_seqs_data {
340+
for ref_entry in ref_seqs_data {
341+
if sample_entry.reference == ref_entry.reference
342+
&& sample_entry.protein == ref_entry.protein
343+
{
344+
// Create a new owned DaisSeqData instance - it was this or lifetimes...
345+
let merged_entry = DaisSeqData {
346+
sample_id: sample_entry.sample_id.clone(),
347+
ctype: sample_entry.ctype.clone(),
348+
reference: sample_entry.reference.clone(),
349+
protein: sample_entry.protein.clone(),
350+
vh: sample_entry.vh.clone(),
351+
aa_seq: ref_entry.aa_seq.clone(), // Use ref_entry's AA sequence
352+
aa_aln: sample_entry.aa_aln.clone(),
353+
cds_id: sample_entry.cds_id.clone(),
354+
insertion: sample_entry.insertion.clone(),
355+
insertions_shift_frame: sample_entry.insertions_shift_frame.clone(),
356+
cds_sequence: sample_entry.cds_sequence.clone(),
357+
aligned_cds_sequence: sample_entry.aligned_cds_sequence.clone(),
358+
reference_nt_positions: sample_entry.reference_nt_positions.clone(),
359+
sample_nt_positions: sample_entry.sample_nt_positions.clone(),
360+
};
361+
362+
merged_data.push(merged_entry); // Push the owned value
363+
}
364+
}
365+
}
366+
367+
Ok(merged_data)
368+
}
369+
370+
/// Compute AA variants - used by compute_cvv_dais_variants fn
371+
fn compute_aa_variants(aligned_aa_sequence: &str, ref_aa_sequence: &str) -> String {
372+
let mut aa_vars = String::new();
373+
374+
for (index, (sample_aa, ref_aa)) in aligned_aa_sequence
375+
.chars()
376+
.zip(ref_aa_sequence.chars())
377+
.enumerate()
378+
{
379+
if sample_aa != ref_aa {
380+
let pos = index + 1;
381+
let variant = format!("{ref_aa}{pos}{sample_aa}");
382+
append_with_comma(&mut aa_vars, &variant);
383+
}
384+
}
385+
386+
aa_vars
387+
}
388+
//////////////// Functions used to create irma_summary ///////////////
283389
/// Flip orientation of the reads structs
284390
pub fn melt_reads_data(records: &Vec<ReadsData>) -> Vec<MeltedRecord> {
285391
let mut result = Vec::new();

0 commit comments

Comments
 (0)