Skip to content

Commit f4d5aa8

Browse files
committed
write out ref_data.json with its unique pattern
1 parent 7b6cf85 commit f4d5aa8

File tree

3 files changed

+96
-31
lines changed

3 files changed

+96
-31
lines changed

src/processes/prepare_mira_reports.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ use crate::utils::{data_ingest::*, writing_outputs::*};
33
use clap::Parser;
44
use csv::ReaderBuilder;
55
use either::Either;
6-
use serde::{self, Deserialize, de::DeserializeOwned};
6+
use serde::{self, Deserialize, Serialize, de::DeserializeOwned};
7+
use serde_json::json;
78
use std::sync::Arc;
89
use std::{
910
error::Error,
@@ -115,9 +116,19 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
115116
let vtype_data = create_vtype_data(&read_data);
116117
let allele_data = allele_data_collection(&args.irma_path)?;
117118
let indel_data = indels_data_collection(&args.irma_path)?;
118-
let ref_lengths = get_reference_lens(&args.irma_path);
119119
//TODO: feed in organism from argument
120120
let seq_data = amended_consensus_data_collection(&args.irma_path, "flu");
121+
let ref_lengths = match get_reference_lens(&args.irma_path) {
122+
Ok(data) => data,
123+
Err(e) => {
124+
eprintln!("Error getting reference lengths: {e}");
125+
return Err(e);
126+
}
127+
};
128+
let (segments, segset, segcolor) =
129+
return_seg_data(extract_field(coverage_data.clone(), |item| {
130+
item.reference_name.clone()
131+
}));
121132

122133
//Read in DAIS-ribosome data
123134
let dais_ins_data = dias_insertion_data_collection(&args.irma_path);
@@ -136,7 +147,7 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
136147
//println!("dais del data: {dais_del_data:#?}");
137148
//println!("dais seq data: {dais_seq_data:#?}");
138149
//println!("dais ref data: {dais_ref_data:#?}");
139-
println!("ref length data: {ref_lengths:#?}");
150+
//println!("ref length data: {ref_lengths:#?}");
140151

141152
/////////////// Write the structs to JSON files and CSV files ///////////////
142153
// Writing out coverage data
@@ -290,6 +301,14 @@ pub fn prepare_mira_reports_process(args: ReportsArgs) -> Result<(), Box<dyn Err
290301
&indels_struct_values,
291302
)?;
292303

304+
write_ref_data_json(
305+
"/home/xpa3/mira-oxide/test/ref_data.json",
306+
&ref_lengths,
307+
&segments,
308+
&segset,
309+
&segcolor,
310+
)?;
311+
293312
// Write fields to parq if flag given
294313
write_reads_to_parquet(&read_data, "/home/xpa3/mira-oxide/test/read_data.parquet")?;
295314

src/utils/data_ingest.rs

Lines changed: 49 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use either::Either;
33
use glob::glob;
44
use serde::{self, Deserialize, Serialize, de::DeserializeOwned};
55
use std::{
6-
collections::HashMap,
6+
collections::{HashMap, HashSet},
77
error::Error,
88
fs::{self, File, OpenOptions},
99
io::{self, BufRead, BufReader, Read, Stdin, stdin},
@@ -142,12 +142,6 @@ pub struct SeqData {
142142
sequence: String,
143143
}
144144

145-
#[derive(Debug)]
146-
pub struct RefLengthData {
147-
name: String,
148-
length: usize,
149-
}
150-
151145
/////////////// Structs to hold dais-ribosome data ///////////////
152146
/// Insertion Data
153147
#[derive(Serialize, Deserialize, Debug)]
@@ -602,61 +596,88 @@ pub fn create_vtype_data(reads_data: &Vec<ReadsData>) -> Vec<ProcessedRecord> {
602596
processed_records
603597
}
604598

605-
/// Collect the reference lengths from the IRMA outputs
599+
// Function to collect reference lengths from IRMA outputs
606600
pub fn get_reference_lens(
607601
irma_path: &PathBuf,
608-
) -> Result<Vec<RefLengthData>, Box<dyn std::error::Error>> {
609-
// Determine the glob pattern based on the organism
602+
) -> Result<HashMap<String, usize>, Box<dyn std::error::Error>> {
610603
let pattern = format!(
611604
"{}/*/IRMA/*/intermediate/0-ITERATIVE-REFERENCES/R0*ref",
612605
irma_path.to_string_lossy()
613606
);
614607

615-
let mut ref_len_data: Vec<RefLengthData> = Vec::new();
608+
let mut ref_len_map: HashMap<String, usize> = HashMap::new();
616609

617-
// Iterate over all files matching the pattern
618-
for entry in glob(&pattern).expect("Failed to read glob pattern") {
610+
for entry in glob(&pattern)? {
619611
match entry {
620612
Ok(path) => {
621613
let file = File::open(&path)?;
622614
let reader = BufReader::new(file);
623615

624-
// Parse the file line by line (assuming FASTA format)
625616
let mut ref_name = String::new();
626617
let mut current_sequence = String::new();
627618

628619
for line in reader.lines() {
629620
let line = line?;
630621
if line.starts_with('>') {
631-
// If there's an existing sequence, save its length
632622
if !ref_name.is_empty() {
633-
ref_len_data.push(RefLengthData {
634-
name: ref_name.clone(),
635-
length: current_sequence.len(),
636-
});
623+
ref_len_map.insert(ref_name.clone(), current_sequence.len());
637624
}
638-
// Start a new sequence
639-
ref_name = line[1..].to_string(); // Remove '>'
625+
ref_name = line[1..].to_string();
640626
current_sequence.clear();
641627
} else {
642-
// Append to the current sequence
643628
current_sequence.push_str(&line);
644629
}
645630
}
646631

647-
// Save the last sequence's length
648632
if !ref_name.is_empty() {
649-
ref_len_data.push(RefLengthData {
650-
name: ref_name,
651-
length: current_sequence.len(),
652-
});
633+
// Insert the last reference name and its length into the HashMap
634+
ref_len_map.insert(ref_name, current_sequence.len());
653635
}
654636
}
655637
Err(e) => println!("Error reading file: {e}"),
656638
}
657639
}
658640

659-
Ok(ref_len_data)
641+
Ok(ref_len_map)
642+
}
643+
644+
// Function to process reference names and generate segments, segset, and segcolor
645+
pub fn return_seg_data(
646+
reference_names: Vec<String>,
647+
) -> (Vec<String>, Vec<String>, HashMap<String, &'static str>) {
648+
let mut segments: Vec<String> = reference_names.into_iter().collect();
649+
segments.sort();
650+
segments.dedup();
651+
652+
let color_palette = vec![
653+
"#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f",
654+
"#bcbd22", "#17becf",
655+
];
656+
657+
let mut segset: Vec<String> = Vec::new();
658+
for segment in &segments {
659+
let parts: Vec<&str> = segment.split('_').collect();
660+
if parts.len() > 1 {
661+
segset.push(parts[1].to_string());
662+
} else {
663+
segset.push(segment.clone());
664+
}
665+
}
666+
667+
let segset: Vec<String> = segset
668+
.into_iter()
669+
.collect::<std::collections::HashSet<_>>()
670+
.into_iter()
671+
.collect();
672+
673+
let mut segcolor: HashMap<String, &str> = HashMap::new();
674+
for (i, seg) in segset.iter().enumerate() {
675+
if i < color_palette.len() {
676+
segcolor.insert(seg.clone(), color_palette[i]);
677+
}
678+
}
679+
680+
(segments, segset, segcolor)
660681
}
661682

662683
/////////////// Data reading functions for DAIS-ribosome ///////////////

src/utils/writing_outputs.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ use csv::Writer;
66
use parquet::arrow::ArrowWriter;
77
use serde::Serialize;
88
use serde_json::{Value, json};
9+
use std::collections::HashMap;
10+
use std::io::Write;
911
use std::sync::Arc;
1012
use std::{error::Error, fs::File};
1113

@@ -81,6 +83,29 @@ pub fn write_structs_to_csv_file<T: Serialize>(
8183
Ok(())
8284
}
8385

86+
/// make ref_data.json - has unique set up
87+
pub fn write_ref_data_json(
88+
file_path: &str,
89+
ref_lens: &HashMap<String, usize>,
90+
segments: &Vec<String>,
91+
segset: &Vec<String>,
92+
segcolor: &HashMap<String, &str>,
93+
) -> Result<(), Box<dyn Error>> {
94+
let json_data = json!({
95+
"ref_lens": ref_lens,
96+
"segments": segments,
97+
"segset": segset,
98+
"segcolor": segcolor,
99+
});
100+
101+
// Write JSON to a file
102+
let mut file = File::create(file_path)?;
103+
file.write_all(serde_json::to_string_pretty(&json_data)?.as_bytes())?;
104+
105+
println!("Data written to ref_data.json");
106+
107+
Ok(())
108+
}
84109
/////////////// Functions to write parquet files out ///////////////
85110
/// Functions to convert values in a vecxtor of structs to vector
86111
/// Some perform type converions

0 commit comments

Comments
 (0)