Skip to content

Commit 0ae5272

Browse files
authored
Merge branch 'add_prepmirareports' into var_of_int_format_tweak
2 parents 469dddf + c913c6b commit 0ae5272

File tree

12 files changed

+1958
-96
lines changed

12 files changed

+1958
-96
lines changed

Cargo.toml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,21 @@ edition = "2024"
55
description = "A set of rusty tools for use in MIRA"
66

77
[dependencies]
8+
arrow = { version = "55.2.0", default-features = false}
9+
parquet = { version = "55.2.0", default-features = false, features = ["arrow"] }
810
clap = { version = "4", features = ["derive"] }
911
csv = "1.3.1"
1012
either = "1"
11-
serde = { version = "1.0.219", features = ["derive"] }
12-
serde_yaml = "0.9"
1313
glob = "0.3.2"
1414
ordered-float = "5.0.0"
1515
#plotly = "0.12.1"
1616
plotly = { git = "https://github.com/plotly/plotly.rs.git", branch = "main" }
17-
17+
serde = { version = "1.0.219", features = ["derive"] }
18+
serde_json = "1.0"
19+
serde_yaml_ng = "0.10.0"
1820
zoe = { version = "0.0.19", default-features = false, features = [
1921
"multiversion",
2022
] }
2123

2224
[profile.release]
23-
strip = true
25+
strip = true

src/main.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ enum Commands {
2828
NTDiffs(NTDiffsArgs),
2929
/// Plotter
3030
Plotter(PlotterArgs),
31+
/// Prepare MIRA report
32+
PrepareMiraReports(ReportsArgs),
3133
}
3234

3335
fn main() {
@@ -40,22 +42,25 @@ fn main() {
4042
}
4143
Commands::PositionsOfInterest(cmd_args) => positions_of_interest_process(cmd_args)
4244
.expect(&format!("{module}::PositionsOfInterest")),
45+
4346
Commands::FindChemistry(cmd_args) => {
4447
find_chemistry_process(cmd_args).unwrap_or_die(&format!("{module}::FindChemistry"))
4548
}
4649
Commands::Hamming(cmd_args) => {
47-
all_sample_hd_process(cmd_args).unwrap_or_die(&format!("{module}::Hamming"))
50+
all_sample_hd_process(cmd_args).unwrap_or_die(&format!("{module}::Hamming"));
4851
}
4952
Commands::NTDiffs(cmd_args) => all_sample_nt_diffs_process(cmd_args),
5053
Commands::Plotter(cmd_args) => {
51-
plotter_process(cmd_args).expect(&format!("{module}::Plotter"))
54+
plotter_process(cmd_args).unwrap_or_else(|_| panic!("{module}::Plotter"))
5255
}
56+
Commands::PrepareMiraReports(cmd_args) => prepare_mira_reports_process(cmd_args)
57+
.unwrap_or_else(|_| panic!("{module}::PrepareMiraReports")),
5358
_ => {
5459
eprintln!("mira-oxide: unrecognized command {:?}", args.command);
5560
std::process::exit(1)
5661
}
5762
}
5863
}
5964

60-
mod processes;
61-
pub use crate::processes::*;
65+
pub mod processes;
66+
pub mod utils;

src/processes/all_sample_nt_diffs.rs

Lines changed: 24 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,7 @@ use std::{
55
io::{BufReader, BufWriter, Write, stdin, stdout},
66
path::PathBuf,
77
};
8-
use zoe::{
9-
data::fasta::FastaNT,
10-
prelude::*,
11-
};
8+
use zoe::{data::fasta::FastaNT, prelude::*};
129

1310
#[derive(Debug, Parser)]
1411
#[command(
@@ -70,39 +67,36 @@ pub fn all_sample_nt_diffs_process(args: NTDiffsArgs) {
7067
record.map(|r| {
7168
let FastaNT { name, sequence } = r.recode_to_dna();
7269
ValidSeq {
73-
name,
70+
name,
7471
sequence,
7572
}
7673
}))
7774
.collect::<Result<Vec<_>, _>>()
7875
.unwrap_or_die("Could not process other data.");
79-
80-
writeln!(
81-
&mut writer,
82-
"sequence_1{}sequence_2{}nt_sequence_1{}position{}nt_sequence_2",
83-
delim, delim, delim, delim
84-
).unwrap();
8576

77+
writeln!(
78+
&mut writer,
79+
"sequence_1{delim}sequence_2{delim}nt_sequence_1{delim}position{delim}nt_sequence_2"
80+
)
81+
.unwrap();
82+
83+
all_sequences.iter().for_each(|f| {
84+
let name_1 = &f.name;
85+
let seq1 = &f.sequence;
8686
all_sequences.iter().for_each(|f| {
87-
let name_1 = &f.name;
88-
let seq1 = &f.sequence;
89-
all_sequences.iter().for_each(|f| {
90-
let name_2 = &f.name;
91-
let seq2 = &f.sequence;
92-
for (i, (nt1, nt2)) in seq1.iter().zip(seq2.iter()).enumerate() {
93-
if nt1 != nt2 {
94-
let nucleotide1 = char::from(*nt1);
95-
let nucleotide2 = char::from(*nt2);
96-
writeln!(
97-
&mut writer,
98-
"{}{}{}{}{}{}{}{}{}",
99-
name_1, delim, name_2, delim, nucleotide1, delim, i, delim, nucleotide2
100-
)
101-
.unwrap();
102-
}
87+
let name_2 = &f.name;
88+
let seq2 = &f.sequence;
89+
for (i, (nt1, nt2)) in seq1.iter().zip(seq2.iter()).enumerate() {
90+
if nt1 != nt2 {
91+
let nucleotide1 = char::from(*nt1);
92+
let nucleotide2 = char::from(*nt2);
93+
writeln!(
94+
&mut writer,
95+
"{name_1}{delim}{name_2}{delim}{nucleotide1}{delim}{i}{delim}{nucleotide2}"
96+
)
97+
.unwrap();
10398
}
104-
});
99+
}
105100
});
106-
107-
101+
});
108102
}

src/processes/find_chemistry.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ fn get_config_path(args: &FindChemArgs, seq_len: Option<usize>) -> String {
228228
.wd_path
229229
.to_str()
230230
.expect("Failed to convert work directory path to string");
231-
format!("{}{}", wd_path, path_extension)
231+
format!("{wd_path}{path_extension}")
232232
}
233233

234234
#[derive(Debug)]
@@ -274,7 +274,7 @@ impl fmt::Display for ChemistryOutput {
274274
/// sequences, returns None
275275
fn get_average_line_length(fastq: &PathBuf) -> Result<Option<usize>, std::io::Error> {
276276
let sample_size = 5;
277-
let file = File::open(&fastq)?;
277+
let file = File::open(fastq)?;
278278
let buf_reader = BufReader::new(file);
279279
let fastq_reader = FastQReader::new(buf_reader);
280280

@@ -313,7 +313,7 @@ pub fn find_chemistry_process(args: FindChemArgs) -> Result<(), std::io::Error>
313313
//let args = CheckChemArgs::parse();
314314
// handle input validation to ensure valid combinations of
315315
if let Err(e) = args.validate() {
316-
eprintln!("Error: {}", e);
316+
eprintln!("Error: {e}");
317317
std::process::exit(1);
318318
}
319319
// parse the arguments into output format

src/processes/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ pub mod all_sample_nt_diffs;
33
pub mod find_chemistry;
44
pub mod plotter;
55
pub mod positions_of_interest;
6+
pub mod prepare_mira_reports;
67
pub mod variants_of_interest;

src/processes/plotter.rs

Lines changed: 51 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ fn generate_plot_coverage(input_directory: &PathBuf) -> Result<Plot, Box<dyn Err
162162

163163
// Set the figure title
164164
let layout = Layout::new()
165-
.title(&format!(
165+
.title(format!(
166166
"Coverage | {}",
167167
input_directory
168168
.file_name()
@@ -201,61 +201,59 @@ fn generate_plot_coverage_seg(input_directory: &PathBuf) -> Result<Plot, Box<dyn
201201
let mut file_paths = Vec::new();
202202

203203
// First, count files and collect paths
204-
for entry in glob(&format!(
204+
for path in (glob(&format!(
205205
"{}/tables/*coverage.txt",
206206
input_directory.display()
207-
))? {
208-
if let Ok(path) = entry {
209-
//file_count += 1;
210-
file_paths.push(path);
211-
}
207+
))?)
208+
.flatten()
209+
{
210+
//file_count += 1;
211+
file_paths.push(path);
212212
}
213213

214214
// Calculate grid dimensions for subplots
215215
let rows = 4; //((file_count + 2) as f64).sqrt().ceil() as usize;
216216
let cols = 2; //(file_count + rows - 1) / rows; // Ceiling division
217217

218218
// Load variant data into a HashMap keyed by segment name
219+
// TODO: consider a struct with named fields
219220
let mut variants_data: HashMap<String, Vec<(u32, String, String, u32, u32, f32)>> =
220221
HashMap::new();
221222

222223
// Look for variant files with matching prefixes in the directory
223-
for entry in glob(&format!(
224+
for variant_path in (glob(&format!(
224225
"{}/tables/*variants.txt",
225226
input_directory.display()
226-
))? {
227-
if let Ok(variant_path) = entry {
228-
let file = File::open(&variant_path)?;
229-
230-
// Create a TSV reader
231-
let mut rdr = ReaderBuilder::new()
232-
.delimiter(b'\t')
233-
.has_headers(true)
234-
.from_reader(file);
235-
236-
for result in rdr.records() {
237-
let record = result?;
238-
if record.len() >= 8 {
239-
let segment_name = record[0].to_string();
240-
let position: u32 = record[1].parse()?;
241-
let consensus_allele: String = record[3].to_string();
242-
let minority_allele: String = record[4].to_string();
243-
let consensus_count: u32 = record[5].parse()?;
244-
let minority_count: u32 = record[6].parse()?;
245-
let minority_frequency: f32 = record[8].parse()?;
246-
247-
variants_data
248-
.entry(segment_name)
249-
.or_insert_with(Vec::new)
250-
.push((
251-
position,
252-
consensus_allele,
253-
minority_allele,
254-
consensus_count,
255-
minority_count,
256-
minority_frequency,
257-
));
258-
}
227+
))?)
228+
.flatten()
229+
{
230+
let file = File::open(&variant_path)?;
231+
232+
// Create a TSV reader
233+
let mut rdr = ReaderBuilder::new()
234+
.delimiter(b'\t')
235+
.has_headers(true)
236+
.from_reader(file);
237+
238+
for result in rdr.records() {
239+
let record = result?;
240+
if record.len() >= 8 {
241+
let segment_name = record[0].to_string();
242+
let position: u32 = record[1].parse()?;
243+
let consensus_allele: String = record[3].to_string();
244+
let minority_allele: String = record[4].to_string();
245+
let consensus_count: u32 = record[5].parse()?;
246+
let minority_count: u32 = record[6].parse()?;
247+
let minority_frequency: f32 = record[8].parse()?;
248+
249+
variants_data.entry(segment_name).or_default().push((
250+
position,
251+
consensus_allele,
252+
minority_allele,
253+
consensus_count,
254+
minority_count,
255+
minority_frequency,
256+
));
259257
}
260258
}
261259
}
@@ -356,7 +354,7 @@ fn generate_plot_coverage_seg(input_directory: &PathBuf) -> Result<Plot, Box<dyn
356354
// Create trace for minority values with consistent color (but with transparency)
357355
let minority_trace = Scatter::new(variant_positions, minority_values)
358356
.mode(Mode::Markers)
359-
.name(&format!("{}", segment_name))
357+
.name(&segment_name)
360358
.marker(
361359
plotly::common::Marker::new()
362360
.color(segment_color)
@@ -383,7 +381,7 @@ fn generate_plot_coverage_seg(input_directory: &PathBuf) -> Result<Plot, Box<dyn
383381
.columns(cols)
384382
.pattern(GridPattern::Independent),
385383
)
386-
.title(&format!(
384+
.title(format!(
387385
"Segment Coverage | {}",
388386
input_directory
389387
.file_name()
@@ -505,17 +503,15 @@ fn generate_sankey_plot(input_directory: &PathBuf) -> Result<Plot, Box<dyn Error
505503

506504
// Process data and build node map first
507505
let mut records = Vec::new();
508-
for line in lines {
509-
if let Ok(line) = line {
510-
let parts: Vec<&str> = line.split('\t').collect();
511-
if parts.len() >= 3 {
512-
let record = parts[0];
513-
let reads: u32 = parts[1].parse().unwrap_or(0);
514-
515-
// Skip "NA" values and 0 reads
516-
if parts[1] != "NA" && reads > 0 {
517-
records.push((record.to_string(), reads));
518-
}
506+
for line in lines.map_while(Result::ok) {
507+
let parts: Vec<&str> = line.split('\t').collect();
508+
if parts.len() >= 3 {
509+
let record = parts[0];
510+
let reads: u32 = parts[1].parse().unwrap_or(0);
511+
512+
// Skip "NA" values and 0 reads
513+
if parts[1] != "NA" && reads > 0 {
514+
records.push((record.to_string(), reads));
519515
}
520516
}
521517
}
@@ -733,7 +729,7 @@ fn generate_sankey_plot(input_directory: &PathBuf) -> Result<Plot, Box<dyn Error
733729

734730
// Set layout
735731
let layout = Layout::new()
736-
.title(&format!(
732+
.title(format!(
737733
"Read Assignment | {}",
738734
input_directory
739735
.file_name()

0 commit comments

Comments
 (0)