Skip to content

Commit ae0e1d8

Browse files
authored
Commands should output fastq rather than fasta (#60)
* Commands should output fastq rather than fasta
1 parent 7cce4d1 commit ae0e1d8

7 files changed

Lines changed: 386 additions & 41 deletions

File tree

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/hidive/Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,7 @@ plotters = "=0.3.7"
4242

4343
[profile.release]
4444
opt-level = 2
45-
codegen-units = 16
45+
codegen-units = 16
46+
47+
[dev-dependencies]
48+
tempfile = "3.8.1"

src/hidive/src/main.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ enum Commands {
165165
/// Stream loci from CRAM/BAM/FASTA files stored locally or in Google Cloud Storage.
166166
#[clap(arg_required_else_help = true)]
167167
Fetch {
168-
/// Output path for FASTA file with reads spanning locus of interest.
168+
/// Output path for FASTQ file with reads spanning locus of interest.
169169
#[clap(short, long, value_parser, default_value = "/dev/stdout")]
170170
output: PathBuf,
171171

@@ -189,7 +189,7 @@ enum Commands {
189189
/// Find more sequences (both aligned and unaligned) overlapping previously fetched reads.
190190
#[clap(arg_required_else_help = true)]
191191
Rescue {
192-
/// Output path for FASTA file with reads spanning locus of interest.
192+
/// Output path for FASTQ file with reads spanning locus of interest.
193193
#[clap(short, long, value_parser, default_value = "/dev/stdout")]
194194
output: PathBuf,
195195

@@ -217,7 +217,7 @@ enum Commands {
217217
#[clap(short, long, value_parser, required = true)]
218218
fasta_paths: Vec<PathBuf>,
219219

220-
/// Indexed WGS BAM, CRAM, or FASTA files from which to extract relevant sequences.
220+
/// Indexed WGS BAM, CRAM, or FASTA files from which to find relevant sequences.
221221
#[clap(required = true, value_parser)]
222222
seq_paths: Vec<PathBuf>,
223223
},

src/hidive/src/rescue.rs

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use bio::data_structures::interval_tree::IntervalTree;
2-
use bio::io::fasta::Reader;
2+
use bio::io::fastq::{self, Reader};
33
use bio::utils::Interval;
44
use clap::ValueEnum;
55
use flate2::write::GzEncoder;
@@ -10,7 +10,7 @@ use num_format::{Locale, ToFormattedString};
1010
use std::cmp::PartialEq;
1111
use std::collections::{BTreeMap, HashMap};
1212
use std::fs::File;
13-
use std::io::Write;
13+
use std::io::{BufWriter, Write};
1414
use std::sync::atomic::{AtomicUsize, Ordering};
1515
use std::sync::Arc;
1616
use std::{collections::HashSet, path::PathBuf};
@@ -42,10 +42,10 @@ pub fn start(
4242
search_option: SearchOption,
4343
ref_path: Option<PathBuf>,
4444
loci: Option<Vec<String>>,
45-
fasta_paths: &Vec<PathBuf>,
45+
fastq_paths: &Vec<PathBuf>,
4646
seq_paths: &Vec<PathBuf>,
4747
) {
48-
let fasta_urls = skydive::parse::parse_file_names(fasta_paths);
48+
let fastq_urls = skydive::parse::parse_file_names(fastq_paths);
4949
let seq_urls = skydive::parse::parse_file_names(seq_paths);
5050

5151
// Get the system's temporary directory path
@@ -57,8 +57,8 @@ pub fn start(
5757

5858
// Read the FASTA files and prepare a hashmap of k-mers to search for in the other files.
5959
let mut kmer_set = HashSet::new();
60-
for fasta_url in fasta_urls {
61-
let reader = Reader::from_file(fasta_url.to_file_path().unwrap()).unwrap();
60+
for fastq_url in fastq_urls {
61+
let reader = Reader::from_file(fastq_url.to_file_path().unwrap()).unwrap();
6262

6363
let mut reads = Vec::new();
6464
for record in reader.records().flatten() {
@@ -153,17 +153,35 @@ pub fn start(
153153
);
154154
}
155155

156+
/*
156157
let file = File::create(output).expect("Could not open file for writing.");
157158
let mut writer: Box<dyn Write> = if output.to_str().unwrap().ends_with(".gz") {
158159
Box::new(GzEncoder::new(file, Compression::default()))
159160
} else {
160161
Box::new(file)
161162
};
163+
*/
164+
165+
let mut buf_writer = BufWriter::new(File::create(output).unwrap());
166+
let mut fastq_writer = fastq::Writer::new(&mut buf_writer);
162167

163168
for record in all_records.iter() {
164-
let fw_seq = record.seq().as_bytes();
165-
let rc_seq = fw_seq.reverse_complement();
169+
if record.is_reverse() {
170+
let rv_seq = record.seq().as_bytes().reverse_complement();
171+
let mut rv_qual = record.qual().iter().map(|&q| q + 33).collect::<Vec<u8>>();
172+
rv_qual.reverse();
173+
let rv_record = fastq::Record::with_attrs(&String::from_utf8_lossy(record.qname()), None, &rv_seq, &rv_qual);
174+
175+
fastq_writer.write_record(&rv_record).unwrap();
176+
} else {
177+
let fw_seq = record.seq().as_bytes();
178+
let fw_qual = record.qual().iter().map(|&q| q + 33).collect::<Vec<u8>>();
179+
let fw_record = fastq::Record::with_attrs(&String::from_utf8_lossy(record.qname()), None, &fw_seq, &fw_qual);
180+
181+
fastq_writer.write_record(&fw_record).unwrap();
182+
}
166183

184+
/*
167185
writeln!(
168186
writer,
169187
">read_{}_{}_{}_{}",
@@ -183,6 +201,7 @@ pub fn start(
183201
}
184202
)
185203
.expect("Could not write to file");
204+
*/
186205
}
187206

188207
if !all_records.is_empty() {

src/hidive/tests/fetch_tests.rs

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
use std::path::PathBuf;
2+
use std::process::Command;
3+
use tempfile::TempDir;
4+
5+
/// Validates that a FASTQ file exists and contains valid records
6+
fn validate_fastq_output(path: &PathBuf) {
7+
let reader = bio::io::fastq::Reader::from_file(path).expect("Failed to open output file");
8+
let mut records = Vec::new();
9+
10+
for result in reader.records() {
11+
// Each record should be valid and parseable
12+
let record = result.expect("Error reading FASTQ record");
13+
records.push(record);
14+
}
15+
16+
// Basic validation that we got some records
17+
assert!(!records.is_empty(), "No FASTQ records found in output file");
18+
19+
// Validate each record has the expected components
20+
for record in records {
21+
// Check sequence name exists
22+
assert!(!record.id().is_empty(), "FASTQ record missing sequence ID");
23+
24+
// Check sequence exists and is not empty
25+
assert!(!record.seq().is_empty(), "FASTQ record has empty sequence");
26+
27+
// Check quality scores exist and match sequence length
28+
assert_eq!(record.seq().len(), record.qual().len(),
29+
"Quality score length does not match sequence length");
30+
}
31+
}
32+
33+
#[test]
34+
fn test_fetch_basic() {
35+
// Create a temporary directory for test files
36+
let temp_dir = TempDir::new().unwrap();
37+
let output_path = temp_dir.path().join("output.fastq");
38+
39+
// Test BAM file
40+
let test_bam = "gs://fc-8c3900db-633f-477f-96b3-fb31ae265c44/results/PBFlowcell/m84060_230907_210011_s2/reads/ccs/aligned/m84060_230907_210011_s2.bam";
41+
42+
// Run the fetch command
43+
let output = Command::new("cargo")
44+
.args([
45+
"run",
46+
"--",
47+
"fetch",
48+
"--loci",
49+
"chr15:23960193-23963918",
50+
"--output",
51+
output_path.to_str().unwrap(),
52+
test_bam,
53+
])
54+
.output()
55+
.expect("Failed to execute command");
56+
57+
// Check if the command was successful
58+
assert!(output.status.success(), "Command failed: {:?}", output);
59+
60+
// Verify the output file exists
61+
assert!(output_path.exists(), "Output file was not created");
62+
63+
// Validate the format of the output FASTQ file
64+
validate_fastq_output(&output_path);
65+
}
66+
67+
#[test]
68+
fn test_fetch_with_padding() {
69+
let temp_dir = TempDir::new().unwrap();
70+
let output_path = temp_dir.path().join("output.fastq");
71+
let test_bam = "gs://fc-8c3900db-633f-477f-96b3-fb31ae265c44/results/PBFlowcell/m84060_230907_210011_s2/reads/ccs/aligned/m84060_230907_210011_s2.bam";
72+
73+
let output = Command::new("cargo")
74+
.args([
75+
"run",
76+
"--",
77+
"fetch",
78+
"--loci",
79+
"chr15:23960193-23963918",
80+
"--padding",
81+
"100",
82+
"--output",
83+
output_path.to_str().unwrap(),
84+
test_bam
85+
])
86+
.output()
87+
.expect("Failed to execute command");
88+
89+
assert!(output.status.success(), "Command failed: {:?}", output);
90+
assert!(output_path.exists(), "Output file was not created");
91+
validate_fastq_output(&output_path);
92+
}
93+
94+
// #[test]
95+
// fn test_fetch_with_haplotype() {
96+
// let temp_dir = TempDir::new().unwrap();
97+
// let output_path = temp_dir.path().join("output.fastq");
98+
// let test_bam = "gs://fc-8c3900db-633f-477f-96b3-fb31ae265c44/results/PBFlowcell/m84060_230907_210011_s2/reads/ccs/aligned/m84060_230907_210011_s2.bam";
99+
100+
// let output = Command::new("cargo")
101+
// .args([
102+
// "run",
103+
// "--",
104+
// "fetch",
105+
// "--loci",
106+
// "chr15:23960193-23963918",
107+
// "--haplotype",
108+
// "1",
109+
// "--output",
110+
// output_path.to_str().unwrap(),
111+
// test_bam,
112+
// ])
113+
// .output()
114+
// .expect("Failed to execute command");
115+
116+
// assert!(output.status.success(), "Command failed: {:?}", output);
117+
// assert!(output_path.exists(), "Output file was not created");
118+
// validate_fastq_output(&output_path);
119+
// }
120+
121+
#[test]
122+
fn test_fetch_invalid_locus() {
123+
let temp_dir = TempDir::new().unwrap();
124+
let output_path = temp_dir.path().join("output.fastq");
125+
let test_bam = "gs://fc-8c3900db-633f-477f-96b3-fb31ae265c44/results/PBFlowcell/m84060_230907_210011_s2/reads/ccs/aligned/m84060_230907_210011_s2.bam";
126+
127+
let output = Command::new("cargo")
128+
.args([
129+
"run",
130+
"--",
131+
"fetch",
132+
"--loci",
133+
"invalid_locus",
134+
"--output",
135+
output_path.to_str().unwrap(),
136+
test_bam,
137+
])
138+
.output()
139+
.expect("Failed to execute command");
140+
141+
// This should fail because the locus format is invalid
142+
assert!(!output.status.success(), "Command should have failed");
143+
}
144+
145+
#[test]
146+
fn test_fetch_missing_required_args() {
147+
let temp_dir = TempDir::new().unwrap();
148+
let output_path = temp_dir.path().join("output.fastq");
149+
150+
let output = Command::new("cargo")
151+
.args([
152+
"run",
153+
"--",
154+
"fetch",
155+
"--output",
156+
output_path.to_str().unwrap(),
157+
// Missing required --loci argument and BAM file
158+
])
159+
.output()
160+
.expect("Failed to execute command");
161+
162+
// This should fail because required arguments are missing
163+
assert!(!output.status.success(), "Command should have failed");
164+
}

0 commit comments

Comments
 (0)