Skip to content

Commit 65f05d1

Browse files
committed
exporting align and blast parsers to microbiorust-py
1 parent 641607c commit 65f05d1

9 files changed

Lines changed: 6138 additions & 85 deletions

File tree

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
use anyhow::{Context, Result};
2+
use async_compression::tokio::bufread::GzipDecoder as AsyncGzDecoder;
3+
use clap::Parser;
4+
use quick_xml::events::Event;
5+
use quick_xml::reader::Reader;
6+
use quick_xml::escape::unescape;
7+
use serde::Serialize;
8+
use serde_json::ser::Serializer as JsonSerializer;
9+
use microBioRust::blast::*;
10+
use std::io::Cursor;
11+
use tokio::io::{self, AsyncBufRead, AsyncBufReadExt, AsyncRead, AsyncWriteExt, BufReader};
12+
13+
// Command-line arguments for the `blast-parsers` binary, filled in by clap's
// derive `Parser` (see `Cli::parse()` in `main`).
// NOTE: the `///` lines on the fields are picked up by clap as the per-option
// help text, so they are user-facing output rather than plain comments.
#[derive(Parser, Debug)]
14+
#[command(name = "blast-parsers", author, version, about = "async microBioRust BLAST parsers: for outfmt6 (single line tabular) and outfmt5 (xml)")]
15+
struct Cli {
16+
// Input location handed to `open_async_reader`; falls back to "-" when the
// option is omitted.
// NOTE(review): "-" presumably selects stdin - confirm in `open_async_reader`.
///Use .gz for gzip-compressed files.
17+
#[arg(short, long, default_value = "-")]
18+
input: String,
19+
// Optional explicit format code; when absent, `main` lets `infer_format`
// decide from the suffix of `input`.
/// Format: '6' (tabular) or '5' (xml). If omitted we try to infer by file suffix only
20+
#[arg(short, long)]
21+
format: Option<String>,
22+
// Only consulted by the XML branch of `main`; the tabular branch calls
// `stream_outfmt6_to_json` regardless of this flag.
/// Output newline-delimited JSON (one JSON object per record/iteration)
23+
#[arg(long)]
24+
json: bool,
25+
}
26+
27+
#[tokio::main]
28+
async fn main() -> Result<()> {
29+
let args = Cli::parse();
30+
let fmt = infer_format(&args.input, &args.format);
31+
let reader_box = open_async_reader(&args.input).await?;
32+
if fmt == "6" {
33+
stream_outfmt6_to_json(reader_box).await?;
34+
} else {
35+
// Build AsyncBlastXmlIter from reader_box
36+
let iter_reader = reader_box;
37+
let mut iter = AsyncBlastXmlIter::from_reader(iter_reader);
38+
while let Some(res) = iter.next_iteration().await {
39+
match res {
40+
Ok(iter_rec) => {
41+
if args.json {
42+
let mut buf = Vec::new();
43+
serde_json::to_writer(&mut buf, &iter_rec)?;
44+
buf.push(b'\n');
45+
tokio::io::stdout().write_all(&buf).await?;
46+
} else {
47+
println!("query {:?} hits {}", iter_rec.query_def, iter_rec.hits.len());
48+
}
49+
}
50+
Err(e) => eprintln!("xml parse error: {}", e),
51+
}
52+
}
53+
}
54+
55+
Ok(())
56+
}

microBioRust/src/blast.rs

Lines changed: 19 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@
9797
#![allow(unused_imports)]
9898
use anyhow::{Context, Result};
9999
use async_compression::tokio::bufread::GzipDecoder as AsyncGzDecoder;
100-
use clap::Parser;
101100
use quick_xml::events::Event;
102101
use quick_xml::reader::Reader;
103102
use quick_xml::escape::unescape;
@@ -173,11 +172,11 @@ pub struct Statistics {
173172
// first macro: For Option fields (Strings, Option<u32>, etc.)
174173
macro_rules! read_parse_opt {
175174
($self:expr, $tag:expr, $parent_opt:expr, $field:ident, $ok_ty:ty) => {{
176-
// 1. Read the content first. This borrows 'self', but the borrow ends
177-
// as soon as this statement finishes.
175+
//read content first. This borrows 'self', but the borrow ends
176+
//as soon as this statement finishes.
178177
let res = $self.read_tag_content($tag).await;
179178

180-
// 2. Now 'self' is free again. We can handle the result.
179+
//now 'self' is free again and we can handle the result.
181180
match res {
182181
Ok(text) => {
183182
// 3. Re-borrow only the specific field we want to update.
@@ -566,58 +565,29 @@ where
566565
}
567566

568567

568+
//the CLI definition below is kept (commented out) to show how a caller selects the BLAST format - XML (5) or tabular (6) - and switches JSON output on (true) or off (false)
569+
//see examples for further detail
569570

570-
#[derive(Parser, Debug)]
571-
#[command(name = "blast-parsers", author, version, about = "async microBioRust BLAST parsers: for outfmt6 (single line tabular) and outfmt5 (xml)")]
572-
struct Cli {
573-
///Use .gz for gzip-compressed files.
574-
#[arg(short, long, default_value = "-")]
575-
input: String,
576-
/// Format: '6' (tabular) or '5' (xml). If omitted we try to infer by file suffix only
577-
#[arg(short, long)]
578-
format: Option<String>,
579-
/// Output newline-delimited JSON (one JSON object per record/iteration)
580-
#[arg(long)]
581-
json: bool,
582-
}
571+
//#[derive(Parser, Debug)]
572+
//#[command(name = "blast-parsers", author, version, about = "async microBioRust BLAST parsers: for outfmt6 (single line tabular) and outfmt5 (xml)")]
573+
//struct Cli {
574+
// ///Use .gz for gzip-compressed files.
575+
// #[arg(short, long, default_value = "-")]
576+
// input: String,
577+
// /// Format: '6' (tabular) or '5' (xml). If omitted we try to infer by file suffix only
578+
// #[arg(short, long)]
579+
// format: Option<String>,
580+
// /// Output newline-delimited JSON (one JSON object per record/iteration)
581+
// #[arg(long)]
582+
// json: bool,
583+
//}
583584

584-
fn infer_format(path: &str, explicit: &Option<String>) -> String {
585+
/// Chooses the BLAST output-format code for an input path.
///
/// An explicitly supplied format always wins and is returned unchanged.
/// Otherwise the format is guessed from the file suffix alone: a path ending
/// in `.xml` or `.xml.gz` maps to `"5"` (XML), and every other path -
/// including `-` - maps to `"6"` (tabular). The suffix comparison ignores
/// ASCII case, so `HITS.XML` is recognised as XML too.
///
/// # Arguments
/// * `path` - input path as given by the caller; only its suffix is inspected.
/// * `explicit` - format requested by the caller, if any.
///
/// # Returns
/// The format code as an owned `String`.
pub fn infer_format(path: &str, explicit: &Option<String>) -> String {
    if let Some(format) = explicit {
        return format.clone();
    }
    // Lower-case once so both suffix checks are case-insensitive.
    let lowered = path.to_ascii_lowercase();
    let looks_like_xml = [".xml", ".xml.gz"]
        .iter()
        .any(|suffix| lowered.ends_with(suffix));
    let code = if looks_like_xml { "5" } else { "6" };
    code.to_string()
}
589590

590-
#[tokio::main]
591-
async fn main() -> Result<()> {
592-
let args = Cli::parse();
593-
let fmt = infer_format(&args.input, &args.format);
594-
let reader_box = open_async_reader(&args.input).await?;
595-
if fmt == "6" {
596-
stream_outfmt6_to_json(reader_box).await?;
597-
} else {
598-
// Build AsyncBlastXmlIter from reader_box
599-
let iter_reader = reader_box;
600-
let mut iter = AsyncBlastXmlIter::from_reader(iter_reader);
601-
while let Some(res) = iter.next_iteration().await {
602-
match res {
603-
Ok(iter_rec) => {
604-
if args.json {
605-
let mut buf = Vec::new();
606-
serde_json::to_writer(&mut buf, &iter_rec)?;
607-
buf.push(b'\n');
608-
tokio::io::stdout().write_all(&buf).await?;
609-
} else {
610-
println!("query {:?} hits {}", iter_rec.query_def, iter_rec.hits.len());
611-
}
612-
}
613-
Err(e) => eprintln!("xml parse error: {}", e),
614-
}
615-
}
616-
}
617-
618-
Ok(())
619-
}
620-
621591
// Unit tests (async if relevant)
622592
#[cfg(test)]
623593
mod tests {
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
LOCUS NC_000913 913 bp DNA linear CON 01-Sep-2025
2+
DEFINITION Escherichia coli
3+
ACCESSION NC_000913
4+
KEYWORDS .
5+
SOURCE Escherichia coli str. K-12 substr. MG1655
6+
ORGANISM Escherichia coli str. K-12 substr. MG1655
7+
Bacteria; Pseudomonadati; Pseudomonadota; Gammaproteobacteria;
8+
Enterobacterales; Enterobacteriaceae; Escherichia.
9+
FEATURES Location/Qualifiers
10+
source <1..>913
11+
/id="source_1"
12+
/organism="Escherichia coli str. K-12 substr. MG1655"
13+
/mol_type="genomic DNA"
14+
/strain="K-12"
15+
/sub_strain="MG1655"
16+
/db_xref="taxon:511145"
17+
source complement(1..913)
18+
gene complement(10..363)
19+
/gene="rplR"
20+
/locus_tag="b3304"
21+
/gene_synonym="ECK3291"
22+
/db_xref="ASAP:ABE-0010825"
23+
/db_xref="ECOCYC:EG10879"
24+
/db_xref="GeneID:947804"
25+
CDS complement(10..363)
26+
/gene="rplR"
27+
/locus_tag="b3304"
28+
/gene_synonym="ECK3291"
29+
/codon_start=1
30+
/transl_table=11
31+
/product="50S ribosomal subunit protein L18"
32+
/protein_id="NP_417763.1"
33+
/db_xref="UniProtKB/Swiss-Prot:P0C018"
34+
/db_xref="ASAP:ABE-0010825"
35+
/db_xref="ECOCYC:EG10879"
36+
/db_xref="GeneID:947804"
37+
/translation="MDKKSARIRRATRARRKLQELGATRLVVHRTPRHIYAQVIAPNG
38+
SEVLVAASTVEKAIAEQLKYTGNKDAAAAVGKAVAERALEKGIKDVSFDRSGFQYHGR
39+
VQALADAAREAGLQF"
40+
gene complement(373..906)
41+
/gene="rplF"
42+
/locus_tag="b3305"
43+
/gene_synonym="ECK3292"
44+
/db_xref="ASAP:ABE-0010827"
45+
/db_xref="ECOCYC:EG10869"
46+
/db_xref="GeneID:947803"
47+
CDS complement(373..906)
48+
/gene="rplF"
49+
/locus_tag="b3305"
50+
/gene_synonym="ECK3292"
51+
/codon_start=1
52+
/transl_table=11
53+
/product="50S ribosomal subunit protein L6"
54+
/protein_id="NP_417764.1"
55+
/db_xref="UniProtKB/Swiss-Prot:P0AG55"
56+
/db_xref="ASAP:ABE-0010827"
57+
/db_xref="ECOCYC:EG10869"
58+
/db_xref="GeneID:947803"
59+
/translation="MSRVAKAPVVVPAGVDVKINGQVITIKGKNGELTRTLNDAVEVK
60+
HADNTLTFGPRDGYADGWAQAGTARALLNSMVIGVTEGFTKKLQLVGVGYRAAVKGNV
61+
INLSLGFSHPVDHQLPAGITAECPTQTEIVLKGADKQVIGQVAADLRAYRRPEPYKGK
62+
GVRYADEVVRTKEAKKK"
63+
BASE COUNT 214 a 256 c 223 g 220 t
64+
ORIGIN
65+
1 acctctacct tagaactgaa ggccagcttc acgggcagca tctgccagtg cctggacacg
66+
61 accatgatat tggaacccgg aacggtcaaa ggatacatct ttgatgcctt tttccagagc
67+
121 gcgttcagcg acagctttac ccacagctgc agccgcgtct ttgttaccgg tgtacttcag
68+
181 ttgttcagcg atagcttttt ctacagtaga agcagctacc agaacttcag aaccgttcgg
69+
241 tgcaattacc tgtgcgtaaa tgtgacgcgg ggtacgatgt accaccaggc gagttgcgcc
70+
301 cagctcctgg agcttgcggc gtgcgcgggt cgcacgacgg atacgagcag atttcttatc
71+
361 catagtgtta ccttacttct tcttagcctc tttggtacgc acgacttcgt cggcgtaacg
72+
421 aacacccttg cctttataag gctcaggacg acggtaggcg cgcagatccg ctgcaacctg
73+
481 gccgatcacc tgcttatcag cgcctttcag cacgatttca gtctgagtcg gacattcagc
74+
541 agtgataccc gcaggcagct gatggtcaac aggatgagag aaacccagag acaggttaat
75+
601 cacattgcct ttaaccgctg cacggtaacc tacaccaacc agctgcagct tcttagtgaa
76+
661 gccttcggta acaccgataa ccattgagtt cagcagggca cgcgcggtac cagcctgtgc
77+
721 ccaaccgtct gcgtaaccat cacgcggacc gaaggtcagg gtattatctg catgtttaac
78+
781 ttcaacagca tcgttgagag tacgagtcag ctcgccgttt ttacctttga tcgtaataac
79+
841 ctgaccgttg atttttacgt caacgccggc aggaacaacg accggtgctt tagcaacacg
80+
901 agacattttt tcc
81+
//
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
>seq1
2+
ATGC-ATGCATGCATGC
3+
>seq2
4+
ATGCAATGCTTGCATGC
5+
>seq3
6+
TTGCAATCCATGCAAGC

0 commit comments

Comments
 (0)