diff --git a/Cargo.lock b/Cargo.lock index 8335134e..10da7106 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -295,25 +295,24 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" +checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" [[package]] name = "bzip2" -version = "0.4.4" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +checksum = "75b89e7c29231c673a61a46e722602bcd138298f6b9e81e71119693534585f5c" dependencies = [ "bzip2-sys", - "libc", ] [[package]] name = "bzip2-sys" -version = "0.1.11+1.0.8" +version = "0.1.12+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +checksum = "72ebc2f1a417f01e1da30ef264ee86ae31d2dcd2d603ea283d3c244a883ca2a9" dependencies = [ "cc", "libc", @@ -2938,12 +2937,13 @@ dependencies = [ [[package]] name = "noodles" -version = "0.85.0" +version = "0.93.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ecd3b4ddb5792b0358e035fbd41543613e4234dc2d0f13c814e3d5e705d8c8" +checksum = "74eea88cacba7e16cb6aff1bc3e077f3f9e036e59b9107a8415bf9242d0c2d97" dependencies = [ "noodles-bam", "noodles-bcf", + "noodles-bed", "noodles-bgzf", "noodles-core", "noodles-cram", @@ -2951,6 +2951,7 @@ dependencies = [ "noodles-fasta", "noodles-fastq", "noodles-gff", + "noodles-gtf", "noodles-sam", "noodles-tabix", "noodles-vcf", @@ -2958,13 +2959,12 @@ dependencies = [ [[package]] name = "noodles-bam" -version = "0.70.0" +version = "0.76.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"57dbbc6b91efed384ceac16c1f4f764bc07d4941bacf7d5d7fdccfef48f52031" +checksum = "9ac61d405ab62bd0a01956a2b1d83a7d5ce8b207ca0caf91e2b3472037f8eac8" dependencies = [ "bstr", "byteorder", - "bytes", "futures", "indexmap", "memchr", @@ -2972,30 +2972,45 @@ dependencies = [ "noodles-core", "noodles-csi", "noodles-sam", + "pin-project-lite", "tokio", ] [[package]] name = "noodles-bcf" -version = "0.64.0" +version = "0.71.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7280783c4f1b8adc6c27d2122978cb3ce0c3dee241394a6af2a72df9b312f7ba" +checksum = "c76d44299bdc4c950dcbbfeb87b99e79de9c4082f51d892a6d7aa194fba27613" dependencies = [ "byteorder", "futures", "indexmap", + "memchr", "noodles-bgzf", "noodles-core", "noodles-csi", "noodles-vcf", + "pin-project-lite", "tokio", ] +[[package]] +name = "noodles-bed" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48a1daed588234e40eebfe1766368a8594244b2ca312719d46e298f1933083b2" +dependencies = [ + "bstr", + "lexical-core", + "memchr", + "noodles-core", +] + [[package]] name = "noodles-bgzf" -version = "0.33.0" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b50aaa8f0a3c8a0b738b641a6d1a78d9fd30a899ab2d398779ee3c4eb80f1c1" +checksum = "c10c4cb29950028823653395ebedfcaf5b652e504aacf816b510fb0268daa2eb" dependencies = [ "byteorder", "bytes", @@ -3009,24 +3024,23 @@ dependencies = [ [[package]] name = "noodles-core" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5a8c6b020d1205abef2b0fab4463a6c5ecc3c8f4d561ca8b0d1a42323376200" +checksum = "962b13b79312f773a12ffcb0cdaccab6327f8343b6f440a888eff10c749d52b0" dependencies = [ "bstr", ] [[package]] name = "noodles-cram" -version = "0.71.0" +version = "0.79.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bd62fd2ae0b45fe9792fd07f439294f3a8c059069cdad6c45e9d8224a26431af" +checksum = "c75663c2a0d6f0142581f810702add845d99b89dea2e3b8b472f7a18b7ffc2c3" dependencies = [ "async-compression", "bitflags", "bstr", "byteorder", - "bytes", "bzip2", "flate2", "futures", @@ -3043,9 +3057,9 @@ dependencies = [ [[package]] name = "noodles-csi" -version = "0.40.0" +version = "0.44.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99fdf34a300b37bc15160afcbec76487750bf6e3d9a6ff69c8e4fd635bc66745" +checksum = "45b402963653cacd5df474889a1cfb969e2f66469023836ad3b0a2c75e2b4e7c" dependencies = [ "bit-vec", "bstr", @@ -3058,9 +3072,9 @@ dependencies = [ [[package]] name = "noodles-fasta" -version = "0.45.0" +version = "0.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde46cd7109fec9bb035ddceb95476531d057c1e61106b79a3a5b8d7ee7d5ee9" +checksum = "fbd5e79e45ac2ab99e912b59a1f3dd1efcbe3ef419bf0e6fcc8eec9b425a1c1f" dependencies = [ "bstr", "bytes", @@ -3072,9 +3086,9 @@ dependencies = [ [[package]] name = "noodles-fastq" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1606247d99eae65370cdb0ef5590f109a5286d57c06da8e738466cf95a4509d5" +checksum = "fb444bbacd329b4d867086de4bcf159e80a19f9832c9ea04aa0f5867cb77255c" dependencies = [ "bstr", "futures", @@ -3084,9 +3098,9 @@ dependencies = [ [[package]] name = "noodles-gff" -version = "0.39.0" +version = "0.44.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31846b5fc99ad5c03afe8518d97f046c69e4a55dba9e6ea572c1553cc2731473" +checksum = "ff9038cf76a843a570683b28bd500c3994d491f6452cdfb9a45ec38892d16564" dependencies = [ "futures", "indexmap", @@ -3097,11 +3111,22 @@ dependencies = [ "tokio", ] +[[package]] +name = "noodles-gtf" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94e375815c4c1881709b2eb22127ab50943e8418134a0505811f6a2da1855d60" 
+dependencies = [ + "noodles-bgzf", + "noodles-core", + "noodles-csi", +] + [[package]] name = "noodles-sam" -version = "0.66.0" +version = "0.72.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c26c31cfbb05b20fc8fb3cf1ac556efca0ff4b3455112705dd2ea81ae01e7d81" +checksum = "9c7f543ae568acd97307c261fe9b34225af2e625a74d077e992ca8b37afae793" dependencies = [ "bitflags", "bstr", @@ -3112,14 +3137,15 @@ dependencies = [ "noodles-bgzf", "noodles-core", "noodles-csi", + "pin-project-lite", "tokio", ] [[package]] name = "noodles-tabix" -version = "0.46.0" +version = "0.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f378e617a2ce78c59e33aeb66c8f70ab0c259fbf857be304ea1cf2d9ea9d8be4" +checksum = "e46ee069d57770840554e3d8bd02e7415b3a2d972cdea7525bea02472032c234" dependencies = [ "byteorder", "indexmap", @@ -3131,9 +3157,9 @@ dependencies = [ [[package]] name = "noodles-vcf" -version = "0.68.0" +version = "0.74.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb178aa387c98c386d92a6ef14881fd84efaf279626cde851587f44bfcac6879" +checksum = "9d68867aeaeda06d1149c85b4ebbefc45d31786f41b34868195ccd9dad7db532" dependencies = [ "futures", "indexmap", diff --git a/Cargo.toml b/Cargo.toml index 915ffc42..4ba97b8a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ include_dir = "0.7.4" intervaltree = "0.2.7" itertools = "0.13.0" lexical-core = "1.0.2" -noodles = { version = "0.85.0", features = ["async", "bgzf", "core", "fasta", "gff", "vcf"] } +noodles = { version = "0.93.0", features = ["async", "bed", "bgzf", "core", "fasta", "gff", "gtf", "vcf"] } petgraph = "0.6.5" remove_dir_all = "1.0.0" rusqlite = { version = "0.32.1", features = ["bundled", "array", "session"] } diff --git a/fixtures/beds/simple.bed b/fixtures/beds/simple.bed new file mode 100644 index 00000000..5da53bb2 --- /dev/null +++ b/fixtures/beds/simple.bed @@ -0,0 +1,4 @@ +m123 1 10 abc123.1 0 - 1 10 0,0,0 3 
102,188,129, 0,3508,4691, +m123 5 8 xyz.1 0 - 5 8 0,0,0 1 113, 0, +m123 10 16 xyz.2 0 + 10 16 0,0,0 2 142,326, 0,10710, +m123 14 23 foo.1 0 + 14 23 0,0,0 2 142,326, 0,10710, diff --git a/fixtures/gffs/simple.gff3 b/fixtures/gffs/simple.gff3 new file mode 100644 index 00000000..b64ac0bf --- /dev/null +++ b/fixtures/gffs/simple.gff3 @@ -0,0 +1,8 @@ +m123 HAVANA gene 1 20 . - . ID=ENSG00000294541.1 +m123 HAVANA transcript 1 20 . - . ID=ENST00000724296.1;Parent=ENSG00000294541.1 +m123 HAVANA exon 4 8 . - . ID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1 +m123 HAVANA exon 10 14 . - . ID=exon:ENST00000724296.1:2;Parent=ENST00000724296.1 +m123 HAVANA exon 16 19 . - . ID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1 +m123 ENSEMBL gene 3 15 . - . ID=ENSG00000277248.1 +m123 ENSEMBL transcript 3 15 . - . ID=ENST00000615943.1;Parent=ENSG00000277248.1 +m123 ENSEMBL exon 3 15 . - . ID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1 diff --git a/src/annotations/gff.rs b/src/annotations/gff.rs index df2daffb..fd3da6d8 100644 --- a/src/annotations/gff.rs +++ b/src/annotations/gff.rs @@ -47,7 +47,7 @@ pub fn propagate_gff( .map(|(name, path)| (name.clone(), path.sequence(conn).len() as i64)) .collect::>(); - for result in reader.records() { + for result in reader.record_bufs() { let record = result?; let path_name = record.reference_sequence_name().to_string(); let annotation = Annotation { @@ -62,7 +62,7 @@ pub fn propagate_gff( let score = record.score(); let phase = record.phase(); - let mut updated_record_builder = gff::Record::builder() + let mut updated_record_builder = gff::RecordBuf::builder() .set_reference_sequence_name(path_name) .set_source(record.source().to_string()) .set_type(record.ty().to_string()) @@ -156,7 +156,7 @@ mod tests { for (i, result) in reader .expect("Could not read output file!") - .records() + .record_bufs() .enumerate() { let record = result.unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 4afc9652..42f1b540 100644 --- a/src/lib.rs +++ 
b/src/lib.rs @@ -21,6 +21,7 @@ mod progress_bar; pub mod range; #[cfg(test)] pub mod test_helpers; +pub mod translate; pub mod updates; pub mod views; diff --git a/src/main.rs b/src/main.rs index 2d07ee55..c1a655b6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,6 +24,7 @@ use gen::models::sample::Sample; use gen::operation_management; use gen::operation_management::{parse_patch_operations, OperationError}; use gen::patch; +use gen::translate; use gen::updates::fasta::update_with_fasta; use gen::updates::gaf::{transform_csv_to_fasta, update_with_gaf}; use gen::updates::genbank::update_with_genbank; @@ -36,11 +37,10 @@ use gen::views::patch::view_patches; use itertools::Itertools; use noodles::core::Region; -use noodles::gff::directive::name; use rusqlite::{types::Value, Connection}; use std::fmt::Debug; use std::fs::File; -use std::io::Write; +use std::io::{BufReader, Write}; use std::ops::Deref; use std::path::{Path, PathBuf}; use std::{io, str}; @@ -73,6 +73,22 @@ enum Commands { #[arg(long)] format_csv_for_gaf: Option, }, + /// Translate coordinates of standard bioinformatic file formats. + #[command(arg_required_else_help(true))] + Translate { + /// Transform coordinates of a BED to graph nodes + #[arg(long)] + bed: Option<PathBuf>, + /// Transform coordinates of a GFF to graph nodes + #[arg(long)] + gff: Option<PathBuf>, + /// The name of the collection to map sequences against + #[arg(short, long)] + collection: Option<String>, + /// The sample name whose graph coordinates are mapped against + #[arg(short, long)] + sample: Option<String>, + }, /// Import a new sequence collection.
#[command(arg_required_else_help(true))] Import { @@ -703,6 +719,49 @@ fn main() { conn.execute("END TRANSACTION", []).unwrap(); operation_conn.execute("END TRANSACTION", []).unwrap(); } + Some(Commands::Translate { + bed, + gff, + collection, + sample, + }) => { + let collection = &collection + .clone() + .unwrap_or_else(|| get_default_collection(&operation_conn)); + if let Some(bed) = bed { + let stdout = io::stdout(); + let mut handle = stdout.lock(); + let mut bed_file = File::open(bed).unwrap(); + match translate::bed::translate_bed( + &conn, + collection, + sample.as_deref(), + &mut bed_file, + &mut handle, + ) { + Ok(_) => {} + Err(err) => { + panic!("Error Translating Bed. {err}"); + } + } + } else if let Some(gff) = gff { + let stdout = io::stdout(); + let mut handle = stdout.lock(); + let mut gff_file = BufReader::new(File::open(gff).unwrap()); + match translate::gff::translate_gff( + &conn, + collection, + sample.as_deref(), + &mut gff_file, + &mut handle, + ) { + Ok(_) => {} + Err(err) => { + panic!("Error Translating GFF. 
{err}"); + } + } + } + } Some(Commands::Operations { interactive, branch, diff --git a/src/models/strand.rs b/src/models/strand.rs index 6fdd276a..fb7ef275 100644 --- a/src/models/strand.rs +++ b/src/models/strand.rs @@ -1,3 +1,4 @@ +use noodles::gff::record::Strand as GFFStrand; use rusqlite::types::{FromSql, FromSqlResult, ToSqlOutput, Value, ValueRef}; use rusqlite::ToSql; use serde::{Deserialize, Serialize}; @@ -11,6 +12,17 @@ pub enum Strand { ImportantButUnknown, } +impl Strand { + pub fn is_ambiguous(a: Strand) -> bool { + a == Strand::ImportantButUnknown || a == Strand::Unknown + } + pub fn is_compatible(a: Strand, b: Strand) -> bool { + let a_ambig = Strand::is_ambiguous(a); + let b_ambig = Strand::is_ambiguous(b); + (a_ambig || b_ambig) || a == b + } +} + // example https://docs.rs/rusqlite/latest/rusqlite/types/index.html impl ToSql for Strand { fn to_sql(&self) -> rusqlite::Result> { @@ -36,6 +48,28 @@ impl From for Value { } } +impl From for Strand { + fn from(value: GFFStrand) -> Strand { + match value { + GFFStrand::Forward => Strand::Forward, + GFFStrand::Unknown => Strand::Unknown, + GFFStrand::Reverse => Strand::Reverse, + GFFStrand::None => Strand::Unknown, + } + } +} + +impl From for GFFStrand { + fn from(value: Strand) -> GFFStrand { + match value { + Strand::Forward => GFFStrand::Forward, + Strand::Unknown => GFFStrand::Unknown, + Strand::Reverse => GFFStrand::Reverse, + Strand::ImportantButUnknown => GFFStrand::None, + } + } +} + impl FromSql for Strand { fn column_result(value: ValueRef) -> FromSqlResult { let result = match value.as_str() { diff --git a/src/translate.rs b/src/translate.rs new file mode 100644 index 00000000..f3ee3e41 --- /dev/null +++ b/src/translate.rs @@ -0,0 +1,4 @@ +pub mod bed; +pub mod gff; +#[cfg(test)] +pub mod test_helpers; diff --git a/src/translate/bed.rs b/src/translate/bed.rs new file mode 100644 index 00000000..0737a305 --- /dev/null +++ b/src/translate/bed.rs @@ -0,0 +1,157 @@ +use 
crate::graph::{project_path, GraphNode}; +use crate::models::block_group::BlockGroup; +use crate::models::node::Node; +use crate::models::sample::Sample; +use crate::models::strand::Strand; +use interavl::IntervalTree; +use noodles::bed; +use noodles::bed::feature::record_buf::{other_fields::Value, OtherFields}; +use noodles::core::Position; +use rusqlite::Connection; +use std::cmp::{max, min}; +use std::collections::HashMap; +use std::io::{Error, Read, Write}; + +pub fn translate_bed<'a, R, W>( + conn: &Connection, + collection: &str, + sample: impl Into<Option<&'a str>>, + reader: R, + writer: &mut W, +) -> Result<(), Error> +where + R: Read, + W: Write, +{ + let sample = sample.into(); + let mut record = bed::Record::default(); + let mut bed_reader = bed::io::reader::Builder::<3>.build_from_reader(reader); + let mut bed_writer = bed::io::Writer::<3, _>::new(writer); + + let bgs = Sample::get_block_groups(conn, collection, sample); + let sample_bgs: HashMap<String, &BlockGroup> = HashMap::from_iter( + bgs.iter() + .map(|bg| (bg.name.clone(), bg)) + .collect::<Vec<_>>(), + ); + let mut paths: HashMap<i64, IntervalTree<i64, (GraphNode, Strand)>> = HashMap::new(); + + while bed_reader.read_record(&mut record)? != 0 { + let ref_name = record.reference_sequence_name().to_string(); + // noodles converts to 1 index, keep it 0.
+ let start = record.feature_start().unwrap().get() as i64 - 1; + let end = record.feature_end().unwrap().unwrap().get() as i64; + if let Some(bg) = sample_bgs.get(&ref_name) { + let projection = paths.entry(bg.id).or_insert_with(|| { + let path = BlockGroup::get_current_path(conn, bg.id); + let graph = BlockGroup::get_graph(conn, bg.id); + let mut tree = IntervalTree::default(); + let mut position: i64 = 0; + for (node, strand) in project_path(&graph, &path.blocks(conn)) { + if !Node::is_terminal(node.node_id) { + let end_position = position + node.length(); + tree.insert(position..end_position, (node, strand)); + position = end_position; + } + } + tree + }); + let range = start..end; + let values: Vec<_> = record.other_fields().iter().map(Value::from).collect(); + let other_fields = OtherFields::from(values); + for (overlap, (node, _strand)) in projection.iter_overlaps(&range) { + let overlap_start = max(start, overlap.start) as usize; + let overlap_end = min(end, overlap.end) as usize; + let out_record = bed::feature::RecordBuf::<3>::builder() + .set_reference_sequence_name(format!("{nid}", nid = node.node_id)) + .set_feature_start(Position::try_from(overlap_start + 1).unwrap()) + .set_feature_end(Position::try_from(overlap_end).unwrap()) + .set_other_fields(other_fields.clone()) + .build(); + bed_writer.write_feature_record(&out_record)?; + } + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::models::metadata; + use crate::models::operations::setup_db; + use crate::test_helpers::{get_connection, get_operation_connection, setup_gen_dir}; + use crate::translate::bed::translate_bed; + use crate::translate::test_helpers::get_simple_sequence; + use crate::updates::vcf::update_with_vcf; + use std::fs::File; + use std::path::PathBuf; + + #[test] + fn translates_coordinates_to_nodes() { + setup_gen_dir(); + let conn = &get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, 
&db_uuid); + + let vcf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/simple.vcf"); + let bed_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/beds/simple.bed"); + let collection = "test".to_string(); + + get_simple_sequence(conn); + + update_with_vcf( + &vcf_path.to_str().unwrap().to_string(), + &collection, + "".to_string(), + "".to_string(), + conn, + op_conn, + None, + ) + .unwrap(); + let mut buffer = Vec::new(); + // "foo" is a sample from simple.vcf + translate_bed( + conn, + &collection, + "foo", + File::open(bed_path.clone()).unwrap(), + &mut buffer, + ) + .unwrap(); + let results = String::from_utf8(buffer).unwrap(); + assert_eq!( + results, + "\ + 3\t1\t3\tabc123.1\t0\t-\t1\t10\t0,0,0\t3\t102,188,129,\t0,3508,4691,\n\ + 3\t3\t5\tabc123.1\t0\t-\t1\t10\t0,0,0\t3\t102,188,129,\t0,3508,4691,\n\ + 3\t5\t10\tabc123.1\t0\t-\t1\t10\t0,0,0\t3\t102,188,129,\t0,3508,4691,\n\ + 3\t5\t8\txyz.1\t0\t-\t5\t8\t0,0,0\t1\t113,\t0,\n\ + 3\t10\t16\txyz.2\t0\t+\t10\t16\t0,0,0\t2\t142,326,\t0,10710,\n\ + 3\t14\t17\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n\ + 4\t17\t23\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n" + ); + + // The None sample has no variants, so should be a simple mapping and covers the split node + let mut buffer = Vec::new(); + translate_bed( + conn, + &collection, + None, + File::open(bed_path).unwrap(), + &mut buffer, + ) + .unwrap(); + let results = String::from_utf8(buffer).unwrap(); + assert_eq!( + results, + "\ + 3\t1\t10\tabc123.1\t0\t-\t1\t10\t0,0,0\t3\t102,188,129,\t0,3508,4691,\n\ + 3\t5\t8\txyz.1\t0\t-\t5\t8\t0,0,0\t1\t113,\t0,\n\ + 3\t10\t16\txyz.2\t0\t+\t10\t16\t0,0,0\t2\t142,326,\t0,10710,\n\ + 3\t14\t17\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n\ + 4\t17\t23\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n" + ); + } +} diff --git a/src/translate/gff.rs b/src/translate/gff.rs new file mode 100644 index 00000000..550f14b8 --- /dev/null +++ b/src/translate/gff.rs @@ -0,0 +1,187 @@ 
+use crate::graph::{project_path, GraphNode}; +use crate::models::block_group::BlockGroup; +use crate::models::node::Node; +use crate::models::sample::Sample; +use crate::models::strand::Strand; +use interavl::IntervalTree; +use noodles::core::Position; +use noodles::gff; +use rusqlite::Connection; +use std::cmp::{max, min}; +use std::collections::HashMap; +use std::io::{BufRead, Error, Read, Write}; + +pub fn translate_gff<'a, R, W>( + conn: &Connection, + collection: &str, + sample: impl Into<Option<&'a str>>, + reader: R, + writer: &mut W, +) -> Result<(), Error> +where + R: Read + BufRead, + W: Write, +{ + let sample = sample.into(); + let mut gff_reader = gff::io::Reader::new(reader); + let mut gff_writer = gff::io::Writer::new(writer); + + let bgs = Sample::get_block_groups(conn, collection, sample); + let sample_bgs: HashMap<String, &BlockGroup> = HashMap::from_iter( + bgs.iter() + .map(|bg| (bg.name.clone(), bg)) + .collect::<Vec<_>>(), + ); + let mut paths: HashMap<i64, IntervalTree<i64, (GraphNode, Strand)>> = HashMap::new(); + + for result in gff_reader.record_bufs() { + let record = result?; + let ref_name = record.reference_sequence_name(); + let start = record.start().get() as i64; + let end = record.end().get() as i64; + if let Some(bg) = sample_bgs.get(ref_name) { + let projection = paths.entry(bg.id).or_insert_with(|| { + let path = BlockGroup::get_current_path(conn, bg.id); + let graph = BlockGroup::get_graph(conn, bg.id); + let mut tree = IntervalTree::default(); + let mut position: i64 = 0; + for (node, strand) in project_path(&graph, &path.blocks(conn)) { + if !Node::is_terminal(node.node_id) { + // GFF indexing is one based, inclusive, so we add 1 to the start.
+ // Take a sequence that is 1-4 in our coordinates, this converts to: + // 0123456 + // ATCGATC + // 1234567 + // 1-4 in our zero-based half open interval would be 2-4 in GFF coordinates + let end_position = position + node.length(); + tree.insert(position + 1..end_position, (node, strand)); + position = end_position; + } + } + tree + }); + let range = start..end; + for (overlap, (node, _overlap_strand)) in projection.iter_overlaps(&range) { + let overlap_start = max(start, overlap.start) as usize; + let overlap_end = min(end, overlap.end) as usize; + + let updated_record_builder = + gff::RecordBuf::builder() + .set_reference_sequence_name(format!("{nid}", nid = node.node_id)) + .set_source(record.source().to_string()) + .set_type(record.ty().to_string()) + .set_start(Position::try_from(overlap_start).unwrap_or_else(|_| panic!( + "Could not convert start ({overlap_start}) to usize for propagation", ))) + .set_end(Position::try_from(overlap_end).unwrap_or_else(|_| panic!( + "Could not convert end ({overlap_end}) to usize for propagation", ))) + .set_strand(record.strand()) + .set_attributes(record.attributes().clone()); + gff_writer.write_record(&updated_record_builder.build())?; + } + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::models::metadata; + use crate::models::operations::setup_db; + use crate::test_helpers::{get_connection, get_operation_connection, setup_gen_dir}; + use crate::translate::test_helpers::get_simple_sequence; + use crate::updates::vcf::update_with_vcf; + use std::fs::File; + use std::io::BufReader; + use std::path::PathBuf; + + #[test] + fn translates_coordinates_to_nodes() { + setup_gen_dir(); + let conn = &get_connection(None); + let db_uuid = metadata::get_db_uuid(conn); + let op_conn = &get_operation_connection(None); + setup_db(op_conn, &db_uuid); + + let vcf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/simple.vcf"); + let gff_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/gffs/simple.gff3"); + let
collection = "test".to_string(); + + get_simple_sequence(conn); + update_with_vcf( + &vcf_path.to_str().unwrap().to_string(), + &collection, + "".to_string(), + "".to_string(), + conn, + op_conn, + None, + ) + .unwrap(); + let mut buffer = Vec::new(); + // "foo" is a sample from simple.vcf + translate_gff( + conn, + &collection, + "foo", + BufReader::new(File::open(gff_path.clone()).unwrap()), + &mut buffer, + ) + .unwrap(); + let results = String::from_utf8(buffer).unwrap(); + assert_eq!( + results, + "\ + 3\tHAVANA\tgene\t1\t3\t.\t-\t.\tID=ENSG00000294541.1\n\ + 3\tHAVANA\tgene\t4\t5\t.\t-\t.\tID=ENSG00000294541.1\n\ + 3\tHAVANA\tgene\t6\t17\t.\t-\t.\tID=ENSG00000294541.1\n\ + 4\tHAVANA\tgene\t18\t20\t.\t-\t.\tID=ENSG00000294541.1\n\ + 3\tHAVANA\ttranscript\t1\t3\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n\ + 3\tHAVANA\ttranscript\t4\t5\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n\ + 3\tHAVANA\ttranscript\t6\t17\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n\ + 4\tHAVANA\ttranscript\t18\t20\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n\ + 3\tHAVANA\texon\t4\t5\t.\t-\t.\tID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1\n\ + 3\tHAVANA\texon\t6\t8\t.\t-\t.\tID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1\n\ + 3\tHAVANA\texon\t10\t14\t.\t-\t.\tID=exon:ENST00000724296.1:2;Parent=ENST00000724296.1\n\ + 3\tHAVANA\texon\t16\t17\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n\ + 4\tHAVANA\texon\t18\t19\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n\ + 3\tENSEMBL\tgene\t4\t5\t.\t-\t.\tID=ENSG00000277248.1\n\ + 3\tENSEMBL\tgene\t6\t15\t.\t-\t.\tID=ENSG00000277248.1\n\ + 3\tENSEMBL\ttranscript\t4\t5\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n\ + 3\tENSEMBL\ttranscript\t6\t15\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n\ + 3\tENSEMBL\texon\t4\t5\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n\ + 
3\tENSEMBL\texon\t6\t15\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n\ + " + ); + + // The None sample has no variants, so should be a simple mapping with the single split + let mut buffer = Vec::new(); + translate_gff( + conn, + &collection, + None, + BufReader::new(File::open(gff_path.clone()).unwrap()), + &mut buffer, + ) + .unwrap(); + let results = String::from_utf8(buffer).unwrap(); + assert_eq!( + results, + "\ + 3\tHAVANA\tgene\t1\t17\t.\t-\t.\tID=ENSG00000294541.1\n\ + 4\tHAVANA\tgene\t18\t20\t.\t-\t.\tID=ENSG00000294541.1\n\ + 3\tHAVANA\ttranscript\t1\t17\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n\ + 4\tHAVANA\ttranscript\t18\t20\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n\ + 3\tHAVANA\texon\t4\t8\t.\t-\t.\tID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1\n\ + 3\tHAVANA\texon\t10\t14\t.\t-\t.\tID=exon:ENST00000724296.1:2;Parent=ENST00000724296.1\n\ + 3\tHAVANA\texon\t16\t17\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n\ + 4\tHAVANA\texon\t18\t19\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n\ + 3\tENSEMBL\tgene\t3\t15\t.\t-\t.\tID=ENSG00000277248.1\n\ + 3\tENSEMBL\ttranscript\t3\t15\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n\ + 3\tENSEMBL\texon\t3\t15\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n\ + " + ); + } +} diff --git a/src/translate/test_helpers.rs b/src/translate/test_helpers.rs new file mode 100644 index 00000000..cd02506a --- /dev/null +++ b/src/translate/test_helpers.rs @@ -0,0 +1,99 @@ +use crate::calculate_hash; +use crate::models::block_group::BlockGroup; +use crate::models::block_group_edge::{BlockGroupEdge, BlockGroupEdgeData}; +use crate::models::collection::Collection; +use crate::models::edge::Edge; +use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}; +use crate::models::path::Path; +use crate::models::sequence::Sequence; +use crate::models::strand::Strand; +use rusqlite::Connection; + +pub fn 
get_simple_sequence(conn: &Connection) -> i64 { + let collection = Collection::create(conn, "test"); + let seq1 = Sequence::new() + .sequence_type("DNA") + .sequence("ATCGATCGATCGATCGA") + .save(conn); + let seq2 = Sequence::new() + .sequence_type("DNA") + .sequence("TCGGGAACACACAGAGA") + .save(conn); + let node1 = Node::create( + conn, + &seq1.hash, + calculate_hash(&format!( + "{collection}.m123:{hash}", + collection = collection.name, + hash = seq1.hash + )), + ); + let node2 = Node::create( + conn, + &seq2.hash, + calculate_hash(&format!( + "{collection}.m123:{hash}", + collection = collection.name, + hash = seq2.hash + )), + ); + let block_group = BlockGroup::create(conn, &collection.name, None, "m123"); + + let edge_into = Edge::create( + conn, + PATH_START_NODE_ID, + 0, + Strand::Forward, + node1, + 0, + Strand::Forward, + ); + let middle_edge = Edge::create( + conn, + node1, + seq1.length, + Strand::Forward, + node2, + 0, + Strand::Forward, + ); + let edge_out_of = Edge::create( + conn, + node2, + seq2.length, + Strand::Forward, + PATH_END_NODE_ID, + 0, + Strand::Forward, + ); + + let new_block_group_edges = vec![ + BlockGroupEdgeData { + block_group_id: block_group.id, + edge_id: edge_into.id, + chromosome_index: 0, + phased: 0, + }, + BlockGroupEdgeData { + block_group_id: block_group.id, + edge_id: middle_edge.id, + chromosome_index: 0, + phased: 0, + }, + BlockGroupEdgeData { + block_group_id: block_group.id, + edge_id: edge_out_of.id, + chromosome_index: 0, + phased: 0, + }, + ]; + + BlockGroupEdge::bulk_create(conn, &new_block_group_edges); + Path::create( + conn, + "m123", + block_group.id, + &[edge_into.id, middle_edge.id, edge_out_of.id], + ); + block_group.id +}