Skip to content

Commit a083f7c

Browse files
committed
adding seqspec rust tooling for python-rust parity
1 parent ecf412a commit a083f7c

25 files changed

+3794
-65
lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,15 @@ crate-type = ["rlib", "cdylib"]
1313

1414
[dependencies]
1515
clap = { version = "4.5.46", features = ["derive"] }
16+
jsonschema = "0.33.0"
1617
pyo3 = { version = "0.25", optional = true, features = ["extension-module", "abi3-py312"] }
1718
# pythonize = "0.25.0"
1819
serde = { version = "1", features = ["derive"] }
1920
serde_json = "1"
2021
serde_yaml = "0.9"
2122
thiserror = "1"
23+
reqwest = { version = "0.12", features = ["blocking", "rustls-tls"] }
24+
flate2 = "1"
2225

2326

2427
[features]

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,22 @@ Ali Sina Booeshaghi, Xi Chen, Lior Pachter, A machine-readable specification for
3737
- [Contribute a `seqspec` : `docs/CONTRIBUTING.md`](docs/CONTRIBUTING.md)
3838
- [Watch a YouTube video about `seqspec`](https://youtu.be/NSj6Vpzy8tU)
3939
- [Read the manuscript that describes `seqspec`](https://doi.org/10.1093/bioinformatics/btae168)
40+
41+
## Rust implementation
42+
43+
- [ ] build : Generate a complete seqspec with natural language.
44+
- [x] check : Validate seqspec file against specification (verify check)
45+
- [x] find : Find objects in seqspec file
46+
- [x] file : List files present in seqspec file
47+
- [x] format : Autoformat seqspec file
48+
- [x] index : Identify position of elements in seqspec file
49+
- [x] info : Get information from seqspec file
50+
- [x] init : Generate a new empty seqspec file
51+
- [x] insert : Insert regions or reads into an existing spec (TODO: move Input structs to models)
52+
- [x] methods : Convert seqspec file into methods section
53+
- [x] modify : Modify attributes of various elements in seqspec file
54+
- [x] onlist : Get onlist file for elements in seqspec file
55+
- [ ] print : Display the sequence and/or library structure from seqspec file
56+
- [x] split : Split seqspec file by modality
57+
- [x] upgrade : Upgrade seqspec file to current version
58+
- [x] version : Get seqspec tool version and seqspec file version

seqspec/seqspec_index.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -353,9 +353,11 @@ def get_coordinate_by_read_id(spec: Assay, modality: str, read_id: str) -> Coord
353353

354354
return coord
355355

356+
356357
FEATURE_REGION_TYPES = {"CDNA", "GDNA", "PROTEIN", "TAG", "SGRNA_TARGET"}
357358

358-
def format_kallisto_bus(indices: List[Coordinate], subregion_type=None):
359+
360+
def format_kallisto_bus(indices: List[Coordinate], subregion_type=None) -> str:
359361
bcs = []
360362
umi = []
361363
feature = []
@@ -376,7 +378,9 @@ def format_kallisto_bus(indices: List[Coordinate], subregion_type=None):
376378
return x
377379

378380

379-
def format_kallisto_bus_force_single(indices: List[Coordinate], subregion_type=None):
381+
def format_kallisto_bus_force_single(
382+
indices: List[Coordinate], subregion_type=None
383+
) -> str:
380384
bcs = []
381385
umi = []
382386
feature = []
@@ -408,7 +412,7 @@ def format_kallisto_bus_force_single(indices: List[Coordinate], subregion_type=N
408412

409413
# this one should only return one string
410414
# TODO: return to this
411-
def format_seqkit_subseq(indices: List[Coordinate], subregion_type=None):
415+
def format_seqkit_subseq(indices: List[Coordinate], subregion_type=None) -> str:
412416
# The x string format is start:stop (1-indexed)
413417
# x = ""
414418
# region = indices[0]
@@ -422,7 +426,7 @@ def format_seqkit_subseq(indices: List[Coordinate], subregion_type=None):
422426
return x
423427

424428

425-
def format_tab(indices: List[Coordinate], subregion_type=None):
429+
def format_tab(indices: List[Coordinate], subregion_type=None) -> str:
426430
x = ""
427431
for idx, coord in enumerate(indices):
428432
rcv = coord.rcv
@@ -433,7 +437,7 @@ def format_tab(indices: List[Coordinate], subregion_type=None):
433437
return x[:-1]
434438

435439

436-
def format_starsolo(indices: List[Coordinate], subregion_type=None):
440+
def format_starsolo(indices: List[Coordinate], subregion_type=None) -> str:
437441
bcs = []
438442
umi = []
439443
cdna = []
@@ -451,7 +455,7 @@ def format_starsolo(indices: List[Coordinate], subregion_type=None):
451455
return x
452456

453457

454-
def format_simpleaf(indices: List[Coordinate], subregion_type=None):
458+
def format_simpleaf(indices: List[Coordinate], subregion_type=None) -> str:
455459
x = ""
456460
xl = []
457461
for idx, coord in enumerate(indices):
@@ -469,7 +473,7 @@ def format_simpleaf(indices: List[Coordinate], subregion_type=None):
469473
return "".join(xl)
470474

471475

472-
def format_zumis(indices: List[Coordinate], subregion_type=None):
476+
def format_zumis(indices: List[Coordinate], subregion_type=None) -> str:
473477
xl = []
474478
for idx, coord in enumerate(indices):
475479
x = ""
@@ -486,7 +490,7 @@ def format_zumis(indices: List[Coordinate], subregion_type=None):
486490

487491

488492
def stable_deduplicate_fqs(fqs):
489-
# stably deduplicate gdna_fqs
493+
# stably deduplicate fqs
490494
seen_fqs = set()
491495
deduplicated_fqs = []
492496
for r in fqs:
@@ -496,7 +500,7 @@ def stable_deduplicate_fqs(fqs):
496500
return deduplicated_fqs
497501

498502

499-
def format_chromap(indices: List[Coordinate], subregion_type=None):
503+
def format_chromap(indices: List[Coordinate], subregion_type=None) -> str:
500504
bc_fqs = []
501505
bc_str = []
502506
gdna_fqs = []
@@ -563,7 +567,7 @@ def filter_groupby_region_type(g, keep=["umi", "barcode", "cdna"]):
563567
return g
564568

565569

566-
def format_relative(indices: List[Coordinate], subregion_type=None):
570+
def format_relative(indices: List[Coordinate], subregion_type=None) -> str:
567571
x = ""
568572
for idx, coord in enumerate(indices):
569573
rg_strand = coord.strand # noqa
@@ -622,9 +626,6 @@ def groupby_region_type(rgns):
622626
return d
623627

624628

625-
# def group_regions_by_region_type(rgns):
626-
627-
628629
def format_splitcode_row(obj, rgncdiffs, idx=0, rev=False, complement=False):
629630
# print(obj.region_id, idx)
630631
# TODO only have one object left and one object right of the sequence
@@ -689,7 +690,7 @@ def format_splitcode_row(obj, rgncdiffs, idx=0, rev=False, complement=False):
689690
return {"region_type": obj.region_type, "fmt": e}
690691

691692

692-
def format_splitcode(indices: List[Coordinate], subregion_type=None):
693+
def format_splitcode(indices: List[Coordinate], subregion_type=None) -> str:
693694
# extraction based on fixed sequences
694695
# extraction based on onlist sequences
695696
# umi - bc3 - link2 - bc2 - link1 - bc1 - read

src/lib.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,18 @@ pub use models::{file, region, read, onlist, assay};
44
pub mod utils;
55
pub mod seqspec_version;
66
pub mod seqspec_format;
7+
pub mod seqspec_find;
8+
pub mod seqspec_index;
9+
pub mod seqspec_file;
10+
pub mod seqspec_split;
11+
pub mod seqspec_info;
12+
pub mod seqspec_init;
13+
pub mod seqspec_methods;
14+
pub mod seqspec_modify;
15+
pub mod seqspec_upgrade;
16+
pub mod seqspec_insert;
17+
pub mod seqspec_check;
18+
pub mod seqspec_onlist;
719

820
// #[cfg(feature = "python-binding")]
921
// mod py_module; // lives in src/py_module.rs

src/main.rs

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,18 @@
55

66
use seqspec::seqspec_version;
77
use seqspec::seqspec_format;
8+
use seqspec::seqspec_find;
9+
use seqspec::seqspec_index;
10+
use seqspec::seqspec_file;
11+
use seqspec::seqspec_split;
12+
use seqspec::seqspec_info;
13+
use seqspec::seqspec_init;
14+
use seqspec::seqspec_methods;
15+
use seqspec::seqspec_modify;
16+
use seqspec::seqspec_upgrade;
17+
use seqspec::seqspec_insert;
18+
use seqspec::seqspec_check;
19+
use seqspec::seqspec_onlist;
820
use seqspec::utils;
921

1022
use clap::{Parser, Subcommand};
@@ -20,13 +32,37 @@ struct Args {
2032
enum Commands {
2133
Version(seqspec_version::VersionArgs),
2234
Format(seqspec_format::FormatArgs),
35+
Find(seqspec_find::FindArgs),
36+
Index(seqspec_index::IndexArgs),
37+
File(seqspec_file::FileArgs),
38+
Split(seqspec_split::SplitArgs),
39+
Info(seqspec_info::InfoArgs),
40+
Init(seqspec_init::InitArgs),
41+
Methods(seqspec_methods::MethodsArgs),
42+
Modify(seqspec_modify::ModifyArgs),
43+
Upgrade(seqspec_upgrade::UpgradeArgs),
44+
Insert(seqspec_insert::InsertArgs),
45+
Check(seqspec_check::CheckArgs),
46+
Onlist(seqspec_onlist::OnlistArgs),
2347
// other subcommands later...
2448
}
2549

2650
fn main() {
2751
let args = Args::parse();
2852
match args.subcmd {
29-
Commands::Version(args) => seqspec_version::validate_version_args(&args),
30-
Commands::Format(args) => seqspec_format::validate_format_args(&args),
53+
Commands::Version(args) => seqspec_version::run_version(&args),
54+
Commands::Format(args) => seqspec_format::run_format(&args),
55+
Commands::Find(args) => seqspec_find::run_find(&args),
56+
Commands::Index(args) => seqspec_index::run_index(&args),
57+
Commands::File(args) => seqspec_file::run_file(&args)
58+
,Commands::Split(args) => seqspec_split::run_split(&args)
59+
,Commands::Info(args) => seqspec_info::run_info(&args)
60+
,Commands::Init(args) => seqspec_init::run_init(&args)
61+
,Commands::Methods(args) => seqspec_methods::run_methods(&args)
62+
,Commands::Modify(args) => seqspec_modify::run_modify(&args)
63+
,Commands::Upgrade(args) => seqspec_upgrade::run_upgrade(&args)
64+
,Commands::Insert(args) => seqspec_insert::run_insert(&args)
65+
,Commands::Check(args) => { seqspec_check::run_check(&args); }
66+
,Commands::Onlist(args) => seqspec_onlist::run_onlist(&args)
3167
}
3268
}

src/models/assay.rs

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ pub struct Assay {
5252
pub library_spec: Vec<Region>,
5353
}
5454

55+
/// Serialization format selector for `Assay::to_bytes` and `Assay::from_bytes`.
///
/// The enum is a plain, field-less tag, so it derives `Copy` (callers can pass
/// it by value repeatedly without moving it) as well as `Debug`, `PartialEq`
/// and `Eq` for diagnostics and comparisons.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Codec {
    /// Encode/decode through `serde_yaml`.
    Yaml,
    /// Encode/decode through `serde_json`.
    Json,
}
56+
57+
5558
impl Assay {
5659
pub fn new(
5760
assay_id: String,
@@ -85,8 +88,21 @@ impl Assay {
8588
serde_json::to_string(self)
8689
}
8790

91+
pub fn to_bytes(&self, codec: Codec) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
92+
Ok(match codec {
93+
Codec::Yaml => serde_yaml::to_string(self)?.into_bytes(),
94+
Codec::Json => serde_json::to_vec(self)?,
95+
})
96+
}
97+
pub fn from_bytes(bytes: &[u8], codec: Codec) -> Result<Self, Box<dyn std::error::Error>> {
98+
Ok(match codec {
99+
Codec::Yaml => serde_yaml::from_slice(bytes)?,
100+
Codec::Json => serde_json::from_slice(bytes)?,
101+
})
102+
}
103+
88104
// Core helpers ----------------------------------------------------
89-
pub fn update_spec(&mut self) {
105+
pub fn update_spec(&mut self) -> () {
90106
for r in &mut self.library_spec {
91107
r.update_attr();
92108
}
@@ -110,9 +126,9 @@ impl Assay {
110126

111127
pub fn get_read(&self, read_id: &str) -> Option<Read> {
112128
self.sequence_spec
113-
.iter()
114-
.find(|r| r.read_id == read_id)
115-
.cloned()
129+
.iter()
130+
.find(|r| r.read_id == read_id)
131+
.cloned()
116132
}
117133

118134
/// Insert regions under the top-level region for `modality`.

src/models/coordinate.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
use serde::{Serialize, Deserialize};
2+
3+
use crate::models::region::RegionCoordinate;
4+
5+
#[derive(Debug, Serialize, Deserialize)]
6+
pub struct Coordinate {
7+
pub query_id: String,
8+
pub query_name: String,
9+
pub query_type: String,
10+
pub rcv: Vec<RegionCoordinate>,
11+
#[serde(default = "default_strand")]
12+
pub strand: String,
13+
}
14+
15+
fn default_strand() -> String {
16+
"pos".to_string()
17+
}

src/models/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ pub mod assay;
22
pub mod file;
33
pub mod onlist;
44
pub mod read;
5-
pub mod region;
5+
pub mod region;
6+
pub mod coordinate;

src/models/region.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ impl Region {
156156
use std::collections::BTreeSet;
157157
let mut set = BTreeSet::new();
158158
for r in self.get_leaves() {
159-
set.insert(r.region_type);
159+
set.insert(r.region_type.clone());
160160
}
161161
set.into_iter().collect()
162162
}

0 commit comments

Comments
 (0)