diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index d2effb2f..b1892d02 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -30,13 +30,28 @@ jobs: - uses: actions/checkout@v4 - name: Setup Pages uses: actions/configure-pages@v5 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" - uses: actions/setup-node@v4 with: node-version: 18.x - name: Install MyST Markdown run: npm install -g mystmd + - name: Install Python dependencies for examples + run: | + python -m pip install --upgrade pip + python -m pip install pyyaml jsonschema requests biopython pydantic opentelemetry-sdk + - name: Build examples site + env: + MPLBACKEND: Agg + run: python docs/examples/build_examples.py - name: Build HTML Assets run: myst build --html + - name: Copy examples into Pages artifact + run: | + mkdir -p _build/html/examples + cp -r docs/examples/site/. _build/html/examples/ - name: Upload artifact uses: actions/upload-pages-artifact@v3 with: diff --git a/.gitignore b/.gitignore index aa422e5b..c7957298 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ .vscode/ docs/_build/ +_build/ +.mplconfig/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..8ea909dc --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "seqspec" +version = "0.4.0" +edition = "2021" +license = "MIT" +repository = "https://github.com/pachterlab/seqspec" +description = "Sequencing specification tools (Rust core + PyO3 bindings)" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +name = "seqspec" +crate-type = ["rlib", "cdylib"] + +[dependencies] +anyhow = "1" +clap = { version = "4.5.46", features = ["derive", "env"] } +jsonschema = "0.33.0" +pyo3 = { version = "0.25", optional = true, features = ["extension-module", "abi3-py312"] } +# pythonize = "0.25.0" +serde = { version = "1", features = 
["derive"] } +serde_json = "1" +serde_yaml = "0.9" +thiserror = "1" +reqwest = { version = "0.12", features = ["blocking", "rustls-tls"] } +flate2 = "1" +toml = "0.8" + + +[features] +default = [] +python-binding = ["pyo3"] # enable PyO3 only when building Python wheels diff --git a/README.md b/README.md index 160f9498..30ce014a 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,18 @@ We have multiple tutorials to get you up and running with `seqspec`: 2. Understand how to [manipulate `seqspec` files](docs/USING_SEQSPEC.ipynb) using the `seqspec` command-line tool. +## Current release + +`seqspec 0.4.0` keeps the Python and Rust implementations aligned around the same core command set. + +- `seqspec upgrade` upgrades `0.3.0` specs to `0.4.0` in both implementations. +- `seqspec` loads gzipped specs directly, so `.yaml.gz` works anywhere a spec path is accepted. +- `seqspec auth` manages host-matched auth profiles for remote resources, and `seqspec check` / `seqspec onlist` can use them with `--auth-profile`. +- `seqspec check` now emits warnings for valid but risky geometry, such as reads that cover the same declared regions. +- `seqspec onlist -s region-type` now errors when the same region type appears across multiple reads, so ambiguous joins are explicit. +- `seqspec print -f seqspec-html` writes a self-contained HTML view of the library and reads. +- `seqspec build` is deprecated. + ## Citation The `seqspec` format and tool are described in this [publication](https://doi.org/10.1093/bioinformatics/btae168). 
If you use `seqspec` please cite @@ -29,11 +41,35 @@ Ali Sina Booeshaghi, Xi Chen, Lior Pachter, A machine-readable specification for ## Documentation - [Install `seqspec`: `docs/INSTALLATION.md`](docs/INSTALLATION.md) -- [Learn about the `seqspec` file format: `docs/DOCUMENTATION.md`](docs/SEQSPEC_FILE.md) -- [Learn about the `seqspec` tool: `docs/DOCUMENTATION.md`](docs/SEQSPEC_TOOL.md) -- [Learn about the `seqspec` specification : `docs/SPECIFICATION.md`](docs/SPECIFICATION.md) -- [Write a `seqspec`: `docs/TUTORIAL.md`](docs/TUTORIAL.md) -- [View example `seqspec` files: `https://www.sina.bio/seqspec-builder/assays.html`](https://www.sina.bio/seqspec-builder/assays.html) +- [Learn about the `seqspec` file format: `docs/SEQSPEC_FILE.md`](docs/SEQSPEC_FILE.md) +- [Learn about the `seqspec` tool: `docs/SEQSPEC_TOOL.md`](docs/SEQSPEC_TOOL.md) +- [Learn about the `seqspec` specification: `docs/SPECIFICATION.md`](docs/SPECIFICATION.md) +- [Write a `seqspec` from a simple example: `docs/TUTORIAL_SIMPLE.md`](docs/TUTORIAL_SIMPLE.md) +- [Write a `seqspec` from a template: `docs/TUTORIAL_FROM_TEMPLATE.md`](docs/TUTORIAL_FROM_TEMPLATE.md) +- [Write a more complex `seqspec`: `docs/TUTORIAL_COMPLEX.md`](docs/TUTORIAL_COMPLEX.md) +- [Browse the generated example site: `docs/examples/site/assays.html`](docs/examples/site/assays.html) - [Contribute a `seqspec` : `docs/CONTRIBUTING.md`](docs/CONTRIBUTING.md) - [Watch a YouTube video about `seqspec`](https://youtu.be/NSj6Vpzy8tU) - [Read the manuscript that describes `seqspec`](https://doi.org/10.1093/bioinformatics/btae168) + +## Rust implementation + +- [x] auth : Manage remote authentication profiles. +- build : Deprecated in both CLIs. 
+- [x] check : Validate seqspec file against specification (verify check) +- [x] find : Find objects in seqspec file +- [x] file : List files present in seqspec file +- [x] format : Autoformat seqspec file +- [x] index : Identify position of elements in seqspec file +- [x] info : Get information from seqspec file +- [x] init : Generate a new empty seqspec file +- [x] insert : Insert regions or reads into an existing spec (TODO: move Input structs to models) +- [x] methods : Convert seqspec file into methods section +- [x] modify : Modify attributes of various elements in seqspec file +- [x] onlist : Get onlist file for elements in seqspec file +- [x] print : Display the sequence and/or library structure from seqspec file +- [x] split : Split seqspec file by modality +- [x] upgrade : Upgrade seqspec file to current version +- [x] version: Get seqspec tool version and seqspec file version + +The standalone Rust CLI supports `library-ascii`, `seqspec-ascii`, and `seqspec-html` in `seqspec print`. `seqspec-png` remains Python-only for now. diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index feb666b3..793f1f20 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,6 +7,32 @@ authors: # Changelog +## [0.4.1] - Unreleased + +### Added + +- `seqspec auth` in Python and Rust with `init`, `path`, `list`, and `resolve` subcommands. +- `seqspec print -f seqspec-html`, a self-contained HTML view that shows the library molecule, reads, and nested region metadata. +- Additional parity tests for Python and Rust command behavior. +- `seqspec check` now emits warning diagnostics for overlapping read geometry, with guidance to use `seqspec index --no-overlap` when needed. +- A generated examples site under `docs/examples/site`, with rendered assay reports, read templates, region templates, and a searchable assay catalog. + +### Changed + +- `seqspec upgrade` now upgrades `0.3.0` specs to `0.4.0` in both implementations. 
+- Python and Rust now share the same core command surface for `auth`, `check`, `find`, `file`, `format`, `index`, `info`, `init`, `insert`, `methods`, `modify`, `onlist`, `print`, `split`, `upgrade`, and `version`. +- `seqspec build` is deprecated in both CLIs and remains as a compatibility stub. +- Older specs are loaded more permissively before upgrade, which makes `0.2.x` and `0.3.x` specs easier to normalize. +- `seqspec onlist -s region-type` now errors when matches span multiple reads in a modality. Use `-s read` or `-s region` to disambiguate. +- The maintained examples now live under `docs/examples/assays`, `docs/examples/reads`, and `docs/examples/regions`, with one colocated `docs/examples/build_examples.py` script to normalize YAML, write the manifest, and regenerate the site. +- GitHub Pages now publishes the generated examples site under `/examples/` alongside the main MyST documentation site. + +### Fixed + +- Rust `load_spec` now reads gzipped seqspec YAML. +- Python `seqspec check` and `seqspec onlist` can use auth profiles for remote resources. +- Python local gzipped onlist validation now detects `.gz` files correctly. + ## [0.4.0] - 2025-08-24 ### Added @@ -144,7 +170,7 @@ TODO: - `assay_spec` renamed `library_spec` - Reorganize specification document - Move contribution guidelines from `SPECIFICATION.md` to `CONTRIBUTION.md` -- Move example `Region`s from `SPECIFCATION.md` to `seqspec/docs/regions` +- Move example `Region`s from `SPECIFICATION.md` to `docs/examples/regions` - `seqspec index` defaults to indexing reads, `--region` indexes regions - Change descriptors of attributes `assay_id`, `doi` - `Assay` attribute `assay` changed to `assay_id` diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index e538183b..d5297598 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -25,8 +25,21 @@ pip install seqspec uv pip install seqspec ``` -Verify the installation +Install from source if you want the current working tree. 
+ +```bash +# Python package with the Rust core +uv run maturin develop + +# standalone Rust CLI +cargo install --path . +``` + +Verify the installation. ```bash seqspec --version +seqspec auth path ``` + +`seqspec` accepts plain YAML and gzipped YAML (`.yaml.gz`). Remote resources can be configured with `seqspec auth` and used with `--auth-profile` in commands such as `seqspec check` and `seqspec onlist`. diff --git a/docs/SEQSPEC_TOOL.md b/docs/SEQSPEC_TOOL.md index 547dfdd2..80243260 100644 --- a/docs/SEQSPEC_TOOL.md +++ b/docs/SEQSPEC_TOOL.md @@ -22,14 +22,15 @@ The `seqspec` specification is detailed in [here](SEQSPEC_FILE.md). Please revie ``` usage: seqspec [-h] ... -seqspec 0.3.0: A machine-readable file format for genomic library sequence and structure. +seqspec 0.4.0: A machine-readable file format for genomic library sequence and structure. GitHub: https://github.com/pachterlab/seqspec Documentation: https://pachterlab.github.io/seqspec/ positional arguments: - build Generate a complete seqspec with natural language (LLM-assisted) + auth Manage remote authentication profiles + build Deprecated. This command will be removed. check Validate seqspec file against specification find Find objects in seqspec file file List files present in seqspec file @@ -52,27 +53,80 @@ optional arguments: `seqspec` operates on `seqspec` compatible YAML files that follow the specification. All of the following examples will use the `seqspec` specification for the [DOGMAseq-DIG](https://doi.org/10.1186/s13059-022-02698-8) assay which can be found here: `seqspec/examples/specs/dogmaseq-dig/spec.yaml`. +Any command that takes `yaml` also accepts gzipped specs such as `spec.yaml.gz`. + +The `build` command is deprecated. Use `seqspec init`, `seqspec insert`, and `seqspec modify` instead. + :::{attention} **IMPORTANT**: Many `seqspec` commands require that the specification be properly formatted and error-corrected. 
Errors in the spec can be found with `seqspec check` (see below for instructions). The spec can be properly formatted (or "filled in") with `seqspec format`. It is recommended to run `seqspec format` followed by `seqspec check` after writing a new `seqspec` (or correcting errors in an existing one). ::: +## `seqspec auth`: Manage remote authentication profiles + +Use auth profiles when a spec points to protected remote files such as IGVF-hosted onlists or FASTQs. + +```bash +seqspec auth ... +``` + +`seqspec auth` has four subcommands: + +- `init`: create or update a profile that maps one or more hosts to credential environment variables +- `path`: show where the auth config file lives +- `list`: list configured profiles +- `resolve`: show which profile would be used for a given URL + +The auth config is host-based. The profile stores environment variable names, not secrets. + +### Examples + +```bash +# create an IGVF profile +seqspec auth init \ + --profile igvf \ + --host api.data.igvf.org \ + --host data.igvf.org \ + --kind basic \ + --username-env IGVF_ACCESS_KEY_ID \ + --password-env IGVF_ACCESS_KEY_SECRET + +# inspect the config path +seqspec auth path + +# list configured profiles +seqspec auth list + +# resolve a URL to a profile +seqspec auth resolve https://api.data.igvf.org/reference-files/IGVFFI5429KKCK/ +``` + ## `seqspec check`: Validate seqspec file against specification Check that the `seqspec` file is correctly formatted and consistent with the [specification](https://github.com/IGVF/seqspec/blob/main/docs/SPECIFICATION.md). 
```bash -seqspec check [-h] [-o OUT] [--skip {igvf,igvf_onlist_skip}] yaml +seqspec check [-h] [-o OUT] [--skip {igvf,igvf_onlist_skip,structural}] [--auth-profile PROFILE] yaml ``` ```python -from seqspec.seqspec_check import run_check +from seqspec.seqspec_check import seqspec_check +from seqspec.utils import load_spec -run_check(schema_fn: str, spec_fn: str, o: str) +spec = load_spec("spec.yaml", strict=False) +seqspec_check(spec, filter_type=None, auth_profile=None) ``` - optionally, `-o OUT` can be used to write the output to a file. -- optionally, `--skip {igvf,igvf_onlist_skip}` can filter out known IGVF-specific warnings (see source for list). -- `yaml` corresponds to the `seqspec` file. +- optionally, `--skip {igvf,igvf_onlist_skip,structural}` can filter out known diagnostic classes (see source for list). +- optionally, `--auth-profile PROFILE` uses a named auth profile when checking remote files. +- `yaml` corresponds to the `seqspec` file and may be plain YAML or `.yaml.gz`. + +`seqspec check` emits diagnostics with two severities: + +- `error`: the spec is invalid and should be fixed. +- `warning`: the spec is valid, but the declared geometry may still need explicit downstream handling. + +Warnings do not mean the spec is malformed. They flag cases that are easy to miss, such as two reads in the same modality covering the same declared regions. In those cases, downstream tools may need explicit overlap handling such as `seqspec index --no-overlap`. A list of checks performed: @@ -99,6 +153,7 @@ A list of checks performed: 15. Check that for every region with subregions, the region `min_len`/`max_len` equals the sum of the subregions' `min_len`/`max_len`. 16. Check that for every region with subregions, the region `sequence` equals the left-to-right concatenation of the subregions' `sequence`s. 17. Check that each read's `max_len` does not exceed the sequence-able range of library elements after (pos strand) or before (neg strand) the primer. +18. 
Warn when two reads in the same modality cover the same declared regions. This often needs explicit overlap handling with `seqspec index --no-overlap`. Below are a list of example errors one may encounter when checking a spec: @@ -138,6 +193,13 @@ Below are a list of example errors one may encounter when checking a spec: $ seqspec check spec.yaml [error 1] None is not of type 'string' in spec['assay'] [error 2] 'Ribonucleic acid' is not one of ['rna', 'tag', 'protein', 'atac', 'crispr'] in spec['modalities'][0] + +# check a spec with protected remote resources +$ seqspec check --auth-profile igvf spec.yaml + +# a valid spec can still emit overlap warnings +$ seqspec check overlap_spec.yaml +[warning 1] reads 'rna_R1' and 'rna_R2' in modality 'rna' both cover region(s) 'barcode', 'umi'. Downstream tools may require explicit overlap handling such as `seqspec index --no-overlap` ``` ## `seqspec find`: Find objects in seqspec file @@ -556,13 +618,15 @@ $ seqspec modify -m atac -o mod_spec.yaml -i atac_R1 --files "R1_1.fastq.gz,fast ## `seqspec onlist`: Get onlist file(s) for elements in seqspec file ```bash -seqspec onlist [-h] [-o OUT] [-s SELECTOR] [-f {product,multi}] -m MODALITY [-i ID] yaml +seqspec onlist [-h] [-o OUT] [-s SELECTOR] [-f {product,multi}] [--auth-profile PROFILE] -m MODALITY [-i ID] yaml ``` ```python -from seqspec.seqspec_onlist import run_onlist +from seqspec.seqspec_onlist import get_onlists +from seqspec.utils import load_spec -run_onlist(spec_fn, modality, ids, idtype, fmt, o) +spec = load_spec("spec.yaml") +get_onlists(spec, modality="rna", selector="region-type", id="barcode") ``` - optionally, `-o OUT` when set with `-f`, writes the joined onlist to this file; when set without `-f`, downloads remote onlists locally and prints paths. 
@@ -575,9 +639,10 @@ run_onlist(spec_fn, modality, ids, idtype, fmt, o) - `-f` selects how to combine multiple onlists: - `product` (cartesian product) - `multi` (row-aligned, zip with padding) -- `yaml` corresponds to the `seqspec` file. +- optionally, `--auth-profile PROFILE` uses a named auth profile for protected remote onlists. +- `yaml` corresponds to the `seqspec` file and may be plain YAML or `.yaml.gz`. -_Note_: If, for example, there are multiple regions with the specified `region_type` in the modality (e.g. multiple barcodes), then `seqspec onlist` will return a path to an onlist that it generates where the entries in that onlist are the cartesian product of the onlists for all of the regions found. +_Note_: `-s region-type` is only valid when the matching regions come from one read geometry. If the same `region_type` appears across multiple reads in the modality, `seqspec onlist` errors and asks you to use `-s read` or `-s region` instead. ### Examples @@ -589,6 +654,14 @@ $ seqspec onlist -m rna -s read -i rna_R1 spec.yaml # Get onlist for barcode region type $ seqspec onlist -m rna -s region-type -i barcode spec.yaml /path/to/spec/folder/RNA-737K-arc-v1.txt + +# Ambiguous region-type matches across reads are rejected +$ seqspec onlist -m rna -s region-type -i barcode ambiguous_spec.yaml +region-type 'barcode' matches regions in multiple reads for modality 'rna': rna_R1, rna_R2. Use -s read or -s region to disambiguate. 
+ +# Get an onlist from a protected remote source +$ seqspec onlist --auth-profile igvf -m crispr -s region-type -i barcode spec.yaml +/path/to/spec/folder/IGVFFI5429KKCK.txt.gz ``` ## `seqspec print`: Display the sequence and/or library structure from seqspec file @@ -600,17 +673,21 @@ seqspec print [-h] [-o OUT] [-f FORMAT] yaml ``` ```python -from seqspec.seqspec_print import run_seqspec_print -run_seqspec_print(spec_fn, fmt, o) +from seqspec.seqspec_print import seqspec_print +from seqspec.utils import load_spec + +seqspec_print(load_spec("spec.yaml"), "seqspec-html") ``` - optionally, `-o OUT` to set the path of printed file. - optionally, `-f FORMAT` is the format of the printed file. Can be one of: - `library-ascii`: prints an ascii tree of the library_spec - - `seqspec-html`: prints an html of both the library_spec and sequence_spec (TODO this is incomplete) + - `seqspec-html`: prints a self-contained interactive HTML view of the library structure, reads, and metadata - `seqspec-png`: prints a png summary of modality structures - `seqspec-ascii`: prints an ascii representation of both the library_spec and sequence_spec -- `yaml` corresponds to the `seqspec` file. +- `yaml` corresponds to the `seqspec` file and may be plain YAML or `.yaml.gz`. + +The Python CLI supports all four formats. The standalone Rust CLI supports `library-ascii`, `seqspec-ascii`, and `seqspec-html`. ### Examples @@ -670,17 +747,7 @@ TGTGAGAAAGGGATGTGCTGCGAGAAGGCTAGAXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # Print the sequence and library structure as html -$ seqspec print -f seqspec-html spec.yaml - - - - - + + +

Assays

This catalog lists the maintained assay examples in the current `0.4.0` structure.

NameAssaySequencerModalitiesDateReportseqspec
10x-ATAC10xATACIlluminaatac19 October 2018viewyaml
10x-RNA-5prime10xRNA5primeIlluminarna2016-08-01viewyaml
10x-ATAC-RNA10xMultiomeIllumina NovaSeq 6000rna, atac15 September 2020viewyaml
10x-ATAC-RNA-MULTI/Illumina10x-ATAC-RNA-MULTIIllumina NovaSeq 6000rna, atac, tag17 June 2019viewyaml
10x-RNA-v110xRNAv1Illuminarna15 April 2016viewyaml
10x-RNA-v210xRNAv2Illuminarna16 January 2017viewyaml
10x-RNA-v310xRNAv3Illumina NovaSeq 6000rna—viewyaml
10xCRISPR10xCRISPRIlluminacrispr, rna10 June 2020viewyaml
10xFB-3prime10xFB protein and RNA assay 3'Illuminarna, protein12 February 2018viewyaml
10xFB protein and RNA assay 5'10xFB protein and RNA assay 5'Illuminarna, protein—viewyaml
10xFB-VDJ-5prime10xVDJ and RNA 5'Illuminarna, vdj21 October 2016viewyaml
10xv3 scrnaseq element adept truseq dualelement-adept-truseq-dual-indexElement Avitirna21 February 2024viewyaml
10xv3 scrnaseq illumina truseq dualIllumina-novaseq-truseq-dual-indexIllumina NovaSeq 6000rna21 February 2024viewyaml
BD-Rhapsody-EBBD-Rhapsody-EBIlluminarna31 August 2022viewyaml
BD-Rhapsody-v1/IlluminaBD-Rhapsody-v1Illuminarna20 December 2018viewyaml
CEL-SeqCEL-SeqIlluminarna30 August 2012viewyaml
CEL-Seq2/IlluminaCEL-Seq2Illuminarna28 April 2016viewyaml
DOGMAseq-DIG/IlluminaDOGMAseq-DIGIllumina NovaSeq 6000 (S2 Reagent Kit v1.5)protein, tag, rna, atac23 June 2022viewyaml
DOGMAseq-LLL/IlluminaDOGMAseq-LLLIllumina NovaSeq 6000 (S2 Reagent Kit v1.5)protein, rna, atac23 June 2022viewyaml
Drop-seq/IlluminaDrop-seqIlluminarna21 May 2015viewyaml
inDropv2/IlluminaIndropv2Illuminarna08 December 2016viewyaml
ISSAAC-seqissaac_seq—RNA, ATAC15 September 2020viewyaml
MARS-seqmars_seq—RNA17 May 2019viewyaml
mcSCRB-seqmcscrb_seq—RNA26 July 2018viewyaml
Microwell-seqmicrowell_seq—RNA22 February 2018viewyaml
WT Mega v2WT-Mega-v2Illuminarna—viewyaml
Pi-ATAC-seqpi_atac_seq—ATAC02 November 2018viewyaml
PIPseq V2pipseqv2—RNA06 March 2023viewyaml
PIPseq V3pipseqv3—RNA06 March 2023viewyaml
Quartz-seqquartz_seq—RNA17 April 2013viewyaml
Quartz-seq2quartz_seq2—RNA09 March 2018viewyaml
CRISPRa_screeningsccrispraNextSeq2000 p3 kitrna, crispr28 March 2023viewyaml
sci-RNA-seqsci_rna_seq—RNA18 August 2017viewyaml
sci-RNA-seq3sci_rna_seq3—RNA20 February 2019viewyaml
scifi-RNA-Seqscifi_rna_seq—RNA31 May 2021viewyaml
Seq-wellseq_well—RNA13 February 2017viewyaml
Seq-Well-S3seq_well_s3—RNA02 July 2019viewyaml
SHARE-SeqSHARE-SeqIlluminaatac, rna23 October 2020viewyaml
Smart-seq2smart_seq2—RNA02 January 2014viewyaml
Smart-seq3smart_seq3—RNA_end, RNA_internal04 May 2020viewyaml
sn-m3C-seqsn-m3C-seqIllumina NextSeq500methyl, hic09 September 2019viewyaml
snmCT-seqsnmCT-seqIllumina NextSeq500methyl, rna09 March 2022viewyaml
SPLiT-seqsplit_seq—RNA15 March 2018viewyaml
STRT-seqstrt_seq—RNA05 April 2012viewyaml
STRT-seq-2istrt_seq_2i—RNA27 November 2017viewyaml
STRT-seq-C1strt_seq_c1—RNA27 April 2019viewyaml
SugarSeq/IlluminaSugarSeqIllumina NextSeqrna, tag—viewyaml
SureCellsurecell—RNA27 April 2019viewyaml
10x Genomicstang2009—RNA06 April 2009viewyaml
templatetemplate—mode—viewyaml
VASA-seq-dropvasa_seq_drop—RNA27 June 2022viewyaml
VASA-seqvasa_seq_plate—RNA27 June 2022viewyaml
+ + + diff --git a/docs/examples/site/assays/10x_atac.html b/docs/examples/site/assays/10x_atac.html new file mode 100644 index 00000000..b7f2bb24 --- /dev/null +++ b/docs/examples/site/assays/10x_atac.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10x_rna_5prime.html b/docs/examples/site/assays/10x_rna_5prime.html new file mode 100644 index 00000000..07ad6182 --- /dev/null +++ b/docs/examples/site/assays/10x_rna_5prime.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10x_rna_atac.html b/docs/examples/site/assays/10x_rna_atac.html new file mode 100644 index 00000000..329a7de2 --- /dev/null +++ b/docs/examples/site/assays/10x_rna_atac.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10x_rna_atac_multi.html b/docs/examples/site/assays/10x_rna_atac_multi.html new file mode 100644 index 00000000..8cd42f53 --- /dev/null +++ b/docs/examples/site/assays/10x_rna_atac_multi.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10x_rna_v1.html b/docs/examples/site/assays/10x_rna_v1.html new file mode 100644 index 00000000..63edfba4 --- /dev/null +++ b/docs/examples/site/assays/10x_rna_v1.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10x_rna_v2.html b/docs/examples/site/assays/10x_rna_v2.html new file mode 100644 index 00000000..42f41bf3 --- /dev/null +++ b/docs/examples/site/assays/10x_rna_v2.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10x_rna_v3.html b/docs/examples/site/assays/10x_rna_v3.html new file mode 100644 index 00000000..32830b6b --- /dev/null +++ b/docs/examples/site/assays/10x_rna_v3.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10xcrispr.html b/docs/examples/site/assays/10xcrispr.html new file mode 100644 index 00000000..31907e69 --- /dev/null +++ b/docs/examples/site/assays/10xcrispr.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10xfb_3prime.html b/docs/examples/site/assays/10xfb_3prime.html new file mode 100644 index 00000000..35838774 --- /dev/null +++ b/docs/examples/site/assays/10xfb_3prime.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10xfb_5prime.html b/docs/examples/site/assays/10xfb_5prime.html new file mode 100644 index 00000000..77d71578 --- /dev/null +++ b/docs/examples/site/assays/10xfb_5prime.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10xfb_vdj_5prime.html b/docs/examples/site/assays/10xfb_vdj_5prime.html new file mode 100644 index 00000000..be1a954a --- /dev/null +++ b/docs/examples/site/assays/10xfb_vdj_5prime.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10xv3_scrnaseq_element_adept_truseq_dual.html b/docs/examples/site/assays/10xv3_scrnaseq_element_adept_truseq_dual.html new file mode 100644 index 00000000..ea56ddb0 --- /dev/null +++ b/docs/examples/site/assays/10xv3_scrnaseq_element_adept_truseq_dual.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/10xv3_scrnaseq_illumina_truseq_dual.html b/docs/examples/site/assays/10xv3_scrnaseq_illumina_truseq_dual.html new file mode 100644 index 00000000..475daba4 --- /dev/null +++ b/docs/examples/site/assays/10xv3_scrnaseq_illumina_truseq_dual.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/bd_rhapsody_eb.html b/docs/examples/site/assays/bd_rhapsody_eb.html new file mode 100644 index 00000000..9dc3aba4 --- /dev/null +++ b/docs/examples/site/assays/bd_rhapsody_eb.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/bd_rhapsody_v1.html b/docs/examples/site/assays/bd_rhapsody_v1.html new file mode 100644 index 00000000..eeea8e41 --- /dev/null +++ b/docs/examples/site/assays/bd_rhapsody_v1.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/cel_seq.html b/docs/examples/site/assays/cel_seq.html new file mode 100644 index 00000000..f6767aea --- /dev/null +++ b/docs/examples/site/assays/cel_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/cel_seq2.html b/docs/examples/site/assays/cel_seq2.html new file mode 100644 index 00000000..9e5a0e9d --- /dev/null +++ b/docs/examples/site/assays/cel_seq2.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/dogmaseq_dig.html b/docs/examples/site/assays/dogmaseq_dig.html new file mode 100644 index 00000000..0647476a --- /dev/null +++ b/docs/examples/site/assays/dogmaseq_dig.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/dogmaseq_lll.html b/docs/examples/site/assays/dogmaseq_lll.html new file mode 100644 index 00000000..27006547 --- /dev/null +++ b/docs/examples/site/assays/dogmaseq_lll.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/drop_seq.html b/docs/examples/site/assays/drop_seq.html new file mode 100644 index 00000000..21c17d53 --- /dev/null +++ b/docs/examples/site/assays/drop_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/indropv2.html b/docs/examples/site/assays/indropv2.html new file mode 100644 index 00000000..bdc61b4b --- /dev/null +++ b/docs/examples/site/assays/indropv2.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/issaac_seq.html b/docs/examples/site/assays/issaac_seq.html new file mode 100644 index 00000000..ccc83df1 --- /dev/null +++ b/docs/examples/site/assays/issaac_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/mars_seq.html b/docs/examples/site/assays/mars_seq.html new file mode 100644 index 00000000..8a708737 --- /dev/null +++ b/docs/examples/site/assays/mars_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/mcscrb_seq.html b/docs/examples/site/assays/mcscrb_seq.html new file mode 100644 index 00000000..1012ccfa --- /dev/null +++ b/docs/examples/site/assays/mcscrb_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/microwell_seq.html b/docs/examples/site/assays/microwell_seq.html new file mode 100644 index 00000000..726ad83c --- /dev/null +++ b/docs/examples/site/assays/microwell_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/parse_wt_v2.html b/docs/examples/site/assays/parse_wt_v2.html new file mode 100644 index 00000000..4dc10bb1 --- /dev/null +++ b/docs/examples/site/assays/parse_wt_v2.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/pi_atac_seq.html b/docs/examples/site/assays/pi_atac_seq.html new file mode 100644 index 00000000..a541e45b --- /dev/null +++ b/docs/examples/site/assays/pi_atac_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/pipseqv2.html b/docs/examples/site/assays/pipseqv2.html new file mode 100644 index 00000000..ab479e9c --- /dev/null +++ b/docs/examples/site/assays/pipseqv2.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/pipseqv3.html b/docs/examples/site/assays/pipseqv3.html new file mode 100644 index 00000000..7a79c8ff --- /dev/null +++ b/docs/examples/site/assays/pipseqv3.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/quartz_seq.html b/docs/examples/site/assays/quartz_seq.html new file mode 100644 index 00000000..b95f43b8 --- /dev/null +++ b/docs/examples/site/assays/quartz_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/quartz_seq2.html b/docs/examples/site/assays/quartz_seq2.html new file mode 100644 index 00000000..9e16f3fe --- /dev/null +++ b/docs/examples/site/assays/quartz_seq2.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/sccrispra.html b/docs/examples/site/assays/sccrispra.html new file mode 100644 index 00000000..f3ac1fd0 --- /dev/null +++ b/docs/examples/site/assays/sccrispra.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/sci_rna_seq.html b/docs/examples/site/assays/sci_rna_seq.html new file mode 100644 index 00000000..559896a3 --- /dev/null +++ b/docs/examples/site/assays/sci_rna_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/sci_rna_seq3.html b/docs/examples/site/assays/sci_rna_seq3.html new file mode 100644 index 00000000..001e3399 --- /dev/null +++ b/docs/examples/site/assays/sci_rna_seq3.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/scifi_rna_seq.html b/docs/examples/site/assays/scifi_rna_seq.html new file mode 100644 index 00000000..d62362cb --- /dev/null +++ b/docs/examples/site/assays/scifi_rna_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/seq_well.html b/docs/examples/site/assays/seq_well.html new file mode 100644 index 00000000..e5e12cae --- /dev/null +++ b/docs/examples/site/assays/seq_well.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/seq_well_s3.html b/docs/examples/site/assays/seq_well_s3.html new file mode 100644 index 00000000..02400f47 --- /dev/null +++ b/docs/examples/site/assays/seq_well_s3.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/share_seq.html b/docs/examples/site/assays/share_seq.html new file mode 100644 index 00000000..6119db87 --- /dev/null +++ b/docs/examples/site/assays/share_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/smart_seq2.html b/docs/examples/site/assays/smart_seq2.html new file mode 100644 index 00000000..a9e9ea6f --- /dev/null +++ b/docs/examples/site/assays/smart_seq2.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/smart_seq3.html b/docs/examples/site/assays/smart_seq3.html new file mode 100644 index 00000000..d51d5c31 --- /dev/null +++ b/docs/examples/site/assays/smart_seq3.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/sn_m3c_seq.html b/docs/examples/site/assays/sn_m3c_seq.html new file mode 100644 index 00000000..03a57ac6 --- /dev/null +++ b/docs/examples/site/assays/sn_m3c_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/snmctseq.html b/docs/examples/site/assays/snmctseq.html new file mode 100644 index 00000000..c3c8263e --- /dev/null +++ b/docs/examples/site/assays/snmctseq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/split_seq.html b/docs/examples/site/assays/split_seq.html new file mode 100644 index 00000000..1f1e0ea8 --- /dev/null +++ b/docs/examples/site/assays/split_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/strt_seq.html b/docs/examples/site/assays/strt_seq.html new file mode 100644 index 00000000..d7b99158 --- /dev/null +++ b/docs/examples/site/assays/strt_seq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/strt_seq_2i.html b/docs/examples/site/assays/strt_seq_2i.html new file mode 100644 index 00000000..0ac1358c --- /dev/null +++ b/docs/examples/site/assays/strt_seq_2i.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/strt_seq_c1.html b/docs/examples/site/assays/strt_seq_c1.html new file mode 100644 index 00000000..c0300c20 --- /dev/null +++ b/docs/examples/site/assays/strt_seq_c1.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/sugarseq.html b/docs/examples/site/assays/sugarseq.html new file mode 100644 index 00000000..37000f91 --- /dev/null +++ b/docs/examples/site/assays/sugarseq.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/surecell.html b/docs/examples/site/assays/surecell.html new file mode 100644 index 00000000..541b0504 --- /dev/null +++ b/docs/examples/site/assays/surecell.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/tang2009.html b/docs/examples/site/assays/tang2009.html new file mode 100644 index 00000000..c363a0fa --- /dev/null +++ b/docs/examples/site/assays/tang2009.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/template.html b/docs/examples/site/assays/template.html new file mode 100644 index 00000000..f20f0012 --- /dev/null +++ b/docs/examples/site/assays/template.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/vasa_seq_drop.html b/docs/examples/site/assays/vasa_seq_drop.html new file mode 100644 index 00000000..54f163c6 --- /dev/null +++ b/docs/examples/site/assays/vasa_seq_drop.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/assays/vasa_seq_plate.html b/docs/examples/site/assays/vasa_seq_plate.html new file mode 100644 index 00000000..51de6a23 --- /dev/null +++ b/docs/examples/site/assays/vasa_seq_plate.html @@ -0,0 +1,1194 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/docs/examples/site/index.html b/docs/examples/site/index.html new file mode 100644 index 00000000..23942dbc --- /dev/null +++ b/docs/examples/site/index.html @@ -0,0 +1,135 @@ + + + + + + seqspec examples + + + +

seqspec examples

This site is generated from the maintained examples under docs/examples. It shows the current assay examples, read templates, and region templates.

Note. seqspec files are tightly tied to the data, assay design, sequencer, and library construction they describe. These examples are representative specs derived from Teichlab library structures. They are useful reference points, but they may be incomplete or incorrect for any one concrete assay or dataset that a user defines.

The assay pages below are rendered with seqspec print -f seqspec-html. Use the search box to filter the catalog.

NameAssaySequencerModalitiesDateReportseqspec
10x-ATAC10xATACIlluminaatac19 October 2018viewyaml
10x-RNA-5prime10xRNA5primeIlluminarna2016-08-01viewyaml
10x-ATAC-RNA10xMultiomeIllumina NovaSeq 6000rna, atac15 September 2020viewyaml
10x-ATAC-RNA-MULTI/Illumina10x-ATAC-RNA-MULTIIllumina NovaSeq 6000rna, atac, tag17 June 2019viewyaml
10x-RNA-v110xRNAv1Illuminarna15 April 2016viewyaml
10x-RNA-v210xRNAv2Illuminarna16 January 2017viewyaml
10x-RNA-v310xRNAv3Illumina NovaSeq 6000rna—viewyaml
10xCRISPR10xCRISPRIlluminacrispr, rna10 June 2020viewyaml
10xFB-3prime10xFB protein and RNA assay 3'Illuminarna, protein12 February 2018viewyaml
10xFB protein and RNA assay 5'10xFB protein and RNA assay 5'Illuminarna, protein—viewyaml
10xFB-VDJ-5prime10xVDJ and RNA 5'Illuminarna, vdj21 October 2016viewyaml
10xv3 scrnaseq element adept truseq dualelement-adept-truseq-dual-indexElement Avitirna21 February 2024viewyaml
10xv3 scrnaseq illumina truseq dualIllumina-novaseq-truseq-dual-indexIllumina NovaSeq 6000rna21 February 2024viewyaml
BD-Rhapsody-EBBD-Rhapsody-EBIlluminarna31 August 2022viewyaml
BD-Rhapsody-v1/IlluminaBD-Rhapsody-v1Illuminarna20 December 2018viewyaml
CEL-SeqCEL-SeqIlluminarna30 August 2012viewyaml
CEL-Seq2/IlluminaCEL-Seq2Illuminarna28 April 2016viewyaml
DOGMAseq-DIG/IlluminaDOGMAseq-DIGIllumina NovaSeq 6000 (S2 Reagent Kit v1.5)protein, tag, rna, atac23 June 2022viewyaml
DOGMAseq-LLL/IlluminaDOGMAseq-LLLIllumina NovaSeq 6000 (S2 Reagent Kit v1.5)protein, rna, atac23 June 2022viewyaml
Drop-seq/IlluminaDrop-seqIlluminarna21 May 2015viewyaml
inDropv2/IlluminaIndropv2Illuminarna08 December 2016viewyaml
ISSAAC-seqissaac_seq—RNA, ATAC15 September 2020viewyaml
MARS-seqmars_seq—RNA17 May 2019viewyaml
mcSCRB-seqmcscrb_seq—RNA26 July 2018viewyaml
Microwell-seqmicrowell_seq—RNA22 February 2018viewyaml
WT Mega v2WT-Mega-v2Illuminarna—viewyaml
Pi-ATAC-seqpi_atac_seq—ATAC02 November 2018viewyaml
PIPseq V2pipseqv2—RNA06 March 2023viewyaml
PIPseq V3pipseqv3—RNA06 March 2023viewyaml
Quartz-seqquartz_seq—RNA17 April 2013viewyaml
Quartz-seq2quartz_seq2—RNA09 March 2018viewyaml
CRISPRa_screeningsccrispraNextSeq2000 p3 kitrna, crispr28 March 2023viewyaml
sci-RNA-seqsci_rna_seq—RNA18 August 2017viewyaml
sci-RNA-seq3sci_rna_seq3—RNA20 February 2019viewyaml
scifi-RNA-Seqscifi_rna_seq—RNA31 May 2021viewyaml
Seq-wellseq_well—RNA13 February 2017viewyaml
Seq-Well-S3seq_well_s3—RNA02 July 2019viewyaml
SHARE-SeqSHARE-SeqIlluminaatac, rna23 October 2020viewyaml
Smart-seq2smart_seq2—RNA02 January 2014viewyaml
Smart-seq3smart_seq3—RNA_end, RNA_internal04 May 2020viewyaml
sn-m3C-seqsn-m3C-seqIllumina NextSeq500methyl, hic09 September 2019viewyaml
snmCT-seqsnmCT-seqIllumina NextSeq500methyl, rna09 March 2022viewyaml
SPLiT-seqsplit_seq—RNA15 March 2018viewyaml
STRT-seqstrt_seq—RNA05 April 2012viewyaml
STRT-seq-2istrt_seq_2i—RNA27 November 2017viewyaml
STRT-seq-C1strt_seq_c1—RNA27 April 2019viewyaml
SugarSeq/IlluminaSugarSeqIllumina NextSeqrna, tag—viewyaml
SureCellsurecell—RNA27 April 2019viewyaml
10x Genomicstang2009—RNA06 April 2009viewyaml
templatetemplate—mode—viewyaml
VASA-seq-dropvasa_seq_drop—RNA27 June 2022viewyaml
VASA-seqvasa_seq_plate—RNA27 June 2022viewyaml
+ + + + + diff --git a/docs/examples/site/reads.html b/docs/examples/site/reads.html new file mode 100644 index 00000000..bab152d4 --- /dev/null +++ b/docs/examples/site/reads.html @@ -0,0 +1,109 @@ + + + + + + seqspec read templates + + + +

Reads

This catalog lists the maintained read templates used by the assay examples.

TemplateReadsRead idsLink
element_adept_workflow.read.yaml4Index 1, Index 2, Read 1, Read 2yaml
element_elevate_workflow.read.yaml4Index 1, Index 2, Read 1, Read 2yaml
illumina_novaseq_novakit_dual_index.read.yaml4Read 1, Index 1, Index 2, Read 2yaml
illumina_novaseq_truseq_dual_index.read.yaml4Read 1, Index 1, Index 2, Read 2yaml
illumina_novaseq_truseq_single_index.read.yaml3Read 1, Index 1, Read 2yaml
oxford_nanopore.read.yaml1Read 1yaml
+ + + diff --git a/docs/examples/site/regions.html b/docs/examples/site/regions.html new file mode 100644 index 00000000..28132e3b --- /dev/null +++ b/docs/examples/site/regions.html @@ -0,0 +1,109 @@ + + + + + + seqspec region templates + + + +

Regions

This catalog lists the maintained region templates used by the assay examples.

TemplateRegionsRegion idsLink
10x5pv2.rgn.yaml4barcode, umi, 10x5pTSO, cDNAyaml
10x5pv3.rgn.yaml4barcode, umi, 10x5pTSO, cDNAyaml
10xv2.rgn.yaml3barcode, umi, cDNAyaml
10xv3-CRISPR.rgn.yaml7barcode, umi, linker, sgRNA-scaffold, antibody_barcode, linker-2, tsoyaml
10xv3-FB.rgn.yaml6barcode, umi, linker, random-1, antibody_barcode, random-2yaml
10xv3.rgn.yaml3barcode, umi, cDNAyaml
10xv4.rgn.yaml3barcode, umi, cDNAyaml
barcode.rgn.yaml1barcodeyaml
bd_rhapsody_eb.rgn.yaml8bd_eb_vb, bd_cls1, bd_eb_cls_linker1, bd_cls2, bd_eb_cls_linker2, bd_cls3, umi, cDNAyaml
bd_rhapsody_v1.rgn.yaml7bd_cls1, bd_cls_linker1, bd_cls2, bd_cls_linker2, bd_cls3, umi, cDNAyaml
cdna.rgn.yaml1cDNAyaml
celseq.rgn.yaml2barcode, cDNAyaml
celseq2.rgn.yaml3umi, barcode, cDNAyaml
citeseq.yaml3protein_cell_bc, protein_umi, protein_seqyaml
dropseq.rgn.yaml3dropseq_barcode, umi, cDNAyaml
dropseq_seqa_r1.rgn.yaml1dropseq_seqA_read1yaml
dropseq_seqb_r1.rgn.yaml1dropseq_seqB_read1yaml
fipresci.rgn.yaml6barcode, umi, 10x5pTSO, cDNA, ME2, tn5_barcodeyaml
flashseq.rgn.yaml4linker-1, umi, linker-2, cDNAyaml
gdna.rgn.yaml1gDNAyaml
hydroprna.rgn.yaml7cDNA, umi, hydroprna_bc3, hydroprna_linker2, hydroprna_bc2, hydroprna_linker1, hydroprna_bc1yaml
illumina_index.rgn.yaml1indexyaml
illumina_p5.rgn.yaml1illumina_p5yaml
illumina_p7.rgn.yaml1illumina_p7yaml
marsseq2.rgn.yaml5marsseq2_platebarcode_linker, marsseq2_platebarcode, cDNA, umi, marsseq2_rt1_barcodeyaml
mcscrbseq.rgn.yaml3mcscrb_barcode, umi, cDNAyaml
microwellseq.rgn.yaml7microwellseq_bc1, microwellseq_linker1, microwellseq_bc2, microwellseq_linker2, microwellseq_bc3, umi, cDNAyaml
nextera_r1.rgn.yaml1nextera_read1yaml
nextera_r1_joined.rgn.yaml1nextera_read1yaml
nextera_r2.rgn.yaml1nextera_read2yaml
nextera_r2_joined.rgn.yaml1nextera_read2yaml
petriseq.rgn.yaml7umi, petriseq_bc3, petriseq_linker3, petriseq_bc2, petriseq_linker2, petriseq_bc1, cDNAyaml
pipseqv2.rgn.yaml7pipseq_v2_bc1, pipseq_v2_linker1, pipseq_v2_bc2, pipseq_v2_linker2, pipseq_v2_bc3, umi, cDNAyaml
pipseqv3.rgn.yaml9pipseq_v3_bc1, pipseq_v3_linker1, pipseq_v3_bc2, pipseq_v3_linker2, pipseq_v3_bc3, pipseq_v3_linker3, pipseq_v3_bc4, umi, cDNAyaml
pipseqv4.rgn.yaml10pipseq_v4_vb, pipseq_v3_bc1, pipseq_v3_linker1, pipseq_v3_bc2, pipseq_v3_linker2, pipseq_v3_bc3, pipseq_v3_linker3, pipseq_v3_bc4, umi, cDNAyaml
quartzseq.rgn.yaml3barcode, umi, cDNAyaml
scirnaseq.rgn.yaml10i5_barcode, truseq_adapter, hairpin_barcode, linker, umi, rt_barcode, cdna, me, s7, i7_barcodeyaml
smartseq2.rgn.yaml1cDNAyaml
smartseq3-express.rgn.yaml4linker-1, umi, linker-2, cDNAyaml
smartseq3.rgn.yaml4linker-1, umi, linker-2, cDNAyaml
strtseq.rgn.yaml8well-barcode, linker-1, umi, linker-2, cDNA, ME, linker-3, subarray-barcodeyaml
surecell_3p_wta.yaml9surecell_3p_wta_bc1, surecell_3p_wta_linker1, surecell_3p_wta_bc2, surecell_3p_wta_linker2, surecell_3p_wta_bc3, surecell_3p_wta_linker3, umi, surecell_3p_wta_linker4, cDNAyaml
truseq_r1.rgn.yaml1truseq_read1yaml
truseq_r2.rgn.yaml1truseq_read2yaml
umi.rgn.yaml1umiyaml
+ + + diff --git a/docs/regions/nextera_r1_joined.rgn.yaml b/docs/regions/nextera_r1_joined.rgn.yaml deleted file mode 100644 index 8aea78b1..00000000 --- a/docs/regions/nextera_r1_joined.rgn.yaml +++ /dev/null @@ -1,27 +0,0 @@ -- region_id: nextera_read1 - region_type: nextera_read1 - name: nextera_read1 - sequence_type: fixed - sequence: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG - min_len: 33 - max_len: 33 - onlist: null - regions: - - region_id: s5 - region_type: s5 - name: s5 - sequence_type: fixed - sequence: TCGTCGGCAGCGTC - min_len: 14 - max_len: 14 - onlist: null - regions: [] - - region_id: ME1 - region_type: ME1 - name: ME1 - sequence_type: fixed - sequence: AGATGTGTATAAGAGACAG - min_len: 19 - max_len: 19 - onlist: null - regions: [] diff --git a/docs/regions/nextera_r2_joined.rgn.yaml b/docs/regions/nextera_r2_joined.rgn.yaml deleted file mode 100644 index feed791d..00000000 --- a/docs/regions/nextera_r2_joined.rgn.yaml +++ /dev/null @@ -1,27 +0,0 @@ -- region_id: nextera_read2 - region_type: nextera_read2 - name: nextera_read2 - sequence_type: joined - sequence: CTGTCTCTTATACACATCTCCGAGCCCACGAGAC - min_len: 34 - max_len: 34 - onlist: null - regions: - - region_id: ME2 - region_type: ME2 - name: ME2 - sequence_type: fixed - sequence: CTGTCTCTTATACACATCT - min_len: 19 - max_len: 19 - onlist: null - regions: [] - - region_id: s7 - region_type: s7 - name: s7 - sequence_type: fixed - sequence: CCGAGCCCACGAGAC - min_len: 15 - max_len: 15 - onlist: null - regions: [] diff --git a/myst.yml b/myst.yml index 086af85e..02736330 100644 --- a/myst.yml +++ b/myst.yml @@ -31,14 +31,13 @@ project: site: template: book-theme title: seqspec - logo_text: Machine readable specification for genomics assays actions: - title: GitHub url: https://github.com/pachterlab/seqspec - nav: [] - # https://mystmd.org/guide/website-templates - # - title: Examples - # url: ./builder.html - # options: - # favicon: favicon.ico - # logo: site_logo.png + nav: + - title: Examples + url: 
https://pachterlab.github.io/seqspec/examples/ + options: + logo_text: Machine readable specification for genomics assays + # favicon: favicon.ico + # logo: site_logo.png diff --git a/pyproject.toml b/pyproject.toml index cf995cc6..f605dd76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,11 @@ [build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" +requires = ["maturin>=1.9,<2.0"] +build-backend = "maturin" [project] name = "seqspec" version = "0.4.0" + description = "A tool for working with sequencing specifications" readme = "README.md" requires-python = ">=3.12" @@ -20,6 +21,9 @@ classifiers = [ "Programming Language :: Python :: 3.13", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Utilities", + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ "pyyaml>=6.0", @@ -60,7 +64,7 @@ license-files = ["LICENSE"] include = ["seqspec", "seqspec.*"] [tool.setuptools.package-data] -"seqspec" = ["schema/*"] +"seqspec" = ["schema/*", "py.typed", "_core.pyi", "report_assets/*"] [tool.pytest.ini_options] testpaths = ["tests"] @@ -98,4 +102,11 @@ dev = [ "pre-commit>=4.2.0", "build>=1.0.0", "ruff>=0.9.0", -] \ No newline at end of file +] + + +[tool.maturin] +# Install the PyO3 extension module as seqspec._core; the Rust #[pymodule] name is expected to match the final component (_core) +module-name = "seqspec._core" +features = ["python-binding"] +python-source = "." # ship the pure-Python seqspec/ package (at the repo root) alongside the native module diff --git a/seqspec/Assay.py b/seqspec/Assay.py index a9ecb805..5a96f0a0 100644 --- a/seqspec/Assay.py +++ b/seqspec/Assay.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union +from typing import Iterable, List, Optional, Type, Union import yaml from pydantic import BaseModel, Field, PrivateAttr @@ -8,26 +8,16 @@ from . 
import __version__ +# from ._core import Assay as _RustAssay +# from ._core import Read as _RustRead +# from ._core import Region as _RustRegion + class SeqProtocol(BaseModel): protocol_id: Optional[str] = Field(default_factory=lambda: "auto-id") name: str modality: str - def to_dict(self): - return self.model_dump() - - def update_from(self, patch: Union["SeqProtocol", "SeqProtocolInput"]) -> None: - if isinstance(patch, SeqProtocolInput): - for field in patch.model_fields_set: - value = getattr(patch, field) - if value is not None: - setattr(self, field, value) - return - if isinstance(patch, SeqProtocol): - for field in self.model_fields.keys(): # type: ignore[attr-defined] - setattr(self, field, getattr(patch, field)) - class SeqProtocolInput(BaseModel): """ @@ -68,20 +58,6 @@ class SeqKit(BaseModel): name: Optional[str] modality: str - def to_dict(self): - return self.model_dump() - - def update_from(self, patch: Union["SeqKit", "SeqKitInput"]) -> None: - if isinstance(patch, SeqKitInput): - for field in patch.model_fields_set: - value = getattr(patch, field) - if value is not None: - setattr(self, field, value) - return - if isinstance(patch, SeqKit): - for field in self.model_fields.keys(): # type: ignore[attr-defined] - setattr(self, field, getattr(patch, field)) - class SeqKitInput(BaseModel): """ @@ -118,20 +94,6 @@ class LibProtocol(BaseModel): name: str modality: str - def to_dict(self): - return self.model_dump() - - def update_from(self, patch: Union["LibProtocol", "LibProtocolInput"]) -> None: - if isinstance(patch, LibProtocolInput): - for field in patch.model_fields_set: - value = getattr(patch, field) - if value is not None: - setattr(self, field, value) - return - if isinstance(patch, LibProtocol): - for field in self.model_fields.keys(): # type: ignore[attr-defined] - setattr(self, field, getattr(patch, field)) - class LibProtocolInput(BaseModel): """ @@ -170,20 +132,6 @@ class LibKit(BaseModel): name: Optional[str] modality: str - def 
to_dict(self): - return self.model_dump() - - def update_from(self, patch: Union["LibKit", "LibKitInput"]) -> None: - if isinstance(patch, LibKitInput): - for field in patch.model_fields_set: - value = getattr(patch, field) - if value is not None: - setattr(self, field, value) - return - if isinstance(patch, LibKit): - for field in self.model_fields.keys(): # type: ignore[attr-defined] - setattr(self, field, getattr(patch, field)) - class LibKitInput(BaseModel): """ @@ -215,6 +163,60 @@ def to_libkit(self) -> LibKit: ) +def coerce_protocol_kit_list(value, cls: Type[BaseModel], modalities: Iterable[str]): + """ + Coerce a string or list of strings/objects/dicts into a list of protocol/kit objects (or None). + + Supports: + - "NovaSeq" -> [cls(protocol_id|kit_id="NovaSeq", name="NovaSeq", modality=m) for m in modalities] + - ["A","B"] -> expanded per modality + - [{"protocol_id": "...", ...}] -> cls(**dict) + - [cls(...), "X", {...}] -> mixed inputs + """ + if value is None: + return None + + # identify target family (protocol vs kit) by class + is_protocol = cls.__name__ in {"SeqProtocol", "LibProtocol"} + is_kit = cls.__name__ in {"SeqKit", "LibKit"} + + if not (is_protocol or is_kit): + raise ValueError("cls must be one of: SeqProtocol, LibProtocol, SeqKit, LibKit") + + def make_obj(val, modality: str): + if isinstance(val, cls): + return val + if isinstance(val, dict): + return cls(**val) + if isinstance(val, str): + if is_protocol: + return cls(protocol_id=val, name=val, modality=modality) + else: + return cls(kit_id=val, name=val, modality=modality) + raise TypeError(f"Unsupported item type for {cls.__name__}: {type(val)!r}") + + if isinstance(value, str): + return [make_obj(value, m) for m in modalities] + + if isinstance(value, list): + out = [] + for item in value: + if isinstance(item, str): + out.extend(make_obj(item, m) for m in modalities) + else: + # dict or already-typed object: keep as a single item + # if it lacks modality, caller's responsibility 
(your spec usually includes it) + out.append(make_obj(item, next(iter(modalities), ""))) + return out + + # already a typed object (rare), wrap into list + if isinstance(value, cls): + return [value] + + # last resort: pass through + return value + + class Assay(BaseModel): seqspec_version: Optional[str] = __version__ assay_id: str @@ -236,6 +238,9 @@ class Assay(BaseModel): # Not part of the public schema; populated when loading from disk. _spec_path: Optional[str] = PrivateAttr(default=None) + def model_post_init(self, __context) -> None: + self.normalize_protocols_kits() + def __repr__(self) -> str: rds = [] rgns = [] @@ -254,15 +259,8 @@ def __repr__(self) -> str: Regions: {"\n".join(rgns)} """ - # return str(self.model_dump()) return s - def to_dict(self): - return self.model_dump() - - def to_JSON(self): - return self.model_dump_json(indent=4) - def to_YAML(self, fname: Optional[str] = None): yaml_str = yaml.dump(self.model_dump(), sort_keys=False) if fname is None: @@ -277,6 +275,7 @@ def print_sequence(self): print("\n", end="") def update_spec(self): + self.normalize_protocols_kits() for r in self.library_spec: r.update_attr() @@ -357,7 +356,81 @@ def insert_reads( self.sequence_spec.insert(insert_idx, read) insert_idx += 1 - # update_from removed per new approach + def normalize_protocols_kits(self) -> None: + """Normalize str-valued protocol/kit fields into lists of objects.""" + self.sequence_protocol = coerce_protocol_kit_list( + self.sequence_protocol, SeqProtocol, self.modalities + ) + self.sequence_kit = coerce_protocol_kit_list( + self.sequence_kit, SeqKit, self.modalities + ) + self.library_protocol = coerce_protocol_kit_list( + self.library_protocol, LibProtocol, self.modalities + ) + self.library_kit = coerce_protocol_kit_list( + self.library_kit, LibKit, self.modalities + ) + + +# class RustAssay: +# __slots__ = ("_inner",) + +# def __init__(self, inner: _RustAssay) -> None: +# object.__setattr__(self, "_inner", inner) + +# # generic forwarding 
+# def __getattr__(self, name): +# return getattr(self._inner, name) + +# def __setattr__(self, name, value): +# if name == "_inner": +# return object.__setattr__(self, name, value) +# return setattr(self._inner, name, value) + +# # constructors +# @classmethod +# def from_model(cls, m: "Assay") -> "RustAssay": +# return cls(_RustAssay.from_json(m.model_dump_json())) + +# def snapshot(self) -> "Assay": +# return Assay.model_validate_json(self._inner.to_json()) + +# # helpers: DTO outputs for downstream Python code +# def list_modalities(self) -> List[str]: +# return list(self._inner.list_modalities()) + +# def get_libspec(self, modality: str) -> Region: +# r: _RustRegion = self._inner.get_libspec(modality) +# return Region.model_validate_json(r.to_json()) + +# def get_seqspec(self, modality: str) -> List[Read]: +# rlist: List[_RustRead] = self._inner.get_seqspec(modality) +# return [Read.model_validate_json(r.to_json()) for r in rlist] + +# def get_read(self, read_id: str) -> Read: +# r: _RustRead = self._inner.get_read(read_id) +# return Read.model_validate_json(r.to_json()) + +# def update_spec(self) -> None: +# self._inner.update_spec() + +# def insert_reads( +# self, reads: List[Read], modality: str, after: Optional[str] = None +# ) -> None: +# # Convert DTOs to Rust via JSON (serde builds Vec) +# raw: List[_RustRead] = [_RustRead.from_json(r.model_dump_json()) for r in reads] +# self._inner.insert_reads(raw, modality, after) + +# def insert_regions( +# self, regions: List[Region], modality: str, after: Optional[str] = None +# ) -> None: +# raw: List[_RustRegion] = [ +# _RustRegion.from_json(r.model_dump_json()) for r in regions +# ] +# self._inner.insert_regions(raw, modality, after) + +# def __repr__(self) -> str: +# return self._inner.__repr__() class AssayInput(BaseModel): diff --git a/seqspec/File.py b/seqspec/File.py index d0b07df6..1a1a3000 100644 --- a/seqspec/File.py +++ b/seqspec/File.py @@ -3,6 +3,10 @@ from pydantic import BaseModel, Field +# from 
._core import File as _RustFile + +__all__ = ["File"] + class File(BaseModel): file_id: str @@ -13,15 +17,10 @@ class File(BaseModel): urltype: str md5: str - def __repr__(self) -> str: - s = f"""{self.file_id}""" - return s - - def to_dict(self): - return self.model_dump() + # add an update_spec attr that computes the md5 for the object - def update_file_id(self, file_id: str): - self.file_id = file_id + def __repr__(self) -> str: + return self.file_id class FileInput(BaseModel): @@ -82,13 +81,61 @@ class FileInput(BaseModel): ) def to_file(self) -> File: + # derive defaults from filename when needed + fname = self.filename or "" return File( - file_id=self.file_id or (Path(self.filename).name if self.filename else ""), - filename=self.filename or "", - filetype=self.filetype - or (Path(self.filename).suffix.lstrip(".") if self.filename else ""), + file_id=self.file_id or (Path(fname).name if fname else ""), + filename=fname, + filetype=self.filetype or (Path(fname).suffix.lstrip(".") if fname else ""), filesize=self.filesize or 0, url=self.url or "", urltype=self.urltype or "local", md5=self.md5 or "", ) + + +# class RustFile: +# __slots__ = ("_inner",) + +# def __init__(self, inner: _RustFile) -> None: +# self._inner = inner + +# @classmethod +# def new( +# cls, +# *, +# file_id: str, +# filename: str, +# filetype: str, +# filesize: int, +# url: str, +# urltype: str, +# md5: str, +# ) -> "RustFile": +# return cls( +# _RustFile(file_id, filename, filetype, int(filesize), url, urltype, md5) +# ) + +# def __getattr__(self, name): +# # called only if attribute not found on Rust object itself +# return getattr(self._inner, name) + +# def __setattr__(self, name, value): +# if name == "_inner": +# object.__setattr__(self, name, value) +# else: +# setattr(self._inner, name, value) + +# @classmethod +# def from_model(cls, m: File) -> "RustFile": +# return cls(_RustFile.from_json(m.model_dump_json())) + +# @classmethod +# def from_input(cls, i: FileInput) -> "RustFile": 
+# return cls.from_model(i.to_file()) + +# def snapshot(self) -> File: +# return File.model_validate_json(self._inner.to_json()) + +# def __repr__(self) -> str: +# return f"RustFile(file_id={self.file_id!r}, filename={self.filename!r}, size={self.filesize})" diff --git a/seqspec/Read.py b/seqspec/Read.py index ed4cb703..6b8c0077 100644 --- a/seqspec/Read.py +++ b/seqspec/Read.py @@ -5,6 +5,8 @@ from seqspec.File import File, FileInput from seqspec.Region import RegionCoordinate +# from ._core import Read as _RustRead + class Read(BaseModel): read_id: str @@ -22,12 +24,8 @@ def set_files(self, files: List[File] = []): def __repr__(self) -> str: strand = "+" if self.strand == "pos" else "-" s = f"""{strand}({self.min_len}, {self.max_len}){self.read_id}:{self.primer_id}""" - # return str(self.model_dump()) return s - def to_dict(self): - return self.model_dump() - def update_read_by_id( self, read_id=None, @@ -39,21 +37,21 @@ def update_read_by_id( strand=None, files=None, ): - if read_id: + if read_id is not None: self.read_id = read_id - if name: + if name is not None: self.name = name - if modality: + if modality is not None: self.modality = modality - if primer_id: + if primer_id is not None: self.primer_id = primer_id - if min_len: + if min_len is not None: self.min_len = min_len - if max_len: + if max_len is not None: self.max_len = max_len - if strand: + if strand is not None: self.strand = strand - if files: + if files is not None: self.files = files def get_read_by_file_id(self, file_id: str): @@ -62,7 +60,63 @@ def get_read_by_file_id(self, file_id: str): return self return None - # update_from removed per new approach + +# class RustRead: +# __slots__ = ("_inner",) + +# def __init__(self, inner: _RustRead) -> None: +# object.__setattr__(self, "_inner", inner) + +# # Generic forwarding + +# def __getattr__(self, name): +# # called only if attribute not found on Rust object itself +# return getattr(self._inner, name) + +# def __setattr__(self, name, value): 
+# if name == "_inner": +# object.__setattr__(self, name, value) +# else: +# setattr(self._inner, name, value) + +# @classmethod +# def new( +# cls, +# *, +# read_id: str, +# name: str, +# modality: str, +# primer_id: str, +# min_len: int, +# max_len: int, +# strand: str, +# files: List[RustFile] | None = None, +# ) -> "RustRead": +# rust_files = [f._inner for f in (files or [])] # pass raw RustFile inners +# inner = _RustRead( +# read_id, +# name, +# modality, +# primer_id, +# int(min_len), +# int(max_len), +# strand, +# rust_files, +# ) +# return cls(inner) + +# # convenience constructor: accept a Pydantic Read DTO +# @classmethod +# def from_model(cls, m: Read) -> "RustRead": +# # serde in Rust will build Vec from the nested DTOs +# return cls(_RustRead.from_json(m.model_dump_json())) + +# def snapshot(self) -> "Read": +# # Convert back into your Pydantic DTO +# return Read.model_validate_json(self._inner.to_json()) + +# def __repr__(self) -> str: +# return self._inner.__repr__() # uses Rust __repr__ class ReadCoordinate(BaseModel): diff --git a/seqspec/Region.py b/seqspec/Region.py index b1043c62..2f38b3e6 100644 --- a/seqspec/Region.py +++ b/seqspec/Region.py @@ -3,6 +3,9 @@ from pydantic import BaseModel, Field +# from ._core import Onlist as _RustOnlist +# from ._core import Region as _RustRegion + class SequenceType(str, Enum): FIXED = "fixed" @@ -57,11 +60,7 @@ class Onlist(BaseModel): urltype: str md5: str - def update_from(self, patch: "OnlistInput") -> None: - for field in patch.model_fields_set: - value = getattr(patch, field) - if value is not None: - setattr(self, field, value) + # add a update_spec attribute that computes the md5 for the object class OnlistInput(BaseModel): @@ -125,6 +124,39 @@ def to_onlist(self) -> Onlist: ) +# class RustOnlist: +# __slots__ = ("_inner",) + +# def __init__(self, inner: _RustOnlist) -> None: +# object.__setattr__(self, "_inner", inner) + +# # Generic forwarding +# def __getattr__(self, name): # only called if not 
found on self +# return getattr(self._inner, name) + +# def __setattr__(self, name, value): +# # write straight into PyO3 fields since you use #[pyo3(get, set)] +# setattr(self._inner, name, value) + +# @classmethod +# def new(cls, **kwargs) -> "RustOnlist": +# return cls(_RustOnlist(**kwargs)) + +# @classmethod +# def from_model(cls, m: Onlist) -> "RustOnlist": +# return cls(_RustOnlist.from_json(m.model_dump_json())) + +# @classmethod +# def from_input(cls, i: OnlistInput) -> "RustOnlist": +# return cls.from_model(i.to_onlist()) + +# def snapshot(self) -> Onlist: +# return Onlist.model_validate_json(self._inner.to_json()) + +# def __repr__(self) -> str: +# return f"RustOnlist(file_id={self._inner.file_id!r}, filename={self._inner.filename!r})" + + class Region(BaseModel): region_id: str region_type: Union[str, RegionType] @@ -208,8 +240,6 @@ def get_onlist(self) -> Optional[Onlist]: """Get the onlist associated with this region.""" return self.onlist - # update_from removed per new approach - def get_leaves(self, leaves: Optional[List["Region"]] = None) -> List["Region"]: # print(leaves) if leaves is None: @@ -282,13 +312,20 @@ def update_region_by_id( target = self.get_region_by_id(target_region_id) if target: r = target[0] - r.region_id = region_id or r.region_id - r.region_type = region_type or r.region_type - r.name = name or r.name - r.sequence_type = sequence_type or r.sequence_type - r.sequence = sequence or r.sequence - r.min_len = min_len or r.min_len - r.max_len = max_len or r.max_len + if region_id is not None: + r.region_id = region_id + if region_type is not None: + r.region_type = region_type + if name is not None: + r.name = name + if sequence_type is not None: + r.sequence_type = sequence_type + if sequence is not None: + r.sequence = sequence + if min_len is not None: + r.min_len = min_len + if max_len is not None: + r.max_len = max_len def reverse(self): if self.regions: @@ -308,6 +345,175 @@ def complement(self): Region.model_rebuild() +# class 
RustRegion: +# __slots__ = ("_inner",) + +# def __init__(self, inner: _RustRegion) -> None: +# object.__setattr__(self, "_inner", inner) + +# # generic forwarding +# def __getattr__(self, name): +# return getattr(self._inner, name) + +# def __setattr__(self, name, value): +# if name == "_inner": +# return object.__setattr__(self, name, value) +# # strict: if setting 'regions' or 'onlist', expect Rust proxies +# if name == "regions": +# coerced = [r._inner for r in value] if value else [] +# return setattr(self._inner, "regions", coerced) +# if name == "onlist": +# coerced = value._inner if value is not None else None +# return setattr(self._inner, "onlist", coerced) +# return setattr(self._inner, name, value) + +# # constructors +# @classmethod +# def new( +# cls, +# *, +# region_id: str, +# region_type: str, +# name: str, +# sequence_type: str, +# sequence: str = "", +# min_len: int = 0, +# max_len: int = 1024, +# onlist: Optional["RustOnlist"] = None, +# regions: Optional[List["RustRegion"]] = None, +# ) -> "RustRegion": +# raw_onlist: Optional[_RustOnlist] = ( +# onlist._inner if onlist is not None else None +# ) +# raw_regions: List[_RustRegion] = [r._inner for r in (regions or [])] +# inner = _RustRegion( +# region_id, +# region_type, +# name, +# sequence_type, +# sequence, +# int(min_len), +# int(max_len), +# raw_onlist, +# raw_regions, +# ) +# return cls(inner) + +# @classmethod +# def from_model(cls, m: "Region") -> "RustRegion": +# # leverage serde to build nested regions/onlist +# return cls(_RustRegion.from_json(m.model_dump_json())) + +# def snapshot(self) -> "Region": +# return Region.model_validate_json(self._inner.to_json()) + +# # conversions for list-returning methods +# @staticmethod +# def _to_dto_list(items: List[_RustRegion]) -> List["Region"]: +# return [Region.model_validate_json(it.to_json()) for it in items] + +# # wrappers with DTO output +# def get_sequence(self) -> str: +# return self._inner.get_sequence() + +# def get_len(self) -> 
tuple[int, int]: +# mn, mx = self._inner.get_len() +# return int(mn), int(mx) + +# def update_attr(self) -> None: +# self._inner.update_attr() + +# def get_region_by_id(self, region_id: str) -> List["Region"]: +# items = self._inner.get_region_by_id(region_id) +# rgns = [Region.model_validate_json(it.to_json()) for it in items] +# return rgns + +# def get_region_by_region_type(self, region_type: str) -> List["Region"]: +# items = self._inner.get_region_by_region_type(region_type) +# rgns = [Region.model_validate_json(it.to_json()) for it in items] +# return rgns + +# def get_onlist_regions(self) -> List["Region"]: +# items = self._inner.get_onlist_regions() +# rgns = [Region.model_validate_json(it.to_json()) for it in items] +# return rgns + +# def get_onlist(self) -> Optional[Onlist]: +# ol = self._inner.get_onlist() +# return Onlist.model_validate_json(ol.to_json()) if ol is not None else None + +# def get_leaves(self) -> List["Region"]: +# items = self._inner.get_leaves() +# rgns = [Region.model_validate_json(it.to_json()) for it in items] +# return rgns + +# def get_leaves_with_region_id(self, region_id: str) -> List["Region"]: +# items = self._inner.get_leaves_with_region_id(region_id) +# rgns = [Region.model_validate_json(it.to_json()) for it in items] +# return rgns + +# def get_leaf_region_types(self) -> Set[str]: +# return set(self._inner.get_leaf_region_types()) + +# def to_newick(self, n: str = "") -> str: +# # 'n' is unused on Rust side, kept for parity with Python signature +# return self._inner.to_newick() + +# def update_region( +# self, +# region_id: str, +# region_type: str, +# name: str, +# sequence_type: str, +# sequence: str, +# min_len: int, +# max_len: int, +# onlist: Optional[Onlist], +# ) -> None: +# raw_ol = _RustOnlist.from_json(onlist.model_dump_json()) if onlist else None +# self._inner.update_region( +# region_id, +# region_type, +# name, +# sequence_type, +# sequence, +# int(min_len), +# int(max_len), +# raw_ol, +# ) + +# def 
update_region_by_id( +# self, +# target_region_id: str, +# region_id: Optional[str] = None, +# region_type: Optional[str] = None, +# name: Optional[str] = None, +# sequence_type: Optional[str] = None, +# sequence: Optional[str] = None, +# min_len: Optional[int] = None, +# max_len: Optional[int] = None, +# ) -> None: +# self._inner.update_region_by_id( +# target_region_id, +# region_id, +# region_type, +# name, +# sequence_type, +# sequence, +# min_len, +# max_len, +# ) + +# def reverse(self) -> None: +# self._inner.reverse() + +# def complement(self) -> None: +# self._inner.complement() + +# def __repr__(self) -> str: +# return self._inner.__repr__() + + class RegionInput(BaseModel): """ Input payload for constructing a `Region` (node in the library structure). diff --git a/seqspec/_core.pyi b/seqspec/_core.pyi new file mode 100644 index 00000000..d1cb5084 --- /dev/null +++ b/seqspec/_core.pyi @@ -0,0 +1,244 @@ +from typing import List, Optional, Tuple + +class File: + file_id: str + filename: str + filetype: str + filesize: int + url: str + urltype: str + md5: str + + def __init__( + self, + file_id: str, + filename: str, + filetype: str, + filesize: int, + url: str, + urltype: str, + md5: str, + ) -> None: ... + @staticmethod + def from_json(json_str: str) -> "File": ... + def to_json(self) -> str: ... + +class Onlist: + file_id: str + filename: str + filetype: str + filesize: int + url: str + urltype: str + md5: str + + def __init__( + self, + file_id: str, + filename: str, + filetype: str, + filesize: int, + url: str, + urltype: str, + md5: str, + ) -> None: ... + @staticmethod + def from_json(json_str: str) -> "Onlist": ... + def to_json(self) -> str: ... 
+ +class Read: + read_id: str + name: str + modality: str + primer_id: str + min_len: int + max_len: int + strand: str + files: list[File] + + def __init__( + self, + read_id: str, + name: str, + modality: str, + primer_id: str, + min_len: int, + max_len: int, + strand: str, + files: list[File] = ..., + ) -> None: ... + @staticmethod + def from_json(json_str: str) -> "Read": ... + def to_json(self) -> str: ... + def update_files(self, files: list[File]) -> None: ... + def update_read_by_id( + self, + read_id: str | None, + name: str | None, + modality: str | None, + primer_id: str | None, + min_len: int | None, + max_len: int | None, + strand: str | None, + files: list[File] | None, + ) -> None: ... + def get_read_by_file_id(self, file_id: str) -> "Read | None": ... + +# seqspec/_core.pyi (add alongside File/Onlist/Read) + +class Region: + region_id: str + region_type: str + name: str + sequence_type: str + sequence: str + min_len: int + max_len: int + onlist: Optional[Onlist] + regions: List["Region"] + + def __init__( + self, + region_id: str, + region_type: str, + name: str, + sequence_type: str, + sequence: str, + min_len: int, + max_len: int, + onlist: Optional[Onlist] = ..., + regions: List["Region"] = ..., + ) -> None: ... + @staticmethod + def from_json(json_str: str) -> "Region": ... + def to_json(self) -> str: ... + + # Core helpers + def get_sequence(self) -> str: ... + def get_len(self) -> Tuple[int, int]: ... + def update_attr(self) -> None: ... + + # Queries + def get_region_by_id(self, region_id: str) -> List["Region"]: ... + def get_region_by_region_type(self, region_type: str) -> List["Region"]: ... + def get_onlist_regions(self) -> List["Region"]: ... + def get_onlist(self) -> Optional[Onlist]: ... + def get_leaves(self) -> List["Region"]: ... + def get_leaves_with_region_id(self, region_id: str) -> List["Region"]: ... + def get_leaf_region_types(self) -> List[str]: ... + def to_newick(self) -> str: ... 
+ + # Mutations + def update_region( + self, + region_id: str, + region_type: str, + name: str, + sequence_type: str, + sequence: str, + min_len: int, + max_len: int, + onlist: Optional[Onlist], + ) -> None: ... + def update_region_by_id( + self, + target_region_id: str, + region_id: Optional[str] = ..., + region_type: Optional[str] = ..., + name: Optional[str] = ..., + sequence_type: Optional[str] = ..., + sequence: Optional[str] = ..., + min_len: Optional[int] = ..., + max_len: Optional[int] = ..., + ) -> None: ... + + # Transforms + def reverse(self) -> None: ... + def complement(self) -> None: ... + def __repr__(self) -> str: ... + +# ---- Assay metadata records ---------------------------------------- +class SeqProtocol: + protocol_id: str + name: str + modality: str + def __init__(self, protocol_id: str, name: str, modality: str) -> None: ... + +class SeqKit: + kit_id: str + name: str | None + modality: str + def __init__(self, kit_id: str, name: str | None, modality: str) -> None: ... + +class LibProtocol: + protocol_id: str + name: str + modality: str + def __init__(self, protocol_id: str, name: str, modality: str) -> None: ... + +class LibKit: + kit_id: str + name: str | None + modality: str + def __init__(self, kit_id: str, name: str | None, modality: str) -> None: ... 
+ +# ---- Assay ---------------------------------------------------------- + +class Assay: + seqspec_version: Optional[str] + assay_id: str + name: str + doi: str + date: str + description: str + modalities: List[str] + lib_struct: str + + # lists of typed objects (or None) + sequence_protocol: Optional[List[SeqProtocol]] + sequence_kit: Optional[List[SeqKit]] + library_protocol: Optional[List[LibProtocol]] + library_kit: Optional[List[LibKit]] + + # specs + sequence_spec: List["Read"] + library_spec: List["Region"] + + def __init__( + self, + assay_id: str, + name: str, + doi: str, + date: str, + description: str, + modalities: List[str], + lib_struct: str, + sequence_spec: List["Read"] = ..., + library_spec: List["Region"] = ..., + sequence_protocol: Optional[List[SeqProtocol]] = ..., + sequence_kit: Optional[List[SeqKit]] = ..., + library_protocol: Optional[List[LibProtocol]] = ..., + library_kit: Optional[List[LibKit]] = ..., + seqspec_version: Optional[str] = ..., + ) -> None: ... + + # JSON I/O + @staticmethod + def from_json(json_str: str) -> "Assay": ... + def to_json(self) -> str: ... + + # helpers / queries + def update_spec(self) -> None: ... + def list_modalities(self) -> List[str]: ... + def get_libspec(self, modality: str) -> "Region": ... + def get_seqspec(self, modality: str) -> List["Read"]: ... + def get_read(self, read_id: str) -> "Read": ... + + # mutations + def insert_regions( + self, regions: List["Region"], modality: str, after: str | None = ... + ) -> None: ... + def insert_reads( + self, reads: List["Read"], modality: str, after: str | None = ... + ) -> None: ... + def __repr__(self) -> str: ... 
diff --git a/seqspec/auth.py b/seqspec/auth.py new file mode 100644 index 00000000..21abea24 --- /dev/null +++ b/seqspec/auth.py @@ -0,0 +1,233 @@ +import json +import os +import tomllib +from pathlib import Path +from typing import Dict, List, Optional, Tuple +from urllib.parse import urlparse + +AUTH_CONFIG_ENV = "SEQSPEC_AUTH_CONFIG" + + +class AuthProfile: + def __init__( + self, hosts: List[str], kind: str, username_env: str, password_env: str + ) -> None: + self.hosts = hosts + self.kind = kind + self.username_env = username_env + self.password_env = password_env + + @classmethod + def from_dict(cls, data: Dict) -> "AuthProfile": + return cls( + hosts=list(data.get("hosts", [])), + kind=str(data.get("kind", "basic")), + username_env=str(data["username_env"]), + password_env=str(data["password_env"]), + ) + + def to_dict(self) -> Dict: + return { + "hosts": self.hosts, + "kind": self.kind, + "username_env": self.username_env, + "password_env": self.password_env, + } + + def matches_host(self, host: str) -> bool: + return any(candidate.lower() == host.lower() for candidate in self.hosts) + + +class AuthRegistry: + def __init__(self, location: Dict, profiles: Dict[str, AuthProfile]) -> None: + self.location = location + self.profiles = profiles + + @classmethod + def load(cls) -> "AuthRegistry": + location = config_location() + profiles = load_profiles(location) + return cls(location, profiles) + + def profile_summaries(self) -> List[Dict]: + summaries = [] + for name, profile in self.profiles.items(): + summaries.append( + { + "name": name, + "kind": profile.kind, + "hosts": profile.hosts, + "username_env": profile.username_env, + "username_present": os.environ.get(profile.username_env) + is not None, + "password_env": profile.password_env, + "password_present": os.environ.get(profile.password_env) + is not None, + } + ) + return summaries + + def resolve_summary( + self, url: str, selected_profile: Optional[str] = None + ) -> Dict[str, object]: + host = 
host_from_url(url) + resolved = self.resolve_profile(host, selected_profile) + profile = None + if resolved is not None: + name, match = resolved + profile = { + "name": name, + "kind": match.kind, + "hosts": match.hosts, + "username_env": match.username_env, + "username_present": os.environ.get(match.username_env) is not None, + "password_env": match.password_env, + "password_present": os.environ.get(match.password_env) is not None, + } + return {"url": url, "host": host, "profile": profile} + + def resolve_profile( + self, host: str, selected_profile: Optional[str] = None + ) -> Optional[Tuple[str, AuthProfile]]: + if selected_profile is not None: + if selected_profile not in self.profiles: + raise ValueError( + f"auth profile '{selected_profile}' is not defined in {display_config_path(self.location)}" + ) + profile = self.profiles[selected_profile] + if not profile.matches_host(host): + raise ValueError( + f"auth profile '{selected_profile}' does not match host '{host}'" + ) + return (selected_profile, profile) + + matches = [ + (name, profile) + for name, profile in self.profiles.items() + if profile.matches_host(host) + ] + if len(matches) > 1: + names = ", ".join(name for name, _ in matches) + raise ValueError(f"multiple auth profiles match host '{host}': {names}") + if len(matches) == 1: + return matches[0] + return None + + def resolve_requests_auth( + self, url: str, selected_profile: Optional[str] = None + ) -> Optional[Tuple[str, str]]: + host = host_from_url(url) + resolved = self.resolve_profile(host, selected_profile) + if resolved is None: + return None + profile_name, profile = resolved + username = os.environ.get(profile.username_env) + if username is None: + raise ValueError( + f"auth profile '{profile_name}' requires env var '{profile.username_env}' for host '{host}'" + ) + password = os.environ.get(profile.password_env) + if password is None: + raise ValueError( + f"auth profile '{profile_name}' requires env var '{profile.password_env}' for host 
'{host}'" + ) + return (username, password) + + +def config_location() -> Dict[str, object]: + env_path = os.environ.get(AUTH_CONFIG_ENV) + if env_path: + path = Path(env_path) + return { + "path": path, + "source": f"env:{AUTH_CONFIG_ENV}", + "exists": path.exists(), + } + + xdg = os.environ.get("XDG_CONFIG_HOME") + if xdg: + path = Path(xdg) / "seqspec" / "auth.toml" + return {"path": path, "source": "xdg_config_home", "exists": path.exists()} + + home = os.environ.get("HOME") + if home: + path = Path(home) / ".config" / "seqspec" / "auth.toml" + return {"path": path, "source": "home_default", "exists": path.exists()} + + return {"path": None, "source": "unavailable", "exists": False} + + +def load_profiles(location: Dict[str, object]) -> Dict[str, AuthProfile]: + path = location.get("path") + if path is None: + return {} + + path = Path(path) + if not path.exists(): + if str(location["source"]).startswith("env:"): + raise ValueError(f"auth config does not exist: {path}") + return {} + + with open(path, "rb") as stream: + config = tomllib.load(stream) + + profiles = config.get("profiles", {}) + return {name: AuthProfile.from_dict(profile) for name, profile in profiles.items()} + + +def init_profile(profile_name: str, profile: AuthProfile) -> Dict[str, object]: + location = config_location() + path = location.get("path") + if path is None: + raise ValueError("no auth config path is available on this system") + + path = Path(path) + created_config = not path.exists() + profiles = {} + if path.exists(): + profiles = load_profiles(location) + updated_profile = profile_name in profiles + profiles[profile_name] = profile + + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(render_config(profiles)) + + return { + "profile": profile_name, + "path": str(path), + "created_config": created_config, + "updated_profile": updated_profile, + "hosts": profile.hosts, + "kind": profile.kind, + "username_env": profile.username_env, + "password_env": 
def _toml_string(value: object) -> str:
    """Render ``value`` as a quoted, escaped string literal."""
    # format() mirrors plain f-string interpolation; the JSON encoder escapes
    # quotes, backslashes, and control characters the way the host list does.
    return json.dumps(format(value))


def _toml_key(name: str) -> str:
    """Return ``name`` as a table-key segment, quoting it unless it is a bare key."""
    bare = name.isascii() and name.replace("-", "").replace("_", "").isalnum()
    return name if bare else json.dumps(name)


def render_config(profiles: Dict[str, AuthProfile]) -> str:
    """Serialize auth profiles to the text of the auth config file.

    Each profile becomes a ``[profiles.<name>]`` table with ``hosts``,
    ``kind``, ``username_env``, and ``password_env`` entries. Names that are
    not plain bare keys (for example names containing a dot) are quoted so
    they stay a single table name, and every string value is escaped.

    Args:
        profiles: Profiles keyed by profile name.

    Returns:
        The rendered config, always ending in exactly one newline.
    """
    lines: List[str] = []
    for name, profile in profiles.items():
        lines.append(f"[profiles.{_toml_key(name)}]")
        hosts = ", ".join(json.dumps(host) for host in profile.hosts)
        lines.append(f"hosts = [{hosts}]")
        lines.append(f"kind = {_toml_string(profile.kind)}")
        lines.append(f"username_env = {_toml_string(profile.username_env)}")
        lines.append(f"password_env = {_toml_string(profile.password_env)}")
        lines.append("")
    return "\n".join(lines).rstrip() + "\n"
@dataclass(frozen=True)
class ExamplePaths:
    """Immutable bundle of the locations that make up the examples tree."""

    # Repository root that every other path is derived from.
    repo_root: Path
    # ``docs/examples`` below the repository root.
    docs_examples: Path
    # Canonical example directories.
    assays_dir: Path
    reads_dir: Path
    regions_dir: Path
    # Generated static site and its per-assay pages.
    site_dir: Path
    site_assays_dir: Path
    # Manifest and README written by the build.
    manifest_path: Path
    readme_path: Path


def example_paths(repo_root: Path | None = None) -> ExamplePaths:
    """Resolve every path used by the examples tooling.

    Args:
        repo_root: Repository root. When omitted, the directory two levels
            above this module file is used.
    """
    if repo_root:
        base = repo_root
    else:
        base = Path(__file__).resolve().parent.parent
    examples = base / "docs" / "examples"
    site = examples / "site"
    return ExamplePaths(
        repo_root=base,
        docs_examples=examples,
        assays_dir=examples / "assays",
        reads_dir=examples / "reads",
        regions_dir=examples / "regions",
        site_dir=site,
        site_assays_dir=site / "assays",
        manifest_path=examples / "examples.yaml",
        readme_path=examples / "README.md",
    )
def validate_examples_tree(repo_root: Path | None = None) -> list[str]:
    """Validate the generated examples tree and return any problems.

    Each problem is reported as one message; an empty list means nothing was
    found. A failure while loading or checking a single assay is collected
    like any other problem, so one broken example does not hide the rest of
    the report.

    Args:
        repo_root: Repository root, forwarded to ``example_paths``.

    Returns:
        Error messages in the order they were discovered.
    """
    paths = example_paths(repo_root)
    errors: list[str] = []

    # Without a manifest there is nothing else that can be checked.
    if not paths.manifest_path.exists():
        return [f"missing manifest: {paths.manifest_path}"]

    manifest = yaml.safe_load(paths.manifest_path.read_text()) or {}
    assays = manifest.get("assays", [])

    for assay in assays:
        canonical_path = paths.docs_examples / assay["canonical_path"]
        if not canonical_path.exists():
            errors.append(f"missing canonical assay: {canonical_path}")
            continue

        try:
            spec = load_spec(canonical_path, strict=False)
        except Exception as exc:
            # Record the failure and keep going with the checks that do not
            # need a parsed spec.
            errors.append(f"failed to load canonical assay {canonical_path}: {exc}")
            spec = None

        if spec is not None and spec.seqspec_version != "0.4.0":
            errors.append(f"canonical assay does not declare 0.4.0: {canonical_path}")

        # Plain-text scan for YAML tags and field names from older spec layouts.
        raw_text = canonical_path.read_text()
        for token in (
            "!Assay",
            "!Read",
            "!Region",
            "!Onlist",
            "parent_id:",
            "location:",
        ):
            if token in raw_text:
                errors.append(f"legacy field '{token}' found in {canonical_path}")

        if spec is not None:
            try:
                html = print_seqspec_html(spec)
            except Exception as exc:  # pragma: no cover - surfaced in tests
                errors.append(f"failed to render HTML for {canonical_path}: {exc}")
            else:
                if "seqspec" not in html:
                    errors.append(f"unexpected HTML output for {canonical_path}")

            if assay["status"] == "validated":
                try:
                    diagnostics = seqspec_check(spec)
                except Exception as exc:
                    errors.append(
                        f"seqspec check failed to run for {canonical_path}: {exc}"
                    )
                else:
                    if diagnostics:
                        errors.append(
                            f"validated assay does not pass seqspec check: {canonical_path}"
                        )

        assay_html_path = paths.site_dir / assay["html_path"]
        if not assay_html_path.exists():
            errors.append(f"missing assay HTML page: {assay_html_path}")

    for page_name in ("index.html", "assays.html", "reads.html", "regions.html"):
        page_path = paths.site_dir / page_name
        if not page_path.exists():
            errors.append(f"missing site page: {page_path}")

    for forbidden in paths.docs_examples.rglob("*"):
        if forbidden.name in {".git", ".DS_Store"}:
            errors.append(f"forbidden path remains in examples tree: {forbidden}")

    return errors
def assay_notes(status: str, diagnostics: list[dict[str, Any]]) -> str:
    """Return the one-line manifest note for an assay.

    ``template`` and ``validated`` statuses map to fixed notes. For any other
    status the note depends on whether a diagnostic with ``error_type``
    ``check_schema`` is present.
    """
    fixed_notes = {
        "template": "Structure template with intentionally incomplete metadata.",
        "validated": "Current 0.4.0 example that passes seqspec check.",
    }
    note = fixed_notes.get(status)
    if note is not None:
        return note

    for diagnostic in diagnostics:
        if diagnostic["error_type"] == "check_schema":
            return "Current 0.4.0 example with intentionally incomplete local metadata."
    return "Current 0.4.0 example that loads and renders but is not fully check-clean."
def write_examples_readme(paths: ExamplePaths, manifest: dict[str, Any]) -> None:
    """Regenerate the README for docs/examples from the manifest.

    The status section reports how many manifest assays carry each of the
    ``template``, ``example``, and ``validated`` statuses; the rest of the
    text is fixed.
    """
    tally = {"template": 0, "example": 0, "validated": 0}
    for entry in manifest["assays"]:
        status = entry["status"]
        if status in tally:
            tally[status] += 1

    intro = [
        "# Examples",
        "",
        "This directory holds the maintained example material for `seqspec`.",
        "It has two parts: canonical examples and a generated HTML site.",
        "",
    ]
    layout = [
        "## Layout",
        "",
        "- `assays/`: maintained assay examples in current `0.4.0` structure",
        "- `reads/`: maintained read templates in current structure",
        "- `regions/`: maintained region templates in current structure",
        "- `site/`: generated static HTML pages built from the maintained examples",
        "- `examples.yaml`: manifest used for validation and site generation",
        "",
    ]
    status_section = [
        "## Status",
        "",
        f"- `{tally['template']}` `template` assays: intentionally incomplete structure templates",
        f"- `{tally['example']}` `example` assays: current examples that load and render but may not fully pass `seqspec check`",
        f"- `{tally['validated']}` `validated` assays: current examples that pass `seqspec check`",
        "",
    ]
    regenerate = [
        "## Regenerate",
        "",
        "Run:",
        "",
        "```bash",
        "uv run python docs/examples/build_examples.py",
        "```",
        "",
        "The script rewrites the maintained YAML, writes the manifest, and regenerates the site.",
    ]

    text = "\n".join(intro + layout + status_section + regenerate) + "\n"
    paths.readme_path.write_text(text)
def remove_superseded_example_sources(paths: ExamplePaths) -> None:
    """Delete the old example source directories under docs/examples, if present."""
    for stale_name in ("legacy", "seqspec-builder", "seqspec-builder1", "seqspec"):
        stale_dir = paths.docs_examples / stale_name
        if not stale_dir.exists():
            continue
        shutil.rmtree(stale_dir)
def infer_read_modality(read: dict[str, Any], modalities: list[str]) -> str | None:
    """Guess which assay modality a read belongs to.

    A single declared modality is returned unconditionally. Otherwise the
    first modality wins for which the read's ``read_id``, ``primer_id``, or
    ``name`` starts with that modality followed by ``-``, ``_``, or ``.``.
    Returns ``None`` when nothing matches.
    """
    if len(modalities) == 1:
        return modalities[0]

    labels = [str(read.get(field) or "") for field in ("read_id", "primer_id", "name")]
    for candidate in modalities:
        stems = tuple(f"{candidate}{separator}" for separator in "-_.")
        for label in labels:
            if label.startswith(stems):
                return candidate
    return None
def region_leaves(region: dict[str, Any]) -> list[dict[str, Any]]:
    """Collect the leaf regions below ``region`` in depth-first, left-to-right order.

    A node whose ``regions`` entry is missing or empty counts as a leaf, so a
    childless ``region`` yields a list containing only itself.
    """
    leaves: list[dict[str, Any]] = []
    pending = [region]
    while pending:
        node = pending.pop()
        children = node.get("regions") or []
        if children:
            # Push in reverse so the leftmost child is visited first.
            pending.extend(reversed(children))
        else:
            leaves.append(node)
    return leaves
def normalize_sequence_spec(
    reads: list[dict[str, Any]] | None,
    preserve_files: bool,
    template_reads: bool,
) -> list[dict[str, Any]]:
    """Normalize a sequence_spec list.

    Args:
        reads: Raw read mappings. ``None`` (for example a ``sequence_spec:``
            key with no value) is treated as an empty list, matching how the
            file and region normalizers treat missing lists.
        preserve_files: Passed to ``normalize_file_list`` as
            ``keep_without_url``.
        template_reads: When true, every read gets an empty ``files`` list.

    Returns:
        One mapping per read with a fixed key order.
    """
    normalized_reads: list[dict[str, Any]] = []
    for read in reads or []:
        normalized_files: list[dict[str, Any]]
        if template_reads:
            normalized_files = []
        else:
            normalized_files = normalize_file_list(
                read.get("files", []),
                keep_without_url=preserve_files,
            )
        normalized_reads.append(
            ordered_mapping(
                read_id=read.get("read_id"),
                name=read.get("name"),
                modality=read.get("modality"),
                primer_id=read.get("primer_id"),
                min_len=read.get("min_len"),
                max_len=read.get("max_len"),
                strand=read.get("strand"),
                files=normalized_files,
            )
        )
    return normalized_reads
def normalize_file_list(
    files: list[dict[str, Any]],
    keep_without_url: bool,
) -> list[dict[str, Any]]:
    """Normalize read file entries, dropping the ones that carry no information.

    Entries that ``normalize_file_dict`` rejects are skipped. Entries with an
    empty ``url`` are kept only when ``keep_without_url`` is true. A missing
    or empty ``files`` value yields an empty list.
    """
    kept: list[dict[str, Any]] = []
    for raw_file in files or []:
        row = normalize_file_dict(raw_file)
        if row and (keep_without_url or row["url"]):
            kept.append(row)
    return kept
def is_read_container_candidate(region: dict[str, Any]) -> bool:
    """Report whether a legacy region should be treated as a read container.

    A region qualifies when its ``region_type`` is ``fastq`` or ``gz``
    (case-insensitive), or when its id/name label matches
    ``READ_CONTAINER_PATTERN``.
    """
    kind = str(region.get("region_type") or "").lower()
    if kind == "fastq" or kind == "gz":
        return True
    label = "{} {}".format(region.get("region_id", ""), region.get("name", ""))
    return READ_CONTAINER_PATTERN.search(label) is not None
def build_read_lookup(
    sequence_spec: list[dict[str, Any]],
) -> dict[str, list[dict[str, Any]]]:
    """Index the normalized file lists of reads by slugged read id and name.

    Reads whose file list normalizes to nothing are left out. When two reads
    produce the same slug, the later one wins.
    """
    lookup: dict[str, list[dict[str, Any]]] = {}
    for read in sequence_spec:
        files = normalize_file_list(read.get("files", []), keep_without_url=False)
        if files:
            labels = [read.get(field) for field in ("read_id", "name")]
            lookup.update(
                (normalize_slug(str(label)), files) for label in labels if label
            )
    return lookup
def is_external_url(value: str) -> bool:
    """Return True when a metadata value looks like an external link.

    Only ``http``/``https`` URLs with a non-empty network location qualify.
    Empty values and values that ``urlparse`` rejects as malformed (for
    example an unbalanced ``[`` in the host) are reported as not external
    instead of raising.
    """
    if not value:
        return False
    try:
        parsed = urlparse(value)
    except ValueError:
        # urlparse raises on malformed bracketed hosts; such a value is
        # simply not a usable link.
        return False
    return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
sans-serif; + color: #1a1d21; + background: #ffffff; +} +.wrap { + max-width: 980px; + margin: 0 auto; + padding: 32px 24px 64px; +} +h1, h2 { + margin: 0 0 12px; + font-weight: 600; +} +p { + margin: 0 0 14px; + line-height: 1.6; +} +.nav { + display: flex; + gap: 18px; + margin: 18px 0 26px; + font-size: 14px; +} +.nav a, +a { + color: #1d4ed8; + text-decoration: none; +} +.nav a:hover, +a:hover { + text-decoration: underline; +} +.note { + margin: 0 0 18px; + padding: 12px 14px; + border: 1px solid #dcdfe3; + border-radius: 6px; + background: #f8fafc; +} +.search { + margin: 18px 0 20px; +} +.search input { + width: 100%; + padding: 10px 12px; + border: 1px solid #dcdfe3; + border-radius: 6px; + font: inherit; + color: inherit; + background: #ffffff; +} +.search input:focus { + outline: none; + border-color: #94a3b8; +} +.search-note { + margin-top: 8px; + font-size: 13px; + color: #68707a; +} +table { + width: 100%; + border-collapse: collapse; + font-size: 14px; +} +th, td { + border-bottom: 1px solid #e5e7eb; + padding: 10px 12px; + text-align: left; + vertical-align: top; +} +th { + font-size: 12px; + color: #68707a; + text-transform: uppercase; + letter-spacing: 0.04em; +} +.mono { + font-family: "IBM Plex Mono", Menlo, monospace; + font-size: 12px; +} +.tag { + display: inline-block; + padding: 2px 7px; + border-radius: 999px; + font-size: 11px; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.04em; +} +.tag.template { background: #f3f4f6; color: #4b5563; } +.tag.example { background: #eff6ff; color: #1d4ed8; } +.tag.validated { background: #f0fdf4; color: #166534; } +""" + + +def page_shell(title: str, body: str, script: str = "") -> str: + """Wrap one catalog page in a small self-contained HTML shell.""" + return ( + "\n" + '\n' + "\n" + ' \n' + ' \n' + f" {escape(title)}\n" + f" \n" + "\n" + "\n" + f"{body}\n" + f"{script}\n" + "\n" + "\n" + ) + + +def site_nav() -> str: + """Render the shared site navigation.""" + return ( + '" + ) 
+ + +def site_index_html(assays: list[dict[str, Any]]) -> str: + """Render the site landing page.""" + rows = "".join(assay_catalog_rows(assays)) + body = ( + '
' + "

seqspec examples

" + '

This site is generated from the maintained examples under docs/examples. ' + "It shows the current assay examples, read templates, and region templates.

" + '
' + '

Note. seqspec files are tightly tied to the data, assay design, sequencer, and library construction they describe. ' + 'These examples are representative specs derived from Teichlab library structures. ' + "They are useful reference points, but they may be incomplete or incorrect for any one concrete assay or dataset that a user defines.

" + "
" + f"{site_nav()}" + '

The assay pages below are rendered with seqspec print -f seqspec-html. ' + "Use the search box to filter the catalog.

" + '" + '' + "" + f"{rows}" + "
NameAssaySequencerModalitiesDateReportseqspec
" + "
" + ) + script = """ + +""" + return page_shell("seqspec examples", body, script) + + +def site_assays_html(assays: list[dict[str, Any]]) -> str: + """Render the assay catalog page.""" + body = ( + '
' + "

Assays

" + "

This catalog lists the maintained assay examples in current `0.4.0` structure.

" + f"{site_nav()}" + "" + "" + f"{''.join(assay_catalog_rows(assays, row_class=''))}" + "
NameAssaySequencerModalitiesDateReportseqspec
" + "
" + ) + return page_shell("seqspec assays", body) + + +def assay_catalog_rows( + assays: list[dict[str, Any]], + row_class: str = "assay-row", +) -> list[str]: + """Render the assay catalog rows used by the overview and assay pages.""" + rows: list[str] = [] + class_attr = f' class="{row_class}"' if row_class else "" + for assay in assays: + modalities = ", ".join(assay["modalities"]) + sequencer = assay["sequence_protocol"] or "\u2014" + date = assay["date"] or "\u2014" + assay_id = escape(assay["assay_id"]) + assay_link = assay.get("assay_link", "") + assay_cell = ( + f'{assay_id}' + if is_external_url(assay_link) + else assay_id + ) + rows.append( + f"" + f"{escape(assay['name'])}" + f'{assay_cell}' + f"{escape(sequencer)}" + f"{escape(modalities)}" + f"{escape(date)}" + f'view' + f'yaml' + "" + ) + return rows + + +def site_reads_html(paths: ExamplePaths) -> str: + """Render the read templates catalog page.""" + rows = [] + for read_path in sorted(paths.reads_dir.glob("*.yaml")): + payload = safe_load_strip_tags(read_path) + reads = payload.get("sequence_spec", []) + ids = ", ".join(str(read.get("read_id")) for read in reads) + rows.append( + "" + f"{escape(read_path.name)}" + f"{len(reads)}" + f'{escape(ids)}' + f'yaml' + "" + ) + + body = ( + '
' + "

Reads

" + "

This catalog lists the maintained read templates used by the assay examples.

" + f"{site_nav()}" + "" + "" + f"{''.join(rows)}" + "
TemplateReadsRead idsLink
" + "
" + ) + return page_shell("seqspec read templates", body) + + +def site_regions_html(paths: ExamplePaths) -> str: + """Render the region templates catalog page.""" + rows = [] + for region_path in sorted(paths.regions_dir.glob("*.yaml")): + payload = safe_load_strip_tags(region_path) + region_ids = ", ".join(str(region.get("region_id")) for region in payload) + rows.append( + "" + f"{escape(region_path.name)}" + f"{len(payload)}" + f'{escape(region_ids)}' + f'yaml' + "" + ) + + body = ( + '
' + "

Regions

" + "

This catalog lists the maintained region templates used by the assay examples.

" + f"{site_nav()}" + "" + "" + f"{''.join(rows)}" + "
TemplateRegionsRegion idsLink
" + "
" + ) + return page_shell("seqspec region templates", body) + + +def display_assay_name(name: Any, slug: str) -> str: + """Choose a stable display name for one assay entry.""" + text = text_or_empty(name) + if text and text.lower() != "example assay": + return text + return slug.replace("_", " ").replace("-", " ") + + +def display_sequence_protocol(value: Any) -> str: + """Choose a readable sequencer label from a protocol field.""" + if value in (None, "", []): + return "" + if isinstance(value, list): + labels: list[str] = [] + for item in value: + if isinstance(item, dict): + label = text_or_empty(item.get("name")) or text_or_empty( + item.get("protocol_id") + ) + else: + label = text_or_empty(getattr(item, "name", None)) or text_or_empty( + getattr(item, "protocol_id", None) + ) + if label and label not in labels: + labels.append(label) + return ", ".join(labels) + return text_or_empty(value) diff --git a/seqspec/main.py b/seqspec/main.py index e7598fdb..36845ba5 100644 --- a/seqspec/main.py +++ b/seqspec/main.py @@ -4,29 +4,61 @@ It handles argument parsing, command routing, and execution of subcommands. """ +import importlib import logging import sys import warnings from argparse import ArgumentParser, Namespace, RawTextHelpFormatter -from typing import Any, Callable, Dict +from typing import Any, Callable, Dict, Tuple from . 
import __version__ -from .seqspec_build import run_build, setup_build_args -from .seqspec_check import run_check, setup_check_args -from .seqspec_file import run_file, setup_file_args -from .seqspec_find import run_find, setup_find_args -from .seqspec_format import run_format, setup_format_args -from .seqspec_index import run_index, setup_index_args -from .seqspec_info import run_info, setup_info_args -from .seqspec_init import run_init, setup_init_args -from .seqspec_insert import run_insert, setup_insert_args -from .seqspec_methods import run_methods, setup_methods_args -from .seqspec_modify import run_modify, setup_modify_args -from .seqspec_onlist import run_onlist, setup_onlist_args -from .seqspec_print import run_print, setup_print_args -from .seqspec_split import run_split, setup_split_args -from .seqspec_upgrade import run_upgrade, setup_upgrade_args -from .seqspec_version import run_version, setup_version_args + +COMMAND_MODULES: Dict[str, Tuple[str, str, str]] = { + "auth": ("seqspec_auth", "setup_auth_args", "run_auth"), + "check": ("seqspec_check", "setup_check_args", "run_check"), + "file": ("seqspec_file", "setup_file_args", "run_file"), + "find": ("seqspec_find", "setup_find_args", "run_find"), + "format": ("seqspec_format", "setup_format_args", "run_format"), + "index": ("seqspec_index", "setup_index_args", "run_index"), + "info": ("seqspec_info", "setup_info_args", "run_info"), + "init": ("seqspec_init", "setup_init_args", "run_init"), + "insert": ("seqspec_insert", "setup_insert_args", "run_insert"), + "methods": ("seqspec_methods", "setup_methods_args", "run_methods"), + "modify": ("seqspec_modify", "setup_modify_args", "run_modify"), + "onlist": ("seqspec_onlist", "setup_onlist_args", "run_onlist"), + "print": ("seqspec_print", "setup_print_args", "run_print"), + "split": ("seqspec_split", "setup_split_args", "run_split"), + "upgrade": ("seqspec_upgrade", "setup_upgrade_args", "run_upgrade"), + "version": ("seqspec_version", 
"setup_version_args", "run_version"), +} + + +def load_command(command: str) -> Tuple[Callable, Callable]: + if command == "build": + return setup_build_args, run_build + + module_name, setup_name, run_name = COMMAND_MODULES[command] + module = importlib.import_module(f".{module_name}", __package__) + return getattr(module, setup_name), getattr(module, run_name) + + +def setup_build_args(parser) -> ArgumentParser: + subparser = parser.add_parser( + "build", + description=""" +The LLM-backed build command is deprecated and will be removed. +--- +""", + help="Deprecated. This command will be removed.", + formatter_class=RawTextHelpFormatter, + ) + return subparser + + +def run_build(_: ArgumentParser, __: Namespace) -> None: + raise RuntimeError( + "seqspec build is deprecated. Use seqspec init/insert/modify or construct the spec directly." + ) def setup_parser(): @@ -44,6 +76,7 @@ def setup_parser(): """, formatter_class=RawTextHelpFormatter, ) + parser.add_argument("--version", action="version", version=f"seqspec {__version__}") subparsers = parser.add_subparsers( dest="command", @@ -51,25 +84,12 @@ def setup_parser(): ) # Setup the arguments for all subcommands - command_to_parser = { - "build": setup_build_args(subparsers), - "check": setup_check_args(subparsers), - "find": setup_find_args(subparsers), - "file": setup_file_args(subparsers), - "format": setup_format_args(subparsers), - # "convert": setup_convert_args(subparsers), - "index": setup_index_args(subparsers), - "info": setup_info_args(subparsers), - "init": setup_init_args(subparsers), - "insert": setup_insert_args(subparsers), - "methods": setup_methods_args(subparsers), - "modify": setup_modify_args(subparsers), - "onlist": setup_onlist_args(subparsers), - "print": setup_print_args(subparsers), - "split": setup_split_args(subparsers), - "upgrade": setup_upgrade_args(subparsers), - "version": setup_version_args(subparsers), - } + command_to_parser = {} + for command in ["auth", "build", 
*COMMAND_MODULES.keys()]: + if command in command_to_parser: + continue + setup_func, _ = load_command(command) + command_to_parser[command] = setup_func(subparsers) return parser, command_to_parser @@ -89,8 +109,6 @@ def handle_no_args( if len(sys.argv) == 2: if sys.argv[1] in command_to_parser: command_to_parser[sys.argv[1]].print_help(sys.stderr) - elif sys.argv[1] == "--version": - print(f"seqspec {__version__}") else: parser.print_help(sys.stderr) sys.exit(1) @@ -100,6 +118,16 @@ def main() -> None: """Main entry point for the seqspec CLI.""" warnings.simplefilter("default", DeprecationWarning) + if len(sys.argv) == 2 and sys.argv[1] == "--version": + print(f"seqspec {__version__}") + sys.exit(0) + if len(sys.argv) >= 2 and sys.argv[1] == "build": + print( + "seqspec build is deprecated. Use seqspec init/insert/modify or construct the spec directly.", + file=sys.stderr, + ) + sys.exit(1) + logging.basicConfig( stream=sys.stderr, format="[%(levelname)s] %(message)s", @@ -111,25 +139,10 @@ def main() -> None: args = parser.parse_args() # Setup validator and runner for all subcommands - command_to_function: Dict[str, Callable[[ArgumentParser, Namespace], Any]] = { - "format": run_format, - "print": run_print, - "build": run_build, - "check": run_check, - "find": run_find, - "index": run_index, - "info": run_info, - "init": run_init, - "methods": run_methods, - "modify": run_modify, - "onlist": run_onlist, - "split": run_split, - "version": run_version, - "file": run_file, - "upgrade": run_upgrade, - # "convert": run_convert, - "insert": run_insert, - } + command_to_function: Dict[str, Callable[[ArgumentParser, Namespace], Any]] = {} + for command in command_to_parser: + _, run_func = load_command(command) + command_to_function[command] = run_func try: command_to_function[sys.argv[1]](parser, args) diff --git a/seqspec/py.typed b/seqspec/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/seqspec/report_assets/__init__.py 
b/seqspec/report_assets/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/seqspec/report_assets/__init__.py @@ -0,0 +1 @@ + diff --git a/seqspec/report_assets/app.js b/seqspec/report_assets/app.js new file mode 100644 index 00000000..53c94860 --- /dev/null +++ b/seqspec/report_assets/app.js @@ -0,0 +1,632 @@ +(function () { + const dataElement = document.getElementById("seqspec-view-data"); + const app = document.getElementById("app"); + const tip = document.getElementById("mol-tip"); + const data = JSON.parse(dataElement.textContent); + const repositoryUrl = + window.SEQSPEC_REPOSITORY || "https://github.com/pachterlab/seqspec"; + const toolVersion = window.SEQSPEC_TOOL_VERSION || data.seqspec_version || ""; + const selected = {}; + + function esc(value) { + if (value == null || value === "") { + return "\u2014"; + } + const div = document.createElement("div"); + div.textContent = String(value); + return div.innerHTML; + } + + function fmtValue(value) { + if (value == null || value === "") { + return "\u2014"; + } + if (typeof value === "boolean") { + return value ? "true" : "false"; + } + if (typeof value === "number") { + return Number.isInteger(value) ? value.toLocaleString() : value.toPrecision(4); + } + return esc(value); + } + + function lengthLabel(minLen, maxLen) { + if (minLen === maxLen) { + return `${fmtValue(minLen)} bp`; + } + return `${fmtValue(minLen)}-${fmtValue(maxLen)} bp`; + } + + function bpRangeLabel(start, end) { + return `${fmtValue(start)}-${fmtValue(end)} bp`; + } + + function kvList(rows) { + return `
${rows + .map( + ([key, value, mono]) => + `
${esc( + key, + )}
${value}
`, + ) + .join("")}
`; + } + + function detailSection(title, body) { + if (!body) { + return ""; + } + return `
${esc( + title, + )}
${body}
`; + } + + function detailTable(title, rows, columns) { + if (!rows || !rows.length) { + return ""; + } + const keys = + columns || + Array.from( + rows.reduce((seen, row) => { + Object.keys(row).forEach((key) => seen.add(key)); + return seen; + }, new Set()), + ); + return detailSection( + title, + `
${keys + .map((key) => ``) + .join("")}${rows + .map( + (row) => + `${keys.map((key) => ``).join("")}`, + ) + .join("")}
${esc(key)}
${fmtValue(row[key])}
`, + ); + } + + function pathLabel(pathNames) { + return (pathNames || []).join(" / "); + } + + function regionTooltip(region) { + let html = `
${esc(region.name)}
`; + html += `
${esc(region.region_type)} \u00b7 ${esc( + region.sequence_type, + )} \u00b7 ${esc(lengthLabel(region.min_len, region.max_len))}
`; + html += `
${esc(bpRangeLabel(region.bp_start, region.bp_end))}
`; + if (!region.is_leaf) { + html += `
${esc(region.child_region_ids.length)} child regions
`; + } + if (region.sequence) { + const preview = + region.sequence.length <= 40 + ? region.sequence + : `${region.sequence.slice(0, 37)}\u2026`; + html += `
${esc(preview)}
`; + } + return html; + } + + function readTooltip(read) { + let html = `
${esc(read.label || read.read_id)}
`; + html += `
${esc(read.read_id)} \u00b7 ${esc(read.strand)} \u00b7 ${esc( + lengthLabel(read.min_len, read.max_len), + )}
`; + html += `
${esc(bpRangeLabel(read.start, read.end))} anchored at ${esc( + read.primer_id, + )}
`; + return html; + } + + function seqTypeColor(sequenceType) { + if (sequenceType === "onlist") { + return "var(--reg-onlist)"; + } + if (sequenceType === "random") { + return "var(--reg-random)"; + } + return "var(--reg-fixed)"; + } + + function seqTypeStroke(sequenceType) { + if (sequenceType === "onlist") { + return "var(--reg-onlist-stroke)"; + } + if (sequenceType === "random") { + return "var(--reg-random-stroke)"; + } + return "var(--reg-fixed-stroke)"; + } + + function regionIsSelected(region, currentSelection) { + if (!currentSelection || currentSelection.kind !== "region") { + return false; + } + return (region.path_region_ids || []).includes(currentSelection.id); + } + + function buildMolSvg(modality) { + const leafRegions = modality.regions || []; + const regionNodes = modality.region_nodes || []; + const groupRegions = regionNodes.filter((region) => !region.is_leaf); + const reads = modality.reads || []; + const totalBp = + modality.total_bp || leafRegions.reduce((sum, region) => sum + region.len, 0); + const currentSelection = selected[modality.modality] || null; + + const pad = { left: 10, right: 10 }; + const molW = 860; + const barY = 70; + const barH = 22; + const svgW = molW + pad.left + pad.right; + const bpScale = molW / Math.max(totalBp, 1); + const minPx = 18; + const groupGap = 12; + const groupHeight = 8; + const groupLevels = groupRegions.length + ? Math.max(...groupRegions.map((region) => region.depth)) + 1 + : 0; + const groupTrackTop = barY - groupLevels * groupGap - 4; + const rects = []; + + let rawWidths = leafRegions.map((region) => region.len * bpScale); + let deficit = 0; + rawWidths = rawWidths.map((px) => { + if (px < minPx) { + deficit += minPx - px; + return minPx; + } + return px; + }); + if (deficit > 0) { + const shrinkTotal = rawWidths + .filter((px) => px > minPx) + .reduce((sum, px) => sum + px, 0); + if (shrinkTotal > 0) { + rawWidths = rawWidths.map((px) => + px > minPx ? 
px - deficit * (px / shrinkTotal) : px, + ); + } + } + + let currentX = pad.left; + leafRegions.forEach((region, index) => { + const width = rawWidths[index]; + rects.push({ ...region, x: currentX, w: width }); + currentX += width; + }); + + function bpToX(bp) { + if (bp <= 0) { + return pad.left; + } + for (const region of rects) { + if (bp <= region.bp_end) { + const fraction = region.len === 0 ? 0 : (bp - region.bp_start) / region.len; + return region.x + fraction * region.w; + } + } + const last = rects[rects.length - 1]; + return last ? last.x + last.w : pad.left; + } + + let svg = ""; + svg += ``; + svg += ``; + + groupRegions.forEach((region) => { + const x = bpToX(region.bp_start); + const width = Math.max(bpToX(region.bp_end) - x, 1); + const y = groupTrackTop + region.depth * groupGap; + const selectedClass = regionIsSelected(region, currentSelection) ? " selected" : ""; + svg += ``; + if (width > 42) { + svg += `${esc(region.name)}`; + } + }); + + rects.forEach((region) => { + const selectedClass = regionIsSelected(region, currentSelection) ? " selected" : ""; + svg += ``; + }); + + const labelY = barY + barH + 10; + rects.forEach((region) => { + const centerX = region.x + region.w / 2; + if (region.w > 14) { + svg += `${esc( + region.name, + )}`; + } + }); + + svg += `0`; + rects.forEach((region) => { + svg += `${region.bp_end}`; + }); + + const readColors = ["#1e40af", "#059669", "#d97706", "#dc2626", "#7c3aed"]; + const posReads = reads.filter((read) => read.strand === "pos"); + const negReads = reads.filter((read) => read.strand === "neg"); + + function drawRead(read, yBase, above, colorIndex) { + const color = readColors[colorIndex % readColors.length]; + const x1 = bpToX(read.start); + const x2 = bpToX(read.end); + const arrowSize = 5; + const selectedClass = + currentSelection && currentSelection.kind === "read" && currentSelection.id === read.read_id + ? 
" selected" + : ""; + const groupAttrs = `class="read-group${selectedClass}" data-kind="read" data-modality="${esc( + modality.modality, + )}" data-id="${esc(read.read_id)}"`; + + if (above) { + const y = yBase; + svg += `${esc(read.label || read.read_id)}`; + } else { + const y = yBase; + svg += `${esc( + read.label || read.read_id, + )}`; + } + } + + let posY = groupTrackTop - 14; + posReads.forEach((read, index) => { + drawRead(read, posY, true, index); + posY -= 22; + }); + + let negY = barY + barH + 48; + negReads.forEach((read, index) => { + drawRead(read, negY, false, posReads.length + index); + negY += 22; + }); + + const svgH = Math.max(negY + 10, barY + barH + 50); + return `${svg}`; + } + + function selectorRow(config) { + const { + modalityName, + kind, + id, + label, + sub, + meta, + active, + depth = 0, + nodeType = "", + hasChildren = false, + } = config; + const padding = kind === "region" ? 10 + depth * 16 : 10; + const marker = kind === "region" ? (hasChildren ? "\u25a1" : "\u2022") : "\u2192"; + const typeClass = nodeType ? ` ${nodeType}` : ""; + + return ``; + } + + function selectorHtml(modality) { + const currentSelection = selected[modality.modality] || null; + const regionRows = (modality.region_nodes || []).map((region) => + selectorRow({ + modalityName: modality.modality, + kind: "region", + id: region.region_id, + label: region.name, + sub: `${region.region_type} \u00b7 ${region.sequence_type}`, + meta: region.is_leaf + ? `${bpRangeLabel(region.bp_start, region.bp_end)}` + : `${bpRangeLabel(region.bp_start, region.bp_end)} \u00b7 ${ + region.child_region_ids.length + } children`, + active: + currentSelection && + currentSelection.kind === "region" && + currentSelection.id === region.region_id, + depth: region.depth || 0, + nodeType: region.is_leaf ? 
"leaf" : "branch", + hasChildren: !region.is_leaf, + }), + ); + + const readRows = (modality.reads || []).map((read) => + selectorRow({ + modalityName: modality.modality, + kind: "read", + id: read.read_id, + label: read.label || read.read_id, + sub: `${read.strand} \u00b7 ${read.primer_id}`, + meta: bpRangeLabel(read.start, read.end), + active: + currentSelection && currentSelection.kind === "read" && currentSelection.id === read.read_id, + depth: 0, + }), + ); + + return `
regions
${ + regionRows.join("") || '
No regions.
' + }
reads
${ + readRows.join("") || '
No reads.
' + }
`; + } + + function detailShell(header, sections) { + return `
${header}
${sections.join( + "", + )}
`; + } + + function modalitySummary(modality) { + const rows = [ + ["modality", esc(modality.modality), true], + ["library region", esc(modality.library_region_id), true], + ["total length", esc(`${modality.total_bp} bp`), true], + ["region count", esc(modality.region_nodes.length), true], + ["read count", esc(modality.reads.length), true], + ]; + + const sections = [ + detailSection( + "summary", + `
Select a region or read to inspect its metadata.
${kvList( + rows, + )}`, + ), + detailTable("sequence protocols", modality.sequence_protocols, ["protocol_id", "name"]), + detailTable("sequence kits", modality.sequence_kits, ["kit_id", "name"]), + detailTable("library protocols", modality.library_protocols, ["protocol_id", "name"]), + detailTable("library kits", modality.library_kits, ["kit_id", "name"]), + ].filter(Boolean); + + return detailShell("modality", sections); + } + + function regionDetails(modality, region) { + const children = (modality.region_nodes || []).filter( + (node) => node.parent_region_id === region.region_id, + ); + const rows = [ + ["region id", esc(region.region_id), true], + ["name", esc(region.name), false], + ["path", esc(pathLabel(region.path_names)), true], + ["region type", esc(region.region_type), true], + ["sequence type", esc(region.sequence_type), true], + ["length", esc(lengthLabel(region.min_len, region.max_len)), true], + ["bp range", esc(bpRangeLabel(region.bp_start, region.bp_end)), true], + ["leaf", esc(region.is_leaf ? "true" : "false"), true], + ["children", esc(region.child_region_ids.length), true], + ]; + + const sections = [ + detailSection("metadata", kvList(rows)), + region.sequence + ? detailSection("sequence", `
${esc(region.sequence)}
`) + : "", + children.length + ? detailTable( + "child regions", + children.map((child) => ({ + region_id: child.region_id, + name: child.name, + region_type: child.region_type, + sequence_type: child.sequence_type, + bp_range: bpRangeLabel(child.bp_start, child.bp_end), + length: child.len, + })), + ["region_id", "name", "region_type", "sequence_type", "bp_range", "length"], + ) + : "", + region.onlist + ? detailTable("onlist", [region.onlist], [ + "file_id", + "filename", + "filetype", + "urltype", + "url", + "md5", + ]) + : "", + ].filter(Boolean); + + return detailShell(`region \u00b7 ${esc(region.name)}`, sections); + } + + function readDetails(read) { + const rows = [ + ["read id", esc(read.read_id), true], + ["name", esc(read.name), false], + ["strand", esc(read.strand), true], + ["primer id", esc(read.primer_id), true], + ["length", esc(lengthLabel(read.min_len, read.max_len)), true], + ["bp range", esc(bpRangeLabel(read.start, read.end)), true], + ]; + + const sections = [ + detailSection("metadata", kvList(rows)), + detailTable("files", read.files, ["file_id", "filename", "filetype", "urltype", "url", "md5"]), + ].filter(Boolean); + + return detailShell(`read \u00b7 ${esc(read.label || read.read_id)}`, sections); + } + + function selectionHtml(modality) { + const currentSelection = selected[modality.modality] || null; + if (!currentSelection) { + return modalitySummary(modality); + } + if (currentSelection.kind === "region") { + const region = (modality.region_nodes || []).find( + (node) => node.region_id === currentSelection.id, + ); + if (region) { + return regionDetails(modality, region); + } + } + if (currentSelection.kind === "read") { + const read = (modality.reads || []).find((node) => node.read_id === currentSelection.id); + if (read) { + return readDetails(read); + } + } + return modalitySummary(modality); + } + + function assaySummary() { + const rows = [ + ["assay id", esc(data.assay_id), true], + ["name", esc(data.assay_name), false], + 
["seqspec version", esc(data.seqspec_version || toolVersion), true], + ["date", esc(data.date || ""), false], + [ + "doi", + data.doi + ? `${esc(data.doi)}` + : "\u2014", + false, + ], + [ + "library structure", + data.lib_struct + ? `${esc(data.lib_struct)}` + : "\u2014", + false, + ], + [ + "modalities", + esc((data.modalities || []).map((modality) => modality.modality).join(", ")), + true, + ], + ]; + return `
Assay
${esc( + data.description || "", + )}
${kvList(rows)}
`; + } + + function render() { + const modalities = data.modalities || []; + let html = `
seqspec view
assay ${esc( + data.assay_name, + )}|id ${esc( + data.assay_id, + )}|version ${esc( + data.seqspec_version || toolVersion, + )}|repo ${esc(repositoryUrl)}
`; + html += assaySummary(); + + modalities.forEach((modality) => { + html += `
Library Structure \u2014 ${esc( + data.assay_name, + )} (${esc(modality.modality)})
${buildMolSvg( + modality, + )}
fixedonlistrandomnested region span
${selectorHtml( + modality, + )}
${selectionHtml(modality)}
`; + }); + + app.innerHTML = html; + bind(); + } + + function bind() { + document.querySelectorAll("[data-kind][data-modality][data-id]").forEach((element) => { + element.addEventListener("click", (event) => { + event.preventDefault(); + event.stopPropagation(); + const modality = element.getAttribute("data-modality"); + const kind = element.getAttribute("data-kind"); + const id = element.getAttribute("data-id"); + const existing = selected[modality]; + if (existing && existing.kind === kind && existing.id === id) { + delete selected[modality]; + } else { + selected[modality] = { kind, id }; + } + render(); + }); + }); + + document.querySelectorAll(".region-rect[data-id], .group-rect[data-id]").forEach((element) => { + const modality = element.getAttribute("data-modality"); + const regionId = element.getAttribute("data-id"); + const mod = (data.modalities || []).find((item) => item.modality === modality); + const region = mod && (mod.region_nodes || []).find((item) => item.region_id === regionId); + if (!region) { + return; + } + element.addEventListener("mouseenter", () => { + tip.innerHTML = regionTooltip(region); + tip.classList.add("show"); + }); + element.addEventListener("mousemove", (event) => { + tip.style.left = `${event.clientX + 12}px`; + tip.style.top = `${event.clientY - 10}px`; + }); + element.addEventListener("mouseleave", () => { + tip.classList.remove("show"); + }); + }); + + document.querySelectorAll(".read-group[data-id]").forEach((element) => { + const modality = element.getAttribute("data-modality"); + const readId = element.getAttribute("data-id"); + const mod = (data.modalities || []).find((item) => item.modality === modality); + const read = mod && (mod.reads || []).find((item) => item.read_id === readId); + if (!read) { + return; + } + element.addEventListener("mouseenter", () => { + tip.innerHTML = readTooltip(read); + tip.classList.add("show"); + }); + element.addEventListener("mousemove", (event) => { + tip.style.left = `${event.clientX 
+ 12}px`; + tip.style.top = `${event.clientY - 10}px`; + }); + element.addEventListener("mouseleave", () => { + tip.classList.remove("show"); + }); + }); + } + + render(); +})(); diff --git a/seqspec/report_assets/style.css b/seqspec/report_assets/style.css new file mode 100644 index 00000000..b3c9ed3c --- /dev/null +++ b/seqspec/report_assets/style.css @@ -0,0 +1,538 @@ +:root { + --bg: #ffffff; + --bg-alt: #f7f8f9; + --bg-hover: #eef0f2; + --border: #dcdfe3; + --border-light: #e9ecef; + --text: #1a1d21; + --text-2: #4a5058; + --text-3: #848a92; + --mono: "IBM Plex Mono", "Menlo", "Monaco", "Cascadia Mono", "Segoe UI Mono", monospace; + --sans: "IBM Plex Sans", -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; + --reg-fixed: #e2e5e9; + --reg-onlist: #bbf7d0; + --reg-random: #bfdbfe; + --reg-fixed-stroke: #b0b5bc; + --reg-onlist-stroke: #4ade80; + --reg-random-stroke: #60a5fa; + --read-pos: #1e40af; + --read-neg: #92400e; +} + +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +body { + font-family: var(--sans); + font-size: 13.5px; + line-height: 1.55; + color: var(--text); + background: var(--bg); + -webkit-font-smoothing: antialiased; +} + +.wrap { + max-width: 980px; + margin: 0 auto; + padding: 32px 24px 64px; +} + +.hdr { + margin-bottom: 18px; +} + +.hdr-title { + font-family: var(--mono); + font-size: 14px; + font-weight: 600; + margin-bottom: 6px; +} + +.hdr-row { + font-size: 12px; + color: var(--text-2); + line-height: 1.7; +} + +.hdr-row .l { + color: var(--text-3); + font-weight: 500; + margin-right: 2px; +} + +.hdr-row .sep { + margin: 0 8px; + color: var(--border); +} + +.inline-link { + color: var(--read-pos); + text-decoration: none; +} + +.inline-link:hover { + text-decoration: underline; +} + +.section, +.meta-section { + margin-bottom: 18px; + border: 1px solid var(--border-light); + border-radius: 4px; + overflow: hidden; +} + +.section-head, +.meta-section-head { + font-size: 11px; + font-weight: 600; + text-transform: 
uppercase; + letter-spacing: 0.06em; + color: var(--text-3); + padding: 8px 14px; + background: var(--bg-alt); + border-bottom: 1px solid var(--border-light); +} + +.section-body, +.meta-section-body { + padding: 12px 14px; +} + +.description { + font-size: 12.5px; + color: var(--text-2); + margin-bottom: 12px; +} + +.kv-list { + display: grid; + grid-template-columns: minmax(120px, 180px) minmax(0, 1fr); + row-gap: 6px; + column-gap: 12px; + align-items: start; +} + +.kv-row { + display: contents; +} + +.kv-key { + color: var(--text-3); + font-size: 11px; + text-transform: uppercase; + letter-spacing: 0.05em; + font-weight: 600; +} + +.kv-value { + color: var(--text-2); + min-width: 0; + word-break: break-word; +} + +.kv-value.mono, +.table-wrap td, +.selection-note, +.region-seq, +.selector-row, +.modality-meta, +.detail-shell-head { + font-family: var(--mono); +} + +.modality-section, +.mol-section { + margin-bottom: 18px; + border: 1px solid var(--border-light); + border-radius: 4px; + overflow: hidden; +} + +.modality-head, +.mol-section-head { + display: flex; + align-items: center; + justify-content: space-between; + gap: 12px; + padding: 8px 14px; + background: var(--bg-alt); + border-bottom: 1px solid var(--border-light); +} + +.modality-title { + font-size: 11px; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.06em; + color: var(--text-3); +} + +.modality-meta, +.mol-section-meta { + color: var(--text-3); + font-size: 11px; +} + +.mol-section-head { + display: block; +} + +.mol-section-meta { + padding: 8px 14px; + border-bottom: 1px solid var(--border-light); + background: var(--bg); +} + +.mol-body { + padding: 16px 14px 12px; + overflow-x: auto; + border-bottom: 1px solid var(--border-light); +} + +.mol-svg { + display: block; +} + +.mol-svg .region-rect, +.mol-svg .group-rect, +.mol-svg .read-group { + cursor: pointer; +} + +.mol-svg .region-rect, +.mol-svg .group-rect { + transition: opacity 0.12s, stroke-width 0.12s, 
fill-opacity 0.12s; +} + +.mol-svg .region-rect:hover, +.mol-svg .group-rect:hover, +.mol-svg .read-group:hover { + opacity: 0.8; +} + +.mol-svg .region-rect.selected { + stroke: var(--text); + stroke-width: 1.5; +} + +.mol-svg .group-rect { + fill: rgba(255, 255, 255, 0.8); + stroke: var(--text-3); + stroke-width: 0.85; +} + +.mol-svg .group-rect.selected { + stroke: var(--text); + stroke-width: 1.5; + fill: rgba(14, 165, 233, 0.08); +} + +.mol-svg .group-label { + font-family: var(--mono); + font-size: 8.5px; + fill: var(--text-3); +} + +.mol-svg .region-label { + font-family: var(--mono); + font-size: 9.5px; + fill: var(--text-2); + pointer-events: none; +} + +.mol-svg .read-line { + stroke-width: 1.5; + fill: none; +} + +.mol-svg .read-group.selected .read-line, +.mol-svg .read-group.selected polygon { + stroke-width: 2; + filter: drop-shadow(0 0 0.2rem rgba(0, 0, 0, 0.18)); +} + +.mol-svg .read-label { + font-family: var(--mono); + font-size: 9px; + fill: var(--text-2); +} + +.mol-svg .bp-label { + font-family: var(--mono); + font-size: 7.5px; + fill: var(--text-3); +} + +.mol-tip { + display: none; + position: fixed; + z-index: 100; + background: var(--text); + color: #f0f1f2; + font-size: 11.5px; + font-family: var(--mono); + padding: 8px 11px; + border-radius: 4px; + line-height: 1.6; + max-width: 340px; + pointer-events: none; + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.18); +} + +.mol-tip.show { + display: block; +} + +.mol-tip .tip-name { + font-weight: 600; + color: #fff; +} + +.mol-tip .tip-dim { + color: #9ca3af; +} + +.mol-legend { + display: flex; + flex-wrap: wrap; + gap: 14px; + padding: 8px 14px; + border-top: 1px solid var(--border-light); + font-size: 11px; + color: var(--text-3); +} + +.mol-legend .leg-swatch { + display: inline-block; + width: 10px; + height: 10px; + border-radius: 2px; + margin-right: 4px; + vertical-align: middle; +} + +.mol-legend .leg-swatch.outline { + background: transparent; + border: 1px solid var(--text-3); +} + 
+.detail-layout { + display: grid; + grid-template-columns: minmax(270px, 320px) minmax(0, 1fr); + gap: 14px; + padding: 12px 14px 14px; +} + +.selector-pane, +.detail-pane { + min-width: 0; +} + +.selector-group, +.detail-shell { + border: 1px solid var(--border-light); + border-radius: 4px; + overflow: hidden; + background: var(--bg); +} + +.selector-group + .selector-group { + margin-top: 10px; +} + +.selector-head, +.detail-shell-head { + font-size: 10px; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.06em; + color: var(--text-3); + padding: 8px 10px; + background: var(--bg-alt); + border-bottom: 1px solid var(--border-light); +} + +.selector-row { + display: grid; + grid-template-columns: minmax(0, 1fr) auto; + gap: 10px; + width: 100%; + padding: 7px 10px; + border: none; + border-bottom: 1px solid var(--border-light); + background: var(--bg); + color: var(--text-2); + text-align: left; + font-size: 11.5px; + cursor: pointer; +} + +.selector-row:last-child { + border-bottom: none; +} + +.selector-row:hover { + background: var(--bg-hover); +} + +.selector-row.active { + background: #eef4ff; + color: var(--text); +} + +.selector-row.branch .selector-label { + font-weight: 600; +} + +.selector-main { + min-width: 0; +} + +.selector-label-line { + display: flex; + align-items: baseline; + gap: 6px; + min-width: 0; +} + +.selector-marker { + width: 10px; + color: var(--text-3); + flex-shrink: 0; +} + +.selector-label { + min-width: 0; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.selector-sub { + display: block; + color: var(--text-3); + font-size: 10.5px; + margin-top: 1px; +} + +.selector-meta { + color: var(--text-3); + white-space: nowrap; + font-size: 10.5px; + text-align: right; +} + +.detail-shell-body { + padding: 0 12px; +} + +.detail-section { + padding: 10px 0 12px; + border-bottom: 1px solid var(--border-light); +} + +.detail-section:last-child { + border-bottom: none; +} + +.detail-title { + 
font-size: 10px; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.06em; + color: var(--text-3); + margin-bottom: 8px; +} + +.detail-content { + min-width: 0; +} + +.selection-note { + font-size: 11px; + color: var(--text-3); + margin-bottom: 10px; +} + +.region-seq { + white-space: pre-wrap; + word-break: break-word; + font-size: 11.5px; + line-height: 1.6; + color: var(--text-2); +} + +.table-wrap { + overflow-x: auto; +} + +.table-wrap table { + width: 100%; + border-collapse: collapse; + font-size: 11.5px; +} + +.table-wrap th { + text-align: left; + padding: 4px 8px; + font-weight: 600; + font-size: 10px; + text-transform: uppercase; + letter-spacing: 0.04em; + color: var(--text-3); + background: var(--bg); + border-bottom: 1px solid var(--border); + white-space: nowrap; +} + +.table-wrap td { + padding: 3px 8px; + border-bottom: 1px solid var(--border-light); + font-size: 11px; + word-break: break-word; + color: var(--text-2); +} + +.table-wrap tr:last-child td { + border-bottom: none; +} + +.empty-state { + padding: 10px; + color: var(--text-3); + font-size: 12px; +} + +@media (max-width: 900px) { + .wrap { + padding: 20px 14px 40px; + } + + .modality-head { + flex-direction: column; + align-items: flex-start; + } + + .detail-layout { + grid-template-columns: 1fr; + } +} + +@media (max-width: 640px) { + .kv-list { + grid-template-columns: 110px minmax(0, 1fr); + } + + .selector-row { + grid-template-columns: 1fr; + } + + .selector-meta { + text-align: left; + padding-left: 26px; + } +} diff --git a/seqspec/report_assets/template.html b/seqspec/report_assets/template.html new file mode 100644 index 00000000..e758f812 --- /dev/null +++ b/seqspec/report_assets/template.html @@ -0,0 +1,24 @@ + + + + + + seqspec view + + + +
+
+ + + + + + diff --git a/seqspec/seqspec_auth.py b/seqspec/seqspec_auth.py new file mode 100644 index 00000000..2f5b6b4d --- /dev/null +++ b/seqspec/seqspec_auth.py @@ -0,0 +1,74 @@ +"""Auth module for seqspec CLI.""" + +import json +from argparse import ArgumentParser, Namespace, RawTextHelpFormatter + +from seqspec.auth import AuthProfile, AuthRegistry, init_profile + + +def setup_auth_args(parser) -> ArgumentParser: + subparser = parser.add_parser( + "auth", + description=""" +Manage remote authentication profiles. + +Examples: +seqspec auth init --profile igvf --host api.data.igvf.org --host data.igvf.org --username-env IGVF_ACCESS_KEY_ID --password-env IGVF_ACCESS_KEY_SECRET +seqspec auth path +seqspec auth list +seqspec auth resolve https://api.data.igvf.org/reference-files/... +--- +""", + help="Manage remote authentication profiles", + formatter_class=RawTextHelpFormatter, + ) + subparsers = subparser.add_subparsers(dest="auth_command", metavar="") + + init_parser = subparsers.add_parser("init", help="Create or update an auth profile") + init_parser.add_argument("--profile", required=True) + init_parser.add_argument("--host", dest="hosts", action="append", required=True) + init_parser.add_argument("--kind", default="basic", choices=["basic"]) + init_parser.add_argument("--username-env", required=True) + init_parser.add_argument("--password-env", required=True) + + subparsers.add_parser("path", help="Show auth config path") + subparsers.add_parser("list", help="List auth profiles") + + resolve_parser = subparsers.add_parser("resolve", help="Resolve profile for a URL") + resolve_parser.add_argument("url") + resolve_parser.add_argument("--auth-profile", default=None) + + return subparser + + +def run_auth(parser: ArgumentParser, args: Namespace) -> None: + if args.auth_command == "init": + value = init_profile( + args.profile, + AuthProfile( + hosts=args.hosts, + kind=args.kind, + username_env=args.username_env, + password_env=args.password_env, + ), + ) + 
elif args.auth_command == "path": + registry = AuthRegistry.load() + location = registry.location + value = { + "kind": "auth_config_path", + "source": location["source"], + "path": str(location["path"]) if location["path"] else None, + "exists": location["exists"], + } + elif args.auth_command == "list": + registry = AuthRegistry.load() + value = registry.profile_summaries() + elif args.auth_command == "resolve": + registry = AuthRegistry.load() + value = registry.resolve_summary(args.url, args.auth_profile) + else: + parser.error("auth requires a subcommand") + return + + print(json.dumps(value, indent=2)) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index bd4ec30c..8ede28bb 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -3,7 +3,9 @@ This module provides functionality to validate seqspec files against the specification schema. """ +import os from argparse import ArgumentParser, Namespace, RawTextHelpFormatter +from itertools import combinations from os import path from pathlib import Path from typing import Dict, List, Optional @@ -12,7 +14,14 @@ from jsonschema import Draft4Validator from seqspec.Assay import Assay -from seqspec.utils import file_exists, load_spec +from seqspec.Region import itx_read, project_regions_to_coordinates +from seqspec.utils import ( + file_exists, + load_spec, + local_onlist_locator, + local_resource_url, + map_read_id_to_regions, +) def setup_check_args(parser): @@ -44,7 +53,14 @@ def setup_check_args(parser): help="Skip checks", type=str, default=None, - choices=["igvf", "igvf_onlist_skip"], + choices=["igvf", "igvf_onlist_skip", "structural"], + ) + subparser.add_argument( + "--auth-profile", + metavar="PROFILE", + help="Authentication profile for remote resource checks", + type=str, + default=os.environ.get("SEQSPEC_AUTH_PROFILE"), ) subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) @@ -61,21 +77,49 @@ def validate_check_args(parser: ArgumentParser, 
args: Namespace) -> None: parser.error(f"Output path exists but is not a file: {args.output}") +def make_diagnostic( + severity: str, error_type: str, error_message: str, error_object: str +): + return { + "severity": severity, + "error_type": error_type, + "error_message": error_message, + "error_object": error_object, + } + + +def normalize_diagnostics(diagnostics: List[Dict]) -> List[Dict]: + return [ + make_diagnostic( + severity=diagnostic.get("severity", "error"), + error_type=diagnostic["error_type"], + error_message=diagnostic["error_message"], + error_object=diagnostic["error_object"], + ) + for diagnostic in diagnostics + ] + + def format_error(errobj, idx=0): - return f"[error {idx}] {errobj['error_message']}" + severity = errobj.get("severity", "error") + return f"[{severity} {idx}] {errobj['error_message']}" -def seqspec_check(spec: Assay, filter_type: Optional[str] = None) -> List[Dict]: - """Core functionality to check a seqspec and return filtered errors. +def seqspec_check( + spec: Assay, + filter_type: Optional[str] = None, + auth_profile: Optional[str] = None, +) -> List[Dict]: + """Core functionality to check a seqspec and return filtered diagnostics. Args: spec: The Assay object to check - filter_type: Optional filter type to apply to errors (e.g. "igvf", "igvf_onlist_skip") + filter_type: Optional filter type to apply to diagnostics (e.g. 
"igvf", "igvf_onlist_skip") Returns: - List of error dictionaries + List of diagnostic dictionaries """ - errors = check(spec) + errors = check(spec, auth_profile=auth_profile) if filter_type: errors = filter_errors(errors, filter_type) @@ -87,7 +131,7 @@ def run_check(parser: ArgumentParser, args: Namespace): validate_check_args(parser, args) spec = load_spec(args.yaml, strict=False) - errors = seqspec_check(spec, args.skip) + errors = seqspec_check(spec, args.skip, args.auth_profile) if args.output: with open(args.output, "w") as f: @@ -110,9 +154,33 @@ def run_check(parser: ArgumentParser, args: Namespace): IGVF_ONLIST_SKIP_FILTERS = IGVF_FILTERS + [ {"error_type": "check_onlist_files_exist", "error_object": "onlist"} ] +STRUCTURAL_CHECK_TYPES = [ + "check_unique_modalities", + "check_region_ids_modalities", + "check_unique_read_ids", + "check_unique_read_primer_strand_pairs", + "check_unique_region_ids", + "check_read_modalities", + "check_primer_ids_in_region_ids", + "check_sequence_types", + "check_region_lengths", + "check_sequence_lengths", + "check_read_file_count", + "check_region_against_subregion_length", + "check_region_against_subregion_sequence", + "check_read_length_against_library", + "check_overlapping_read_regions", +] def filter_errors(errors, filter_type): + if filter_type == "structural": + return [ + error + for error in errors + if error["error_type"] not in STRUCTURAL_CHECK_TYPES + ] + filters = None if filter_type == "igvf": filters = IGVF_FILTERS @@ -137,14 +205,14 @@ def filter_errors(errors, filter_type): return errors -def check(spec: Assay): +def check(spec: Assay, auth_profile: Optional[str] = None): # Variety of checks against schema def check_schema(spec: Assay, errors=[], idx=0): schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") with open(schema_fn, "r") as stream: schema = yaml.load(stream, Loader=yaml.Loader) validator = Draft4Validator(schema) - for idx, error in 
enumerate(validator.iter_errors(spec.to_dict()), 1): + for idx, error in enumerate(validator.iter_errors(spec.model_dump()), 1): err_elements = [repr(index) for index in error.path] err_path = f"spec[{']['.join(err_elements)}]" errobj = { @@ -202,8 +270,19 @@ def check_onlist_files_exist(spec, errors, idx): for ol in olrgns: if ol.urltype == "local": - if ol.filename[:-3] == ".gz": - check = ol.url + try: + locator = local_onlist_locator(ol) + except ValueError as err: + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": str(err), + "error_object": "onlist", + } + errors.append(errobj) + idx += 1 + continue + if locator.endswith(".gz"): + check = locator if spec_base and not Path(check).is_absolute(): check = str((spec_base / check).resolve()) if not path.exists(check): @@ -215,8 +294,8 @@ def check_onlist_files_exist(spec, errors, idx): errors.append(errobj) idx += 1 else: - check = ol.url - check_gz = ol.url + ".gz" + check = locator + check_gz = locator + ".gz" if spec_base: if not Path(check).is_absolute(): check = str((spec_base / check).resolve()) @@ -233,17 +312,16 @@ def check_onlist_files_exist(spec, errors, idx): elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp": # ping the link with a simple http request to check if the file exists at that URI if spec.seqspec_version == "0.3.0": - if not file_exists(ol.url): + if not file_exists(ol.url, auth_profile): errobj = { "error_type": "check_onlist_files_exist", "error_message": f"{ol.filename} does not exist", "error_object": "onlist", } - errors.append(errobj) idx += 1 else: - if not file_exists(ol.url): + if not file_exists(ol.url, auth_profile): errobj = { "error_type": "check_onlist_files_exist", "error_message": f"{ol.filename} does not exist", @@ -283,7 +361,17 @@ def check_read_files_exist(spec, errors, idx): for read in spec.sequence_spec: for f in read.files: if f.urltype == "local": - check = f.url + try: + check = local_resource_url(f.url, f.filename, 
"file") + except ValueError as err: + errobj = { + "error_type": "check_read_files_exist", + "error_message": str(err), + "error_object": "file", + } + errors.append(errobj) + idx += 1 + continue if spec_base and not Path(check).is_absolute(): check = str((spec_base / check).resolve()) if not path.exists(check): @@ -296,7 +384,7 @@ def check_read_files_exist(spec, errors, idx): idx += 1 elif f.urltype == "http" or f.urltype == "https" or f.urltype == "ftp": # ping the link with a simple http request to check if the file exists at that URI - if not file_exists(f.url): + if not file_exists(f.url, auth_profile): errobj = { "error_type": "check_read_files_exist", "error_message": f"{f.filename} does not exist", @@ -446,6 +534,62 @@ def check_read_length_against_library(spec: Assay, errors, idx): return (errors, idx) + def check_overlapping_read_regions(spec: Assay, errors, idx): + for modality in spec.modalities: + reads = spec.get_seqspec(modality) + projected_reads = [] + + for read in reads: + try: + mapped_read, regions = map_read_id_to_regions( + spec, modality, read.read_id + ) + except IndexError: + continue + + region_coordinates = project_regions_to_coordinates(regions) + projected_reads.append( + (mapped_read, itx_read(region_coordinates, 0, mapped_read.max_len)) + ) + + for (left_read, left_regions), (right_read, right_regions) in combinations( + projected_reads, 2 + ): + right_region_ids = {region.region_id for region in right_regions} + shared_region_ids = [] + seen_region_ids = set() + + for region in left_regions: + if ( + region.region_id in right_region_ids + and region.region_id not in seen_region_ids + ): + shared_region_ids.append(region.region_id) + seen_region_ids.add(region.region_id) + + if not shared_region_ids: + continue + + region_list = ", ".join( + f"'{region_id}'" for region_id in shared_region_ids + ) + errors.append( + make_diagnostic( + severity="warning", + error_type="check_overlapping_read_regions", + error_message=( + f"reads 
'{left_read.read_id}' and '{right_read.read_id}' in modality " + f"'{modality}' both cover region(s) {region_list}. Downstream tools " + "may require explicit overlap handling such as " + "`seqspec index --no-overlap`" + ), + error_object="read", + ) + ) + idx += 1 + + return (errors, idx) + def check_sequence_types(spec, errors, idx): modes = spec.modalities @@ -651,9 +795,10 @@ def check_sub_sequences(rgn, errors, idx): "check_region_against_subregion_length": check_region_against_subregion_length, "check_region_against_subregion_sequence": check_region_against_subregion_sequence, "check_read_length_against_library": check_read_length_against_library, + "check_overlapping_read_regions": check_overlapping_read_regions, } for k, v in checks.items(): # print(k) errors, idx = v(spec, errors, idx) - return errors + return normalize_diagnostics(errors) diff --git a/seqspec/seqspec_convert.py b/seqspec/seqspec_convert.py index 6eee1250..663413fc 100644 --- a/seqspec/seqspec_convert.py +++ b/seqspec/seqspec_convert.py @@ -179,7 +179,7 @@ def seqspec_to_token(spec): specs_regions = {} modalities = spec.list_modalities() for modality in modalities: - regions = [i.to_dict() for i in spec.get_libspec(modality).get_leaves()] + regions = [i.model_dump() for i in spec.get_libspec(modality).get_leaves()] specs_regions[modality] = regions # Convert to tokenized matrix diff --git a/seqspec/seqspec_file.py b/seqspec/seqspec_file.py index 88ca1bf0..d13224d9 100644 --- a/seqspec/seqspec_file.py +++ b/seqspec/seqspec_file.py @@ -270,7 +270,7 @@ def format_json_files( for items in zip(*files.values()): if k == "all": for key, item in zip(files.keys(), items): - d = item.to_dict() + d = item.model_dump() if item.urltype == "local" and fp: d["url"] = str(spec_fn.parent / d["url"]) x.append(d) diff --git a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py index dabe0373..ff596b59 100644 --- a/seqspec/seqspec_index.py +++ b/seqspec/seqspec_index.py @@ -353,9 +353,11 @@ def 
get_coordinate_by_read_id(spec: Assay, modality: str, read_id: str) -> Coord return coord + FEATURE_REGION_TYPES = {"CDNA", "GDNA", "PROTEIN", "TAG", "SGRNA_TARGET"} -def format_kallisto_bus(indices: List[Coordinate], subregion_type=None): + +def format_kallisto_bus(indices: List[Coordinate], subregion_type=None) -> str: bcs = [] umi = [] feature = [] @@ -376,7 +378,9 @@ def format_kallisto_bus(indices: List[Coordinate], subregion_type=None): return x -def format_kallisto_bus_force_single(indices: List[Coordinate], subregion_type=None): +def format_kallisto_bus_force_single( + indices: List[Coordinate], subregion_type=None +) -> str: bcs = [] umi = [] feature = [] @@ -408,7 +412,7 @@ def format_kallisto_bus_force_single(indices: List[Coordinate], subregion_type=N # this one should only return one string # TODO: return to this -def format_seqkit_subseq(indices: List[Coordinate], subregion_type=None): +def format_seqkit_subseq(indices: List[Coordinate], subregion_type=None) -> str: # The x string format is start:stop (1-indexed) # x = "" # region = indices[0] @@ -422,7 +426,7 @@ def format_seqkit_subseq(indices: List[Coordinate], subregion_type=None): return x -def format_tab(indices: List[Coordinate], subregion_type=None): +def format_tab(indices: List[Coordinate], subregion_type=None) -> str: x = "" for idx, coord in enumerate(indices): rcv = coord.rcv @@ -433,14 +437,16 @@ def format_tab(indices: List[Coordinate], subregion_type=None): return x[:-1] -def format_starsolo(indices: List[Coordinate], subregion_type=None): +def format_starsolo(indices: List[Coordinate], subregion_type=None) -> str: bcs = [] umi = [] cdna = [] for idx, coord in enumerate(indices): for cut in coord.rcv: if cut.region_type.upper() == "BARCODE": - bcs.append(f"--soloCBstart {cut.start + 1} --soloCBlen {cut.stop}") + bcs.append( + f"--soloCBstart {cut.start + 1} --soloCBlen {cut.stop - cut.start}" + ) elif cut.region_type.upper() == "UMI": umi.append( f"--soloUMIstart {cut.start + 1} 
--soloUMIlen {cut.stop - cut.start}" @@ -451,7 +457,7 @@ def format_starsolo(indices: List[Coordinate], subregion_type=None): return x -def format_simpleaf(indices: List[Coordinate], subregion_type=None): +def format_simpleaf(indices: List[Coordinate], subregion_type=None) -> str: x = "" xl = [] for idx, coord in enumerate(indices): @@ -469,7 +475,7 @@ def format_simpleaf(indices: List[Coordinate], subregion_type=None): return "".join(xl) -def format_zumis(indices: List[Coordinate], subregion_type=None): +def format_zumis(indices: List[Coordinate], subregion_type=None) -> str: xl = [] for idx, coord in enumerate(indices): x = "" @@ -486,7 +492,7 @@ def format_zumis(indices: List[Coordinate], subregion_type=None): def stable_deduplicate_fqs(fqs): - # stably deduplicate gdna_fqs + # stably deduplicate fqs seen_fqs = set() deduplicated_fqs = [] for r in fqs: @@ -496,7 +502,7 @@ def stable_deduplicate_fqs(fqs): return deduplicated_fqs -def format_chromap(indices: List[Coordinate], subregion_type=None): +def format_chromap(indices: List[Coordinate], subregion_type=None) -> str: bc_fqs = [] bc_str = [] gdna_fqs = [] @@ -563,7 +569,7 @@ def filter_groupby_region_type(g, keep=["umi", "barcode", "cdna"]): return g -def format_relative(indices: List[Coordinate], subregion_type=None): +def format_relative(indices: List[Coordinate], subregion_type=None) -> str: x = "" for idx, coord in enumerate(indices): rg_strand = coord.strand # noqa @@ -622,9 +628,6 @@ def groupby_region_type(rgns): return d -# def group_regions_by_region_type(rgns): - - def format_splitcode_row(obj, rgncdiffs, idx=0, rev=False, complement=False): # print(obj.region_id, idx) # TODO only have one object left and one object right of the sequence @@ -689,7 +692,7 @@ def format_splitcode_row(obj, rgncdiffs, idx=0, rev=False, complement=False): return {"region_type": obj.region_type, "fmt": e} -def format_splitcode(indices: List[Coordinate], subregion_type=None): +def format_splitcode(indices: List[Coordinate], 
subregion_type=None) -> str: # extraction based on fixed sequences # extraction based on onlist sequences # umi - bc3 - link2 - bc2 - link1 - bc1 - read diff --git a/seqspec/seqspec_info.py b/seqspec/seqspec_info.py index e3594ec6..aa9bc6cb 100644 --- a/seqspec/seqspec_info.py +++ b/seqspec/seqspec_info.py @@ -152,7 +152,7 @@ def seqspec_info_meta(spec: Assay) -> Dict: Returns: Dictionary containing meta information """ - sd = spec.to_dict() + sd = spec.model_dump() del sd["library_spec"] del sd["sequence_spec"] del sd["modalities"] @@ -173,7 +173,8 @@ def seqspec_info_library_spec(spec: Assay) -> Dict: for m in modalities: libspec = spec.get_libspec(m) leaves = libspec.get_leaves() - result[m] = leaves if leaves else [] + r = leaves if leaves else [] + result[m] = [i.model_dump() for i in r] return {"library_spec": result} @@ -254,9 +255,7 @@ def format_sequence_spec(info: Dict, fmt: str = "tab") -> str: ) return "\n".join(lines) elif fmt == "json": - return json.dumps( - [i.model_dump() for i in info["sequence_spec"]], sort_keys=False, indent=4 - ) + return json.dumps(info["sequence_spec"], sort_keys=False, indent=4) return "" @@ -280,9 +279,5 @@ def format_library_spec(info: Dict, fmt: str = "tab") -> str: ) return "\n".join(lines) elif fmt == "json": - return json.dumps( - {m: [i.model_dump() for i in r] for m, r in info["library_spec"].items()}, - sort_keys=False, - indent=4, - ) + return json.dumps(info["library_spec"], sort_keys=False, indent=4) return "" diff --git a/seqspec/seqspec_insert.py b/seqspec/seqspec_insert.py index cbf893a6..4db1d5ea 100644 --- a/seqspec/seqspec_insert.py +++ b/seqspec/seqspec_insert.py @@ -11,11 +11,14 @@ from pathlib import Path from typing import List, Optional +import yaml + from seqspec.Assay import Assay from seqspec.Read import ReadInput from seqspec.Region import RegionInput from seqspec.utils import ( load_reads, + load_regions, load_spec, write_pydantic_to_file_or_stdout, ) @@ -113,6 +116,20 @@ def 
validate_insert_args(args: Namespace): raise FileNotFoundError(f"Spec file not found: {args.yaml}") +def load_resource_payload(resource: str): + """Load inline JSON/YAML or a JSON/YAML file into a Python object.""" + resource_path = Path(resource) + if resource_path.exists(): + content = resource_path.read_text() + else: + content = resource + + try: + return json.loads(content) + except json.JSONDecodeError: + return yaml.safe_load(content) + + def run_insert(_: ArgumentParser, args: Namespace) -> None: """Execute the ``insert`` command. @@ -133,14 +150,13 @@ def run_insert(_: ArgumentParser, args: Namespace) -> None: validate_insert_args(args) spec: Assay = load_spec(args.yaml) - # TODO validate the resource you are loading against the object, i guess this does it already - resource_data = json.loads(args.resource) - if args.selector == "reads": + resource_data = load_resource_payload(args.resource) + if args.selector == "read": resource_data = load_reads(resource_data) spec = seqspec_insert_reads(spec, args.modality, resource_data, args.after) else: - resource_data = load_reads(resource_data) - spec = seqspec_insert_reads(spec, args.modality, resource_data, args.after) + resource_data = load_regions(resource_data) + spec = seqspec_insert_regions(spec, args.modality, resource_data, args.after) spec.update_spec() write_pydantic_to_file_or_stdout(spec, args.output) diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index be85ff40..0b6f3c0e 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -14,6 +14,7 @@ from seqspec.seqspec_find import find_by_region_id, find_by_region_type from seqspec.utils import ( load_spec, + local_onlist_locator, map_read_id_to_regions, read_local_list, read_remote_list, @@ -87,6 +88,13 @@ def setup_onlist_args(parser) -> ArgumentParser: default=None, required=True, ) + subparser.add_argument( + "--auth-profile", + metavar="PROFILE", + help="Authentication profile for remote onlists", + type=str, + 
default=None, + ) return subparser @@ -118,11 +126,22 @@ def run_onlist(parser: ArgumentParser, args: Namespace) -> None: if args.format: # Join operation - requires download and output path save_path = args.output or Path(args.yaml).resolve().parent - result_path = join_onlists_and_save(onlists, args.format, save_path, base_path) + result_path = join_onlists_and_save( + onlists, + args.format, + save_path, + base_path, + auth_profile=args.auth_profile, + ) print(result_path) elif args.output: # Download operation - download remote files to output location - result_paths = download_onlists_to_path(onlists, args.output, base_path) + result_paths = download_onlists_to_path( + onlists, + args.output, + base_path, + auth_profile=args.auth_profile, + ) for path_info in result_paths: print(f"{path_info['url']}") else: @@ -137,6 +156,7 @@ def get_onlists(spec: Assay, modality: str, selector: str, id: str) -> List[Onli if selector == "region-type": # Prefer ordering by read orientation when possible to ensure # consistency with the `read` selector behavior. + matches_by_read: List[tuple[str, List[Onlist]]] = [] reads: List[Read] = spec.get_seqspec(modality) for rd in reads: try: @@ -150,7 +170,15 @@ def get_onlists(spec: Assay, modality: str, selector: str, id: str) -> List[Onli if ol: ordered_onlists.append(ol) if ordered_onlists: - return ordered_onlists + matches_by_read.append((rd.read_id, ordered_onlists)) + + if len(matches_by_read) == 1: + return matches_by_read[0][1] + if len(matches_by_read) > 1: + read_ids = ", ".join(read_id for read_id, _ in matches_by_read) + raise ValueError( + f"region-type '{id}' matches regions in multiple reads for modality '{modality}': {read_ids}. Use -s read or -s region to disambiguate." 
+ ) # Fallback: original region-type traversal order regions = find_by_region_type(spec, modality, id) @@ -195,7 +223,7 @@ def get_onlist_urls(onlists: List[Onlist], base_path: Path) -> List[Dict[str, st urls = [] for onlist in onlists: if onlist.urltype == "local": - url = str(base_path / Path(onlist.url)) + url = str(base_path / Path(local_onlist_locator(onlist))) else: url = onlist.url urls.append({"file_id": onlist.file_id, "url": url}) @@ -203,7 +231,10 @@ def get_onlist_urls(onlists: List[Onlist], base_path: Path) -> List[Dict[str, st def download_onlists_to_path( - onlists: List[Onlist], output_path: Path, base_path: Path + onlists: List[Onlist], + output_path: Path, + base_path: Path, + auth_profile: str | None = None, ) -> List[Dict[str, str]]: """Download remote onlists and return local paths.""" downloaded_paths = [] @@ -211,11 +242,11 @@ def download_onlists_to_path( for onlist in onlists: if onlist.urltype == "local": # Local file - just return the path - local_path = base_path / Path(onlist.url) + local_path = base_path / Path(local_onlist_locator(onlist)) downloaded_paths.append({"file_id": onlist.file_id, "url": str(local_path)}) else: # Remote file - download it - onlist_elements = read_remote_list(onlist) + onlist_elements = read_remote_list(onlist, auth_profile=auth_profile) # Create unique filename for this onlist filename = f"{onlist.file_id}_{output_path.name}" download_path = output_path.parent / filename @@ -228,7 +259,11 @@ def download_onlists_to_path( def join_onlists_and_save( - onlists: List[Onlist], format_type: str, output_path: Path, base_path: Path + onlists: List[Onlist], + format_type: str, + output_path: Path, + base_path: Path, + auth_profile: str | None = None, ) -> str: """Download onlists, join them, and save to output path.""" # Download all onlists first @@ -237,7 +272,7 @@ def join_onlists_and_save( if onlist.urltype == "local": content = read_local_list(onlist, str(base_path)) else: - content = read_remote_list(onlist) + 
content = read_remote_list(onlist, auth_profile=auth_profile) onlist_contents.append(content) # Join the onlists diff --git a/seqspec/seqspec_print_html.py b/seqspec/seqspec_print_html.py index d925aa97..cb460878 100644 --- a/seqspec/seqspec_print_html.py +++ b/seqspec/seqspec_print_html.py @@ -1,250 +1,275 @@ -"""Print HTML module for seqspec. +"""Render seqspec HTML views. -This module provides functionality to generate HTML representations of seqspec files. -It is used by the print command with the 'seqspec-html' format option. +This module renders a single self-contained HTML page that shows the library +geometry for each modality and lets the user inspect metadata by selecting +regions and reads in the diagram. """ -from typing import List, Optional +import json +from importlib.resources import files +from typing import Any from seqspec.Assay import Assay -from seqspec.Read import File, Read -from seqspec.Region import Onlist, Region, complement_sequence -from seqspec.seqspec_print_utils import libseq +from seqspec.Region import Region, project_regions_to_coordinates + +REPOSITORY_URL = "https://github.com/pachterlab/seqspec" def print_seqspec_html(spec: Assay) -> str: - """Generate HTML representation of seqspec.""" - return htmlTemplate(spec) - - -def headerTemplate(name: str, doi: str, description: str, modalities: List[str]) -> str: - s = f"""

{name}

-
    -
  • - {doi} -
  • -
  • - {description} -
  • -
  • {", ".join(modalities)}
  • -
- """ - return s - - -def colorSeq(regions: List[Region]) -> str: - return "".join( - [f"<{r.region_type}>{r.sequence}" for r in regions] - ) + """Render a self-contained HTML view for a seqspec assay.""" + payload = build_seqspec_view_data(spec) + return render_seqspec_html(payload, spec.seqspec_version or "") -def atomicRegionTemplate( - region: Region, - name: str, - region_type: str, - sequence_type: str, - sequence: str, - min_len: int, - max_len: int, - onlist: Optional[Onlist], - regions: Optional[List[Region]], -) -> str: - seq = ( - colorSeq(region.get_leaves()) - if regions - else f"<{region_type}>{sequence}" +def build_seqspec_view_data(spec: Assay) -> dict[str, Any]: + """Build the JSON payload consumed by the seqspec HTML viewer.""" + return { + "assay_id": spec.assay_id, + "assay_name": spec.name, + "seqspec_version": spec.seqspec_version, + "doi": spec.doi, + "date": spec.date, + "description": spec.description, + "lib_struct": spec.lib_struct, + "modalities": [ + build_modality_view(spec, modality) for modality in spec.modalities + ], + } + + +def build_modality_view(spec: Assay, modality: str) -> dict[str, Any]: + """Build the HTML view payload for one modality.""" + libspec = spec.get_libspec(modality) + if libspec is None: + raise ValueError(f"modality '{modality}' not found in library_spec") + + region_nodes, regions, total_bp = region_views(libspec) + reads = [] + for read in spec.get_seqspec(modality): + projected = project_read(libspec, read) + if projected is not None: + reads.append(projected) + + return { + "modality": modality, + "library_region_id": libspec.region_id, + "total_bp": total_bp, + "sequence_protocols": protocol_rows( + spec.sequence_protocol, modality, "protocol_id" + ), + "sequence_kits": protocol_rows(spec.sequence_kit, modality, "kit_id"), + "library_protocols": protocol_rows( + spec.library_protocol, modality, "protocol_id" + ), + "library_kits": protocol_rows(spec.library_kit, modality, "kit_id"), + "region_nodes": 
region_nodes, + "regions": regions, + "reads": reads, + } + + +def region_views( + libspec: Region, +) -> tuple[list[dict[str, Any]], list[dict[str, Any]], int]: + """Build both the full region tree and the flattened leaf regions.""" + + region_nodes, leaf_regions, total_bp = walk_regions( + libspec.regions, depth=0, bp_start=0 ) + return region_nodes, leaf_regions, total_bp + + +def walk_regions( + regions: list[Region], + depth: int, + bp_start: int, + parent_region_id: str | None = None, + path_region_ids: list[str] | None = None, + path_names: list[str] | None = None, +) -> tuple[list[dict[str, Any]], list[dict[str, Any]], int]: + """Walk a region tree and return region nodes, leaf regions, and the next bp offset.""" + region_nodes: list[dict[str, Any]] = [] + leaf_regions: list[dict[str, Any]] = [] + current_bp = bp_start - ol = f"{onlist.filename} (md5: {onlist.md5})" if onlist else None - lst = [] - if regions: - for idx, r in enumerate(regions): - s = atomicRegionTemplate( - r, - r.region_id, - r.region_type, - r.sequence_type, - r.sequence, - r.min_len, - r.max_len, - r.onlist, - r.regions, + for region in regions: + region_path_ids = [*(path_region_ids or []), region.region_id] + region_path_names = [*(path_names or []), region.name] + start = current_bp + + if region.regions: + child_nodes, child_leaves, current_bp = walk_regions( + region.regions, + depth=depth + 1, + bp_start=current_bp, + parent_region_id=region.region_id, + path_region_ids=region_path_ids, + path_names=region_path_names, ) - lst.append(s) - subseq = "
  • ".join(lst) - subseq = f"
    1. {subseq}
    " - else: - subseq = "" - - s = f"""
    - {name} -
      -
    • region_type: {region_type}
    • -
    • sequence_type: {sequence_type}
    • -
    • - sequence: -
      -{seq}
      -
    • -
    • min_len: {min_len}
    • -
    • max_len: {max_len}
    • -
    • onlist: {ol}
    • -
    • regions: {subseq} -
    • -
    -
    - """ - return s - - -def regionsTemplate(regions: List[Region]) -> str: - templates = [ - atomicRegionTemplate( - r, - r.region_id, - r.region_type, - r.sequence_type, - r.sequence, - r.min_len, - r.max_len, - r.onlist, - r.regions, + end = current_bp + region_nodes.append( + region_node( + region=region, + depth=depth, + parent_region_id=parent_region_id, + path_region_ids=region_path_ids, + path_names=region_path_names, + start=start, + end=end, + ) + ) + region_nodes.extend(child_nodes) + leaf_regions.extend(child_leaves) + else: + end = current_bp + region.max_len + node = region_node( + region=region, + depth=depth, + parent_region_id=parent_region_id, + path_region_ids=region_path_ids, + path_names=region_path_names, + start=start, + end=end, + ) + current_bp = end + region_nodes.append(node) + leaf_regions.append(node) + + return region_nodes, leaf_regions, current_bp + + +def region_node( + region: Region, + depth: int, + parent_region_id: str | None, + path_region_ids: list[str], + path_names: list[str], + start: int, + end: int, +) -> dict[str, Any]: + """Build one serialized region node.""" + return { + "region_id": region.region_id, + "region_type": str(region.region_type), + "name": region.name, + "sequence_type": str(region.sequence_type), + "sequence": region.sequence, + "min_len": region.min_len, + "max_len": region.max_len, + "len": end - start, + "bp_start": start, + "bp_end": end, + "depth": depth, + "parent_region_id": parent_region_id, + "path_region_ids": path_region_ids, + "path_names": path_names, + "is_leaf": len(region.regions) == 0, + "child_region_ids": [child.region_id for child in region.regions], + "onlist": onlist_row(region.onlist), + } + + +def project_read(libspec: Region, read) -> dict[str, Any] | None: + """Project one read onto the library coordinate system.""" + leaves = libspec.get_leaves_with_region_id(read.primer_id) + try: + primer_index = next( + index + for index, leaf in enumerate(leaves) + if leaf.region_id == 
read.primer_id ) - for idx, r in enumerate(regions) - ] - s = f"""
    1. - {"
    2. ".join(templates)} -
    """ - return s + except StopIteration: + return None + cuts = project_regions_to_coordinates(leaves) + primer = cuts[primer_index] + + if read.strand == "pos": + start = primer.stop + end = start + read.max_len + else: + end = primer.start + start = end - read.max_len + return { + "read_id": read.read_id, + "name": read.name, + "label": read.name, + "primer_id": read.primer_id, + "min_len": read.min_len, + "max_len": read.max_len, + "strand": read.strand, + "start": start, + "end": end, + "files": [file_row(file) for file in read.files], + } -def libStructTemplate(spec: Assay, modality: str) -> str: - libspec = spec.get_libspec(modality) - seqspec = spec.get_seqspec(modality) # noqa - p, n = libseq(spec, modality) - - cseq = colorSeq(libspec.get_leaves()) - seq = "\n".join( - [ - "\n".join(p), - cseq, - complement_sequence(libspec.sequence), - "\n".join(n), - ] + +def protocol_rows(entries: Any, modality: str, id_key: str) -> list[dict[str, Any]]: + """Collect protocol or kit metadata rows for one modality.""" + if entries is None: + return [] + rows = [] + if isinstance(entries, str): + return [{id_key: entries, "name": entries}] + for entry in entries: + if getattr(entry, "modality", None) != modality: + continue + row = {id_key: getattr(entry, id_key, ""), "name": getattr(entry, "name", None)} + rows.append(row) + return rows + + +def onlist_row(onlist) -> dict[str, Any] | None: + """Convert an onlist object into a JSON row.""" + if onlist is None: + return None + return { + "file_id": onlist.file_id, + "filename": onlist.filename, + "filetype": onlist.filetype, + "filesize": onlist.filesize, + "url": onlist.url, + "urltype": onlist.urltype, + "md5": onlist.md5, + } + + +def file_row(file) -> dict[str, Any]: + """Convert a read file into a JSON row.""" + return { + "file_id": file.file_id, + "filename": file.filename, + "filetype": file.filetype, + "filesize": file.filesize, + "url": file.url, + "urltype": file.urltype, + "md5": file.md5, + } + + +def 
render_seqspec_html(payload: dict[str, Any], tool_version: str) -> str: + """Render the final HTML page.""" + template = asset_text("template.html") + style = asset_text("style.css") + app = asset_text("app.js") + report_json = escape_script_json(json.dumps(payload, ensure_ascii=False)) + repository_json = json.dumps(REPOSITORY_URL) + version_json = json.dumps(tool_version) + + return ( + template.replace("__STYLE__", style) + .replace("__APP__", app) + .replace("__DATA__", report_json) + .replace("__REPOSITORY__", repository_json) + .replace("__TOOL_VERSION__", version_json) ) - s = f""" -
    {modality}
    -
    -{seq}
    - """ - return s - - -def atomicReadTemplate(read: Read) -> str: - files = "".join(atomicFileTemplate(f) for f in read.files) if read.files else "" - - s = f""" -
    - {read.name} -
      -
    • read_id: {read.read_id}
    • -
    • primer_id: {read.primer_id}
    • -
    • min_len: {read.min_len}
    • -
    • max_len: {read.max_len}
    • -
    • strand: {read.strand}
    • -
    • - files: -
        - {files} -
      -
    • -
    -
    - """ - return s - - -def atomicFileTemplate(file: File) -> str: - s = f""" -
  • {file.filename} (md5: {file.md5})
  • - """ - return s - - -def readsTemplate(reads: List[Read]) -> str: - s = f"""
    1. - {"
    2. ".join([atomicReadTemplate(r) for r in reads])} -
    """ - return s - - -def multiModalTemplate(spec: Assay) -> str: - modes = spec.modalities - s = "" - for m in modes: - libspec = spec.get_libspec(m) - seqspec = spec.get_seqspec(m) - - s += f""" - {libStructTemplate(spec, m)} -

    Sequence structure

    - {readsTemplate(seqspec)} -

    Library structure

    - {regionsTemplate(libspec.get_leaves())} - """ - return s - - -def htmlTemplate(spec: Assay) -> str: - s = f""" - - - - - - - -
    -
    Back
    -
    - {headerTemplate(spec.name, spec.doi, spec.description, spec.modalities)} -
    -
    -

    Final library

    - {multiModalTemplate(spec)} -
    -
    - - - """ - return s + + +def asset_text(name: str) -> str: + """Read a packaged asset file.""" + return files("seqspec.report_assets").joinpath(name).read_text(encoding="utf-8") + + +def escape_script_json(value: str) -> str: + """Escape JSON before embedding it in a script tag.""" + return value.replace(" None: """Run the upgrade command.""" validate_upgrade_args(parser, args) - spec = load_spec(args.yaml) - version = spec.seqspec_version + spec = load_spec(args.yaml, strict=False) + version = spec.seqspec_version or "0.0.0" upgraded_spec = seqspec_upgrade(spec, version) if args.output: @@ -65,11 +65,12 @@ def run_upgrade(parser: ArgumentParser, args: Namespace) -> None: def seqspec_upgrade(spec: Assay, version: str) -> Assay: """Upgrade spec to current version.""" UPGRADE = { - "0.0.0": upgrade_0_0_0_to_0_3_0, - "0.1.0": upgrade_0_1_0_to_0_3_0, - "0.1.1": upgrade_0_1_1_to_0_3_0, - "0.2.0": upgrade_0_2_0_to_0_3_0, - "0.3.0": upgrade_0_3_0_to_0_3_0, + "0.0.0": upgrade_0_0_0_to_0_4_0, + "0.1.0": upgrade_0_1_0_to_0_4_0, + "0.1.1": upgrade_0_1_1_to_0_4_0, + "0.2.0": upgrade_0_2_0_to_0_4_0, + "0.3.0": upgrade_0_3_0_to_0_4_0, + "0.4.0": upgrade_0_4_0_to_0_4_0, } if version not in UPGRADE: @@ -80,13 +81,19 @@ def seqspec_upgrade(spec: Assay, version: str) -> Assay: return UPGRADE[version](spec) -def upgrade_0_3_0_to_0_3_0(spec: Assay) -> Assay: +def upgrade_0_4_0_to_0_4_0(spec: Assay) -> Assay: """No upgrade needed for current version.""" return spec -def upgrade_0_2_0_to_0_3_0(spec: Assay) -> Assay: - """Upgrade spec from version 0.2.0 to 0.3.0.""" +def upgrade_0_3_0_to_0_4_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.3.0 to 0.4.0.""" + spec.seqspec_version = "0.4.0" + return spec + + +def upgrade_0_2_0_to_0_4_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.2.0 to 0.4.0.""" # Set files to empty for specs < v0.3.0 for r in spec.sequence_spec: r.set_files( @@ -119,19 +126,19 @@ def upgrade_0_2_0_to_0_3_0(spec: Assay) -> Assay: md5=md5, ) 
spec.seqspec_version = "0.3.0" - return spec + return upgrade_0_3_0_to_0_4_0(spec) -def upgrade_0_1_1_to_0_3_0(spec: Assay) -> Assay: - """Upgrade spec from version 0.1.1 to 0.3.0.""" - return upgrade_0_2_0_to_0_3_0(spec) +def upgrade_0_1_1_to_0_4_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.1.1 to 0.4.0.""" + return upgrade_0_2_0_to_0_4_0(spec) -def upgrade_0_1_0_to_0_3_0(spec: Assay) -> Assay: - """Upgrade spec from version 0.1.0 to 0.3.0.""" - return upgrade_0_2_0_to_0_3_0(spec) +def upgrade_0_1_0_to_0_4_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.1.0 to 0.4.0.""" + return upgrade_0_2_0_to_0_4_0(spec) -def upgrade_0_0_0_to_0_3_0(spec: Assay) -> Assay: - """Upgrade spec from version 0.0.0 to 0.3.0.""" - return upgrade_0_2_0_to_0_3_0(spec) +def upgrade_0_0_0_to_0_4_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.0.0 to 0.4.0.""" + return upgrade_0_2_0_to_0_4_0(spec) diff --git a/seqspec/utils.py b/seqspec/utils.py index 4ec5fdd7..c05e83d9 100644 --- a/seqspec/utils.py +++ b/seqspec/utils.py @@ -27,6 +27,7 @@ SeqKitInput, SeqProtocolInput, ) +from seqspec.auth import AuthRegistry from seqspec.File import File, FileInput from seqspec.Read import Read, ReadInput from seqspec.Region import Onlist, Region, RegionInput @@ -444,13 +445,26 @@ def load_spec_stream(spec_stream: IO) -> Assay: return assay +# def yaml_safe_dump(obj): +# if isinstance(obj, list): +# return [o.model_dump() if hasattr(o, "model_dump") else o for o in obj] +# elif hasattr(obj, "model_dump"): +# return obj.model_dump() +# else: +# return obj + + +# # rust compatible via snapshot def yaml_safe_dump(obj): if isinstance(obj, list): - return [o.model_dump() if hasattr(o, "model_dump") else o for o in obj] - elif hasattr(obj, "model_dump"): + return [yaml_safe_dump(o) for o in obj] + if hasattr(obj, "model_dump"): return obj.model_dump() - else: - return obj + # allow engines/proxies to serialize via snapshot() + snap = getattr(obj, "snapshot", None) + if 
callable(snap): + return yaml_safe_dump(snap()) + return obj def load_genbank(gbk_fn: str): @@ -489,8 +503,18 @@ def yield_onlist_contents(stream): yield line.strip().split()[0] +def local_resource_url(url: str, filename: str, resource: str) -> str: + if not url: + raise ValueError(f"local {resource} '{filename}' has empty url") + return str(url) + + +def local_onlist_locator(onlist: Onlist) -> str: + return local_resource_url(onlist.url, onlist.filename, "onlist") + + def read_local_list(onlist: Onlist, base_path: str = "") -> List[str]: - filename = os.path.join(base_path, onlist.filename) + filename = os.path.join(base_path, local_onlist_locator(onlist)) stream = open(filename, "rb") # do we need to decompress? if filename.endswith(".gz"): @@ -506,7 +530,9 @@ def read_local_list(onlist: Onlist, base_path: str = "") -> List[str]: return results -def read_remote_list(onlist: Onlist, base_path: str = "") -> List[str]: +def read_remote_list( + onlist: Onlist, base_path: str = "", auth_profile: Optional[str] = None +) -> List[str]: """Given an onlist object read the local or remote data""" filename = str(onlist.filename) if onlist.url: @@ -515,7 +541,7 @@ def read_remote_list(onlist: Onlist, base_path: str = "") -> List[str]: stream = None try: # open stream - auth = get_remote_auth_token() + auth = get_remote_auth_token(filename, auth_profile) response = requests.get(filename, stream=True, auth=auth) response.raise_for_status() # Read into an in-memory bytes buffer to satisfy type expectations @@ -546,16 +572,21 @@ def read_remote_list(onlist: Onlist, base_path: str = "") -> List[str]: return results -def get_remote_auth_token(): - """Look for authentication tokens for accessing remote resources""" +def get_remote_auth_token( + uri: Optional[str] = None, auth_profile: Optional[str] = None +) -> Optional[Tuple[str, str]]: + """Look for authentication tokens for accessing remote resources.""" + if uri is not None: + registry = AuthRegistry.load() + auth = 
registry.resolve_requests_auth(uri, auth_profile) + if auth is not None: + return auth + username = os.environ.get("IGVF_API_KEY") password = os.environ.get("IGVF_SECRET_KEY") - if not (username is None or password is None): - auth = (username, password) - else: - auth = None - - return auth + if username is None or password is None: + return None + return (username, password) def region_ids_in_spec(seqspec, modality, region_ids): @@ -567,23 +598,17 @@ def region_ids_in_spec(seqspec, modality, region_ids): return found -def file_exists(uri): +def file_exists(uri: str, auth_profile: Optional[str] = None) -> bool: try: - if uri.startswith("https://api.data.igvf.org"): - auth = get_remote_auth_token() - if auth is None: - print("Warning: IGVF_API_KEY and IGVF_SECRET_KEY not set") - r = requests.head(uri, auth=auth) - if r.status_code == 307: - # igvf download link will redirect to a presigned amazon s3 url, HEAD request will not work. - r = requests.get(r.headers["Location"], headers={"Range": "bytes=0-0"}) - return r.status_code == 206 - return r.status_code == 200 - r = requests.head(uri) - if r.status_code == 302: - return file_exists(r.headers["Location"]) - return r.status_code == 200 - except requests.ConnectionError: + auth = get_remote_auth_token(uri, auth_profile) + r = requests.get( + uri, + headers={"Range": "bytes=0-0"}, + auth=auth, + allow_redirects=True, + ) + return r.status_code in (200, 206) + except requests.RequestException: return False diff --git a/src/auth.rs b/src/auth.rs new file mode 100644 index 00000000..719ff7b9 --- /dev/null +++ b/src/auth.rs @@ -0,0 +1,654 @@ +use anyhow::{anyhow, bail, Context, Result}; +use clap::ValueEnum; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::env; +use std::fs; +use std::io::Read as IoRead; +use std::path::PathBuf; +use std::process::{Command, Stdio}; + +const AUTH_CONFIG_ENV: &str = "SEQSPEC_AUTH_CONFIG"; + +#[derive(Debug, Clone, Serialize)] +pub struct ConfigLocation { + 
#[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, + pub source: String, + pub exists: bool, +} + +#[derive(Debug, Clone, Default, Deserialize, Serialize)] +struct AuthConfigFile { + #[serde(default)] + profiles: BTreeMap, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, ValueEnum)] +#[serde(rename_all = "lowercase")] +pub enum AuthKind { + Basic, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub struct AuthProfile { + #[serde(default)] + pub hosts: Vec, + pub kind: AuthKind, + pub username_env: String, + pub password_env: String, +} + +#[derive(Debug, Clone)] +pub struct AuthRegistry { + location: ConfigLocation, + profiles: BTreeMap, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub struct ProfileSummary { + pub name: String, + pub kind: AuthKind, + pub hosts: Vec, + pub username_env: String, + pub username_present: bool, + pub password_env: String, + pub password_present: bool, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub struct ResolvedProfileSummary { + pub url: String, + pub host: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub profile: Option, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub struct InitProfileOutput { + pub profile: String, + pub path: PathBuf, + pub created_config: bool, + pub updated_profile: bool, + pub hosts: Vec, + pub kind: AuthKind, + pub username_env: String, + pub password_env: String, +} + +#[derive(Debug, Clone)] +pub struct RemoteAccess { + registry: AuthRegistry, + selected_profile: Option, +} + +#[derive(Debug, Clone)] +struct ResolvedCredentials { + username: String, + password: String, +} + +impl AuthRegistry { + pub fn load() -> Result { + let location = config_location(); + let profiles = load_profiles(&location)?; + Ok(Self { location, profiles }) + } + + pub fn location(&self) -> &ConfigLocation { + &self.location + } + + pub fn profile_summaries(&self) -> Vec { + self.profiles + .iter() 
+ .map(|(name, profile)| ProfileSummary { + name: name.clone(), + kind: profile.kind.clone(), + hosts: profile.hosts.clone(), + username_env: profile.username_env.clone(), + username_present: env::var_os(&profile.username_env).is_some(), + password_env: profile.password_env.clone(), + password_present: env::var_os(&profile.password_env).is_some(), + }) + .collect() + } + + pub fn resolve_summary( + &self, + url: &str, + selected_profile: Option<&str>, + ) -> Result { + let host = host_from_url(url)?; + let profile = self + .resolve_profile(&host, selected_profile)? + .map(|(name, profile)| ProfileSummary { + name, + kind: profile.kind.clone(), + hosts: profile.hosts.clone(), + username_env: profile.username_env.clone(), + username_present: env::var_os(&profile.username_env).is_some(), + password_env: profile.password_env.clone(), + password_present: env::var_os(&profile.password_env).is_some(), + }); + Ok(ResolvedProfileSummary { + url: url.to_string(), + host, + profile, + }) + } + + fn resolve_credentials( + &self, + url: &str, + selected_profile: Option<&str>, + ) -> Result> { + let host = host_from_url(url)?; + let Some((profile_name, profile)) = self.resolve_profile(&host, selected_profile)? 
else { + return Ok(None); + }; + + let username = env::var(&profile.username_env).with_context(|| { + format!( + "auth profile '{}' requires env var '{}' for host '{}'", + profile_name, profile.username_env, host + ) + })?; + let password = env::var(&profile.password_env).with_context(|| { + format!( + "auth profile '{}' requires env var '{}' for host '{}'", + profile_name, profile.password_env, host + ) + })?; + + Ok(Some(ResolvedCredentials { username, password })) + } + + fn resolve_profile( + &self, + host: &str, + selected_profile: Option<&str>, + ) -> Result> { + if let Some(profile_name) = selected_profile { + let profile = self.profiles.get(profile_name).ok_or_else(|| { + anyhow!( + "auth profile '{}' is not defined in {}", + profile_name, + display_config_path(&self.location) + ) + })?; + if !profile.matches_host(host) { + bail!( + "auth profile '{}' does not match host '{}'", + profile_name, + host + ); + } + return Ok(Some((profile_name.to_string(), profile))); + } + + let matches: Vec<(String, &AuthProfile)> = self + .profiles + .iter() + .filter(|(_, profile)| profile.matches_host(host)) + .map(|(name, profile)| (name.clone(), profile)) + .collect(); + + match matches.len() { + 0 => Ok(None), + 1 => Ok(matches.into_iter().next()), + _ => { + let names = matches + .into_iter() + .map(|(name, _)| name) + .collect::>() + .join(", "); + bail!("multiple auth profiles match host '{}': {}", host, names) + } + } + } +} + +impl AuthProfile { + fn matches_host(&self, host: &str) -> bool { + self.hosts + .iter() + .any(|candidate| candidate.eq_ignore_ascii_case(host)) + } +} + +impl RemoteAccess { + pub fn anonymous() -> Self { + Self { + registry: AuthRegistry { + location: ConfigLocation { + path: None, + source: "anonymous".to_string(), + exists: false, + }, + profiles: BTreeMap::new(), + }, + selected_profile: None, + } + } + + pub fn load(selected_profile: Option<&str>) -> Result { + let registry = AuthRegistry::load()?; + if let Some(profile_name) = 
selected_profile { + if !registry.profiles.contains_key(profile_name) { + bail!( + "auth profile '{}' is not defined in {}", + profile_name, + display_config_path(registry.location()) + ); + } + } + Ok(Self { + registry, + selected_profile: selected_profile.map(|profile| profile.to_string()), + }) + } + + pub fn with_reader(&self, url: &str, read_fn: F) -> Result + where + F: FnOnce(Box) -> Result, + { + let credentials = self + .registry + .resolve_credentials(url, self.selected_profile.as_deref())?; + + let mut command = Command::new("curl"); + command.arg("-fsSL"); + if let Some(credentials) = credentials { + command + .arg("--user") + .arg(format!("{}:{}", credentials.username, credentials.password)); + } + command + .arg(url) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let mut child = command + .spawn() + .with_context(|| format!("failed to spawn curl for '{}'", url))?; + + let mut stderr = child + .stderr + .take() + .ok_or_else(|| anyhow!("curl did not provide stderr for '{}'", url))?; + + let read_result = { + let stdout = child + .stdout + .take() + .ok_or_else(|| anyhow!("curl did not provide stdout for '{}'", url))?; + read_fn(Box::new(stdout)) + }; + + let status = child + .wait() + .with_context(|| format!("failed to wait for curl while reading '{}'", url))?; + + let mut stderr_text = String::new(); + let _ = stderr.read_to_string(&mut stderr_text); + let stderr_text = stderr_text.trim(); + + match (read_result, status.success()) { + (Ok(value), true) => Ok(value), + (Ok(_), false) => { + if stderr_text.is_empty() { + bail!("curl exited with status {} while reading '{}'", status, url); + } + bail!( + "curl exited with status {} while reading '{}': {}", + status, + url, + stderr_text + ); + } + (Err(err), true) => Err(err), + (Err(err), false) => { + if stderr_text.is_empty() { + Err(err.context(format!( + "curl exited with status {} while reading '{}'", + status, url + ))) + } else { + Err(err.context(format!( + "curl exited with status {} 
while reading '{}': {}", + status, url, stderr_text + ))) + } + } + } + } + + pub fn url_exists(&self, url: &str) -> Result { + let credentials = self + .registry + .resolve_credentials(url, self.selected_profile.as_deref())?; + + let mut command = Command::new("curl"); + command.arg("-fsSL"); + command.arg("-r").arg("0-0"); + command.arg("-o").arg("/dev/null"); + if let Some(credentials) = credentials { + command + .arg("--user") + .arg(format!("{}:{}", credentials.username, credentials.password)); + } + command.arg(url).stderr(Stdio::piped()); + + let output = command + .output() + .with_context(|| format!("failed to spawn curl for '{}'", url))?; + + if output.status.success() { + return Ok(true); + } + + let stderr_text = String::from_utf8_lossy(&output.stderr); + let stderr_text = stderr_text.trim(); + if stderr_text.contains("404") || stderr_text.contains("403") { + return Ok(false); + } + Ok(false) + } +} + +pub fn init_profile(profile_name: &str, profile: AuthProfile) -> Result { + let location = config_location(); + let path = location + .path + .clone() + .ok_or_else(|| anyhow!("no auth config path is available on this system"))?; + + let mut config = if path.exists() { + read_config_file(&path)? 
+ } else { + AuthConfigFile::default() + }; + + let created_config = !path.exists(); + let updated_profile = config.profiles.contains_key(profile_name); + config + .profiles + .insert(profile_name.to_string(), profile.clone()); + + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).with_context(|| { + format!( + "failed to create auth config directory '{}'", + parent.display() + ) + })?; + } + + let text = toml::to_string_pretty(&config) + .with_context(|| format!("failed to serialize auth config '{}'", path.display()))?; + fs::write(&path, text) + .with_context(|| format!("failed to write auth config '{}'", path.display()))?; + + Ok(InitProfileOutput { + profile: profile_name.to_string(), + path, + created_config, + updated_profile, + hosts: profile.hosts, + kind: profile.kind, + username_env: profile.username_env, + password_env: profile.password_env, + }) +} + +fn config_location() -> ConfigLocation { + if let Some(path) = env::var_os(AUTH_CONFIG_ENV).map(PathBuf::from) { + return ConfigLocation { + exists: path.exists(), + path: Some(path), + source: format!("env:{}", AUTH_CONFIG_ENV), + }; + } + + if let Some(base) = env::var_os("XDG_CONFIG_HOME").map(PathBuf::from) { + let path = base.join("seqspec").join("auth.toml"); + return ConfigLocation { + exists: path.exists(), + path: Some(path), + source: "xdg_config_home".to_string(), + }; + } + + if let Some(home) = env::var_os("HOME").map(PathBuf::from) { + let path = home.join(".config").join("seqspec").join("auth.toml"); + return ConfigLocation { + exists: path.exists(), + path: Some(path), + source: "home_default".to_string(), + }; + } + + ConfigLocation { + path: None, + source: "unavailable".to_string(), + exists: false, + } +} + +fn load_profiles(location: &ConfigLocation) -> Result> { + let Some(path) = &location.path else { + return Ok(BTreeMap::new()); + }; + + if !path.exists() { + if location.source.starts_with("env:") { + bail!("auth config does not exist: {}", path.display()); + } + 
return Ok(BTreeMap::new()); + } + + let parsed = read_config_file(path)?; + Ok(parsed.profiles) +} + +fn read_config_file(path: &PathBuf) -> Result { + let text = fs::read_to_string(path) + .with_context(|| format!("failed to read auth config '{}'", path.display()))?; + toml::from_str(&text) + .with_context(|| format!("failed to parse auth config '{}'", path.display())) +} + +fn host_from_url(url: &str) -> Result { + let (_, rest) = url + .split_once("://") + .ok_or_else(|| anyhow!("URL '{}' does not contain a scheme", url))?; + let authority = rest + .split('/') + .next() + .ok_or_else(|| anyhow!("URL '{}' does not contain an authority", url))?; + let without_userinfo = authority.rsplit('@').next().unwrap_or(authority); + + let host = if without_userinfo.starts_with('[') { + let end = without_userinfo + .find(']') + .ok_or_else(|| anyhow!("URL '{}' has an invalid IPv6 host", url))?; + &without_userinfo[1..end] + } else { + without_userinfo + .split(':') + .next() + .ok_or_else(|| anyhow!("URL '{}' has an invalid host", url))? 
+ }; + + if host.is_empty() { + bail!("URL '{}' has an empty host", url); + } + + Ok(host.to_ascii_lowercase()) +} + +fn display_config_path(location: &ConfigLocation) -> String { + location + .path + .as_ref() + .map(|path| path.display().to_string()) + .unwrap_or_else(|| "".to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::{Read, Write}; + use std::net::TcpListener; + use std::path::Path; + use std::sync::{Mutex, OnceLock}; + use std::thread; + + fn env_lock() -> &'static Mutex<()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())) + } + + fn write_config(root: &Path, text: &str) -> PathBuf { + let path = root.join("auth.toml"); + fs::write(&path, text).unwrap(); + path + } + + #[test] + fn test_registry_loads_profiles_from_env_path() { + let _guard = env_lock().lock().unwrap(); + let root = env::temp_dir().join(format!( + "seqspec-auth-{}-{}", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + fs::create_dir_all(&root).unwrap(); + let path = write_config( + &root, + r#" +[profiles.igvf] +hosts = ["api.data.igvf.org", "data.igvf.org"] +kind = "basic" +username_env = "IGVF_ACCESS_KEY_ID" +password_env = "IGVF_ACCESS_KEY_SECRET" +"#, + ); + + env::set_var(AUTH_CONFIG_ENV, &path); + let registry = AuthRegistry::load().unwrap(); + env::remove_var(AUTH_CONFIG_ENV); + + assert_eq!(registry.profile_summaries().len(), 1); + assert_eq!(registry.profile_summaries()[0].name, "igvf"); + + fs::remove_dir_all(root).unwrap(); + } + + #[test] + fn test_resolve_summary_matches_host() { + let _guard = env_lock().lock().unwrap(); + let root = env::temp_dir().join(format!( + "seqspec-auth-{}-{}", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + fs::create_dir_all(&root).unwrap(); + let path = write_config( + &root, + r#" +[profiles.igvf] +hosts = ["api.data.igvf.org"] 
+kind = "basic" +username_env = "IGVF_ACCESS_KEY_ID" +password_env = "IGVF_ACCESS_KEY_SECRET" +"#, + ); + env::set_var(AUTH_CONFIG_ENV, &path); + let registry = AuthRegistry::load().unwrap(); + env::remove_var(AUTH_CONFIG_ENV); + + let resolved = registry + .resolve_summary("https://api.data.igvf.org/reference-files/foo", None) + .unwrap(); + + assert_eq!(resolved.host, "api.data.igvf.org"); + assert_eq!(resolved.profile.unwrap().name, "igvf"); + + fs::remove_dir_all(root).unwrap(); + } + + #[test] + fn test_remote_access_sends_basic_auth() { + let _guard = env_lock().lock().unwrap(); + let root = env::temp_dir().join(format!( + "seqspec-auth-{}-{}", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + fs::create_dir_all(&root).unwrap(); + let path = write_config( + &root, + r#" +[profiles.local] +hosts = ["127.0.0.1"] +kind = "basic" +username_env = "SEQSPEC_TEST_USER" +password_env = "SEQSPEC_TEST_PASS" +"#, + ); + env::set_var(AUTH_CONFIG_ENV, &path); + env::set_var("SEQSPEC_TEST_USER", "alice"); + env::set_var("SEQSPEC_TEST_PASS", "secret"); + + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let mut buffer = [0_u8; 4096]; + let size = stream.read(&mut buffer).unwrap(); + let request = String::from_utf8_lossy(&buffer[..size]); + let authorized = request.contains("Authorization: Basic YWxpY2U6c2VjcmV0"); + let (status, body) = if authorized { + ("200 OK", "AAAA\nCCCC\n") + } else { + ("401 Unauthorized", "missing auth\n") + }; + let response = format!( + "HTTP/1.1 {}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}", + status, + body.len(), + body + ); + stream.write_all(response.as_bytes()).unwrap(); + }); + + let access = RemoteAccess::load(Some("local")).unwrap(); + let text = access + .with_reader(&format!("http://{}/barcodes.txt", 
addr), |mut reader| { + let mut text = String::new(); + reader.read_to_string(&mut text)?; + Ok(text) + }) + .unwrap(); + + env::remove_var(AUTH_CONFIG_ENV); + env::remove_var("SEQSPEC_TEST_USER"); + env::remove_var("SEQSPEC_TEST_PASS"); + + server.join().unwrap(); + + assert_eq!(text, "AAAA\nCCCC\n"); + fs::remove_dir_all(root).unwrap(); + } +} diff --git a/src/compat.rs b/src/compat.rs new file mode 100644 index 00000000..2a29f3e9 --- /dev/null +++ b/src/compat.rs @@ -0,0 +1,362 @@ +use serde::Deserialize; +use std::path::Path; + +use crate::models::assay::{Assay, LibKit, LibProtocol, SeqKit, SeqProtocol}; +use crate::models::file::File; +use crate::models::onlist::Onlist; +use crate::models::read::Read; +use crate::models::region::Region; + +#[derive(Clone, Debug, Deserialize)] +pub struct AssayCompat { + pub seqspec_version: Option, + pub assay_id: Option, + pub name: Option, + pub doi: Option, + pub date: Option, + pub description: Option, + pub modalities: Option>, + pub lib_struct: Option, + + pub sequence_protocol: Option>, + pub sequence_kit: Option>, + pub library_protocol: Option>, + pub library_kit: Option>, + + pub sequence_spec: Option>, + pub library_spec: Option>, +} + +impl AssayCompat { + pub fn into_assay(self) -> Assay { + let modalities = self.modalities.unwrap_or_default(); + Assay::new( + self.assay_id.unwrap_or_default(), + self.name.unwrap_or_default(), + self.doi.unwrap_or_default(), + self.date.unwrap_or_default(), + self.description.unwrap_or_default(), + modalities.clone(), + self.lib_struct.unwrap_or_default(), + self.sequence_spec + .unwrap_or_default() + .into_iter() + .map(CompatRead::into_read) + .collect(), + self.library_spec + .unwrap_or_default() + .into_iter() + .map(CompatRegion::into_region) + .collect(), + normalize_field( + self.sequence_protocol, + &modalities, + |value, modality| SeqProtocol { + protocol_id: value.clone(), + name: value, + modality, + }, + CompatSeqProtocol::into_seqprotocol, + ), + normalize_field( + 
self.sequence_kit, + &modalities, + |value, modality| SeqKit { + kit_id: value.clone(), + name: Some(value), + modality, + }, + CompatSeqKit::into_seqkit, + ), + normalize_field( + self.library_protocol, + &modalities, + |value, modality| LibProtocol { + protocol_id: value.clone(), + name: value, + modality, + }, + CompatLibProtocol::into_libprotocol, + ), + normalize_field( + self.library_kit, + &modalities, + |value, modality| LibKit { + kit_id: value.clone(), + name: Some(value), + modality, + }, + CompatLibKit::into_libkit, + ), + self.seqspec_version, + ) + } +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +pub enum CompatField { + Text(String), + Item(T), + Items(Vec>), +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +pub enum CompatFieldItem { + Text(String), + Item(T), +} + +fn normalize_field( + value: Option>, + modalities: &[String], + from_text: FText, + into_output: FInto, +) -> Option> +where + FText: Fn(String, String) -> Output, + FInto: Fn(Input) -> Output, +{ + let value = value?; + let mut out = Vec::new(); + + match value { + CompatField::Text(text) => { + for modality in modalities { + out.push(from_text(text.clone(), modality.clone())); + } + } + CompatField::Item(item) => out.push(into_output(item)), + CompatField::Items(items) => { + for item in items { + match item { + CompatFieldItem::Text(text) => { + for modality in modalities { + out.push(from_text(text.clone(), modality.clone())); + } + } + CompatFieldItem::Item(item) => out.push(into_output(item)), + } + } + } + } + + Some(out) +} + +#[derive(Clone, Debug, Deserialize)] +pub struct CompatSeqProtocol { + pub protocol_id: Option, + pub name: Option, + pub modality: Option, +} + +impl CompatSeqProtocol { + fn into_seqprotocol(self) -> SeqProtocol { + SeqProtocol { + protocol_id: self.protocol_id.unwrap_or_else(|| "auto-id".to_string()), + name: self.name.unwrap_or_default(), + modality: self.modality.unwrap_or_default(), + } + } +} + +#[derive(Clone, Debug, 
Deserialize)] +pub struct CompatSeqKit { + pub kit_id: Option, + pub name: Option, + pub modality: Option, +} + +impl CompatSeqKit { + fn into_seqkit(self) -> SeqKit { + SeqKit { + kit_id: self.kit_id.unwrap_or_default(), + name: self.name, + modality: self.modality.unwrap_or_default(), + } + } +} + +#[derive(Clone, Debug, Deserialize)] +pub struct CompatLibProtocol { + pub protocol_id: Option, + pub name: Option, + pub modality: Option, +} + +impl CompatLibProtocol { + fn into_libprotocol(self) -> LibProtocol { + LibProtocol { + protocol_id: self.protocol_id.unwrap_or_default(), + name: self.name.unwrap_or_default(), + modality: self.modality.unwrap_or_default(), + } + } +} + +#[derive(Clone, Debug, Deserialize)] +pub struct CompatLibKit { + pub kit_id: Option, + pub name: Option, + pub modality: Option, +} + +impl CompatLibKit { + fn into_libkit(self) -> LibKit { + LibKit { + kit_id: self.kit_id.unwrap_or_default(), + name: self.name, + modality: self.modality.unwrap_or_default(), + } + } +} + +#[derive(Clone, Debug, Deserialize)] +pub struct CompatFile { + pub file_id: Option, + pub filename: Option, + pub filetype: Option, + pub filesize: Option, + pub url: Option, + pub urltype: Option, + pub md5: Option, +} + +impl CompatFile { + fn into_file(self) -> File { + let filename = self.filename.unwrap_or_default(); + let basename = if filename.is_empty() { + String::new() + } else { + Path::new(&filename) + .file_name() + .map(|name| name.to_string_lossy().into_owned()) + .unwrap_or_default() + }; + let filetype = if let Some(filetype) = self.filetype { + filetype + } else if filename.is_empty() { + String::new() + } else { + Path::new(&filename) + .extension() + .map(|ext| ext.to_string_lossy().into_owned()) + .unwrap_or_default() + }; + + File::new( + self.file_id.unwrap_or(basename), + filename, + filetype, + self.filesize.unwrap_or_default(), + self.url.unwrap_or_default(), + self.urltype.unwrap_or_else(|| "local".to_string()), + self.md5.unwrap_or_default(), + 
) + } +} + +#[derive(Clone, Debug, Deserialize)] +pub struct CompatOnlist { + pub file_id: Option, + pub filename: Option, + pub filetype: Option, + pub filesize: Option, + pub url: Option, + pub urltype: Option, + pub md5: Option, +} + +impl CompatOnlist { + fn into_onlist(self) -> Onlist { + Onlist::new( + self.file_id.unwrap_or_default(), + self.filename.unwrap_or_default(), + self.filetype.unwrap_or_default(), + self.filesize.unwrap_or_default(), + self.url.unwrap_or_default(), + self.urltype.unwrap_or_else(|| "local".to_string()), + self.md5.unwrap_or_default(), + ) + } +} + +#[derive(Clone, Debug, Deserialize)] +pub struct CompatRead { + pub read_id: Option, + pub name: Option, + pub modality: Option, + pub primer_id: Option, + pub min_len: Option, + pub max_len: Option, + pub strand: Option, + pub files: Option>, +} + +impl CompatRead { + fn into_read(self) -> Read { + let read_id = self.read_id.unwrap_or_default(); + Read::new( + read_id.clone(), + self.name.unwrap_or_else(|| read_id.clone()), + self.modality.unwrap_or_default(), + self.primer_id.unwrap_or_default(), + self.min_len.unwrap_or_default(), + self.max_len.unwrap_or_default(), + self.strand.unwrap_or_else(|| "pos".to_string()), + self.files + .unwrap_or_default() + .into_iter() + .map(CompatFile::into_file) + .collect(), + ) + } +} + +#[derive(Clone, Debug, Deserialize)] +pub struct CompatRegion { + pub region_id: Option, + pub region_type: Option, + pub name: Option, + pub sequence_type: Option, + pub sequence: Option, + pub min_len: Option, + pub max_len: Option, + pub onlist: Option, + pub regions: Option>, +} + +impl CompatRegion { + fn into_region(self) -> Region { + let region_id = self.region_id.unwrap_or_default(); + let min_len = self.min_len.unwrap_or_default(); + let max_len = self + .max_len + .unwrap_or(if min_len == 0 { 1024 } else { min_len }); + let sequence_type = self.sequence_type.unwrap_or_else(|| "fixed".to_string()); + let sequence = match self.sequence { + Some(sequence) => 
sequence, + None if sequence_type == "random" => "X".repeat(min_len as usize), + None if sequence_type == "onlist" => "N".repeat(min_len as usize), + None => String::new(), + }; + + Region::new( + region_id.clone(), + self.region_type.unwrap_or_else(|| region_id.clone()), + self.name.unwrap_or(region_id), + sequence_type, + sequence, + min_len, + max_len, + self.onlist.map(CompatOnlist::into_onlist), + self.regions + .unwrap_or_default() + .into_iter() + .map(CompatRegion::into_region) + .collect(), + ) + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 00000000..49460c7d --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,27 @@ +mod compat; + +pub mod models; + +pub use models::{assay, coordinate, file, onlist, read, region}; +pub mod auth; +pub mod seqspec_auth; +pub mod seqspec_check; +pub mod seqspec_file; +pub mod seqspec_find; +pub mod seqspec_format; +pub mod seqspec_html; +pub mod seqspec_index; +pub mod seqspec_info; +pub mod seqspec_init; +pub mod seqspec_insert; +pub mod seqspec_methods; +pub mod seqspec_modify; +pub mod seqspec_onlist; +pub mod seqspec_print; +pub mod seqspec_split; +pub mod seqspec_upgrade; +pub mod seqspec_version; +pub mod utils; + +// #[cfg(feature = "python-binding")] +// mod py_module; // lives in src/py_module.rs diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 00000000..cae953de --- /dev/null +++ b/src/main.rs @@ -0,0 +1,134 @@ +#![allow(warnings)] +#![allow(unused_imports)] +#![allow(unused_variables)] +#![allow(dead_code)] + +use seqspec::seqspec_auth; +use seqspec::seqspec_check; +use seqspec::seqspec_file; +use seqspec::seqspec_find; +use seqspec::seqspec_format; +use seqspec::seqspec_index; +use seqspec::seqspec_info; +use seqspec::seqspec_init; +use seqspec::seqspec_insert; +use seqspec::seqspec_methods; +use seqspec::seqspec_modify; +use seqspec::seqspec_onlist; +use seqspec::seqspec_print; +use seqspec::seqspec_split; +use seqspec::seqspec_upgrade; +use seqspec::seqspec_version; +use 
seqspec::utils; + +use clap::{Parser, Subcommand}; + +const BUILD_DEPRECATED_MESSAGE: &str = + "seqspec build is deprecated. Use seqspec init/insert/modify or construct the spec directly."; + +#[derive(Parser, Debug)] +#[command(name = "seqspec", version)] +struct Args { + #[command(subcommand)] + subcmd: Commands, +} + +#[derive(clap::Args, Debug)] +struct BuildArgs {} + +#[derive(Subcommand, Debug)] +enum Commands { + Auth(seqspec_auth::AuthArgs), + Build(BuildArgs), + Version(seqspec_version::VersionArgs), + Format(seqspec_format::FormatArgs), + Find(seqspec_find::FindArgs), + Index(seqspec_index::IndexArgs), + File(seqspec_file::FileArgs), + Split(seqspec_split::SplitArgs), + Info(seqspec_info::InfoArgs), + Init(seqspec_init::InitArgs), + Methods(seqspec_methods::MethodsArgs), + Modify(seqspec_modify::ModifyArgs), + Upgrade(seqspec_upgrade::UpgradeArgs), + Insert(seqspec_insert::InsertArgs), + Check(seqspec_check::CheckArgs), + Onlist(seqspec_onlist::OnlistArgs), + Print(seqspec_print::PrintArgs), + // other subcommands later... 
+} + +fn main() { + let args = Args::parse(); + match args.subcmd { + Commands::Auth(args) => seqspec_auth::run(&args).unwrap(), + Commands::Build(_) => run_build_deprecated(), + Commands::Version(args) => seqspec_version::run_version(&args), + Commands::Format(args) => seqspec_format::run_format(&args), + Commands::Find(args) => seqspec_find::run_find(&args), + Commands::Index(args) => seqspec_index::run_index(&args), + Commands::File(args) => seqspec_file::run_file(&args), + Commands::Split(args) => seqspec_split::run_split(&args), + Commands::Info(args) => seqspec_info::run_info(&args), + Commands::Init(args) => seqspec_init::run_init(&args), + Commands::Methods(args) => seqspec_methods::run_methods(&args), + Commands::Modify(args) => seqspec_modify::run_modify(&args), + Commands::Upgrade(args) => seqspec_upgrade::run_upgrade(&args), + Commands::Insert(args) => seqspec_insert::run_insert(&args), + Commands::Check(args) => { + seqspec_check::run_check(&args); + } + Commands::Onlist(args) => seqspec_onlist::run_onlist(&args), + Commands::Print(args) => seqspec_print::run_print(&args), + } +} + +fn run_build_deprecated() { + eprintln!("{}", BUILD_DEPRECATED_MESSAGE); + std::process::exit(1); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_build_subcommand_is_recognized() { + let args = Args::try_parse_from(["seqspec", "build"]).unwrap(); + assert!(matches!(args.subcmd, Commands::Build(_))); + } + + #[test] + fn test_build_deprecated_message_matches_python_cli() { + assert_eq!( + BUILD_DEPRECATED_MESSAGE, + "seqspec build is deprecated. Use seqspec init/insert/modify or construct the spec directly." 
+ ); + } + + #[test] + fn test_find_defaults_match_python_cli() { + let args = Args::try_parse_from(["seqspec", "find", "-m", "rna", "spec.yaml"]).unwrap(); + match args.subcmd { + Commands::Find(find_args) => { + assert_eq!(find_args.selector, "region"); + assert!(find_args.id.is_none()); + } + _ => panic!("expected find subcommand"), + } + } + + #[test] + fn test_index_region_type_selector_is_rejected() { + let result = Args::try_parse_from([ + "seqspec", + "index", + "-m", + "rna", + "-s", + "region-type", + "spec.yaml", + ]); + assert!(result.is_err()); + } +} diff --git a/src/models/assay.rs b/src/models/assay.rs new file mode 100644 index 00000000..7b76be54 --- /dev/null +++ b/src/models/assay.rs @@ -0,0 +1,676 @@ +use serde::{Deserialize, Serialize}; +use std::io::{Error, ErrorKind}; + +use crate::models::file::File; +use crate::models::read::Read; +use crate::models::region::Region; + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct SeqProtocol { + pub protocol_id: String, + pub name: String, + pub modality: String, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct SeqKit { + pub kit_id: String, + pub name: Option, + pub modality: String, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct LibProtocol { + pub protocol_id: String, + pub name: String, + pub modality: String, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct LibKit { + pub kit_id: String, + pub name: Option, + pub modality: String, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct Assay { + pub seqspec_version: Option, + pub assay_id: String, + pub name: String, + pub doi: String, + pub date: String, + pub description: String, + pub modalities: Vec, + pub lib_struct: String, + + // Note we don't support the string type, only the object type + pub sequence_protocol: Option>, + pub sequence_kit: Option>, + pub library_protocol: Option>, + pub library_kit: Option>, + + 
pub sequence_spec: Vec, + pub library_spec: Vec, +} + +impl Assay { + pub fn new( + assay_id: String, + name: String, + doi: String, + date: String, + description: String, + modalities: Vec, + lib_struct: String, + sequence_spec: Vec, + library_spec: Vec, + sequence_protocol: Option>, + sequence_kit: Option>, + library_protocol: Option>, + library_kit: Option>, + seqspec_version: Option, + ) -> Self { + Self { + seqspec_version, + assay_id, + name, + doi, + date, + description, + modalities, + lib_struct, + sequence_protocol, + sequence_kit, + library_protocol, + library_kit, + sequence_spec, + library_spec, + } + } + + // JSON I/O -------------------------------------------------------- + pub fn from_json(json_str: &str) -> Result { + serde_json::from_str(json_str) + } + + pub fn to_json(&self) -> Result { + serde_json::to_string(self) + } + + pub fn to_bytes(&self) -> Result, std::io::Error> { + serde_yaml::to_string(self) + .map(|s| s.into_bytes()) + .map_err(|e| { + Error::new( + ErrorKind::InvalidData, + format!("failed to serialize assay: {e}"), + ) + }) + } + pub fn from_bytes(bytes: &[u8]) -> Result { + serde_yaml::from_slice(bytes).map_err(|e| { + Error::new( + ErrorKind::InvalidData, + format!("failed to parse assay YAML: {e}"), + ) + }) + } + + // Core helpers ---------------------------------------------------- + pub fn update_spec(&mut self) -> () { + for r in &mut self.library_spec { + r.update_attr(); + } + } + + pub fn list_modalities(&self) -> Vec { + self.modalities.clone() + } + + pub fn get_libspec(&self, modality: &str) -> Option { + self.modalities + .iter() + .position(|m| m == modality) + .and_then(|idx| self.library_spec.get(idx).cloned()) + .filter(|region| region.region_id == modality) + } + + pub fn get_seqspec(&self, modality: &str) -> Vec { + self.sequence_spec + .iter() + .filter(|r| r.modality == modality) + .cloned() + .collect() + } + + pub fn get_read(&self, read_id: &str) -> Option { + self.sequence_spec + .iter() + .find(|r| 
r.read_id == read_id) + .cloned() + } + + /// Insert regions under the top-level region for `modality`. + /// If `after` is Some(id), insert right after that child; else insert at index 0. + pub fn insert_regions( + &mut self, + regions: Vec, + modality: &str, + after: Option<&str>, + ) -> Result<(), String> { + let idx = self + .modalities + .iter() + .position(|m| m == modality) + .ok_or_else(|| format!("Modality '{modality}' not found"))?; + + let target = self + .library_spec + .get_mut(idx) + .ok_or_else(|| format!("Library spec missing at modality '{modality}'"))?; + + // Compute insertion index + let insert_idx = match after { + Some(aid) => target + .regions + .iter() + .position(|r| r.region_id == aid) + .map(|pos| pos + 1) + .ok_or_else(|| format!("No region with id '{aid}' under modality '{modality}'"))?, + None => 0, + }; + + target.regions.splice(insert_idx..insert_idx, regions); + target.update_attr(); + + Ok(()) + } + + /// Insert reads; if `after` is Some(id), insert right after that read. + /// Otherwise insert at the beginning. Also set read.modality = modality. 
+ pub fn insert_reads( + &mut self, + mut reads: Vec, + modality: &str, + after: Option<&str>, + ) -> Result<(), String> { + if !self.modalities.iter().any(|m| m == modality) { + return Err(format!("Modality '{modality}' not found")); + } + + // set modality on incoming reads (reuses allocation) + let modality_owned = modality.to_owned(); + for r in &mut reads { + r.modality.clone_from(&modality_owned); + } + + // compute insertion index + let insert_idx = match after { + Some(aid) => self + .sequence_spec + .iter() + .position(|r| r.read_id == aid) + .map(|p| p + 1) // insert after the found read + .unwrap_or(self.sequence_spec.len()), // if not found, append + None => 0, // insert at beginning + }; + + // insert all reads at once + self.sequence_spec.splice(insert_idx..insert_idx, reads); + + Ok(()) + } + + pub fn __repr__(&self) -> String { + format!( + "Assay: {} Modalities: {:?}", + self.assay_id, self.modalities + ) + } + + /// Returns the common file count if all reads have the same, non-zero length. 
+ fn file_count(reads: &[Read]) -> Option { + let first = reads.first()?.files.len(); + if first == 0 { + return None; + } + if reads.iter().all(|r| r.files.len() == first) { + Some(first) + } else { + None + } + } + + pub fn generate_group_ids(&self, modality: &str) -> Vec { + let reads = self.get_seqspec(modality); + let n = match Self::file_count(&reads) { + Some(n) => n, + None => return vec![], + }; + (0..n).collect() + } + + pub fn get_read_by_group_id(&self, modality: &str, group_id: usize) -> Option { + let reads = self.get_seqspec(modality); + let _n_files = Self::file_count(&reads)?; + if reads.is_empty() { + return None; + } + let read_idx = group_id % reads.len(); + reads.get(read_idx).map(|r| r.read_id.clone()) + } + + pub fn get_files_by_group_id(&self, modality: &str, group_id: usize) -> Option> { + let reads = self.get_seqspec(modality); + let _n_files = Self::file_count(&reads)?; + if reads.is_empty() { + return None; + } + let read_idx = group_id % reads.len(); + reads.get(read_idx).map(|r| r.files.clone()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + use std::path::PathBuf; + + fn sample_file() -> File { + File::new( + "f1".into(), + "R1.fq.gz".into(), + "fastq".into(), + 1024, + "R1.fq.gz".into(), + "local".into(), + "".into(), + ) + } + + fn sample_read(id: &str, modality: &str) -> Read { + Read::new( + id.into(), + id.into(), + modality.into(), + "primer1".into(), + 100, + 150, + "pos".into(), + vec![sample_file()], + ) + } + + fn sample_region(id: &str) -> Region { + Region::new( + id.into(), + "barcode".into(), + id.into(), + "fixed".into(), + "ATCG".into(), + 4, + 4, + None, + vec![], + ) + } + + fn sample_assay() -> Assay { + Assay::new( + "test_assay".into(), + "Test Assay".into(), + "https://doi.org/test".into(), + "20240101".into(), + "Test description".into(), + vec!["rna".into()], + "".into(), + vec![sample_read("R1", "rna"), sample_read("R2", "rna")], + vec![Region::new( + "rna".into(), + 
"rna".into(), + "rna".into(), + "joined".into(), + "".into(), + 0, + 0, + None, + vec![ + Region::new( + "primer1".into(), + "truseq_read1".into(), + "primer1".into(), + "fixed".into(), + "".into(), + 0, + 0, + None, + vec![], + ), + sample_region("bc"), + sample_region("umi"), + ], + )], + None, + None, + None, + None, + Some("0.3.0".into()), + ) + } + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + // ---- Creation & serialization ---- + + #[test] + fn test_assay_creation() { + let a = sample_assay(); + assert_eq!(a.assay_id, "test_assay"); + assert_eq!(a.name, "Test Assay"); + assert_eq!(a.modalities, vec!["rna"]); + assert_eq!(a.sequence_spec.len(), 2); + assert_eq!(a.library_spec.len(), 1); + } + + #[test] + fn test_assay_list_modalities() { + let a = sample_assay(); + assert_eq!(a.list_modalities(), vec!["rna"]); + } + + #[test] + fn test_assay_get_libspec() { + let a = sample_assay(); + let lib = a.get_libspec("rna"); + assert!(lib.is_some()); + assert_eq!(lib.unwrap().region_id, "rna"); + } + + #[test] + fn test_assay_get_libspec_not_found() { + let a = sample_assay(); + assert!(a.get_libspec("atac").is_none()); + } + + #[test] + fn test_assay_get_libspec_rejects_top_level_region_id_mismatch() { + let mut a = sample_assay(); + a.library_spec[0].region_id = "wrong".into(); + assert!(a.get_libspec("rna").is_none()); + } + + #[test] + fn test_assay_get_seqspec() { + let a = sample_assay(); + let reads = a.get_seqspec("rna"); + assert_eq!(reads.len(), 2); + } + + #[test] + fn test_assay_get_seqspec_empty() { + let a = sample_assay(); + let reads = a.get_seqspec("unknown"); + assert!(reads.is_empty()); + } + + #[test] + fn test_assay_get_read() { + let a = sample_assay(); + let read = a.get_read("R1"); + assert!(read.is_some()); + assert_eq!(read.unwrap().read_id, "R1"); + } + + #[test] + fn test_assay_get_read_not_found() { + let a = sample_assay(); + assert!(a.get_read("nonexistent").is_none()); + } + + #[test] + fn 
test_assay_update_spec() { + let mut a = sample_assay(); + a.update_spec(); + let lib = a.get_libspec("rna").unwrap(); + assert_eq!(lib.min_len, 8); // bc(4) + umi(4) + primer(0) + assert_eq!(lib.max_len, 8); + } + + // ---- Insert ---- + + #[test] + fn test_assay_insert_regions() { + let mut a = sample_assay(); + let orig_count = a.get_libspec("rna").unwrap().regions.len(); + let new_region = sample_region("new_bc"); + a.insert_regions(vec![new_region], "rna", None).unwrap(); + + let lib = a.get_libspec("rna").unwrap(); + assert_eq!(lib.regions.len(), orig_count + 1); + assert_eq!(lib.regions[0].region_id, "new_bc"); // inserted at beginning + } + + #[test] + fn test_assay_insert_regions_after() { + let mut a = sample_assay(); + let new_region = sample_region("new_bc"); + a.insert_regions(vec![new_region], "rna", Some("bc")) + .unwrap(); + + let lib = a.get_libspec("rna").unwrap(); + // Find position of new_bc — should be right after "bc" + let bc_pos = lib + .regions + .iter() + .position(|r| r.region_id == "bc") + .unwrap(); + let new_pos = lib + .regions + .iter() + .position(|r| r.region_id == "new_bc") + .unwrap(); + assert_eq!(new_pos, bc_pos + 1); + } + + #[test] + fn test_assay_insert_regions_invalid_modality() { + let mut a = sample_assay(); + let result = a.insert_regions(vec![sample_region("x")], "atac", None); + assert!(result.is_err()); + } + + #[test] + fn test_assay_insert_reads() { + let mut a = sample_assay(); + let orig_count = a.sequence_spec.len(); + let new_read = Read::new( + "I1".into(), + "Index 1".into(), + "".into(), + "p".into(), + 8, + 8, + "pos".into(), + vec![], + ); + a.insert_reads(vec![new_read], "rna", None).unwrap(); + + assert_eq!(a.sequence_spec.len(), orig_count + 1); + assert_eq!(a.sequence_spec[0].read_id, "I1"); // inserted at beginning + assert_eq!(a.sequence_spec[0].modality, "rna"); // modality set + } + + #[test] + fn test_assay_insert_reads_after() { + let mut a = sample_assay(); + let new_read = Read::new( + 
"I1".into(), + "Index 1".into(), + "".into(), + "p".into(), + 8, + 8, + "pos".into(), + vec![], + ); + a.insert_reads(vec![new_read], "rna", Some("R1")).unwrap(); + + let r1_pos = a + .sequence_spec + .iter() + .position(|r| r.read_id == "R1") + .unwrap(); + let i1_pos = a + .sequence_spec + .iter() + .position(|r| r.read_id == "I1") + .unwrap(); + assert_eq!(i1_pos, r1_pos + 1); + } + + #[test] + fn test_assay_insert_reads_invalid_modality() { + let mut a = sample_assay(); + let new_read = Read::new( + "I1".into(), + "Index 1".into(), + "".into(), + "p".into(), + 8, + 8, + "pos".into(), + vec![], + ); + let result = a.insert_reads(vec![new_read], "atac", None); + assert!(result.is_err()); + } + + #[test] + fn test_assay_json_roundtrip() { + let a = sample_assay(); + let json = a.to_json().unwrap(); + let a2 = Assay::from_json(&json).unwrap(); + assert_eq!(a, a2); + } + + #[test] + fn test_assay_yaml_roundtrip() { + let a = sample_assay(); + let bytes = a.to_bytes().unwrap(); + let a2 = Assay::from_bytes(&bytes).unwrap(); + assert_eq!(a.assay_id, a2.assay_id); + assert_eq!(a.modalities, a2.modalities); + assert_eq!(a.sequence_spec.len(), a2.sequence_spec.len()); + } + + #[test] + fn test_assay_repr() { + let a = sample_assay(); + let repr = a.__repr__(); + assert_eq!(repr, "Assay: test_assay Modalities: [\"rna\"]"); + } + + // ---- Real spec tests ---- + + #[test] + fn test_load_dogma_spec() { + let spec = dogma_spec(); + assert_eq!(spec.assay_id, "DOGMAseq-DIG"); + } + + #[test] + fn test_dogma_list_modalities() { + let spec = dogma_spec(); + let mods = spec.list_modalities(); + assert!(mods.contains(&"rna".to_string())); + assert!(mods.contains(&"atac".to_string())); + assert!(mods.contains(&"protein".to_string())); + assert!(mods.contains(&"tag".to_string())); + assert_eq!(mods.len(), 4); + } + + #[test] + fn test_dogma_get_libspec_rna() { + let spec = dogma_spec(); + let lib = spec.get_libspec("rna").expect("rna modality"); + assert_eq!(lib.region_id, "rna"); + 
} + + #[test] + fn test_dogma_get_seqspec_rna() { + let spec = dogma_spec(); + let reads = spec.get_seqspec("rna"); + assert_eq!(reads.len(), 2); + assert_eq!(reads[0].read_id, "rna_R1"); + assert_eq!(reads[1].read_id, "rna_R2"); + for r in &reads { + assert_eq!(r.modality, "rna"); + } + } + + #[test] + fn test_dogma_get_read() { + let spec = dogma_spec(); + // DOGMAseq has reads like "rna_R1", "rna_R2", "atac_R1", etc. + // Find the first rna read + let rna_reads = spec.get_seqspec("rna"); + assert!(!rna_reads.is_empty()); + let read_id = &rna_reads[0].read_id; + let found = spec.get_read(read_id); + assert!(found.is_some()); + assert_eq!(&found.unwrap().read_id, read_id); + } + + // ---- Group ID methods ---- + + #[test] + fn test_generate_group_ids() { + let a = sample_assay(); + let ids = a.generate_group_ids("rna"); + // Both reads have 1 file each, so group_ids = [0] + assert_eq!(ids, vec![0]); + } + + #[test] + fn test_generate_group_ids_empty() { + let a = sample_assay(); + let ids = a.generate_group_ids("nonexistent"); + assert!(ids.is_empty()); + } + + #[test] + fn test_get_read_by_group_id() { + let a = sample_assay(); + let read_id = a.get_read_by_group_id("rna", 0); + assert!(read_id.is_some()); + assert_eq!(read_id.unwrap(), "R1"); + } + + #[test] + fn test_get_read_by_group_id_wraps() { + let a = sample_assay(); + // group_id 1 should wrap to read index 1 + let read_id = a.get_read_by_group_id("rna", 1); + assert!(read_id.is_some()); + assert_eq!(read_id.unwrap(), "R2"); + } + + #[test] + fn test_get_files_by_group_id() { + let a = sample_assay(); + let files = a.get_files_by_group_id("rna", 0); + assert!(files.is_some()); + assert_eq!(files.unwrap().len(), 1); + } + + #[test] + fn test_get_files_by_group_id_not_found() { + let a = sample_assay(); + let files = a.get_files_by_group_id("nonexistent", 0); + assert!(files.is_none()); + } +} diff --git a/src/models/coordinate.rs b/src/models/coordinate.rs new file mode 100644 index 00000000..0943f14e --- 
/dev/null +++ b/src/models/coordinate.rs @@ -0,0 +1,17 @@ +use serde::{Deserialize, Serialize}; + +use crate::models::region::RegionCoordinate; + +#[derive(Debug, Serialize, Deserialize)] +pub struct Coordinate { + pub query_id: String, + pub query_name: String, + pub query_type: String, + pub rcv: Vec, + #[serde(default = "default_strand")] + pub strand: String, +} + +fn default_strand() -> String { + "pos".to_string() +} diff --git a/src/models/file.rs b/src/models/file.rs new file mode 100644 index 00000000..6d5808c2 --- /dev/null +++ b/src/models/file.rs @@ -0,0 +1,104 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct File { + pub file_id: String, + pub filename: String, + pub filetype: String, + pub filesize: i64, + pub url: String, + pub urltype: String, + pub md5: String, +} + +impl File { + pub fn new( + file_id: String, + filename: String, + filetype: String, + filesize: i64, + url: String, + urltype: String, + md5: String, + ) -> Self { + Self { + file_id, + filename, + filetype, + filesize, + url, + urltype, + md5, + } + } + + pub fn from_json(json_str: &str) -> Result { + serde_json::from_str(json_str) + } + + pub fn to_json(&self) -> Result { + serde_json::to_string(self) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sample_file() -> File { + File::new( + "file1".into(), + "reads_R1.fastq.gz".into(), + "fastq".into(), + 1024, + "reads_R1.fastq.gz".into(), + "local".into(), + "abc123".into(), + ) + } + + #[test] + fn test_file_creation() { + let f = sample_file(); + assert_eq!(f.file_id, "file1"); + assert_eq!(f.filename, "reads_R1.fastq.gz"); + assert_eq!(f.filetype, "fastq"); + assert_eq!(f.filesize, 1024); + assert_eq!(f.url, "reads_R1.fastq.gz"); + assert_eq!(f.urltype, "local"); + assert_eq!(f.md5, "abc123"); + } + + #[test] + fn test_file_json_roundtrip() { + let f = sample_file(); + let json = f.to_json().unwrap(); + let f2 = File::from_json(&json).unwrap(); + 
assert_eq!(f, f2); + } + + #[test] + fn test_file_partial_eq() { + let f1 = sample_file(); + let f2 = sample_file(); + assert_eq!(f1, f2); + + let f3 = File::new( + "file2".into(), + "other.fq".into(), + "fastq".into(), + 0, + "".into(), + "local".into(), + "".into(), + ); + assert_ne!(f1, f3); + } + + #[test] + fn test_file_clone() { + let f1 = sample_file(); + let f2 = f1.clone(); + assert_eq!(f1, f2); + } +} diff --git a/src/models/mod.rs b/src/models/mod.rs new file mode 100644 index 00000000..f50b59fc --- /dev/null +++ b/src/models/mod.rs @@ -0,0 +1,6 @@ +pub mod assay; +pub mod coordinate; +pub mod file; +pub mod onlist; +pub mod read; +pub mod region; diff --git a/src/models/onlist.rs b/src/models/onlist.rs new file mode 100644 index 00000000..fbf64525 --- /dev/null +++ b/src/models/onlist.rs @@ -0,0 +1,85 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct Onlist { + pub file_id: String, + pub filename: String, + pub filetype: String, + pub filesize: i64, + pub url: String, + pub urltype: String, + pub md5: String, +} + +impl Onlist { + pub fn new( + file_id: String, + filename: String, + filetype: String, + filesize: i64, + url: String, + urltype: String, + md5: String, + ) -> Self { + Self { + file_id, + filename, + filetype, + filesize, + url, + urltype, + md5, + } + } + + pub fn from_json(json_str: &str) -> Result { + serde_json::from_str(json_str) + } + pub fn to_json(&self) -> Result { + serde_json::to_string(self) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sample_onlist() -> Onlist { + Onlist::new( + "ol1".into(), + "barcodes.txt".into(), + "txt".into(), + 1024, + "barcodes.txt".into(), + "local".into(), + "abc123".into(), + ) + } + + #[test] + fn test_onlist_creation() { + let ol = sample_onlist(); + assert_eq!(ol.file_id, "ol1"); + assert_eq!(ol.filename, "barcodes.txt"); + assert_eq!(ol.filetype, "txt"); + assert_eq!(ol.filesize, 1024); + assert_eq!(ol.url, 
"barcodes.txt"); + assert_eq!(ol.urltype, "local"); + assert_eq!(ol.md5, "abc123"); + } + + #[test] + fn test_onlist_json_roundtrip() { + let ol = sample_onlist(); + let json = ol.to_json().unwrap(); + let ol2 = Onlist::from_json(&json).unwrap(); + assert_eq!(ol, ol2); + } + + #[test] + fn test_onlist_clone() { + let ol = sample_onlist(); + let ol2 = ol.clone(); + assert_eq!(ol, ol2); + } +} diff --git a/src/models/read.rs b/src/models/read.rs new file mode 100644 index 00000000..0937bb56 --- /dev/null +++ b/src/models/read.rs @@ -0,0 +1,281 @@ +use crate::models::file::File; +use crate::models::region::RegionCoordinate; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct Read { + pub read_id: String, + pub name: String, + pub modality: String, + pub primer_id: String, + pub min_len: i64, + pub max_len: i64, + /// "pos" | "neg" + pub strand: String, + pub files: Vec, +} + +impl Read { + pub fn new( + read_id: String, + name: String, + modality: String, + primer_id: String, + min_len: i64, + max_len: i64, + strand: String, + files: Vec, + ) -> Self { + Self { + read_id, + name, + modality, + primer_id, + min_len, + max_len, + strand, + files, + } + } + + pub fn from_json(json_str: &str) -> Result { + serde_json::from_str(json_str) + } + + pub fn to_json(&self) -> Result { + serde_json::to_string(self) + } + + pub fn update_files(&mut self, files: Vec) { + self.files = files; + } + + pub fn update_read_by_id( + &mut self, + read_id: Option, + name: Option, + modality: Option, + primer_id: Option, + min_len: Option, + max_len: Option, + strand: Option, + files: Option>, + ) { + if let Some(v) = read_id { + self.read_id = v; + } + if let Some(v) = name { + self.name = v; + } + if let Some(v) = modality { + self.modality = v; + } + if let Some(v) = primer_id { + self.primer_id = v; + } + if let Some(v) = min_len { + self.min_len = v; + } + if let Some(v) = max_len { + self.max_len = v; + } + if let Some(v) = 
strand { + self.strand = v; + } + if let Some(v) = files { + self.files = v; + } + } + + /// Return self if any File has matching file_id, else None. + pub fn get_read_by_file_id(&self, file_id: &str) -> Option { + if self.files.iter().any(|f| f.file_id == file_id) { + Some(self.clone()) + } else { + None + } + } + + pub fn repr(&self) -> String { + let sign = if self.strand == "pos" { "+" } else { "-" }; + format!( + "{sign}({}, {}){}:{}", + self.min_len, self.max_len, self.read_id, self.primer_id + ) + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ReadCoordinate { + pub read: Read, + pub rcv: Vec, // rcv: "read coordinate vector" +} + +impl ReadCoordinate { + pub fn new(read: Read, rcv: Vec) -> Self { + Self { read, rcv } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sample_file() -> File { + File::new( + "file1".into(), + "R1.fq.gz".into(), + "fastq".into(), + 1024, + "R1.fq.gz".into(), + "local".into(), + "".into(), + ) + } + + fn sample_read() -> Read { + Read::new( + "test_read".into(), + "Test Read".into(), + "rna".into(), + "test_primer".into(), + 100, + 150, + "pos".into(), + vec![sample_file()], + ) + } + + #[test] + fn test_read_creation() { + let r = sample_read(); + assert_eq!(r.read_id, "test_read"); + assert_eq!(r.name, "Test Read"); + assert_eq!(r.modality, "rna"); + assert_eq!(r.primer_id, "test_primer"); + assert_eq!(r.min_len, 100); + assert_eq!(r.max_len, 150); + assert_eq!(r.strand, "pos"); + assert_eq!(r.files.len(), 1); + } + + #[test] + fn test_read_update_files() { + let mut r = sample_read(); + let new_files = vec![File::new( + "f2".into(), + "R2.fq".into(), + "fastq".into(), + 0, + "".into(), + "local".into(), + "".into(), + )]; + r.update_files(new_files); + assert_eq!(r.files.len(), 1); + assert_eq!(r.files[0].file_id, "f2"); + } + + #[test] + fn test_read_update_by_id_partial() { + let mut r = sample_read(); + r.update_read_by_id( + None, + Some("Updated Name".into()), + None, + None, + Some(200), + 
None, + None, + None, + ); + assert_eq!(r.read_id, "test_read"); // unchanged + assert_eq!(r.name, "Updated Name"); + assert_eq!(r.min_len, 200); + assert_eq!(r.max_len, 150); // unchanged + } + + #[test] + fn test_read_update_by_id_none_keeps_original() { + let mut r = sample_read(); + r.update_read_by_id(None, None, None, None, None, None, None, None); + assert_eq!(r.read_id, "test_read"); + assert_eq!(r.name, "Test Read"); + assert_eq!(r.modality, "rna"); + } + + #[test] + fn test_read_get_by_file_id_found() { + let r = sample_read(); + let result = r.get_read_by_file_id("file1"); + assert!(result.is_some()); + assert_eq!(result.unwrap().read_id, "test_read"); + } + + #[test] + fn test_read_get_by_file_id_not_found() { + let r = sample_read(); + assert!(r.get_read_by_file_id("nonexistent").is_none()); + } + + #[test] + fn test_read_repr_pos() { + let r = sample_read(); + let repr = r.repr(); + assert_eq!(repr, "+(100, 150)test_read:test_primer"); + } + + #[test] + fn test_read_repr_neg() { + let mut r = sample_read(); + r.strand = "neg".into(); + let repr = r.repr(); + assert_eq!(repr, "-(100, 150)test_read:test_primer"); + } + + #[test] + fn test_read_json_roundtrip() { + let r = sample_read(); + let json = r.to_json().unwrap(); + let r2 = Read::from_json(&json).unwrap(); + assert_eq!(r, r2); + } + + #[test] + fn test_read_coordinate_creation() { + let r = sample_read(); + let region = crate::models::region::Region::new( + "bc".into(), + "barcode".into(), + "barcode".into(), + "fixed".into(), + "ATCG".into(), + 4, + 4, + None, + vec![], + ); + let rc = crate::models::region::RegionCoordinate::new(region, 0, 4); + let read_coord = ReadCoordinate::new(r.clone(), vec![rc]); + assert_eq!(read_coord.read.read_id, "test_read"); + assert_eq!(read_coord.rcv.len(), 1); + } + + #[test] + fn test_read_real_spec_properties() { + let spec = crate::utils::load_spec(&std::path::PathBuf::from("tests/fixtures/spec.yaml")); + let rna_reads = spec.get_seqspec("rna"); + 
assert_eq!(rna_reads.len(), 2); + // rna_R1 + assert_eq!(rna_reads[0].read_id, "rna_R1"); + assert_eq!(rna_reads[0].modality, "rna"); + assert_eq!(rna_reads[0].strand, "pos"); + assert_eq!(rna_reads[0].min_len, 28); + assert_eq!(rna_reads[0].max_len, 28); + // rna_R2 + assert_eq!(rna_reads[1].read_id, "rna_R2"); + assert_eq!(rna_reads[1].strand, "neg"); + assert_eq!(rna_reads[1].min_len, 102); + assert_eq!(rna_reads[1].max_len, 102); + } +} diff --git a/src/models/region.rs b/src/models/region.rs new file mode 100644 index 00000000..eff2bf1d --- /dev/null +++ b/src/models/region.rs @@ -0,0 +1,1078 @@ +use serde::{Deserialize, Serialize}; + +use crate::models::onlist::Onlist; +use crate::utils::complement_seq; + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct Region { + pub region_id: String, + pub region_type: String, // keep String for simplicity + pub name: String, + pub sequence_type: String, // "fixed" | "random" | "onlist" | "joined" + pub sequence: String, + pub min_len: i64, + pub max_len: i64, + pub onlist: Option, + pub regions: Vec, +} + +impl Region { + pub fn new( + region_id: String, + region_type: String, + name: String, + sequence_type: String, + sequence: String, + min_len: i64, + max_len: i64, + onlist: Option, + regions: Vec, + ) -> Self { + Self { + region_id, + region_type, + name, + sequence_type, + sequence, + min_len, + max_len, + onlist, + regions, + } + } + + // ---- JSON I/O --------------------------------------------------- + pub fn from_json(json_str: &str) -> Result { + serde_json::from_str(json_str) + } + pub fn to_json(&self) -> Result { + serde_json::to_string(self) + } + + // ---- Core helpers ----------------------------------------------- + pub fn get_sequence(&self) -> String { + if self.regions.is_empty() { + if self.sequence.is_empty() { + "X".repeat(self.min_len as usize) + } else { + self.sequence.clone() + } + } else { + let mut s = String::new(); + for r in &self.regions { + 
s.push_str(&r.get_sequence()); + } + s + } + } + + pub fn get_len(&self) -> (i64, i64) { + if self.regions.is_empty() { + (self.min_len, self.max_len) + } else { + let mut mn = 0i64; + let mut mx = 0i64; + for r in &self.regions { + let (c_min, c_max) = r.get_len(); + mn += c_min; + mx += c_max; + } + (mn, mx) + } + } + + pub fn update_attr(&mut self) { + for r in &mut self.regions { + r.update_attr(); + } + let (mn, mx) = self.get_len(); + self.min_len = mn; + self.max_len = mx; + + self.sequence = match self.sequence_type.as_str() { + "random" => "X".repeat(self.min_len as usize), + "onlist" => "N".repeat(self.min_len as usize), + _ => self.get_sequence(), + }; + } + + // ---- Queries ---------------------------------------------------- + pub fn get_region_by_id(&self, region_id: &str) -> Vec { + let mut found = Vec::new(); + if self.region_id == region_id { + found.push(self.clone()); + } + for r in &self.regions { + found.extend(r.get_region_by_id(region_id)); + } + found + } + + pub fn get_region_by_region_type(&self, region_type: &str) -> Vec { + let mut found = Vec::new(); + if self.region_type == region_type { + found.push(self.clone()); + } + for r in &self.regions { + found.extend(r.get_region_by_region_type(region_type)); + } + found + } + + pub fn get_onlist_regions(&self) -> Vec { + let mut found = Vec::new(); + if self.onlist.is_some() { + found.push(self.clone()); + } + for r in &self.regions { + found.extend(r.get_onlist_regions()); + } + found + } + + pub fn get_onlist(&self) -> Option { + self.onlist.clone() + } + + pub fn get_leaves(&self) -> Vec { + let mut leaves = Vec::new(); + if self.regions.is_empty() { + leaves.push(self.clone()); + } else { + for r in &self.regions { + leaves.extend(r.get_leaves()); + } + } + leaves + } + + pub fn get_leaves_with_region_id(&self, region_id: &str) -> Vec { + let mut leaves = Vec::new(); + if self.region_id == region_id { + // if it matches, include this node (don’t descend) + leaves.push(self.clone()); + } 
else if self.regions.is_empty() { + // if atomic, include it + leaves.push(self.clone()); + } else { + for r in &self.regions { + leaves.extend(r.get_leaves_with_region_id(region_id)); + } + } + leaves + } + + pub fn get_leaf_region_types(&self) -> Vec { + use std::collections::BTreeSet; + let mut set = BTreeSet::new(); + for r in self.get_leaves() { + set.insert(r.region_type.clone()); + } + set.into_iter().collect() + } + + pub fn to_newick(&self) -> String { + if self.regions.is_empty() { + format!("'{}:{}'", self.region_id, self.max_len) + } else { + let inner: Vec = self.regions.iter().map(|r| r.to_newick()).collect(); + format!("({}){}", inner.join(","), self.region_id) + } + } + + // ---- Mutations -------------------------------------------------- + pub fn update_region( + &mut self, + region_id: String, + region_type: String, + name: String, + sequence_type: String, + sequence: String, + min_len: i64, + max_len: i64, + onlist: Option, + ) { + self.region_id = region_id; + self.region_type = region_type; + self.name = name; + self.sequence_type = sequence_type; + self.sequence = sequence; + self.min_len = min_len; + self.max_len = max_len; + self.onlist = onlist; + } + + pub fn update_region_by_id( + &mut self, + target_region_id: String, + region_id: Option, + region_type: Option, + name: Option, + sequence_type: Option, + sequence: Option, + min_len: Option, + max_len: Option, + ) { + if self.region_id == target_region_id { + if let Some(v) = region_id { + self.region_id = v; + } + if let Some(v) = region_type { + self.region_type = v; + } + if let Some(v) = name { + self.name = v; + } + if let Some(v) = sequence_type { + self.sequence_type = v; + } + if let Some(v) = sequence { + self.sequence = v; + } + if let Some(v) = min_len { + self.min_len = v; + } + if let Some(v) = max_len { + self.max_len = v; + } + return; + } + for r in &mut self.regions { + r.update_region_by_id( + target_region_id.clone(), + region_id.clone(), + region_type.clone(), + 
name.clone(), + sequence_type.clone(), + sequence.clone(), + min_len, + max_len, + ); + } + } + + pub fn reverse(&mut self) { + if self.regions.is_empty() { + self.sequence = self.sequence.chars().rev().collect(); + } else { + // preserve left-to-right topology; reverse inside each child + for r in &mut self.regions { + r.reverse(); + } + } + } + + pub fn complement(&mut self) { + if self.regions.is_empty() { + self.sequence = complement_seq(&self.sequence); + } else { + for r in &mut self.regions { + r.complement(); + } + } + } + + pub fn repr(&self) -> String { + format!("{}({}, {})", self.region_type, self.min_len, self.max_len) + } +} + +/// Region + half-open coordinates [start, stop) +/// (Python: RegionCoordinate(Region) with start/stop) +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct RegionCoordinate { + /// Flatten so JSON/YAML has Region fields at top-level (like inheritance) + #[serde(flatten)] + pub region: Region, + pub start: i64, + pub stop: i64, +} + +impl RegionCoordinate { + pub fn new(region: Region, start: i64, stop: i64) -> Self { + Self { + region, + start, + stop, + } + } + + pub fn repr(&self) -> String { + format!("{}({}, {})", self.region.region_type, self.start, self.stop) + } + + pub fn display_string(&self) -> String { + format!( + "RegionCoordinate {} [{}]: [{}, {})", + self.region.name, self.region.region_type, self.start, self.stop + ) + } + + /// Compute the "difference" interval per Python __sub__ logic. + /// Returns a new RegionCoordinate with region_type="difference", + /// sequence_type="diff", and sequence = "X" * len. + pub fn difference(&self, other: &Self) -> Option { + let (new_start, new_stop) = if self.stop <= other.start { + (self.stop, other.start) // self .. other gap + } else if other.stop <= self.start { + (other.stop, self.start) // other .. 
self gap + } else if self.start == other.start && self.stop == other.stop { + (self.start, self.stop) // identical intervals + } else { + return None; // overlapping but not identical + }; + + let len = (new_stop - new_start) as usize; // guaranteed >= 0 here + let seq = "X".repeat(len); + + let new_region = Region { + region_id: format!("{} - {}", self.region.region_id, other.region.region_id), + region_type: "difference".to_string(), + name: format!("{} - {}", self.region.name, other.region.name), + sequence_type: "diff".to_string(), + sequence: seq, + min_len: (new_stop - new_start) as i64, + max_len: (new_stop - new_start) as i64, + onlist: None, + regions: Vec::new(), + }; + + Some(Self { + region: new_region, + start: new_start, + stop: new_stop, + }) + } +} + +/// Python: RegionCoordinateDifference +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct RegionCoordinateDifference { + pub obj: RegionCoordinate, + pub fixed: RegionCoordinate, + pub rgncdiff: RegionCoordinate, + /// "", "-", or "+" + #[serde(default)] + pub loc: String, +} + +impl RegionCoordinateDifference { + pub fn new(obj: RegionCoordinate, fixed: RegionCoordinate, rgncdiff: RegionCoordinate) -> Self { + let loc = if obj.stop <= fixed.start { + "-".to_string() + } else if obj.start >= fixed.stop { + "+".to_string() + } else { + "".to_string() + }; + Self { + obj, + fixed, + rgncdiff, + loc, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + use std::path::PathBuf; + + fn leaf(id: &str, seq: &str, len: i64) -> Region { + Region::new( + id.into(), + "barcode".into(), + id.into(), + "fixed".into(), + seq.into(), + len, + len, + None, + vec![], + ) + } + + fn joined(id: &str, children: Vec) -> Region { + Region::new( + id.into(), + "joined".into(), + id.into(), + "joined".into(), + "".into(), + 0, + 0, + None, + children, + ) + } + + fn dogma_spec() -> crate::models::assay::Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } 
+ + // ---- Creation ---- + + #[test] + fn test_region_creation() { + let r = leaf("bc", "ATCG", 4); + assert_eq!(r.region_id, "bc"); + assert_eq!(r.region_type, "barcode"); + assert_eq!(r.sequence_type, "fixed"); + assert_eq!(r.sequence, "ATCG"); + assert_eq!(r.min_len, 4); + assert_eq!(r.max_len, 4); + assert!(r.onlist.is_none()); + assert!(r.regions.is_empty()); + } + + // ---- get_sequence ---- + + #[test] + fn test_get_sequence_simple() { + let r = leaf("bc", "ATCG", 4); + assert_eq!(r.get_sequence(), "ATCG"); + } + + #[test] + fn test_get_sequence_empty() { + let r = Region::new( + "bc".into(), + "barcode".into(), + "bc".into(), + "random".into(), + "".into(), + 8, + 8, + None, + vec![], + ); + assert_eq!(r.get_sequence(), "XXXXXXXX"); + } + + #[test] + fn test_get_sequence_nested() { + let parent = joined("parent", vec![leaf("a", "AAAA", 4), leaf("b", "CCCC", 4)]); + assert_eq!(parent.get_sequence(), "AAAACCCC"); + } + + // ---- get_len ---- + + #[test] + fn test_get_len_simple() { + let r = Region::new( + "r".into(), + "umi".into(), + "r".into(), + "random".into(), + "".into(), + 10, + 12, + None, + vec![], + ); + assert_eq!(r.get_len(), (10, 12)); + } + + #[test] + fn test_get_len_nested() { + let parent = joined( + "parent", + vec![ + leaf("a", "AAAA", 4), + Region::new( + "b".into(), + "umi".into(), + "b".into(), + "random".into(), + "".into(), + 10, + 12, + None, + vec![], + ), + ], + ); + assert_eq!(parent.get_len(), (14, 16)); + } + + // ---- update_attr ---- + + #[test] + fn test_update_attr_fixed() { + let mut parent = joined("parent", vec![leaf("a", "AAAA", 4), leaf("b", "CCCC", 4)]); + parent.update_attr(); + assert_eq!(parent.min_len, 8); + assert_eq!(parent.max_len, 8); + assert_eq!(parent.sequence, "AAAACCCC"); + } + + #[test] + fn test_update_attr_random() { + let mut r = Region::new( + "r".into(), + "umi".into(), + "r".into(), + "random".into(), + "".into(), + 10, + 10, + None, + vec![], + ); + r.update_attr(); + assert_eq!(r.sequence, 
"XXXXXXXXXX"); + } + + #[test] + fn test_update_attr_onlist() { + let onlist = Onlist::new( + "ol".into(), + "list.txt".into(), + "txt".into(), + 0, + "list.txt".into(), + "local".into(), + "".into(), + ); + let mut r = Region::new( + "r".into(), + "barcode".into(), + "r".into(), + "onlist".into(), + "".into(), + 16, + 16, + Some(onlist), + vec![], + ); + r.update_attr(); + assert_eq!(r.sequence, "NNNNNNNNNNNNNNNN"); + assert_eq!(r.sequence.len(), 16); + } + + // ---- Queries ---- + + #[test] + fn test_get_region_by_id() { + let parent = joined( + "parent", + vec![leaf("target", "ATCG", 4), leaf("other", "GGGG", 4)], + ); + let found = parent.get_region_by_id("target"); + assert_eq!(found.len(), 1); + assert_eq!(found[0].region_id, "target"); + } + + #[test] + fn test_get_region_by_id_not_found() { + let r = leaf("bc", "ATCG", 4); + assert!(r.get_region_by_id("missing").is_empty()); + } + + #[test] + fn test_get_region_by_region_type() { + let parent = joined( + "parent", + vec![ + leaf("a", "AAAA", 4), + Region::new( + "u".into(), + "umi".into(), + "u".into(), + "random".into(), + "".into(), + 10, + 10, + None, + vec![], + ), + ], + ); + let barcodes = parent.get_region_by_region_type("barcode"); + assert_eq!(barcodes.len(), 1); + assert_eq!(barcodes[0].region_id, "a"); + + let umis = parent.get_region_by_region_type("umi"); + assert_eq!(umis.len(), 1); + } + + #[test] + fn test_get_onlist_regions() { + let onlist = Onlist::new( + "ol".into(), + "list.txt".into(), + "txt".into(), + 0, + "".into(), + "local".into(), + "".into(), + ); + let parent = joined( + "parent", + vec![ + Region::new( + "bc".into(), + "barcode".into(), + "bc".into(), + "onlist".into(), + "".into(), + 16, + 16, + Some(onlist), + vec![], + ), + leaf("other", "AAAA", 4), + ], + ); + let onlist_regions = parent.get_onlist_regions(); + assert_eq!(onlist_regions.len(), 1); + assert_eq!(onlist_regions[0].region_id, "bc"); + } + + #[test] + fn test_get_onlist() { + let r = leaf("bc", "ATCG", 4); + 
assert!(r.get_onlist().is_none()); + + let onlist = Onlist::new( + "ol".into(), + "list.txt".into(), + "txt".into(), + 0, + "".into(), + "local".into(), + "".into(), + ); + let r2 = Region::new( + "bc".into(), + "barcode".into(), + "bc".into(), + "onlist".into(), + "".into(), + 16, + 16, + Some(onlist.clone()), + vec![], + ); + assert_eq!(r2.get_onlist().unwrap(), onlist); + } + + #[test] + fn test_get_leaves() { + let parent = joined( + "parent", + vec![ + leaf("a", "AAAA", 4), + joined("inner", vec![leaf("b", "CCCC", 4), leaf("c", "GGGG", 4)]), + ], + ); + let leaves = parent.get_leaves(); + assert_eq!(leaves.len(), 3); + assert_eq!(leaves[0].region_id, "a"); + assert_eq!(leaves[1].region_id, "b"); + assert_eq!(leaves[2].region_id, "c"); + } + + #[test] + fn test_get_leaves_with_region_id() { + let parent = joined( + "parent", + vec![ + leaf("a", "AAAA", 4), + joined("inner", vec![leaf("b", "CCCC", 4), leaf("c", "GGGG", 4)]), + ], + ); + // Stops at "inner" and includes it instead of descending + let leaves = parent.get_leaves_with_region_id("inner"); + assert_eq!(leaves.len(), 2); + assert_eq!(leaves[0].region_id, "a"); + assert_eq!(leaves[1].region_id, "inner"); + } + + #[test] + fn test_get_leaf_region_types() { + let parent = joined( + "parent", + vec![ + leaf("a", "AAAA", 4), // barcode + Region::new( + "u".into(), + "umi".into(), + "u".into(), + "random".into(), + "".into(), + 10, + 10, + None, + vec![], + ), + leaf("b", "CCCC", 4), // barcode + ], + ); + let types = parent.get_leaf_region_types(); + assert_eq!(types, vec!["barcode", "umi"]); + } + + // ---- Newick ---- + + #[test] + fn test_to_newick_leaf() { + let r = Region::new( + "bc".into(), + "barcode".into(), + "bc".into(), + "fixed".into(), + "ATCG".into(), + 4, + 4, + None, + vec![], + ); + assert_eq!(r.to_newick(), "'bc:4'"); + } + + #[test] + fn test_to_newick_nested() { + let parent = joined("parent", vec![leaf("a", "AAAA", 4), leaf("b", "CCCC", 4)]); + assert_eq!(parent.to_newick(), 
"('a:4','b:4')parent"); + } + + // ---- Mutations ---- + + #[test] + fn test_reverse_leaf() { + let mut r = leaf("bc", "ATCG", 4); + r.reverse(); + assert_eq!(r.sequence, "GCTA"); + } + + #[test] + fn test_reverse_nested() { + let mut parent = joined("parent", vec![leaf("a", "ATCG", 4), leaf("b", "GGCC", 4)]); + parent.reverse(); + assert_eq!(parent.regions[0].sequence, "GCTA"); + assert_eq!(parent.regions[1].sequence, "CCGG"); + } + + #[test] + fn test_complement_leaf() { + let mut r = leaf("bc", "ATCG", 4); + r.complement(); + assert_eq!(r.sequence, "TAGC"); + } + + #[test] + fn test_complement_nested() { + let mut parent = joined("parent", vec![leaf("a", "ATCG", 4), leaf("b", "AAAA", 4)]); + parent.complement(); + assert_eq!(parent.regions[0].sequence, "TAGC"); + assert_eq!(parent.regions[1].sequence, "TTTT"); + } + + #[test] + fn test_update_region() { + let mut r = leaf("old", "ATCG", 4); + r.update_region( + "new".into(), + "umi".into(), + "New Name".into(), + "random".into(), + "XXXX".into(), + 4, + 4, + None, + ); + assert_eq!(r.region_id, "new"); + assert_eq!(r.region_type, "umi"); + assert_eq!(r.name, "New Name"); + assert_eq!(r.sequence_type, "random"); + assert_eq!(r.sequence, "XXXX"); + } + + #[test] + fn test_update_region_by_id() { + let mut parent = joined( + "parent", + vec![leaf("target", "ATCG", 4), leaf("other", "GGGG", 4)], + ); + parent.update_region_by_id( + "target".into(), + None, + None, + Some("Updated Name".into()), + None, + Some("CCCC".into()), + None, + None, + ); + assert_eq!(parent.regions[0].region_id, "target"); // unchanged + assert_eq!(parent.regions[0].name, "Updated Name"); + assert_eq!(parent.regions[0].sequence, "CCCC"); + } + + #[test] + fn test_update_region_by_id_none_keeps_original() { + let mut r = leaf("bc", "ATCG", 4); + r.update_region_by_id("bc".into(), None, None, None, None, None, None, None); + assert_eq!(r.region_id, "bc"); + assert_eq!(r.name, "bc"); + assert_eq!(r.sequence, "ATCG"); + } + + #[test] + fn 
test_region_repr() { + let r = Region::new( + "bc".into(), + "barcode".into(), + "bc".into(), + "fixed".into(), + "ATCG".into(), + 16, + 16, + None, + vec![], + ); + assert_eq!(r.repr(), "barcode(16, 16)"); + } + + #[test] + fn test_region_json_roundtrip() { + let r = leaf("bc", "ATCG", 4); + let json = r.to_json().unwrap(); + let r2 = Region::from_json(&json).unwrap(); + assert_eq!(r, r2); + } + + // ---- RegionCoordinate ---- + + #[test] + fn test_region_coordinate_creation() { + let r = leaf("bc", "ATCG", 4); + let rc = RegionCoordinate::new(r.clone(), 0, 4); + assert_eq!(rc.start, 0); + assert_eq!(rc.stop, 4); + assert_eq!(rc.region.region_id, "bc"); + } + + #[test] + fn test_region_coordinate_repr() { + let r = leaf("bc", "ATCG", 4); + let rc = RegionCoordinate::new(r, 10, 20); + assert_eq!(rc.repr(), "barcode(10, 20)"); + } + + #[test] + fn test_region_coordinate_difference_gap() { + let r1 = leaf("a", "AAAA", 4); + let r2 = leaf("b", "CCCC", 4); + let rc1 = RegionCoordinate::new(r1, 0, 4); + let rc2 = RegionCoordinate::new(r2, 10, 14); + + let diff = rc1.difference(&rc2).unwrap(); + assert_eq!(diff.start, 4); + assert_eq!(diff.stop, 10); + assert_eq!(diff.region.region_type, "difference"); + assert_eq!(diff.region.sequence, "XXXXXX"); + } + + #[test] + fn test_region_coordinate_difference_identical() { + let r1 = leaf("a", "AAAA", 4); + let r2 = leaf("b", "CCCC", 4); + let rc1 = RegionCoordinate::new(r1, 5, 10); + let rc2 = RegionCoordinate::new(r2, 5, 10); + + let diff = rc1.difference(&rc2).unwrap(); + assert_eq!(diff.start, 5); + assert_eq!(diff.stop, 10); + } + + #[test] + fn test_region_coordinate_difference_overlap_returns_none() { + let r1 = leaf("a", "AAAA", 4); + let r2 = leaf("b", "CCCC", 4); + let rc1 = RegionCoordinate::new(r1, 0, 10); + let rc2 = RegionCoordinate::new(r2, 5, 15); + + assert!(rc1.difference(&rc2).is_none()); + } + + #[test] + fn test_region_coordinate_difference_loc() { + let r1 = leaf("a", "AAAA", 4); + let r2 = leaf("b", 
"CCCC", 4); + let r3 = leaf("d", "XXXX", 4); + + let obj = RegionCoordinate::new(r1.clone(), 0, 4); + let fixed = RegionCoordinate::new(r2, 10, 14); + let rgncdiff = RegionCoordinate::new(r3, 4, 10); + + let diff = RegionCoordinateDifference::new(obj, fixed.clone(), rgncdiff); + assert_eq!(diff.loc, "-"); // obj.stop <= fixed.start + + let obj2 = RegionCoordinate::new(r1, 20, 24); + let r4 = leaf("e", "YYYY", 4); + let rgncdiff2 = RegionCoordinate::new(r4, 14, 20); + let diff2 = RegionCoordinateDifference::new(obj2, fixed, rgncdiff2); + assert_eq!(diff2.loc, "+"); // obj.start >= fixed.stop + } + + // ---- Real spec tests ---- + + #[test] + fn test_get_sequence_real_spec() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let seq = rna_lib.get_sequence(); + assert_eq!(seq.len(), 197); + assert!(seq.starts_with("ACACTCTTTCCCTACACGACGCTCTTCCGATCT")); + assert!(seq.ends_with("AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC")); + // middle contains N (barcode) and X (UMI/cDNA) + assert!(seq.contains('N')); + assert!(seq.contains('X')); + } + + #[test] + fn test_get_len_real_spec() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let (mn, mx) = rna_lib.get_len(); + assert_eq!(mn, 197); + assert_eq!(mx, 197); + } + + #[test] + fn test_get_region_by_id_real_spec() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let found = rna_lib.get_region_by_id("rna_cell_bc"); + assert_eq!(found.len(), 1); + assert_eq!(found[0].region_id, "rna_cell_bc"); + assert_eq!(found[0].region_type, "barcode"); + assert_eq!(found[0].min_len, 16); + assert_eq!(found[0].max_len, 16); + assert_eq!(found[0].sequence, "NNNNNNNNNNNNNNNN"); + } + + #[test] + fn test_get_region_by_type_real_spec() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let barcodes = rna_lib.get_region_by_region_type("barcode"); + assert_eq!(barcodes.len(), 
1); + assert_eq!(barcodes[0].region_id, "rna_cell_bc"); + } + + #[test] + fn test_get_onlist_regions_real_spec() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let onlists = rna_lib.get_onlist_regions(); + assert_eq!(onlists.len(), 1); + assert_eq!(onlists[0].region_id, "rna_cell_bc"); + } + + #[test] + fn test_get_leaves_real_spec() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let leaves = rna_lib.get_leaves(); + assert_eq!(leaves.len(), 5); + let leaf_ids: Vec<&str> = leaves.iter().map(|l| l.region_id.as_str()).collect(); + assert_eq!( + leaf_ids, + vec![ + "rna_truseq_read1", + "rna_cell_bc", + "rna_umi", + "cdna", + "rna_truseq_read2" + ] + ); + } + + #[test] + fn test_get_leaf_region_types_real_spec() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let types = rna_lib.get_leaf_region_types(); + // Returns sorted via BTreeSet + assert_eq!( + types, + vec!["barcode", "cdna", "truseq_read1", "truseq_read2", "umi"] + ); + } + + #[test] + fn test_to_newick_real_spec() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let newick = rna_lib.to_newick(); + assert_eq!( + newick, + "('rna_truseq_read1:33','rna_cell_bc:16','rna_umi:12','cdna:102','rna_truseq_read2:34')rna" + ); + } + + #[test] + fn test_update_attr_real_spec() { + let spec = dogma_spec(); + let mut rna_lib = spec.get_libspec("rna").expect("rna modality").clone(); + let (mn_before, mx_before) = rna_lib.get_len(); + rna_lib.update_attr(); + // After update_attr, min/max should match computed values + assert_eq!(rna_lib.min_len, mn_before); + assert_eq!(rna_lib.max_len, mx_before); + assert_eq!(rna_lib.min_len, 197); + assert_eq!(rna_lib.max_len, 197); + assert_eq!(rna_lib.sequence_type, "joined"); + } + + #[test] + fn test_reverse_real_spec() { + let spec = dogma_spec(); + let rna_lib = 
spec.get_libspec("rna").expect("rna modality"); + // Get a leaf with a known sequence + let read1 = rna_lib.get_region_by_id("rna_truseq_read1"); + assert_eq!(read1.len(), 1); + let mut r = read1[0].clone(); + let orig_seq = r.sequence.clone(); + assert_eq!(orig_seq, "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"); + r.reverse(); + let expected: String = orig_seq.chars().rev().collect(); + assert_eq!(r.sequence, expected); + assert_eq!(r.sequence, "TCTAGCCTTCTCGCAGCACATCCCTTTCTCACA"); + } + + #[test] + fn test_complement_real_spec() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let read1 = rna_lib.get_region_by_id("rna_truseq_read1"); + assert_eq!(read1.len(), 1); + let mut r = read1[0].clone(); + assert_eq!(r.sequence, "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"); + r.complement(); + assert_eq!(r.sequence, "TGTGAGAAAGGGATGTGCTGCGAGAAGGCTAGA"); + } + + #[test] + fn test_reverse_complement_barcode() { + // Barcode with all N's: reverse and complement should both be N's + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").expect("rna modality"); + let bc = rna_lib.get_region_by_id("rna_cell_bc"); + assert_eq!(bc.len(), 1); + + let mut r = bc[0].clone(); + assert_eq!(r.sequence, "NNNNNNNNNNNNNNNN"); + r.reverse(); + assert_eq!(r.sequence, "NNNNNNNNNNNNNNNN"); + + let mut r2 = bc[0].clone(); + r2.complement(); + assert_eq!(r2.sequence, "NNNNNNNNNNNNNNNN"); + } +} diff --git a/src/py_module.rs b/src/py_module.rs new file mode 100644 index 00000000..f600eb4b --- /dev/null +++ b/src/py_module.rs @@ -0,0 +1,16 @@ +#[cfg(feature = "python-binding")] +use pyo3::prelude::*; + +#[cfg_attr(feature = "python-binding", pyo3::pymodule)] +fn _core(_py: Python<'_>, m: &Bound<'_, pyo3::types::PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) +} \ No newline at end of file 
diff --git a/src/schema/seqspec.schema.json b/src/schema/seqspec.schema.json new file mode 100644 index 00000000..5e9eb5a2 --- /dev/null +++ b/src/schema/seqspec.schema.json @@ -0,0 +1,583 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "Assay.schema.json", + "title": "Assay", + "description": "A Assay of DNA", + "type": "object", + "properties": { + "seqspec_version": { + "description": "Version of the seqspec specification used", + "type": "string", + "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" + }, + "assay_id": { + "description": "Identifier for the assay", + "type": "string" + }, + "name": { + "description": "The name of the assay", + "type": "string" + }, + "doi": { + "description": "the doi of the paper that describes the assay", + "type": "string" + }, + "date": { + "description": "The seqspec creation date", + "type": "string", + "pattern": "^(0?[1-9]|[12][0-9]|3[01])\\s(January|February|March|April|May|June|July|August|September|October|November|December)\\s(19|20)\\d\\d$" + }, + "description": { + "description": "A short description of the assay", + "type": "string" + }, + "modalities": { + "description": "The modalities the assay targets", + "type": "array", + "items": { + "type": "string", + "enum": [ + "dna", + "rna", + "tag", + "protein", + "atac", + "crispr" + ] + } + }, + "lib_struct": { + "description": "The link to Teichmann's libstructs page derived for this sequence", + "type": "string" + }, + "library_protocol": { + "description": "The protocol/machine/tool to generate the library insert", + "anyOf": [ + { + "type": "string", + "enum": [ + "imaging assay (OBI:0000185)", + "self-transcribing active regulatory region sequencing assay (OBI:0002041)", + "Hi-C assay (OBI:0002440)", + "single-nucleus ATAC-seq (OBI:0002762)", + "protein-protein interaction 
detection assay (OBI:0000288)", + "cas mediated mutagenesis (OBI:0003133)", + "bulk RNA-seq assay (OBI:0003090)", + "ChIP-seq assay (OBI:0000716)", + "bulk assay for transposase-accessible chromatin using sequencing (OBI:0003089)", + "single-nucleus RNA sequencing assay (OBI:0003109)", + "single-cell RNA sequencing assay (OBI:0002631)", + "micro-C assay (OBI:0003302)", + "transcription factor binding site identification by ChIP-Seq assay (OBI:0002019)", + "CRISPR screen (NTR:0000520)", + "flow cytometry assay (OBI:0000916)", + "single-nucleus methylCytosine Transcriptome sequencing (NTR:0000764)", + "DNA sequencing assay (OBI:0000626)", + "spatial transcriptomics (NTR:0000761)", + "single-cell ATAC-seq (OBI:0002764)", + "massively parallel reporter assay (OBI:0002675)", + "chromosome conformation capture-on-chip assay (OBI:0002458)", + "single nucleus methylation chromatin conformation capture seq (NTR:0000745)", + "in vitro CRISPR screen using flow cytometry (OBI:0003661)", + "in vitro CRISPR screen using single-cell RNA-seq (OBI:0003660)", + "Custom" + ] + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { + "type": "string" + }, + "name": { + "type": [ + "string", + "null" + ] + }, + "modality": { + "type": "string" + } + }, + "required": [ + "protocol_id", + "modality" + ], + "additionalProperties": false + }, + "minItems": 1 + } + ] + }, + "library_kit": { + "description": "The kit used to make the library sequence_protocol compatible", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { + "type": "string" + }, + "name": { + "type": [ + "string", + "null" + ] + }, + "modality": { + "type": "string" + } + }, + "required": [ + "kit_id", + "modality" + ], + "additionalProperties": false + }, + "minItems": 1 + } + ] + }, + "sequence_protocol": { + "description": "The protocol/machine/tool to generate sequences", + "anyOf": [ + { + "type": 
"string", + "enum": [ + "ONT GridION X5 (EFO:0008633)", + "Illumina NovaSeq 6000 (EFO:0008637)", + "PacBio Sequel system (EFO:0008630)", + "454 GS FLX sequencer (EFO:0004432)", + "Illumina HiSeq X (EFO:0008567)", + "Illumina NextSeq 500 (EFO:0009173)", + "Illumina Genome Analyzer II (EFO:0004201)", + "Visium Spatial Gene Expression (EFO:0010961)", + "AB SOLiD 4 System (EFO:0004438)", + "PacBio Sequel II System (EFO:0700015)", + "Illumina Genome Analyzer IIx (EFO:0004202)", + "ONT MinION (EFO:0008632)", + "ONT PromethION 2 Solo (EFO:0700019)", + "ONT PromethION (EFO:0008634)", + "Illumina MiniSeq (EFO:0008636)", + "Illumina MiSeq (EFO:0004205)", + "Illumina HiSeq 1000 (EFO:0004204)", + "Illumina HiSeq 2500 (EFO:0008565)", + "Illumina NextSeq 550 (EFO:0008566)", + "Illumina HiSeq 4000 (EFO:0008563)", + "Illumina NextSeq 2000 (EFO:0010963)", + "Illumina NovaSeq X (NTR:0000765)", + "PacBio RS II (EFO:0008631)", + "Illumina NovaSeq X (EFO:0022840)", + "Custom" + ] + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { + "type": "string" + }, + "name": { + "type": [ + "string", + "null" + ] + }, + "modality": { + "type": "string" + } + }, + "required": [ + "protocol_id", + "modality" + ], + "additionalProperties": false + }, + "minItems": 1 + } + ] + }, + "sequence_kit": { + "description": "The kit used with the protocol to sequence the library", + "anyOf": [ + { + "type": "string", + "enum": [ + "HiSeq SBS Kit v4", + "HiSeq SR Cluster Kit v4-cBot-HS", + "HiSeq PE Cluster Kit v4-cBot-HS", + "HiSeq SR Rapid Cluster Kit v2", + "HiSeq PE Rapid Cluster Kit v2", + "HiSeq Rapid SBS Kit v2", + "HiSeq 3000/4000 SBS Kit", + "HiSeq 3000/4000 SR Cluster Kit", + "HiSeq 3000/4000 PE Cluster Kit", + "MiSeq Reagent Kit v2", + "NextSeq 500 Mid Output Kit", + "NextSeq 500 High Output Kit", + "NextSeq 500 Mid Output v2 Kit", + "NextSeq 500 High Output v2 Kit", + "NextSeq 500/550 Mid-Output v2.5 Kit", + "NextSeq 500/550 High-Output v2.5 Kit", + 
"TG NextSeq 500/550 Mid-Output Kit v2.5", + "TG NextSeq 500/550 High-Output Kit v2.5", + "NextSeq 1000/2000 P1 Reagent Kit", + "NextSeq 1000/2000 P2 Reagent Kit", + "NextSeq 1000/2000 P3 Reagent Kit", + "NextSeq 1000/2000 P1 XLEAP-SBS Reagent Kit", + "NextSeq 1000/2000 P2 XLEAP-SBS Reagent Kit", + "NextSeq 2000 P3 XLEAP-SBS Reagent Kit", + "NextSeq 2000 P4 XLEAP-SBS Reagent Kit", + "NovaSeq 6000 SP Reagent Kit v1.5", + "NovaSeq 6000 S1 Reagent Kit v1.5", + "NovaSeq 6000 S2 Reagent Kit v1.5", + "NovaSeq 6000 S4 Reagent Kit v1.5", + "NovaSeq X Series 1.5B Reagent Kit", + "NovaSeq X Series 10B Reagent Kit", + "NovaSeq X Series 25B Reagent Kit", + "ONT Ligation Sequencing Kit V14", + "Sequel sequencing kit 3.0", + "Sequel II sequencing kit 2.0", + "Custom" + ] + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { + "type": "string" + }, + "name": { + "type": [ + "string", + "null" + ] + }, + "modality": { + "type": "string" + } + }, + "required": [ + "kit_id", + "modality" + ], + "additionalProperties": false + }, + "minItems": 1 + } + ] + }, + "sequence_spec": { + "description": "The spec for the sequencer", + "type": "array", + "items": { + "$ref": "#/$defs/read" + } + }, + "library_spec": { + "description": "The spec for the assay", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "seqspec_version", + "assay_id", + "name", + "doi", + "date", + "description", + "modalities", + "lib_struct", + "library_protocol", + "library_kit", + "sequence_protocol", + "sequence_kit" + ], + "$defs": { + "region": { + "title": "Region", + "description": "A region of DNA", + "type": "object", + "properties": { + "region_id": { + "description": "identifier for the region", + "type": "string" + }, + "region_type": { + "description": "the type of region", + "type": "string", + "enum": [ + "atac", + "barcode", + "bead_TSO", + "cdna", + "crispr", + "custom_primer", + "dna", + "fastq", + "fastq_link", + "gdna", 
+ "hic", + "illumina_p5", + "illumina_p7", + "index5", + "index7", + "linker", + "ME1", + "ME2", + "methyl", + "named", + "nextera_read1", + "nextera_read2", + "poly_A", + "poly_G", + "poly_T", + "poly_C", + "protein", + "rna", + "s5", + "s7", + "sgrna_target", + "tag", + "truseq_read1", + "truseq_read2", + "umi" + ] + }, + "sequence_type": { + "description": "The type of the sequence", + "type": "string", + "enum": [ + "fixed", + "random", + "onlist", + "joined" + ] + }, + "sequence": { + "description": "The sequence", + "type": "string" + }, + "min_len": { + "description": "The minimum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "max_len": { + "description": "The maximum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "onlist": { + "description": "The file containing the sequence if seq_type = onlist", + "type": [ + "object", + "null" + ], + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename for the onlist", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": [ + "local", + "ftp", + "http", + "https" + ] + }, + "location": { + "description": "location of onlist", + "type": "string", + "enum": [ + "local", + "remote" + ] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string", + "pattern": "^[a-f0-9]{32}$" + } + }, + "required": [ + "file_id", + "filename", + "filetype", + "filesize", + "url", + "urltype", + "md5" + ] + }, + "regions": { + "description": "The regions being joined", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ 
+ "region_id", + "region_type", + "sequence_type", + "sequence", + "min_len", + "max_len" + ], + "if": { + "properties": { + "min_len": { + "const": 0 + } + } + }, + "then": { + "properties": { + "sequence": { + "type": "string", + "pattern": "^[ACGTRYMKSWHBVDNX]*$" + } + } + }, + "else": { + "properties": { + "sequence": { + "type": "string", + "minLength": 1, + "pattern": "^[ACGTRYMKSWHBVDNX]+$" + } + } + } + }, + "read": { + "title": "Read", + "type": "object", + "properties": { + "read_id": { + "type": "string", + "description": "The unique identifier for the read." + }, + "name": { + "type": "string", + "description": "The name of the read." + }, + "modality": { + "type": "string", + "description": "The modality of the assay generating the read." + }, + "primer_id": { + "type": "string", + "description": "The region id of the primer used." + }, + "min_len": { + "type": "integer", + "minimum": 0, + "description": "The minimum length of the read, must be greater than or equal to 0." + }, + "max_len": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "The maximum length of the read, must be greater than 0." + }, + "strand": { + "type": "string", + "enum": [ + "pos", + "neg" + ], + "description": "The strand orientation of the read, either positive ('pos') or negative ('neg')." 
+ }, + "files": { + "description": "An array of files containing the reads", + "type": "array", + "items": { + "type": "object", + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": [ + "local", + "ftp", + "http", + "https" + ] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string", + "pattern": "^[a-f0-9]{32}$" + } + } + } + } + }, + "required": [ + "read_id", + "modality", + "primer_id", + "min_len", + "max_len", + "strand" + ], + "additionalProperties": false + } + } +} \ No newline at end of file diff --git a/src/seqspec_auth.rs b/src/seqspec_auth.rs new file mode 100644 index 00000000..5e760a3e --- /dev/null +++ b/src/seqspec_auth.rs @@ -0,0 +1,118 @@ +use crate::auth::{init_profile, AuthKind, AuthProfile, AuthRegistry, RemoteAccess}; +use anyhow::Result; +use clap::{Args, Subcommand}; +use serde::Serialize; + +#[derive(Debug, Args)] +pub struct AuthArgs { + #[command(subcommand)] + pub command: AuthCommand, +} + +#[derive(Debug, Subcommand)] +pub enum AuthCommand { + Init(AuthInitArgs), + Path(AuthPathArgs), + List(AuthListArgs), + Resolve(AuthResolveArgs), +} + +#[derive(Debug, Args)] +pub struct AuthInitArgs { + #[arg(long, value_name = "PROFILE", required = true)] + pub profile: String, + + #[arg(long = "host", value_name = "HOST", required = true)] + pub hosts: Vec, + + #[arg(long, value_enum, default_value_t = AuthKind::Basic)] + pub kind: AuthKind, + + #[arg(long, value_name = "ENV", required = true)] + pub username_env: String, + + #[arg(long, value_name = "ENV", required = true)] + 
pub password_env: String, +} + +#[derive(Debug, Args)] +pub struct AuthPathArgs {} + +#[derive(Debug, Args)] +pub struct AuthListArgs {} + +#[derive(Debug, Args)] +pub struct AuthResolveArgs { + #[arg(help = "Remote URL to inspect", value_name = "URL")] + pub url: String, + + #[arg(long, env = "SEQSPEC_AUTH_PROFILE", value_name = "PROFILE")] + pub auth_profile: Option, +} + +#[derive(Debug, Serialize)] +struct PathOutput<'a> { + kind: &'a str, + source: &'a str, + path: Option, + exists: bool, +} + +pub fn run(args: &AuthArgs) -> Result<()> { + match &args.command { + AuthCommand::Init(args) => run_init(args), + AuthCommand::Path(args) => run_path(args), + AuthCommand::List(args) => run_list(args), + AuthCommand::Resolve(args) => run_resolve(args), + } +} + +fn run_init(args: &AuthInitArgs) -> Result<()> { + let output = init_profile( + &args.profile, + AuthProfile { + hosts: args.hosts.clone(), + kind: args.kind.clone(), + username_env: args.username_env.clone(), + password_env: args.password_env.clone(), + }, + )?; + print_value(&output) +} + +fn run_path(args: &AuthPathArgs) -> Result<()> { + let registry = AuthRegistry::load()?; + let location = registry.location(); + let output = PathOutput { + kind: "auth_config_path", + source: &location.source, + path: location + .path + .as_ref() + .map(|path| path.display().to_string()), + exists: location.exists, + }; + let _ = args; + print_value(&output) +} + +fn run_list(args: &AuthListArgs) -> Result<()> { + let registry = AuthRegistry::load()?; + let profiles = registry.profile_summaries(); + let _ = args; + print_value(&profiles) +} + +fn run_resolve(args: &AuthResolveArgs) -> Result<()> { + let registry = AuthRegistry::load()?; + let resolved = registry.resolve_summary(&args.url, args.auth_profile.as_deref())?; + if let Some(profile_name) = args.auth_profile.as_deref() { + let _ = RemoteAccess::load(Some(profile_name))?; + } + print_value(&resolved) +} + +fn print_value(value: &T) -> Result<()> { + println!("{}", 
serde_json::to_string_pretty(value)?); + Ok(()) +} diff --git a/src/seqspec_check.rs b/src/seqspec_check.rs new file mode 100644 index 00000000..f193acb6 --- /dev/null +++ b/src/seqspec_check.rs @@ -0,0 +1,1228 @@ +use crate::auth::RemoteAccess; +use crate::models::assay::Assay; +use crate::models::region::{Region, RegionCoordinate}; +use crate::utils; +use clap::Args; +use jsonschema; +use std::collections::HashSet; +use std::fs; +use std::io::Write; +use std::path::{Path, PathBuf}; + +#[derive(Debug, Args)] +pub struct CheckArgs { + #[clap(short, long, help = "Path to output file", value_name = "OUT")] + output: Option, + + #[clap( + short, + long, + help = "Skip checks", + value_name = "SKIP", + default_value = None, + value_parser = ["igvf", "igvf_onlist_skip", "structural"], + )] + skip: Option, + + #[clap(long, env = "SEQSPEC_AUTH_PROFILE", value_name = "PROFILE")] + auth_profile: Option, + + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, +} + +pub fn run_check(args: &CheckArgs) -> Vec { + validate_check_args(args); + let spec = utils::load_spec(&args.yaml); + let remote_access = RemoteAccess::load(args.auth_profile.as_deref()).unwrap_or_else(|err| { + eprintln!("{}", err); + std::process::exit(1); + }); + let errors = + seqspec_check_with_remote_access(&spec, args.skip.as_deref(), &args.yaml, &remote_access) + .unwrap_or_else(|err| { + eprintln!("{}", err); + std::process::exit(1); + }); + + if let Some(out) = &args.output { + let mut f = fs::File::create(out).unwrap(); + for (idx, e) in errors.iter().enumerate() { + writeln!(f, "{}", format_error(e, idx + 1)).unwrap(); + } + } else { + for (idx, e) in errors.iter().enumerate() { + println!("{}", format_error(e, idx + 1)); + } + } + errors +} + +fn validate_check_args(args: &CheckArgs) { + if !args.yaml.exists() { + eprintln!("Input file does not exist: {}", args.yaml.display()); + std::process::exit(1); + } + if let Some(out) = &args.output { + if out.exists() && 
!out.is_file() { + eprintln!("Output path exists but is not a file: {}", out.display()); + std::process::exit(1); + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ErrorObj { + pub severity: String, + pub error_type: String, + pub error_message: String, + pub error_object: String, +} + +fn format_error(e: &ErrorObj, idx: usize) -> String { + format!("[{} {}] {}", e.severity, idx, e.error_message) +} + +pub fn seqspec_check(spec: &Assay, filter_type: Option<&str>, spec_path: &Path) -> Vec { + let access = RemoteAccess::anonymous(); + let mut errors = check(spec, spec_path, &access).unwrap(); + if let Some(ft) = filter_type { + errors = filter_errors(errors, ft); + } + errors +} + +pub fn seqspec_check_with_remote_access( + spec: &Assay, + filter_type: Option<&str>, + spec_path: &Path, + remote_access: &RemoteAccess, +) -> anyhow::Result> { + let mut errors = check(spec, spec_path, remote_access)?; + if let Some(ft) = filter_type { + errors = filter_errors(errors, ft); + } + Ok(errors) +} + +/// All error_type values produced by structural (non-filesystem) checks. 
+const STRUCTURAL_CHECK_TYPES: &[&str] = &[ + "check_unique_modalities", + "check_region_ids_modalities", + "check_unique_read_ids", + "check_unique_read_primer_strand_pairs", + "check_unique_region_ids", + "check_read_modalities", + "check_primer_ids_in_region_ids", + "check_sequence_types", + "check_region_lengths", + "check_sequence_lengths", + "check_read_file_count", + "check_region_against_subregion_length", + "check_region_against_subregion_sequence", + "check_read_length_against_library", + "check_overlapping_read_regions", +]; + +fn filter_errors(errors: Vec, filter_type: &str) -> Vec { + if filter_type == "structural" { + return errors + .into_iter() + .filter(|e| !STRUCTURAL_CHECK_TYPES.contains(&e.error_type.as_str())) + .collect(); + } + + let igvf_filters = vec![ + ("check_schema", "'lib_struct'"), + ("check_schema", "'library_protocol'"), + ("check_schema", "'library_kit'"), + ("check_schema", "'sequence_protocol'"), + ("check_schema", "'sequence_kit'"), + ("check_schema", "'md5'"), + ]; + let igvf_onlist_skip_filters = { + let mut v = igvf_filters.clone(); + v.push(("check_onlist_files_exist", "onlist")); + v + }; + let filters = match filter_type { + "igvf" => igvf_filters, + "igvf_onlist_skip" => igvf_onlist_skip_filters, + _ => vec![], + }; + if filters.is_empty() { + return errors; + } + errors + .into_iter() + .filter(|e| { + !filters + .iter() + .any(|(t, o)| e.error_type == *t && e.error_object == *o) + }) + .collect() +} + +/// Run all structural (non-filesystem) checks on a seqspec. +/// This excludes check_schema, check_onlist_files_exist, and check_read_files_exist +/// which require filesystem access or CARGO_MANIFEST_DIR. +pub fn seqspec_check_structural(spec: &Assay) -> Vec { + let errors: Vec = Vec::new(); + let idx = 0usize; + + macro_rules! 
run { + ($f:ident, $errors:expr) => {{ + let (e2, _i2) = $f(spec, $errors, idx); + e2 + }}; + } + + let errors = run!(check_unique_modalities, errors); + let errors = run!(check_region_ids_modalities, errors); + let errors = run!(check_unique_read_ids, errors); + let errors = run!(check_unique_read_primer_strand_pairs, errors); + let errors = run!(check_unique_region_ids, errors); + let errors = run!(check_read_modalities, errors); + let errors = run!(check_primer_ids_in_region_ids, errors); + let errors = run!(check_sequence_types, errors); + let errors = run!(check_region_lengths, errors); + let errors = run!(check_sequence_lengths, errors); + let errors = run!(check_read_file_count, errors); + let errors = run!(check_region_against_subregion_length, errors); + let errors = run!(check_region_against_subregion_sequence, errors); + let errors = run!(check_read_length_against_library, errors); + let errors = run!(check_overlapping_read_regions, errors); + errors +} + +fn check( + spec: &Assay, + spec_path: &Path, + remote_access: &RemoteAccess, +) -> anyhow::Result> { + let errors: Vec = Vec::new(); + let idx = 0usize; + + // check_schema + let (e, _i) = check_schema(spec, errors, idx); + let mut errors = e; + + // Structural checks + errors.extend(seqspec_check_structural(spec)); + + // Filesystem checks + let spec_base = spec_path.parent().map(|p| p.to_path_buf()); + let (e_on, _i_on) = + check_onlist_files_exist(spec, errors, idx, spec_base.as_ref(), remote_access)?; + errors = e_on; + let (e_rf, _i_rf) = + check_read_files_exist(spec, errors, idx, spec_base.as_ref(), remote_access)?; + errors = e_rf; + + Ok(errors) +} + +fn push_diagnostic( + errors: &mut Vec, + idx: &mut usize, + severity: &str, + et: &str, + msg: String, + obj: &str, +) { + errors.push(ErrorObj { + severity: severity.to_string(), + error_type: et.to_string(), + error_message: msg, + error_object: obj.to_string(), + }); + *idx += 1; +} + +fn push_error(errors: &mut Vec, idx: &mut usize, et: 
&str, msg: String, obj: &str) { + push_diagnostic(errors, idx, "error", et, msg, obj); +} + +fn push_warning(errors: &mut Vec, idx: &mut usize, et: &str, msg: String, obj: &str) { + push_diagnostic(errors, idx, "warning", et, msg, obj); +} + +fn check_schema(spec: &Assay, mut errors: Vec, mut idx: usize) -> (Vec, usize) { + // Load schema from src/schema/seqspec.schema.json + let schema_path = + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/schema/seqspec.schema.json"); + let schema_str = std::fs::read_to_string(&schema_path).unwrap_or_else(|_| "{}".to_string()); + let schema_json: serde_json::Value = + serde_json::from_str(&schema_str).unwrap_or(serde_json::json!({})); + + if let Ok(validator) = jsonschema::validator_for(&schema_json) { + let instance = serde_json::to_value(spec).unwrap_or(serde_json::json!({})); + for error in validator.iter_errors(&instance) { + // Convert instance path (JSON Pointer) to bracket notation like Python's + let pointer = format!("{}", error.instance_path); + let bracket_path = if pointer.is_empty() { + "spec".to_string() + } else { + let parts: String = pointer + .trim_start_matches('/') + .split('/') + .map(|seg| { + if seg.chars().all(|c| c.is_ascii_digit()) { + format!("[{}]", seg) + } else { + format!("[\"{}\"]", seg) + } + }) + .collect(); + format!("spec{}", parts) + }; + let msg = format!("{} in {}", error, bracket_path); + let last_obj = if bracket_path == "spec" { + "spec".to_string() + } else { + // Extract last segment without brackets/quotes + let seg = bracket_path.rsplit('[').next().unwrap_or("spec"); + seg.trim_end_matches(']').trim_matches('"').to_string() + }; + errors.push(ErrorObj { + severity: "error".to_string(), + error_type: "check_schema".to_string(), + error_message: msg, + error_object: last_obj, + }); + idx += 1; + } + } + (errors, idx) +} + +fn check_unique_modalities( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + let set: HashSet<_> = spec.modalities.iter().collect(); 
+ if set.len() != spec.modalities.len() { + push_error( + &mut errors, + &mut idx, + "check_unique_modalities", + format!("modalities [{}] are not unique", spec.modalities.join(", ")), + "modalities", + ); + } + (errors, idx) +} + +fn check_region_ids_modalities( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + let modes: HashSet = spec.modalities.iter().cloned().collect(); + for r in &spec.library_spec { + if !modes.contains(&r.region_id) { + push_error( + &mut errors, + &mut idx, + "check_region_ids_modalities", + format!( + "region_id '{}' of the first level of the spec does not correspond to a modality [{}]", + r.region_id, + spec.modalities.join(", ") + ), + "region", + ); + } + } + (errors, idx) +} + +fn check_onlist_files_exist( + spec: &Assay, + mut errors: Vec, + mut idx: usize, + spec_base: Option<&PathBuf>, + remote_access: &RemoteAccess, +) -> anyhow::Result<(Vec, usize)> { + let mut onlists = Vec::new(); + for m in &spec.modalities { + if let Some(lib) = spec.get_libspec(m) { + for r in lib.get_onlist_regions() { + if let Some(ol) = r.onlist { + onlists.push(ol); + } + } + } + } + for ol in onlists { + match ol.urltype.as_str() { + "local" => { + let mut candidates: Vec = Vec::new(); + let locator = match utils::local_onlist_locator(&ol) { + Ok(locator) => locator, + Err(err) => { + push_error( + &mut errors, + &mut idx, + "check_onlist_files_exist", + err, + "onlist", + ); + continue; + } + }; + let p = PathBuf::from(locator); + candidates.push(if let Some(base) = spec_base { + if p.is_absolute() { + p.clone() + } else { + base.join(&p) + } + } else { + p.clone() + }); + // also try .gz variant + let gz = PathBuf::from(format!("{}.gz", locator)); + candidates.push(if let Some(base) = spec_base { + if gz.is_absolute() { + gz.clone() + } else { + base.join(&gz) + } + } else { + gz.clone() + }); + if !candidates.iter().any(|c| c.exists()) { + push_error( + &mut errors, + &mut idx, + "check_onlist_files_exist", + format!("{} 
does not exist", ol.filename), + "onlist", + ); + } + } + "http" | "https" | "ftp" => { + if ol.url.is_empty() || !remote_access.url_exists(&ol.url)? { + push_error( + &mut errors, + &mut idx, + "check_onlist_files_exist", + format!("{} does not exist", ol.filename), + "onlist", + ); + } + } + _ => {} + } + } + Ok((errors, idx)) +} + +fn check_unique_read_ids( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + let mut seen: HashSet = HashSet::new(); + for read in &spec.sequence_spec { + if !seen.insert(read.read_id.clone()) { + push_error( + &mut errors, + &mut idx, + "check_unique_read_ids", + format!("read_id '{}' is not unique across all reads", read.read_id), + "read", + ); + } + } + (errors, idx) +} + +fn check_read_files_exist( + spec: &Assay, + mut errors: Vec, + mut idx: usize, + spec_base: Option<&PathBuf>, + remote_access: &RemoteAccess, +) -> anyhow::Result<(Vec, usize)> { + for read in &spec.sequence_spec { + for f in &read.files { + match f.urltype.as_str() { + "local" => { + let locator = match utils::local_resource_url(&f.url, &f.filename, "file") { + Ok(locator) => locator, + Err(err) => { + push_error( + &mut errors, + &mut idx, + "check_read_files_exist", + err, + "file", + ); + continue; + } + }; + let p = PathBuf::from(locator); + let full = if let Some(base) = spec_base { + if p.is_absolute() { + p.clone() + } else { + base.join(&p) + } + } else { + p.clone() + }; + if !full.exists() { + push_error( + &mut errors, + &mut idx, + "check_read_files_exist", + format!("{} does not exist", f.filename), + "file", + ); + } + } + "http" | "https" | "ftp" => { + if f.url.is_empty() || !remote_access.url_exists(&f.url)? 
{ + push_error( + &mut errors, + &mut idx, + "check_read_files_exist", + format!("{} does not exist", f.filename), + "file", + ); + } + } + _ => {} + } + } + } + Ok((errors, idx)) +} + +fn check_unique_read_primer_strand_pairs( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + let mut seen: HashSet<(String, String)> = HashSet::new(); + for read in &spec.sequence_spec { + let key = (read.primer_id.clone(), read.strand.clone()); + if !seen.insert(key) { + push_error( + &mut errors, + &mut idx, + "check_unique_read_primer_strand_pairs", + format!( + "primer_id '{}' and strand '{}' tuple is not unique across all reads", + read.primer_id, read.strand + ), + "read", + ); + } + } + (errors, idx) +} + +fn check_unique_region_ids( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + let modes = &spec.modalities; + let mut rids: HashSet = HashSet::new(); + for m in modes { + if let Some(lib) = spec.get_libspec(m) { + for r in lib.get_leaves() { + if !rids.insert(r.region_id.clone()) { + push_error( + &mut errors, + &mut idx, + "check_unique_region_ids", + format!( + "region_id '{}' is not unique across all regions", + r.region_id + ), + "region", + ); + } + } + } + } + (errors, idx) +} + +fn check_read_modalities( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + let modes: HashSet = spec.modalities.iter().cloned().collect(); + for read in &spec.sequence_spec { + if !modes.contains(&read.modality) { + push_error( + &mut errors, + &mut idx, + "check_read_modalities", + format!( + "read '{}' modality '{}' does not exist in the modalities", + read.read_id, read.modality + ), + "read", + ); + } + } + (errors, idx) +} + +fn check_primer_ids_in_region_ids( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + let modes = &spec.modalities; + let mut rids: HashSet = HashSet::new(); + for m in modes { + if let Some(lib) = spec.get_libspec(m) { + for r in lib.get_leaves() { + 
rids.insert(r.region_id); + } + } + } + for read in &spec.sequence_spec { + if !rids.contains(&read.primer_id) { + push_error( + &mut errors, + &mut idx, + "check_primer_ids_in_region_ids", + format!( + "'{}' primer_id '{}' does not exist in the library_spec", + read.read_id, read.primer_id + ), + "read", + ); + } + } + (errors, idx) +} + +fn check_sequence_types( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + fn recurse(r: &Region, errors: &mut Vec, idx: &mut usize) { + if r.sequence_type == "fixed" && !r.regions.is_empty() { + push_error( + errors, + idx, + "check_sequence_types", + format!( + "'{}' sequence_type is 'fixed' and contains subregions", + r.region_id + ), + "region", + ); + } + if r.sequence_type == "joined" && r.regions.is_empty() { + push_error( + errors, + idx, + "check_sequence_types", + format!( + "'{}' sequence_type is 'joined' and does not contain subregions", + r.region_id + ), + "region", + ); + } + if r.sequence_type == "random" && !r.regions.is_empty() { + push_error( + errors, + idx, + "check_sequence_types", + format!( + "'{}' sequence_type is 'random' and contains subregions", + r.region_id + ), + "region", + ); + } + if r.sequence_type == "random" { + let all_x = r.sequence.chars().all(|c| c == 'X'); + let len_ok = (r.min_len as usize) <= r.sequence.len() + && r.sequence.len() <= (r.max_len as usize); + if !(all_x && len_ok) { + push_error( + errors, + idx, + "check_sequence_types", + format!( + "'{}' sequence_type is 'random' and sequence is not all X's", + r.region_id + ), + "region", + ); + } + } + for c in &r.regions { + recurse(c, errors, idx); + } + } + for m in &spec.modalities { + if let Some(lib) = spec.get_libspec(m) { + recurse(&lib, &mut errors, &mut idx); + } + } + (errors, idx) +} + +fn check_region_lengths( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + fn recurse(r: &Region, errors: &mut Vec, idx: &mut usize) { + for c in &r.regions { + recurse(c, errors, idx); + 
} + if r.max_len < r.min_len { + push_error( + errors, + idx, + "check_region_lengths", + format!("'{}' max_len is less than min_len", r.region_id), + "region", + ); + } + } + for m in &spec.modalities { + if let Some(lib) = spec.get_libspec(m) { + recurse(&lib, &mut errors, &mut idx); + } + } + (errors, idx) +} + +fn check_sequence_lengths( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + fn recurse(r: &Region, errors: &mut Vec, idx: &mut usize) { + for c in &r.regions { + recurse(c, errors, idx); + } + if !r.sequence.is_empty() { + let l = r.sequence.len(); + if !((r.min_len as usize) <= l && l <= (r.max_len as usize)) { + push_error( + errors, + idx, + "check_sequence_lengths", + format!( + "'{}' sequence '{}' has length {}, expected range ({}, {})", + r.region_id, r.sequence, l, r.min_len, r.max_len + ), + "region", + ); + } + } + } + for m in &spec.modalities { + if let Some(lib) = spec.get_libspec(m) { + recurse(&lib, &mut errors, &mut idx); + } + } + (errors, idx) +} + +fn check_read_file_count( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + let counts: Vec = spec.sequence_spec.iter().map(|r| r.files.len()).collect(); + let uniq: HashSet = counts.iter().cloned().collect(); + if uniq.len() != 1 { + push_error( + &mut errors, + &mut idx, + "check_read_file_count", + "Reads must have the same number of files".to_string(), + "read", + ); + } + (errors, idx) +} + +fn check_region_against_subregion_length( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + fn recurse(r: &Region, errors: &mut Vec, idx: &mut usize) { + if !r.regions.is_empty() { + let min_sum: i64 = r.regions.iter().map(|s| s.min_len).sum(); + let max_sum: i64 = r.regions.iter().map(|s| s.max_len).sum(); + if r.min_len != min_sum || r.max_len != max_sum { + push_error( + errors, + idx, + "check_region_against_subregion_length", + format!( + "Region '{}' min_len/max_len ({}, {}) does not match sum of subregions ({}, 
{})", + r.region_id, r.min_len, r.max_len, min_sum, max_sum + ), + "region", + ); + } + for c in &r.regions { + recurse(c, errors, idx); + } + } + } + for m in &spec.modalities { + if let Some(lib) = spec.get_libspec(m) { + recurse(&lib, &mut errors, &mut idx); + } + } + (errors, idx) +} + +fn check_region_against_subregion_sequence( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + fn recurse(r: &Region, errors: &mut Vec, idx: &mut usize) { + if !r.regions.is_empty() { + let concat: String = r.regions.iter().map(|s| s.sequence.clone()).collect(); + if r.sequence != concat { + push_error( + errors, + idx, + "check_region_against_subregion_sequence", + format!( + "Region '{}' sequence '{}' does not match concatenation of subregions '{}'", + r.region_id, r.sequence, concat + ), + "region", + ); + } + for c in &r.regions { + recurse(c, errors, idx); + } + } + } + for m in &spec.modalities { + if let Some(lib) = spec.get_libspec(m) { + recurse(&lib, &mut errors, &mut idx); + } + } + (errors, idx) +} + +fn check_read_length_against_library( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + for read in &spec.sequence_spec { + let mode = &read.modality; + let Some(libspec) = spec.get_libspec(mode) else { + continue; + }; + let leaves = libspec.get_leaves_with_region_id(&read.primer_id); + let pidx = leaves.iter().position(|o| o.region_id == read.primer_id); + let Some(pidx) = pidx else { + push_error( + &mut errors, + &mut idx, + "check_read_length_against_library", + format!( + "'{}' primer_id '{}' not found in library leaves for modality '{}'", + read.read_id, read.primer_id, mode + ), + "read", + ); + continue; + }; + + let elements: Vec = if read.strand == "pos" { + leaves[pidx + 1..].to_vec() + } else { + leaves[..pidx].to_vec() + }; + + let sum_max: i64 = elements.iter().map(|o| o.max_len).sum(); + if read.max_len > sum_max { + let where_str = if read.strand == "pos" { + "after" + } else { + "before" + }; + 
push_error( + &mut errors, + &mut idx, + "check_read_length_against_library", + format!( + "'{}' max read length (max_len={}) is greater than sequence-able range (max_sum={}) for library elements {} primer '{}'", + read.read_id, read.max_len, sum_max, where_str, read.primer_id + ), + "read", + ); + } + } + (errors, idx) +} + +fn check_overlapping_read_regions( + spec: &Assay, + mut errors: Vec, + mut idx: usize, +) -> (Vec, usize) { + for modality in &spec.modalities { + let reads = spec.get_seqspec(modality); + let mut projected_reads: Vec<(String, Vec)> = Vec::new(); + + for read in reads { + let Ok((mapped_read, regions)) = + utils::map_read_id_to_regions(spec, modality, &read.read_id) + else { + continue; + }; + + let region_coordinates = utils::project_regions_to_coordinates(regions); + let clipped = utils::itx_read(region_coordinates, 0, mapped_read.max_len); + projected_reads.push((mapped_read.read_id, clipped)); + } + + for left_idx in 0..projected_reads.len() { + for right_idx in left_idx + 1..projected_reads.len() { + let (left_read_id, left_regions) = &projected_reads[left_idx]; + let (right_read_id, right_regions) = &projected_reads[right_idx]; + let right_region_ids: HashSet = right_regions + .iter() + .map(|region| region.region.region_id.clone()) + .collect(); + let mut shared_region_ids: Vec = Vec::new(); + let mut seen_region_ids: HashSet = HashSet::new(); + + for region in left_regions { + let region_id = region.region.region_id.clone(); + if right_region_ids.contains(®ion_id) + && seen_region_ids.insert(region_id.clone()) + { + shared_region_ids.push(region_id); + } + } + + if shared_region_ids.is_empty() { + continue; + } + + let region_list = shared_region_ids + .iter() + .map(|region_id| format!("'{}'", region_id)) + .collect::>() + .join(", "); + push_warning( + &mut errors, + &mut idx, + "check_overlapping_read_regions", + format!( + "reads '{}' and '{}' in modality '{}' both cover region(s) {}. 
Downstream tools may require explicit overlap handling such as `seqspec index --no-overlap`", + left_read_id, right_read_id, modality, region_list + ), + "read", + ); + } + } + } + + (errors, idx) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_check_valid_spec() { + let spec = dogma_spec(); + let spec_path = PathBuf::from("tests/fixtures/spec.yaml"); + let diagnostics = seqspec_check(&spec, None, &spec_path); + // DOGMAseq-DIG is well-formed; only file-existence errors or overlap warnings are expected + for e in &diagnostics { + assert!( + e.error_type == "check_onlist_files_exist" + || e.error_type == "check_read_files_exist" + || (e.severity == "warning" + && e.error_type == "check_overlapping_read_regions"), + "Unexpected diagnostic type: {} - {}", + e.error_type, + e.error_message, + ); + } + // No structural/validation error diagnostics + let structural_errors: Vec<_> = diagnostics + .iter() + .filter(|e| { + e.severity == "error" + && e.error_type != "check_onlist_files_exist" + && e.error_type != "check_read_files_exist" + }) + .collect(); + assert!(structural_errors.is_empty()); + } + + #[test] + fn test_check_onlist_files_exist_prefers_local_url() { + let spec_path = PathBuf::from("tests/fixtures/onlist_read_clip/spec.yaml"); + let mut spec = load_spec(&spec_path); + let library = spec.get_libspec("rna").unwrap(); + let barcode = library.get_region_by_id("barcode_a").pop().unwrap(); + let expected_url = barcode.onlist.unwrap().url; + + let barcode_region = spec + .library_spec + .get_mut(0) + .unwrap() + .regions + .iter_mut() + .find(|region| region.region_id == "barcode_a") + .unwrap(); + barcode_region.onlist.as_mut().unwrap().filename = "display.txt".into(); + + let diagnostics = seqspec_check(&spec, None, &spec_path); + assert!( + diagnostics.iter().all(|diagnostic| { + diagnostic.error_type != 
"check_onlist_files_exist" + || !diagnostic.error_message.contains(&expected_url) + }), + "local onlist existence should resolve through url before filename" + ); + assert!( + diagnostics + .iter() + .all(|diagnostic| diagnostic.error_type != "check_onlist_files_exist"), + "changing the display filename should not trigger an onlist existence error" + ); + } + + #[test] + fn test_check_onlist_files_exist_errors_when_local_url_is_empty() { + let spec_path = PathBuf::from("tests/fixtures/onlist_read_clip/spec.yaml"); + let mut spec = load_spec(&spec_path); + let barcode_region = spec + .library_spec + .get_mut(0) + .unwrap() + .regions + .iter_mut() + .find(|region| region.region_id == "barcode_a") + .unwrap(); + barcode_region.onlist.as_mut().unwrap().url.clear(); + + let diagnostics = seqspec_check(&spec, None, &spec_path); + assert!(diagnostics.iter().any(|diagnostic| { + diagnostic.error_type == "check_onlist_files_exist" + && diagnostic.error_message == "local onlist 'barcode_a.txt' has empty url" + })); + } + + #[test] + fn test_check_read_files_exist_errors_when_local_url_is_empty() { + let spec_path = PathBuf::from("tests/fixtures/onlist_read_clip/spec.yaml"); + let mut spec = load_spec(&spec_path); + spec.sequence_spec[0].files[0].url.clear(); + + let diagnostics = seqspec_check(&spec, None, &spec_path); + assert!(diagnostics.iter().any(|diagnostic| { + diagnostic.error_type == "check_read_files_exist" + && diagnostic.error_message == "local file 'rna_read.fastq.gz' has empty url" + })); + } + + #[test] + fn test_error_obj_structure() { + let e = ErrorObj { + severity: "error".into(), + error_type: "test_check".into(), + error_message: "something went wrong".into(), + error_object: "region".into(), + }; + assert_eq!(e.severity, "error"); + assert_eq!(e.error_type, "test_check"); + assert_eq!(e.error_message, "something went wrong"); + assert_eq!(e.error_object, "region"); + } + + #[test] + fn test_filter_errors_igvf() { + let errors = vec![ + ErrorObj { + 
severity: "error".into(), + error_type: "check_schema".into(), + error_message: "missing field".into(), + error_object: "'lib_struct'".into(), + }, + ErrorObj { + severity: "error".into(), + error_type: "check_unique_modalities".into(), + error_message: "duplicate".into(), + error_object: "modality".into(), + }, + ]; + let filtered = filter_errors(errors, "igvf"); + // The check_schema/'lib_struct' error should be filtered out + assert_eq!(filtered.len(), 1); + assert_eq!(filtered[0].error_type, "check_unique_modalities"); + } + + #[test] + fn test_filter_errors_unknown_type() { + let errors = vec![ErrorObj { + severity: "error".into(), + error_type: "test".into(), + error_message: "msg".into(), + error_object: "obj".into(), + }]; + let filtered = filter_errors(errors, "unknown_filter"); + assert_eq!(filtered.len(), 1); // no filtering applied + } + + #[test] + fn test_filter_errors_igvf_onlist_skip() { + let errors = vec![ + ErrorObj { + severity: "error".into(), + error_type: "check_schema".into(), + error_message: "missing field".into(), + error_object: "'lib_struct'".into(), + }, + ErrorObj { + severity: "error".into(), + error_type: "check_onlist_files_exist".into(), + error_message: "file missing".into(), + error_object: "onlist".into(), + }, + ErrorObj { + severity: "error".into(), + error_type: "check_unique_modalities".into(), + error_message: "duplicate".into(), + error_object: "modality".into(), + }, + ]; + let filtered = filter_errors(errors, "igvf_onlist_skip"); + // Both check_schema/'lib_struct' and check_onlist_files_exist/onlist should be filtered + assert_eq!(filtered.len(), 1); + assert_eq!(filtered[0].error_type, "check_unique_modalities"); + } + + #[test] + fn test_check_with_igvf_filter() { + let spec = dogma_spec(); + let spec_path = PathBuf::from("tests/fixtures/spec.yaml"); + let unfiltered = seqspec_check(&spec, None, &spec_path); + let filtered = seqspec_check(&spec, Some("igvf"), &spec_path); + assert!(filtered.len() <= 
unfiltered.len()); + } + + #[test] + fn test_check_invalid_spec_duplicate_modalities() { + use crate::models::region::Region; + let spec = Assay::new( + "test".into(), + "test".into(), + "".into(), + "".into(), + "".into(), + vec!["rna".into(), "rna".into()], // duplicate + "".into(), + vec![], + vec![ + Region::new( + "rna".into(), + "rna".into(), + "rna".into(), + "joined".into(), + "".into(), + 0, + 0, + None, + vec![], + ), + Region::new( + "rna".into(), + "rna".into(), + "rna".into(), + "joined".into(), + "".into(), + 0, + 0, + None, + vec![], + ), + ], + None, + None, + None, + None, + None, + ); + let spec_path = PathBuf::from("tests/fixtures/spec.yaml"); + let errors = seqspec_check(&spec, None, &spec_path); + let has_dup = errors + .iter() + .any(|e| e.error_type == "check_unique_modalities"); + assert!(has_dup, "Should detect duplicate modalities"); + } + + #[test] + fn test_check_sequence_types_flags_random_region_with_n_sequence() { + let spec_path = PathBuf::from("tests/fixtures/random_with_n/spec.yaml"); + let spec = load_spec(&spec_path); + let errors = seqspec_check(&spec, None, &spec_path); + + assert!(errors.iter().any(|error| { + error.error_type == "check_sequence_types" + && error + .error_message + .contains("'index7' sequence_type is 'random' and sequence is not all X's") + })); + } + + #[test] + fn test_check_warns_on_overlapping_read_regions() { + let spec_path = PathBuf::from("tests/fixtures/check_overlap_warning/spec.yaml"); + let spec = load_spec(&spec_path); + let diagnostics = seqspec_check(&spec, None, &spec_path); + + let errors: Vec<_> = diagnostics + .iter() + .filter(|diagnostic| diagnostic.severity == "error") + .collect(); + let warnings: Vec<_> = diagnostics + .iter() + .filter(|diagnostic| diagnostic.severity == "warning") + .collect(); + + assert!(errors.is_empty()); + assert_eq!(warnings.len(), 1); + assert_eq!(warnings[0].error_type, "check_overlapping_read_regions"); + assert!(warnings[0] + .error_message + 
.contains("seqspec index --no-overlap")); + assert!(warnings[0].error_message.contains("'barcode'")); + assert!(warnings[0].error_message.contains("'umi'")); + } +} diff --git a/src/seqspec_file.rs b/src/seqspec_file.rs new file mode 100644 index 00000000..15f3d3a5 --- /dev/null +++ b/src/seqspec_file.rs @@ -0,0 +1,646 @@ +use crate::models::assay::Assay; +use crate::models::file::File; +use crate::models::onlist::Onlist; +use crate::utils; +use clap::Args; +use std::collections::{HashMap, HashSet}; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct FileArgs { + #[clap(short, long, help = "Output file path", value_name = "OUT")] + output: Option, + + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, + + #[clap( + short, + long, + help = "IDs", + value_name = "IDs", + required = false, + value_delimiter = ',' + )] + ids: Option>, + + #[clap( + short, + long, + help = "Modality", + value_name = "MODALITY", + required = true + )] + modality: String, + + #[clap( + short, + long, + help = "Selector", + value_name = "SELECTOR", + value_parser = ["read", "region", "file", "region-type"], + default_value = "read" + )] + selector: String, + + #[clap( + short = 'f', + long, + help = "Format", + value_name = "FORMAT", + value_parser = ["paired", "interleaved", "index", "list", "json"], + default_value = "paired" + )] + format: String, + + #[clap( + short, + long, + help = "Key", + value_name = "KEY", + value_parser = ["file_id", "filename", "filetype", "filesize", "url", "urltype", "md5", "all"], + default_value = "file_id" + )] + key: String, + + #[clap(long, help = "Use full path for local urls", default_value = "false")] + fullpath: bool, +} + +pub fn run_file(args: &FileArgs) { + validate_file_args(args); + let spec = utils::load_spec(&args.yaml); + + let ids = args.ids.clone(); + let files = seqspec_file(&spec, &args.modality, ids.as_ref(), &args.selector); + + if !files.is_empty() { + 
let result = match args.format.as_str() { + "list" => format_list_files_metadata(&files, &args.key, &args.yaml, args.fullpath), + "paired" | "interleaved" | "index" => format_list_files( + &files, + &args.format, + Some(&args.key), + &args.yaml, + args.fullpath, + ), + "json" => format_json_files(&files, &args.key, &args.yaml, args.fullpath), + _ => String::new(), + }; + + if let Some(output) = &args.output { + let mut file = fs::File::create(output).unwrap(); + writeln!(file, "{}", result).unwrap(); + } else { + println!("{}", result); + } + } +} + +fn validate_file_args(args: &FileArgs) { + if !args.yaml.exists() { + eprintln!("Please use `seqspec file -h` for help."); + std::process::exit(1); + } + if ["filesize", "filetype", "urltype", "md5"].contains(&args.key.as_str()) + && ["paired", "interleaved", "index"].contains(&args.format.as_str()) + { + eprintln!( + "Format '{}' valid only with key 'file_id', 'filename', or 'url'", + args.format + ); + std::process::exit(1); + } +} + +pub fn seqspec_file( + spec: &Assay, + modality: &String, + ids: Option<&Vec>, + selector: &String, +) -> HashMap> { + let list_files = |sel: &str| -> HashMap> { + match sel { + "read" => list_read_files(spec, modality), + "region" => list_region_files(spec, modality), + "file" => list_all_files(spec, modality), + _ => HashMap::new(), + } + }; + + let list_files_by_id = |sel: &str, ids: &Vec| -> HashMap> { + match sel { + "read" => list_files_by_read_id(spec, modality, ids), + "file" => list_files_by_file_id(spec, modality, ids), + "region" => list_files_by_region_id(spec, modality, ids), + "region-type" => list_files_by_region_type(spec, modality, ids), + _ => HashMap::new(), + } + }; + + match ids { + None => list_files(&selector), + Some(v) if v.is_empty() => list_files(&selector), + Some(v) => list_files_by_id(&selector, v), + } +} + +fn list_read_files(spec: &Assay, modality: &String) -> HashMap> { + let mut files: HashMap> = HashMap::new(); + for rd in spec.get_seqspec(modality) { 
+ if !rd.files.is_empty() { + files.insert(rd.read_id, rd.files); + } + } + files +} + +fn list_all_files(spec: &Assay, modality: &String) -> HashMap> { + let mut rd = list_read_files(spec, modality); + let rgn = list_region_files(spec, modality); + rd.extend(rgn); + rd +} + +fn list_onlist_files(spec: &Assay, modality: &String) -> HashMap> { + let mut files: HashMap> = HashMap::new(); + let regions = spec.get_libspec(modality).unwrap().get_onlist_regions(); + for r in regions { + if let Some(ol) = r.onlist { + files.entry(r.region_id).or_default().push(ol); + } + } + files +} + +fn list_region_files(spec: &Assay, modality: &String) -> HashMap> { + // convert Onlist to File-shaped map by copying fields + let onlists = list_onlist_files(spec, modality); + let mut files: HashMap> = HashMap::new(); + for (region_id, v) in onlists { + let mut out: Vec = Vec::new(); + for ol in v { + out.push(File { + file_id: ol.file_id, + filename: ol.filename, + filetype: ol.filetype, + filesize: ol.filesize, + url: ol.url, + urltype: ol.urltype, + md5: ol.md5, + }); + } + files.insert(region_id, out); + } + files +} + +fn format_list_files_metadata( + files: &HashMap>, + k: &String, + _spec_fn: &PathBuf, + _fp: bool, +) -> String { + let mut x: Vec = Vec::new(); + for row in iter_file_rows(files) { + if k == "all" { + for (key, item) in row { + x.push(format!( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", + key, + item.file_id, + item.filename, + item.filetype, + item.filesize, + item.url, + item.urltype, + item.md5 + )); + } + } else { + for (key, item) in row { + let attr = match k.as_str() { + "url" => item.url.clone(), + "file_id" => item.file_id.clone(), + "filename" => item.filename.clone(), + "filetype" => item.filetype.clone(), + "filesize" => item.filesize.to_string(), + "urltype" => item.urltype.clone(), + "md5" => item.md5.clone(), + _ => String::new(), + }; + x.push(format!("{}\t{}\t{}", key, item.file_id, attr)); + } + } + } + x.join("\n") +} + +fn format_json_files( + files: 
&HashMap>, + k: &String, + spec_fn: &PathBuf, + fp: bool, +) -> String { + use serde_json::json; + let mut x: Vec = Vec::new(); + for row in iter_file_rows(files) { + for (_key, item) in row { + if k == "all" { + let mut d = serde_json::to_value(item).unwrap(); + if item.urltype == "local" && fp { + if let Some(obj) = d.as_object_mut() { + obj.insert("url".to_string(), json!(full_path(spec_fn, &item.url))); + } + } + x.push(d); + } else { + let mut attr = match k.as_str() { + "url" => maybe_full(&item.url, &item.urltype, spec_fn, fp), + _ => String::new(), + }; + if k != "url" { + attr = match k.as_str() { + "file_id" => item.file_id.clone(), + "filename" => item.filename.clone(), + "filetype" => item.filetype.clone(), + "filesize" => item.filesize.to_string(), + "urltype" => item.urltype.clone(), + "md5" => item.md5.clone(), + _ => String::new(), + }; + } + x.push(json!({ "file_id": item.file_id, k: attr })); + } + } + } + serde_json::to_string_pretty(&x).unwrap() +} + +fn format_list_files( + files: &HashMap>, + fmt: &String, + k: Option<&String>, + spec_fn: &PathBuf, + fp: bool, +) -> String { + let mut out: Vec = Vec::new(); + if fmt == "paired" { + for row in iter_file_rows(files) { + let mut t: Vec = Vec::new(); + for (_key, i) in row { + let val = if let Some(key) = k { + let mut attr = match key.as_str() { + "url" => maybe_full(&i.url, &i.urltype, spec_fn, fp), + _ => String::new(), + }; + if key != &"url".to_string() { + attr = match key.as_str() { + "file_id" => i.file_id.clone(), + "filename" => i.filename.clone(), + "filetype" => i.filetype.clone(), + "filesize" => i.filesize.to_string(), + "urltype" => i.urltype.clone(), + "md5" => i.md5.clone(), + _ => String::new(), + }; + } + attr + } else { + i.filename.clone() + }; + t.push(val); + } + out.push(t.join("\t")); + } + } else if fmt == "interleaved" || fmt == "list" { + for row in iter_file_rows(files) { + for (_key, i) in row { + let id = if let Some(key) = k { + let mut attr = match key.as_str() { + 
"url" => maybe_full(&i.url, &i.urltype, spec_fn, fp), + _ => String::new(), + }; + if key != &"url".to_string() { + attr = match key.as_str() { + "file_id" => i.file_id.clone(), + "filename" => i.filename.clone(), + "filetype" => i.filetype.clone(), + "filesize" => i.filesize.to_string(), + "urltype" => i.urltype.clone(), + "md5" => i.md5.clone(), + _ => String::new(), + }; + } + attr + } else { + i.filename.clone() + }; + out.push(id); + } + } + } else if fmt == "index" { + let mut t: Vec = Vec::new(); + for row in iter_file_rows(files) { + for (_key, i) in row { + let id = if let Some(key) = k { + let mut attr = match key.as_str() { + "url" => maybe_full(&i.url, &i.urltype, spec_fn, fp), + _ => String::new(), + }; + if key != &"url".to_string() { + attr = match key.as_str() { + "file_id" => i.file_id.clone(), + "filename" => i.filename.clone(), + "filetype" => i.filetype.clone(), + "filesize" => i.filesize.to_string(), + "urltype" => i.urltype.clone(), + "md5" => i.md5.clone(), + _ => String::new(), + }; + } + attr + } else { + i.filename.clone() + }; + t.push(id); + } + } + out.push(t.join(",")); + } + out.join("\n") +} + +fn ordered_file_columns<'a>( + files: &'a HashMap>, +) -> Vec<(&'a String, &'a Vec)> { + let mut columns: Vec<(&String, &Vec)> = files.iter().collect(); + columns.sort_by(|(left, _), (right, _)| left.cmp(right)); + columns +} + +fn iter_file_rows<'a>(files: &'a HashMap>) -> Vec> { + let columns = ordered_file_columns(files); + let row_count = columns + .iter() + .map(|(_, items)| items.len()) + .min() + .unwrap_or(0); + let mut rows = Vec::new(); + for row_idx in 0..row_count { + let mut row = Vec::new(); + for (key, items) in &columns { + row.push((*key, &items[row_idx])); + } + rows.push(row); + } + rows +} + +fn list_files_by_read_id( + spec: &Assay, + modality: &String, + read_ids: &Vec, +) -> HashMap> { + let mut files: HashMap> = HashMap::new(); + let ids: HashSet = read_ids.iter().cloned().collect(); + for read in 
spec.get_seqspec(modality) { + if ids.contains(&read.read_id) && !read.files.is_empty() { + files.entry(read.read_id).or_default().extend(read.files); + } + } + files +} + +fn list_files_by_file_id( + spec: &Assay, + modality: &String, + file_ids: &Vec, +) -> HashMap> { + let mut files: HashMap> = HashMap::new(); + let ids: HashSet = file_ids.iter().cloned().collect(); + for read in spec.get_seqspec(modality) { + for file in read.files { + if ids.contains(&file.filename) { + files.entry(read.read_id.clone()).or_default().push(file); + } + } + } + files +} + +fn list_files_by_region_id( + spec: &Assay, + modality: &String, + region_ids: &Vec, +) -> HashMap> { + let files = list_region_files(spec, modality); + let ids: HashSet = region_ids.iter().cloned().collect(); + let mut new_files: HashMap> = HashMap::new(); + for (region_id, region_files) in files { + if ids.contains(®ion_id) { + new_files.entry(region_id).or_default().extend(region_files); + } + } + new_files +} + +fn list_files_by_region_type( + spec: &Assay, + modality: &String, + region_types: &Vec, +) -> HashMap> { + let files = list_region_files(spec, modality); + let ids: HashSet = region_types.iter().cloned().collect(); + let mut new_files: HashMap> = HashMap::new(); + for (region_id, region_files) in files { + let m = spec.get_libspec(modality).unwrap(); + let regions = m.get_region_by_id(®ion_id); + let r = regions.first().unwrap().clone(); + if ids.contains(&r.region_type) { + new_files.entry(region_id).or_default().extend(region_files); + } + } + new_files +} + +fn maybe_full(url: &String, urltype: &String, spec_fn: &PathBuf, fp: bool) -> String { + if urltype == "local" && fp { + full_path(spec_fn, url) + } else { + url.clone() + } +} + +fn full_path(spec_fn: &PathBuf, url: &String) -> String { + let parent: PathBuf = spec_fn + .parent() + .map(|p| p.to_path_buf()) + .unwrap_or_else(|| PathBuf::from(".")); + parent.join(url).to_string_lossy().to_string() +} + +#[cfg(test)] +mod tests { + use 
super::*; + use crate::utils::load_spec; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_seqspec_file_read_selector() { + let spec = dogma_spec(); + let files = seqspec_file(&spec, &"rna".into(), None, &"read".into()); + assert!(!files.is_empty()); + // Each key should be a read_id + for (read_id, file_list) in &files { + assert!(!read_id.is_empty()); + assert!(!file_list.is_empty()); + } + } + + #[test] + fn test_seqspec_file_region_selector() { + let spec = dogma_spec(); + let files = seqspec_file(&spec, &"rna".into(), None, &"region".into()); + // Region files come from onlists, which the rna modality should have + assert!(!files.is_empty()); + } + + #[test] + fn test_seqspec_file_all_selector() { + let spec = dogma_spec(); + let all_files = seqspec_file(&spec, &"rna".into(), None, &"file".into()); + let read_files = seqspec_file(&spec, &"rna".into(), None, &"read".into()); + let region_files = seqspec_file(&spec, &"rna".into(), None, &"region".into()); + // "file" selector returns union of read and region files + assert!(all_files.len() >= read_files.len()); + assert!(all_files.len() >= region_files.len()); + } + + #[test] + fn test_seqspec_file_by_read_id() { + let spec = dogma_spec(); + let rna_reads = spec.get_seqspec("rna"); + let read_id = rna_reads[0].read_id.clone(); + let ids = vec![read_id.clone()]; + let files = seqspec_file(&spec, &"rna".into(), Some(&ids), &"read".into()); + assert!(files.contains_key(&read_id)); + } + + #[test] + fn test_full_path() { + let spec_fn = PathBuf::from("/data/specs/spec.yaml"); + let url = "barcodes.txt".to_string(); + let result = full_path(&spec_fn, &url); + assert_eq!(result, "/data/specs/barcodes.txt"); + } + + #[test] + fn test_maybe_full_local() { + let spec_fn = PathBuf::from("/data/specs/spec.yaml"); + let url = "file.txt".to_string(); + let result = maybe_full(&url, &"local".to_string(), &spec_fn, true); + assert_eq!(result, 
"/data/specs/file.txt"); + } + + #[test] + fn test_maybe_full_remote() { + let spec_fn = PathBuf::from("/data/specs/spec.yaml"); + let url = "http://example.com/file.txt".to_string(); + let result = maybe_full(&url, &"http".to_string(), &spec_fn, true); + assert_eq!(result, "http://example.com/file.txt"); + } + + #[test] + fn test_format_list_files_paired_uses_row_major_order() { + let mut files = HashMap::new(); + files.insert( + "rna_R1".to_string(), + vec![ + File::new( + "r1_a".into(), + "r1_a.fastq.gz".into(), + "fastq".into(), + 0, + "r1_a.fastq.gz".into(), + "local".into(), + "".into(), + ), + File::new( + "r1_b".into(), + "r1_b.fastq.gz".into(), + "fastq".into(), + 0, + "r1_b.fastq.gz".into(), + "local".into(), + "".into(), + ), + ], + ); + files.insert( + "rna_R2".to_string(), + vec![ + File::new( + "r2_a".into(), + "r2_a.fastq.gz".into(), + "fastq".into(), + 0, + "r2_a.fastq.gz".into(), + "local".into(), + "".into(), + ), + File::new( + "r2_b".into(), + "r2_b.fastq.gz".into(), + "fastq".into(), + 0, + "r2_b.fastq.gz".into(), + "local".into(), + "".into(), + ), + ], + ); + + let rendered = format_list_files( + &files, + &"paired".to_string(), + Some(&"filename".to_string()), + &PathBuf::from("spec.yaml"), + false, + ); + assert_eq!( + rendered, + "r1_a.fastq.gz\tr2_a.fastq.gz\nr1_b.fastq.gz\tr2_b.fastq.gz" + ); + } + + #[test] + fn test_format_list_files_metadata_keeps_raw_local_url() { + let mut files = HashMap::new(); + files.insert( + "rna_R1".to_string(), + vec![File::new( + "r1".into(), + "r1.fastq.gz".into(), + "fastq".into(), + 0, + "relative/r1.fastq.gz".into(), + "local".into(), + "".into(), + )], + ); + + let rendered = format_list_files_metadata( + &files, + &"url".to_string(), + &PathBuf::from("/tmp/spec.yaml"), + true, + ); + assert_eq!(rendered, "rna_R1\tr1\trelative/r1.fastq.gz"); + } +} diff --git a/src/seqspec_find.rs b/src/seqspec_find.rs new file mode 100644 index 00000000..7c98d921 --- /dev/null +++ b/src/seqspec_find.rs @@ -0,0 +1,227 
@@ +use crate::seqspec_file::seqspec_file as seqspec_file_lookup; +use crate::utils; +use std::fs; +use std::io::Write; + +use crate::models::assay::Assay; +use crate::models::file::File; +use crate::models::read::Read; +use crate::models::region::Region; +use clap::Args; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct FindArgs { + #[clap(short, long, help = "Output file path", value_name = "OUT")] + pub output: Option, + + #[clap(help = "Sequencing specification yaml file", required = true)] + pub yaml: PathBuf, + + #[clap( + short, + long, + help = "Selector", + value_name = "SELECTOR", + default_value = "region", + value_parser = ["read", "region", "file", "region-type"] + )] + pub selector: String, + + #[clap(long, hide = true)] + pub rtype: bool, + + #[clap( + short, + long, + help = "Modality", + value_name = "MODALITY", + required = true + )] + pub modality: String, + + #[clap(short, long, help = "ID", value_name = "ID")] + pub id: Option, +} + +pub fn validate_find_args(args: &FindArgs) -> () { + if !args.yaml.exists() { + eprintln!("Please use `seqspec find -h` for help."); + std::process::exit(1); + } + if args.selector.is_empty() { + eprintln!("Please use `seqspec find -h` for help."); + std::process::exit(1); + } +} + +pub fn run_find(args: &FindArgs) { + validate_find_args(args); + let spec = utils::load_spec(&args.yaml); + + let found = seqspec_find(&spec, &args.selector, &args.modality, args.id.as_deref()); + let yaml_str = match found { + FindResult::Reads(v) => serde_yaml::to_string(&v).unwrap(), + FindResult::Regions(v) => serde_yaml::to_string(&v).unwrap(), + FindResult::Files(v) => serde_yaml::to_string(&v).unwrap(), + }; + // write to output + if let Some(output) = &args.output { + let mut file = fs::File::create(output).unwrap(); + writeln!(file, "{}", yaml_str).unwrap(); + } else { + println!("{}", yaml_str); + } +} + +pub fn find_by_region_type(spec: &Assay, modality: &str, region_type: 
&str) -> Vec { + let m = spec.get_libspec(modality); + m.unwrap().get_region_by_region_type(region_type) +} + +pub fn find_by_region_id(spec: &Assay, modality: &str, region_id: &str) -> Vec { + let m = spec.get_libspec(modality); + match m { + Some(m) => m.get_region_by_id(region_id), + None => Vec::new(), + } +} + +pub fn find_by_file_id(spec: &Assay, modality: &str, file_id: &str) -> Vec { + let selector = "file".to_string(); + seqspec_file_lookup(spec, &modality.to_string(), None, &selector) + .into_values() + .flatten() + .filter(|f| f.file_id == file_id) + .collect() +} + +pub fn find_by_read_id(spec: &Assay, modality: &str, read_id: &str) -> Vec { + let m = spec.get_seqspec(modality); + m.iter().filter(|r| r.read_id == read_id).cloned().collect() +} + +#[derive(Debug, Serialize, Deserialize)] +pub enum FindResult { + Regions(Vec), + Reads(Vec), + Files(Vec), +} + +pub fn seqspec_find(spec: &Assay, selector: &str, modality: &str, id: Option<&str>) -> FindResult { + let Some(id) = id else { + return match selector { + "read" => FindResult::Reads(Vec::new()), + "region" | "region-type" => FindResult::Regions(Vec::new()), + "file" => FindResult::Files(Vec::new()), + _ => panic!("Invalid selector: {}", selector), + }; + }; + match selector { + "read" => FindResult::Reads(find_by_read_id(spec, modality, id)), + "region" => FindResult::Regions(find_by_region_id(spec, modality, id)), + "file" => FindResult::Files(find_by_file_id(spec, modality, id)), + "region-type" => FindResult::Regions(find_by_region_type(spec, modality, id)), + _ => panic!("Invalid selector: {}", selector), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + use std::path::PathBuf; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_find_by_region_type() { + let spec = dogma_spec(); + let barcodes = find_by_region_type(&spec, "rna", "barcode"); + assert_eq!(barcodes.len(), 1); + 
/// `seqspec_find` must route each selector to the matching `FindResult` variant.
#[test]
fn test_seqspec_find_dispatches() {
    let spec = dogma_spec();

    // "region-type" selector -> Regions variant holding the RNA cell barcode.
    let FindResult::Regions(regions) = seqspec_find(&spec, "region-type", "rna", Some("barcode"))
    else {
        panic!("Expected Regions variant");
    };
    assert_eq!(regions.len(), 1);
    assert_eq!(regions[0].region_id, "rna_cell_bc");

    // "read" selector -> Reads variant holding the requested read.
    let FindResult::Reads(reads) = seqspec_find(&spec, "read", "rna", Some("rna_R1")) else {
        panic!("Expected Reads variant");
    };
    assert_eq!(reads.len(), 1);
    assert_eq!(reads[0].read_id, "rna_R1");
}
#[test] + fn test_seqspec_find_returns_empty_without_id() { + let spec = dogma_spec(); + let result = seqspec_find(&spec, "region", "rna", None); + match result { + FindResult::Regions(v) => assert!(v.is_empty()), + _ => panic!("Expected Regions variant"), + } + } +} diff --git a/src/seqspec_format.rs b/src/seqspec_format.rs new file mode 100644 index 00000000..280afcb0 --- /dev/null +++ b/src/seqspec_format.rs @@ -0,0 +1,170 @@ +use crate::utils; +use std::fs::File; +use std::io::Write; +use std::path::PathBuf; + +use crate::assay::Assay; +use clap::Args; + +#[derive(Debug, Args)] +pub struct FormatArgs { + #[clap(short, long, help = "Output file path", value_name = "OUT")] + output: Option, + + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, +} + +pub fn validate_format_args(args: &FormatArgs) -> () { + // just call the runner and print any error nicely + if !args.yaml.exists() { + eprintln!("Please use `seqspec format -h` for help."); + std::process::exit(1); + } +} + +pub fn run_format(args: &FormatArgs) { + validate_format_args(args); + let spec = &mut utils::load_spec(&args.yaml); + let output = &args.output; + seqspec_format(spec); + if let Some(output) = output { + let mut file = File::create(output).unwrap(); + writeln!(file, "{}", serde_yaml::to_string(&spec).unwrap()).unwrap(); + } else { + println!("{}", serde_yaml::to_string(&spec).unwrap()); + } +} + +fn seqspec_format(spec: &mut Assay) -> () { + spec.update_spec(); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::models::assay::Assay; + use crate::models::region::Region; + use crate::utils::load_spec; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + fn leaf(id: &str, seq: &str, stype: &str, len: i64) -> Region { + Region::new( + id.into(), + "barcode".into(), + id.into(), + stype.into(), + seq.into(), + len, + len, + None, + vec![], + ) + } + + fn make_joined(children: Vec) -> Assay { + let parent = 
/// Formatting recomputes a joined parent's lengths and sequence from its leaves.
#[test]
fn test_format_updates_joined_parent() {
    let barcode = leaf("bc", "ATCG", "fixed", 4);
    let umi = leaf("umi", "AACCGG", "fixed", 6);
    let mut assay = make_joined(vec![barcode, umi]);

    seqspec_format(&mut assay);

    let library = assay.get_libspec("rna").unwrap();
    // 4 bp barcode + 6 bp UMI.
    assert_eq!(library.min_len, 10);
    assert_eq!(library.max_len, 10);
    assert_eq!(library.sequence, "ATCGAACCGG");
}
/// One protocol or kit entry shown in a modality's metadata tables.
///
/// The same row type serves both protocols and kits; any field left as `None`
/// is omitted from the serialized output.
#[derive(Debug, Serialize)]
pub struct MetadataRow {
    /// Set by the protocol row builders; the kit row builders leave it `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    protocol_id: Option<String>,
    /// Set by the kit row builders; the protocol row builders leave it `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    kit_id: Option<String>,
    /// Display name; kit entries pass their own optional name straight through.
    #[serde(skip_serializing_if = "Option::is_none")]
    name: Option<String>,
}
Option, + path_region_ids: Vec, + path_names: Vec, + is_leaf: bool, + child_region_ids: Vec, + onlist: Option, +} + +#[derive(Clone, Debug, Serialize)] +pub struct OnlistView { + file_id: String, + filename: String, + filetype: String, + filesize: i64, + url: String, + urltype: String, + md5: String, +} + +#[derive(Debug, Serialize)] +pub struct ReadView { + read_id: String, + name: String, + label: String, + primer_id: String, + min_len: i64, + max_len: i64, + strand: String, + start: i64, + end: i64, + files: Vec, +} + +#[derive(Debug, Serialize)] +pub struct FileView { + file_id: String, + filename: String, + filetype: String, + filesize: i64, + url: String, + urltype: String, + md5: String, +} + +pub fn render_seqspec_html(spec: &Assay) -> Result { + let payload = build_seqspec_view_data(spec)?; + let payload_json = + escape_script_json(&serde_json::to_string(&payload).map_err(|e| e.to_string())?); + let repository_json = + serde_json::to_string(env!("CARGO_PKG_REPOSITORY")).map_err(|e| e.to_string())?; + let version_json = + serde_json::to_string(env!("CARGO_PKG_VERSION")).map_err(|e| e.to_string())?; + + Ok(TEMPLATE_HTML + .replace("__STYLE__", STYLE_CSS) + .replace("__APP__", APP_JS) + .replace("__DATA__", &payload_json) + .replace("__REPOSITORY__", &repository_json) + .replace("__TOOL_VERSION__", &version_json)) +} + +pub fn build_seqspec_view_data(spec: &Assay) -> Result { + let mut modalities = Vec::new(); + for modality in &spec.modalities { + modalities.push(build_modality_view(spec, modality)?); + } + + Ok(SeqspecViewData { + assay_id: spec.assay_id.clone(), + assay_name: spec.name.clone(), + seqspec_version: spec.seqspec_version.clone(), + doi: spec.doi.clone(), + date: spec.date.clone(), + description: spec.description.clone(), + lib_struct: spec.lib_struct.clone(), + modalities, + }) +} + +fn build_modality_view(spec: &Assay, modality: &str) -> Result { + let libspec = spec + .get_libspec(modality) + .ok_or_else(|| format!("modality '{}' not found 
/// Build the region views for the children of `libspec`.
///
/// Delegates to `walk_regions`, starting at depth 0 and base-pair offset 0
/// with no parent and empty ancestor paths; `libspec` itself gets no node.
/// Returns `walk_regions`' tuple unchanged: every visited node, the leaf
/// nodes only, and the base-pair offset reached after the last region.
fn region_views(libspec: &Region) -> (Vec<RegionView>, Vec<RegionView>, i64) {
    walk_regions(&libspec.regions, 0, 0, None, Vec::new(), Vec::new())
}
current_bp, + ); + region_nodes.push(node); + region_nodes.extend(child_nodes); + leaf_regions.extend(child_leaves); + } + } + + (region_nodes, leaf_regions, current_bp) +} + +fn build_region_node( + region: &Region, + depth: usize, + parent_region_id: Option, + path_region_ids: Vec, + path_names: Vec, + start: i64, + end: i64, +) -> RegionView { + RegionView { + region_id: region.region_id.clone(), + region_type: region.region_type.clone(), + name: region.name.clone(), + sequence_type: region.sequence_type.clone(), + sequence: region.sequence.clone(), + min_len: region.min_len, + max_len: region.max_len, + len: end - start, + bp_start: start, + bp_end: end, + depth, + parent_region_id, + path_region_ids, + path_names, + is_leaf: region.regions.is_empty(), + child_region_ids: region + .regions + .iter() + .map(|child| child.region_id.clone()) + .collect(), + onlist: onlist_view(region.onlist.clone()), + } +} + +fn project_read(libspec: &Region, read: &Read) -> Result { + let leaves = libspec.get_leaves_with_region_id(&read.primer_id); + let primer_index = leaves + .iter() + .position(|leaf| leaf.region_id == read.primer_id) + .ok_or_else(|| { + format!( + "primer_id '{}' not found in library '{}'", + read.primer_id, libspec.region_id + ) + })?; + let cuts = utils::project_regions_to_coordinates(leaves); + let primer = &cuts[primer_index]; + + let (start, end) = if read.strand == "pos" { + let start = primer.stop; + (start, start + read.max_len) + } else { + let end = primer.start; + (end - read.max_len, end) + }; + + Ok(ReadView { + read_id: read.read_id.clone(), + name: read.name.clone(), + label: read.name.clone(), + primer_id: read.primer_id.clone(), + min_len: read.min_len, + max_len: read.max_len, + strand: read.strand.clone(), + start, + end, + files: read.files.iter().map(file_view).collect(), + }) +} + +fn onlist_view(onlist: Option) -> Option { + onlist.map(|onlist| OnlistView { + file_id: onlist.file_id, + filename: onlist.filename, + filetype: 
onlist.filetype, + filesize: onlist.filesize, + url: onlist.url, + urltype: onlist.urltype, + md5: onlist.md5, + }) +} + +fn file_view(file: &File) -> FileView { + FileView { + file_id: file.file_id.clone(), + filename: file.filename.clone(), + filetype: file.filetype.clone(), + filesize: file.filesize, + url: file.url.clone(), + urltype: file.urltype.clone(), + md5: file.md5.clone(), + } +} + +fn seq_protocol_rows(entries: Option<&Vec>, modality: &str) -> Vec { + entries + .map(|entries| { + entries + .iter() + .filter(|entry| entry.modality == modality) + .map(|entry| MetadataRow { + protocol_id: Some(entry.protocol_id.clone()), + kit_id: None, + name: Some(entry.name.clone()), + }) + .collect() + }) + .unwrap_or_default() +} + +fn seq_kit_rows(entries: Option<&Vec>, modality: &str) -> Vec { + entries + .map(|entries| { + entries + .iter() + .filter(|entry| entry.modality == modality) + .map(|entry| MetadataRow { + protocol_id: None, + kit_id: Some(entry.kit_id.clone()), + name: entry.name.clone(), + }) + .collect() + }) + .unwrap_or_default() +} + +fn lib_protocol_rows(entries: Option<&Vec>, modality: &str) -> Vec { + entries + .map(|entries| { + entries + .iter() + .filter(|entry| entry.modality == modality) + .map(|entry| MetadataRow { + protocol_id: Some(entry.protocol_id.clone()), + kit_id: None, + name: Some(entry.name.clone()), + }) + .collect() + }) + .unwrap_or_default() +} + +fn lib_kit_rows(entries: Option<&Vec>, modality: &str) -> Vec { + entries + .map(|entries| { + entries + .iter() + .filter(|entry| entry.modality == modality) + .map(|entry| MetadataRow { + protocol_id: None, + kit_id: Some(entry.kit_id.clone()), + name: entry.name.clone(), + }) + .collect() + }) + .unwrap_or_default() +} + +fn escape_script_json(value: &str) -> String { + value.replace(" Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + fn nested_spec() -> Assay { + let fixed_a = Region::new( + "fixed_a".into(), + "linker".into(), + "fixed a".into(), + 
"fixed".into(), + "AAA".into(), + 3, + 3, + None, + vec![], + ); + let fixed_t = Region::new( + "fixed_t".into(), + "linker".into(), + "fixed t".into(), + "fixed".into(), + "T".into(), + 1, + 1, + None, + vec![], + ); + let joined_block = Region::new( + "joined_block".into(), + "named".into(), + "joined block".into(), + "joined".into(), + "AAAT".into(), + 4, + 4, + None, + vec![fixed_a, fixed_t], + ); + let umi = Region::new( + "umi".into(), + "umi".into(), + "umi".into(), + "random".into(), + "XX".into(), + 2, + 2, + None, + vec![], + ); + let libspec = Region::new( + "rna".into(), + "rna".into(), + "rna".into(), + "joined".into(), + "AAATXX".into(), + 6, + 6, + None, + vec![joined_block, umi], + ); + let read = Read::new( + "rna_R1".into(), + "Read 1".into(), + "rna".into(), + "joined_block".into(), + 2, + 2, + "pos".into(), + vec![], + ); + Assay::new( + "nested-assay".into(), + "Nested Assay".into(), + "".into(), + "2026-03-24".into(), + "nested regions".into(), + vec!["rna".into()], + "".into(), + vec![read], + vec![libspec], + None, + None, + None, + None, + Some("0.4.0".into()), + ) + } + + #[test] + fn test_build_seqspec_view_data_contains_modalities() { + let payload = build_seqspec_view_data(&dogma_spec()).unwrap(); + assert_eq!(payload.assay_id, "DOGMAseq-DIG"); + assert_eq!(payload.modalities.len(), 4); + assert!(payload + .modalities + .iter() + .any(|modality| modality.modality == "rna")); + } + + #[test] + fn test_build_seqspec_view_data_projects_reads() { + let payload = build_seqspec_view_data(&dogma_spec()).unwrap(); + let rna = payload + .modalities + .iter() + .find(|modality| modality.modality == "rna") + .unwrap(); + let read = rna + .reads + .iter() + .find(|read| read.read_id == "rna_R2") + .unwrap(); + assert_eq!(read.strand, "neg"); + assert!(read.start < read.end); + assert_eq!(read.files.len(), 1); + } + + #[test] + fn test_render_seqspec_html_contains_payload() { + let html = render_seqspec_html(&dogma_spec()).unwrap(); + 
/// A read whose primer is a non-leaf region is projected from the end of
/// that whole region.
#[test]
fn test_build_seqspec_view_data_projects_reads_from_parent_region() {
    let view = build_seqspec_view_data(&nested_spec()).unwrap();
    let first_modality = &view.modalities[0];
    let projected = &first_modality.reads[0];

    assert_eq!(projected.primer_id, "joined_block");
    // `joined_block` covers 4 bp; the 2 bp positive-strand read follows it.
    assert_eq!(projected.start, 4);
    assert_eq!(projected.end, 6);
}
#[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, + + #[clap( + short, + long, + help = "Tool", + value_name = "TOOL", + value_parser = ["chromap", "kb", "kb-single", "relative", "seqkit", "simpleaf", "starsolo", "splitcode", "tab", "zumis"], + default_value = "tab", + )] + tool: String, + + #[clap( + short, + long, + help = "Selector", + value_name = "SELECTOR", + value_parser = ["read", "region", "file"], + default_value = "read", + )] + selector: String, + + #[clap( + short, + long, + help = "Modality", + value_name = "MODALITY", + required = true + )] + modality: String, + + #[clap( + short, + long, + help = "IDs (comma-separated)", + value_name = "IDS", + value_delimiter = ',' + )] + ids: Option>, + + #[clap(long, help = "Returns 3'->5' region order", default_value = "false")] + rev: bool, + + #[clap(long, hide = true, value_name = "SUBREGIONTYPE")] + subregion_type: Option, + + #[clap( + long, + help = "Disable overlap (default: False)", + default_value = "false" + )] + no_overlap: bool, +} + +pub fn validate_index_args(args: &IndexArgs) -> () { + if !args.yaml.exists() { + eprintln!("Please use `seqspec index -h` for help."); + std::process::exit(1); + } + if args.modality.is_empty() { + eprintln!("Please use `seqspec index -h` for help."); + std::process::exit(1); + } + if args.selector.is_empty() { + eprintln!("Please use `seqspec index -h` for help."); + std::process::exit(1); + } +} + +pub fn run_index(args: &IndexArgs) { + validate_index_args(args); + let spec = utils::load_spec(&args.yaml); + + let ids = args.ids.as_ref().unwrap_or(&Vec::new()).clone(); + + let mut index = seqspec_index(&spec, &args.modality, &ids, &args.selector, &args.rev); + if args.no_overlap { + index = filter_index_no_overlap(index); + } + let fmt = format_index(&index, &args.tool, &args.subregion_type); + // write to output + if let Some(output) = &args.output { + let mut file = fs::File::create(output).unwrap(); + writeln!(file, "{}", 
fmt).unwrap(); + } else { + println!("{}", fmt); + } +} + +pub fn seqspec_index( + spec: &Assay, + modality: &String, + ids: &Vec, + idtype: &String, + _rev: &bool, +) -> Vec { + match (idtype.as_str(), ids.is_empty()) { + ("file", true) => get_index_by_files(spec, modality), + ("read", true) => get_index_by_reads(spec, modality), + ("region", true) => get_index_by_regions(spec, modality), + ("file", false) => get_index_by_file_ids(spec, modality, ids), + ("region", false) => get_index_by_region_ids(spec, modality, ids), + ("read", false) => get_index_by_read_ids(spec, modality, ids), + _ => Vec::new(), + } +} + +pub fn format_index( + index: &Vec, + tool: &String, + subregion_type: &Option, +) -> String { + match tool.as_str() { + "chromap" => format_chromap(index), + "kb" => format_kallisto_bus(index), + "kb-single" => format_kallisto_bus_force_single(index), + "relative" => format_relative(index), + "seqkit" => format_seqkit_subseq(index, subregion_type.as_deref()), + "simpleaf" => format_simpleaf(index), + "starsolo" => format_starsolo(index), + "splitcode" => format_splitcode(index), + "tab" => format_tab(index), + "zumis" => format_zumis(index), + _ => String::new(), + } +} + +// Declarations (implemented below) +fn get_index_by_files(spec: &Assay, modality: &String) -> Vec { + let mut all_files: Vec = Vec::new(); + for r in spec.get_seqspec(modality) { + for f in r.files { + all_files.push(f); + } + } + let file_ids: Vec = all_files.into_iter().map(|f| f.file_id).collect(); + get_index_by_file_ids(spec, modality, &file_ids) +} +fn get_index_by_reads(spec: &Assay, modality: &String) -> Vec { + let read_ids: Vec = spec + .get_seqspec(modality) + .into_iter() + .map(|r| r.read_id) + .collect(); + get_index_by_read_ids(spec, modality, &read_ids) +} +fn get_index_by_regions(spec: &Assay, modality: &String) -> Vec { + let rgn = spec.get_libspec(modality).expect("Modality not found"); + get_index_by_region_ids(spec, modality, &vec![rgn.region_id]) +} +fn 
get_index_by_file_ids( + spec: &Assay, + modality: &String, + file_ids: &Vec, +) -> Vec { + let files_map = list_files_by_file_id(spec, modality, file_ids); + let mut indices: Vec = Vec::new(); + for (read_id, files) in files_map { + let mut coord = get_coordinate_by_read_id(spec, modality, &read_id); + if let Some(first) = files.first() { + coord.query_id = first.file_id.clone(); + coord.query_name = first.filename.clone(); + coord.query_type = "File".to_string(); + } + indices.push(coord); + } + indices +} +fn get_index_by_region_ids( + spec: &Assay, + modality: &String, + region_ids: &Vec, +) -> Vec { + let mut indices: Vec = Vec::new(); + for id in region_ids { + let coord = get_coordinate_by_region_id(spec, modality, id); + indices.push(coord); + } + indices +} +fn get_index_by_read_ids( + spec: &Assay, + modality: &String, + read_ids: &Vec, +) -> Vec { + let mut indices: Vec = Vec::new(); + for id in read_ids { + let coord = get_coordinate_by_read_id(spec, modality, id); + indices.push(coord); + } + indices +} +fn get_coordinate_by_region_id(spec: &Assay, modality: &String, region_id: &str) -> Coordinate { + let regions = find_by_region_id(spec, modality, region_id); + let rgn = regions.first().expect("Region not found").clone(); + let leaves: Vec = rgn.get_leaves(); + let cuts: Vec = utils::project_regions_to_coordinates(leaves); + Coordinate { + query_id: rgn.region_id, + query_name: rgn.name, + query_type: "Region".to_string(), + rcv: cuts, + strand: "pos".to_string(), + } +} +fn get_coordinate_by_read_id(spec: &Assay, modality: &String, read_id: &str) -> Coordinate { + let (read, rgns) = + utils::map_read_id_to_regions(spec, modality, read_id).expect("read mapping failed"); + let rcs: Vec = utils::project_regions_to_coordinates(rgns); + let new_rcs: Vec = utils::itx_read(rcs, 0, read.max_len); + Coordinate { + query_id: read.read_id, + query_name: read.name, + query_type: "Read".to_string(), + rcv: new_rcs, + strand: read.strand, + } +} +fn 
filter_index_no_overlap(mut indices: Vec) -> Vec { + let mut seen: HashSet = HashSet::new(); + for idx in &mut indices { + let mut new_rcv: Vec = Vec::new(); + for rgn in idx.rcv.iter() { + let rid = rgn.region.region_id.clone(); + if !seen.contains(&rid) { + new_rcv.push(rgn.clone()); + seen.insert(rid); + } + } + idx.rcv = new_rcv; + } + indices +} +fn format_kallisto_bus(indices: &Vec) -> String { + let mut bcs: Vec = Vec::new(); + let mut umi: Vec = Vec::new(); + let mut feature: Vec = Vec::new(); + for (idx, obj) in indices.iter().enumerate() { + for cut in &obj.rcv { + let rt = cut.region.region_type.to_uppercase(); + if rt == "BARCODE" { + bcs.push(format!("{},{}{}{}", idx, cut.start, ",", cut.stop)); + } else if rt == "UMI" { + umi.push(format!("{},{}{}{}", idx, cut.start, ",", cut.stop)); + } else if matches!( + rt.as_str(), + "CDNA" | "GDNA" | "PROTEIN" | "TAG" | "SGRNA_TARGET" + ) { + feature.push(format!("{},{}{}{}", idx, cut.start, ",", cut.stop)); + } + } + } + if umi.is_empty() { + umi.push("-1,-1,-1".to_string()); + } + if bcs.is_empty() { + bcs.push("-1,-1,-1".to_string()); + } + format!("{}:{}:{}", bcs.join(","), umi.join(","), feature.join(",")) +} +fn format_kallisto_bus_force_single(indices: &Vec) -> String { + let mut bcs: Vec = Vec::new(); + let mut umi: Vec = Vec::new(); + let mut feature: Vec = Vec::new(); + let mut longest_feature: Option = None; + let mut max_length: i64 = 0; + for (idx, coord) in indices.iter().enumerate() { + for cut in &coord.rcv { + let rt = cut.region.region_type.to_uppercase(); + if rt == "BARCODE" { + bcs.push(format!("{},{}{}{}", idx, cut.start, ",", cut.stop)); + } else if rt == "UMI" { + umi.push(format!("{},{}{}{}", idx, cut.start, ",", cut.stop)); + } else if matches!( + rt.as_str(), + "CDNA" | "GDNA" | "PROTEIN" | "TAG" | "SGRNA_TARGET" + ) { + let length = cut.stop - cut.start; + if length > max_length { + max_length = length; + longest_feature = Some(format!("{},{}{}{}", idx, cut.start, ",", cut.stop)); + } 
+ } + } + } + if umi.is_empty() { + umi.push("-1,-1,-1".to_string()); + } + if bcs.is_empty() { + bcs.push("-1,-1,-1".to_string()); + } + if let Some(lf) = longest_feature { + feature.push(lf); + } + format!("{}:{}:{}", bcs.join(","), umi.join(","), feature.join(",")) +} +fn format_seqkit_subseq(indices: &Vec, subregion_type: Option<&str>) -> String { + if indices.is_empty() { + return String::new(); + } + let coord = &indices[0]; + let mut x = String::new(); + if let Some(srt) = subregion_type { + for cut in &coord.rcv { + if cut.region.region_type == srt { + x = format!("{}:{}\n", cut.start + 1, cut.stop); + } + } + } + x +} +fn format_tab(indices: &Vec) -> String { + let mut x = String::new(); + for coord in indices { + for cut in &coord.rcv { + x.push_str(&format!( + "{}\t{}\t{}\t{}\t{}\n", + coord.query_id, cut.region.name, cut.region.region_type, cut.start, cut.stop + )); + } + } + if x.ends_with('\n') { + x.pop(); + } + x +} +fn format_starsolo(indices: &Vec) -> String { + let mut bcs: Vec = Vec::new(); + let mut umi: Vec = Vec::new(); + for coord in indices { + for cut in &coord.rcv { + let rt = cut.region.region_type.to_uppercase(); + if rt == "BARCODE" { + bcs.push(format!( + "--soloCBstart {} --soloCBlen {}", + cut.start + 1, + cut.stop - cut.start + )); + } else if rt == "UMI" { + umi.push(format!( + "--soloUMIstart {} --soloUMIlen {}", + cut.start + 1, + cut.stop - cut.start + )); + } + } + } + if let (Some(bc), Some(u)) = (bcs.first(), umi.first()) { + format!("--soloType CB_UMI_Simple {} {}", bc, u) + } else { + String::new() + } +} +fn format_simpleaf(indices: &Vec) -> String { + let mut xl: Vec = Vec::new(); + for (idx, coord) in indices.iter().enumerate() { + let mut x = format!("{}{{", idx + 1); + for cut in &coord.rcv { + let rt = cut.region.region_type.to_uppercase(); + let len = cut.stop - cut.start; + if rt == "BARCODE" { + x.push_str(&format!("b[{}]", len)); + } else if rt == "UMI" { + x.push_str(&format!("u[{}]", len)); + } else if rt == 
"CDNA" { + x.push_str(&format!("r[{}]", len)); + } + } + x.push_str("x:}"); + xl.push(x); + } + xl.join("") +} +fn format_zumis(indices: &Vec) -> String { + let mut xl: Vec = Vec::new(); + for coord in indices { + let mut x = String::new(); + for cut in &coord.rcv { + let rt = cut.region.region_type.to_uppercase(); + if rt == "BARCODE" { + x.push_str(&format!("- BCS({}-{})\n", cut.start + 1, cut.stop)); + } else if rt == "UMI" { + x.push_str(&format!("- UMI({}-{})\n", cut.start + 1, cut.stop)); + } else if rt == "CDNA" { + x.push_str(&format!("- cDNA({}-{})\n", cut.start + 1, cut.stop)); + } + } + xl.push(x); + } + let mut out = xl.join("\n"); + if out.ends_with('\n') { + out.pop(); + } + out +} +fn format_chromap(indices: &Vec) -> String { + let mut bc_fqs: Vec = Vec::new(); + let mut bc_str: Vec = Vec::new(); + let mut gdna_fqs: Vec = Vec::new(); + let mut gdna_str: Vec = Vec::new(); + for coord in indices { + let strand_suffix = if coord.strand == "pos" { + String::new() + } else { + ":-".to_string() + }; + for cut in &coord.rcv { + let rt = cut.region.region_type.to_uppercase(); + if rt == "BARCODE" { + bc_fqs.push(coord.query_id.clone()); + bc_str.push(format!( + "bc:{}:{}{}", + cut.start, + cut.stop - 1, + strand_suffix + )); + } else if rt == "GDNA" { + gdna_fqs.push(coord.query_id.clone()); + gdna_str.push(format!("{}:{}", cut.start, cut.stop - 1)); + } + } + } + if bc_fqs.iter().collect::>().len() > 1 { + panic!("chromap only supports barcodes from one fastq"); + } + if gdna_fqs.iter().collect::>().len() > 2 { + panic!("chromap only supports genomic dna from two fastqs"); + } + let barcode_fq = bc_fqs.first().cloned().unwrap_or_default(); + let dedup_gdna_fqs = { + let mut seen: HashSet = HashSet::new(); + let mut out: Vec = Vec::new(); + for r in &gdna_fqs { + if !seen.contains(r) { + out.push(r.clone()); + seen.insert(r.clone()); + } + } + out + }; + let read1_fq = dedup_gdna_fqs.get(0).cloned().unwrap_or_default(); + let read2_fq = 
dedup_gdna_fqs.get(1).cloned().unwrap_or_default(); + let read_str = gdna_str + .iter() + .enumerate() + .map(|(i, ele)| format!("r{}:{}", i + 1, ele)) + .collect::>() + .join(","); + let bc_str_join = bc_str.join(","); + format!( + "-1 {} -2 {} --barcode {} --read-format {},{}", + read1_fq, read2_fq, barcode_fq, bc_str_join, read_str + ) +} +fn compute_relative(rcs: &Vec) -> Vec { + let mut d: Vec = Vec::new(); + for obj in rcs { + for fixed in rcs { + if let Some(diff) = obj.difference(fixed) { + d.push(RegionCoordinateDifference::new( + obj.clone(), + fixed.clone(), + diff, + )); + } + } + } + d +} +fn filter_differences( + d: Vec, + filter_region_type: &str, +) -> Vec { + let mut f: Vec = Vec::new(); + for rcd in d.into_iter() { + if rcd.obj.region.region_type != filter_region_type + && rcd.fixed.region.region_type == filter_region_type + { + f.push(rcd); + } + } + f +} +fn format_relative(indices: &Vec) -> String { + let mut x = String::new(); + for coord in indices { + let diffs = compute_relative(&coord.rcv); + let mut filtered = filter_differences(diffs, "linker"); + filtered.sort_by_key(|diff| diff.obj.region.region_type.clone()); + for diff in filtered { + x.push_str(&format!( + "{}\t{}\t{}\t{}\t{}\n", + diff.obj.region.region_id, + diff.fixed.region.region_id, + diff.rgncdiff.start, + diff.rgncdiff.stop, + diff.loc + )); + } + } + x +} +// Splitcode formatting: port essential behavior (forward/complement/reverse/rc groups) +#[derive(Clone)] +struct SplitRow { + region_type: String, + fmt: String, +} + +fn format_splitcode(indices: &Vec) -> String { + use std::collections::HashMap; + + fn compute_relative(rcs: &Vec) -> Vec { + let mut d: Vec = Vec::new(); + for obj in rcs { + for fixed in rcs { + if let Some(diff) = obj.difference(fixed) { + d.push(RegionCoordinateDifference::new( + obj.clone(), + fixed.clone(), + diff, + )); + } + } + } + d + } + + fn filter_differences( + d: Vec, + filter_region_type: &str, + ) -> Vec { + let mut f: Vec = Vec::new(); + 
for rcd in d.into_iter() { + if rcd.obj.region.region_type != filter_region_type + && rcd.fixed.region.region_type == filter_region_type + { + f.push(rcd); + } + } + f + } + + fn groupby_region_id( + rgns: &Vec, + ) -> HashMap)> { + let mut d: HashMap)> = + HashMap::new(); + for rgn in rgns { + let key = rgn.obj.region.region_id.clone(); + d.entry(key) + .and_modify(|(_, v)| v.push(rgn.clone())) + .or_insert((rgn.obj.clone(), vec![rgn.clone()])); + } + d + } + + fn filter_groupby_region_type( + g: &mut HashMap)>, + ) { + let keys: Vec = g.keys().cloned().collect(); + for k in keys { + let (obj, _) = g.get(&k).unwrap(); + let t = obj.region.region_type.to_lowercase(); + if t != "umi" && t != "barcode" && t != "cdna" { + g.remove(&k); + } + } + } + + fn format_splitcode_row( + obj: &RegionCoordinate, + rgncdiffs: &Vec, + idx: i32, + rev: bool, + complement: bool, + ) -> SplitRow { + let mut e = String::new(); + if obj.region.region_type.to_lowercase() == "cdna" { + if rev && !complement { + e.push_str(&format!("", obj.region.region_id)); + } else if rev && complement { + e.push_str(&format!("<~rc_{}>", obj.region.region_id)); + } else if !rev && complement { + e.push_str(&format!("<~c_{}>", obj.region.region_id)); + } else { + e.push_str(&format!("", obj.region.region_id)); + } + if idx == 0 { + e = format!("0:0{}", e); + } else if idx == -1 { + e = format!("{}0:-1", e); + } + } else { + let tag = if rev && !complement { + "r" + } else if rev && complement { + "rc" + } else if !rev && complement { + "c" + } else { + "f" + }; + e.push_str(&format!( + "<{}_{}[{}]>", + tag, obj.region.region_type, obj.region.min_len + )); + } + + let mut p1 = false; + let mut m1 = false; + let mut srtdiffs = rgncdiffs.clone(); + srtdiffs.sort_by_key(|x| x.rgncdiff.region.min_len); + for diffs in srtdiffs.iter() { + let fixed = &diffs.fixed.region; + let loc = &diffs.loc; + let diff = &diffs.rgncdiff; + if fixed.region_type == "linker" { + let minl = diff.region.min_len; + let minl_str = 
if minl == 0 { + String::new() + } else { + format!("{}", minl) + }; + if loc == "+" && !p1 { + if rev && !complement { + e = format!("{}{}{{{}r}}", e, minl_str, fixed.region_id); + } else if rev && complement { + e = format!("{}{}{{{}rc}}", e, minl_str, fixed.region_id); + } else if !rev && complement { + e = format!("{{{}c}}{}{}", fixed.region_id, minl_str, e); + } else { + e = format!("{{{}f}}{}{}", fixed.region_id, minl_str, e); + } + p1 = true; + } else if loc == "-" && !m1 { + if rev && !complement { + e = format!("{{{}r}}{}{}", fixed.region_id, minl_str, e); + } else if rev && complement { + e = format!("{{{}rc}}{}{}", fixed.region_id, minl_str, e); + } else if !rev && complement { + e = format!("{}{}{{{}c}}", e, minl_str, fixed.region_id); + } else { + e = format!("{}{}{{{}f}}", e, minl_str, fixed.region_id); + } + m1 = true; + } + } + } + SplitRow { + region_type: obj.region.region_type.clone(), + fmt: e, + } + } + + let mut x = String::new(); + let mut e = String::new(); + for coord in indices { + let d = compute_relative(&coord.rcv); + let f = filter_differences(d, "linker"); + let mut g = groupby_region_id(&f); + filter_groupby_region_type(&mut g); + let gv: Vec<(RegionCoordinate, Vec)> = + g.into_values().collect(); + + let mut frows: Vec = Vec::new(); + let mut rrows: Vec = Vec::new(); + let mut crows: Vec = Vec::new(); + let mut rcrows: Vec = Vec::new(); + + for (i, (gb_obj, gb_rgncdiffs)) in gv.iter().enumerate() { + let last = i + 1 == gv.len(); + let idx_val: i32 = if last { -1 } else { i as i32 }; + frows.push(format_splitcode_row( + gb_obj, + gb_rgncdiffs, + idx_val, + false, + false, + )); + rrows.push(format_splitcode_row( + gb_obj, + gb_rgncdiffs, + idx_val, + true, + false, + )); + crows.push(format_splitcode_row( + gb_obj, + gb_rgncdiffs, + idx_val, + false, + true, + )); + rcrows.push(format_splitcode_row( + gb_obj, + gb_rgncdiffs, + idx_val, + true, + true, + )); + } + + let mut g_frows: HashMap> = HashMap::new(); + let mut g_crows: 
HashMap> = HashMap::new(); + let mut g_rrows: HashMap> = HashMap::new(); + let mut g_rcrows: HashMap> = HashMap::new(); + for r in frows { + g_frows.entry(r.region_type).or_default().push(r.fmt); + } + for r in crows { + g_crows.entry(r.region_type).or_default().push(r.fmt); + } + for r in rrows { + g_rrows.entry(r.region_type).or_default().push(r.fmt); + } + for r in rcrows { + g_rcrows.entry(r.region_type).or_default().push(r.fmt); + } + + for (_gr, v) in g_frows { + e.push_str(&format!("@extract {}\n", v.join(","))); + } + for (_gr, v) in g_crows { + e.push_str(&format!("@extract {}\n", v.join(","))); + } + for (_gr, v) in g_rrows { + e.push_str(&format!("@extract {}\n", v.join(","))); + } + for (_gr, v) in g_rcrows { + e.push_str(&format!("@extract {}\n", v.join(","))); + } + } + x.push_str(&e); + + for coord in indices { + x.push_str("groups\tids\ttags\tdistances\tlocations\n"); + let mut idx = 1; + for cut in &coord.rcv { + if cut.region.region_type == "linker" { + x.push_str(&format!( + "group{}\t{}f\t{}\t3:3:3\t0:0:0\n", + idx, cut.region.name, cut.region.sequence + )); + let comp = utils::complement_seq(&cut.region.sequence); + x.push_str(&format!( + "group{}\t{}c\t{}\t3:3:3\t0:0:0\n", + idx, cut.region.name, comp + )); + x.push_str(&format!( + "group{}\t{}r\t{}\t3:3:3\t0:0:0\n", + idx, + cut.region.name, + cut.region.sequence.chars().rev().collect::() + )); + x.push_str(&format!( + "group{}\t{}rc\t{}\t3:3:3\t0:0:0\n", + idx, + cut.region.name, + comp.chars().rev().collect::() + )); + idx += 1; + } + } + } + x +} +fn list_files_by_file_id( + spec: &Assay, + modality: &String, + file_ids: &Vec, +) -> HashMap> { + let mut files: HashMap> = HashMap::new(); + let ids: HashSet = file_ids.iter().cloned().collect(); + for read in spec.get_seqspec(modality) { + if !read.files.is_empty() { + for file in read.files { + // parity with Python: membership against provided ids uses filename there + if ids.contains(&file.filename) { + 
files.entry(read.read_id.clone()).or_default().push(file); + } + } + } + } + files +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_index_by_reads() { + let spec = dogma_spec(); + let modality = "rna".to_string(); + let indices = get_index_by_reads(&spec, &modality); + assert_eq!(indices.len(), 2); // RNA has 2 reads: rna_R1, rna_R2 + assert_eq!(indices[0].query_id, "rna_R1"); + assert_eq!(indices[1].query_id, "rna_R2"); + // R1 has barcode + UMI = 2 regions, R2 has cDNA = 1 region + assert_eq!(indices[0].rcv.len(), 2); + assert_eq!(indices[1].rcv.len(), 1); + } + + #[test] + fn test_index_by_read_ids() { + let spec = dogma_spec(); + let modality = "rna".to_string(); + let rna_reads = spec.get_seqspec("rna"); + assert!(!rna_reads.is_empty()); + let read_ids: Vec = vec![rna_reads[0].read_id.clone()]; + let indices = get_index_by_read_ids(&spec, &modality, &read_ids); + assert_eq!(indices.len(), 1); + assert!(!indices[0].rcv.is_empty()); + } + + #[test] + fn test_index_by_regions() { + let spec = dogma_spec(); + let modality = "rna".to_string(); + let indices = get_index_by_regions(&spec, &modality); + assert_eq!(indices.len(), 1); + assert_eq!(indices[0].query_id, "rna"); + assert_eq!(indices[0].query_type, "Region"); + // RNA library has 5 leaf regions + assert_eq!(indices[0].rcv.len(), 5); + } + + #[test] + fn test_format_tab() { + let indices = rna_indices(); + let result = format_index(&indices, &"tab".to_string(), &None); + let lines: Vec<&str> = result.lines().collect(); + assert_eq!(lines.len(), 3); + assert_eq!(lines[0], "rna_R1\tCell Barcode\tbarcode\t0\t16"); + assert_eq!(lines[1], "rna_R1\tumi\tumi\t16\t28"); + assert_eq!(lines[2], "rna_R2\tcdna\tcdna\t0\t102"); + } + + #[test] + fn test_format_kb() { + let indices = rna_indices(); + let result = format_index(&indices, &"kb".to_string(), &None); + 
assert_eq!(result, "0,0,16:0,16,28:1,0,102"); + } + + #[test] + fn test_seqspec_index_dispatch() { + let spec = dogma_spec(); + let modality = "rna".to_string(); + let ids: Vec = vec![]; + let idtype = "read".to_string(); + let rev = false; + let indices = seqspec_index(&spec, &modality, &ids, &idtype, &rev); + assert_eq!(indices.len(), 2); + assert_eq!(indices[0].query_id, "rna_R1"); + assert_eq!(indices[1].query_id, "rna_R2"); + } + + // ---- Format function tests ---- + + fn rna_indices() -> Vec { + let spec = dogma_spec(); + get_index_by_reads(&spec, &"rna".to_string()) + } + + #[test] + fn test_format_starsolo() { + let indices = rna_indices(); + let result = format_index(&indices, &"starsolo".to_string(), &None); + assert_eq!( + result, + "--soloType CB_UMI_Simple --soloCBstart 1 --soloCBlen 16 --soloUMIstart 17 --soloUMIlen 12" + ); + } + + #[test] + fn test_format_simpleaf() { + let indices = rna_indices(); + let result = format_index(&indices, &"simpleaf".to_string(), &None); + assert_eq!(result, "1{b[16]u[12]x:}2{r[102]x:}"); + } + + #[test] + fn test_format_zumis() { + let indices = rna_indices(); + let result = format_index(&indices, &"zumis".to_string(), &None); + assert!(result.contains("BCS(1-16)")); + assert!(result.contains("UMI(17-28)")); + assert!(result.contains("cDNA(1-102)")); + } + + #[test] + fn test_format_kb_single() { + let indices = rna_indices(); + let result = format_index(&indices, &"kb-single".to_string(), &None); + // kb-single selects only the longest feature read + assert_eq!(result, "0,0,16:0,16,28:1,0,102"); + } + + #[test] + fn test_index_by_files() { + let spec = dogma_spec(); + let modality = "rna".to_string(); + let indices = get_index_by_files(&spec, &modality); + assert!(!indices.is_empty()); + for coord in &indices { + assert_eq!(coord.query_type, "File"); + } + } + + #[test] + fn test_index_by_region_ids() { + let spec = dogma_spec(); + let modality = "rna".to_string(); + let lib = spec.get_libspec("rna").unwrap(); + let 
region_id = lib.region_id.clone(); + let indices = get_index_by_region_ids(&spec, &modality, &vec![region_id]); + assert_eq!(indices.len(), 1); + assert_eq!(indices[0].query_type, "Region"); + } + + #[test] + fn test_index_different_modalities() { + let spec = dogma_spec(); + for modality in ["rna", "atac", "protein", "tag"] { + let m = modality.to_string(); + let indices = get_index_by_reads(&spec, &m); + assert!( + !indices.is_empty(), + "modality {} should have indices", + modality + ); + } + } + + #[test] + fn test_filter_index_no_overlap() { + let spec = dogma_spec(); + let modality = "rna".to_string(); + let indices = get_index_by_reads(&spec, &modality); + let orig_count: usize = indices.iter().map(|i| i.rcv.len()).sum(); + let filtered = filter_index_no_overlap(indices); + // Non-overlapping DOGMA RNA reads should be unchanged. + let filt_count: usize = filtered.iter().map(|i| i.rcv.len()).sum(); + assert_eq!(filt_count, orig_count); + } + + #[test] + fn test_filter_index_no_overlap_removes_regions_seen_in_earlier_reads() { + let spec = load_spec(&PathBuf::from( + "tests/fixtures/check_overlap_warning/spec.yaml", + )); + let modality = "rna".to_string(); + let indices = get_index_by_reads(&spec, &modality); + let filtered = filter_index_no_overlap(indices); + + assert_eq!(filtered.len(), 2); + assert_eq!(filtered[0].query_id, "rna_R1"); + assert_eq!(filtered[0].rcv.len(), 2); + assert_eq!( + filtered[0] + .rcv + .iter() + .map(|region| region.region.region_id.clone()) + .collect::>(), + vec!["barcode".to_string(), "umi".to_string()] + ); + assert_eq!(filtered[1].query_id, "rna_R2"); + assert_eq!(filtered[1].rcv.len(), 0); + } + + #[test] + fn test_index_dispatch_file_selector() { + let spec = dogma_spec(); + let modality = "rna".to_string(); + let ids: Vec = vec![]; + let idtype = "file".to_string(); + let rev = false; + let indices = seqspec_index(&spec, &modality, &ids, &idtype, &rev); + assert!(!indices.is_empty()); + } + + #[test] + fn 
test_index_dispatch_region_with_ids() { + let spec = dogma_spec(); + let modality = "rna".to_string(); + let lib = spec.get_libspec("rna").unwrap(); + let ids = vec![lib.region_id.clone()]; + let idtype = "region".to_string(); + let rev = false; + let indices = seqspec_index(&spec, &modality, &ids, &idtype, &rev); + assert_eq!(indices.len(), 1); + } +} diff --git a/src/seqspec_info.rs b/src/seqspec_info.rs new file mode 100644 index 00000000..7ff07377 --- /dev/null +++ b/src/seqspec_info.rs @@ -0,0 +1,425 @@ +use crate::models::assay::Assay; +use crate::models::read::Read; +use crate::models::region::Region; +use crate::utils; +use clap::Args; +use serde_json::{json, Value}; +use std::collections::BTreeMap; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct InfoArgs { + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, + + #[clap( + short, + long, + help = "Object to display", + value_name = "KEY", + default_value = "meta", + value_parser = ["modalities", "meta", "sequence_spec", "library_spec"] + )] + key: String, + + #[clap( + short, + long, + help = "The output format", + value_name = "FORMAT", + default_value = "tab", + value_parser = ["tab", "json"] + )] + format: String, + + #[clap(short, long, help = "Path to output file", value_name = "OUT")] + output: Option, +} + +pub fn run_info(args: &InfoArgs) { + validate_info_args(args); + let spec = utils::load_spec(&args.yaml); + + let info = seqspec_info(&spec, &args.key); + let result = format_info(&spec, info, &args.key, &args.format); + + if let Some(out) = &args.output { + let mut f = fs::File::create(out).unwrap(); + writeln!(f, "{}", result).unwrap(); + } else { + println!("{}", result); + } +} + +fn validate_info_args(args: &InfoArgs) { + if !args.yaml.exists() { + eprintln!("Please use `seqspec info -h` for help."); + std::process::exit(1); + } + if let Some(out) = &args.output { + if out.exists() && !out.is_file() { + 
eprintln!("Output path exists but is not a file: {}", out.display()); + std::process::exit(1); + } + } +} + +// ---------------- Core info ---------------- + +#[derive(Clone)] +enum InfoData { + Modalities(Vec), + Meta(Value), + SequenceSpec(Vec), + LibrarySpec(BTreeMap>), +} + +fn seqspec_info(spec: &Assay, key: &str) -> InfoData { + match key { + "modalities" => InfoData::Modalities(spec.list_modalities()), + "meta" => InfoData::Meta(seqspec_info_meta(spec)), + "sequence_spec" => InfoData::SequenceSpec(spec.sequence_spec.clone()), + "library_spec" => InfoData::LibrarySpec(seqspec_info_library_spec(spec)), + _ => panic!("Unsupported info key: {}", key), + } +} + +fn seqspec_info_meta(spec: &Assay) -> Value { + // preserve a logical, stable order of fields similar to Python's model_dump + let mut m = serde_json::Map::new(); + if let Some(v) = &spec.seqspec_version { + m.insert("seqspec_version".to_string(), json!(v)); + } + m.insert("assay_id".to_string(), json!(spec.assay_id)); + m.insert("name".to_string(), json!(spec.name)); + m.insert("doi".to_string(), json!(spec.doi)); + m.insert("date".to_string(), json!(spec.date)); + m.insert("description".to_string(), json!(spec.description)); + m.insert("lib_struct".to_string(), json!(spec.lib_struct)); + if let Some(v) = &spec.library_kit { + m.insert("library_kit".to_string(), json!(v)); + } + if let Some(v) = &spec.library_protocol { + m.insert("library_protocol".to_string(), json!(v)); + } + if let Some(v) = &spec.sequence_kit { + m.insert("sequence_kit".to_string(), json!(v)); + } + if let Some(v) = &spec.sequence_protocol { + m.insert("sequence_protocol".to_string(), json!(v)); + } + Value::Object(m) +} + +fn seqspec_info_library_spec(spec: &Assay) -> BTreeMap> { + let mut result: BTreeMap> = BTreeMap::new(); + for m in spec.list_modalities() { + if let Some(libspec) = spec.get_libspec(&m) { + let leaves = libspec.get_leaves(); + result.insert(m, leaves); + } + } + result +} + +// ---------------- Formatting 
---------------- + +fn format_info(_spec: &Assay, info: InfoData, key: &str, fmt: &str) -> String { + match (key, fmt) { + ("modalities", "tab") => format_modalities_tab(&info), + ("modalities", "json") => format_modalities_json(&info), + ("meta", "tab") => format_meta_tab(&info), + ("meta", "json") => format_meta_json(&info), + ("sequence_spec", "tab") => format_sequence_spec_tab(&info), + ("sequence_spec", "json") => format_sequence_spec_json(&info), + ("library_spec", "tab") => format_library_spec_tab(&info), + ("library_spec", "json") => format_library_spec_json(&info), + _ => String::new(), + } +} + +fn format_modalities_tab(info: &InfoData) -> String { + if let InfoData::Modalities(v) = info { + v.join("\t") + } else { + String::new() + } +} +fn format_modalities_json(info: &InfoData) -> String { + if let InfoData::Modalities(v) = info { + serde_json::to_string_pretty(v).unwrap() + } else { + String::new() + } +} + +fn format_meta_tab(info: &InfoData) -> String { + if let InfoData::Meta(v) = info { + let obj = v.as_object().unwrap(); + let mut vals: Vec = Vec::new(); + for k in [ + "seqspec_version", + "assay_id", + "name", + "doi", + "date", + "description", + "lib_struct", + "library_kit", + "library_protocol", + "sequence_kit", + "sequence_protocol", + ] { + if let Some(val) = obj.get(k) { + vals.push(if val.is_null() { + String::new() + } else { + val.to_string().trim_matches('"').to_string() + }); + } + } + vals.join("\t") + } else { + String::new() + } +} +fn format_meta_json(info: &InfoData) -> String { + if let InfoData::Meta(v) = info { + serde_json::to_string_pretty(v).unwrap() + } else { + String::new() + } +} + +fn format_sequence_spec_tab(info: &InfoData) -> String { + if let InfoData::SequenceSpec(reads) = info { + let mut lines: Vec = Vec::new(); + for r in reads { + let files = if r.files.is_empty() { + String::new() + } else { + r.files + .iter() + .map(|f| f.file_id.clone()) + .collect::>() + .join(",") + }; + lines.push(format!( + 
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", + r.modality, r.read_id, r.strand, r.min_len, r.max_len, r.primer_id, r.name, files + )); + } + lines.join("\n") + } else { + String::new() + } +} +fn format_sequence_spec_json(info: &InfoData) -> String { + if let InfoData::SequenceSpec(reads) = info { + serde_json::to_string_pretty(reads).unwrap() + } else { + String::new() + } +} + +fn format_library_spec_tab(info: &InfoData) -> String { + if let InfoData::LibrarySpec(map) = info { + let mut lines: Vec = Vec::new(); + for (modality, regions) in map { + for r in regions { + let file = r + .onlist + .as_ref() + .map(|o| o.filename.clone()) + .unwrap_or_else(|| "None".to_string()); + lines.push(format!( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", + modality, + r.region_id, + r.region_type, + r.name, + r.sequence_type, + r.sequence, + r.min_len, + r.max_len, + file + )); + } + } + lines.join("\n") + } else { + String::new() + } +} +fn format_library_spec_json(info: &InfoData) -> String { + if let InfoData::LibrarySpec(map) = info { + serde_json::to_string_pretty(map).unwrap() + } else { + String::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_info_modalities() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "modalities"); + if let InfoData::Modalities(v) = info { + assert!(v.contains(&"rna".to_string())); + assert!(v.contains(&"atac".to_string())); + assert_eq!(v.len(), 4); + } else { + panic!("Expected Modalities variant"); + } + } + + #[test] + fn test_info_meta() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "meta"); + if let InfoData::Meta(v) = info { + let obj = v.as_object().unwrap(); + assert!(obj.contains_key("assay_id")); + assert!(obj.contains_key("name")); + assert_eq!(obj["assay_id"].as_str().unwrap(), "DOGMAseq-DIG"); + } else { + panic!("Expected Meta variant"); + } + } + + #[test] + fn 
test_info_sequence_spec() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "sequence_spec"); + if let InfoData::SequenceSpec(reads) = info { + assert_eq!(reads.len(), 9); // 2 RNA + 3 ATAC + 2 Protein + 2 Tag + let rna_reads: Vec<_> = reads.iter().filter(|r| r.modality == "rna").collect(); + assert_eq!(rna_reads.len(), 2); + let atac_reads: Vec<_> = reads.iter().filter(|r| r.modality == "atac").collect(); + assert_eq!(atac_reads.len(), 3); + } else { + panic!("Expected SequenceSpec variant"); + } + } + + #[test] + fn test_info_library_spec() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "library_spec"); + if let InfoData::LibrarySpec(map) = info { + assert_eq!(map.len(), 4); + assert_eq!(map["rna"].len(), 5); // 5 RNA leaf regions + assert_eq!(map["rna"][0].region_id, "rna_truseq_read1"); + } else { + panic!("Expected LibrarySpec variant"); + } + } + + #[test] + fn test_format_modalities_tab() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "modalities"); + let result = format_info(&spec, info, "modalities", "tab"); + let parts: Vec<&str> = result.split('\t').collect(); + assert_eq!(parts.len(), 4); + assert!(parts.contains(&"rna")); + assert!(parts.contains(&"atac")); + assert!(parts.contains(&"protein")); + assert!(parts.contains(&"tag")); + } + + #[test] + fn test_format_meta_json() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "meta"); + let result = format_info(&spec, info, "meta", "json"); + let parsed: Value = serde_json::from_str(&result).unwrap(); + assert!(parsed.is_object()); + assert_eq!(parsed["assay_id"].as_str().unwrap(), "DOGMAseq-DIG"); + } + + #[test] + fn test_format_meta_tab() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "meta"); + let result = format_info(&spec, info, "meta", "tab"); + let parts: Vec<&str> = result.split('\t').collect(); + // Tab format includes: version, assay_id, name, doi, date, description, lib_struct, ... 
+ assert!(parts.len() >= 7); + assert!(parts.contains(&"DOGMAseq-DIG")); // assay_id + assert!(parts.contains(&"DOGMAseq-DIG")); // name too + } + + #[test] + fn test_format_sequence_spec_tab() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "sequence_spec"); + let result = format_info(&spec, info, "sequence_spec", "tab"); + let lines: Vec<&str> = result.lines().collect(); + assert_eq!(lines.len(), 9); // 9 reads total + // First line should be an RNA read + assert!( + lines[0].starts_with("rna\t") + || lines[0].starts_with("protein\t") + || lines[0].starts_with("tag\t") + || lines[0].starts_with("atac\t") + ); + // Check that rna and atac both appear + let rna_lines = lines.iter().filter(|l| l.starts_with("rna\t")).count(); + assert_eq!(rna_lines, 2); + let atac_lines = lines.iter().filter(|l| l.starts_with("atac\t")).count(); + assert_eq!(atac_lines, 3); + } + + #[test] + fn test_format_sequence_spec_json() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "sequence_spec"); + let result = format_info(&spec, info, "sequence_spec", "json"); + let parsed: Value = serde_json::from_str(&result).unwrap(); + let arr = parsed.as_array().unwrap(); + assert_eq!(arr.len(), 9); + } + + #[test] + fn test_format_library_spec_tab() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "library_spec"); + let result = format_info(&spec, info, "library_spec", "tab"); + let lines: Vec<&str> = result.lines().collect(); + // Count lines per modality + let rna_lines = lines.iter().filter(|l| l.starts_with("rna\t")).count(); + assert_eq!(rna_lines, 5); // 5 RNA leaf regions + } + + #[test] + fn test_format_library_spec_json() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "library_spec"); + let result = format_info(&spec, info, "library_spec", "json"); + let parsed: Value = serde_json::from_str(&result).unwrap(); + let obj = parsed.as_object().unwrap(); + assert_eq!(obj.len(), 4); + assert_eq!(obj["rna"].as_array().unwrap().len(), 5); + 
} + + #[test] + fn test_format_modalities_json() { + let spec = dogma_spec(); + let info = seqspec_info(&spec, "modalities"); + let result = format_info(&spec, info, "modalities", "json"); + let parsed: Value = serde_json::from_str(&result).unwrap(); + let arr = parsed.as_array().unwrap(); + assert_eq!(arr.len(), 4); + } +} diff --git a/src/seqspec_init.rs b/src/seqspec_init.rs new file mode 100644 index 00000000..64f438b3 --- /dev/null +++ b/src/seqspec_init.rs @@ -0,0 +1,151 @@ +use crate::models::assay::Assay; +use crate::models::region::Region; +use clap::Args; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct InitArgs { + #[clap(short, long, help = "Assay name", required = true)] + name: String, + + #[clap( + short, + long, + help = "Comma-separated list of modalities (e.g. rna,atac)", + required = true + )] + modalities: String, + + #[clap(long, help = "DOI of the assay", default_value = "")] + doi: String, + + #[clap(long, help = "Short description", default_value = "")] + description: String, + + #[clap(long, help = "Date (YYYY-MM-DD)", default_value = "")] + date: String, + + #[clap(short, long, help = "Output YAML (default stdout)", value_name = "OUT")] + output: Option, +} + +pub fn run_init(args: &InitArgs) { + validate_init_args(args); + + let modalities: Vec = args + .modalities + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + + let mut spec = seqspec_init( + &args.name, + &args.doi, + &args.date, + &args.description, + modalities, + ); + + spec.update_spec(); + + let yaml = spec.to_bytes().expect("Failed to serialize assay to YAML"); + if let Some(out) = &args.output { + let mut f = fs::File::create(out).unwrap(); + f.write_all(&yaml).unwrap(); + } else { + println!("{}", String::from_utf8_lossy(&yaml)); + } +} + +fn validate_init_args(args: &InitArgs) { + if args.name.is_empty() { + eprintln!("Assay name is required"); + std::process::exit(1); + } + if 
args.modalities.is_empty() { + eprintln!("Modalities must be provided"); + std::process::exit(1); + } +} + +pub fn seqspec_init( + name: &str, + doi: &str, + date: &str, + description: &str, + modalities: Vec, +) -> Assay { + let meta_regions: Vec = modalities + .iter() + .map(|modality| { + Region::new( + modality.clone(), // region_id + "meta".to_string(), // region_type + modality.clone(), // name + "".to_string(), // sequence_type + "".to_string(), // sequence + 0, // min_len + 0, // max_len + None, // onlist + Vec::new(), // regions + ) + }) + .collect(); + + Assay::new( + "".to_string(), // assay_id + name.to_string(), + doi.to_string(), + date.to_string(), + description.to_string(), + modalities, + "".to_string(), // lib_struct + Vec::new(), // sequence_spec + meta_regions, // library_spec + None, // sequence_protocol + None, // sequence_kit + None, // library_protocol + None, // library_kit + None, // seqspec_version + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_init_creates_assay() { + let spec = seqspec_init( + "TestAssay", + "10.1234/test", + "2024-01-01", + "A test assay", + vec!["rna".into(), "atac".into()], + ); + assert_eq!(spec.name, "TestAssay"); + assert_eq!(spec.doi, "10.1234/test"); + assert_eq!(spec.modalities, vec!["rna", "atac"]); + assert_eq!(spec.library_spec.len(), 2); + assert!(spec.sequence_spec.is_empty()); + } + + #[test] + fn test_init_library_spec_regions() { + let spec = seqspec_init("Test", "", "", "", vec!["rna".into()]); + let lib = spec.get_libspec("rna").unwrap(); + assert_eq!(lib.region_id, "rna"); + assert_eq!(lib.region_type, "meta"); + } + + #[test] + fn test_init_single_modality() { + let spec = seqspec_init("Test", "", "", "", vec!["protein".into()]); + assert_eq!(spec.modalities.len(), 1); + assert_eq!(spec.library_spec.len(), 1); + assert_eq!(spec.library_spec[0].region_id, "protein"); + } +} diff --git a/src/seqspec_insert.rs b/src/seqspec_insert.rs new file mode 100644 index 
00000000..864e840d --- /dev/null +++ b/src/seqspec_insert.rs @@ -0,0 +1,504 @@ +use crate::models::assay::Assay; +use crate::models::file::File; +use crate::models::read::Read; +use crate::models::region::Region; +use crate::utils; +use clap::Args; +use serde::Deserialize; +use serde_json::Value; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct InsertArgs { + #[clap( + short, + long, + help = "Target modality", + value_name = "MODALITY", + required = true + )] + modality: String, + + #[clap( + short, + long, + help = "Section to insert into", + value_name = "SELECTOR", + value_parser = ["region", "read"], + required = true + )] + selector: String, + + #[clap( + short, + long, + help = "Path or inline JSON (expects array of objects)", + value_name = "IN", + required = true + )] + resource: String, + + #[clap(long, help = "Insert after ID (region or read)")] + after: Option, + + #[clap(help = "Draft spec to modify", required = true)] + yaml: PathBuf, + + #[clap( + short, + long, + help = "Write updated spec (default stdout)", + value_name = "OUT" + )] + output: Option, +} + +pub fn run_insert(args: &InsertArgs) { + validate_insert_args(args); + let mut spec = utils::load_spec(&args.yaml); + + let payload = parse_resource(&args.resource); + match args.selector.as_str() { + "read" => { + let reads = load_reads_from_value(&payload, &args.modality); + spec = seqspec_insert_reads(spec, &args.modality, reads, args.after.as_deref()) + .unwrap_or_else(|err| { + eprintln!("{err}"); + std::process::exit(1); + }); + } + "region" => { + let regions = load_regions_from_value(&payload); + spec = seqspec_insert_regions(spec, &args.modality, regions, args.after.as_deref()) + .unwrap_or_else(|err| { + eprintln!("{err}"); + std::process::exit(1); + }); + } + _ => {} + } + + spec.update_spec(); + let bytes = spec.to_bytes().unwrap(); + if let Some(out) = &args.output { + let mut f = fs::File::create(out).unwrap(); + 
f.write_all(&bytes).unwrap(); + } else { + println!("{}", String::from_utf8_lossy(&bytes)); + } +} + +fn validate_insert_args(args: &InsertArgs) { + if args.selector == "region" && matches!(args.after.as_deref(), Some("")) { + eprintln!("Invalid --after value"); + std::process::exit(1); + } + if !args.yaml.exists() { + eprintln!("Spec file not found: {}", args.yaml.display()); + std::process::exit(1); + } +} + +fn parse_resource(resource: &str) -> Value { + let payload = match PathBuf::from(resource) { + path if path.exists() => fs::read_to_string(path).expect("failed to read --resource file"), + _ => resource.to_string(), + }; + + serde_json::from_str(&payload) + .or_else(|_| serde_yaml::from_str(&payload)) + .expect("--resource must be inline JSON/YAML or a JSON/YAML file") +} + +#[derive(Debug, Deserialize, Clone)] +struct FileInput { + file_id: Option, + filename: Option, + filetype: Option, + filesize: Option, + url: Option, + urltype: Option, + md5: Option, +} + +impl FileInput { + fn to_file(&self) -> File { + let id = self + .file_id + .clone() + .or_else(|| self.filename.clone()) + .unwrap_or_default(); + File::new( + id, + self.filename.clone().unwrap_or_default(), + self.filetype.clone().unwrap_or_default(), + self.filesize.unwrap_or(0), + self.url.clone().unwrap_or_default(), + self.urltype.clone().unwrap_or_default(), + self.md5.clone().unwrap_or_default(), + ) + } +} + +#[derive(Debug, Deserialize, Clone)] +struct ReadInput { + read_id: Option, + name: Option, + primer_id: Option, + min_len: Option, + max_len: Option, + strand: Option, + files: Option>, +} + +impl ReadInput { + fn to_read(&self, modality: &str) -> Option { + let read_id = self.read_id.clone()?; + let name = self.name.clone().unwrap_or_else(|| read_id.clone()); + let files = self + .files + .as_ref() + .map(|v| v.iter().map(|f| f.to_file()).collect()) + .unwrap_or_else(|| Vec::new()); + Some(Read::new( + read_id, + name, + modality.to_string(), + 
self.primer_id.clone().unwrap_or_default(), + self.min_len.unwrap_or(0), + self.max_len.unwrap_or(0), + self.strand.clone().unwrap_or_else(|| "pos".to_string()), + files, + )) + } +} + +#[derive(Debug, Deserialize, Clone)] +struct RegionInput { + region_id: Option, + region_type: Option, + name: Option, + sequence_type: Option, + sequence: Option, + min_len: Option, + max_len: Option, + regions: Option>, // allow nested +} + +impl RegionInput { + fn to_region(&self) -> Option { + let region_id = self.region_id.clone()?; + let name = self.name.clone().unwrap_or_else(|| region_id.clone()); + let children: Vec = self + .regions + .as_ref() + .map(|v| v.iter().filter_map(|c| c.to_region()).collect()) + .unwrap_or_else(|| Vec::new()); + Some(Region::new( + region_id, + self.region_type.clone().unwrap_or_default(), + name, + self.sequence_type.clone().unwrap_or_default(), + self.sequence.clone().unwrap_or_default(), + self.min_len.unwrap_or(0), + self.max_len.unwrap_or(0), + None, + children, + )) + } +} + +fn load_reads_from_value(val: &Value, modality: &str) -> Vec { + let arr = match val { + Value::Array(items) => items, + Value::Object(map) => map + .get("reads") + .and_then(Value::as_array) + .expect("--resource must be an array of reads or a mapping with key 'reads'"), + _ => panic!("--resource must be an array of reads or a mapping with key 'reads'"), + }; + let mut out: Vec = Vec::new(); + for item in arr { + let ri: ReadInput = serde_json::from_value(item.clone()).expect("Invalid read object"); + if let Some(r) = ri.to_read(modality) { + out.push(r); + } + } + out +} + +fn load_regions_from_value(val: &Value) -> Vec { + let arr = match val { + Value::Array(items) => items, + Value::Object(map) => map + .get("regions") + .and_then(Value::as_array) + .expect("--resource must be an array of regions or a mapping with key 'regions'"), + _ => { + panic!("--resource must be an array of regions or a mapping with key 'regions'") + } + }; + let mut out: Vec = Vec::new(); 
+ for item in arr { + let ri: RegionInput = serde_json::from_value(item.clone()).expect("Invalid region object"); + if let Some(r) = ri.to_region() { + out.push(r); + } + } + out +} + +pub fn seqspec_insert_reads( + mut spec: Assay, + modality: &str, + reads: Vec, + after: Option<&str>, +) -> Result { + spec.insert_reads(reads, modality, after)?; + Ok(spec) +} + +pub fn seqspec_insert_regions( + mut spec: Assay, + modality: &str, + regions: Vec, + after: Option<&str>, +) -> Result { + spec.insert_regions(regions, modality, after)?; + Ok(spec) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + use std::path::PathBuf; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_insert_reads_at_beginning() { + let spec = dogma_spec(); + let orig_count = spec.get_seqspec("rna").len(); + let new_read = Read::new( + "test_read".into(), + "Test Read".into(), + "rna".into(), + "primer1".into(), + 50, + 50, + "pos".into(), + vec![], + ); + let spec = seqspec_insert_reads(spec, "rna", vec![new_read], None).unwrap(); + let reads = spec.get_seqspec("rna"); + assert_eq!(reads.len(), orig_count + 1); + assert_eq!(reads[0].read_id, "test_read"); // inserted at beginning + } + + #[test] + fn test_insert_reads_after_specific_read() { + let spec = dogma_spec(); + let new_read = Read::new( + "test_read".into(), + "Test Read".into(), + "rna".into(), + "primer1".into(), + 50, + 50, + "pos".into(), + vec![], + ); + let spec = seqspec_insert_reads(spec, "rna", vec![new_read], Some("rna_R1")).unwrap(); + let reads = spec.get_seqspec("rna"); + assert_eq!(reads.len(), 3); + assert_eq!(reads[0].read_id, "rna_R1"); + assert_eq!(reads[1].read_id, "test_read"); // after rna_R1 + assert_eq!(reads[2].read_id, "rna_R2"); + } + + #[test] + fn test_insert_reads_sets_modality() { + let spec = dogma_spec(); + let new_read = Read::new( + "test_read".into(), + "Test Read".into(), + "wrong".into(), + "".into(), + 50, + 50, 
+ "pos".into(), + vec![], + ); + let spec = seqspec_insert_reads(spec, "rna", vec![new_read], None).unwrap(); + let reads = spec.get_seqspec("rna"); + assert_eq!(reads[0].modality, "rna"); // modality was corrected + } + + #[test] + fn test_insert_regions_at_beginning() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").unwrap(); + let orig_child_count = rna_lib.regions.len(); + let new_region = Region::new( + "test_region".into(), + "custom".into(), + "Test Region".into(), + "fixed".into(), + "ACGT".into(), + 4, + 4, + None, + vec![], + ); + let spec = seqspec_insert_regions(spec, "rna", vec![new_region], None).unwrap(); + let rna_lib = spec.get_libspec("rna").unwrap(); + assert_eq!(rna_lib.regions.len(), orig_child_count + 1); + assert_eq!(rna_lib.regions[0].region_id, "test_region"); // at beginning + } + + #[test] + fn test_insert_regions_after_specific_region() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").unwrap(); + let first_child_id = rna_lib.regions[0].region_id.clone(); + let new_region = Region::new( + "test_region".into(), + "custom".into(), + "Test Region".into(), + "fixed".into(), + "ACGT".into(), + 4, + 4, + None, + vec![], + ); + let spec = + seqspec_insert_regions(spec, "rna", vec![new_region], Some(&first_child_id)).unwrap(); + let rna_lib = spec.get_libspec("rna").unwrap(); + assert_eq!(rna_lib.regions[0].region_id, first_child_id); + assert_eq!(rna_lib.regions[1].region_id, "test_region"); + } + + #[test] + fn test_parse_resource_reads_json() { + let json = r#"[{"read_id":"r1","name":"Read 1","min_len":50,"max_len":50,"strand":"pos"}]"#; + let val = parse_resource(json); + let reads = load_reads_from_value(&val, "rna"); + assert_eq!(reads.len(), 1); + assert_eq!(reads[0].read_id, "r1"); + assert_eq!(reads[0].name, "Read 1"); + assert_eq!(reads[0].modality, "rna"); + } + + #[test] + fn test_parse_resource_reads_mapping() { + let json = r#"{"reads":[{"read_id":"r1","name":"Read 
1","min_len":50,"max_len":50,"strand":"pos"}]}"#; + let val = parse_resource(json); + let reads = load_reads_from_value(&val, "rna"); + assert_eq!(reads.len(), 1); + assert_eq!(reads[0].read_id, "r1"); + assert_eq!(reads[0].modality, "rna"); + } + + #[test] + fn test_parse_resource_regions_json() { + let json = r#"[{"region_id":"r1","region_type":"barcode","name":"BC","sequence_type":"onlist","sequence":"NNNN","min_len":4,"max_len":4}]"#; + let val = parse_resource(json); + let regions = load_regions_from_value(&val); + assert_eq!(regions.len(), 1); + assert_eq!(regions[0].region_id, "r1"); + assert_eq!(regions[0].region_type, "barcode"); + assert_eq!(regions[0].sequence, "NNNN"); + } + + #[test] + fn test_parse_resource_regions_mapping() { + let json = r#"{"regions":[{"region_id":"r1","region_type":"barcode","name":"BC","sequence_type":"onlist","sequence":"NNNN","min_len":4,"max_len":4}]}"#; + let val = parse_resource(json); + let regions = load_regions_from_value(&val); + assert_eq!(regions.len(), 1); + assert_eq!(regions[0].region_id, "r1"); + } + + #[test] + fn test_parse_resource_accepts_yaml_file() { + let tmp_path = std::env::temp_dir().join(format!( + "seqspec_insert_{}_resource.yaml", + std::process::id() + )); + fs::write( + &tmp_path, + "- region_id: inserted_region\n region_type: linker\n name: Inserted Region\n sequence_type: fixed\n sequence: ACGT\n min_len: 4\n max_len: 4\n", + ) + .unwrap(); + + let parsed = parse_resource(tmp_path.to_str().unwrap()); + assert!(parsed.is_array()); + assert_eq!(parsed[0]["region_id"], "inserted_region"); + + let _ = fs::remove_file(tmp_path); + } + + #[test] + fn test_insert_regions_updates_len() { + let spec = dogma_spec(); + let rna_lib = spec.get_libspec("rna").unwrap(); + let (orig_min, orig_max) = rna_lib.get_len(); + let new_region = Region::new( + "extra".into(), + "custom".into(), + "Extra".into(), + "fixed".into(), + "ACGTACGT".into(), + 8, + 8, + None, + vec![], + ); + let spec = 
seqspec_insert_regions(spec, "rna", vec![new_region], None).unwrap(); + let rna_lib = spec.get_libspec("rna").unwrap(); + // update_attr is called inside insert_regions, so lengths should be updated + assert_eq!(rna_lib.min_len, orig_min + 8); + assert_eq!(rna_lib.max_len, orig_max + 8); + } + + #[test] + fn test_insert_reads_invalid_modality_returns_error() { + let spec = dogma_spec(); + let new_read = Read::new( + "bad_read".into(), + "Bad Read".into(), + "".into(), + "primer1".into(), + 10, + 10, + "pos".into(), + vec![], + ); + let result = seqspec_insert_reads(spec, "missing", vec![new_read], None); + assert!(result.is_err()); + } + + #[test] + fn test_insert_regions_missing_after_returns_error() { + let spec = dogma_spec(); + let new_region = Region::new( + "bad_region".into(), + "custom".into(), + "Bad Region".into(), + "fixed".into(), + "ACGT".into(), + 4, + 4, + None, + vec![], + ); + let result = seqspec_insert_regions(spec, "rna", vec![new_region], Some("missing")); + assert!(result.is_err()); + } +} diff --git a/src/seqspec_methods.rs b/src/seqspec_methods.rs new file mode 100644 index 00000000..ae7bcd89 --- /dev/null +++ b/src/seqspec_methods.rs @@ -0,0 +1,276 @@ +use crate::models::assay::Assay; +use crate::models::file::File as ReadFile; +use crate::models::read::Read; +use crate::models::region::Region; +use crate::utils; +use clap::Args; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct MethodsArgs { + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, + + #[clap( + short, + long, + help = "Modality", + value_name = "MODALITY", + required = true + )] + modality: String, + + #[clap(short, long, help = "Path to output file", value_name = "OUT")] + output: Option, +} + +pub fn run_methods(args: &MethodsArgs) { + validate_methods_args(args); + let spec = utils::load_spec(&args.yaml); + + let text = seqspec_methods(&spec, &args.modality); + if let Some(out) = 
&args.output { + let mut f = fs::File::create(out).unwrap(); + write!(f, "{}", text).unwrap(); + } else { + println!("{}", text); + } +} + +fn validate_methods_args(args: &MethodsArgs) { + if !args.yaml.exists() { + eprintln!("Please use `seqspec methods -h` for help."); + std::process::exit(1); + } + if let Some(out) = &args.output { + if out.exists() && !out.is_file() { + eprintln!("Output path exists but is not a file: {}", out.display()); + std::process::exit(1); + } + } +} + +pub fn seqspec_methods(spec: &Assay, modality: &str) -> String { + let mut m = format!( + "Methods\nThe {} portion of the {} assay was generated on {}.\n ", + modality, spec.name, spec.date + ); + m.push_str(&format_library_spec(spec, modality)); + m +} + +fn format_library_spec(spec: &Assay, modality: &str) -> String { + let leaves = spec + .get_libspec(modality) + .expect("modality not found") + .get_leaves(); + + let lib_prot: Option = match &spec.library_protocol { + Some(v) => v + .iter() + .find(|p| p.modality == modality) + .map(|p| p.protocol_id.clone()), + None => None, + }; + let lib_kit: Option = match &spec.library_kit { + Some(v) => v + .iter() + .find(|k| k.modality == modality) + .map(|k| k.kit_id.clone()), + None => None, + }; + let seq_prot: Option = match &spec.sequence_protocol { + Some(v) => v + .iter() + .find(|p| p.modality == modality) + .map(|p| p.protocol_id.clone()), + None => None, + }; + let seq_kit: Option = match &spec.sequence_kit { + Some(v) => v + .iter() + .find(|k| k.modality == modality) + .map(|k| k.kit_id.clone()), + None => None, + }; + + let mut s = String::new(); + s.push_str("\nLibary structure\n\n"); + s.push_str(&format!( + "The library was generated using the {} library protocol and {} library kit. 
The library contains the following elements:\n\n", + lib_prot.unwrap_or_else(|| "None".to_string()), + lib_kit.unwrap_or_else(|| "None".to_string()) + )); + for (idx, r) in leaves.iter().enumerate() { + s.push_str(&format_region(r, (idx + 1) as i32)); + } + s.push_str("\nSequence structure\n\n"); + s.push_str(&format!( + "The library was sequenced on a {} using the {} sequencing kit. The library was sequenced using the following configuration:\n\n", + seq_prot.unwrap_or_else(|| "None".to_string()), + seq_kit.unwrap_or_else(|| "None".to_string()) + )); + let reads = spec.get_seqspec(modality); + for (idx, r) in reads.iter().enumerate() { + s.push_str(&format_read(r, (idx + 1) as i32)); + } + s +} + +fn format_region(region: &Region, idx: i32) -> String { + let mut s = format!( + "{}. {}: {}-{}bp {} sequence ({})", + idx, region.name, region.min_len, region.max_len, region.sequence_type, region.sequence + ); + if let Some(ol) = ®ion.onlist { + s.push_str(&format!(", onlist file: {}.\n", ol.filename)); + } else { + s.push_str(".\n"); + } + s +} + +fn format_read(read: &Read, idx: i32) -> String { + let strand = if read.strand == "pos" { + "positive" + } else { + "negative" + }; + let mut s = format!( + "- {}: {} cycles on the {} strand using the {} primer. 
The following files contain the sequences in Read {}:\n", + read.name, read.max_len, strand, read.primer_id, idx + ); + if !read.files.is_empty() { + for (i, f) in read.files.iter().enumerate() { + s.push_str(&format_read_file(f, (i + 1) as i32)); + } + } + s +} + +fn format_read_file(file: &ReadFile, idx: i32) -> String { + format!("- File {}: {}\n", idx, file.filename) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + use std::path::PathBuf; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_methods_output_rna() { + let spec = dogma_spec(); + let text = seqspec_methods(&spec, "rna"); + assert!(text.starts_with("Methods\nThe rna portion of the DOGMAseq-DIG/Illumina assay")); + assert!(text.contains("Libary structure")); + assert!(text.contains("Sequence structure")); + // Should mention the 5 RNA leaf regions + assert!(text.contains("Cell Barcode")); + assert!(text.contains("umi")); + assert!(text.contains("cdna")); + } + + #[test] + fn test_format_region() { + let region = Region::new( + "bc".into(), + "barcode".into(), + "Cell Barcode".into(), + "onlist".into(), + "NNNNNNNNNNNNNNNN".into(), + 16, + 16, + None, + vec![], + ); + let s = format_region(®ion, 1); + assert_eq!( + s, + "1. Cell Barcode: 16-16bp onlist sequence (NNNNNNNNNNNNNNNN).\n" + ); + } + + #[test] + fn test_format_region_with_onlist() { + let onlist = crate::models::onlist::Onlist::new( + "ol".into(), + "barcodes.txt".into(), + "txt".into(), + 0, + "".into(), + "local".into(), + "".into(), + ); + let region = Region::new( + "bc".into(), + "barcode".into(), + "Cell Barcode".into(), + "onlist".into(), + "N".repeat(16), + 16, + 16, + Some(onlist), + vec![], + ); + let s = format_region(®ion, 1); + assert_eq!(s, "1. 
Cell Barcode: 16-16bp onlist sequence (NNNNNNNNNNNNNNNN), onlist file: barcodes.txt.\n"); + } + + #[test] + fn test_format_read() { + let read = Read::new( + "R1".into(), + "Read 1".into(), + "rna".into(), + "truseq_read1".into(), + 28, + 28, + "pos".into(), + vec![], + ); + let s = format_read(&read, 1); + assert_eq!(s, "- Read 1: 28 cycles on the positive strand using the truseq_read1 primer. The following files contain the sequences in Read 1:\n"); + } + + #[test] + fn test_format_read_file() { + let f = ReadFile::new( + "f1".into(), + "reads_R1.fastq.gz".into(), + "fastq".into(), + 1024, + "reads_R1.fastq.gz".into(), + "local".into(), + "".into(), + ); + let s = format_read_file(&f, 1); + assert_eq!(s, "- File 1: reads_R1.fastq.gz\n"); + } + + #[test] + fn test_methods_other_modalities() { + let spec = dogma_spec(); + for modality in ["atac", "protein", "tag"] { + let text = seqspec_methods(&spec, modality); + assert!(text.contains("Methods")); + assert!(text.contains(modality)); + } + } + + #[test] + fn test_format_library_spec_sections() { + let spec = dogma_spec(); + let text = format_library_spec(&spec, "rna"); + assert!(text.contains("Libary structure")); + assert!(text.contains("Sequence structure")); + assert!(text.contains("library protocol")); + assert!(text.contains("sequencing kit")); + } +} diff --git a/src/seqspec_modify.rs b/src/seqspec_modify.rs new file mode 100644 index 00000000..685d9a28 --- /dev/null +++ b/src/seqspec_modify.rs @@ -0,0 +1,415 @@ +use crate::models::assay::Assay; +use crate::models::file::File; +use crate::utils; +use clap::Args; +use serde_json::Value; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct ModifyArgs { + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, + + #[clap( + short, + long, + help = "Modality of the assay", + value_name = "MODALITY", + required = true + )] + modality: String, + + #[clap( + short, + long, + help = "JSON 
array of objects to modify", + value_name = "KEYS", + required = true + )] + keys: String, + + #[clap(short = 'i', hide = true, value_name = "IDs")] + legacy_ids: Option, + + #[clap( + short, + long, + help = "Selector", + value_name = "SELECTOR", + default_value = "read", + value_parser = ["read","region","file","seqkit","seqprotocol","libkit","libprotocol","assay"] + )] + selector: String, + + #[clap(short, long, help = "Path to output file", value_name = "OUT")] + output: Option, +} + +pub fn run_modify(args: &ModifyArgs) { + validate_modify_args(args); + let mut spec = utils::load_spec(&args.yaml); + + let keys: Vec = serde_json::from_str(&args.keys).expect("--keys must be a JSON array"); + let selector = args.selector.as_str(); + spec = seqspec_modify(spec, &args.modality, keys, selector); + + spec.update_spec(); + + let yaml = spec.to_bytes().unwrap(); + if let Some(out) = &args.output { + let mut f = fs::File::create(out).unwrap(); + f.write_all(&yaml).unwrap(); + } else { + println!("{}", String::from_utf8_lossy(&yaml)); + } +} + +fn validate_modify_args(args: &ModifyArgs) { + if !args.yaml.exists() { + eprintln!("Please use `seqspec modify -h` for help."); + std::process::exit(1); + } + if let Some(out) = &args.output { + if out.exists() && !out.is_file() { + eprintln!("Output path exists but is not a file: {}", out.display()); + std::process::exit(1); + } + } +} + +pub fn seqspec_modify(mut spec: Assay, modality: &str, keys: Vec, selector: &str) -> Assay { + match selector { + "read" => modify_reads(&mut spec, modality, &keys), + "region" => modify_regions(&mut spec, modality, &keys), + "file" => modify_files(&mut spec, modality, &keys), + "seqkit" => modify_seqkits(&mut spec, &keys), + "seqprotocol" => modify_seqprotocols(&mut spec, &keys), + "libkit" => modify_libkits(&mut spec, &keys), + "libprotocol" => modify_libprotocols(&mut spec, &keys), + "assay" => modify_assay(&mut spec, &keys), + _ => (), + } + spec +} + +fn vstr(v: &Value, key: &str) -> 
Option { + v.get(key).and_then(|x| x.as_str().map(|s| s.to_string())) +} +fn vi64(v: &Value, key: &str) -> Option { + v.get(key).and_then(|x| x.as_i64()) +} + +fn modify_reads(spec: &mut Assay, modality: &str, keys: &Vec) { + for patch in keys { + let Some(read_id) = vstr(patch, "read_id") else { + continue; + }; + if let Some(rd) = spec + .sequence_spec + .iter_mut() + .find(|r| r.modality == modality && r.read_id == read_id) + { + // files optional + let files_opt: Option> = patch.get("files").and_then(|arr| { + arr.as_array().map(|items| { + items + .iter() + .filter_map(|it| { + Some(File::new( + it.get("file_id")?.as_str()?.to_string(), + it.get("filename") + .and_then(|x| x.as_str()) + .unwrap_or("") + .to_string(), + it.get("filetype") + .and_then(|x| x.as_str()) + .unwrap_or("") + .to_string(), + it.get("filesize").and_then(|x| x.as_i64()).unwrap_or(0), + it.get("url") + .and_then(|x| x.as_str()) + .unwrap_or("") + .to_string(), + it.get("urltype") + .and_then(|x| x.as_str()) + .unwrap_or("") + .to_string(), + it.get("md5") + .and_then(|x| x.as_str()) + .unwrap_or("") + .to_string(), + )) + }) + .collect() + }) + }); + + rd.update_read_by_id( + vstr(patch, "read_id"), + vstr(patch, "name"), + vstr(patch, "modality"), + vstr(patch, "primer_id"), + vi64(patch, "min_len"), + vi64(patch, "max_len"), + vstr(patch, "strand"), + files_opt, + ); + } + } +} + +fn modify_regions(spec: &mut Assay, modality: &str, keys: &Vec) { + // find index for modality to get mutable region tree + if let Some(idx) = spec.modalities.iter().position(|m| m == modality) { + if let Some(target) = spec.library_spec.get_mut(idx) { + for patch in keys { + let Some(target_region_id) = vstr(patch, "region_id") else { + continue; + }; + target.update_region_by_id( + target_region_id, + vstr(patch, "region_id"), + vstr(patch, "region_type"), + vstr(patch, "name"), + vstr(patch, "sequence_type"), + vstr(patch, "sequence"), + vi64(patch, "min_len"), + vi64(patch, "max_len"), + ); + } + } + } +} 
+ +fn modify_files(spec: &mut Assay, modality: &str, keys: &Vec) { + for patch in keys { + let Some(file_id) = vstr(patch, "file_id") else { + continue; + }; + for r in spec + .sequence_spec + .iter_mut() + .filter(|r| r.modality == modality) + { + for f in &mut r.files { + if f.file_id == file_id { + if let Some(v) = vstr(patch, "filename") { + f.filename = v; + } + if let Some(v) = vstr(patch, "filetype") { + f.filetype = v; + } + if let Some(v) = vi64(patch, "filesize") { + f.filesize = v; + } + if let Some(v) = vstr(patch, "url") { + f.url = v; + } + if let Some(v) = vstr(patch, "urltype") { + f.urltype = v; + } + if let Some(v) = vstr(patch, "md5") { + f.md5 = v; + } + } + } + } + } +} + +fn modify_seqkits(spec: &mut Assay, keys: &Vec) { + if let Some(kits) = spec.sequence_kit.as_mut() { + for patch in keys { + let Some(kit_id) = vstr(patch, "kit_id") else { + continue; + }; + if let Some(k) = kits.iter_mut().find(|k| k.kit_id == kit_id) { + if let Some(v) = vstr(patch, "name") { + k.name = Some(v); + } + if let Some(v) = vstr(patch, "modality") { + k.modality = v; + } + } + } + } +} + +fn modify_seqprotocols(spec: &mut Assay, keys: &Vec) { + if let Some(protocols) = spec.sequence_protocol.as_mut() { + for patch in keys { + let Some(protocol_id) = vstr(patch, "protocol_id") else { + continue; + }; + if let Some(p) = protocols.iter_mut().find(|p| p.protocol_id == protocol_id) { + if let Some(v) = vstr(patch, "name") { + p.name = v; + } + if let Some(v) = vstr(patch, "modality") { + p.modality = v; + } + } + } + } +} + +fn modify_libkits(spec: &mut Assay, keys: &Vec) { + if let Some(kits) = spec.library_kit.as_mut() { + for patch in keys { + let Some(kit_id) = vstr(patch, "kit_id") else { + continue; + }; + if let Some(k) = kits.iter_mut().find(|k| k.kit_id == kit_id) { + if let Some(v) = vstr(patch, "name") { + k.name = Some(v); + } + if let Some(v) = vstr(patch, "modality") { + k.modality = v; + } + } + } + } +} + +fn modify_libprotocols(spec: &mut Assay, 
keys: &Vec) { + if let Some(protocols) = spec.library_protocol.as_mut() { + for patch in keys { + let Some(protocol_id) = vstr(patch, "protocol_id") else { + continue; + }; + if let Some(p) = protocols.iter_mut().find(|p| p.protocol_id == protocol_id) { + if let Some(v) = vstr(patch, "name") { + p.name = v; + } + if let Some(v) = vstr(patch, "modality") { + p.modality = v; + } + } + } + } +} + +fn modify_assay(spec: &mut Assay, keys: &Vec) { + for patch in keys { + let Some(assay_id) = vstr(patch, "assay_id") else { + continue; + }; + if assay_id != spec.assay_id { + continue; + } + if let Some(v) = vstr(patch, "name") { + spec.name = v; + } + if let Some(v) = vstr(patch, "doi") { + spec.doi = v; + } + if let Some(v) = vstr(patch, "date") { + spec.date = v; + } + if let Some(v) = vstr(patch, "description") { + spec.description = v; + } + if let Some(v) = vstr(patch, "lib_struct") { + spec.lib_struct = v; + } + if let Some(v) = vstr(patch, "assay_id") { + spec.assay_id = v; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + use serde_json::json; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_modify_read_name() { + let spec = dogma_spec(); + let rna_reads = spec.get_seqspec("rna"); + let read_id = rna_reads[0].read_id.clone(); + let keys = vec![json!({"read_id": read_id, "name": "Updated Name"})]; + let modified = seqspec_modify(spec, "rna", keys, "read"); + let read = modified.get_read(&read_id).unwrap(); + assert_eq!(read.name, "Updated Name"); + } + + #[test] + fn test_modify_region_name() { + let spec = dogma_spec(); + let lib = spec.get_libspec("rna").unwrap(); + let leaves = lib.get_leaves(); + let target = &leaves[0]; + let keys = vec![json!({"region_id": target.region_id, "name": "New Name"})]; + let modified = seqspec_modify(spec, "rna", keys, "region"); + let lib = modified.get_libspec("rna").unwrap(); + let found = 
lib.get_region_by_id(&target.region_id); + assert!(!found.is_empty()); + assert_eq!(found[0].name, "New Name"); + } + + #[test] + fn test_modify_file_url() { + let spec = dogma_spec(); + let rna_reads = spec.get_seqspec("rna"); + // Find a read with files + let read_with_files = rna_reads.iter().find(|r| !r.files.is_empty()); + if let Some(rd) = read_with_files { + let file_id = rd.files[0].file_id.clone(); + let keys = vec![json!({"file_id": file_id, "url": "http://new.url/file.fq.gz"})]; + let modified = seqspec_modify(spec, "rna", keys, "file"); + let updated_read = modified.get_read(&rd.read_id).unwrap(); + let f = updated_read + .files + .iter() + .find(|f| f.file_id == file_id) + .unwrap(); + assert_eq!(f.url, "http://new.url/file.fq.gz"); + } + } + + #[test] + fn test_modify_assay_fields() { + let spec = dogma_spec(); + let assay_id = spec.assay_id.clone(); + let keys = vec![ + json!({"assay_id": assay_id, "name": "New Assay Name", "description": "Updated desc"}), + ]; + let modified = seqspec_modify(spec, "rna", keys, "assay"); + assert_eq!(modified.name, "New Assay Name"); + assert_eq!(modified.description, "Updated desc"); + } + + #[test] + fn test_modify_unknown_selector() { + let spec = dogma_spec(); + let keys = vec![json!({"id": "x"})]; + let modified = seqspec_modify(spec.clone(), "rna", keys, "unknown"); + assert_eq!(modified.assay_id, spec.assay_id); + } + + #[test] + fn test_modify_read_preserves_global_read_order() { + let spec = dogma_spec(); + let original_ids: Vec = spec + .sequence_spec + .iter() + .map(|read| read.read_id.clone()) + .collect(); + let keys = vec![json!({"read_id": "rna_R1", "name": "Updated Name"})]; + let modified = seqspec_modify(spec, "rna", keys, "read"); + let modified_ids: Vec = modified + .sequence_spec + .iter() + .map(|read| read.read_id.clone()) + .collect(); + assert_eq!(modified_ids, original_ids); + } +} diff --git a/src/seqspec_onlist.rs b/src/seqspec_onlist.rs new file mode 100644 index 00000000..bf3b8855 --- 
/dev/null +++ b/src/seqspec_onlist.rs @@ -0,0 +1,511 @@ +use crate::auth::RemoteAccess; +use crate::models::assay::Assay; +use crate::models::onlist::Onlist; +use crate::seqspec_find::{find_by_region_id, find_by_region_type}; +use crate::utils; +use clap::Args; +use std::fs; +use std::io::Write; +use std::path::{Path, PathBuf}; + +#[derive(Debug, Args)] +pub struct OnlistArgs { + #[clap( + short, + long, + help = "Path to output file (required for download/join operations)", + value_name = "OUT" + )] + output: Option, + + #[clap(short, long, help = "Selector", value_parser = ["read", "region", "region-type"], default_value = "read")] + selector: String, + + #[clap(short, long, help = "Format for combining multiple onlists", value_parser = ["product", "multi"], default_value = None)] + format: Option, + + #[clap(short = 'i', long, help = "ID to search for")] + id: Option, + + #[clap(short, long, help = "Modality", required = true)] + modality: String, + + #[clap(long, env = "SEQSPEC_AUTH_PROFILE", value_name = "PROFILE")] + auth_profile: Option, + + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, +} + +pub fn run_onlist(args: &OnlistArgs) { + validate_onlist_args(args); + let base_path = args + .yaml + .parent() + .unwrap_or_else(|| Path::new(".")) + .to_path_buf(); + let spec = utils::load_spec(&args.yaml); + let remote_access = RemoteAccess::load(args.auth_profile.as_deref()).unwrap_or_else(|err| { + eprintln!("{}", err); + std::process::exit(1); + }); + + let onlists = get_onlists(&spec, &args.modality, &args.selector, args.id.as_deref()) + .unwrap_or_else(|err| { + eprintln!("{}", err); + std::process::exit(1); + }); + if onlists.is_empty() { + println!("No onlists found"); + return; + } + + if let Some(fmt) = &args.format { + let save_path = args.output.clone().unwrap_or(base_path.join("joined.txt")); + let result_path = + join_onlists_and_save(&onlists, fmt, &save_path, &base_path, &remote_access); + println!("{}", 
result_path.display()); + } else if let Some(out) = &args.output { + let result_paths = download_onlists_to_path(&onlists, out, &base_path, &remote_access); + for p in result_paths { + println!("{}", p.url); + } + } else { + let urls = get_onlist_urls(&onlists, &base_path); + for u in urls { + println!("{}", u.url); + } + } +} + +fn validate_onlist_args(args: &OnlistArgs) { + if !args.yaml.exists() { + eprintln!("Input file does not exist: {}", args.yaml.display()); + std::process::exit(1); + } + if let Some(out) = &args.output { + if out.exists() && !out.is_file() { + eprintln!("Output path exists but is not a file: {}", out.display()); + std::process::exit(1); + } + } +} + +fn get_onlists( + spec: &Assay, + modality: &str, + selector: &str, + id: Option<&str>, +) -> Result, String> { + match selector { + "region-type" => { + let mut out: Vec = Vec::new(); + let mut matches_by_read: Vec<(String, Vec)> = Vec::new(); + for rd in spec.get_seqspec(modality) { + if let Ok((_read, rgns)) = + utils::map_read_id_to_regions(spec, modality, &rd.read_id) + { + let mut ordered: Vec = Vec::new(); + for r in rgns { + if r.region_type == id.unwrap_or("") { + if let Some(ol) = r.get_onlist() { + ordered.push(ol); + } + } + } + if !ordered.is_empty() { + matches_by_read.push((rd.read_id.clone(), ordered)); + } + } + } + if matches_by_read.len() == 1 { + return Ok(matches_by_read.remove(0).1); + } + if matches_by_read.len() > 1 { + let read_ids = matches_by_read + .iter() + .map(|(read_id, _)| read_id.as_str()) + .collect::>() + .join(", "); + return Err(format!( + "region-type '{}' matches regions in multiple reads for modality '{}': {}. 
Use -s read or -s region to disambiguate.", + id.unwrap_or(""), + modality, + read_ids + )); + } + let regions = find_by_region_type(spec, modality, id.unwrap_or("")); + for r in regions { + if let Some(ol) = r.get_onlist() { + out.push(ol); + } + } + Ok(out) + } + "region" => { + let mut out: Vec = Vec::new(); + let regions = find_by_region_id(spec, modality, id.unwrap_or("")); + for r in regions { + if let Some(ol) = r.get_onlist() { + out.push(ol); + } + } + if out.is_empty() { + return Err(format!("No onlist found for region {}", id.unwrap_or(""))); + } + Ok(out) + } + "read" => { + let (read, rgns) = utils::map_read_id_to_regions(spec, modality, id.unwrap_or("")) + .map_err(|err| err.to_string())?; + let region_coordinates = utils::project_regions_to_coordinates(rgns); + let clipped = utils::itx_read(region_coordinates, 0, read.max_len); + let mut out: Vec = Vec::new(); + for rc in clipped { + if let Some(ol) = rc.region.get_onlist() { + out.push(ol); + } + } + Ok(out) + } + _ => Ok(Vec::new()), + } +} + +struct UrlInfo { + url: String, +} + +fn get_onlist_urls(onlists: &Vec, base_path: &Path) -> Vec { + let mut urls = Vec::new(); + for ol in onlists { + let url = if ol.urltype == "local" { + base_path + .join(utils::local_onlist_locator(ol).unwrap_or_else(|err| { + eprintln!("{}", err); + std::process::exit(1); + })) + .to_string_lossy() + .to_string() + } else { + ol.url.clone() + }; + urls.push(UrlInfo { url }); + } + urls +} + +struct PathInfo { + url: String, +} + +fn download_onlists_to_path( + onlists: &Vec, + output_path: &Path, + base_path: &Path, + remote_access: &RemoteAccess, +) -> Vec { + let mut out = Vec::new(); + for ol in onlists { + if ol.urltype == "local" { + let local = base_path.join(utils::local_onlist_locator(ol).unwrap_or_else(|err| { + eprintln!("{}", err); + std::process::exit(1); + })); + out.push(PathInfo { + url: local.to_string_lossy().to_string(), + }); + } else { + let content = utils::read_remote_list(&ol.url, 
remote_access).unwrap_or_default(); + let filename = format!( + "{}_{}", + ol.file_id, + output_path + .file_name() + .unwrap_or_default() + .to_string_lossy() + ); + let download_path = output_path + .parent() + .unwrap_or_else(|| Path::new(".")) + .join(filename); + write_onlist(&content, &download_path); + out.push(PathInfo { + url: download_path.to_string_lossy().to_string(), + }); + } + } + out +} + +fn join_onlists_and_save( + onlists: &Vec, + format_type: &str, + output_path: &Path, + base_path: &Path, + remote_access: &RemoteAccess, +) -> PathBuf { + let mut contents: Vec> = Vec::new(); + for ol in onlists { + let content = if ol.urltype == "local" { + let locator = utils::local_onlist_locator(ol).unwrap_or_else(|err| { + eprintln!("{}", err); + std::process::exit(1); + }); + utils::read_local_list(&base_path.join(locator)).unwrap_or_default() + } else { + utils::read_remote_list(&ol.url, remote_access).unwrap_or_default() + }; + contents.push(content); + } + let joined = join_onlist_contents(contents, format_type); + write_onlist(&joined, output_path); + output_path.to_path_buf() +} + +fn write_onlist(onlist: &Vec, path: &Path) { + let mut f = fs::File::create(path).unwrap(); + for line in onlist { + writeln!(f, "{}", line).unwrap(); + } +} + +fn join_onlist_contents(lists: Vec>, format_type: &str) -> Vec { + match format_type { + "product" => join_product_onlist(lists), + "multi" => join_multi_onlist(lists), + _ => Vec::new(), + } +} + +fn join_product_onlist(lsts: Vec>) -> Vec { + fn cartesian(acc: Vec, rest: &[Vec]) -> Vec { + if rest.is_empty() { + return acc; + } + let mut out = Vec::new(); + for a in &acc { + for b in &rest[0] { + out.push(format!("{}{}", a, b)); + } + } + cartesian(out, &rest[1..]) + } + if lsts.is_empty() { + return Vec::new(); + } + let init = lsts[0].clone(); + cartesian(init, &lsts[1..]) +} + +fn join_multi_onlist(lsts: Vec>) -> Vec { + let max_len = lsts.iter().map(|v| v.len()).max().unwrap_or(0); + let mut out = Vec::new(); + 
for i in 0..max_len { + let row: Vec = lsts + .iter() + .map(|v| v.get(i).cloned().unwrap_or_else(|| "-".to_string())) + .collect(); + out.push(row.join(" ")); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn unique_test_dir(prefix: &str) -> PathBuf { + std::env::temp_dir().join(format!( + "{}-{}-{}", + prefix, + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() + )) + } + + #[test] + fn test_join_product() { + let lists = vec![vec!["A".into(), "B".into()], vec!["1".into(), "2".into()]]; + let result = join_product_onlist(lists); + assert_eq!(result.len(), 4); + assert!(result.contains(&"A1".to_string())); + assert!(result.contains(&"A2".to_string())); + assert!(result.contains(&"B1".to_string())); + assert!(result.contains(&"B2".to_string())); + } + + #[test] + fn test_join_product_three_lists() { + let lists = vec![ + vec!["A".into(), "B".into()], + vec!["1".into()], + vec!["x".into(), "y".into()], + ]; + let result = join_product_onlist(lists); + assert_eq!(result.len(), 4); // 2 * 1 * 2 + assert!(result.contains(&"A1x".to_string())); + assert!(result.contains(&"B1y".to_string())); + } + + #[test] + fn test_join_product_empty() { + let lists: Vec> = vec![]; + let result = join_product_onlist(lists); + assert!(result.is_empty()); + } + + #[test] + fn test_join_multi() { + let lists = vec![ + vec!["A".into(), "B".into(), "C".into()], + vec!["1".into(), "2".into()], + ]; + let result = join_multi_onlist(lists); + assert_eq!(result.len(), 3); // max length + assert_eq!(result[0], "A 1"); + assert_eq!(result[1], "B 2"); + assert_eq!(result[2], "C -"); // padded with "-" + } + + #[test] + fn test_join_multi_empty() { + let lists: Vec> = vec![]; + let result = join_multi_onlist(lists); + assert!(result.is_empty()); + } + + #[test] + fn test_join_onlist_contents_dispatches() { + let lists = vec![vec!["A".into(), "B".into()], vec!["1".into(), "2".into()]]; + let 
product = join_onlist_contents(lists.clone(), "product"); + assert_eq!(product.len(), 4); + + let multi = join_onlist_contents(lists, "multi"); + assert_eq!(multi.len(), 2); + + let unknown = join_onlist_contents(vec![], "invalid"); + assert!(unknown.is_empty()); + } + + #[test] + fn test_get_onlists_by_region_type_errors_when_matches_span_reads() { + let spec = crate::utils::load_spec(&std::path::PathBuf::from("tests/fixtures/spec.yaml")); + let err = get_onlists(&spec, "rna", "region-type", Some("barcode")).unwrap_err(); + assert!(err.contains("matches regions in multiple reads")); + assert!(err.contains("rna_R1")); + assert!(err.contains("rna_R2")); + } + + #[test] + fn test_get_onlists_by_region() { + let spec = crate::utils::load_spec(&std::path::PathBuf::from("tests/fixtures/spec.yaml")); + let onlists = get_onlists(&spec, "rna", "region", Some("rna_cell_bc")).unwrap(); + assert_eq!(onlists.len(), 1); + assert_eq!(onlists[0].filename, "RNA-737K-arc-v1.txt"); + } + + #[test] + fn test_get_onlists_by_read() { + let spec = crate::utils::load_spec(&std::path::PathBuf::from("tests/fixtures/spec.yaml")); + let rna_reads = spec.get_seqspec("rna"); + assert_eq!(rna_reads.len(), 2); + // rna_R1 maps to regions including rna_cell_bc which has an onlist + let onlists = get_onlists(&spec, "rna", "read", Some(&rna_reads[0].read_id)).unwrap(); + assert_eq!(onlists.len(), 1); + assert_eq!(onlists[0].filename, "RNA-737K-arc-v1.txt"); + } + + #[test] + fn test_get_onlists_by_read_respects_read_window() { + let spec = crate::utils::load_spec(&std::path::PathBuf::from( + "tests/fixtures/onlist_read_clip/spec.yaml", + )); + let onlists = get_onlists(&spec, "rna", "read", Some("rna_read")).unwrap(); + let file_ids = onlists + .iter() + .map(|onlist| onlist.file_id.as_str()) + .collect::>(); + assert_eq!(file_ids, vec!["barcode_a.txt"]); + } + + #[test] + fn test_get_onlists_empty_modality() { + let spec = 
crate::utils::load_spec(&std::path::PathBuf::from("tests/fixtures/spec.yaml")); + let onlists = get_onlists(&spec, "rna", "region-type", Some("nonexistent_type")).unwrap(); + assert!(onlists.is_empty()); + } + + #[test] + fn test_get_onlists_region_type_uses_read_order_when_unique() { + let spec = crate::utils::load_spec(&std::path::PathBuf::from( + "tests/fixtures/onlist_issue_68/spec.yaml", + )); + let onlists = get_onlists(&spec, "rna", "region-type", Some("barcode")).unwrap(); + let file_ids = onlists + .iter() + .map(|onlist| onlist.file_id.as_str()) + .collect::>(); + assert_eq!(file_ids, vec!["barcode_b.txt", "barcode_a.txt"]); + } + + #[test] + fn test_get_onlists_region_type_errors_when_matches_span_multiple_reads() { + let spec = crate::utils::load_spec(&std::path::PathBuf::from( + "tests/fixtures/onlist_ambiguous_region_type/spec.yaml", + )); + let err = get_onlists(&spec, "rna", "region-type", Some("barcode")).unwrap_err(); + assert!(err.contains("matches regions in multiple reads")); + assert!(err.contains("rna_read_1")); + assert!(err.contains("rna_read_2")); + } + + #[test] + fn test_get_onlist_urls_prefers_local_url() { + let base = PathBuf::from("/tmp/spec-root"); + let onlists = vec![Onlist::new( + "ol1".into(), + "display.txt".into(), + "txt".into(), + 0, + "nested/whitelist.txt".into(), + "local".into(), + String::new(), + )]; + + let urls = get_onlist_urls(&onlists, &base); + assert_eq!(urls[0].url, "/tmp/spec-root/nested/whitelist.txt"); + } + + #[test] + fn test_join_onlists_and_save_reads_local_onlists_from_url() { + let root = unique_test_dir("seqspec-onlist"); + let nested = root.join("nested"); + std::fs::create_dir_all(&nested).unwrap(); + std::fs::write(nested.join("whitelist.txt"), "AAAA\nCCCC\n").unwrap(); + + let onlists = vec![Onlist::new( + "ol1".into(), + "display.txt".into(), + "txt".into(), + 0, + "nested/whitelist.txt".into(), + "local".into(), + String::new(), + )]; + let output = root.join("joined.txt"); + let remote_access = 
RemoteAccess::anonymous(); + + let result_path = + join_onlists_and_save(&onlists, "product", &output, &root, &remote_access); + + assert_eq!(result_path, output); + assert_eq!(std::fs::read_to_string(&output).unwrap(), "AAAA\nCCCC\n"); + + std::fs::remove_dir_all(root).unwrap(); + } +} diff --git a/src/seqspec_print.rs b/src/seqspec_print.rs new file mode 100644 index 00000000..bbe27c66 --- /dev/null +++ b/src/seqspec_print.rs @@ -0,0 +1,211 @@ +use crate::models::assay::Assay; +use crate::models::region::{Region, RegionCoordinate}; +use crate::seqspec_html; +use crate::utils; +use clap::Args; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct PrintArgs { + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, + + #[clap(short, long, help = "Path to output file", value_name = "OUT")] + output: Option, + + #[clap( + short, + long, + help = "Format", + value_name = "FORMAT", + default_value = "library-ascii", + value_parser = ["library-ascii", "seqspec-ascii", "seqspec-html", "seqspec-png"], + )] + format: String, +} + +pub fn run_print(args: &PrintArgs) { + validate_print_args(args); + + let spec = utils::load_spec(&args.yaml); + let result = seqspec_print(&spec, &args.format).unwrap_or_else(|err| { + eprintln!("{}", err); + std::process::exit(1); + }); + + if let Some(out) = &args.output { + let mut fh = fs::File::create(out).unwrap(); + writeln!(fh, "{result}").unwrap(); + } else { + println!("{result}"); + } +} + +fn validate_print_args(args: &PrintArgs) { + if !args.yaml.exists() { + eprintln!("Input file does not exist: {}", args.yaml.display()); + std::process::exit(1); + } + if let Some(out) = &args.output { + if out.exists() && !out.is_file() { + eprintln!("Output path exists but is not a file: {}", out.display()); + std::process::exit(1); + } + } +} + +pub fn seqspec_print(spec: &Assay, fmt: &str) -> Result { + match fmt { + "library-ascii" => Ok(print_library_ascii(spec)), + 
"seqspec-ascii" => print_seqspec_ascii(spec), + "seqspec-html" => seqspec_html::render_seqspec_html(spec), + "seqspec-png" => Err("seqspec-png is not implemented in the Rust CLI yet".to_string()), + _ => Err(format!("Unsupported format: {}", fmt)), + } +} + +fn print_seqspec_ascii(spec: &Assay) -> Result { + let mut parts = Vec::new(); + for modality in &spec.modalities { + let (p, n) = libseq(spec, modality)?; + parts.push(format_libseq(spec, modality, p, n)?); + } + Ok(parts.join("\n")) +} + +fn format_libseq( + spec: &Assay, + modality: &str, + positive: Vec, + negative: Vec, +) -> Result { + let libspec = spec + .get_libspec(modality) + .ok_or_else(|| format!("modality '{}' not found", modality))?; + + Ok([ + modality.to_string(), + "---".to_string(), + positive.join("\n"), + libspec.sequence.clone(), + utils::complement_seq(&libspec.sequence), + negative.join("\n"), + ] + .join("\n")) +} + +fn libseq(spec: &Assay, modality: &str) -> Result<(Vec, Vec), String> { + let libspec = spec + .get_libspec(modality) + .ok_or_else(|| format!("modality '{}' not found", modality))?; + let seqspec = spec.get_seqspec(modality); + + let mut positive = Vec::new(); + let mut negative = Vec::new(); + + for (idx, read) in seqspec.iter().enumerate() { + let leaves = libspec.get_leaves_with_region_id(&read.primer_id); + let primer_idx = leaves + .iter() + .position(|leaf| leaf.region_id == read.primer_id) + .ok_or_else(|| { + format!( + "primer_id '{}' not found in modality '{}'", + read.primer_id, modality + ) + })?; + + let cuts: Vec = utils::project_regions_to_coordinates(leaves); + let primer_pos = &cuts[primer_idx]; + let arrow_len = read.max_len.saturating_sub(1) as usize; + let arrow = "-".repeat(arrow_len); + + if read.strand == "pos" { + let space_len = primer_pos.stop.saturating_sub(1) as usize; + let ws = " ".repeat(space_len); + positive.push(format!("{}|{}>({}) {}", ws, arrow, idx + 1, read.read_id)); + } else { + let space_len = 
primer_pos.start.saturating_sub(read.max_len) as usize; + let ws = " ".repeat(space_len); + negative.push(format!("{}<{}|({}) {}", ws, arrow, idx + 1, read.read_id)); + } + } + + Ok((positive, negative)) +} + +fn print_library_ascii(spec: &Assay) -> String { + spec.library_spec + .iter() + .map(|region| render_region_tree(region, "", true)) + .collect::>() + .join("\n") +} + +fn render_region_tree(region: &Region, prefix: &str, is_last: bool) -> String { + let branch = if prefix.is_empty() { + "" + } else if is_last { + "└── " + } else { + "├── " + }; + let mut lines = vec![format!( + "{}{}{}({},{})", + prefix, branch, region.region_id, region.min_len, region.max_len + )]; + + let child_prefix = if prefix.is_empty() { + String::new() + } else if is_last { + format!("{} ", prefix) + } else { + format!("{}│ ", prefix) + }; + + for (idx, child) in region.regions.iter().enumerate() { + lines.push(render_region_tree( + child, + &child_prefix, + idx + 1 == region.regions.len(), + )); + } + + lines.join("\n") +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_print_library_ascii_contains_modalities_and_regions() { + let rendered = print_library_ascii(&dogma_spec()); + assert!(rendered.contains("rna(")); + assert!(rendered.contains("rna_cell_bc")); + assert!(rendered.contains("atac")); + } + + #[test] + fn test_print_seqspec_ascii_contains_reads_and_sequence() { + let rendered = print_seqspec_ascii(&dogma_spec()).unwrap(); + assert!(rendered.contains("rna")); + assert!(rendered.contains("rna_R1")); + assert!(rendered.contains("rna_R2")); + assert!(rendered.contains("---")); + } + + #[test] + fn test_print_seqspec_html_contains_payload() { + let html = seqspec_print(&dogma_spec(), "seqspec-html").unwrap(); + assert!(html.contains("seqspec-view-data")); + assert!(html.contains("DOGMAseq-DIG")); + } +} diff --git a/src/seqspec_split.rs 
b/src/seqspec_split.rs new file mode 100644 index 00000000..0602269f --- /dev/null +++ b/src/seqspec_split.rs @@ -0,0 +1,158 @@ +use crate::models::assay::Assay; +use crate::utils; +use clap::Args; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct SplitArgs { + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, + + #[clap( + short, + long, + help = "Path to output files", + value_name = "OUT", + required = true + )] + output: PathBuf, +} + +fn validate_split_args(args: &SplitArgs) { + if !args.yaml.exists() { + eprintln!("Please use `seqspec split -h` for help."); + std::process::exit(1); + } + if args.output.exists() && args.output.is_file() { + eprintln!("Output path exists: {}", args.output.display()); + std::process::exit(1); + } +} + +pub fn run_split(args: &SplitArgs) { + validate_split_args(args); + let spec = utils::load_spec(&args.yaml); + let specs = seqspec_split(&spec); + + // Determine prefix (name + "." 
or "spec.") + let prefix = args + .output + .file_name() + .and_then(|s| s.to_str()) + .filter(|s| !s.is_empty()) + .map(|s| format!("{}.", s)) + .unwrap_or_else(|| "spec.".to_string()); + + // Ensure output directory exists + let _ = fs::create_dir_all(&args.output); + + for mut spec_m in specs { + let modality = spec_m.list_modalities().get(0).cloned().unwrap_or_default(); + let out_path = args.output.join(format!("{}{}.yaml", prefix, modality)); + spec_m.update_spec(); + let bytes = spec_m.to_bytes().expect("serialize to YAML"); + let mut f = fs::File::create(out_path).expect("create output file"); + f.write_all(&bytes).expect("write YAML"); + } +} + +pub fn seqspec_split(spec: &Assay) -> Vec { + let mut specs: Vec = Vec::new(); + let modalities = spec.list_modalities(); + for modality in modalities { + let sequence_spec = spec.get_seqspec(&modality); + let library_spec = vec![spec + .get_libspec(&modality) + .expect("modality not found in library_spec")]; + + let mut spec_m = Assay::new( + spec.assay_id.clone(), + spec.name.clone(), + spec.doi.clone(), + spec.date.clone(), + spec.description.clone(), + vec![modality.clone()], + spec.lib_struct.clone(), + sequence_spec, + library_spec, + spec.sequence_protocol.clone(), + spec.sequence_kit.clone(), + spec.library_protocol.clone(), + spec.library_kit.clone(), + spec.seqspec_version.clone(), + ); + spec_m.update_spec(); + specs.push(spec_m); + } + specs +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_split_produces_one_per_modality() { + let spec = dogma_spec(); + let mods = spec.list_modalities(); + let splits = seqspec_split(&spec); + assert_eq!(splits.len(), mods.len()); + } + + #[test] + fn test_split_preserves_metadata() { + let spec = dogma_spec(); + let splits = seqspec_split(&spec); + for s in &splits { + assert_eq!(s.assay_id, spec.assay_id); + assert_eq!(s.name, 
spec.name); + assert_eq!(s.doi, spec.doi); + } + } + + #[test] + fn test_split_single_modality_each() { + let spec = dogma_spec(); + let splits = seqspec_split(&spec); + for s in &splits { + assert_eq!(s.modalities.len(), 1); + assert_eq!(s.library_spec.len(), 1); + } + } + + #[test] + fn test_split_rna_reads_only() { + let spec = dogma_spec(); + let splits = seqspec_split(&spec); + let rna_spec = splits.iter().find(|s| s.modalities[0] == "rna").unwrap(); + for r in &rna_spec.sequence_spec { + assert_eq!(r.modality, "rna"); + } + } + + #[test] + fn test_split_atac_reads_only() { + let spec = dogma_spec(); + let splits = seqspec_split(&spec); + let atac_spec = splits.iter().find(|s| s.modalities[0] == "atac").unwrap(); + for r in &atac_spec.sequence_spec { + assert_eq!(r.modality, "atac"); + } + } + + #[test] + fn test_split_preserves_total_reads() { + let spec = dogma_spec(); + let total_reads = spec.sequence_spec.len(); + let splits = seqspec_split(&spec); + let split_total: usize = splits.iter().map(|s| s.sequence_spec.len()).sum(); + assert_eq!(split_total, total_reads); + } +} diff --git a/src/seqspec_upgrade.rs b/src/seqspec_upgrade.rs new file mode 100644 index 00000000..6d9d86f8 --- /dev/null +++ b/src/seqspec_upgrade.rs @@ -0,0 +1,226 @@ +use crate::models::assay::Assay; +use crate::models::file::File; +use crate::models::onlist::Onlist; +use crate::utils; +use clap::Args; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +#[derive(Debug, Args)] +pub struct UpgradeArgs { + #[clap(help = "Sequencing specification yaml file", required = true)] + yaml: PathBuf, + + #[clap(short, long, help = "Path to output file", value_name = "OUT")] + output: Option, +} + +pub fn run_upgrade(args: &UpgradeArgs) { + validate_upgrade_args(args); + let spec = utils::load_spec(&args.yaml); + let version = spec + .seqspec_version + .clone() + .unwrap_or_else(|| "0.0.0".to_string()); + let upgraded = seqspec_upgrade(spec, &version); + + let bytes = 
upgraded.to_bytes().unwrap(); + if let Some(out) = &args.output { + let mut f = fs::File::create(out).unwrap(); + f.write_all(&bytes).unwrap(); + } else { + println!("{}", String::from_utf8_lossy(&bytes)); + } +} + +fn validate_upgrade_args(args: &UpgradeArgs) { + if !args.yaml.exists() { + eprintln!("Please use `seqspec upgrade -h` for help."); + std::process::exit(1); + } + if let Some(out) = &args.output { + if out.exists() && !out.is_file() { + eprintln!("Output path exists but is not a file: {}", out.display()); + std::process::exit(1); + } + } +} + +pub fn seqspec_upgrade(spec: Assay, version: &str) -> Assay { + match version { + "0.0.0" => upgrade_0_2_0_to_0_4_0(spec), + "0.1.0" => upgrade_0_2_0_to_0_4_0(spec), + "0.1.1" => upgrade_0_2_0_to_0_4_0(spec), + "0.2.0" => upgrade_0_2_0_to_0_4_0(spec), + "0.3.0" => upgrade_0_3_0_to_0_4_0(spec), + "0.4.0" => upgrade_0_4_0_to_0_4_0(spec), + _ => panic!("Unsupported version: {}", version), + } +} + +fn upgrade_0_3_0_to_0_4_0(spec: Assay) -> Assay { + // 0.3.0 and 0.4.0 are identical + let mut spec = spec; + spec.seqspec_version = Some("0.4.0".to_string()); + spec +} + +fn upgrade_0_4_0_to_0_4_0(spec: Assay) -> Assay { + spec +} + +fn upgrade_0_2_0_to_0_4_0(spec: Assay) -> Assay { + let mut spec = spec; + // ensure reads have files + for r in &mut spec.sequence_spec { + if r.files.is_empty() { + r.update_files(vec![File::new( + r.read_id.clone(), + r.read_id.clone(), + "".to_string(), + 0, + "".to_string(), + "".to_string(), + "".to_string(), + )]); + } + } + + // ensure onlist regions have fully-populated Onlist + for top in &mut spec.library_spec { + let leaves = top.get_leaves(); + for lf in leaves { + if let Some(ol) = &lf.onlist { + let filename = ol.filename.clone(); + let md5 = ol.md5.clone(); + let new_ol = Onlist { + file_id: filename.clone(), + filename: filename, + filetype: "".to_string(), + filesize: 0, + url: "".to_string(), + urltype: "".to_string(), + md5, + }; + // update the region by id with new 
onlist fields + top.update_region_by_id( + lf.region_id.clone(), + None, + None, + None, + None, + None, + None, + None, + ); + // We cannot directly set onlist via update_region_by_id, so re-find and set + // Traverse mutably to set onlist on matching leaf + fn set_onlist_mut( + r: &mut crate::models::region::Region, + id: &str, + new_ol: &Onlist, + ) { + if r.region_id == id { + r.onlist = Some(new_ol.clone()); + return; + } + for c in &mut r.regions { + set_onlist_mut(c, id, new_ol); + } + } + set_onlist_mut(top, &lf.region_id, &new_ol); + } + } + } + spec.seqspec_version = Some("0.4.0".to_string()); + spec +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + use std::path::PathBuf; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_upgrade_0_4_0_is_idempotent() { + let spec = dogma_spec(); + let orig_version = spec.seqspec_version.clone(); + let upgraded = seqspec_upgrade(spec.clone(), "0.4.0"); + assert_eq!(upgraded.seqspec_version, orig_version); + assert_eq!(upgraded.modalities, spec.modalities); + assert_eq!(upgraded.sequence_spec.len(), spec.sequence_spec.len()); + } + + #[test] + fn test_upgrade_0_3_0_sets_version() { + let mut spec = dogma_spec(); + spec.seqspec_version = Some("0.3.0".to_string()); + let upgraded = seqspec_upgrade(spec, "0.3.0"); + assert_eq!(upgraded.seqspec_version, Some("0.4.0".to_string())); + } + + #[test] + fn test_upgrade_0_2_0_adds_files_to_reads() { + let mut spec = dogma_spec(); + // Remove files from a read to simulate 0.2.0 + spec.sequence_spec[0].files.clear(); + assert!(spec.sequence_spec[0].files.is_empty()); + let upgraded = upgrade_0_2_0_to_0_4_0(spec); + // After upgrade, the read should have a placeholder file + assert!(!upgraded.sequence_spec[0].files.is_empty()); + assert_eq!( + upgraded.sequence_spec[0].files[0].file_id, + upgraded.sequence_spec[0].read_id + ); + } + + #[test] + fn test_upgrade_0_2_0_preserves_existing_files() 
{ + let spec = dogma_spec(); + let orig_files_len = spec.sequence_spec[0].files.len(); + assert!(orig_files_len > 0); + let upgraded = upgrade_0_2_0_to_0_4_0(spec); + // Reads that already had files should keep them unchanged + assert_eq!(upgraded.sequence_spec[0].files.len(), orig_files_len); + } + + #[test] + fn test_upgrade_sets_version_0_4_0() { + let mut spec = dogma_spec(); + spec.seqspec_version = Some("0.2.0".to_string()); + let upgraded = seqspec_upgrade(spec, "0.2.0"); + assert_eq!(upgraded.seqspec_version, Some("0.4.0".to_string())); + } + + #[test] + fn test_upgrade_loaded_legacy_0_2_spec() { + let spec = load_spec(&PathBuf::from( + "tests/fixtures/legacy_0_2_missing_fields.yaml", + )); + assert!(spec.sequence_spec[0].files.is_empty()); + + let upgraded = seqspec_upgrade(spec, "0.2.0"); + assert_eq!(upgraded.seqspec_version, Some("0.4.0".to_string())); + assert_eq!(upgraded.sequence_spec[0].files.len(), 1); + assert_eq!( + upgraded.sequence_spec[0].files[0].file_id, + upgraded.sequence_spec[0].read_id + ); + + let barcode = upgraded.library_spec[0] + .get_region_by_id("barcode") + .into_iter() + .next() + .expect("barcode region"); + let onlist = barcode.onlist.expect("barcode onlist"); + assert_eq!(onlist.file_id, "whitelist.txt.gz"); + assert_eq!(onlist.filename, "whitelist.txt.gz"); + assert_eq!(onlist.md5, "abc123"); + } +} diff --git a/src/seqspec_version.rs b/src/seqspec_version.rs new file mode 100644 index 00000000..52c4bb15 --- /dev/null +++ b/src/seqspec_version.rs @@ -0,0 +1,108 @@ +use std::fs::File; +use std::io::Write; +use std::path::{Path, PathBuf}; + +use clap::Args; + +use crate::models::assay::Assay; +use crate::utils; + +/// `seqspec version` +#[derive(Debug, Args)] +pub struct VersionArgs { + /// Output file path (use '-' for stdout) + #[clap(short, long, value_name = "OUT")] + output: Option, + + /// Sequencing specification YAML file + #[clap(value_name = "YAML")] + yaml: PathBuf, +} + +pub fn validate_version_args(args: 
&VersionArgs) { + // just call the runner and print any error nicely + if !args.yaml.exists() { + eprintln!("Please use `seqspec version -h` for help."); + std::process::exit(1); + } +} + +pub fn run_version(args: &VersionArgs) { + validate_version_args(args); + let spec = utils::load_spec(&args.yaml); + let vinfo = seqspec_version(&spec); + let out = format_version(&vinfo); + + match args.output.as_deref() { + // stdout if --output omitted or explicitly '-' + None => { + println!("{out}"); + } + Some(p) if p == Path::new("-") => { + println!("{out}"); + } + Some(p) => { + let mut fh = File::create(p).unwrap(); + writeln!(fh, "{out}").unwrap(); + } + } +} + +/// Return both tool and file versions +pub fn seqspec_version(spec: &Assay) -> VersionInfo { + const TOOL_VERSION: &str = env!("CARGO_PKG_VERSION"); + VersionInfo { + tool_version: TOOL_VERSION.to_string(), + file_version: spec.seqspec_version.clone().unwrap_or_default(), + } +} + +pub struct VersionInfo { + pub tool_version: String, + pub file_version: String, +} + +pub fn format_version(v: &VersionInfo) -> String { + format!( + "seqspec version: {}\nseqspec file version: {}", + v.tool_version, v.file_version + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::load_spec; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_seqspec_version() { + let spec = dogma_spec(); + let v = seqspec_version(&spec); + assert!(!v.tool_version.is_empty()); + assert!(!v.file_version.is_empty()); + } + + #[test] + fn test_format_version_output() { + let v = VersionInfo { + tool_version: "1.0.0".into(), + file_version: "0.3.0".into(), + }; + let out = format_version(&v); + assert!(out.contains("seqspec version: 1.0.0")); + assert!(out.contains("seqspec file version: 0.3.0")); + } + + #[test] + fn test_version_roundtrip() { + let spec = dogma_spec(); + let v = seqspec_version(&spec); + let out = format_version(&v); + 
assert!(out.contains(&v.tool_version)); + assert!(out.contains(&v.file_version)); + } +} diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 00000000..190666f8 --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,525 @@ +use crate::auth::RemoteAccess; +use crate::compat::AssayCompat; +use crate::models::assay::Assay; +use crate::models::onlist::Onlist; +use crate::models::read::Read; +use crate::models::region::{Region, RegionCoordinate}; + +use flate2::read::GzDecoder; +use serde_yaml; +use std::io::Read as IoRead; + +pub fn complement_base(c: char) -> char { + match c { + 'A' => 'T', + 'T' => 'A', + 'G' => 'C', + 'C' => 'G', + 'R' => 'Y', + 'Y' => 'R', + 'S' => 'S', + 'W' => 'W', + 'K' => 'M', + 'M' => 'K', + 'B' => 'V', + 'D' => 'H', + 'V' => 'B', + 'H' => 'D', + 'N' => 'N', + 'X' => 'X', + _ => 'N', + } +} + +pub fn complement_seq(s: &str) -> String { + s.chars() + .map(|c| complement_base(c.to_ascii_uppercase())) + .collect() +} + +// pub fn to_pydict(py: Python<'_>, v: &T) -> Result { +// let obj = pythonize::pythonize(py, v)?; +// Ok(obj.into()) +// } + +pub fn load_spec(spec: &std::path::PathBuf) -> Assay { + let mut f: std::fs::File = std::fs::File::open(spec).expect("Could not open file."); + let mut magic = [0_u8; 2]; + f.read_exact(&mut magic) + .expect("Could not read file header."); + drop(f); + + let reader: Box = if magic == [0x1f, 0x8b] { + let gz = GzDecoder::new(std::fs::File::open(spec).expect("Could not open file.")); + Box::new(gz) + } else { + Box::new(std::fs::File::open(spec).expect("Could not open file.")) + }; + + let spec: AssayCompat = serde_yaml::from_reader(reader).expect("Could not read values."); + + spec.into_assay() +} + +pub fn local_resource_url<'a>( + url: &'a str, + filename: &str, + resource: &str, +) -> Result<&'a str, String> { + if url.is_empty() { + Err(format!("local {} '{}' has empty url", resource, filename)) + } else { + Ok(url) + } +} + +pub fn local_onlist_locator(onlist: &Onlist) -> Result<&str, String> { + 
local_resource_url(&onlist.url, &onlist.filename, "onlist") +} + +/// Read a local text file into Vec, handling optional .gz +pub fn read_local_list(path: &std::path::Path) -> Result, String> { + let p = if path.exists() { + path.to_path_buf() + } else { + let gz = std::path::PathBuf::from(format!("{}.gz", path.display())); + gz + }; + if !p.exists() { + return Err(format!("path not found: {}", path.display())); + } + if p.extension().map(|e| e == "gz").unwrap_or(false) { + let f = std::fs::File::open(&p).map_err(|e| e.to_string())?; + let mut dec = GzDecoder::new(f); + let mut s = String::new(); + use std::io::Read; + dec.read_to_string(&mut s).map_err(|e| e.to_string())?; + Ok(s.lines() + .map(|l| l.trim().to_string()) + .filter(|l| !l.is_empty()) + .collect()) + } else { + let s = std::fs::read_to_string(&p).map_err(|e| e.to_string())?; + Ok(s.lines() + .map(|l| l.trim().to_string()) + .filter(|l| !l.is_empty()) + .collect()) + } +} + +/// Fetch a remote text file (http/https/ftp) and return lines +pub fn read_remote_list(url: &str, remote_access: &RemoteAccess) -> Result, String> { + let text = remote_access + .with_reader(url, |mut reader| { + let mut data = Vec::new(); + reader.read_to_end(&mut data)?; + let text = if url.ends_with(".gz") { + let mut dec = GzDecoder::new(&data[..]); + let mut s = String::new(); + dec.read_to_string(&mut s)?; + s + } else { + String::from_utf8(data)? + }; + Ok(text) + }) + .map_err(|e| e.to_string())?; + Ok(text + .lines() + .map(|l| l.trim().to_string()) + .filter(|l| !l.is_empty()) + .collect()) +} + +/// Map a read_id to the ordered list of regions on that read's strand. +/// +/// Behavior mirrors Python utils.map_read_id_to_regions: +/// - Find the `Read` by `read_id` and its `primer_id`. +/// - From the library spec for `modality`, collect an ordered list where the +/// primer region is present (but not its children). +/// - If strand is "neg", return all regions before the primer in reverse order. 
+/// Else, return all regions after the primer in forward order. +pub fn map_read_id_to_regions( + spec: &Assay, + modality: &str, + read_id: &str, +) -> Result<(Read, Vec), String> { + // get the read object and primer id + let read = spec + .get_read(read_id) + .ok_or_else(|| format!("read_id '{}' not found", read_id))?; + let primer_id = read.primer_id.clone(); + + // get all atomic elements from library + let libspec = spec + .get_libspec(modality) + .ok_or_else(|| format!("modality '{}' not found in library_spec", modality))?; + + // get the (ordered) leaves ensuring region with primer_id is included (but not its children) + let leaves = libspec.get_leaves_with_region_id(&primer_id); + + // get the index of the primer in the list of leaves + let primer_idx = leaves + .iter() + .position(|leaf| leaf.region_id == primer_id) + .ok_or_else(|| { + let ids: Vec = leaves.iter().map(|l| l.region_id.clone()).collect(); + format!("primer_id '{}' not found in regions {:?}", primer_id, ids) + })?; + + // If we are on the opposite strand, we go in the opposite way + let rgns: Vec = if read.strand == "neg" { + let mut v = leaves[..primer_idx].to_vec(); + v.reverse(); + v + } else { + leaves[primer_idx + 1..].to_vec() + }; + + Ok((read, rgns)) +} + +/// Given an ordered list of Regions, produce contiguous RegionCoordinates +/// with half-open ranges [start, stop), where each region spans its max_len. +pub fn project_regions_to_coordinates(regions: Vec) -> Vec { + let mut rcs: Vec = Vec::new(); + let mut prev: i64 = 0; + for region in regions.into_iter() { + let nxt = prev + region.max_len; + rcs.push(RegionCoordinate::new(region, prev, nxt)); + prev = nxt; + } + rcs +} + +/// Intersect a list of RegionCoordinates with a read window [read_start, read_stop). +/// Returns only overlapping coordinates, trimmed to the window. 
+pub fn itx_read( + region_coordinates: Vec, + read_start: i64, + read_stop: i64, +) -> Vec { + let mut new_rcs: Vec = Vec::new(); + for rc in region_coordinates.into_iter() { + if read_start >= rc.stop || read_stop <= rc.start { + continue; + } + + let mut rc_copy = rc.clone(); + if read_start >= rc_copy.start { + rc_copy.start = read_start; + } + if read_stop < rc_copy.stop { + rc_copy.stop = read_stop; + } + new_rcs.push(rc_copy); + } + new_rcs +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::models::region::Region; + use std::path::PathBuf; + + fn dogma_spec() -> Assay { + load_spec(&PathBuf::from("tests/fixtures/spec.yaml")) + } + + #[test] + fn test_load_spec_reads_gzipped_yaml() { + let spec = load_spec(&PathBuf::from("tests/fixtures/spec.yaml.gz")); + assert_eq!(spec.assay_id, "DOGMAseq-DIG"); + assert_eq!(spec.seqspec_version, Some("0.4.0".to_string())); + } + + fn leaf(id: &str, len: i64) -> Region { + Region::new( + id.into(), + "barcode".into(), + id.into(), + "fixed".into(), + "A".repeat(len as usize), + len, + len, + None, + vec![], + ) + } + + // ---- complement ---- + + #[test] + fn test_complement_base_standard() { + assert_eq!(complement_base('A'), 'T'); + assert_eq!(complement_base('T'), 'A'); + assert_eq!(complement_base('C'), 'G'); + assert_eq!(complement_base('G'), 'C'); + } + + #[test] + fn test_complement_base_iupac() { + assert_eq!(complement_base('R'), 'Y'); + assert_eq!(complement_base('Y'), 'R'); + assert_eq!(complement_base('S'), 'S'); + assert_eq!(complement_base('W'), 'W'); + assert_eq!(complement_base('K'), 'M'); + assert_eq!(complement_base('M'), 'K'); + assert_eq!(complement_base('B'), 'V'); + assert_eq!(complement_base('V'), 'B'); + assert_eq!(complement_base('D'), 'H'); + assert_eq!(complement_base('H'), 'D'); + assert_eq!(complement_base('N'), 'N'); + assert_eq!(complement_base('X'), 'X'); + } + + #[test] + fn test_complement_base_unknown() { + assert_eq!(complement_base('Z'), 'N'); + assert_eq!(complement_base('?'), 
'N'); + } + + #[test] + fn test_complement_seq() { + assert_eq!(complement_seq("ATCG"), "TAGC"); + assert_eq!(complement_seq("AAAA"), "TTTT"); + assert_eq!(complement_seq(""), ""); + } + + #[test] + fn test_complement_seq_lowercase() { + assert_eq!(complement_seq("atcg"), "TAGC"); + assert_eq!(complement_seq("AaTt"), "TTAA"); + } + + // ---- load_spec ---- + + #[test] + fn test_load_spec() { + let spec = dogma_spec(); + assert_eq!(spec.assay_id, "DOGMAseq-DIG"); + assert_eq!(spec.modalities.len(), 4); + assert_eq!(spec.sequence_spec.len(), 9); // 2 RNA + 3 ATAC + 2 Protein + 2 Tag + assert_eq!(spec.library_spec.len(), 4); + } + + #[test] + fn test_load_spec_accepts_legacy_scalar_protocol_fields() { + let spec = load_spec(&PathBuf::from( + "tests/fixtures/legacy_0_3_scalar_protocols.yaml", + )); + + assert_eq!(spec.seqspec_version, Some("0.3.0".to_string())); + assert_eq!(spec.modalities, vec!["rna".to_string()]); + + let sequence_protocol = spec.sequence_protocol.expect("sequence protocol"); + assert_eq!(sequence_protocol.len(), 1); + assert_eq!(sequence_protocol[0].protocol_id, "NovaSeq"); + assert_eq!(sequence_protocol[0].name, "NovaSeq"); + assert_eq!(sequence_protocol[0].modality, "rna"); + + let library_kit = spec.library_kit.expect("library kit"); + assert_eq!(library_kit.len(), 1); + assert_eq!(library_kit[0].kit_id, "LegacyKit"); + assert_eq!(library_kit[0].name.as_deref(), Some("LegacyKit")); + assert_eq!(library_kit[0].modality, "rna"); + } + + #[test] + fn test_load_spec_accepts_legacy_missing_files_and_short_onlist() { + let spec = load_spec(&PathBuf::from( + "tests/fixtures/legacy_0_2_missing_fields.yaml", + )); + + assert_eq!(spec.seqspec_version, Some("0.2.0".to_string())); + assert_eq!(spec.sequence_spec.len(), 1); + assert!(spec.sequence_spec[0].files.is_empty()); + + let barcode = spec.library_spec[0] + .get_region_by_id("barcode") + .into_iter() + .next() + .expect("barcode region"); + let onlist = barcode.onlist.expect("barcode onlist"); + 
assert_eq!(onlist.filename, "whitelist.txt.gz"); + assert_eq!(onlist.md5, "abc123"); + assert_eq!(onlist.file_id, ""); + assert_eq!(onlist.urltype, "local"); + } + + // ---- map_read_id_to_regions ---- + + #[test] + fn test_map_read_id_to_regions_pos() { + let spec = dogma_spec(); + let result = map_read_id_to_regions(&spec, "rna", "rna_R1"); + assert!(result.is_ok()); + let (r, regions) = result.unwrap(); + assert_eq!(r.read_id, "rna_R1"); + assert_eq!(r.strand, "pos"); + assert_eq!(regions.len(), 4); + let region_ids: Vec<&str> = regions.iter().map(|r| r.region_id.as_str()).collect(); + assert_eq!( + region_ids, + vec!["rna_cell_bc", "rna_umi", "cdna", "rna_truseq_read2"] + ); + } + + #[test] + fn test_map_read_id_to_regions_neg() { + let spec = dogma_spec(); + let result = map_read_id_to_regions(&spec, "rna", "rna_R2"); + assert!(result.is_ok()); + let (r, regions) = result.unwrap(); + assert_eq!(r.read_id, "rna_R2"); + assert_eq!(r.strand, "neg"); + assert_eq!(regions.len(), 4); + // Negative strand reverses the region order + let region_ids: Vec<&str> = regions.iter().map(|r| r.region_id.as_str()).collect(); + assert_eq!( + region_ids, + vec!["cdna", "rna_umi", "rna_cell_bc", "rna_truseq_read1"] + ); + } + + #[test] + fn test_map_read_id_to_regions_invalid_read() { + let spec = dogma_spec(); + let result = map_read_id_to_regions(&spec, "rna", "nonexistent_read"); + assert!(result.is_err()); + assert!(result.unwrap_err().contains("not found")); + } + + #[test] + fn test_map_read_id_to_regions_invalid_modality() { + let spec = dogma_spec(); + let result = map_read_id_to_regions(&spec, "nonexistent", "rna_R1"); + assert!(result.is_err()); + } + + // ---- project_regions_to_coordinates ---- + + #[test] + fn test_project_regions_to_coordinates() { + let regions = vec![leaf("a", 10), leaf("b", 20), leaf("c", 5)]; + let coords = project_regions_to_coordinates(regions); + assert_eq!(coords.len(), 3); + assert_eq!(coords[0].start, 0); + assert_eq!(coords[0].stop, 10); 
+ assert_eq!(coords[1].start, 10); + assert_eq!(coords[1].stop, 30); + assert_eq!(coords[2].start, 30); + assert_eq!(coords[2].stop, 35); + } + + // ---- itx_read ---- + + #[test] + fn test_itx_read() { + let regions = vec![leaf("a", 10), leaf("b", 20), leaf("c", 5)]; + let coords = project_regions_to_coordinates(regions); + + // Read window [5, 25) — should trim a and b, exclude c + let result = itx_read(coords, 5, 25); + assert_eq!(result.len(), 2); + assert_eq!(result[0].start, 5); + assert_eq!(result[0].stop, 10); + assert_eq!(result[1].start, 10); + assert_eq!(result[1].stop, 25); + } + + #[test] + fn test_itx_read_no_overlap() { + let regions = vec![leaf("a", 10)]; + let coords = project_regions_to_coordinates(regions); + let result = itx_read(coords, 20, 30); + assert!(result.is_empty()); + } + + #[test] + fn test_itx_read_full_overlap() { + let regions = vec![leaf("a", 10)]; + let coords = project_regions_to_coordinates(regions); + let result = itx_read(coords, 0, 100); + assert_eq!(result.len(), 1); + assert_eq!(result[0].start, 0); + assert_eq!(result[0].stop, 10); + } + + // ---- read_local_list ---- + + #[test] + fn test_read_local_list_plain() { + let path = PathBuf::from("tests/fixtures/onlist_joined.txt"); + let result = read_local_list(&path).unwrap(); + assert_eq!(result.len(), 736320); + assert_eq!(result[0], "AAACAGCCAAACAACA"); + } + + #[test] + fn test_read_local_list_gz() { + let path = PathBuf::from("tests/fixtures/RNA-737K-arc-v1.txt.gz"); + let result = read_local_list(&path).unwrap(); + assert_eq!(result.len(), 736320); + } + + #[test] + fn test_read_local_list_gz_fallback() { + // Try path without .gz extension — read_local_list should find the .gz variant + let path = PathBuf::from("tests/fixtures/RNA-737K-arc-v1.txt"); + let result = read_local_list(&path).unwrap(); + assert_eq!(result.len(), 736320); + } + + #[test] + fn test_read_local_list_not_found() { + let path = PathBuf::from("tests/fixtures/nonexistent.txt"); + let result = 
read_local_list(&path); + assert!(result.is_err()); + } + + #[test] + fn test_local_onlist_locator_prefers_url_when_present() { + let onlist = Onlist::new( + "ol1".into(), + "display.txt".into(), + "txt".into(), + 0, + "nested/whitelist.txt".into(), + "local".into(), + String::new(), + ); + + assert_eq!( + local_onlist_locator(&onlist).unwrap(), + "nested/whitelist.txt" + ); + } + + #[test] + fn test_local_onlist_locator_errors_when_url_is_empty() { + let onlist = Onlist::new( + "ol1".into(), + "display.txt".into(), + "txt".into(), + 0, + String::new(), + "local".into(), + String::new(), + ); + + assert_eq!( + local_onlist_locator(&onlist).unwrap_err(), + "local onlist 'display.txt' has empty url" + ); + } + + #[test] + fn test_local_resource_url_errors_when_url_is_empty() { + assert_eq!( + local_resource_url("", "display.fastq.gz", "file").unwrap_err(), + "local file 'display.fastq.gz' has empty url" + ); + } +} diff --git a/tests/fixtures/check_overlap_warning/rna_R1.fastq b/tests/fixtures/check_overlap_warning/rna_R1.fastq new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/tests/fixtures/check_overlap_warning/rna_R1.fastq @@ -0,0 +1 @@ + diff --git a/tests/fixtures/check_overlap_warning/rna_R2.fastq b/tests/fixtures/check_overlap_warning/rna_R2.fastq new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/tests/fixtures/check_overlap_warning/rna_R2.fastq @@ -0,0 +1 @@ + diff --git a/tests/fixtures/check_overlap_warning/spec.yaml b/tests/fixtures/check_overlap_warning/spec.yaml new file mode 100644 index 00000000..2d89ff33 --- /dev/null +++ b/tests/fixtures/check_overlap_warning/spec.yaml @@ -0,0 +1,102 @@ +seqspec_version: 0.4.0 +assay_id: overlap-warning +name: Overlap Warning Fixture +doi: https://doi.org/10.0000/overlap-warning +date: 25 March 2026 +description: Minimal valid fixture with overlapping read coverage. 
+modalities: + - rna +lib_struct: "" +sequence_protocol: + - protocol_id: illumina-protocol + name: Illumina + modality: rna +sequence_kit: + - kit_id: test-sequencing-kit + name: Illumina + modality: rna +library_protocol: + - protocol_id: test-library-protocol + name: Test library protocol + modality: rna +library_kit: + - kit_id: test-library-kit + name: Test library kit + modality: rna +sequence_spec: + - read_id: rna_R1 + name: Read 1 + modality: rna + primer_id: primer_r1 + min_len: 8 + max_len: 8 + strand: pos + files: + - file_id: rna_R1.fastq + filename: rna_R1.fastq + filetype: fastq + filesize: 0 + url: rna_R1.fastq + urltype: local + md5: "00000000000000000000000000000000" + - read_id: rna_R2 + name: Read 2 + modality: rna + primer_id: primer_r2 + min_len: 8 + max_len: 8 + strand: neg + files: + - file_id: rna_R2.fastq + filename: rna_R2.fastq + filetype: fastq + filesize: 0 + url: rna_R2.fastq + urltype: local + md5: "11111111111111111111111111111111" +library_spec: + - region_id: rna + region_type: rna + name: RNA + sequence_type: joined + sequence: AAAACCCCGGGGTTTT + min_len: 16 + max_len: 16 + onlist: null + regions: + - region_id: primer_r1 + region_type: truseq_read1 + name: Primer R1 + sequence_type: fixed + sequence: AAAA + min_len: 4 + max_len: 4 + onlist: null + regions: [] + - region_id: barcode + region_type: barcode + name: Barcode + sequence_type: fixed + sequence: CCCC + min_len: 4 + max_len: 4 + onlist: null + regions: [] + - region_id: umi + region_type: umi + name: UMI + sequence_type: fixed + sequence: GGGG + min_len: 4 + max_len: 4 + onlist: null + regions: [] + - region_id: primer_r2 + region_type: truseq_read2 + name: Primer R2 + sequence_type: fixed + sequence: TTTT + min_len: 4 + max_len: 4 + onlist: null + regions: [] diff --git a/tests/fixtures/legacy_0_2_missing_fields.yaml b/tests/fixtures/legacy_0_2_missing_fields.yaml new file mode 100644 index 00000000..1b3928d6 --- /dev/null +++ 
b/tests/fixtures/legacy_0_2_missing_fields.yaml @@ -0,0 +1,51 @@ +!Assay +seqspec_version: 0.2.0 +assay_id: legacy-02 +name: legacy-02 +doi: "" +date: "2024-01-01" +description: legacy missing files and short onlist +modalities: + - rna +lib_struct: "" +sequence_spec: + - !Read + read_id: rna_R1 + name: rna_R1 + modality: rna + primer_id: truseq_read1 + min_len: 16 + max_len: 16 + strand: pos +library_spec: + - !Region + region_id: rna + region_type: rna + name: rna + sequence_type: joined + sequence: "" + min_len: 0 + max_len: 0 + onlist: null + regions: + - !Region + region_id: truseq_read1 + region_type: truseq_read1 + name: truseq_read1 + sequence_type: fixed + sequence: ACGTACGT + min_len: 8 + max_len: 8 + onlist: null + regions: [] + - !Region + region_id: barcode + region_type: barcode + name: barcode + sequence_type: onlist + min_len: 8 + max_len: 8 + onlist: + filename: whitelist.txt.gz + md5: abc123 + regions: [] diff --git a/tests/fixtures/legacy_0_3_scalar_protocols.yaml b/tests/fixtures/legacy_0_3_scalar_protocols.yaml new file mode 100644 index 00000000..f42be777 --- /dev/null +++ b/tests/fixtures/legacy_0_3_scalar_protocols.yaml @@ -0,0 +1,62 @@ +!Assay +seqspec_version: 0.3.0 +assay_id: legacy-03 +name: legacy-03 +doi: "" +date: "2025-01-01" +description: legacy scalar protocol fields +modalities: + - rna +lib_struct: "" +library_protocol: LegacyLibraryProtocol +library_kit: LegacyKit +sequence_protocol: NovaSeq +sequence_kit: NovaSeqKit +sequence_spec: + - !Read + read_id: rna_R1 + name: rna_R1 + modality: rna + primer_id: truseq_read1 + min_len: 28 + max_len: 28 + strand: pos + files: + - file_id: rna_R1.fastq.gz + filename: rna_R1.fastq.gz + filetype: gz + filesize: 0 + url: rna_R1.fastq.gz + urltype: local + md5: "" +library_spec: + - !Region + region_id: rna + region_type: rna + name: rna + sequence_type: joined + sequence: "" + min_len: 0 + max_len: 0 + onlist: null + regions: + - !Region + region_id: truseq_read1 + region_type: truseq_read1 + 
name: truseq_read1 + sequence_type: fixed + sequence: ACGTACGT + min_len: 8 + max_len: 8 + onlist: null + regions: [] + - !Region + region_id: umi + region_type: umi + name: umi + sequence_type: random + sequence: XXXXXXXXXX + min_len: 10 + max_len: 10 + onlist: null + regions: [] diff --git a/tests/fixtures/onlist_ambiguous_region_type/barcode_r1.txt b/tests/fixtures/onlist_ambiguous_region_type/barcode_r1.txt new file mode 100644 index 00000000..104cbc4e --- /dev/null +++ b/tests/fixtures/onlist_ambiguous_region_type/barcode_r1.txt @@ -0,0 +1 @@ +AA diff --git a/tests/fixtures/onlist_ambiguous_region_type/barcode_r2.txt b/tests/fixtures/onlist_ambiguous_region_type/barcode_r2.txt new file mode 100644 index 00000000..d8fd35e8 --- /dev/null +++ b/tests/fixtures/onlist_ambiguous_region_type/barcode_r2.txt @@ -0,0 +1 @@ +TT diff --git a/tests/fixtures/onlist_ambiguous_region_type/spec.yaml b/tests/fixtures/onlist_ambiguous_region_type/spec.yaml new file mode 100644 index 00000000..97ff368b --- /dev/null +++ b/tests/fixtures/onlist_ambiguous_region_type/spec.yaml @@ -0,0 +1,104 @@ +seqspec_version: 0.4.0 +assay_id: ambiguous_onlist_region_type +name: Ambiguous region-type onlist fixture +doi: https://doi.org/10.1101/ambiguous-onlist +date: 25 March 2026 +description: Minimal fixture where the same region type appears in two reads. 
+modalities: + - rna +lib_struct: '' +sequence_protocol: Illumina +sequence_kit: test-sequencing-kit +library_protocol: test-library-protocol +library_kit: test-library-kit +sequence_spec: + - read_id: rna_read_1 + name: Read 1 + modality: rna + primer_id: primer_r1 + min_len: 2 + max_len: 2 + strand: neg + files: + - file_id: rna_read_1.fastq.gz + filename: rna_read_1.fastq.gz + filetype: fastq + filesize: 0 + url: rna_read_1.fastq.gz + urltype: local + md5: "00000000000000000000000000000001" + - read_id: rna_read_2 + name: Read 2 + modality: rna + primer_id: primer_r2 + min_len: 2 + max_len: 2 + strand: neg + files: + - file_id: rna_read_2.fastq.gz + filename: rna_read_2.fastq.gz + filetype: fastq + filesize: 0 + url: rna_read_2.fastq.gz + urltype: local + md5: "00000000000000000000000000000002" +library_spec: + - region_id: rna + region_type: rna + name: RNA + sequence_type: joined + sequence: NNAANNGG + min_len: 8 + max_len: 8 + onlist: null + regions: + - region_id: barcode_r1 + region_type: barcode + name: Barcode R1 + sequence_type: onlist + sequence: NN + min_len: 2 + max_len: 2 + onlist: + file_id: barcode_r1.txt + filename: barcode_r1.txt + filetype: txt + filesize: 0 + url: barcode_r1.txt + urltype: local + md5: "11111111111111111111111111111111" + regions: [] + - region_id: primer_r1 + region_type: truseq_read1 + name: Primer R1 + sequence_type: fixed + sequence: AA + min_len: 2 + max_len: 2 + onlist: null + regions: [] + - region_id: barcode_r2 + region_type: barcode + name: Barcode R2 + sequence_type: onlist + sequence: NN + min_len: 2 + max_len: 2 + onlist: + file_id: barcode_r2.txt + filename: barcode_r2.txt + filetype: txt + filesize: 0 + url: barcode_r2.txt + urltype: local + md5: "22222222222222222222222222222222" + regions: [] + - region_id: primer_r2 + region_type: truseq_read2 + name: Primer R2 + sequence_type: fixed + sequence: GG + min_len: 2 + max_len: 2 + onlist: null + regions: [] diff --git a/tests/fixtures/onlist_issue_68/barcode_a.txt 
b/tests/fixtures/onlist_issue_68/barcode_a.txt new file mode 100644 index 00000000..9a855085 --- /dev/null +++ b/tests/fixtures/onlist_issue_68/barcode_a.txt @@ -0,0 +1,2 @@ +AA +AC diff --git a/tests/fixtures/onlist_issue_68/barcode_b.txt b/tests/fixtures/onlist_issue_68/barcode_b.txt new file mode 100644 index 00000000..a71fb484 --- /dev/null +++ b/tests/fixtures/onlist_issue_68/barcode_b.txt @@ -0,0 +1,2 @@ +TT +TG diff --git a/tests/fixtures/onlist_issue_68/spec.yaml b/tests/fixtures/onlist_issue_68/spec.yaml new file mode 100644 index 00000000..ebb47e5e --- /dev/null +++ b/tests/fixtures/onlist_issue_68/spec.yaml @@ -0,0 +1,80 @@ +seqspec_version: 0.4.0 +assay_id: issue68 +name: Issue 68 fixture +doi: https://doi.org/10.1101/issue68 +date: 25 March 2026 +description: Minimal fixture for onlist ordering regression. +modalities: + - rna +lib_struct: '' +sequence_protocol: Illumina +sequence_kit: test-sequencing-kit +library_protocol: test-library-protocol +library_kit: test-library-kit +sequence_spec: + - read_id: rna_read + name: Read 1 + modality: rna + primer_id: primer + min_len: 4 + max_len: 4 + strand: neg + files: + - file_id: rna_read.fastq.gz + filename: rna_read.fastq.gz + filetype: fastq + filesize: 0 + url: rna_read.fastq.gz + urltype: local + md5: "00000000000000000000000000000000" +library_spec: + - region_id: rna + region_type: rna + name: RNA + sequence_type: joined + sequence: NNNNGG + min_len: 6 + max_len: 6 + onlist: null + regions: + - region_id: barcode_a + region_type: barcode + name: Barcode A + sequence_type: onlist + sequence: NN + min_len: 2 + max_len: 2 + onlist: + file_id: barcode_a.txt + filename: barcode_a.txt + filetype: txt + filesize: 0 + url: barcode_a.txt + urltype: local + md5: "11111111111111111111111111111111" + regions: [] + - region_id: barcode_b + region_type: barcode + name: Barcode B + sequence_type: onlist + sequence: NN + min_len: 2 + max_len: 2 + onlist: + file_id: barcode_b.txt + filename: barcode_b.txt + filetype: 
txt + filesize: 0 + url: barcode_b.txt + urltype: local + md5: "22222222222222222222222222222222" + regions: [] + - region_id: primer + region_type: truseq_read2 + name: Primer + sequence_type: fixed + sequence: GG + min_len: 2 + max_len: 2 + onlist: null + regions: [] diff --git a/tests/fixtures/onlist_read_clip/barcode_a.txt b/tests/fixtures/onlist_read_clip/barcode_a.txt new file mode 100644 index 00000000..9a855085 --- /dev/null +++ b/tests/fixtures/onlist_read_clip/barcode_a.txt @@ -0,0 +1,2 @@ +AA +AC diff --git a/tests/fixtures/onlist_read_clip/barcode_b.txt b/tests/fixtures/onlist_read_clip/barcode_b.txt new file mode 100644 index 00000000..a71fb484 --- /dev/null +++ b/tests/fixtures/onlist_read_clip/barcode_b.txt @@ -0,0 +1,2 @@ +TT +TG diff --git a/tests/fixtures/onlist_read_clip/spec.yaml b/tests/fixtures/onlist_read_clip/spec.yaml new file mode 100644 index 00000000..b1f3425b --- /dev/null +++ b/tests/fixtures/onlist_read_clip/spec.yaml @@ -0,0 +1,80 @@ +seqspec_version: 0.4.0 +assay_id: onlist_read_clip +name: Onlist read clip fixture +doi: https://doi.org/10.1101/onlist-read-clip +date: 25 March 2026 +description: Minimal fixture to ensure read onlists respect the read window. 
+modalities: + - rna +lib_struct: '' +sequence_protocol: Illumina +sequence_kit: test-sequencing-kit +library_protocol: test-library-protocol +library_kit: test-library-kit +sequence_spec: + - read_id: rna_read + name: Read 1 + modality: rna + primer_id: primer + min_len: 2 + max_len: 2 + strand: pos + files: + - file_id: rna_read.fastq.gz + filename: rna_read.fastq.gz + filetype: fastq + filesize: 0 + url: rna_read.fastq.gz + urltype: local + md5: "00000000000000000000000000000000" +library_spec: + - region_id: rna + region_type: rna + name: RNA + sequence_type: joined + sequence: GGNNNN + min_len: 6 + max_len: 6 + onlist: null + regions: + - region_id: primer + region_type: truseq_read1 + name: Primer + sequence_type: fixed + sequence: GG + min_len: 2 + max_len: 2 + onlist: null + regions: [] + - region_id: barcode_a + region_type: barcode + name: Barcode A + sequence_type: onlist + sequence: NN + min_len: 2 + max_len: 2 + onlist: + file_id: barcode_a.txt + filename: barcode_a.txt + filetype: txt + filesize: 0 + url: barcode_a.txt + urltype: local + md5: "11111111111111111111111111111111" + regions: [] + - region_id: barcode_b + region_type: barcode + name: Barcode B + sequence_type: onlist + sequence: NN + min_len: 2 + max_len: 2 + onlist: + file_id: barcode_b.txt + filename: barcode_b.txt + filetype: txt + filesize: 0 + url: barcode_b.txt + urltype: local + md5: "22222222222222222222222222222222" + regions: [] diff --git a/tests/fixtures/random_with_n/fastqs/I1.fastq b/tests/fixtures/random_with_n/fastqs/I1.fastq new file mode 100644 index 00000000..e1164c11 --- /dev/null +++ b/tests/fixtures/random_with_n/fastqs/I1.fastq @@ -0,0 +1,4 @@ +@i1_1 +ACGT ++ +IIII diff --git a/tests/fixtures/random_with_n/spec.yaml b/tests/fixtures/random_with_n/spec.yaml new file mode 100644 index 00000000..9a74db99 --- /dev/null +++ b/tests/fixtures/random_with_n/spec.yaml @@ -0,0 +1,53 @@ +seqspec_version: 0.4.0 +assay_id: random-with-n +name: Random With N Fixture +doi: 
https://doi.org/10.0000/random-with-n +date: 2026-03-20 +description: Minimal fixture that uses N in a random region sequence +modalities: + - rna +lib_struct: "" +sequence_spec: + - read_id: I1 + name: Index Read + modality: rna + primer_id: primer + min_len: 4 + max_len: 4 + strand: pos + files: + - file_id: I1.fastq + filename: I1.fastq + filetype: fastq + filesize: 0 + url: fastqs/I1.fastq + urltype: local + md5: "" +library_spec: + - region_id: rna + region_type: rna + name: RNA + sequence_type: joined + sequence: AAAANNNN + min_len: 8 + max_len: 8 + onlist: null + regions: + - region_id: primer + region_type: truseq_read1 + name: Primer + sequence_type: fixed + sequence: AAAA + min_len: 4 + max_len: 4 + onlist: null + regions: [] + - region_id: index7 + region_type: index7 + name: Index 7 + sequence_type: random + sequence: NNNN + min_len: 4 + max_len: 4 + onlist: null + regions: [] diff --git a/tests/fixtures/spec.yaml b/tests/fixtures/spec.yaml index 326ef70c..e03934a6 100644 --- a/tests/fixtures/spec.yaml +++ b/tests/fixtures/spec.yaml @@ -3,450 +3,457 @@ assay_id: DOGMAseq-DIG name: DOGMAseq-DIG/Illumina doi: https://doi.org/10.1186/s13059-022-02698-8 date: 23 June 2022 -description: - DOGMAseq with digitonin (DIG) is a single-cell multi-omics assay that +description: DOGMAseq with digitonin (DIG) is a single-cell multi-omics assay that simultaneously measures protein, RNA, and chromatin accessibility in single cells. The assay is based on the DOGMAseq technology, which uses a DNA-barcoded antibody library to capture proteins of interest, followed by a single-cell RNA-seq protocol and a single-cell ATAC-seq protocol. The DOGMAseq-LLL assay is designed to be compatible with the 10x Genomics Chromium platform. 
modalities: - - protein - - tag - - rna - - atac -lib_struct: "" -sequence_protocol: Illumina NovaSeq 6000 (EFO:0008637) +- protein +- tag +- rna +- atac +lib_struct: '' +sequence_protocol: +- protocol_id: Illumina NovaSeq 6000 (EFO:0008637) + name: Illumina NovaSeq 6000 (EFO:0008637) + modality: protein +- protocol_id: Illumina NovaSeq 6000 (EFO:0008637) + name: Illumina NovaSeq 6000 (EFO:0008637) + modality: tag +- protocol_id: Illumina NovaSeq 6000 (EFO:0008637) + name: Illumina NovaSeq 6000 (EFO:0008637) + modality: rna +- protocol_id: Illumina NovaSeq 6000 (EFO:0008637) + name: Illumina NovaSeq 6000 (EFO:0008637) + modality: atac sequence_kit: - - kit_id: "NovaSeq 6000 S2 Reagent Kit v1.5 (100\u2009cycles)" - name: illumina - modality: rna - - kit_id: "NovaSeq 6000 S1 Reagent Kit v1.5 (100\u2009cycles)" - name: illumina - modality: tag - - kit_id: "NovaSeq 6000 S1 Reagent Kit v1.5 (100\u2009cycles)" - name: illumina - modality: protein - - kit_id: "NovaSeq 6000 S1 Reagent Kit v1.5 (100\u2009cycles)" - name: illumina - modality: atac +- kit_id: "NovaSeq 6000 S2 Reagent Kit v1.5 (100\u2009cycles)" + name: illumina + modality: rna +- kit_id: "NovaSeq 6000 S1 Reagent Kit v1.5 (100\u2009cycles)" + name: illumina + modality: tag +- kit_id: "NovaSeq 6000 S1 Reagent Kit v1.5 (100\u2009cycles)" + name: illumina + modality: protein +- kit_id: "NovaSeq 6000 S1 Reagent Kit v1.5 (100\u2009cycles)" + name: illumina + modality: atac library_protocol: - - protocol_id: - CG000338 Chromium Next GEM Multiome ATAC + Gene Expression Rev. D protocol - (10x Genomics) - name: DogmaSeq-DIG - modality: rna - - protocol_id: - CG000338 Chromium Next GEM Multiome ATAC + Gene Expression Rev. 
D protocol - (10x Genomics) - name: DogmaSeq-DIG - modality: atac - - protocol_id: - "TotalSeq\u2122-A Antibodies and Cell Hashing with 10x Single Cell\ - \ 3' Reagent Kit v3 or v3.1 single index protocol (BioLegend)" - name: DogmaSeq-DIG - modality: protein - - protocol_id: - "TotalSeq\u2122-A Antibodies and Cell Hashing with 10x Single Cell\ - \ 3' Reagent Kit v3 or v3.1 single index protocol (BioLegend)" - name: DogmaSeq-DIG - modality: tag +- protocol_id: CG000338 Chromium Next GEM Multiome ATAC + Gene Expression Rev. D protocol + (10x Genomics) + name: DogmaSeq-DIG + modality: rna +- protocol_id: CG000338 Chromium Next GEM Multiome ATAC + Gene Expression Rev. D protocol + (10x Genomics) + name: DogmaSeq-DIG + modality: atac +- protocol_id: "TotalSeq\u2122-A Antibodies and Cell Hashing with 10x Single Cell\ + \ 3' Reagent Kit v3 or v3.1 single index protocol (BioLegend)" + name: DogmaSeq-DIG + modality: protein +- protocol_id: "TotalSeq\u2122-A Antibodies and Cell Hashing with 10x Single Cell\ + \ 3' Reagent Kit v3 or v3.1 single index protocol (BioLegend)" + name: DogmaSeq-DIG + modality: tag library_kit: - - kit_id: Illumina Truseq Single Index - name: null - modality: rna +- kit_id: Illumina Truseq Single Index + name: null + modality: rna sequence_spec: - - read_id: protein_R1 - name: protein Read 1 - modality: protein - primer_id: protein_truseq_read1 - min_len: 28 - max_len: 28 - strand: pos - files: - - file_id: protein_R1_SRR18677644.fastq.gz - filename: protein_R1_SRR18677644.fastq.gz - filetype: fastq - filesize: 18174591 - url: fastqs/protein_R1_SRR18677644.fastq.gz - urltype: local - md5: 2525f8bececba54232e8857f9885f870 - - read_id: protein_R2 - name: protein Read 2 - modality: protein - primer_id: protein_truseq_read2 +- read_id: protein_R1 + name: protein Read 1 + modality: protein + primer_id: protein_truseq_read1 + min_len: 28 + max_len: 28 + strand: pos + files: + - file_id: protein_R1_SRR18677644.fastq.gz + filename: 
protein_R1_SRR18677644.fastq.gz + filetype: fastq + filesize: 18174591 + url: fastqs/protein_R1_SRR18677644.fastq.gz + urltype: local + md5: 2525f8bececba54232e8857f9885f870 +- read_id: protein_R2 + name: protein Read 2 + modality: protein + primer_id: protein_truseq_read2 + min_len: 15 + max_len: 15 + strand: neg + files: + - file_id: protein_R2_SRR18677644.fastq.gz + filename: protein_R2_SRR18677644.fastq.gz + filetype: fastq + filesize: 9411964 + url: fastqs/protein_R2_SRR18677644.fastq.gz + urltype: local + md5: 0100a6e62ebbc2db9e9346b796662922 +- read_id: tag_R1 + name: tag Read 1 + modality: tag + primer_id: tag_truseq_read1 + min_len: 28 + max_len: 28 + strand: pos + files: + - file_id: tag_R1_SRR18677640.fastq.gz + filename: tag_R1_SRR18677640.fastq.gz + filetype: fastq + filesize: 18034172 + url: fastqs/tag_R1_SRR18677640.fastq.gz + urltype: local + md5: 05cc5c1a87f0d646cb6d8b9cfca47f0f +- read_id: tag_R2 + name: tag Read 2 + modality: tag + primer_id: tag_truseq_read2 + min_len: 15 + max_len: 15 + strand: neg + files: + - file_id: tag_R2_SRR18677640.fastq.gz + filename: tag_R2_SRR18677640.fastq.gz + filetype: fastq + filesize: 7475374 + url: fastqs/tag_R2_SRR18677640.fastq.gz + urltype: local + md5: 8a364a8c74a77fae255f1650a5ff7cbb +- read_id: rna_R1 + name: rna Read 1 + modality: rna + primer_id: rna_truseq_read1 + min_len: 28 + max_len: 28 + strand: pos + files: + - file_id: rna_R1_SRR18677638.fastq.gz + filename: rna_R1_SRR18677638.fastq.gz + filetype: fastq + filesize: 18499436 + url: fastqs/rna_R1_SRR18677638.fastq.gz + urltype: local + md5: 7eb15a70da9b729b5a87e30b6596b641 +- read_id: rna_R2 + name: rna Read 2 + modality: rna + primer_id: rna_truseq_read2 + min_len: 102 + max_len: 102 + strand: neg + files: + - file_id: rna_R2_SRR18677638.fastq.gz + filename: rna_R2_SRR18677638.fastq.gz + filetype: fastq + filesize: 45812569 + url: fastqs/rna_R2_SRR18677638.fastq.gz + urltype: local + md5: 5e6915770e50f72e462e5b2575089c66 +- read_id: atac_R1 + name: 
atac Read 1 + modality: atac + primer_id: atac_truseq_read1 + min_len: 53 + max_len: 53 + strand: pos + files: + - file_id: atac_R1_SRR18677642.fastq.gz + filename: atac_R1_SRR18677642.fastq.gz + filetype: fastq + filesize: 40165947 + url: fastqs/atac_R1_SRR18677642.fastq.gz + urltype: local + md5: e1134f01b51e73ee71d391af152ef3ec +- read_id: atac_R2 + name: atac Read 2 + modality: atac + primer_id: atac_truseq_read2 + min_len: 24 + max_len: 24 + strand: pos + files: + - file_id: atac_R2_SRR18677642.fastq.gz + filename: atac_R2_SRR18677642.fastq.gz + filetype: fastq + filesize: 20987219 + url: fastqs/atac_R2_SRR18677642.fastq.gz + urltype: local + md5: 49eac34723c5d5dbaac161af5ba6bd44 +- read_id: atac_R3 + name: atac Read 3 + modality: atac + primer_id: atac_truseq_read2 + min_len: 53 + max_len: 53 + strand: neg + files: + - file_id: atac_R3_SRR18677642.fastq.gz + filename: atac_R3_SRR18677642.fastq.gz + filetype: fastq + filesize: 36580319 + url: fastqs/atac_R3_SRR18677642.fastq.gz + urltype: local + md5: 603f9e06910e166ed6df6d4134147db5 +library_spec: +- region_id: protein + region_type: protein + name: protein + sequence_type: joined + sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNXXXXXXXXXXXXXXXXXXXXXXXXXXXAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC + min_len: 110 + max_len: 110 + onlist: null + regions: + - region_id: protein_truseq_read1 + region_type: truseq_read1 + name: Truseq Read 1 + sequence_type: fixed + sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCT + min_len: 33 + max_len: 33 + onlist: null + regions: [] + - region_id: protein_cell_bc + region_type: barcode + name: Cell Barcode + sequence_type: onlist + sequence: NNNNNNNNNNNNNNNN + min_len: 16 + max_len: 16 + onlist: + file_id: RNA-737K-arc-v1.txt + filename: RNA-737K-arc-v1.txt + filetype: txt + filesize: 2142553 + url: https://github.com/pachterlab/qcbc/raw/main/tests/10xMOME/RNA-737K-arc-v1.txt.gz + urltype: https + md5: a88cd21e801ae6f9a7d9a48b67ccf693 + regions: [] + - region_id: protein_umi + 
region_type: umi + name: umi + sequence_type: random + sequence: XXXXXXXXXXXX + min_len: 12 + max_len: 12 + onlist: null + regions: [] + - region_id: protein_seq + region_type: protein + name: protein + sequence_type: random + sequence: XXXXXXXXXXXXXXX min_len: 15 max_len: 15 - strand: neg - files: - - file_id: protein_R2_SRR18677644.fastq.gz - filename: protein_R2_SRR18677644.fastq.gz - filetype: fastq - filesize: 9411964 - url: fastqs/protein_R2_SRR18677644.fastq.gz - urltype: local - md5: 0100a6e62ebbc2db9e9346b796662922 - - read_id: tag_R1 - name: tag Read 1 - modality: tag - primer_id: tag_truseq_read1 - min_len: 28 - max_len: 28 - strand: pos - files: - - file_id: tag_R1_SRR18677640.fastq.gz - filename: tag_R1_SRR18677640.fastq.gz - filetype: fastq - filesize: 18034172 - url: fastqs/tag_R1_SRR18677640.fastq.gz - urltype: local - md5: 05cc5c1a87f0d646cb6d8b9cfca47f0f - - read_id: tag_R2 - name: tag Read 2 - modality: tag - primer_id: tag_truseq_read2 + onlist: + file_id: protein_feature_barcodes.txt + filename: protein_feature_barcodes.txt + filetype: txt + filesize: 4662 + url: protein_feature_barcodes.txt + urltype: local + md5: b5920c1fc1d22927607c31c96f5cf03e + regions: [] + - region_id: protein_truseq_read2 + region_type: truseq_read2 + name: Truseq Read 2 + sequence_type: fixed + sequence: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC + min_len: 34 + max_len: 34 + onlist: null + regions: [] +- region_id: tag + region_type: tag + name: tag + sequence_type: joined + sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNXXXXXXXXXXXXXXXXXXXXXXXXXXXAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC + min_len: 110 + max_len: 110 + onlist: null + regions: + - region_id: tag_truseq_read1 + region_type: truseq_read1 + name: Truseq Read 1 + sequence_type: fixed + sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCT + min_len: 33 + max_len: 33 + onlist: null + regions: [] + - region_id: tag_cell_bc + region_type: barcode + name: Cell Barcode + sequence_type: onlist + sequence: NNNNNNNNNNNNNNNN + 
min_len: 16 + max_len: 16 + onlist: + file_id: RNA-737K-arc-v1.txt + filename: RNA-737K-arc-v1.txt + filetype: txt + filesize: 2142553 + url: https://github.com/pachterlab/qcbc/raw/main/tests/10xMOME/RNA-737K-arc-v1.txt.gz + urltype: https + md5: a88cd21e801ae6f9a7d9a48b67ccf693 + regions: [] + - region_id: tag_umi + region_type: umi + name: umi + sequence_type: random + sequence: XXXXXXXXXXXX + min_len: 12 + max_len: 12 + onlist: null + regions: [] + - region_id: tag_seq + region_type: tag + name: tag sequence + sequence_type: random + sequence: XXXXXXXXXXXXXXX min_len: 15 max_len: 15 - strand: neg - files: - - file_id: tag_R2_SRR18677640.fastq.gz - filename: tag_R2_SRR18677640.fastq.gz - filetype: fastq - filesize: 7475374 - url: fastqs/tag_R2_SRR18677640.fastq.gz - urltype: local - md5: 8a364a8c74a77fae255f1650a5ff7cbb - - read_id: rna_R1 - name: rna Read 1 - modality: rna - primer_id: rna_truseq_read1 - min_len: 28 - max_len: 28 - strand: pos - files: - - file_id: rna_R1_SRR18677638.fastq.gz - filename: rna_R1_SRR18677638.fastq.gz - filetype: fastq - filesize: 18499436 - url: fastqs/rna_R1_SRR18677638.fastq.gz - urltype: local - md5: 7eb15a70da9b729b5a87e30b6596b641 - - read_id: rna_R2 - name: rna Read 2 - modality: rna - primer_id: rna_truseq_read2 + onlist: + file_id: tag_feature_barcodes.txt + filename: tag_feature_barcodes.txt + filetype: txt + filesize: 208 + url: tag_feature_barcodes.txt + urltype: local + md5: de44ad6d5c4b9f381a352283a6831112 + regions: [] + - region_id: tag_truseq_read2 + region_type: truseq_read2 + name: Truseq Read 2 + sequence_type: fixed + sequence: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC + min_len: 34 + max_len: 34 + onlist: null + regions: [] +- region_id: rna + region_type: rna + name: rna + sequence_type: joined + sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC + min_len: 197 
+ max_len: 197 + onlist: null + regions: + - region_id: rna_truseq_read1 + region_type: truseq_read1 + name: Truseq Read 1 + sequence_type: fixed + sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCT + min_len: 33 + max_len: 33 + onlist: null + regions: [] + - region_id: rna_cell_bc + region_type: barcode + name: Cell Barcode + sequence_type: onlist + sequence: NNNNNNNNNNNNNNNN + min_len: 16 + max_len: 16 + onlist: + file_id: RNA-737K-arc-v1.txt + filename: RNA-737K-arc-v1.txt + filetype: txt + filesize: 2142553 + url: https://github.com/pachterlab/qcbc/raw/main/tests/10xMOME/RNA-737K-arc-v1.txt.gz + urltype: https + md5: a88cd21e801ae6f9a7d9a48b67ccf693 + regions: [] + - region_id: rna_umi + region_type: umi + name: umi + sequence_type: random + sequence: XXXXXXXXXXXX + min_len: 12 + max_len: 12 + onlist: null + regions: [] + - region_id: cdna + region_type: cdna + name: cdna + sequence_type: random + sequence: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX min_len: 102 max_len: 102 - strand: neg - files: - - file_id: rna_R2_SRR18677638.fastq.gz - filename: rna_R2_SRR18677638.fastq.gz - filetype: fastq - filesize: 45812569 - url: fastqs/rna_R2_SRR18677638.fastq.gz - urltype: local - md5: 5e6915770e50f72e462e5b2575089c66 - - read_id: atac_R1 - name: atac Read 1 - modality: atac - primer_id: atac_truseq_read1 - min_len: 53 - max_len: 53 - strand: pos - files: - - file_id: atac_R1_SRR18677642.fastq.gz - filename: atac_R1_SRR18677642.fastq.gz - filetype: fastq - filesize: 40165947 - url: fastqs/atac_R1_SRR18677642.fastq.gz - urltype: local - md5: e1134f01b51e73ee71d391af152ef3ec - - read_id: atac_R2 - name: atac Read 2 - modality: atac - primer_id: atac_truseq_read2 - min_len: 24 - max_len: 24 - strand: pos - files: - - file_id: atac_R2_SRR18677642.fastq.gz - filename: atac_R2_SRR18677642.fastq.gz - filetype: fastq - filesize: 20987219 - url: fastqs/atac_R2_SRR18677642.fastq.gz - urltype: local - md5: 
49eac34723c5d5dbaac161af5ba6bd44 - - read_id: atac_R3 - name: atac Read 3 - modality: atac - primer_id: atac_truseq_read2 - min_len: 53 - max_len: 53 - strand: neg - files: - - file_id: atac_R3_SRR18677642.fastq.gz - filename: atac_R3_SRR18677642.fastq.gz - filetype: fastq - filesize: 36580319 - url: fastqs/atac_R3_SRR18677642.fastq.gz - urltype: local - md5: 603f9e06910e166ed6df6d4134147db5 -library_spec: - - region_id: protein - region_type: protein - name: protein - sequence_type: joined - sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNXXXXXXXXXXXXXXXXXXXXXXXXXXXAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC - min_len: 110 - max_len: 110 onlist: null - regions: - - region_id: protein_truseq_read1 - region_type: truseq_read1 - name: Truseq Read 1 - sequence_type: fixed - sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCT - min_len: 33 - max_len: 33 - onlist: null - regions: [] - - region_id: protein_cell_bc - region_type: barcode - name: Cell Barcode - sequence_type: onlist - sequence: NNNNNNNNNNNNNNNN - min_len: 16 - max_len: 16 - onlist: - file_id: RNA-737K-arc-v1.txt - filename: RNA-737K-arc-v1.txt - filetype: txt - filesize: 2142553 - url: https://github.com/pachterlab/qcbc/raw/main/tests/10xMOME/RNA-737K-arc-v1.txt.gz - urltype: https - md5: a88cd21e801ae6f9a7d9a48b67ccf693 - regions: [] - - region_id: protein_umi - region_type: umi - name: umi - sequence_type: random - sequence: XXXXXXXXXXXX - min_len: 12 - max_len: 12 - onlist: null - regions: [] - - region_id: protein_seq - region_type: protein - name: protein - sequence_type: random - sequence: XXXXXXXXXXXXXXX - min_len: 15 - max_len: 15 - onlist: - file_id: protein_feature_barcodes.txt - filename: protein_feature_barcodes.txt - filetype: txt - filesize: 4662 - url: protein_feature_barcodes.txt - urltype: local - md5: b5920c1fc1d22927607c31c96f5cf03e - regions: [] - - region_id: protein_truseq_read2 - region_type: truseq_read2 - name: Truseq Read 2 - sequence_type: fixed - sequence: 
AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC - min_len: 34 - max_len: 34 - onlist: null - regions: [] - - region_id: tag - region_type: tag - name: tag - sequence_type: joined - sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNXXXXXXXXXXXXXXXXXXXXXXXXXXXAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC - min_len: 110 - max_len: 110 + regions: [] + - region_id: rna_truseq_read2 + region_type: truseq_read2 + name: Truseq Read 2 + sequence_type: fixed + sequence: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC + min_len: 34 + max_len: 34 + onlist: null + regions: [] +- region_id: atac + region_type: atac + name: atac + sequence_type: joined + sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCTXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXAGATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGACGCGNNNNNNNNNNNNNNNN + min_len: 191 + max_len: 191 + onlist: null + regions: + - region_id: atac_truseq_read1 + region_type: truseq_read1 + name: Truseq Read 1 + sequence_type: fixed + sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCT + min_len: 33 + max_len: 33 + onlist: null + regions: [] + - region_id: gDNA + region_type: gdna + name: gdna + sequence_type: random + sequence: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + min_len: 100 + max_len: 100 onlist: null - regions: - - region_id: tag_truseq_read1 - region_type: truseq_read1 - name: Truseq Read 1 - sequence_type: fixed - sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCT - min_len: 33 - max_len: 33 - onlist: null - regions: [] - - region_id: tag_cell_bc - region_type: barcode - name: Cell Barcode - sequence_type: onlist - sequence: NNNNNNNNNNNNNNNN - min_len: 16 - max_len: 16 - onlist: - file_id: RNA-737K-arc-v1.txt - filename: RNA-737K-arc-v1.txt - filetype: txt - filesize: 2142553 - url: https://github.com/pachterlab/qcbc/raw/main/tests/10xMOME/RNA-737K-arc-v1.txt.gz - urltype: https - md5: a88cd21e801ae6f9a7d9a48b67ccf693 - regions: [] - - region_id: tag_umi - region_type: 
umi - name: umi - sequence_type: random - sequence: XXXXXXXXXXXX - min_len: 12 - max_len: 12 - onlist: null - regions: [] - - region_id: tag_seq - region_type: tag - name: tag sequence - sequence_type: random - sequence: XXXXXXXXXXXXXXX - min_len: 15 - max_len: 15 - onlist: - file_id: tag_feature_barcodes.txt - filename: tag_feature_barcodes.txt - filetype: txt - filesize: 208 - url: tag_feature_barcodes.txt - urltype: local - md5: de44ad6d5c4b9f381a352283a6831112 - regions: [] - - region_id: tag_truseq_read2 - region_type: truseq_read2 - name: Truseq Read 2 - sequence_type: fixed - sequence: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC - min_len: 34 - max_len: 34 - onlist: null - regions: [] - - region_id: rna - region_type: rna - name: rna - sequence_type: joined - sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC - min_len: 197 - max_len: 197 + regions: [] + - region_id: atac_truseq_read2 + region_type: truseq_read2 + name: Truseq Read 2 + sequence_type: fixed + sequence: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC + min_len: 34 + max_len: 34 onlist: null - regions: - - region_id: rna_truseq_read1 - region_type: truseq_read1 - name: Truseq Read 1 - sequence_type: fixed - sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCT - min_len: 33 - max_len: 33 - onlist: null - regions: [] - - region_id: rna_cell_bc - region_type: barcode - name: Cell Barcode - sequence_type: onlist - sequence: NNNNNNNNNNNNNNNN - min_len: 16 - max_len: 16 - onlist: - file_id: RNA-737K-arc-v1.txt - filename: RNA-737K-arc-v1.txt - filetype: txt - filesize: 2142553 - url: https://github.com/pachterlab/qcbc/raw/main/tests/10xMOME/RNA-737K-arc-v1.txt.gz - urltype: https - md5: a88cd21e801ae6f9a7d9a48b67ccf693 - regions: [] - - region_id: rna_umi - region_type: umi - name: umi - sequence_type: random - sequence: XXXXXXXXXXXX - min_len: 12 - max_len: 12 - onlist: null 
- regions: [] - - region_id: cdna - region_type: cdna - name: cdna - sequence_type: random - sequence: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - min_len: 102 - max_len: 102 - onlist: null - regions: [] - - region_id: rna_truseq_read2 - region_type: truseq_read2 - name: Truseq Read 2 - sequence_type: fixed - sequence: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC - min_len: 34 - max_len: 34 - onlist: null - regions: [] - - region_id: atac - region_type: atac - name: atac - sequence_type: joined - sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCTXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXAGATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGACGCGNNNNNNNNNNNNNNNN - min_len: 191 - max_len: 191 + regions: [] + - region_id: spacer + region_type: linker + name: atac linker + sequence_type: fixed + sequence: CAGACGCG + min_len: 8 + max_len: 8 onlist: null - regions: - - region_id: atac_truseq_read1 - region_type: truseq_read1 - name: Truseq Read 1 - sequence_type: fixed - sequence: ACACTCTTTCCCTACACGACGCTCTTCCGATCT - min_len: 33 - max_len: 33 - onlist: null - regions: [] - - region_id: gDNA - region_type: gdna - name: gdna - sequence_type: random - sequence: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - min_len: 100 - max_len: 100 - onlist: null - regions: [] - - region_id: atac_truseq_read2 - region_type: truseq_read2 - name: Truseq Read 2 - sequence_type: fixed - sequence: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC - min_len: 34 - max_len: 34 - onlist: null - regions: [] - - region_id: spacer - region_type: linker - name: atac linker - sequence_type: fixed - sequence: CAGACGCG - min_len: 8 - max_len: 8 - onlist: null - regions: [] - - region_id: atac_cell_bc - region_type: barcode - name: Cell Barcode - sequence_type: onlist - sequence: NNNNNNNNNNNNNNNN - min_len: 16 - max_len: 16 - onlist: - file_id: ATA-737K-arc-v1_rc.txt - filename: 
ATA-737K-arc-v1_rc.txt - filetype: txt - filesize: 2465078 - url: https://github.com/pachterlab/qcbc/raw/main/tests/10xMOME/ATA-737K-arc-v1.txt.gz - urltype: https - md5: 3f9fc0f6ef9d72540ab010d0b5348aa1 - regions: [] + regions: [] + - region_id: atac_cell_bc + region_type: barcode + name: Cell Barcode + sequence_type: onlist + sequence: NNNNNNNNNNNNNNNN + min_len: 16 + max_len: 16 + onlist: + file_id: ATA-737K-arc-v1_rc.txt + filename: ATA-737K-arc-v1_rc.txt + filetype: txt + filesize: 2465078 + url: https://github.com/pachterlab/qcbc/raw/main/tests/10xMOME/ATA-737K-arc-v1.txt.gz + urltype: https + md5: 3f9fc0f6ef9d72540ab010d0b5348aa1 + regions: [] diff --git a/tests/test_assay.py b/tests/test_assay.py index 6153c966..f978d42a 100644 --- a/tests/test_assay.py +++ b/tests/test_assay.py @@ -1,7 +1,9 @@ import pytest import json from seqspec.Assay import Assay, Region, Read - +from seqspec.Assay import Assay, SeqProtocol, SeqKit, LibProtocol, LibKit +from seqspec.Read import ReadInput +from seqspec.Region import RegionInput def test_load_spec(dogmaseq_dig_spec): @@ -50,15 +52,6 @@ def test_get_read(dogmaseq_dig_spec): with pytest.raises(IndexError): dogmaseq_dig_spec.get_read("non_existent_read") -def test_to_json(dogmaseq_dig_spec): - """ - Test to_JSON method - """ - json_output = dogmaseq_dig_spec.to_JSON() - assert isinstance(json_output, str) - data = json.loads(json_output) - assert data["assay_id"] == "DOGMAseq-DIG" - def test_insert_regions(temp_spec): """ @@ -153,3 +146,121 @@ def test_print_sequence(dogmaseq_dig_spec, capsys): expected_sequence += "\n" assert captured.out == expected_sequence + +# def test_assay_kits_protocols_list_only(dogmaseq_dig_spec: Assay): +# pytest.importorskip("seqspec._core") + +# # Start from an existing spec and add lists of objects +# spec = dogmaseq_dig_spec.model_copy(deep=True) +# spec.sequence_protocol = [SeqProtocol(protocol_id="NovaSeq", name="Illumina NovaSeq", modality=spec.modalities[0])] +# spec.sequence_kit = 
[SeqKit(kit_id="NovaKit", name="NovaSeq v1.5", modality=spec.modalities[0])] +# spec.library_protocol = [LibProtocol(protocol_id="10x_rna", name="10x RNA", modality=spec.modalities[0])] +# spec.library_kit = [LibKit(kit_id="TruSeq", name="TruSeq Dual", modality=spec.modalities[0])] + +# ra = RustAssay.from_model(spec) +# snap = ra.snapshot() + +# assert isinstance(snap.sequence_protocol, list) +# assert getattr(snap.sequence_protocol[0], "protocol_id", None) == "NovaSeq" +# assert isinstance(snap.sequence_kit, list) +# assert getattr(snap.sequence_kit[0], "kit_id", None) == "NovaKit" +# assert isinstance(snap.library_protocol, list) +# assert getattr(snap.library_protocol[0], "name", None) == "10x RNA" +# assert isinstance(snap.library_kit, list) +# assert getattr(snap.library_kit[0], "name", None) == "TruSeq Dual" + + +# def test_rustassay_snapshot_and_modalities_parity(dogmaseq_dig_spec: Assay): +# pytest.importorskip("seqspec._core") + +# py = dogmaseq_dig_spec +# ra = RustAssay.from_model(py) +# snap = ra.snapshot() + +# # Whole-object parity (JSON) and modalities +# assert snap.model_dump_json() == py.model_dump_json() +# assert ra.list_modalities() == py.list_modalities() + +# # __repr__ parity (structural depiction) +# assert repr(snap) == repr(py) + + +# def test_rustassay_getters_parity(dogmaseq_dig_spec: Assay): +# pytest.importorskip("seqspec._core") +# py = dogmaseq_dig_spec +# ra = RustAssay.from_model(py) + +# # get_libspec parity (compare DTO json per modality) +# for m in py.list_modalities(): +# py_lib = py.get_libspec(m) +# ru_lib = ra.get_libspec(m) +# assert ru_lib.model_dump_json() == py_lib.model_dump_json() + +# # get_seqspec parity +# for m in py.list_modalities(): +# py_reads = py.get_seqspec(m) +# ru_reads = ra.get_seqspec(m) +# assert len(py_reads) == len(ru_reads) +# for pr, rr in zip(py_reads, ru_reads): +# assert rr.model_dump_json() == pr.model_dump_json() + +# # get_read parity +# # choose a known read id from the spec +# known = 
py.sequence_spec[0].read_id +# assert ra.get_read(known).model_dump_json() == py.get_read(known).model_dump_json() + + +# def test_rustassay_insert_reads_regions_parity(temp_spec: Assay): +# pytest.importorskip("seqspec._core") + +# # Work on a copy to avoid mutating fixtures +# py = temp_spec.model_copy(deep=True) +# ra = RustAssay.from_model(py) + +# # Prepare a new read and region +# new_read = Read( +# read_id="new_RX", +# name="New Read X", +# modality="rna", +# primer_id="primerX", +# min_len=42, +# max_len=42, +# strand="pos", +# ) +# new_region = Region( +# region_id="new_regX", +# region_type="named", +# name="new_regX", +# sequence_type="fixed", +# sequence="ACGT", +# min_len=4, +# max_len=4, +# regions=[], +# ) + +# # Insert in Python +# py.insert_reads([new_read], modality="rna") +# py.insert_regions([new_region], modality="rna") + +# # Insert in Rust and snapshot +# ra.insert_reads([new_read], modality="rna") +# ra.insert_regions([new_region], modality="rna") +# snap = ra.snapshot() + +# # Full object parity after inserts +# assert snap.model_dump_json() == py.model_dump_json() + + +# def test_rustassay_update_spec_parity(temp_spec: Assay): +# pytest.importorskip("seqspec._core") + +# py = temp_spec.model_copy(deep=True) +# ra = RustAssay.from_model(py) + +# # Trigger updates on both sides +# py.update_spec() +# ra.update_spec() +# snap = ra.snapshot() + +# # Parity after update (derived attributes on regions recomputed) +# assert snap.model_dump_json() == py.model_dump_json() \ No newline at end of file diff --git a/tests/test_auth.py b/tests/test_auth.py new file mode 100644 index 00000000..193a1972 --- /dev/null +++ b/tests/test_auth.py @@ -0,0 +1,65 @@ +from pathlib import Path + +from seqspec.auth import AuthProfile, AuthRegistry, init_profile + + +def test_auth_registry_loads_profiles_from_env_path(monkeypatch, tmp_path): + config_path = tmp_path / "auth.toml" + config_path.write_text( + """ +[profiles.igvf] +hosts = ["api.data.igvf.org", 
"data.igvf.org"] +kind = "basic" +username_env = "IGVF_ACCESS_KEY_ID" +password_env = "IGVF_ACCESS_KEY_SECRET" +""".strip() + + "\n" + ) + monkeypatch.setenv("SEQSPEC_AUTH_CONFIG", str(config_path)) + + registry = AuthRegistry.load() + + assert len(registry.profile_summaries()) == 1 + assert registry.profile_summaries()[0]["name"] == "igvf" + + +def test_auth_registry_resolves_requests_auth(monkeypatch, tmp_path): + config_path = tmp_path / "auth.toml" + config_path.write_text( + """ +[profiles.igvf] +hosts = ["api.data.igvf.org"] +kind = "basic" +username_env = "IGVF_ACCESS_KEY_ID" +password_env = "IGVF_ACCESS_KEY_SECRET" +""".strip() + + "\n" + ) + monkeypatch.setenv("SEQSPEC_AUTH_CONFIG", str(config_path)) + monkeypatch.setenv("IGVF_ACCESS_KEY_ID", "alice") + monkeypatch.setenv("IGVF_ACCESS_KEY_SECRET", "secret") + + registry = AuthRegistry.load() + auth = registry.resolve_requests_auth( + "https://api.data.igvf.org/reference-files/foo", "igvf" + ) + + assert auth == ("alice", "secret") + + +def test_init_profile_creates_config(monkeypatch, tmp_path): + config_path = tmp_path / "config" / "auth.toml" + monkeypatch.setenv("SEQSPEC_AUTH_CONFIG", str(config_path)) + + output = init_profile( + "igvf", + AuthProfile( + hosts=["api.data.igvf.org", "data.igvf.org"], + kind="basic", + username_env="IGVF_ACCESS_KEY_ID", + password_env="IGVF_ACCESS_KEY_SECRET", + ), + ) + + assert output["created_config"] is True + assert Path(output["path"]).exists() diff --git a/tests/test_examples_docs.py b/tests/test_examples_docs.py new file mode 100644 index 00000000..c8679a7c --- /dev/null +++ b/tests/test_examples_docs.py @@ -0,0 +1,83 @@ +import yaml + +from seqspec.examples_docs import example_paths, validate_examples_tree +from seqspec.seqspec_check import seqspec_check +from seqspec.utils import load_spec + + +def test_examples_tree_validates(): + assert validate_examples_tree() == [] + + +def test_manifest_paths_exist(): + paths = example_paths() + manifest = 
yaml.safe_load(paths.manifest_path.read_text()) + + for assay in manifest["assays"]: + canonical_path = paths.docs_examples / assay["canonical_path"] + html_path = paths.site_dir / assay["html_path"] + assert canonical_path.exists() + assert html_path.exists() + + +def test_canonical_assays_are_current_and_load(): + paths = example_paths() + for assay_path in sorted(paths.assays_dir.glob("*.yaml")): + spec = load_spec(assay_path, strict=False) + assert spec.seqspec_version == "0.4.0" + + +def test_validated_assays_pass_check(): + paths = example_paths() + manifest = yaml.safe_load(paths.manifest_path.read_text()) + + for assay in manifest["assays"]: + if assay["status"] != "validated": + continue + spec = load_spec(paths.docs_examples / assay["canonical_path"], strict=False) + assert seqspec_check(spec) == [] + + +def test_no_superseded_example_sources_remain(): + paths = example_paths() + assert not (paths.docs_examples / "legacy").exists() + assert not (paths.docs_examples / "seqspec").exists() + assert not (paths.docs_examples / "seqspec-builder").exists() + assert not (paths.docs_examples / "seqspec-builder1").exists() + + forbidden = [path for path in paths.docs_examples.rglob("*") if path.name == ".git"] + assert forbidden == [] + + +def test_generated_site_pages_exist(): + paths = example_paths() + for name in ("index.html", "assays.html", "reads.html", "regions.html"): + assert (paths.site_dir / name).exists() + + assay_pages = list((paths.site_dir / "assays").glob("*.html")) + assert assay_pages + + +def test_assay_catalog_links_to_assay_metadata(): + paths = example_paths() + manifest = yaml.safe_load(paths.manifest_path.read_text()) + index_html = (paths.site_dir / "index.html").read_text() + + assay = next(item for item in manifest["assays"] if item.get("assay_link")) + + assert assay["assay_link"] in index_html + assert f'>{assay["assay_id"]}' in index_html + + +def test_assay_catalog_skips_invalid_metadata_links(): + paths = example_paths() + manifest 
= yaml.safe_load(paths.manifest_path.read_text()) + index_html = (paths.site_dir / "index.html").read_text() + + assay = next( + item + for item in manifest["assays"] + if item.get("assay_link") and not item["assay_link"].startswith("http") + ) + + assert f'href="{assay["assay_link"]}"' not in index_html diff --git a/tests/test_read.py b/tests/test_read.py index 299462e8..1ba72d8c 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -86,28 +86,6 @@ def test_read_set_files(): assert len(read.files) == 1 assert read.files[0].file_id == "file1" -def test_read_to_dict(): - """Test to_dict method""" - read = Read( - read_id="test_read", - name="Test Read", - modality="RNA", - primer_id="test_primer", - min_len=100, - max_len=150, - strand="pos" - ) - - read_dict = read.to_dict() - assert read_dict["read_id"] == "test_read" - assert read_dict["name"] == "Test Read" - assert read_dict["modality"] == "RNA" - assert read_dict["primer_id"] == "test_primer" - assert read_dict["min_len"] == 100 - assert read_dict["max_len"] == 150 - assert read_dict["strand"] == "pos" - assert read_dict["files"] == [] - def test_read_update_read_by_id(): """Test update_read_by_id method""" read = Read( @@ -165,6 +143,24 @@ def test_read_update_read_by_id_partial(): assert read.max_len == 150 # Unchanged assert read.strand == "pos" # Unchanged + +def test_read_update_read_by_id_accepts_zero_and_empty_values(): + read = Read( + read_id="test_read", + name="Test Read", + modality="RNA", + primer_id="test_primer", + min_len=100, + max_len=150, + strand="pos", + ) + + read.update_read_by_id(name="", min_len=0, max_len=0) + + assert read.name == "" + assert read.min_len == 0 + assert read.max_len == 0 + def test_read_get_read_by_file_id(): """Test getting read by file ID.""" file1 = File( @@ -275,26 +271,6 @@ def test_file_creation(): assert file_obj.urltype == "local" assert file_obj.md5 == "d41d8cd98f00b204e9800998ecf8427e" -def test_file_to_dict(): - """Test File to_dict method""" - 
file_obj = File( - file_id="file1", - filename="read1.fastq.gz", - filetype="fastq", - filesize=1000000, - url="file://read1.fastq.gz", - urltype="local", - md5="d41d8cd98f00b204e9800998ecf8427e" - ) - - file_dict = file_obj.to_dict() - assert file_dict["file_id"] == "file1" - assert file_dict["filename"] == "read1.fastq.gz" - assert file_dict["filetype"] == "fastq" - assert file_dict["filesize"] == 1000000 - assert file_dict["url"] == "file://read1.fastq.gz" - assert file_dict["urltype"] == "local" - assert file_dict["md5"] == "d41d8cd98f00b204e9800998ecf8427e" @@ -344,4 +320,128 @@ def test_update_read_by_id_real(atac_r1_read): assert atac_r1_read.max_len == 55 # Ensure other properties are unchanged assert atac_r1_read.read_id == "atac_R1" - assert atac_r1_read.modality == "atac" \ No newline at end of file + assert atac_r1_read.modality == "atac" + +# def test_rustread_roundtrip_and_mutation(): +# pytest.importorskip("seqspec._core") + +# r = ReadInput( +# read_id="R1", name="read1", modality="rna", primer_id="truseq_r1", +# min_len=1, max_len=100, strand="pos", +# files=[FileInput(filename="r1.fastq.gz")] +# ).to_read() + +# rr = RustRead.from_model(r) + +# # parity: check all attributes +# assert rr.read_id == r.read_id +# assert rr.name == r.name +# assert rr.modality == r.modality +# assert rr.primer_id == r.primer_id +# assert rr.min_len == r.min_len +# assert rr.max_len == r.max_len +# assert rr.strand == r.strand +# assert len(rr.files) == len(r.files) +# for rf, f in zip(rr.files, r.files): +# assert rf.file_id == f.file_id +# assert rf.filename == f.filename +# assert rf.filetype == f.filetype +# assert rf.filesize == f.filesize +# assert rf.url == f.url +# assert rf.urltype == f.urltype +# assert rf.md5 == f.md5 + +# # mutate via Rust and snapshot back +# rr.update_read_by_id(name="renamed", max_len=120, read_id=None, modality=None, primer_id=None, +# min_len=None, strand=None, files=None) +# snap = rr.snapshot() +# assert snap.name == "renamed" +# 
assert snap.max_len == 120 +# # original DTO unchanged +# assert r.name != "renamed" + +# def test_rustread_get_by_file_id(dogmaseq_dig_spec): +# pytest.importorskip("seqspec._core") +# r = dogmaseq_dig_spec.get_seqspec("rna")[0] +# rr = RustRead.from_model(r) +# found = rr.get_read_by_file_id(r.files[0].file_id) +# assert found is not None +# assert found.read_id == r.read_id + + +# def test_rustread_new_and_snapshot_parity_with_python(): +# pytest.importorskip("seqspec._core") + +# # Build Python DTO via inputs +# f1 = FileInput(filename="r1.fastq.gz").to_file() +# f2 = FileInput(filename="r2.fastq.gz").to_file() +# py = ReadInput( +# read_id="R1", +# name="read1", +# modality="rna", +# primer_id="truseq_r1", +# min_len=50, +# max_len=75, +# strand="pos", +# files=[FileInput(filename=f1.filename), FileInput(filename=f2.filename)], +# ).to_read() + +# # Build Rust proxy using RustFile list +# rf1 = RustFile.new(file_id=f1.file_id, filename=f1.filename, filetype=f1.filetype, filesize=f1.filesize, url=f1.url, urltype=f1.urltype, md5=f1.md5) +# rf2 = RustFile.new(file_id=f2.file_id, filename=f2.filename, filetype=f2.filetype, filesize=f2.filesize, url=f2.url, urltype=f2.urltype, md5=f2.md5) +# rr = RustRead.new( +# read_id=py.read_id, +# name=py.name, +# modality=py.modality, +# primer_id=py.primer_id, +# min_len=py.min_len, +# max_len=py.max_len, +# strand=py.strand, +# files=[rf1, rf2], +# ) + +# snap = rr.snapshot() +# assert snap.model_dump_json() == py.model_dump_json() +# assert repr(snap) == repr(py) + + +# def test_read_rust_parity_sweep(): +# pytest.importorskip("seqspec._core") + +# # Base Python object with one file +# f = FileInput(filename="sample_R1.fastq.gz").to_file() +# py = ReadInput( +# read_id="R1", +# name="read1", +# modality="rna", +# primer_id="truseq_r1", +# min_len=100, +# max_len=150, +# strand="pos", +# files=[FileInput(filename=f.filename)], +# ).to_read() + +# # Rust mirror +# rr = RustRead.from_model(py) + +# # Parity on attributes and 
repr +# snap0 = rr.snapshot() +# assert snap0.model_dump_json() == py.model_dump_json() +# assert repr(snap0) == repr(py) + +# # Update a subset of fields on both sides and re-check parity +# rr.update_read_by_id(name="renamed", min_len=120, max_len=160, strand="neg", +# read_id=None, modality=None, primer_id=None, files=None) +# py.update_read_by_id(name="renamed", min_len=120, max_len=160, strand="neg") + +# snap1 = rr.snapshot() +# assert snap1.model_dump_json() == py.model_dump_json() +# assert repr(snap1) == repr(py) + +# # Query by file id parity (present/absent) +# rr_found = rr.get_read_by_file_id(f.file_id) +# py_found = py.get_read_by_file_id(f.file_id) +# assert rr_found is not None and py_found is not None +# assert rr_found.read_id == py_found.read_id +# assert rr.get_read_by_file_id("does_not_exist") is None +# assert py.get_read_by_file_id("does_not_exist") is None diff --git a/tests/test_region.py b/tests/test_region.py index eff4a693..e4ee8055 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -5,6 +5,15 @@ project_regions_to_coordinates, itx_read, complement_nucleotide, complement_sequence ) +# from seqspec.Region import Onlist, OnlistInput, RustOnlist +# from seqspec.Region import Region, RegionInput, RustRegion +from seqspec.Region import ( + Region, + RegionInput, + Onlist, + OnlistInput, + # RustRegion, +) def test_region_creation_minimal(): """Test creating a minimal region""" @@ -800,4 +809,695 @@ def test_complement_real(rna_lib_spec): original_sequence = rna_lib_spec.get_sequence() rna_lib_spec.complement() complemented_sequence = rna_lib_spec.get_sequence() - assert complement_sequence(original_sequence) == complemented_sequence \ No newline at end of file + assert complement_sequence(original_sequence) == complemented_sequence + + +# def test_rustonlist_roundtrip_and_mutation(): +# # pytest.importorskip("seqspec._core") + +# inp = OnlistInput(filename="RNA-737K-arc-v1.txt.gz", url="https://example/file.txt.gz", 
urltype="https") +# py = inp.to_onlist() +# ro = RustOnlist.from_model(py) + +# # Assert parity on all attributes +# assert ro.file_id == py.file_id +# assert ro.filename == py.filename +# assert ro.filetype == py.filetype +# assert ro.filesize == py.filesize +# assert ro.url == py.url +# assert ro.urltype == py.urltype +# assert ro.md5 == py.md5 + +# # Mutate in Rust and snapshot +# ro.md5 = "deadbeef" +# snap = ro.snapshot() +# assert snap.md5 == "deadbeef" +# assert py.md5 != "deadbeef" # original DTO unchanged + +# def test_rustonlist_json_roundtrip(): +# # pytest.importorskip("seqspec._core") +# py = Onlist(file_id="ol1", filename="ol.txt", filetype="txt", filesize=10, url="ol.txt", urltype="local", md5="") +# from seqspec._core import Onlist as _CoreOnlist +# assert _CoreOnlist.from_json(py.model_dump_json()).to_json() == py.model_dump_json() + +def _make_small_tree() -> Region: + # parent -> [leafA (AAA, len=3), leafB (TT, len=2)] + leaf_a = Region( + region_id="A", + region_type="named", + name="A", + sequence_type="fixed", + sequence="AAA", + min_len=3, + max_len=3, + onlist=None, + regions=[], + ) + leaf_b = Region( + region_id="B", + region_type="named", + name="B", + sequence_type="fixed", + sequence="TT", + min_len=2, + max_len=2, + onlist=None, + regions=[], + ) + parent = Region( + region_id="root", + region_type="named", + name="root", + sequence_type="joined", + sequence="", + min_len=0, + max_len=0, + onlist=None, + regions=[leaf_a, leaf_b], + ) + return parent + +# def test_rustregion_update_and_queries(): +# # pytest.importorskip("seqspec._core") + +# py = _make_small_tree() +# rr = RustRegion.from_model(py) + +# # update derived attributes +# rr.update_attr() +# seq = rr.get_sequence() +# mn, mx = rr.get_len() + +# assert seq == "AAATT" +# assert (mn, mx) == (5, 5) + +# # leaves & region type set +# leaves = rr.get_leaves() +# assert [r.region_id for r in leaves] == ["A", "B"] +# rtypes = rr.get_leaf_region_types() +# assert "named" in rtypes + 
+# # by id +# found = rr.get_region_by_id("A") +# assert len(found) == 1 and found[0].region_id == "A" + +# # newick +# assert rr.to_newick() == "('A:3','B:2')root" + +# def test_rustregion_reverse_and_complement(): +# # pytest.importorskip("seqspec._core") + +# py = _make_small_tree() +# rr = RustRegion.from_model(py) +# rr.update_attr() + +# rr.reverse() +# snap1 = rr.snapshot() +# # reversing the leaves (AAA -> AAA, TT -> TT) but order preserved in tree; +# # we only reverse per-leaf sequence, not reorder children +# assert snap1.get_leaves()[0].sequence == "AAA" +# assert snap1.get_leaves()[1].sequence == "TT" + +# rr.complement() +# snap2 = rr.snapshot() +# # AAA -> TTT, TT -> AA +# assert snap2.get_leaves()[0].sequence == "TTT" +# assert snap2.get_leaves()[1].sequence == "AA" + +# ---------- helpers ---------- + +def _leaf(region_id: str, seq: str, min_len: int | None = None, max_len: int | None = None, rtype: str = "named", + seqtype: str = "fixed", onlist: Onlist | None = None) -> Region: + seq = seq or "" + if min_len is None: + min_len = len(seq) + if max_len is None: + max_len = len(seq) + return Region( + region_id=region_id, + region_type=rtype, + name=region_id, + sequence_type=seqtype, + sequence=seq, + min_len=min_len, + max_len=max_len, + onlist=onlist, + regions=[], + ) + +def _tree_joined(region_id: str, children: list[Region], rtype: str = "named") -> Region: + return Region( + region_id=region_id, + region_type=rtype, + name=region_id, + sequence_type="joined", + sequence="", + min_len=0, + max_len=0, + onlist=None, + regions=children, + ) + +def _simple_tree() -> Region: + # root -> [A("AAA"), B("TT")] + A = _leaf("A", "AAA", rtype="named", seqtype="fixed") + B = _leaf("B", "TT", rtype="named", seqtype="fixed") + return _tree_joined("root", [A, B]) + +def _tree_with_random_onlist() -> Region: + # root -> [randX (random, min=5), onlistN (onlist, min=3)] + randX = _leaf("randX", "", min_len=5, max_len=5, seqtype="random", rtype="umi") + ol = 
Onlist(file_id="ol1", filename="ol1.txt", filetype="txt", filesize=1, url="ol1.txt", urltype="local", md5="") + onlN = _leaf("onlistN", "", min_len=3, max_len=3, seqtype="onlist", rtype="barcode", onlist=ol) + return _tree_joined("root", [randX, onlN]) + +def _assert_region_equal(py: Region, rust_snap: Region): + # Basic field equality (not exhaustive) + assert py.region_id == rust_snap.region_id + assert str(py.region_type) == str(rust_snap.region_type) + assert py.name == rust_snap.name + assert str(py.sequence_type) == str(rust_snap.sequence_type) + assert py.min_len == rust_snap.min_len + assert py.max_len == rust_snap.max_len + assert py.sequence == rust_snap.sequence + # Check onlist presence equivalence at this node + if py.onlist is None: + assert rust_snap.onlist is None + else: + assert rust_snap.onlist is not None + assert py.onlist.filename == rust_snap.onlist.filename + + +# ---------- tests ---------- + +# def test_update_attr_sequence_and_lengths_fixed_joined(): +# # pytest.importorskip("seqspec._core") +# py = _simple_tree() +# # Python behavior +# py.update_attr() +# py_seq = py.get_sequence() +# py_len = py.get_len() + +# rr = RustRegion.from_model(py) +# rr.update_attr() +# ru_seq = rr.get_sequence() +# ru_len = rr.get_len() + +# assert py_seq == "AAATT" +# assert ru_seq == py_seq +# assert py_len == (5, 5) +# assert ru_len == py_len + +# snap = rr.snapshot() +# _assert_region_equal(py, snap) + + +# def test_leaf_queries_and_newick(): +# # pytest.importorskip("seqspec._core") +# py = _simple_tree() +# py.update_attr() + +# rr = RustRegion.from_model(py) +# rr.update_attr() + +# # leaves +# py_leaves = [r.region_id for r in py.get_leaves()] +# ru_leaves = [r.region_id for r in rr.get_leaves()] +# assert py_leaves == ru_leaves == ["A", "B"] + +# # region types set (as strings) +# assert set(py.get_leaf_region_types()) == set(rr.get_leaf_region_types()) + +# # find by id +# py_by_id = [r.region_id for r in py.get_region_by_id("A")] +# ru_by_id = 
[r.region_id for r in rr.get_region_by_id("A")] +# assert py_by_id == ru_by_id == ["A"] + +# # find by region_type +# py_by_type = [r.region_id for r in py.get_region_by_region_type("named")] +# ru_by_type = [r.region_id for r in rr.get_region_by_region_type("named")] +# assert set(py_by_type) == set(ru_by_type) + +# # newick +# assert py.to_newick() == rr.to_newick() == "('A:3','B:2')root" + + +# def test_random_and_onlist_behavior(): +# # pytest.importorskip("seqspec._core") +# py = _tree_with_random_onlist() +# py.update_attr() +# py_seq = py.get_sequence() +# py_len = py.get_len() + +# # Expect: "XXXXX" + "NNN" (random = X*min_len; onlist = N*min_len) +# assert py_seq == "XXXXXNNN" +# assert py_len == (8, 8) + +# rr = RustRegion.from_model(py) +# rr.update_attr() +# assert rr.get_sequence() == py_seq +# assert rr.get_len() == py_len + +# # onlist regions +# py_ol_ids = [r.region_id for r in py.get_onlist_regions()] +# ru_ol_ids = [r.region_id for r in rr.get_onlist_regions()] +# assert py_ol_ids == ru_ol_ids == ["onlistN"] + +# snap = rr.snapshot() +# _assert_region_equal(py, snap) + + +# def test_update_region_by_id_and_update_region(): +# # pytest.importorskip("seqspec._core") +# py = _simple_tree() +# py.update_attr() + +# rr = RustRegion.from_model(py) +# rr.update_attr() + +# # Partial update on leaf A +# rr.update_region_by_id( +# target_region_id="A", +# name="A_renamed", +# min_len=4, +# max_len=4, +# sequence="AAAA", +# ) +# # Recompute derived +# rr.update_attr() +# snap = rr.snapshot() + +# # Python side apply same change and recompute +# py.update_region_by_id("A", region_id=None, region_type=None, name="A_renamed", +# sequence_type=None, sequence="AAAA", min_len=4, max_len=4) +# py.update_attr() + +# # Parity +# assert snap.get_region_by_id("A")[0].name == "A_renamed" +# assert snap.get_len() == py.get_len() +# assert snap.get_sequence() == py.get_sequence() + +# # Now test full update_region on the root node +# rr.update_region( +# 
region_id="root2", +# region_type="named", +# name="root2", +# sequence_type="joined", +# sequence="", # joined will be recomputed by update_attr +# min_len=0, +# max_len=0, +# onlist=None, +# ) +# rr.update_attr() +# snap2 = rr.snapshot() +# assert snap2.region_id == "root2" +# assert snap2.get_sequence() == py.get_sequence() # children unchanged +# assert snap2.get_len() == py.get_len() + + +# def test_reverse_and_complement_leaf_sequences(): +# # pytest.importorskip("seqspec._core") +# py = _simple_tree() +# py.update_attr() + +# rr = RustRegion.from_model(py) +# rr.update_attr() + +# # Reverse (per-leaf) +# rr.reverse() +# snap_rev = rr.snapshot() +# # "AAA" -> "AAA", "TT" -> "TT" (palindromic examples; still a structural op) +# assert [r.sequence for r in snap_rev.get_leaves()] == ["AAA", "TT"] + +# # Complement (A<->T, C<->G, etc.) +# rr.complement() +# snap_comp = rr.snapshot() +# assert [r.sequence for r in snap_comp.get_leaves()] == ["TTT", "AA"] + + +# def test_get_leaves_with_region_id_behavior(): +# # pytest.importorskip("seqspec._core") +# # root -> middle -> [leaf1, leaf2] +# leaf1 = _leaf("leaf1", "AC", rtype="named") +# leaf2 = _leaf("leaf2", "GT", rtype="named") +# middle = Region( +# region_id="middle", +# region_type="named", +# name="middle", +# sequence_type="joined", +# sequence="", +# min_len=0, +# max_len=0, +# onlist=None, +# regions=[leaf1, leaf2], +# ) +# root = _tree_joined("root", [middle]) + +# # Python +# py = root +# py.update_attr() +# py_selected_ids = [r.region_id for r in py.get_leaves_with_region_id("middle")] + +# # Rust +# rr = RustRegion.from_model(py) +# rr.update_attr() +# ru_selected_ids = [r.region_id for r in rr.get_leaves_with_region_id("middle")] + +# # Your Python logic: if region_id matches, include that node (don’t descend) +# assert py_selected_ids == ["middle"] +# assert ru_selected_ids == ["middle"] + + +def test_region_get_onlist_method_simple(): + onlist = Onlist( + file_id="olx", + filename="ol.txt", + 
filetype="txt", + filesize=1, + url="ol.txt", + urltype="local", + md5="", + ) + r_with = Region( + region_id="r1", + region_type="barcode", + name="r1", + sequence_type="onlist", + min_len=3, + max_len=3, + onlist=onlist, + regions=[], + ) + r_without = Region( + region_id="r2", + region_type="barcode", + name="r2", + sequence_type="fixed", + sequence="AAA", + min_len=3, + max_len=3, + regions=[], + ) + assert r_with.get_onlist() is onlist + assert r_without.get_onlist() is None + + +def test_region_update_region_python(): + r = Region( + region_id="r", + region_type="named", + name="r", + sequence_type="fixed", + sequence="AC", + min_len=2, + max_len=2, + regions=[], + ) + ol = Onlist( + file_id="ol1", + filename="ol.txt", + filetype="txt", + filesize=1, + url="ol.txt", + urltype="local", + md5="deadbeef", + ) + r.update_region( + region_id="r2", + region_type="barcode", + name="r2", + sequence_type="onlist", + sequence="", + min_len=3, + max_len=3, + onlist=ol, + ) + r.update_attr() + assert (r.region_id, r.region_type, r.name) == ("r2", "barcode", "r2") + assert r.sequence_type == "onlist" + assert r.sequence == "N" * 3 + assert (r.min_len, r.max_len) == (3, 3) + assert r.onlist is ol + + +def test_update_region_by_id_partial_none(): + leaf = Region( + region_id="L", + region_type="named", + name="L", + sequence_type="fixed", + sequence="GG", + min_len=2, + max_len=2, + regions=[], + ) + root = Region( + region_id="root", + region_type="named", + name="root", + sequence_type="joined", + regions=[leaf], + ) + root.update_region_by_id( + target_region_id="L", + region_id=None, # keep the same + region_type=None, + name="L2", # change only name + sequence_type=None, + sequence=None, + min_len=None, + max_len=None, + ) + root.update_attr() + updated = root.get_region_by_id("L")[0] + assert updated.name == "L2" + assert updated.region_id == "L" + assert updated.sequence == "GG" + assert updated.min_len == 2 and updated.max_len == 2 + + +def 
test_update_region_by_id_accepts_zero_and_empty_values(): + leaf = Region( + region_id="L", + region_type="named", + name="L", + sequence_type="fixed", + sequence="GG", + min_len=2, + max_len=2, + regions=[], + ) + root = Region( + region_id="root", + region_type="named", + name="root", + sequence_type="joined", + regions=[leaf], + ) + root.update_region_by_id( + target_region_id="L", + region_id=None, + region_type=None, + name="", + sequence_type=None, + sequence="", + min_len=0, + max_len=0, + ) + updated = root.get_region_by_id("L")[0] + assert updated.name == "" + assert updated.sequence == "" + assert updated.min_len == 0 + assert updated.max_len == 0 + + +def test_region_repr_contains_type_and_lengths(): + r = Region( + region_id="x", + region_type="named", + name="x", + sequence_type="fixed", + sequence="A", + min_len=1, + max_len=1, + regions=[], + ) + s = repr(r) + assert "named" in s + assert "(1, 1)" in s + + +def test_get_region_by_region_type_with_enum(): + r_leaf = Region( + region_id="e", + region_type=RegionType.NAMED, + name="e", + sequence_type=SequenceType.FIXED, + sequence="T", + min_len=1, + max_len=1, + regions=[], + ) + r_root = Region( + region_id="root", + region_type=RegionType.NAMED, + name="root", + sequence_type=SequenceType.JOINED, + regions=[r_leaf], + ) + found = r_root.get_region_by_region_type(RegionType.NAMED) + assert len(found) >= 2 + assert any(x.region_id == "e" for x in found) + + +def test_regioncoordinate_subtraction_scenarios(): + a = RegionCoordinate( + region_id="a", + region_type="named", + name="a", + sequence_type="fixed", + sequence="AAAA", + min_len=0, + max_len=1024, + start=0, + stop=4, + ) + b = RegionCoordinate( + region_id="b", + region_type="named", + name="b", + sequence_type="fixed", + sequence="TT", + min_len=0, + max_len=1024, + start=6, + stop=8, + ) + c = a - b + assert isinstance(c, RegionCoordinate) + assert c.region_type == "difference" + assert (c.start, c.stop) == (4, 6) + assert c.sequence == "X" 
* (c.stop - c.start) + + # Equal ranges + x = RegionCoordinate(region_id="x", region_type="named", name="x", sequence_type="fixed", sequence="", + min_len=0, max_len=0, start=10, stop=12) + y = RegionCoordinate(region_id="y", region_type="named", name="y", sequence_type="fixed", sequence="", + min_len=0, max_len=0, start=10, stop=12) + z = x - y + assert (z.start, z.stop) == (10, 12) + + +def test_regioncoordinate_difference_loc_field(): + obj = RegionCoordinate(region_id="obj", region_type="named", name="obj", sequence_type="fixed", sequence="", + min_len=0, max_len=0, start=0, stop=2) + fixed = RegionCoordinate(region_id="fixed", region_type="named", name="fixed", sequence_type="fixed", sequence="", + min_len=0, max_len=0, start=5, stop=7) + diff = RegionCoordinateDifference(obj=obj, fixed=fixed, rgncdiff=obj) + assert diff.loc == "-" + + obj2 = RegionCoordinate(region_id="obj2", region_type="named", name="obj2", sequence_type="fixed", sequence="", + min_len=0, max_len=0, start=8, stop=9) + diff2 = RegionCoordinateDifference(obj=obj2, fixed=fixed, rgncdiff=obj2) + assert diff2.loc == "+" + + +def test_to_newick_ignores_n_param(): + A = _leaf("A", "AA") + B = _leaf("B", "T") + root = _tree_joined("root", [A, B]) + s1 = root.to_newick() + s2 = root.to_newick(n="ignored") + assert s1 == s2 + + +# def test_rustonlist_new_and_snapshot(): +# ro = RustOnlist.new(file_id="id", filename="f.txt", filetype="txt", filesize=1, url="f.txt", urltype="local", md5="m") +# snap = ro.snapshot() +# assert snap.file_id == "id" and snap.filename == "f.txt" + + +# def test_rustregion_get_and_set_onlist(): +# # root carries onlist +# ol = Onlist(file_id="ol1", filename="ol.txt", filetype="txt", filesize=1, url="ol.txt", urltype="local", md5="a") +# root = Region( +# region_id="root", +# region_type="named", +# name="root", +# sequence_type="joined", +# onlist=ol, +# regions=[_leaf("L", "AC")], +# ) +# rr = RustRegion.from_model(root) +# got = rr.get_onlist() +# assert got is not None 
and got.filename == "ol.txt" + +# # mutate onlist via Rust proxy +# rr.onlist = RustOnlist.new(file_id="ol2", filename="x.txt", filetype="txt", filesize=2, url="x.txt", urltype="local", md5="b") +# snap = rr.snapshot() +# assert snap.onlist is not None and snap.onlist.filename == "x.txt" + + +# def test_region_rust_parity_sweep(): +# # Build a slightly complex tree mixing fixed, random, onlist, and a nested joined +# ol = Onlist(file_id="olX", filename="olx.txt", filetype="txt", filesize=1, url="olx.txt", urltype="local", md5="") +# fixA = _leaf("fixA", "AAA", rtype="barcode", seqtype="fixed") +# rand2 = _leaf("rand2", "", min_len=2, max_len=2, rtype="umi", seqtype="random") +# fx2 = _leaf("fx2", "GC", rtype="linker", seqtype="fixed") +# mid = _tree_joined("mid", [fx2]) +# olN = _leaf("olN", "", min_len=3, max_len=3, rtype="barcode", seqtype="onlist", onlist=ol) +# py = _tree_joined("root", [fixA, rand2, mid, olN]) + +# # Python baseline +# py.update_attr() +# py_seq = py.get_sequence() +# py_len = py.get_len() + +# # Rust baseline +# rr = RustRegion.from_model(py) +# rr.update_attr() +# ru_seq = rr.get_sequence() +# ru_len = rr.get_len() + +# # Parity on primary derived attributes +# assert ru_seq == py_seq +# assert ru_len == py_len + +# # Parity on queries +# assert [r.region_id for r in rr.get_leaves()] == [r.region_id for r in py.get_leaves()] +# assert set(rr.get_leaf_region_types()) == set(py.get_leaf_region_types()) +# assert [r.region_id for r in rr.get_onlist_regions()] == [r.region_id for r in py.get_onlist_regions()] +# assert [r.region_id for r in rr.get_region_by_id("mid")] == [r.region_id for r in py.get_region_by_id("mid")] +# assert set(r.region_id for r in rr.get_region_by_region_type("barcode")) == set( +# r.region_id for r in py.get_region_by_region_type("barcode") +# ) +# assert rr.to_newick() == py.to_newick() + +# # Snapshot parity against the Python DTO +# assert rr.snapshot().model_dump_json() == py.model_dump_json() + +# # Mutate a leaf 
via both APIs and recheck parity +# # Change fixA to sequence AAAA, length 4 +# rr.update_region_by_id( +# target_region_id="fixA", name="fixA2", sequence="AAAA", min_len=4, max_len=4 +# ) +# py.update_region_by_id("fixA", region_id=None, region_type=None, name="fixA2", sequence_type=None, sequence="AAAA", min_len=4, max_len=4) +# rr.update_attr() +# py.update_attr() + +# assert rr.get_sequence() == py.get_sequence() +# assert rr.get_len() == py.get_len() +# assert rr.to_newick() == py.to_newick() +# assert rr.snapshot().model_dump_json() == py.model_dump_json() + +# # Reverse and complement both sides and verify parity remains +# rr.reverse(); py.reverse() +# rr.update_attr(); py.update_attr() +# assert rr.get_sequence() == py.get_sequence() +# assert rr.get_len() == py.get_len() + +# rr.complement(); py.complement() +# rr.update_attr(); py.update_attr() +# assert rr.get_sequence() == py.get_sequence() +# assert rr.get_len() == py.get_len() +# assert rr.snapshot().model_dump_json() == py.model_dump_json() diff --git a/tests/test_seqspec_check.py b/tests/test_seqspec_check.py index c6a85217..6c7871b0 100644 --- a/tests/test_seqspec_check.py +++ b/tests/test_seqspec_check.py @@ -12,9 +12,23 @@ def test_seqspec_check(dogmaseq_dig_spec: Assay): """Test seqspec_check function""" + spec = dogmaseq_dig_spec.model_copy(deep=True) + + def localize_onlists(region): + if region.onlist is not None and region.onlist.urltype in {"http", "https", "ftp"}: + region.onlist.urltype = "local" + region.onlist.url = region.onlist.filename + ".gz" + for child in region.regions: + localize_onlists(child) + + for region in spec.library_spec: + localize_onlists(region) + # Test with valid spec - errors = seqspec_check(spec=dogmaseq_dig_spec) - assert len(errors) == 0 # No errors for valid spec + diagnostics = seqspec_check(spec=spec) + assert not any( + diagnostic["severity"] == "error" for diagnostic in diagnostics + ), "Valid spec should not emit error diagnostics" # Test with invalid 
spec (missing required fields) invalid_spec = Assay( @@ -34,5 +48,77 @@ def test_seqspec_check(dogmaseq_dig_spec: Assay): library_spec=[] ) - errors = seqspec_check(spec=invalid_spec) - assert len(errors) > 0 # Should have errors for invalid spec \ No newline at end of file + diagnostics = seqspec_check(spec=invalid_spec) + assert any( + diagnostic["severity"] == "error" for diagnostic in diagnostics + ), "Invalid spec should emit error diagnostics" + + +def test_seqspec_check_warns_on_overlapping_read_regions(): + spec = load_spec(Path("tests/fixtures/check_overlap_warning/spec.yaml")) + + diagnostics = seqspec_check(spec=spec) + + errors = [ + diagnostic for diagnostic in diagnostics if diagnostic["severity"] == "error" + ] + warnings = [ + diagnostic + for diagnostic in diagnostics + if diagnostic["severity"] == "warning" + ] + + assert errors == [] + assert len(warnings) == 1 + assert warnings[0]["error_type"] == "check_overlapping_read_regions" + assert ( + "seqspec index --no-overlap" in warnings[0]["error_message"] + ) + assert "'barcode'" in warnings[0]["error_message"] + assert "'umi'" in warnings[0]["error_message"] + + +def test_seqspec_check_prefers_local_onlist_url(): + spec_path = Path("tests/fixtures/onlist_read_clip/spec.yaml") + spec = load_spec(spec_path) + + barcode_region = spec.get_libspec("rna").get_region_by_id("barcode_a")[0] + barcode_region.onlist.filename = "display.txt" + + diagnostics = seqspec_check(spec=spec) + + assert not any( + diagnostic["error_type"] == "check_onlist_files_exist" + for diagnostic in diagnostics + ) + + +def test_seqspec_check_errors_when_local_onlist_url_is_empty(): + spec_path = Path("tests/fixtures/onlist_read_clip/spec.yaml") + spec = load_spec(spec_path) + + barcode_region = spec.get_libspec("rna").get_region_by_id("barcode_a")[0] + barcode_region.onlist.url = "" + + diagnostics = seqspec_check(spec=spec) + + assert any( + diagnostic["error_type"] == "check_onlist_files_exist" + and 
diagnostic["error_message"] == "local onlist 'barcode_a.txt' has empty url" + for diagnostic in diagnostics + ) + + +def test_seqspec_check_errors_when_local_file_url_is_empty(): + spec_path = Path("tests/fixtures/onlist_read_clip/spec.yaml") + spec = load_spec(spec_path) + + spec.sequence_spec[0].files[0].url = "" + + diagnostics = seqspec_check(spec=spec) + + assert any( + diagnostic["error_type"] == "check_read_files_exist" + and diagnostic["error_message"] == "local file 'rna_read.fastq.gz' has empty url" + for diagnostic in diagnostics + ) diff --git a/tests/test_seqspec_file.py b/tests/test_seqspec_file.py index 9aa1316d..af8eb9ab 100644 --- a/tests/test_seqspec_file.py +++ b/tests/test_seqspec_file.py @@ -103,3 +103,52 @@ def test_list_files_by_id(dogmaseq_dig_spec: Assay): rna_cell_bc_file = files["rna_cell_bc"][0] assert rna_cell_bc_file.file_id == "RNA-737K-arc-v1.txt" +import pytest +from seqspec.File import File, FileInput + +# def test_rustfile_roundtrip_and_mutation(): +# # Skip if the Rust extension isn't built +# pytest.importorskip("seqspec._core") + +# # Build a Pydantic File, wrap it in Rust, mutate in Rust, snapshot back +# fin = FileInput(filename="r1.fastq.gz") +# py = fin.to_file() +# rf = RustFile.from_model(py) + +# # Properties reflect Rust state +# assert rf.file_id == py.file_id +# assert rf.filename == py.filename + +# # Mutate in Rust +# rf.file_id = "new_id" +# snap = rf.snapshot() + +# # Snapshot reflects Rust mutation; original Pydantic object is unchanged +# assert snap.file_id == "new_id" +# assert py.file_id != "new_id" + + +# def test_rustfile_on_real_spec_file(dogmaseq_dig_spec): +# # Skip if the Rust extension isn't built +# pytest.importorskip("seqspec._core") + +# # Take an existing File from the spec and round-trip through Rust +# rna_r1 = dogmaseq_dig_spec.sequence_spec[0].files[0] +# rf = RustFile.from_model(rna_r1) + +# # Assert parity on all attributes +# assert rf.file_id == rna_r1.file_id +# assert rf.filename == 
rna_r1.filename +# assert rf.filetype == rna_r1.filetype +# assert rf.filesize == rna_r1.filesize +# assert rf.url == rna_r1.url +# assert rf.urltype == rna_r1.urltype +# assert rf.md5 == rna_r1.md5 + +# # Mutate in Rust and snapshot back +# new_id = rna_r1.file_id + ".tmp" +# rf.file_id = new_id +# snap = rf.snapshot() +# assert snap.file_id == new_id +# # Original spec object remains unchanged +# assert rna_r1.file_id != new_id \ No newline at end of file diff --git a/tests/test_seqspec_index.py b/tests/test_seqspec_index.py index 4cb80ed4..09faca53 100644 --- a/tests/test_seqspec_index.py +++ b/tests/test_seqspec_index.py @@ -1,7 +1,9 @@ -from seqspec.seqspec_index import seqspec_index, format_index +from seqspec.seqspec_index import seqspec_index, format_index, filter_index_no_overlap from seqspec.Assay import Assay from seqspec.Region import RegionCoordinate import json +from pathlib import Path +from seqspec.utils import load_spec def test_seqspec_index(dogmaseq_dig_spec: Assay): @@ -374,3 +376,32 @@ def test_format_index(): split = format_index(indices, "splitcode") assert "@extract" in split assert "groups\tids\ttags\tdistances\tlocations" in split + + +def test_filter_index_no_overlap_is_noop_when_reads_do_not_overlap( + dogmaseq_dig_spec: Assay, +): + indices = seqspec_index( + spec=dogmaseq_dig_spec, modality="rna", ids=["rna_R1", "rna_R2"], idtype="read" + ) + + filtered = filter_index_no_overlap(indices) + + assert len(filtered) == 2 + assert len(filtered[0].rcv) == 2 + assert len(filtered[1].rcv) == 1 + + +def test_filter_index_no_overlap_removes_regions_seen_in_earlier_reads(): + spec = load_spec(Path("tests/fixtures/check_overlap_warning/spec.yaml")) + indices = seqspec_index( + spec=spec, modality="rna", ids=["rna_R1", "rna_R2"], idtype="read" + ) + + filtered = filter_index_no_overlap(indices) + + assert len(filtered) == 2 + assert filtered[0].query_id == "rna_R1" + assert [region.region_id for region in filtered[0].rcv] == ["barcode", "umi"] + 
assert filtered[1].query_id == "rna_R2" + assert filtered[1].rcv == [] diff --git a/tests/test_seqspec_info.py b/tests/test_seqspec_info.py index af63b6bb..6526a878 100644 --- a/tests/test_seqspec_info.py +++ b/tests/test_seqspec_info.py @@ -1,7 +1,7 @@ -from seqspec.seqspec_info import seqspec_info, format_info -from seqspec.Assay import Assay import json +from seqspec.Assay import Assay +from seqspec.seqspec_info import format_info, seqspec_info def test_seqspec_info(dogmaseq_dig_spec: Assay): # Test getting modalities @@ -37,6 +37,17 @@ def test_format_info(dogmaseq_dig_spec: Assay): meta_dict = json.loads(formatted_info) assert meta_dict["name"] == "DOGMAseq-DIG/Illumina" + info = seqspec_info(spec=dogmaseq_dig_spec, key="sequence_spec") + formatted_info = format_info(info, "sequence_spec", "json") + sequence_spec = json.loads(formatted_info) + assert any(read["read_id"] == "rna_R1" for read in sequence_spec) + + info = seqspec_info(spec=dogmaseq_dig_spec, key="library_spec") + formatted_info = format_info(info, "library_spec", "json") + library_spec = json.loads(formatted_info) + assert "rna" in library_spec + assert any(region["region_id"] == "rna_cell_bc" for region in library_spec["rna"]) + def test_seqspec_info_modalities(dogmaseq_dig_spec: Assay): """Test seqspec_info with modalities key""" @@ -51,12 +62,12 @@ def test_seqspec_info_sequence_spec(dogmaseq_dig_spec: Assay): sequence_spec = info["sequence_spec"] assert len(sequence_spec) > 0 - + # Check that we have the expected reads read_ids = [read["read_id"] for read in sequence_spec] expected_reads = ["protein_R1", "protein_R2", "tag_R1", "tag_R2", "rna_R1", "rna_R2", "atac_R1", "atac_R2", "atac_R3"] assert set(read_ids) == set(expected_reads) - + # Check structure of first read protein_r1 = next(read for read in sequence_spec if read["read_id"] == "protein_R1") assert protein_r1["name"] == "protein Read 1" @@ -71,25 +82,25 @@ def test_seqspec_info_library_spec(dogmaseq_dig_spec: Assay): info = 
seqspec_info(spec=dogmaseq_dig_spec, key="library_spec") assert "library_spec" in info library_spec = info["library_spec"] - + # Check that all modalities are present assert set(library_spec.keys()) == {"protein", "tag", "rna", "atac"} - + # Check protein modality regions protein_regions = library_spec["protein"] - region_ids = [region.region_id for region in protein_regions] + region_ids = [region["region_id"] for region in protein_regions] expected_protein_regions = [ "protein_truseq_read1", "protein_cell_bc", "protein_umi", "protein_seq", "protein_truseq_read2" ] assert set(region_ids) == set(expected_protein_regions) - + # Check specific region properties - protein_cell_bc = next(region for region in protein_regions if region.region_id == "protein_cell_bc") - assert protein_cell_bc.region_type == "barcode" - assert protein_cell_bc.name == "Cell Barcode" - assert protein_cell_bc.min_len == 16 - assert protein_cell_bc.max_len == 16 + protein_cell_bc = next(region for region in protein_regions if region["region_id"] == "protein_cell_bc") + assert protein_cell_bc["region_type"] == "barcode" + assert protein_cell_bc["name"] == "Cell Barcode" + assert protein_cell_bc["min_len"] == 16 + assert protein_cell_bc["max_len"] == 16 def test_seqspec_info_meta(dogmaseq_dig_spec: Assay): @@ -97,9 +108,9 @@ def test_seqspec_info_meta(dogmaseq_dig_spec: Assay): info = seqspec_info(spec=dogmaseq_dig_spec, key="meta") assert "meta" in info meta = info["meta"] - + # Check key metadata fields assert meta["assay_id"] == "DOGMAseq-DIG" assert meta["name"] == "DOGMAseq-DIG/Illumina" assert "description" in meta - assert "seqspec_version" in meta \ No newline at end of file + assert "seqspec_version" in meta diff --git a/tests/test_seqspec_insert.py b/tests/test_seqspec_insert.py index 5bfd05c4..9b0b4bff 100644 --- a/tests/test_seqspec_insert.py +++ b/tests/test_seqspec_insert.py @@ -1,4 +1,5 @@ from seqspec.seqspec_insert import ( + load_resource_payload, seqspec_insert_reads, 
seqspec_insert_regions, ) @@ -263,4 +264,78 @@ def test_seqspec_insert_general_read(temp_spec: Assay): assert len(updated_spec.get_seqspec("rna")) == original_rna_read_count + 1 read_ids = [r.read_id for r in updated_spec.get_seqspec("rna")] - assert "general_R1" in read_ids \ No newline at end of file + assert "general_R1" in read_ids + + +def test_load_resource_payload_accepts_yaml_file(tmp_path): + resource_path = tmp_path / "regions.yaml" + resource_path.write_text( + "- region_id: inserted_region\n" + " region_type: linker\n" + " name: Inserted Region\n" + " sequence_type: fixed\n" + " sequence: ACGT\n" + " min_len: 4\n" + " max_len: 4\n" + ) + + payload = load_resource_payload(str(resource_path)) + + assert isinstance(payload, list) + assert payload[0]["region_id"] == "inserted_region" + + +def test_load_resource_payload_accepts_reads_mapping(): + payload = load_resource_payload( + '{"reads":[{"read_id":"wrapped_R1","name":"Wrapped Read","primer_id":"rna_polyT","min_len":10,"max_len":10,"strand":"pos"}]}' + ) + + assert isinstance(payload, dict) + assert payload["reads"][0]["read_id"] == "wrapped_R1" + + +def test_load_resource_payload_accepts_regions_mapping(): + payload = load_resource_payload( + '{"regions":[{"region_id":"wrapped_bc","region_type":"barcode","name":"Wrapped Barcode","sequence_type":"fixed","sequence":"ACGT","min_len":4,"max_len":4}]}' + ) + + assert isinstance(payload, dict) + assert payload["regions"][0]["region_id"] == "wrapped_bc" + + +def test_seqspec_insert_reads_invalid_modality_raises(temp_spec): + new_read = ReadInput( + read_id="bad_read", + name="Bad Read", + modality="rna", + primer_id="test_primer", + min_len=10, + max_len=10, + strand="pos", + ) + + try: + seqspec_insert_reads(temp_spec, "missing", [new_read]) + except ValueError as exc: + assert "Modality 'missing' not found." 
in str(exc) + else: + raise AssertionError("Expected ValueError for invalid modality") + + +def test_seqspec_insert_regions_missing_after_raises(temp_spec): + new_region = RegionInput( + region_id="bad_region", + region_type="barcode", + name="Bad Region", + sequence_type="fixed", + sequence="ACGT", + min_len=4, + max_len=4, + ) + + try: + seqspec_insert_regions(temp_spec, "rna", [new_region], after="missing") + except ValueError as exc: + assert "No region with id 'missing' found under modality 'rna'" in str(exc) + else: + raise AssertionError("Expected ValueError for missing insertion target") diff --git a/tests/test_seqspec_onlist.py b/tests/test_seqspec_onlist.py index 158caad1..b9086035 100644 --- a/tests/test_seqspec_onlist.py +++ b/tests/test_seqspec_onlist.py @@ -1,16 +1,17 @@ -import pytest -import os from pathlib import Path from unittest.mock import patch + +import pytest + from seqspec.seqspec_onlist import ( + Onlist, + download_onlists_to_path, get_onlists, + get_onlist_urls, join_onlist_contents, - run_onlist, + join_onlists_and_save, ) from seqspec.utils import load_spec -from argparse import Namespace, ArgumentParser -from seqspec.seqspec_onlist import Onlist - def test_get_onlists_region(dogmaseq_dig_spec): @@ -19,27 +20,198 @@ def test_get_onlists_region(dogmaseq_dig_spec): assert len(onlists) == 1 assert onlists[0].file_id == "RNA-737K-arc-v1.txt" -def test_get_onlists_region_type(dogmaseq_dig_spec): - """Test get_onlists with region-type selector""" - onlists = get_onlists(dogmaseq_dig_spec, "rna", "region-type", "barcode") - assert len(onlists) > 0 - for onlist in onlists: - assert onlist is not None + +def test_get_onlists_region_type_raises_for_ambiguous_reads(dogmaseq_dig_spec): + """Test get_onlists with region-type selector when matches span reads.""" + with pytest.raises(ValueError, match="matches regions in multiple reads"): + get_onlists(dogmaseq_dig_spec, "rna", "region-type", "barcode") + def test_get_onlists_read(dogmaseq_dig_spec): 
"""Test get_onlists with read selector""" onlists = get_onlists(dogmaseq_dig_spec, "rna", "read", "rna_R1") assert len(onlists) == 1 + +def test_get_onlists_read_respects_read_window(): + spec = load_spec("tests/fixtures/onlist_read_clip/spec.yaml") + onlists = get_onlists(spec, "rna", "read", "rna_read") + + assert [onlist.file_id for onlist in onlists] == ["barcode_a.txt"] + + def test_join_onlist_contents_product(): """Test joining onlists with product format""" contents = [["A", "B"], ["1", "2"]] joined = join_onlist_contents(contents, "product") assert set(joined) == {"A1", "A2", "B1", "B2"} + def test_join_onlist_contents_multi(): """Test joining onlists with multi format""" contents = [["A", "B"], ["1", "2", "3"]] joined = join_onlist_contents(contents, "multi") assert joined == ["A 1", "B 2", "- 3"] + +def remote_onlist() -> Onlist: + return Onlist( + file_id="remote_list", + filename="remote.txt.gz", + filetype="txt.gz", + filesize=123, + url="https://example.org/remote.txt.gz", + urltype="https", + md5="abc", + ) + + +def test_download_onlists_to_path_threads_auth_profile(tmp_path): + calls = [] + + def fake_read_remote_list(onlist, base_path="", auth_profile=None): + calls.append( + { + "file_id": onlist.file_id, + "base_path": base_path, + "auth_profile": auth_profile, + } + ) + return ["AAA", "CCC"] + + output_path = tmp_path / "joined.txt" + with patch("seqspec.seqspec_onlist.read_remote_list", side_effect=fake_read_remote_list): + downloaded = download_onlists_to_path( + [remote_onlist()], + output_path, + tmp_path, + auth_profile="igvf", + ) + + assert calls == [ + { + "file_id": "remote_list", + "base_path": "", + "auth_profile": "igvf", + } + ] + assert len(downloaded) == 1 + assert downloaded[0]["file_id"] == "remote_list" + assert downloaded[0]["url"].endswith("remote_list_joined.txt") + assert Path(downloaded[0]["url"]).read_text().splitlines() == ["AAA", "CCC"] + + +def test_join_onlists_and_save_threads_auth_profile(tmp_path): + calls = [] + 
+ def fake_read_remote_list(onlist, base_path="", auth_profile=None): + calls.append( + { + "file_id": onlist.file_id, + "base_path": base_path, + "auth_profile": auth_profile, + } + ) + return ["AAA", "CCC"] + + output_path = tmp_path / "product.txt" + with patch("seqspec.seqspec_onlist.read_remote_list", side_effect=fake_read_remote_list): + result_path = join_onlists_and_save( + [remote_onlist()], + "product", + output_path, + tmp_path, + auth_profile="igvf", + ) + + assert calls == [ + { + "file_id": "remote_list", + "base_path": "", + "auth_profile": "igvf", + } + ] + assert result_path == str(output_path) + assert output_path.read_text().splitlines() == ["AAA", "CCC"] + + +def test_product_onlist_matches_between_read_and_region_type_for_issue_68(tmp_path): + fixture_dir = Path("tests/fixtures/onlist_issue_68") + spec = load_spec(fixture_dir / "spec.yaml") + + read_output = tmp_path / "read_product.txt" + region_type_output = tmp_path / "region_type_product.txt" + + join_onlists_and_save( + get_onlists(spec, "rna", "read", "rna_read"), + "product", + read_output, + fixture_dir, + ) + join_onlists_and_save( + get_onlists(spec, "rna", "region-type", "barcode"), + "product", + region_type_output, + fixture_dir, + ) + + read_lines = read_output.read_text().splitlines() + region_type_lines = region_type_output.read_text().splitlines() + + assert read_lines == ["TTAA", "TTAC", "TGAA", "TGAC"] + assert region_type_lines == read_lines + + +def test_get_onlists_region_type_uses_read_order_when_unique(): + spec = load_spec("tests/fixtures/onlist_issue_68/spec.yaml") + onlists = get_onlists(spec, "rna", "region-type", "barcode") + assert [onlist.file_id for onlist in onlists] == ["barcode_b.txt", "barcode_a.txt"] + + +def test_region_type_onlist_errors_when_matches_span_multiple_reads(): + spec = load_spec("tests/fixtures/onlist_ambiguous_region_type/spec.yaml") + + with pytest.raises(ValueError, match="matches regions in multiple reads"): + get_onlists(spec, "rna", 
"region-type", "barcode") + + +def test_get_onlist_urls_prefers_local_url(tmp_path): + onlist = Onlist( + file_id="local_list", + filename="display.txt", + filetype="txt", + filesize=0, + url="nested/whitelist.txt", + urltype="local", + md5="", + ) + + urls = get_onlist_urls([onlist], tmp_path) + assert urls == [ + { + "file_id": "local_list", + "url": str(tmp_path / "nested" / "whitelist.txt"), + } + ] + + +def test_join_onlists_and_save_reads_local_onlists_from_url(tmp_path): + nested = tmp_path / "nested" + nested.mkdir() + (nested / "whitelist.txt").write_text("AAAA\nCCCC\n") + + onlist = Onlist( + file_id="local_list", + filename="display.txt", + filetype="txt", + filesize=0, + url="nested/whitelist.txt", + urltype="local", + md5="", + ) + output = tmp_path / "joined.txt" + + result_path = join_onlists_and_save([onlist], "product", output, tmp_path) + + assert result_path == str(output) + assert output.read_text().splitlines() == ["AAAA", "CCCC"] diff --git a/tests/test_seqspec_print.py b/tests/test_seqspec_print.py index bb642b61..bf23d270 100644 --- a/tests/test_seqspec_print.py +++ b/tests/test_seqspec_print.py @@ -1,11 +1,164 @@ -from io import StringIO -from unittest import TestCase +from pathlib import Path from matplotlib.figure import Figure -from seqspec.seqspec_print import ( - print_library_ascii, - print_seqspec_png, -) -from seqspec.utils import load_spec_stream +from seqspec.Assay import Assay +from seqspec.Read import Read +from seqspec.Region import Region +from seqspec.seqspec_print import print_library_ascii, print_seqspec_png +from seqspec.seqspec_print_html import build_seqspec_view_data, print_seqspec_html +from seqspec.utils import load_spec +FIXTURE = Path("tests/fixtures/spec.yaml") + + +def nested_spec(): + return Assay( + seqspec_version="0.4.0", + assay_id="nested-assay", + name="Nested Assay", + doi="", + date="2026-03-24", + description="nested regions", + modalities=["rna"], + lib_struct="", + sequence_protocol=None, + 
sequence_kit=None, + library_protocol=None, + library_kit=None, + sequence_spec=[ + Read( + read_id="rna_R1", + name="Read 1", + modality="rna", + primer_id="joined_block", + min_len=2, + max_len=2, + strand="pos", + files=[], + ) + ], + library_spec=[ + Region( + region_id="rna", + region_type="rna", + name="rna", + sequence_type="joined", + sequence="AAATXX", + min_len=6, + max_len=6, + onlist=None, + regions=[ + Region( + region_id="joined_block", + region_type="named", + name="joined block", + sequence_type="joined", + sequence="AAAT", + min_len=4, + max_len=4, + onlist=None, + regions=[ + Region( + region_id="fixed_a", + region_type="linker", + name="fixed a", + sequence_type="fixed", + sequence="AAA", + min_len=3, + max_len=3, + onlist=None, + regions=[], + ), + Region( + region_id="fixed_t", + region_type="linker", + name="fixed t", + sequence_type="fixed", + sequence="T", + min_len=1, + max_len=1, + onlist=None, + regions=[], + ), + ], + ), + Region( + region_id="umi", + region_type="umi", + name="umi", + sequence_type="random", + sequence="XX", + min_len=2, + max_len=2, + onlist=None, + regions=[], + ), + ], + ) + ], + ) + + +def test_print_library_ascii_contains_regions(): + spec = load_spec(FIXTURE) + rendered = print_library_ascii(spec) + assert "rna_cell_bc" in rendered + assert "protein_truseq_read1" in rendered + assert "atac_cell_bc" in rendered + + +def test_print_seqspec_png_returns_figure(): + spec = load_spec(FIXTURE) + figure = print_seqspec_png(spec) + assert isinstance(figure, Figure) + + +def test_build_seqspec_view_data_contains_modalities(): + spec = load_spec(FIXTURE) + data = build_seqspec_view_data(spec) + assert data["assay_id"] == "DOGMAseq-DIG" + assert len(data["modalities"]) == 4 + assert any(modality["modality"] == "rna" for modality in data["modalities"]) + + +def test_build_seqspec_view_data_projects_reads(): + spec = load_spec(FIXTURE) + data = build_seqspec_view_data(spec) + rna = next(modality for modality in 
data["modalities"] if modality["modality"] == "rna") + read = next(read for read in rna["reads"] if read["read_id"] == "rna_R2") + assert read["strand"] == "neg" + assert read["start"] < read["end"] + assert len(read["files"]) == 1 + + +def test_print_seqspec_html_contains_embedded_payload(): + spec = load_spec(FIXTURE) + html = print_seqspec_html(spec) + assert "seqspec-view-data" in html + assert "DOGMAseq-DIG" in html + assert "region-rect" in html + + +def test_build_seqspec_view_data_keeps_nested_regions(): + data = build_seqspec_view_data(nested_spec()) + modality = data["modalities"][0] + parent = next(node for node in modality["region_nodes"] if node["region_id"] == "joined_block") + child = next(node for node in modality["region_nodes"] if node["region_id"] == "fixed_a") + assert parent["is_leaf"] is False + assert parent["child_region_ids"] == ["fixed_a", "fixed_t"] + assert child["path_region_ids"] == ["joined_block", "fixed_a"] + + +def test_build_seqspec_view_data_projects_reads_from_parent_region(): + data = build_seqspec_view_data(nested_spec()) + read = data["modalities"][0]["reads"][0] + assert read["primer_id"] == "joined_block" + assert read["start"] == 4 + assert read["end"] == 6 + + +def test_print_seqspec_html_contains_nested_region_payload(): + html = print_seqspec_html(nested_spec()) + assert "joined_block" in html + assert "group-rect" in html diff --git a/tests/test_seqspec_upgrade.py b/tests/test_seqspec_upgrade.py new file mode 100644 index 00000000..9bc3a9e7 --- /dev/null +++ b/tests/test_seqspec_upgrade.py @@ -0,0 +1,25 @@ +from pathlib import Path + +from seqspec.seqspec_upgrade import seqspec_upgrade +from seqspec.utils import load_spec + + +def test_seqspec_upgrade_promotes_0_3_to_0_4(): + spec = load_spec(Path("tests/fixtures/legacy_0_3_scalar_protocols.yaml"), strict=False) + assert spec.seqspec_version == "0.3.0" + + upgraded = seqspec_upgrade(spec, spec.seqspec_version) + + assert upgraded.seqspec_version == "0.4.0" + + +def 
test_seqspec_upgrade_promotes_0_2_to_0_4_and_adds_files(): + spec = load_spec(Path("tests/fixtures/legacy_0_2_missing_fields.yaml"), strict=False) + assert spec.seqspec_version == "0.2.0" + assert spec.sequence_spec[0].files == [] + + upgraded = seqspec_upgrade(spec, spec.seqspec_version) + + assert upgraded.seqspec_version == "0.4.0" + assert len(upgraded.sequence_spec[0].files) == 1 + assert upgraded.sequence_spec[0].files[0].file_id == upgraded.sequence_spec[0].read_id diff --git a/tests/test_utils.py b/tests/test_utils.py index 9f9126b3..67a54f30 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -9,9 +9,11 @@ from seqspec.utils import ( load_spec_stream, + local_resource_url, read_local_list, read_remote_list, get_remote_auth_token, + local_onlist_locator, map_read_id_to_regions, write_read, yield_onlist_contents, @@ -268,4 +270,56 @@ def test_map_read_id_to_regions_invalid_read_id(): library_spec=[], ) with pytest.raises(IndexError): - map_read_id_to_regions(spec, "RNA", "read2") \ No newline at end of file + map_read_id_to_regions(spec, "RNA", "read2") + + +def test_local_onlist_locator_prefers_url_when_present(): + onlist = Onlist( + file_id="ol1", + filename="display.txt", + filetype="txt", + filesize=0, + url="nested/whitelist.txt", + urltype="local", + md5="", + ) + + assert local_onlist_locator(onlist) == "nested/whitelist.txt" + + +def test_local_onlist_locator_errors_when_url_is_empty(): + onlist = Onlist( + file_id="ol1", + filename="display.txt", + filetype="txt", + filesize=0, + url="", + urltype="local", + md5="", + ) + + with pytest.raises(ValueError, match="local onlist 'display.txt' has empty url"): + local_onlist_locator(onlist) + + +def test_read_local_list_prefers_url_when_present(tmp_path): + nested = tmp_path / "nested" + nested.mkdir() + (nested / "whitelist.txt").write_text("AAAA\nCCCC\n") + + onlist = Onlist( + file_id="ol1", + filename="display.txt", + filetype="txt", + filesize=0, + url="nested/whitelist.txt", + 
urltype="local", + md5="", + ) + + assert read_local_list(onlist, str(tmp_path)) == ["AAAA", "CCCC"] + + +def test_local_resource_url_errors_when_url_is_empty(): + with pytest.raises(ValueError, match="local file 'display.fastq.gz' has empty url"): + local_resource_url("", "display.fastq.gz", "file")