From 6ea24447281adbdc09106c9a96a17ed44e090ddb Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 19 Nov 2025 14:16:27 -0800 Subject: [PATCH 1/3] Register workflows in nextstrain-pathogen.yaml --- nextstrain-pathogen.yaml | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/nextstrain-pathogen.yaml b/nextstrain-pathogen.yaml index b74c50d3..5186e369 100644 --- a/nextstrain-pathogen.yaml +++ b/nextstrain-pathogen.yaml @@ -1,5 +1,16 @@ -# This is currently an empty file to indicate the top level pathogen repo. -# The inclusion of this file allows the Nextstrain CLI to run the -# `nextstrain build` from any directory regardless of runtime. +# This file's *existence* marks the top level of a Nextstrain pathogen repo, +# which allows `nextstrain build` to be run from any subdirectory of the repo +# regardless of runtime. For more details, see +# https://github.com/nextstrain/cli/releases/tag/8.2.0 # -# See https://github.com/nextstrain/cli/releases/tag/8.2.0 for more details. +# This file's *contents* is the "registration metadata" for the pathogen repo, +# used by `nextstrain setup` and `nextstrain run`. +--- +$schema: https://nextstrain.org/schemas/pathogen/v0 +workflows: + ingest: + compatibility: + nextstrain run: True + phylogenetic: + compatibility: + nextstrain run: True \ No newline at end of file From 66b36ae49b0ef23ea1670189b72d65b87c1abc7d Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 19 Nov 2025 14:26:25 -0800 Subject: [PATCH 2/3] Ingest: Support workflows as programs --- ingest/Snakefile | 15 ++++++++++++--- ingest/defaults/config.yaml | 4 ++-- ingest/rules/curate.smk | 10 +++++----- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/ingest/Snakefile b/ingest/Snakefile index ad7e0f7c..90bead2c 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -4,10 +4,13 @@ min_version( "7.7.0" ) # Snakemake 7.7.0 introduced `retries` directive used in fetch-sequences -configfile: "defaults/config.yaml" +# Utility functions shared across all workflows. 
+include: "../shared/vendored/snakemake/config.smk" -serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4'] +# Use default configuration values. Extend with Snakemake's --configfile/--config options. +configfile: os.path.join(workflow.basedir, "defaults/config.yaml") +serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4'] rule all: input: @@ -23,4 +26,10 @@ include: "rules/nextclade.smk" if "custom_rules" in config: for rule_file in config["custom_rules"]: - include: rule_file + # Relative custom rule paths in the config are relative to the analysis + # directory (i.e. the current working directory, or workdir, usually + # given by --directory), but the "include" directive treats relative + # paths as relative to the workflow (e.g. workflow.current_basedir). + # Convert to an absolute path based on the analysis/current directory + # to avoid this mismatch of expectations. + include: os.path.join(os.getcwd(), rule_file) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index a80563c7..1bb08b2d 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -33,7 +33,7 @@ ncbi_datasets_fields: curate: # The path to the local geolocation rules within the pathogen repo # The path should be relative to the ingest directory. - local_geolocation_rules: "defaults/geolocation-rules.tsv" + local_geolocation_rules: "geolocation-rules.tsv" # List of field names to change where the key is the original field name and the value is the new field name # The original field names should match the ncbi_datasets_fields provided above. 
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names @@ -86,7 +86,7 @@ curate: abbr_authors_field: "authors" # Path to the manual annotations file # The path should be relative to the ingest directory - annotations: "defaults/annotations.tsv" + annotations: "annotations.tsv" # Serotype field name inferred from NCBI Genbank annotation serotype_field: "serotype_genbank" # The ID field in the metadata to use to merge the manual annotations diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 7754de84..07af4995 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -29,9 +29,9 @@ def format_field_map(field_map: dict[str, str]) -> str: rule curate: input: sequences_ndjson="data/ncbi.ndjson", - geolocation_rules=config["curate"]["local_geolocation_rules"], - annotations=config["curate"]["annotations"], - manual_mapping="defaults/host_hostgenus_hosttype_map.tsv", + geolocation_rules=resolve_config_path(config["curate"]["local_geolocation_rules"]), + annotations=resolve_config_path(config["curate"]["annotations"]), + manual_mapping=resolve_config_path("host_hostgenus_hosttype_map.tsv"), output: metadata="data/all_metadata_curated.tsv", sequences="results/sequences_all.fasta", @@ -80,9 +80,9 @@ rule curate: --abbr-authors-field {params.abbr_authors_field} \ | augur curate apply-geolocation-rules \ --geolocation-rules {input.geolocation_rules} \ - | ./scripts/infer-dengue-serotype.py \ + | {workflow.basedir}/scripts/infer-dengue-serotype.py \ --out-col {params.serotype_field} \ - | ./scripts/transform-new-fields \ + | {workflow.basedir}/scripts/transform-new-fields \ --map-tsv {input.manual_mapping} \ --map-id host \ --metadata-id host \ From e43838f27a0d1fc5894911ac86d77005de72c1a0 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 19 Nov 2025 14:37:08 -0800 Subject: [PATCH 3/3] Phylogenetic: Support workflows as programs --- phylogenetic/Snakefile | 7 +++++-- 
phylogenetic/defaults/config_dengue.yaml | 16 ++++++++-------- phylogenetic/rules/annotate_phylogeny.smk | 4 ++-- phylogenetic/rules/export.smk | 10 +++++----- phylogenetic/rules/prepare_sequences.smk | 4 ++-- phylogenetic/rules/prepare_sequences_E.smk | 4 ++-- 6 files changed, 24 insertions(+), 21 deletions(-) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 8b3743f2..f88ef4f1 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,4 +1,7 @@ -configfile: "defaults/config_dengue.yaml" +# Utility functions shared across all workflows. +include: "../shared/vendored/snakemake/config.smk" + +configfile: os.path.join(workflow.basedir, "defaults/config_dengue.yaml") include: "../shared/vendored/snakemake/config.smk" include: "rules/config.smk" @@ -29,7 +32,7 @@ include: "rules/export.smk" if "custom_rules" in config: for rule_file in config["custom_rules"]: - include: rule_file + include: os.path.join(os.getcwd(), rule_file) rule clean: """Removing directories: {params}""" diff --git a/phylogenetic/defaults/config_dengue.yaml b/phylogenetic/defaults/config_dengue.yaml index f4f84717..d89f4832 100644 --- a/phylogenetic/defaults/config_dengue.yaml +++ b/phylogenetic/defaults/config_dengue.yaml @@ -14,8 +14,8 @@ strain_id_field: "accession" display_strain_field: "strain" filter: - exclude: "defaults/exclude.txt" - include: "defaults/{serotype}/include.txt" + exclude: "exclude.txt" + include: "{serotype}/include.txt" group_by: "year region" min_length: genome: 5000 @@ -33,11 +33,11 @@ traits: clades: clade_definitions: - all: 'defaults/clades_serotypes.tsv' - denv1: 'defaults/clades_genotypes.tsv' - denv2: 'defaults/clades_genotypes.tsv' - denv3: 'defaults/clades_genotypes.tsv' - denv4: 'defaults/clades_genotypes.tsv' + all: 'clades_serotypes.tsv' + denv1: 'clades_genotypes.tsv' + denv2: 'clades_genotypes.tsv' + denv3: 'clades_genotypes.tsv' + denv4: 'clades_genotypes.tsv' tip_frequencies: min_date: "1980-01-01" @@ -46,4 +46,4 @@ 
tip_frequencies: wide_bandwidth: 0.6 export: - description: "defaults/description.md" + description: "description.md" diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index f21e0dc5..88405708 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -46,7 +46,7 @@ rule translate: input: tree = "results/{serotype}/{gene}/tree.nwk", node_data = "results/{serotype}/{gene}/nt-muts.json", - reference = lambda wildcard: "defaults/{serotype}/reference.gb" if wildcard.gene in ['genome'] else "results/defaults/reference_{serotype}_{gene}.gb" + reference = lambda wildcard: resolve_config_path("{serotype}/reference.gb")(wildcard) if wildcard.gene in ['genome'] else "results/defaults/reference_{serotype}_{gene}.gb" output: node_data = "results/{serotype}/{gene}/aa-muts.json" benchmark: @@ -94,7 +94,7 @@ rule clades: tree = "results/{serotype}/genome/tree.nwk", nt_muts = "results/{serotype}/genome/nt-muts.json", aa_muts = "results/{serotype}/genome/aa-muts.json", - clade_defs = lambda wildcards: config['clades']['clade_definitions'][wildcards.serotype], + clade_defs = lambda wildcards: resolve_config_path(config['clades']['clade_definitions'][wildcards.serotype])(wildcards), output: clades = "results/{serotype}/genome/clades.json" benchmark: diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index d262d9b3..5109ab15 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -23,17 +23,17 @@ import json rule colors: input: - color_schemes = "defaults/color_schemes.tsv", - color_orderings = "defaults/color_orderings.tsv", + color_schemes = resolve_config_path("color_schemes.tsv"), + color_orderings = resolve_config_path("color_orderings.tsv"), metadata = "results/{serotype}/metadata.tsv", - manual_colors = "defaults/colors.tsv" + manual_colors = resolve_config_path("colors.tsv") output: colors = "results/{serotype}/colors.tsv" benchmark: 
"benchmarks/{serotype}/colors.txt" shell: """ - python3 scripts/assign-colors.py \ + python3 {workflow.basedir}/scripts/assign-colors.py \ --color-schemes {input.color_schemes} \ --ordering {input.color_orderings} \ --metadata {input.metadata} \ @@ -180,7 +180,7 @@ rule export: clades = lambda wildcard: "results/{serotype}/{gene}/clades.json" if wildcard.gene in ['genome'] else [], nt_muts = "results/{serotype}/{gene}/nt-muts.json", aa_muts = "results/{serotype}/{gene}/aa-muts.json", - description = config["export"]["description"], + description = resolve_config_path(config["export"]["description"]), auspice_config = "results/defaults/{serotype}/{gene}/auspice_config.json", colors = "results/{serotype}/colors.tsv", output: diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 58966f18..0b5f5792 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -31,8 +31,8 @@ rule filter: if wildcard.gene in ['genome'] else "results/{serotype}/{gene}/sequences.fasta"), metadata = "results/{serotype}/metadata.tsv", - exclude = config["filter"]["exclude"], - include = config["filter"]["include"], + exclude = resolve_config_path(config["filter"]["exclude"]), + include = resolve_config_path(config["filter"]["include"]), output: sequences = "results/{serotype}/{gene}/filtered.fasta" benchmark: diff --git a/phylogenetic/rules/prepare_sequences_E.smk b/phylogenetic/rules/prepare_sequences_E.smk index 0473d1e9..069d324e 100644 --- a/phylogenetic/rules/prepare_sequences_E.smk +++ b/phylogenetic/rules/prepare_sequences_E.smk @@ -20,7 +20,7 @@ rule generate_E_reference_files: Generating reference files for the E gene """ input: - reference = "defaults/{serotype}/reference.gb", + reference = resolve_config_path("{serotype}/reference.gb"), output: fasta = "results/defaults/reference_{serotype}_E.fasta", genbank = "results/defaults/reference_{serotype}_E.gb", @@ -30,7 +30,7 @@ rule 
generate_E_reference_files: gene = "E", shell: """ - python3 scripts/newreference.py \ + python3 {workflow.basedir}/scripts/newreference.py \ --reference {input.reference} \ --output-fasta {output.fasta} \ --output-genbank {output.genbank} \