Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@ min_version(
"7.7.0"
) # Snakemake 7.7.0 introduced `retries` directive used in fetch-sequences

configfile: "defaults/config.yaml"
# Utility functions shared across all workflows.
include: "../shared/vendored/snakemake/config.smk"

serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']
# Use default configuration values. Extend with Snakemake's --configfile/--config options.
configfile: os.path.join(workflow.basedir, "defaults/config.yaml")

serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']

rule all:
input:
Expand All @@ -23,4 +26,10 @@ include: "rules/nextclade.smk"
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
# Relative custom rule paths in the config are relative to the analysis
# directory (i.e. the current working directory, or workdir, usually
# given by --directory), but the "include" directive treats relative
# paths as relative to the workflow (e.g. workflow.current_basedir).
# Convert to an absolute path based on the analysis/current directory
# to avoid this mismatch of expectations.
include: os.path.join(os.getcwd(), rule_file)
4 changes: 2 additions & 2 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ ncbi_datasets_fields:
curate:
# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: "defaults/geolocation-rules.tsv"
local_geolocation_rules: "geolocation-rules.tsv"
# List of field names to change where the key is the original field name and the value is the new field name
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
Expand Down Expand Up @@ -86,7 +86,7 @@ curate:
abbr_authors_field: "authors"
# Path to the manual annotations file
# The path should be relative to the ingest directory
annotations: "defaults/annotations.tsv"
annotations: "annotations.tsv"
# Serotype field name inferred from NCBI Genbank annotation
serotype_field: "serotype_genbank"
# The ID field in the metadata to use to merge the manual annotations
Expand Down
10 changes: 5 additions & 5 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ def format_field_map(field_map: dict[str, str]) -> str:
rule curate:
input:
sequences_ndjson="data/ncbi.ndjson",
geolocation_rules=config["curate"]["local_geolocation_rules"],
annotations=config["curate"]["annotations"],
manual_mapping="defaults/host_hostgenus_hosttype_map.tsv",
geolocation_rules=resolve_config_path(config["curate"]["local_geolocation_rules"]),
annotations=resolve_config_path(config["curate"]["annotations"]),
manual_mapping=resolve_config_path("host_hostgenus_hosttype_map.tsv"),
output:
metadata="data/all_metadata_curated.tsv",
sequences="results/sequences_all.fasta",
Expand Down Expand Up @@ -80,9 +80,9 @@ rule curate:
--abbr-authors-field {params.abbr_authors_field} \
| augur curate apply-geolocation-rules \
--geolocation-rules {input.geolocation_rules} \
| ./scripts/infer-dengue-serotype.py \
| {workflow.basedir}/scripts/infer-dengue-serotype.py \
--out-col {params.serotype_field} \
| ./scripts/transform-new-fields \
| {workflow.basedir}/scripts/transform-new-fields \
--map-tsv {input.manual_mapping} \
--map-id host \
--metadata-id host \
Expand Down
19 changes: 15 additions & 4 deletions nextstrain-pathogen.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# This is currently an empty file to indicate the top level pathogen repo.
# The inclusion of this file allows the Nextstrain CLI to run the
# `nextstrain build` from any directory regardless of runtime.
# This file's *existence* marks the top level of a Nextstrain pathogen repo,
# which allows `nextstrain build` to be run from any subdirectory of the repo
# regardless of runtime. For more details, see
# <https://github.com/nextstrain/cli/releases/tag/8.2.0>.
#
# See https://github.com/nextstrain/cli/releases/tag/8.2.0 for more details.
# This file's *contents* is the "registration metadata" for the pathogen repo,
# used by `nextstrain setup` and `nextstrain run`.
---
$schema: https://nextstrain.org/schemas/pathogen/v0
workflows:
ingest:
compatibility:
nextstrain run: True
phylogenetic:
compatibility:
nextstrain run: True
7 changes: 5 additions & 2 deletions phylogenetic/Snakefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
configfile: "defaults/config_dengue.yaml"
# Utility functions shared across all workflows.
include: "../shared/vendored/snakemake/config.smk"

configfile: os.path.join(workflow.basedir, "defaults/config_dengue.yaml")

include: "../shared/vendored/snakemake/config.smk"
include: "rules/config.smk"
Expand Down Expand Up @@ -29,7 +32,7 @@ include: "rules/export.smk"
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
include: os.path.join(os.getcwd(), rule_file)

rule clean:
"""Removing directories: {params}"""
Expand Down
16 changes: 8 additions & 8 deletions phylogenetic/defaults/config_dengue.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ strain_id_field: "accession"
display_strain_field: "strain"

filter:
exclude: "defaults/exclude.txt"
include: "defaults/{serotype}/include.txt"
exclude: "exclude.txt"
include: "{serotype}/include.txt"
group_by: "year region"
min_length:
genome: 5000
Expand All @@ -33,11 +33,11 @@ traits:

clades:
clade_definitions:
all: 'defaults/clades_serotypes.tsv'
denv1: 'defaults/clades_genotypes.tsv'
denv2: 'defaults/clades_genotypes.tsv'
denv3: 'defaults/clades_genotypes.tsv'
denv4: 'defaults/clades_genotypes.tsv'
all: 'clades_serotypes.tsv'
denv1: 'clades_genotypes.tsv'
denv2: 'clades_genotypes.tsv'
denv3: 'clades_genotypes.tsv'
denv4: 'clades_genotypes.tsv'

tip_frequencies:
min_date: "1980-01-01"
Expand All @@ -46,4 +46,4 @@ tip_frequencies:
wide_bandwidth: 0.6

export:
description: "defaults/description.md"
description: "description.md"
4 changes: 2 additions & 2 deletions phylogenetic/rules/annotate_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ rule translate:
input:
tree = "results/{serotype}/{gene}/tree.nwk",
node_data = "results/{serotype}/{gene}/nt-muts.json",
reference = lambda wildcard: "defaults/{serotype}/reference.gb" if wildcard.gene in ['genome'] else "results/defaults/reference_{serotype}_{gene}.gb"
reference = lambda wildcard: resolve_config_path("{serotype}/reference.gb") if wildcard.gene in ['genome'] else "results/defaults/reference_{serotype}_{gene}.gb"
output:
node_data = "results/{serotype}/{gene}/aa-muts.json"
benchmark:
Expand Down Expand Up @@ -94,7 +94,7 @@ rule clades:
tree = "results/{serotype}/genome/tree.nwk",
nt_muts = "results/{serotype}/genome/nt-muts.json",
aa_muts = "results/{serotype}/genome/aa-muts.json",
clade_defs = lambda wildcards: config['clades']['clade_definitions'][wildcards.serotype],
clade_defs = lambda wildcards: resolve_config_path(config['clades']['clade_definitions'][wildcards.serotype]),
output:
clades = "results/{serotype}/genome/clades.json"
benchmark:
Expand Down
10 changes: 5 additions & 5 deletions phylogenetic/rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,17 @@ import json

rule colors:
input:
color_schemes = "defaults/color_schemes.tsv",
color_orderings = "defaults/color_orderings.tsv",
color_schemes = resolve_config_path("color_schemes.tsv"),
color_orderings = resolve_config_path("color_orderings.tsv"),
metadata = "results/{serotype}/metadata.tsv",
manual_colors = "defaults/colors.tsv"
manual_colors = resolve_config_path("colors.tsv")
output:
colors = "results/{serotype}/colors.tsv"
benchmark:
"benchmarks/{serotype}/colors.txt"
shell:
"""
python3 scripts/assign-colors.py \
python3 {workflow.basedir}/scripts/assign-colors.py \
--color-schemes {input.color_schemes} \
--ordering {input.color_orderings} \
--metadata {input.metadata} \
Expand Down Expand Up @@ -180,7 +180,7 @@ rule export:
clades = lambda wildcard: "results/{serotype}/{gene}/clades.json" if wildcard.gene in ['genome'] else [],
nt_muts = "results/{serotype}/{gene}/nt-muts.json",
aa_muts = "results/{serotype}/{gene}/aa-muts.json",
description = config["export"]["description"],
description = resolve_config_path(config["export"]["description"]),
auspice_config = "results/defaults/{serotype}/{gene}/auspice_config.json",
colors = "results/{serotype}/colors.tsv",
output:
Expand Down
4 changes: 2 additions & 2 deletions phylogenetic/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ rule filter:
if wildcard.gene in ['genome']
else "results/{serotype}/{gene}/sequences.fasta"),
metadata = "results/{serotype}/metadata.tsv",
exclude = config["filter"]["exclude"],
include = config["filter"]["include"],
exclude = resolve_config_path(config["filter"]["exclude"]),
include = resolve_config_path(config["filter"]["include"]),
output:
sequences = "results/{serotype}/{gene}/filtered.fasta"
benchmark:
Expand Down
4 changes: 2 additions & 2 deletions phylogenetic/rules/prepare_sequences_E.smk
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ rule generate_E_reference_files:
Generating reference files for the E gene
"""
input:
reference = "defaults/{serotype}/reference.gb",
reference = resolve_config_path("{serotype}/reference.gb"),
output:
fasta = "results/defaults/reference_{serotype}_E.fasta",
genbank = "results/defaults/reference_{serotype}_E.gb",
Expand All @@ -30,7 +30,7 @@ rule generate_E_reference_files:
gene = "E",
shell:
"""
python3 scripts/newreference.py \
python3 {workflow.basedir}/scripts/newreference.py \
--reference {input.reference} \
--output-fasta {output.fasta} \
--output-genbank {output.genbank} \
Expand Down