Merge pull request #36 from nextstrain/use-remote-nextclade-dataset

kimandrews · web-flow · commit dc3cd4b8271c · 2024-06-07T14:03:38.000-07:00
Assign genotypes using Nextclade dataset and visualize on tree
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,4 +1,5 @@
 # CHANGELOG
+* 7 June 2024: Assign genotypes using Nextclade dataset and visualize on tree [PR #36](https://github.com/nextstrain/measles/pull/36)
 * 9 May 2024: Create a N450 tree that can be used as part of a Nextclade dataset to assign genotypes to measles samples based on criteria outlined by the WHO [PR #28](https://github.com/nextstrain/measles/pull/28)
 * 25 April 2024: Add specific sequences and metadata to the measles trees, including WHO reference sequences, vaccine strains, and genotypes reported on NCBI [PR #26](https://github.com/nextstrain/measles/pull/26)
 * 10 April 2024: Add a single GH Action workflow to automate the ingest and phylogenetic workflows [PR #22](https://github.com/nextstrain/measles/pull/22)
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -26,6 +26,7 @@ rule all:
 # by build specific rules.
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
+include: "rules/nextclade.smk"
 
 
 # Allow users to import custom rules provided via the config.
diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -19,3 +19,6 @@ files_to_upload:
   ncbi.ndjson.zst: data/ncbi.ndjson
   metadata.tsv.zst: results/metadata.tsv
   sequences.fasta.zst: results/sequences.fasta
+  nextclade.tsv.zst: results/nextclade.tsv
+  alignment.fasta.zst: results/alignment.fasta
+  translations.zip: results/translations.zip
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -122,3 +122,7 @@ curate:
     'is_reference'
   ]
   genotype_field: "virus_name"
+# Settings read by rules/nextclade.smk
+nextclade:
+  # Dataset name passed to `nextclade3 dataset get --name`
+  dataset_name: "nextstrain/measles/N450/WHO-2012"
+  # TSV mapping Nextclade output column names to final metadata column names
+  field_map: "defaults/nextclade_field_map.tsv"
+  # Nextclade column used as the join key against the metadata ID field
+  id_field: "seqName"
diff --git a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv
@@ -0,0 +1,28 @@
+# TSV file that is a mapping of column names for Nextclade output TSV
+# The first column should be the original column name of the Nextclade TSV
+# The second column should be the new column name to use in the final metadata TSV
+# Nextclade can have pathogen-specific output columns, so make sure to check which
+# columns would be useful for your downstream phylogenetic analysis.
+seqName	seqName
+clade	clade
+coverage	coverage
+totalMissing	missing_data
+totalSubstitutions	divergence
+totalNonACGTNs	nonACGTN
+qc.overallStatus	QC_overall
+qc.missingData.status	QC_missing_data
+qc.mixedSites.status	QC_mixed_sites
+qc.privateMutations.status	QC_rare_mutations
+qc.snpClusters.status	QC_snp_clusters
+qc.frameShifts.status	QC_frame_shifts
+qc.stopCodons.status	QC_stop_codons
+frameShifts	frame_shifts
+privateNucMutations.reversionSubstitutions	private_reversion_substitutions
+privateNucMutations.labeledSubstitutions	private_labeled_substitutions
+privateNucMutations.unlabeledSubstitutions	private_unlabeled_substitutions
+privateNucMutations.totalReversionSubstitutions	private_total_reversion_substitutions
+privateNucMutations.totalLabeledSubstitutions	private_total_labeled_substitutions
+privateNucMutations.totalUnlabeledSubstitutions	private_total_unlabeled_substitutions
+privateNucMutations.totalPrivateSubstitutions	private_total_private_substitutions
+qc.snpClusters.clusteredSNPs	private_snp_clusters
+qc.snpClusters.totalSNPs	private_total_snp_clusters
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -122,7 +122,7 @@ rule subset_metadata:
     input:
         metadata="data/all_metadata.tsv",
     output:
-        subset_metadata="results/metadata.tsv",
+        subset_metadata="data/subset_metadata.tsv",
     params:
         metadata_fields=",".join(config["curate"]["metadata_columns"]),
     shell:
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -0,0 +1,80 @@
+"""
+This part of the workflow handles running Nextclade on the curated sequences
+and joining selected Nextclade results to the curated metadata.
+
+See the Nextclade docs for more details on usage, inputs, and outputs if you
+would like to customize the rules.
+"""
+# Nextclade dataset name from the workflow config; reused below both as the
+# `--name` argument and as part of the path of the downloaded dataset zip.
+DATASET_NAME = config["nextclade"]["dataset_name"]
+
+
+rule get_nextclade_dataset:
+    """
+    Download the Nextclade dataset named by config["nextclade"]["dataset_name"]
+    as a zip archive.
+    """
+    output:
+        # DATASET_NAME may contain slashes (the default config value does), in
+        # which case the zip is written into nested subdirectories of
+        # data/nextclade_data/.
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+    params:
+        dataset_name=DATASET_NAME
+    shell:
+        """
+        nextclade3 dataset get \
+            --name={params.dataset_name:q} \
+            --output-zip={output.dataset} \
+            --verbose
+        """
+
+
+rule run_nextclade:
+    """
+    Run Nextclade on the curated sequences with the downloaded dataset.
+
+    Writes the Nextclade TSV and the alignment FASTA, then zips the FASTA
+    files written under results/translations/ into a single archive
+    (`zip -j` stores them without their directory paths).
+    """
+    input:
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+        sequences="results/sequences.fasta",
+    output:
+        nextclade="results/nextclade.tsv",
+        alignment="results/alignment.fasta",
+        translations="results/translations.zip",
+    params:
+        # Wrapped in a lambda so the literal "{cds}" reaches the shell command
+        # unchanged. NOTE(review): presumably a plain string here would have
+        # "{cds}" treated as a Snakemake wildcard, and "{cds}" is the template
+        # placeholder expected by `--output-translations` -- confirm against
+        # the Snakemake and Nextclade docs.
+        translations=lambda w: "results/translations/{cds}.fasta",
+    shell:
+        """
+        nextclade3 run \
+            {input.sequences} \
+            --input-dataset {input.dataset} \
+            --output-tsv {output.nextclade} \
+            --output-fasta {output.alignment} \
+            --output-translations {params.translations}
+
+        zip -rj {output.translations} results/translations
+        """
+
+
+rule join_metadata_and_nextclade:
+    """
+    Join selected Nextclade results onto the subset metadata.
+
+    Steps performed by the shell pipeline:
+      1. Read the first column of the field map (ignoring `#` comment lines)
+         to get the list of Nextclade columns to keep.
+      2. Cut the Nextclade TSV down to those columns and rename them using the
+         field map as a key-value file (original name -> new name).
+      3. Join the renamed Nextclade columns onto the metadata by ID. Every
+         metadata record is written; records without a Nextclade match get
+         "?" in the appended fields.
+      4. Drop the Nextclade ID column from the final metadata.
+
+    The join key `nextclade_id_field` is looked up AFTER the rename step, so
+    the field map must keep that column's name unchanged (as the default map
+    does with `seqName -> seqName`).
+    """
+    input:
+        nextclade="results/nextclade.tsv",
+        metadata="data/subset_metadata.tsv",
+        nextclade_field_map=config["nextclade"]["field_map"],
+    output:
+        metadata="results/metadata.tsv",
+    params:
+        metadata_id_field=config["curate"]["output_id_field"],
+        nextclade_id_field=config["nextclade"]["id_field"],
+    # The `?` fill value for `--write-all` is single-quoted: unquoted, it is a
+    # shell glob that would expand to any single-character filename present in
+    # the working directory. $SUBSET_FIELDS is quoted for the same reason
+    # (no word splitting or globbing on the column list).
+    shell:
+        """
+        export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
+
+        csvtk -tl cut -f "$SUBSET_FIELDS" \
+            {input.nextclade} \
+        | csvtk -tl rename2 \
+            -F \
+            -f '*' \
+            -p '(.+)' \
+            -r '{{kv}}' \
+            -k {input.nextclade_field_map} \
+        | tsv-join -H \
+            --filter-file - \
+            --key-fields {params.nextclade_id_field} \
+            --data-fields {params.metadata_id_field} \
+            --append-fields '*' \
+            --write-all '?' \
+            {input.metadata} \
+        | tsv-select -H --exclude {params.nextclade_id_field} \
+            > {output.metadata}
+        """
diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json
@@ -17,18 +17,23 @@
       "type": "continuous"
     },
     {
-      "key": "country",
-      "title": "Country",
+      "key": "clade",
+      "title": "MeV Genotype (Nextstrain)",
       "type": "categorical"
     },
     {
       "key": "region",
       "title": "Region",
       "type": "categorical"
     },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
     {
       "key": "genotype_ncbi",
-      "title": "Genotype (NCBI)",
+      "title": "MeV Genotype (GenBank metadata)",
       "type": "categorical"
     }
   ],
@@ -37,11 +42,13 @@
     "region"
   ],
   "display_defaults": {
-    "map_triplicate": true
+    "map_triplicate": true,
+    "color_by": "clade"
   },
   "filters": [
-    "country",
+    "clade",
     "region",
+    "country",
     "author"
   ],
   "metadata_columns": [
diff --git a/phylogenetic/defaults/auspice_config_N450.json b/phylogenetic/defaults/auspice_config_N450.json
@@ -17,18 +17,23 @@
       "type": "continuous"
     },
     {
-      "key": "country",
-      "title": "Country",
+      "key": "clade",
+      "title": "MeV Genotype (Nextstrain)",
       "type": "categorical"
     },
     {
       "key": "region",
       "title": "Region",
       "type": "categorical"
     },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
     {
       "key": "genotype_ncbi",
-      "title": "Genotype (NCBI)",
+      "title": "MeV Genotype (GenBank metadata)",
       "type": "categorical"
     },
     {
@@ -42,11 +47,13 @@
     "region"
   ],
   "display_defaults": {
-    "map_triplicate": true
+    "map_triplicate": true,
+    "color_by": "clade"
   },
   "filters": [
-    "country",
+    "clade",
     "region",
+    "country",
     "author"
   ],
   "metadata_columns": [
diff --git a/phylogenetic/defaults/colors.tsv b/phylogenetic/defaults/colors.tsv
@@ -31,3 +31,29 @@ genotype_ncbi	G2	#E67832
 genotype_ncbi	G3	#E35F2D
 genotype_ncbi	H1	#DF4328
 genotype_ncbi	H2	#DB2823
+#
+# MeV Genotypes assigned by Nextclade
+clade	A	#5E1D9D
+clade	B1	#4B26B1
+clade	B2	#4138C3
+clade	B3	#3F4FCC
+clade	C1	#4065CF
+clade	C2	#447ACD
+clade	D1	#4A8BC3
+clade	D2	#529AB6
+clade	D3	#5BA6A6
+clade	D4	#66AE95
+clade	D5	#73B583
+clade	D6	#81B973
+clade	D7	#91BC64
+clade	D8	#A1BE58
+clade	D9	#B1BD4E
+clade	D10	#C0BA47
+clade	D11	#CEB541
+clade	E	#DAAD3D
+clade	F	#E19F3A
+clade	G1	#E68E36
+clade	G2	#E67832
+clade	G3	#E35F2D
+clade	H1	#DF4328
+clade	H2	#DB2823
diff --git a/phylogenetic/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv
diff --git a/phylogenetic/example_data/sequences.fasta b/phylogenetic/example_data/sequences.fasta

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`# CHANGELOG`
	`2`	`+* 7 June 2024: Assign genotypes using Nextclade dataset and visualize on tree [PR #36](https://github.com/nextstrain/measles/pull/36)`
`2`	`3`	`* 9 May 2024: Create a N450 tree that can be used as part of a Nextclade dataset to assign genotypes to measles samples based on criteria outlined by the WHO [PR #28](https://github.com/nextstrain/measles/pull/28)`
`3`	`4`	`* 25 April 2024: Add specific sequences and metadata to the measles trees, including WHO reference sequences, vaccine strains, and genotypes reported on NCBI [PR #26](https://github.com/nextstrain/measles/pull/26)`
`4`	`5`	`* 10 April 2024: Add a single GH Action workflow to automate the ingest and phylogenetic workflows [PR #22](https://github.com/nextstrain/measles/pull/22)`
Original file line number	Diff line number	Diff line change
`@@ -122,3 +122,7 @@ curate:`
`122`	`122`	`'is_reference'`
`123`	`123`	`]`
`124`	`124`	`genotype_field: "virus_name"`
	`125`	`+nextclade:`
	`126`	`+ dataset_name: "nextstrain/measles/N450/WHO-2012"`
	`127`	`+ field_map: "defaults/nextclade_field_map.tsv"`
	`128`	`+ id_field: "seqName"`