From e42b38c658dbffe8126dcb8f2438d83a17a35a75 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 29 Oct 2025 14:35:31 -0700 Subject: [PATCH 01/11] Phylogenetic: Add VP1 and RdRp to genotype builds to see if they have sufficient clock signal --- phylogenetic/defaults/all/config.yaml | 2 +- phylogenetic/defaults/config.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/phylogenetic/defaults/all/config.yaml b/phylogenetic/defaults/all/config.yaml index 477e54f..58defa2 100644 --- a/phylogenetic/defaults/all/config.yaml +++ b/phylogenetic/defaults/all/config.yaml @@ -8,4 +8,4 @@ # {group} represents different genotypes to be analyzed groups: ['all'] # {gene} represents the norovirus genes to focus on in each build -genes: ['3CLpro', 'NTPase', 'p22', 'p48', 'RdRp', 'VP1', 'VP2', 'VPg'] +genes: ['genome', '3CLpro', 'NTPase', 'p22', 'p48', 'RdRp', 'VP1', 'VP2', 'VPg'] diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index d55d93f..aba9922 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -6,9 +6,9 @@ # Define wildcards used for building trees in the workflow, also used in the intermediate files or auxillariary default file names. 
# {group} represents different genotypes to be analyzed -groups: ['GII.6', 'GII.4', 'GII.2', 'GII.3', 'GII.17', 'all'] +groups: ['GII.6', 'GII.4', 'GII.2', 'GII.3', 'GII.17'] # {gene} represents the norovirus genes to focus on in each build -genes: ['genome'] +genes: ['genome', 'VP1', 'RdRp'] inputs: - name: ncbi From ece4a169e53274e6f1102f4a754e01c3274d09ce Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 29 Oct 2025 14:36:01 -0700 Subject: [PATCH 02/11] Phylogenetic: Fixup to avoid a trailing slash in the command --- phylogenetic/rules/construct_phylogeny.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index d0f8749..857d294 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -90,6 +90,6 @@ rule refine: --metadata-id-columns {params.id_field} \ --output-tree {output.tree:q} \ --output-node-data {output.node_data:q} \ - --stochastic-resolve \ - {params.clock_rate_params} + {params.clock_rate_params} \ + --stochastic-resolve """ From dfd2cee5979b781f71018ba2cda8868acccc9fc3 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 29 Oct 2025 14:39:00 -0700 Subject: [PATCH 03/11] Ingest: Add ORF coverage columns --- ingest/defaults/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 4d613d6..7f8102d 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -150,7 +150,7 @@ gene_coverage: id_field: "seqName" coverage: cdsCoverage_field: "cdsCoverage" - genes: "p48,NTPase,p22,VPg,3CLpro,RdRp,VP1,VP2" + genes: "p48,NTPase,p22,VPg,3CLpro,RdRp,VP1,VP2,ORF1,ORF2,ORF3" round_digits: 3 # Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow From 7c17f99f39bbae2bedeb5ae42d3a2b3367fcb891 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 29 Oct 
2025 15:51:29 -0700 Subject: [PATCH 04/11] Phylogenetic: Work-around for GII.4 outliers --- phylogenetic/defaults/GII.4/exclude.txt | 13 +++++++++++++ phylogenetic/defaults/config.yaml | 4 +++- phylogenetic/rules/prepare_sequences.smk | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 phylogenetic/defaults/GII.4/exclude.txt diff --git a/phylogenetic/defaults/GII.4/exclude.txt b/phylogenetic/defaults/GII.4/exclude.txt new file mode 100644 index 0000000..f6cd0a4 --- /dev/null +++ b/phylogenetic/defaults/GII.4/exclude.txt @@ -0,0 +1,13 @@ +KX158286 +JX846924 +MK073894 +KJ407072 +NC_039475 +KC597139 +KJ196286 +MH218591 +KT589391 +MG557657 # Low quality, may break all/VPg +KX158285 # Perhaps too diverged, may break GII.4/genome +OR951134 # False positive GII.4, remove after nextclade tuning +MF373609 # False positive GII.4, remove after nextclade tuning diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index aba9922..90e3e97 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -20,7 +20,9 @@ strain_id_field: "accession" reference: defaults/{group}/reference.gb filter: - exclude: defaults/exclude.txt + exclude: + default: defaults/exclude.txt + GII.4: defaults/GII.4/exclude.txt min_coverage: 0.8 filter_params: --group-by year VP1_nextclade RdRp_nextclade --sequences-per-group 30 --min-date 1950 --exclude-ambiguous-dates-by year --query-columns is_lab_host:str --exclude-where is_lab_host='true' diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 69f35b4..67f8977 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -47,7 +47,7 @@ rule filter: input: sequences = "results/sequences.fasta", metadata = "results/metadata.tsv", - exclude = config['filter']['exclude'] + exclude = lambda wildcards: config['filter']['exclude'].get(wildcards.group, config['filter']['exclude']['default']), 
output: sequences = "results/{group}/{gene}/filtered.fasta", metadata = "results/{group}/{gene}/filtered.tsv", From 3933e16fc6a8c41b0d70795d91c52be94530c49c Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 29 Oct 2025 16:22:22 -0700 Subject: [PATCH 05/11] fixup: Add ORF2 and ORF3 to gff --- phylogenetic/defaults/all/reference.gff3 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/phylogenetic/defaults/all/reference.gff3 b/phylogenetic/defaults/all/reference.gff3 index a161a6a..451b6c6 100644 --- a/phylogenetic/defaults/all/reference.gff3 +++ b/phylogenetic/defaults/all/reference.gff3 @@ -14,7 +14,11 @@ NC_039477_REF Genbank gene 3029 3571 . + . ID=gene-3CLpro;Name=3CLpro NC_039477_REF Genbank CDS 3029 3571 . + . ID=cds-3CLpro;Parent=gene-3CLpro;Name=3CLpro NC_039477_REF Genbank gene 3572 5101 . + . ID=gene-RdRp;Name=RdRp NC_039477_REF Genbank CDS 3572 5101 . + . ID=cds-RdRp;Parent=gene-RdRp;Name=RdRp +NC_039477_REF Genbank gene 5085 6707 . + . ID=gene-ORF2;Name=ORF2 +NC_039477_REF Genbank CDS 5085 6707 . + . ID=cds-ORF2;Parent=gene-ORF2;Name=ORF2 NC_039477_REF Genbank gene 5085 6707 . + . ID=gene-VP1;Name=VP1 NC_039477_REF Genbank CDS 5085 6707 . + . ID=cds-VP1;Parent=gene-VP1;Name=VP1 +NC_039477_REF Genbank gene 6707 7513 . + . ID=gene-ORF3;Name=ORF3 +NC_039477_REF Genbank CDS 6707 7513 . + . ID=cds-ORF3;Parent=gene-ORF3;Name=ORF3 NC_039477_REF Genbank gene 6707 7513 . + . ID=gene-VP2;Name=VP2 NC_039477_REF Genbank CDS 6707 7513 . + . 
ID=cds-VP2;Parent=gene-VP2;Name=VP2 From f1a677a713e60cdc94ac8028f95576fbc1204617 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 29 Oct 2025 17:05:17 -0700 Subject: [PATCH 06/11] Phylogenetic: Add ORF trees --- phylogenetic/defaults/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 90e3e97..5c1ef27 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -8,7 +8,7 @@ # {group} represents different genotypes to be analyzed groups: ['GII.6', 'GII.4', 'GII.2', 'GII.3', 'GII.17'] # {gene} represents the norovirus genes to focus on in each build -genes: ['genome', 'VP1', 'RdRp'] +genes: ['genome', 'VP1', 'RdRp', 'ORF1', 'ORF2', 'ORF3'] inputs: - name: ncbi From c08818696e44c7037bdebf16bbdac0cdb44ad5ca Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 4 Nov 2025 15:41:18 -0800 Subject: [PATCH 07/11] phylogenetic: Update links to trees --- phylogenetic/defaults/description.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/phylogenetic/defaults/description.md b/phylogenetic/defaults/description.md index 0ed0593..253a3b2 100644 --- a/phylogenetic/defaults/description.md +++ b/phylogenetic/defaults/description.md @@ -1,15 +1,22 @@ We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata for sharing their work. Please note that although data generators have generously shared data in an open fashion, that does not mean there should be free license to publish on this data. Data generators should be cited where possible and collaborations should be sought in some circumstances. Please try to avoid scooping someone else's work. Reach out if uncertain. 
-**We maintain 14 views of norovirus evolution:** +**We provide divergence tree views of norovirus evolution across all samples:** | group | genome | p48 | NTPase | p22 | VPg | 3CLpro | RdRp | VP1 | VP2 | |:--|:--|:--|:--|:--|:--|:--|:--|:--|:--| | all | [genome](https://nextstrain.org/norovirus/all/genome) | [p48](https://nextstrain.org/norovirus/all/p48) | [NTPase](https://nextstrain.org/norovirus/all/NTPase) | [p22](https://nextstrain.org/norovirus/all/p22) | [VPg](https://nextstrain.org/norovirus/all/VPg) | [3CLpro](https://nextstrain.org/norovirus/all/3CLpro) | [RdRp](https://nextstrain.org/norovirus/all/RdRp/) | [VP1](https://nextstrain.org/norovirus/all/VP1) | [VP2](https://nextstrain.org/norovirus/all/VP2) | -| GII.2 | [genome](https://nextstrain.org/norovirus/GII.2/genome) | | | | | | | | | -| GII.3 | [genome](https://nextstrain.org/norovirus/GII.3/genome) | | | | | | | | | -| GII.4 | [genome](https://nextstrain.org/norovirus/GII.4/genome) | | | | | | | | | -| GII.6 | [genome](https://nextstrain.org/norovirus/GII.6/genome) | | | | | | | | | -| GII.17 | [genome](https://nextstrain.org/norovirus/GII.17/genome) | | | | | | | | | + +**We provide timetree views by genogroup:** + +Since norovirus is known to have recombination breakpoints between ORF1-ORF2 (between the RdRp and VP1 genes) and less frequently between ORF2-ORF3, the timetree views are by breakpoint region. 
+ +| group | ORF1 | ORF2 | ORF3 | RdRp | VP1 | +|:--|:-- |:--|:--|:--|:--| +| GII.2 | [ORF1](https://nextstrain.org/norovirus/GII.2/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.2/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.2/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.2/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.2/VP1/) | +| GII.3 | [ORF1](https://nextstrain.org/norovirus/GII.3/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.3/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.3/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.3/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.3/VP1/) | +| GII.4 | [ORF1](https://nextstrain.org/norovirus/GII.4/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.4/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.4/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.4/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.4/VP1/) | +| GII.6 | [ORF1](https://nextstrain.org/norovirus/GII.6/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.6/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.6/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.6/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.6/VP1/) | +| GII.17 | [ORF1](https://nextstrain.org/norovirus/GII.17/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.17/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.17/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.17/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.17/VP1/) | #### Nextclade group, type, and variant assignment From 92de1f70e96934d00d202dd8fce9fb73fbff2617 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 4 Nov 2025 15:59:29 -0800 Subject: [PATCH 08/11] Phylogenetic: Timetree by genogroup, divergence tree for all samples --- phylogenetic/defaults/all/config.yaml | 4 +++ phylogenetic/defaults/config.yaml | 30 +--------------------- phylogenetic/rules/construct_phylogeny.smk | 2 +- 3 files changed, 6 insertions(+), 30 deletions(-) diff --git 
a/phylogenetic/defaults/all/config.yaml b/phylogenetic/defaults/all/config.yaml index 58defa2..6da4d01 100644 --- a/phylogenetic/defaults/all/config.yaml +++ b/phylogenetic/defaults/all/config.yaml @@ -9,3 +9,7 @@ groups: ['all'] # {gene} represents the norovirus genes to focus on in each build genes: ['genome', '3CLpro', 'NTPase', 'p22', 'p48', 'RdRp', 'VP1', 'VP2', 'VPg'] + +refine: + root: + default: mid_point \ No newline at end of file diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 5c1ef27..21d9686 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -8,7 +8,7 @@ # {group} represents different genotypes to be analyzed groups: ['GII.6', 'GII.4', 'GII.2', 'GII.3', 'GII.17'] # {gene} represents the norovirus genes to focus on in each build -genes: ['genome', 'VP1', 'RdRp', 'ORF1', 'ORF2', 'ORF3'] +genes: ['ORF1', 'ORF2', 'ORF3', 'RdRp', 'VP1'] inputs: - name: ncbi @@ -29,34 +29,6 @@ filter: refine: root: default: best --timetree --date-confidence --clock-filter-iqd 4 --date-inference marginal - all: - p48: DQ366347 --timetree --date-confidence --clock-filter-iqd 4 --date-inference marginal - VP2: best --timetree --date-confidence - genome: best --timetree --date-confidence - GII.4: - genome: mid_point --timetree --date-confidence - # Estimated from auspice build https://github.com/nextstrain/norovirus/issues/22#issuecomment-3221851561 - clock_rate: - all: - genome: 0.00328 - p48: 0.00409 - NTPase: 0.00174 - p22: 0.00174 - VPg: 0.00660 - 3CLpro: 0.00331 - RdRp: 0.00135 - VP1: 0.00136 - VP2: 0.00228 - GII.2: - genome: 0.000211 - GII.3: - genome: 0.000688 - GII.4: - genome: 0.000874 - GII.6: - genome: 0.000135 - GII.17: - genome: 0.00142 traits: default: region country ORF1_type ORF2_type VP1_nextclade VP1_group VP1_type VP1_variant RdRp_nextclade RdRp_group RdRp_type RdRp_variant host diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk 
index 857d294..928f7b1 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -52,7 +52,7 @@ def _clock_rate_params(wildcards): else leave blank """ - clock_rate = config['refine']['clock_rate'].get(wildcards.group, {}).get(wildcards.gene, "") + clock_rate = config['refine'].get('clock_rate', {}).get(wildcards.group, {}).get(wildcards.gene, "") if clock_rate !="": return f' --clock-rate {clock_rate} ' else: From 2a8c5ddbd75a25a7a48ecf09bfec1b95e539125f Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 7 Nov 2025 17:09:02 -0500 Subject: [PATCH 09/11] Phylogenetic: Outgroup root divergence trees of 'all' samples --- phylogenetic/defaults/all/config.yaml | 3 +- phylogenetic/defaults/outgroup.fasta | 2 ++ phylogenetic/rules/construct_phylogeny.smk | 42 ++++++++++++++++++++-- 3 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 phylogenetic/defaults/outgroup.fasta diff --git a/phylogenetic/defaults/all/config.yaml b/phylogenetic/defaults/all/config.yaml index 6da4d01..fca5366 100644 --- a/phylogenetic/defaults/all/config.yaml +++ b/phylogenetic/defaults/all/config.yaml @@ -11,5 +11,4 @@ groups: ['all'] genes: ['genome', '3CLpro', 'NTPase', 'p22', 'p48', 'RdRp', 'VP1', 'VP2', 'VPg'] refine: - root: - default: mid_point \ No newline at end of file + outgroup: NC_027026_outgroup \ No newline at end of file diff --git a/phylogenetic/defaults/outgroup.fasta b/phylogenetic/defaults/outgroup.fasta new file mode 100644 index 0000000..3ded2b6 --- /dev/null +++ b/phylogenetic/defaults/outgroup.fasta @@ -0,0 +1,2 @@ +>NC_027026_outgroup 
+gtgatcaccttgggatggcttccaagccattccaagttgagtcttgcgacttcatgtttgaagtccatgttctccacatgtgctacctcagggtagcaccgagagaacagtttcttgatgactgtactataagtgcagttaccaaatattatttgcaccacgtgcccctacataatagaaatggcatggactgcgagtttgaggctgaggggttgctgtcccggttattcgggacggccggttccccctcactggattcgcagtccgctttcaaggagcttttcggctttgacaccgatgagcaaatgccattgtcccttgaagaattggccaaattgcaaggcgaaatcaccgcctcgcttcaaatgaacaataaccaattcgttgctaaacatggtaaggccaaggtgcaggcccttttggaccagctcaacacactggtcccccgtgacatcagtgaggctgaaagacgccgtcgagaattctttgagaggcagactgctgctgccttcgctgaactcccaaatgatgacacatttaccgaacaagattggaagtcctattggtatgcaatgtggcggcgagttgtgggtggttgcaaatcatactaccatggtctgccccgttggagctctttcaagacacggctctcgcgtgccacagagcccctccgccaggtgttagcagttgctgcacagacctttgatcaatgtgtacaaactgaccctcgcatcttggctatgaattgtgtgactgccttgaaacccactgttttaaccatgatataccagcaacatcacaacacacccagtgggtggcttgcaacccttacggccctctgggaggttttccaaccttccctccctgcattaggcactttaggtactggtgttgccacaaccttggggctcgtcgtcaacaccttgtcacggttcttccagaaactttgtgcttacatttctgaaacatttcttcccgaatctccaacaaccccaggctgggttgccattgtggctggtgttctgctgttacttttgaaactctcctgcatccccaaagtgttttcccactggagcacgcttctcaagttggccagtggtatcacaacagtcattggagcaacacgagccgtggactggatcatggggaaaattcgcgacgcgcgccattcttccatgtgtaaacagttccttagccgtgtgtctgcactgttggagctccactactcaaagactgtcacaggtgttgcagagaacacagagctcctcaagtgtttcgaccagctcattgatgaaggagaagaattggtctctgaaataggaggtggttctttagccgcaataatacgcagtggtgttgacacactccagagggtttccacagaaatcaaagccacaatccaactggacaacccgcgtccagtgcccgtatgtgtgatattctccggacctccaggtattggaaaaacctctttagcataccacatggccaagggcattggtctcacctccaacttttctttggcaaacgatcatcatgatggatatacaggtaaccctgtagctatatgggatgaatatgatactgacaaggatggcaaatttgtggagcagatgatatccctggtcaacacacagccttgtgtcctcaattgtgaccgaccagaaaataagggcaagctttttacatccaagttcattttctgcactaccaattacacgaccagtgtgctcccagacaatccacgtgctggcgcattctaccgtcgagtcatcactgtagatgtgcgatctcctgaaattgaggattggatggcagcacaccctggtaggtcccctccgaaaactctcttcaaaagcgattgctcccacttgaaattaatggttaggccatatatggggtacaaccctgacggggatactctagctggcaaacgcgtaaagcccaccccaatcaccatagctgggcttcatgacatgattgacaaaaagtttgagga
gcaatcaggggaggtccggggcatttggatcactgtgccccggcgtagtgtgcaaacagcccttgttgctgtcaagaagttttgtgtagcccatcaagcgttgtgccacgtcacatccactccttctcctgaaattctacagtgtgccactttctcgtgtgttgttgtgtcagatgcgcacccgccacctggtgctccacttttgcacatcaagaatgcacatcttgaagtagaccacactggccaagctgtcaccagcatttcagagagcctgcttggtatgtttattacagagcaacgtgtttcatccaagttgcaaagggatatcatgtacaaagtgtggtcaccttttacattaatgcagactgagccattgaatacacagtccctacccccagttaggcgcataatttatgctgacacgcccatggatttcattgggggtttaaggcaccaccttggctttagttcaattcctggtctgtggcgtgcccttaagcatctcccagacacgccctcaatgatagagtggatcactgaccacctttctcaagtgcagtttcccaacaaccccgaatcaaccctatttagaacaggcaatggtgatgtcatattctatacttatggttctttttatgctctaggcacttgtgcccgtgtgcctgtagttagtggtgacacagttagccctctcccaaatgtccccctaaaaatgacttggtttgaaacactgaaggccctttgctcgtcagcattgcgcttgttcacggccatctcaccatttgccatagccgttgctaatgtcacttatctcaccacccgaggatcacgtgaagaacaagccaaaggcaagacaaaacatggtcgtggcgcccgccatgcccgtgggcgatccacagcgcttaatgatgatgagtacaatgagtggatggacttgcgccgcgactggcgtgaagaaatgactgctgacgagttcttacacctaagggatgaagcttatgaaggtatcatcaatgaccgcacccagcggtacaacacctggcttaacctgcgcaacatgcgcttaggcgctggagcgtaccaacatgcaacaatcataggcaagggtggtgtgcgcaatgagatcatacgcacccaagtgctgaaggctcccaggaaaggaaaatggtcacacattgattcttctggccccatgaattattttgatgaggcaccgaccccactagtggaatttgagtgtgatggtgcccatgttgggtggggggtgcaccttgggaacgggcgcgtggtgacagtcacccatgtggccacatcatctaacactgtgaacgggctgccatttaaggtcaaggacaccgacggggagacctgccaggtttatgccactctaggtaacttgccacactaccagcttggggatggtgcgcctgtttactacacaaccaggtaccaccctgtgcttgtgattggtgaagggcaatttgacacacctacaacgactgtcaacggtttccacgttcgcattaccaattcatatccaacaaagaaaggtgattgtggactaccatacttcaatgcactgcgccaggtggttgcccttcatgctgctggcagtacagatgggtcaaccaagttggcgcaacgagttgctgagaaaccgcaaacaggggatgcattcgtctggaagggattgcccgttgttcgtggtagtgatgtcggcggactccccactggcactcgctaccatcgctccccagcttggccagagatgcgatctgacgagactcattcaccagctccttttggttctggtgacaagcggtatgaattctcacaagtggagatgcttgttaacaacttgagaccataccttgaggaagttccaggggtgcctccagcattacttaatagagctattgttcacacccgcaattacctacaatcaataattggcactgaacaaagtgaaccacttacttatgccatggcatcatcaatgt
tggaaaaaggcacctcctgtggtccccacattactggccttaagggtgattactgggatgatgaaacacaacagtacacagggtcacttcgagaacatctggaagcggtctggaacaaggctatgttagggacaccaccaagccacgattacaaattggcactcaaagatgagttaagaccaaatgagaaaaatgaacagggtaagaggcgcctgctgtggggtgctgatgctgggcttactctcgtgtgttgtgctgctctcaaaccagcagccgcacgactgcaaactgttgtgcctatgacacctgttgcagtcggcatcaacatggattctgcacacatagaagttatgaatgagtcccttaaggggcgtgtgctttatgcattagattatagcaagtgggacagcacccaatctgcagctgtgacagcagcctcacttgagatccttgcatccttcatgacaccaactccaattgtctcatcagctattgaggccctgaaggcacccgctagaggcatggtcaacgatgccatcttcatagctcgatccggtttgccatctggcatgccgttcacaagtgtggtcaactccatcaaccacatgctttacatctctgcagctattcttcaggcatacgaagcacataatctcccatatagtgacaatgtttttaacattgaaaccattcacacctatggtgatgattgcctgtacgggttcacccccgcaacagcctctctcatgcaggtaatcatagaaaacctacgctcatatggcctgaaacccacagcagctgataagggcgaaaccattgccccagtgcaaactcctgtttttctcaagcgcacgttcgcaacaacaccacatggactgcgcgctttgcttgacacctcttccattcttaggcagttcttctgggtgaaagcccaacgcacatgtgatgtctattctgctcctacaattgacaccaaatctcgggctgcgcagctcgaggtggcattggcctatgctagtcaacatggtcatgagttcttcaacaaagcacgtgaaatcgctgagaaaacaagtgcagctgaggggtatgttctggtgaataccaactacgaacaagcaactgcatgttacaacagctggtacataggaggcactacaccagaaatgcccgctaccaatgaaggctgcgggctaatagtgtttgagatggagggcaatggctccccacgaggaggtaaccagcctcaatcccacaatggtggcacttcacctgctcaggctgcaccgcctggcacgactggccctgcggaagcaccccttgtacctgttaatcctgaacagcccaattccattgcccaacgcatggagctggctgttgccacaggagcaacaacctcaaatgtccctgaatgtgtgcggaactgctttgctctccttcgtacgattccttggaattctcgacagccccagggatctcttctcacagctgtttctttacaccctgacatcaacccgtacacaaaacatcttgctcaaatgtttgccggatggggaggggcaatggatgtccgtgtcacgatctcaggctcgggcttatttgcagggaagcttgtgtgtgggatacttcctcctggagttaatcccactcttgtcagcgatccgggggttctgccacacgctttggtagatgcccgcgtgactgaaccagcatgcttcaacgtgcctgatgtccgcgccgtcgactaccatcgcactgacggcgatgaggcaactgcaacacttggcatttgggtgctccaaccactaatcaaccccttctccactgaagctgtttcaaccgcttggatttcaattgagaccaaaccaggtggtgattttgatctttgcttgatgaaacctcctaaccaagccatggataacggcacatctccatcgttcctcttgccacgccgtttacaacgttcaagaggcaaccgcgctggtggctatgcagttggaatg
gtcattgtgggctcagcacaccaggtgaaccgccacttcactgctcttggcacaacatttggttggtctaccgcaccttacgaacccatgcgatgtgcttttggtggagtacatcaagggcgtgacaccaacccaaaaattggctactactgggaggttggtgctgaccagcgtggcccgctttttccaaacattgttaatcattggcctgattttgcagtcaacaccaaatacacatggccagatgccgactatataccccatagtgcggttgttggaaccttggtttccttccaggacaatggagatgtttctgaggatcaagtggctactgcatttgcaatttcaatgaacactccaagtgggagcacaactggacgcggcacagtcagagaagcatttgatccatccacaatgcatttggtccgcaccaatggtaccactcaaccatctgggtggccaaccggctcaaacactggaaatggttacttcacaccaatgtggggtcatggtcagggtaatgcaattaatgataagatcaccaacatggagggggctaattatacatttggaggctctggccaaaacaacatagtcttgtgggttgaaagaatcttctcagaccatcctggtaaaaccacactttactcatcacaattggacagcactgcggccattttccagtcaggcccagtcaatatacctgagaacatgatggctgtctacaatgtcaccactaatggggctgactttcaagttggcatccgtcgtgatggctacatggtcacatctggaacaattggtactcagcaggagcttgaccctgacaccactttcacttatgttggacttttccccctttctgcctcattggttggcccacatgggaattctggacgggcccagatagcatggtcatgagctggttggtaggcactttgcaatctcttggtgggctcactgacgttgcgtccaccatttctggcatggtttatcaacatagacatcttgaccaactgaaaagacagaatgatctacaagaacagtggatggcccgcaatgaacaactgcagcgagatgcaatgcaattaactcaggatttggcagtcaatgccccagcgatgagagtgcaggccgctctcaatgctgggtttgatgtggttagtgcgcgccagcttgccggttcaactgagcgaaggatcaacggctatttggatcagccaattcgcactattgatcaggcaatggcagtgcagtctaggggcaatttgacttcgttgtcgaacgcccttgccacttaccaaaaaggcactcaatttggactcaaacaacccaaaggatttaagagcccaattgctgctgaacaatctcgtggtccaaccattacacttggccctcccccaccatctactaatctataaatcaatcttttataaatttgtgcaaatttctttttcttcctcatggtcgcacacgcgttcgggtgcgttgcagtcaattaagcgattgacgccatctttgg diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index 928f7b1..933d5b9 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -19,11 +19,41 @@ This part of the workflow usually includes the following steps: See Augur's usage docs for these commands for more details. 
""" +rule add_outgroup: + """Add outgroup""" + input: + alignment = "results/{group}/{gene}/aligned.fasta", + outgroup = "defaults/outgroup.fasta", + output: + alignment_with_outgroup = "results/{group}/{gene}/aligned_with_outgroup.fasta", + log: + "logs/{group}/{gene}/add-outgroup.txt", + benchmark: + "benchmarks/{group}/{gene}/add-outgroup.txt", + shell: + """ + augur align \ + --sequences {input.outgroup} \ + --existing-alignment {input.alignment} \ + --output {output.alignment_with_outgroup} \ + 2>&1 | tee {log} + """ + +def _alignment(wildcards): + """ + Based on if outgroup rooting is specified in the config file, return the needed alignment file + """ + outgroup = config['refine'].get('outgroup', "") + if outgroup != "": + return "results/{group}/{gene}/aligned_with_outgroup.fasta" + else: + return "results/{group}/{gene}/aligned.fasta" + rule tree: """Building tree""" input: - alignment = "results/{group}/{gene}/aligned.fasta" + alignment = lambda wildcards: _alignment(wildcards), output: tree = "results/{group}/{gene}/tree_raw.nwk" benchmark: @@ -58,6 +88,12 @@ def _clock_rate_params(wildcards): else: return "" +def _root_params(wildcards): + outgroup = config['refine'].get('outgroup', '') + if outgroup !="": + return f'{outgroup} --remove-outgroup' + else: + return config['refine']['root'].get(wildcards.group, {}).get(wildcards.gene, config['refine']['root']['default']) rule refine: """ @@ -65,7 +101,7 @@ rule refine: """ input: tree = "results/{group}/{gene}/tree_raw.nwk", - alignment = "results/{group}/{gene}/aligned.fasta", + alignment = lambda wildcards: _alignment(wildcards), metadata = "results/{group}/{gene}/filtered.tsv" output: tree = "results/{group}/{gene}/tree.nwk", @@ -75,7 +111,7 @@ rule refine: log: "logs/{group}/{gene}/refine.txt", params: - root = lambda wildcards: config['refine']['root'].get(wildcards.group, {}).get(wildcards.gene, config['refine']['root']['default']), + root = lambda wildcards: _root_params(wildcards), 
clock_rate_params = lambda wildcards: _clock_rate_params(wildcards), id_field = config['strain_id_field'], shell: From 308ebd2fca33418598360cbebf92122c3d85e881 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 7 Nov 2025 17:21:18 -0500 Subject: [PATCH 10/11] CI: Check a genogroup build instead --- phylogenetic/build-configs/ci/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml index 9d51dec..2ac3ca6 100644 --- a/phylogenetic/build-configs/ci/config.yaml +++ b/phylogenetic/build-configs/ci/config.yaml @@ -2,7 +2,7 @@ # for the CI workflow to run with the example data. # {group} represents different genotypes to be analyzed -groups: ['all'] +groups: ['GII.2'] # {gene} represents the norovirus genes to focus on in each build genes: ['genome', 'VP1'] From d1b6c6be36575125c518d6763c89f476f1351726 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Mon, 10 Nov 2025 14:05:22 -0500 Subject: [PATCH 11/11] WIP: set clock rate still out performs outgroup --- phylogenetic/defaults/all/config.yaml | 5 +++-- phylogenetic/defaults/config.yaml | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/phylogenetic/defaults/all/config.yaml b/phylogenetic/defaults/all/config.yaml index fca5366..6ee640e 100644 --- a/phylogenetic/defaults/all/config.yaml +++ b/phylogenetic/defaults/all/config.yaml @@ -10,5 +10,6 @@ groups: ['all'] # {gene} represents the norovirus genes to focus on in each build genes: ['genome', '3CLpro', 'NTPase', 'p22', 'p48', 'RdRp', 'VP1', 'VP2', 'VPg'] -refine: - outgroup: NC_027026_outgroup \ No newline at end of file +# Timetree still performs better than outgroup +# refine: +# outgroup: NC_027026_outgroup \ No newline at end of file diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 21d9686..b2094e2 100644 --- a/phylogenetic/defaults/config.yaml +++ 
b/phylogenetic/defaults/config.yaml @@ -29,6 +29,33 @@ filter: refine: root: default: best --timetree --date-confidence --clock-filter-iqd 4 --date-inference marginal + # Estimated from auspice build https://github.com/nextstrain/norovirus/issues/22#issuecomment-3221851561 + all: + p48: DQ366347 --timetree --date-confidence --clock-filter-iqd 4 --date-inference marginal + VP2: best --timetree --date-confidence + genome: best --timetree --date-confidence + clock_rate: + all: + genome: 0.00328 + p48: 0.00409 + NTPase: 0.00174 + p22: 0.00174 + VPg: 0.00660 + 3CLpro: 0.00331 + RdRp: 0.00135 + VP1: 0.00136 + VP2: 0.00228 + GII.2: + RdRp: 0.000346 + GII.3: + RdRp: 0.00156 + GII.4: + RdRp: 0.00177 + GII.6: + RdRp: 0.00199 + GII.17: + RdRp: 0.00133 + traits: default: region country ORF1_type ORF2_type VP1_nextclade VP1_group VP1_type VP1_variant RdRp_nextclade RdRp_group RdRp_type RdRp_variant host