diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 4d613d6..7f8102d 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -150,7 +150,7 @@ gene_coverage: id_field: "seqName" coverage: cdsCoverage_field: "cdsCoverage" - genes: "p48,NTPase,p22,VPg,3CLpro,RdRp,VP1,VP2" + genes: "p48,NTPase,p22,VPg,3CLpro,RdRp,VP1,VP2,ORF1,ORF2,ORF3" round_digits: 3 # Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml index 9d51dec..2ac3ca6 100644 --- a/phylogenetic/build-configs/ci/config.yaml +++ b/phylogenetic/build-configs/ci/config.yaml @@ -2,7 +2,7 @@ # for the CI workflow to run with the example data. # {group} represents different genotypes to be analyzed -groups: ['all'] +groups: ['GII.2'] # {gene} represents the norovirus genes to focus on in each build genes: ['genome', 'VP1'] diff --git a/phylogenetic/defaults/GII.4/exclude.txt b/phylogenetic/defaults/GII.4/exclude.txt new file mode 100644 index 0000000..f6cd0a4 --- /dev/null +++ b/phylogenetic/defaults/GII.4/exclude.txt @@ -0,0 +1,13 @@ +KX158286 +JX846924 +MK073894 +KJ407072 +NC_039475 +KC597139 +KJ196286 +MH218591 +KT589391 +MG557657 # Low quality, may break all/VPg +KX158285 # Perhaps too diverged, may break GII.4/genome +OR951134 # False positive GII.4, remove after nextclade tuning +MF373609 # False positive GII.4, remove after nextclade tuning diff --git a/phylogenetic/defaults/all/config.yaml b/phylogenetic/defaults/all/config.yaml index 477e54f..6ee640e 100644 --- a/phylogenetic/defaults/all/config.yaml +++ b/phylogenetic/defaults/all/config.yaml @@ -8,4 +8,8 @@ # {group} represents different genotypes to be analyzed groups: ['all'] # {gene} represents the norovirus genes to focus on in each build -genes: ['3CLpro', 'NTPase', 'p22', 'p48', 'RdRp', 'VP1', 'VP2', 'VPg'] +genes: ['genome', '3CLpro', 'NTPase', 'p22', 'p48', 
'RdRp', 'VP1', 'VP2', 'VPg'] + +# Timetree still performs better than outgroup +# refine: +# outgroup: NC_027026_outgroup \ No newline at end of file diff --git a/phylogenetic/defaults/all/reference.gff3 b/phylogenetic/defaults/all/reference.gff3 index a161a6a..451b6c6 100644 --- a/phylogenetic/defaults/all/reference.gff3 +++ b/phylogenetic/defaults/all/reference.gff3 @@ -14,7 +14,11 @@ NC_039477_REF Genbank gene 3029 3571 . + . ID=gene-3CLpro;Name=3CLpro NC_039477_REF Genbank CDS 3029 3571 . + . ID=cds-3CLpro;Parent=gene-3CLpro;Name=3CLpro NC_039477_REF Genbank gene 3572 5101 . + . ID=gene-RdRp;Name=RdRp NC_039477_REF Genbank CDS 3572 5101 . + . ID=cds-RdRp;Parent=gene-RdRp;Name=RdRp +NC_039477_REF Genbank gene 5085 6707 . + . ID=gene-ORF2;Name=ORF2 +NC_039477_REF Genbank CDS 5085 6707 . + . ID=cds-ORF2;Parent=gene-ORF2;Name=ORF2 NC_039477_REF Genbank gene 5085 6707 . + . ID=gene-VP1;Name=VP1 NC_039477_REF Genbank CDS 5085 6707 . + . ID=cds-VP1;Parent=gene-VP1;Name=VP1 +NC_039477_REF Genbank gene 6707 7513 . + . ID=gene-ORF3;Name=ORF3 +NC_039477_REF Genbank CDS 6707 7513 . + . ID=cds-ORF3;Parent=gene-ORF3;Name=ORF3 NC_039477_REF Genbank gene 6707 7513 . + . ID=gene-VP2;Name=VP2 NC_039477_REF Genbank CDS 6707 7513 . + . ID=cds-VP2;Parent=gene-VP2;Name=VP2 diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index d55d93f..b2094e2 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -6,9 +6,9 @@ # Define wildcards used for building trees in the workflow, also used in the intermediate files or auxillariary default file names. 
# {group} represents different genotypes to be analyzed -groups: ['GII.6', 'GII.4', 'GII.2', 'GII.3', 'GII.17', 'all'] +groups: ['GII.6', 'GII.4', 'GII.2', 'GII.3', 'GII.17'] # {gene} represents the norovirus genes to focus on in each build -genes: ['genome'] +genes: ['ORF1', 'ORF2', 'ORF3', 'RdRp', 'VP1'] inputs: - name: ncbi @@ -20,20 +20,20 @@ strain_id_field: "accession" reference: defaults/{group}/reference.gb filter: - exclude: defaults/exclude.txt + exclude: + default: defaults/exclude.txt + GII.4: defaults/GII.4/exclude.txt min_coverage: 0.8 filter_params: --group-by year VP1_nextclade RdRp_nextclade --sequences-per-group 30 --min-date 1950 --exclude-ambiguous-dates-by year --query-columns is_lab_host:str --exclude-where is_lab_host='true' refine: root: default: best --timetree --date-confidence --clock-filter-iqd 4 --date-inference marginal + # Estimated from auspice build https://github.com/nextstrain/norovirus/issues/22#issuecomment-3221851561 all: p48: DQ366347 --timetree --date-confidence --clock-filter-iqd 4 --date-inference marginal VP2: best --timetree --date-confidence genome: best --timetree --date-confidence - GII.4: - genome: mid_point --timetree --date-confidence - # Estimated from auspice build https://github.com/nextstrain/norovirus/issues/22#issuecomment-3221851561 clock_rate: all: genome: 0.00328 @@ -46,15 +46,16 @@ refine: VP1: 0.00136 VP2: 0.00228 GII.2: - genome: 0.000211 + RdRp: 0.000346 GII.3: - genome: 0.000688 + RdRp: 0.00156 GII.4: - genome: 0.000874 + RdRp: 0.00177 GII.6: - genome: 0.000135 + RdRp: 0.00199 GII.17: - genome: 0.00142 + RdRp: 0.00133 + traits: default: region country ORF1_type ORF2_type VP1_nextclade VP1_group VP1_type VP1_variant RdRp_nextclade RdRp_group RdRp_type RdRp_variant host diff --git a/phylogenetic/defaults/description.md b/phylogenetic/defaults/description.md index 0ed0593..253a3b2 100644 --- a/phylogenetic/defaults/description.md +++ b/phylogenetic/defaults/description.md @@ -1,15 +1,22 @@ We gratefully 
acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata for sharing their work. Please note that although data generators have generously shared data in an open fashion, that does not mean there should be free license to publish on this data. Data generators should be cited where possible and collaborations should be sought in some circumstances. Please try to avoid scooping someone else's work. Reach out if uncertain. -**We maintain 14 views of norovirus evolution:** +**We provide divergence tree views of norovirus evolution across all samples:** | group | genome | p48 | NTPase | p22 | VPg | 3CLpro | RdRp | VP1 | VP2 | |:--|:--|:--|:--|:--|:--|:--|:--|:--|:--| | all | [genome](https://nextstrain.org/norovirus/all/genome) | [p48](https://nextstrain.org/norovirus/all/p48) | [NTPase](https://nextstrain.org/norovirus/all/NTPase) | [p22](https://nextstrain.org/norovirus/all/p22) | [VPg](https://nextstrain.org/norovirus/all/VPg) | [3CLpro](https://nextstrain.org/norovirus/all/3CLpro) | [RdRp](https://nextstrain.org/norovirus/all/RdRp/) | [VP1](https://nextstrain.org/norovirus/all/VP1) | [VP2](https://nextstrain.org/norovirus/all/VP2) | -| GII.2 | [genome](https://nextstrain.org/norovirus/GII.2/genome) | | | | | | | | | -| GII.3 | [genome](https://nextstrain.org/norovirus/GII.3/genome) | | | | | | | | | -| GII.4 | [genome](https://nextstrain.org/norovirus/GII.4/genome) | | | | | | | | | -| GII.6 | [genome](https://nextstrain.org/norovirus/GII.6/genome) | | | | | | | | | -| GII.17 | [genome](https://nextstrain.org/norovirus/GII.17/genome) | | | | | | | | | + +**We provide timetree views by genotype:** + +Since norovirus is known to have recombination breakpoints between ORF1-ORF2 (between the RdRp and VP1 genes) and less frequently between ORF2-ORF3, the timetree views are by breakpoint region. 
+ +| group | ORF1 | ORF2 | ORF3 | RdRp | VP1 | +|:--|:-- |:--|:--|:--|:--| +| GII.2 | [ORF1](https://nextstrain.org/norovirus/GII.2/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.2/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.2/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.2/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.2/VP1/) | +| GII.3 | [ORF1](https://nextstrain.org/norovirus/GII.3/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.3/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.3/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.3/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.3/VP1/) | +| GII.4 | [ORF1](https://nextstrain.org/norovirus/GII.4/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.4/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.4/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.4/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.4/VP1/) | +| GII.6 | [ORF1](https://nextstrain.org/norovirus/GII.6/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.6/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.6/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.6/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.6/VP1/) | +| GII.17 | [ORF1](https://nextstrain.org/norovirus/GII.17/ORF1/) | [ORF2](https://nextstrain.org/norovirus/GII.17/ORF2/) | [ORF3](https://nextstrain.org/norovirus/GII.17/ORF3/) | [RdRp](https://nextstrain.org/norovirus/GII.17/RdRp/) | [VP1](https://nextstrain.org/norovirus/GII.17/VP1/) | #### Nextclade group, type, and variant assignment diff --git a/phylogenetic/defaults/outgroup.fasta b/phylogenetic/defaults/outgroup.fasta new file mode 100644 index 0000000..3ded2b6 --- /dev/null +++ b/phylogenetic/defaults/outgroup.fasta @@ -0,0 +1,2 @@ +>NC_027026_outgroup 
+gtgatcaccttgggatggcttccaagccattccaagttgagtcttgcgacttcatgtttgaagtccatgttctccacatgtgctacctcagggtagcaccgagagaacagtttcttgatgactgtactataagtgcagttaccaaatattatttgcaccacgtgcccctacataatagaaatggcatggactgcgagtttgaggctgaggggttgctgtcccggttattcgggacggccggttccccctcactggattcgcagtccgctttcaaggagcttttcggctttgacaccgatgagcaaatgccattgtcccttgaagaattggccaaattgcaaggcgaaatcaccgcctcgcttcaaatgaacaataaccaattcgttgctaaacatggtaaggccaaggtgcaggcccttttggaccagctcaacacactggtcccccgtgacatcagtgaggctgaaagacgccgtcgagaattctttgagaggcagactgctgctgccttcgctgaactcccaaatgatgacacatttaccgaacaagattggaagtcctattggtatgcaatgtggcggcgagttgtgggtggttgcaaatcatactaccatggtctgccccgttggagctctttcaagacacggctctcgcgtgccacagagcccctccgccaggtgttagcagttgctgcacagacctttgatcaatgtgtacaaactgaccctcgcatcttggctatgaattgtgtgactgccttgaaacccactgttttaaccatgatataccagcaacatcacaacacacccagtgggtggcttgcaacccttacggccctctgggaggttttccaaccttccctccctgcattaggcactttaggtactggtgttgccacaaccttggggctcgtcgtcaacaccttgtcacggttcttccagaaactttgtgcttacatttctgaaacatttcttcccgaatctccaacaaccccaggctgggttgccattgtggctggtgttctgctgttacttttgaaactctcctgcatccccaaagtgttttcccactggagcacgcttctcaagttggccagtggtatcacaacagtcattggagcaacacgagccgtggactggatcatggggaaaattcgcgacgcgcgccattcttccatgtgtaaacagttccttagccgtgtgtctgcactgttggagctccactactcaaagactgtcacaggtgttgcagagaacacagagctcctcaagtgtttcgaccagctcattgatgaaggagaagaattggtctctgaaataggaggtggttctttagccgcaataatacgcagtggtgttgacacactccagagggtttccacagaaatcaaagccacaatccaactggacaacccgcgtccagtgcccgtatgtgtgatattctccggacctccaggtattggaaaaacctctttagcataccacatggccaagggcattggtctcacctccaacttttctttggcaaacgatcatcatgatggatatacaggtaaccctgtagctatatgggatgaatatgatactgacaaggatggcaaatttgtggagcagatgatatccctggtcaacacacagccttgtgtcctcaattgtgaccgaccagaaaataagggcaagctttttacatccaagttcattttctgcactaccaattacacgaccagtgtgctcccagacaatccacgtgctggcgcattctaccgtcgagtcatcactgtagatgtgcgatctcctgaaattgaggattggatggcagcacaccctggtaggtcccctccgaaaactctcttcaaaagcgattgctcccacttgaaattaatggttaggccatatatggggtacaaccctgacggggatactctagctggcaaacgcgtaaagcccaccccaatcaccatagctgggcttcatgacatgattgacaaaaagtttgagga
gcaatcaggggaggtccggggcatttggatcactgtgccccggcgtagtgtgcaaacagcccttgttgctgtcaagaagttttgtgtagcccatcaagcgttgtgccacgtcacatccactccttctcctgaaattctacagtgtgccactttctcgtgtgttgttgtgtcagatgcgcacccgccacctggtgctccacttttgcacatcaagaatgcacatcttgaagtagaccacactggccaagctgtcaccagcatttcagagagcctgcttggtatgtttattacagagcaacgtgtttcatccaagttgcaaagggatatcatgtacaaagtgtggtcaccttttacattaatgcagactgagccattgaatacacagtccctacccccagttaggcgcataatttatgctgacacgcccatggatttcattgggggtttaaggcaccaccttggctttagttcaattcctggtctgtggcgtgcccttaagcatctcccagacacgccctcaatgatagagtggatcactgaccacctttctcaagtgcagtttcccaacaaccccgaatcaaccctatttagaacaggcaatggtgatgtcatattctatacttatggttctttttatgctctaggcacttgtgcccgtgtgcctgtagttagtggtgacacagttagccctctcccaaatgtccccctaaaaatgacttggtttgaaacactgaaggccctttgctcgtcagcattgcgcttgttcacggccatctcaccatttgccatagccgttgctaatgtcacttatctcaccacccgaggatcacgtgaagaacaagccaaaggcaagacaaaacatggtcgtggcgcccgccatgcccgtgggcgatccacagcgcttaatgatgatgagtacaatgagtggatggacttgcgccgcgactggcgtgaagaaatgactgctgacgagttcttacacctaagggatgaagcttatgaaggtatcatcaatgaccgcacccagcggtacaacacctggcttaacctgcgcaacatgcgcttaggcgctggagcgtaccaacatgcaacaatcataggcaagggtggtgtgcgcaatgagatcatacgcacccaagtgctgaaggctcccaggaaaggaaaatggtcacacattgattcttctggccccatgaattattttgatgaggcaccgaccccactagtggaatttgagtgtgatggtgcccatgttgggtggggggtgcaccttgggaacgggcgcgtggtgacagtcacccatgtggccacatcatctaacactgtgaacgggctgccatttaaggtcaaggacaccgacggggagacctgccaggtttatgccactctaggtaacttgccacactaccagcttggggatggtgcgcctgtttactacacaaccaggtaccaccctgtgcttgtgattggtgaagggcaatttgacacacctacaacgactgtcaacggtttccacgttcgcattaccaattcatatccaacaaagaaaggtgattgtggactaccatacttcaatgcactgcgccaggtggttgcccttcatgctgctggcagtacagatgggtcaaccaagttggcgcaacgagttgctgagaaaccgcaaacaggggatgcattcgtctggaagggattgcccgttgttcgtggtagtgatgtcggcggactccccactggcactcgctaccatcgctccccagcttggccagagatgcgatctgacgagactcattcaccagctccttttggttctggtgacaagcggtatgaattctcacaagtggagatgcttgttaacaacttgagaccataccttgaggaagttccaggggtgcctccagcattacttaatagagctattgttcacacccgcaattacctacaatcaataattggcactgaacaaagtgaaccacttacttatgccatggcatcatcaatgt
tggaaaaaggcacctcctgtggtccccacattactggccttaagggtgattactgggatgatgaaacacaacagtacacagggtcacttcgagaacatctggaagcggtctggaacaaggctatgttagggacaccaccaagccacgattacaaattggcactcaaagatgagttaagaccaaatgagaaaaatgaacagggtaagaggcgcctgctgtggggtgctgatgctgggcttactctcgtgtgttgtgctgctctcaaaccagcagccgcacgactgcaaactgttgtgcctatgacacctgttgcagtcggcatcaacatggattctgcacacatagaagttatgaatgagtcccttaaggggcgtgtgctttatgcattagattatagcaagtgggacagcacccaatctgcagctgtgacagcagcctcacttgagatccttgcatccttcatgacaccaactccaattgtctcatcagctattgaggccctgaaggcacccgctagaggcatggtcaacgatgccatcttcatagctcgatccggtttgccatctggcatgccgttcacaagtgtggtcaactccatcaaccacatgctttacatctctgcagctattcttcaggcatacgaagcacataatctcccatatagtgacaatgtttttaacattgaaaccattcacacctatggtgatgattgcctgtacgggttcacccccgcaacagcctctctcatgcaggtaatcatagaaaacctacgctcatatggcctgaaacccacagcagctgataagggcgaaaccattgccccagtgcaaactcctgtttttctcaagcgcacgttcgcaacaacaccacatggactgcgcgctttgcttgacacctcttccattcttaggcagttcttctgggtgaaagcccaacgcacatgtgatgtctattctgctcctacaattgacaccaaatctcgggctgcgcagctcgaggtggcattggcctatgctagtcaacatggtcatgagttcttcaacaaagcacgtgaaatcgctgagaaaacaagtgcagctgaggggtatgttctggtgaataccaactacgaacaagcaactgcatgttacaacagctggtacataggaggcactacaccagaaatgcccgctaccaatgaaggctgcgggctaatagtgtttgagatggagggcaatggctccccacgaggaggtaaccagcctcaatcccacaatggtggcacttcacctgctcaggctgcaccgcctggcacgactggccctgcggaagcaccccttgtacctgttaatcctgaacagcccaattccattgcccaacgcatggagctggctgttgccacaggagcaacaacctcaaatgtccctgaatgtgtgcggaactgctttgctctccttcgtacgattccttggaattctcgacagccccagggatctcttctcacagctgtttctttacaccctgacatcaacccgtacacaaaacatcttgctcaaatgtttgccggatggggaggggcaatggatgtccgtgtcacgatctcaggctcgggcttatttgcagggaagcttgtgtgtgggatacttcctcctggagttaatcccactcttgtcagcgatccgggggttctgccacacgctttggtagatgcccgcgtgactgaaccagcatgcttcaacgtgcctgatgtccgcgccgtcgactaccatcgcactgacggcgatgaggcaactgcaacacttggcatttgggtgctccaaccactaatcaaccccttctccactgaagctgtttcaaccgcttggatttcaattgagaccaaaccaggtggtgattttgatctttgcttgatgaaacctcctaaccaagccatggataacggcacatctccatcgttcctcttgccacgccgtttacaacgttcaagaggcaaccgcgctggtggctatgcagttggaatg
gtcattgtgggctcagcacaccaggtgaaccgccacttcactgctcttggcacaacatttggttggtctaccgcaccttacgaacccatgcgatgtgcttttggtggagtacatcaagggcgtgacaccaacccaaaaattggctactactgggaggttggtgctgaccagcgtggcccgctttttccaaacattgttaatcattggcctgattttgcagtcaacaccaaatacacatggccagatgccgactatataccccatagtgcggttgttggaaccttggtttccttccaggacaatggagatgtttctgaggatcaagtggctactgcatttgcaatttcaatgaacactccaagtgggagcacaactggacgcggcacagtcagagaagcatttgatccatccacaatgcatttggtccgcaccaatggtaccactcaaccatctgggtggccaaccggctcaaacactggaaatggttacttcacaccaatgtggggtcatggtcagggtaatgcaattaatgataagatcaccaacatggagggggctaattatacatttggaggctctggccaaaacaacatagtcttgtgggttgaaagaatcttctcagaccatcctggtaaaaccacactttactcatcacaattggacagcactgcggccattttccagtcaggcccagtcaatatacctgagaacatgatggctgtctacaatgtcaccactaatggggctgactttcaagttggcatccgtcgtgatggctacatggtcacatctggaacaattggtactcagcaggagcttgaccctgacaccactttcacttatgttggacttttccccctttctgcctcattggttggcccacatgggaattctggacgggcccagatagcatggtcatgagctggttggtaggcactttgcaatctcttggtgggctcactgacgttgcgtccaccatttctggcatggtttatcaacatagacatcttgaccaactgaaaagacagaatgatctacaagaacagtggatggcccgcaatgaacaactgcagcgagatgcaatgcaattaactcaggatttggcagtcaatgccccagcgatgagagtgcaggccgctctcaatgctgggtttgatgtggttagtgcgcgccagcttgccggttcaactgagcgaaggatcaacggctatttggatcagccaattcgcactattgatcaggcaatggcagtgcagtctaggggcaatttgacttcgttgtcgaacgcccttgccacttaccaaaaaggcactcaatttggactcaaacaacccaaaggatttaagagcccaattgctgctgaacaatctcgtggtccaaccattacacttggccctcccccaccatctactaatctataaatcaatcttttataaatttgtgcaaatttctttttcttcctcatggtcgcacacgcgttcgggtgcgttgcagtcaattaagcgattgacgccatctttgg diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index d0f8749..933d5b9 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -19,11 +19,41 @@ This part of the workflow usually includes the following steps: See Augur's usage docs for these commands for more details. 
""" +rule add_outgroup: + """Add outgroup""" + input: + alignment = "results/{group}/{gene}/aligned.fasta", + outgroup = "defaults/outgroup.fasta", + output: + alignment_with_outgroup = "results/{group}/{gene}/aligned_with_outgroup.fasta", + log: + "logs/{group}/{gene}/add-outgroup.txt", + benchmark: + "benchmarks/{group}/{gene}/add-outgroup.txt", + shell: + """ + augur align \ + --sequences {input.outgroup} \ + --existing-alignment {input.alignment} \ + --output {output.alignment_with_outgroup} \ + 2>&1 | tee {log} + """ + +def _alignment(wildcards): + """ + Based on if outgroup rooting is specified in the config file, return the needed alignment file + """ + outgroup = config['refine'].get('outgroup', "") + if outgroup != "": + return "results/{group}/{gene}/aligned_with_outgroup.fasta" + else: + return "results/{group}/{gene}/aligned.fasta" + rule tree: """Building tree""" input: - alignment = "results/{group}/{gene}/aligned.fasta" + alignment = lambda wildcards: _alignment(wildcards), output: tree = "results/{group}/{gene}/tree_raw.nwk" benchmark: @@ -52,12 +82,18 @@ def _clock_rate_params(wildcards): else leave blank """ - clock_rate = config['refine']['clock_rate'].get(wildcards.group, {}).get(wildcards.gene, "") + clock_rate = config['refine'].get('clock_rate', {}).get(wildcards.group, {}).get(wildcards.gene, "") if clock_rate !="": return f' --clock-rate {clock_rate} ' else: return "" +def _root_params(wildcards): + outgroup = config['refine'].get('outgroup', '') + if outgroup !="": + return f'{outgroup} --remove-outgroup' + else: + return config['refine']['root'].get(wildcards.group, {}).get(wildcards.gene, config['refine']['root']['default']), rule refine: """ @@ -65,7 +101,7 @@ rule refine: """ input: tree = "results/{group}/{gene}/tree_raw.nwk", - alignment = "results/{group}/{gene}/aligned.fasta", + alignment = lambda wildcards: _alignment(wildcards), metadata = "results/{group}/{gene}/filtered.tsv" output: tree = "results/{group}/{gene}/tree.nwk", @@ 
-75,7 +111,7 @@ rule refine: log: "logs/{group}/{gene}/refine.txt", params: - root = lambda wildcards: config['refine']['root'].get(wildcards.group, {}).get(wildcards.gene, config['refine']['root']['default']), + root = lambda wildcards: _root_params(wildcards), clock_rate_params = lambda wildcards: _clock_rate_params(wildcards), id_field = config['strain_id_field'], shell: @@ -90,6 +126,6 @@ rule refine: --metadata-id-columns {params.id_field} \ --output-tree {output.tree:q} \ --output-node-data {output.node_data:q} \ - --stochastic-resolve \ - {params.clock_rate_params} + {params.clock_rate_params} \ + --stochastic-resolve """ diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 69f35b4..67f8977 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -47,7 +47,7 @@ rule filter: input: sequences = "results/sequences.fasta", metadata = "results/metadata.tsv", - exclude = config['filter']['exclude'] + exclude = lambda wildcards: config['filter']['exclude'].get(wildcards.group, config['filter']['exclude']['default']), output: sequences = "results/{group}/{gene}/filtered.fasta", metadata = "results/{group}/{gene}/filtered.tsv",