Skip to content

Commit c3eff03

Browse files
committed
feat: implement consensus tree building from CID cluster among mlTrees with highest average likelihood
1 parent 3866674 commit c3eff03

File tree

2 files changed

+79
-72
lines changed

2 files changed

+79
-72
lines changed

workflow/rules/common.smk

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,14 +76,13 @@ def get_final_output():
7676
final_output.extend(
7777
expand(
7878
[
79-
"results/trees/{ind}.raxml_ng.max_missing_stable_topology_selection.pdf",
80-
"results/trees/{ind}.{software}.tree_values_across_missingness.pdf",
81-
"results/trees/{ind}.{software}.support_vs_branch_length.full_data.pdf",
82-
"results/trees/{ind}.{software}.support_vs_branch_length.summary.pdf",
83-
"results/trees/{ind}/{model}/max_{max_missing}_missing/{ind}.{model}.max_{max_missing}_missing.{software}.support.{tree_type}.pdf",
79+
"results/trees/{ind}.max_missing_stable_topology_selection.pdf",
80+
"results/trees/{ind}.tree_values_across_missingness.pdf",
81+
"results/trees/{ind}.{tree_type}.support_vs_branch_length.full_data.pdf",
82+
"results/trees/{ind}.{tree_type}.support_vs_branch_length.summary.pdf",
83+
"results/trees/{ind}/{model}/max_{max_missing}_missing/{ind}.{model}.max_{max_missing}_missing.support.{tree_type}.collapsed.pdf",
8484
],
8585
ind=individual,
86-
software=["raxml_ng", "gotree"],
8786
model=config["raxml_ng"]["models"],
8887
max_missing=config["raxml_ng"]["max_missing"],
8988
tree_type=["bestTree", "consensusTree"],

workflow/rules/phylogeny.smk

Lines changed: 74 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -290,27 +290,14 @@ rule raxml_ng_bootstrap:
290290
"2>{log}"
291291

292292

293-
rule gotree_consensus_ml_tree:
294-
input:
295-
ml_trees="results/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.mlTrees",
296-
output:
297-
consensus_tree="results/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.consensusTree",
298-
log:
299-
"logs/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.consensusTree.log",
300-
conda:
301-
"../envs/gotree.yaml"
302-
shell:
303-
"gotree compute consensus -i {input.ml_trees} --freq-min 0.5 -o {output.consensus_tree} 2>{log}"
304-
305-
306293
rule gotree_collapse_bootstrap_trees:
307294
input:
308295
bootstraps="results/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.bootstraps.raxml.bootstraps",
309296
variant_sites="results/raxml_ng/{individual}/input/max_{n_missing_cells}_missing/{individual}.ml_gt_and_likelihoods.filtered_sites.txt",
310297
output:
311-
collapsed="results/gotree/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.bootstraps.raxml.bootstraps",
298+
collapsed="results/gotree/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.bootstraps.raxml.collapsed.bootstraps",
312299
log:
313-
"logs/gotree/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.bootstraps.raxml.bootstraps.log",
300+
"logs/gotree/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.bootstraps.raxml.collapsed.bootstraps.log",
314301
conda:
315302
"../envs/gotree.yaml"
316303
params:
@@ -321,12 +308,12 @@ rule gotree_collapse_bootstrap_trees:
321308

322309
rule gotree_collapse_tree:
323310
input:
324-
best_tree="results/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.{type}",
311+
best_tree="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.{type}",
325312
variant_sites="results/raxml_ng/{individual}/input/max_{n_missing_cells}_missing/{individual}.ml_gt_and_likelihoods.filtered_sites.txt",
326313
output:
327-
collapsed="results/gotree/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.{type}",
314+
collapsed="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.collapsed.{type}",
328315
log:
329-
"logs/gotree/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.{type}.log",
316+
"logs/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.collapsed.{type}.log",
330317
conda:
331318
"../envs/gotree.yaml"
332319
params:
@@ -335,38 +322,20 @@ rule gotree_collapse_tree:
335322
"gotree collapse length --length {params.min_length} --input {input.best_tree} --output {output.collapsed} 2>{log} "
336323

337324

338-
rule gotree_support_collapsed_trees:
339-
input:
340-
ref_tree="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.{tree_type}",
341-
bootstraps="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.bootstraps.raxml.bootstraps",
342-
output:
343-
support="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.support.{tree_type}.raxml.support",
344-
log:
345-
"logs/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.support.{tree_type}.raxml.support.log",
346-
conda:
347-
"../envs/gotree.yaml"
348-
threads: 4
349-
shell:
350-
"( gotree compute support tbe "
351-
" --threads {threads} "
352-
" --bootstrap {input.bootstraps} "
353-
" --reftree {input.ref_tree} "
354-
" --out {output.support} "
355-
") 2>{log}"
356-
357-
358325
rule extract_ml_tree_likelihoods:
359326
input:
360-
log="results/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.log",
327+
log="results/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.{infix}.raxml.log",
361328
output:
362-
tsv="results/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.log_likelihoods.tsv",
329+
tsv="results/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.{infix}.raxml.log_likelihoods.tsv",
363330
log:
364-
"logs/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.log_likelihoods.log",
331+
"logs/raxml_ng/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.{infix}.raxml.log_likelihoods.log",
365332
conda:
366333
"../envs/grep.yaml"
334+
params:
335+
search_string=lambda wc: "Bootstrap tree" if wc.infix == "bootstraps" else "ML tree search"
367336
shell:
368-
'( grep "ML tree search #" {input.log} | '
369-
r"sed -r -e 's/^.*ML tree search #([0-9]+), logLikelihood: (-[0-9]+.[0-9]+)$/\1\t\2/' | "
337+
'( grep "{params.search_string} #" {input.log} | '
338+
r"sed -r -e 's/^.*{params.search_string} #([0-9]+), logLikelihood: (-[0-9]+.[0-9]+)$/\1\t\2/' | "
370339
"sort -n -k 1,1 "
371340
">{output.tsv} "
372341
") 2>{log}"
@@ -401,6 +370,45 @@ rule cluster_info_dist_across_trees:
401370
"../scripts/cluster_info_dist.R"
402371

403372

373+
rule gotree_compute_consensus_tree:
374+
input:
375+
best_cluster_trees="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.mlTrees.best_cluster.best_cluster_trees.newick",
376+
output:
377+
consensus_tree="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.consensusTree",
378+
log:
379+
"logs/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.consensusTree.log",
380+
conda:
381+
"../envs/gotree.yaml"
382+
threads: 4
383+
shell:
384+
"( gotree compute consensus "
385+
" --input {input.best_cluster_trees} "
386+
" --freq-min 0.5 " # this should be the default, but without a min frequency we get an error about this
387+
" --threads {threads} "
388+
" --output {output.consensus_tree} "
389+
") 2>{log}"
390+
391+
392+
rule gotree_support_collapsed_trees:
393+
input:
394+
ref_tree="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.search.raxml.collapsed.{tree_type}",
395+
bootstraps="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.bootstraps.raxml.collapsed.bootstraps",
396+
output:
397+
support="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.support.{tree_type}.raxml.collapsed.support",
398+
log:
399+
"logs/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.support.{tree_type}.raxml.support.log",
400+
conda:
401+
"../envs/gotree.yaml"
402+
threads: 4
403+
shell:
404+
"( gotree compute support tbe "
405+
" --threads {threads} "
406+
" --bootstrap {input.bootstraps} "
407+
" --reftree {input.ref_tree} "
408+
" --out {output.support} "
409+
") 2>{log}"
410+
411+
404412
rule raxml_ng_rfdist_across_trees:
405413
input:
406414
trees=lambda wc: expand(
@@ -558,15 +566,15 @@ rule raxml_ng_bsconverge_to_tsv:
558566
rule concatenate_raxml_ng_tsvs_per_individual_per_metric:
559567
input:
560568
rf_dists=lambda wc: expand(
561-
"results/{{software}}/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.{type}.raxml.{{metric}}.tsv",
569+
"results/raxml_ng/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.{type}.raxml.{{metric}}.tsv",
562570
model=config["raxml_ng"]["models"],
563571
n_missing_cells=config["raxml_ng"]["max_missing"],
564572
type=["startTree", "mlTrees", "bootstraps"] if wc.metric == "rf_dist" else ["bootstraps"],
565573
),
566574
output:
567-
tsv="results/trees/{individual}.{metric}.{software}.max_missing_stable_topology_selection.tsv",
575+
tsv="results/trees/{individual}.{metric}.max_missing_stable_topology_selection.tsv",
568576
log:
569-
"logs/trees/{individual}.{metric}.{software}.max_missing_stable_topology_selection.log",
577+
"logs/trees/{individual}.{metric}.max_missing_stable_topology_selection.log",
570578
conda:
571579
"../envs/xsv.yaml"
572580
shell:
@@ -581,11 +589,11 @@ rule concatenate_raxml_ng_tsvs_per_individual_per_metric:
581589
rule join_raxml_ng_tsvs_per_individual_across_metrics:
582590
input:
583591
bsconverge="results/trees/{individual}.bsconverge.raxml_ng.max_missing_stable_topology_selection.tsv",
584-
rf_dist="results/trees/{individual}.rf_dist.{software}.max_missing_stable_topology_selection.tsv",
592+
rf_dist="results/trees/{individual}.rf_dist.max_missing_stable_topology_selection.tsv",
585593
output:
586-
tsv="results/trees/{individual}.{software}.max_missing_stable_topology_selection.tsv",
594+
tsv="results/trees/{individual}.max_missing_stable_topology_selection.tsv",
587595
log:
588-
"logs/trees/{individual}.{software}.max_missing_stable_topology_selection.join_metrics.log",
596+
"logs/trees/{individual}.max_missing_stable_topology_selection.join_metrics.log",
589597
conda:
590598
"../envs/xsv.yaml"
591599
shell:
@@ -602,11 +610,11 @@ rule join_raxml_ng_tsvs_per_individual_across_metrics:
602610

603611
rule plot_distinct_topologies_across_missingness:
604612
input:
605-
tsv="results/trees/{individual}.{software}.max_missing_stable_topology_selection.tsv",
613+
tsv="results/trees/{individual}.gotree.max_missing_stable_topology_selection.tsv",
606614
output:
607-
plot="results/trees/{individual}.{software}.max_missing_stable_topology_selection.pdf",
615+
plot="results/trees/{individual}.max_missing_stable_topology_selection.pdf",
608616
log:
609-
"logs/trees/{individual}.{software}.max_missing_stable_topology_selection.log"
617+
"logs/trees/{individual}.max_missing_stable_topology_selection.log"
610618
conda:
611619
"../envs/tidyverse.yaml"
612620
script:
@@ -615,12 +623,12 @@ rule plot_distinct_topologies_across_missingness:
615623

616624
rule plot_support_tree:
617625
input:
618-
support="results/gotree/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.support.{tree_type}.raxml.support",
626+
support="results/gotree/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.support.{tree_type}.raxml.collapsed.support",
619627
samples=config["samples"],
620628
output:
621-
support="results/trees/{individual}/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.{software}.support.{tree_type}.pdf",
629+
support="results/trees/{individual}/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.support.{tree_type}.collapsed.pdf",
622630
log:
623-
"logs/trees/{model}/max_{n_missing_cells}_missing/{individual}.{software}.support.{tree_type}.log",
631+
"logs/trees/{model}/max_{n_missing_cells}_missing/{individual}.support.{tree_type}.collapsed.log",
624632
conda:
625633
"../envs/ggtree.yaml"
626634
script:
@@ -630,21 +638,21 @@ rule plot_support_tree:
630638
rule plot_values_across_missingness:
631639
input:
632640
support_trees=expand(
633-
"results/{{software}}/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.support.{tree_type}.raxml.support",
641+
"results/gotree/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.support.{tree_type}.raxml.collapsed.support",
634642
model=lookup("raxml_ng/models", within=config),
635643
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config),
636644
tree_type=["bestTree", "consensusTree"]
637645
),
638646
cluster_info_dist=expand(
639-
"results/{{software}}/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.{type}.all.cluster_info_dist.tsv",
647+
"results/gotree/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.{type}.all.cluster_info_dist.tsv.gz",
640648
model=lookup("raxml_ng/models", within=config),
641649
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config),
642650
type=["startTree", "mlTrees", "bootstraps"],
643651
)
644652
output:
645-
support_plot="results/trees/{individual}.{software}.tree_values_across_missingness.pdf",
653+
support_plot="results/trees/{individual}.tree_values_across_missingness.pdf",
646654
log:
647-
"logs/trees/{individual}.{software}.tree_values_across_missingness_plot.log",
655+
"logs/trees/{individual}.tree_values_across_missingness_plot.log",
648656
conda:
649657
"../envs/ggtree.yaml"
650658
resources:
@@ -656,7 +664,7 @@ rule plot_values_across_missingness:
656664
rule plot_support_values_and_branch_lengths:
657665
input:
658666
support_trees=expand(
659-
"results/{{software}}/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.support.raxml.support",
667+
"results/gotree/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.support.{{tree_type}}.raxml.collapsed.support",
660668
model=lookup("raxml_ng/models", within=config),
661669
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config),
662670
),
@@ -666,13 +674,13 @@ rule plot_support_values_and_branch_lengths:
666674
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config),
667675
),
668676
output:
669-
support_hist="results/trees/{individual}.{software}.support.histogram.pdf",
670-
branch_length_hist="results/trees/{individual}.{software}.branch_length.histogram.pdf",
671-
branch_length_ecdf="results/trees/{individual}.{software}.branch_length.ecdf.pdf",
672-
data_plot="results/trees/{individual}.{software}.support_vs_branch_length.full_data.pdf",
673-
summary_plot="results/trees/{individual}.{software}.support_vs_branch_length.summary.pdf",
677+
support_hist="results/trees/{individual}.{tree_type}.support.histogram.pdf",
678+
branch_length_hist="results/trees/{individual}.{tree_type}.branch_length.histogram.pdf",
679+
branch_length_ecdf="results/trees/{individual}.{tree_type}.branch_length.ecdf.pdf",
680+
data_plot="results/trees/{individual}.{tree_type}.support_vs_branch_length.full_data.pdf",
681+
summary_plot="results/trees/{individual}.{tree_type}.support_vs_branch_length.summary.pdf",
674682
log:
675-
"logs/trees/{individual}.{software}.support_values_vs_branch_lengths.log",
683+
"logs/trees/{individual}.{tree_type}.support_values_vs_branch_lengths.log",
676684
conda:
677685
"../envs/ggtree.yaml"
678686
script:

0 commit comments

Comments
 (0)