Skip to content

Commit 87524d4

Browse files
Merge pull request #54 from nextstrain/bdbv
Phylo workflows for BDBV & SUDV
2 parents 7304f52 + 13e114c commit 87524d4

11 files changed

Lines changed: 916 additions & 2 deletions

File tree

phylogenetic/defaults/lat_longs.tsv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,12 @@ country France 46.2276 2.2137
2828
country Netherlands 52.1326 5.2913
2929

3030
division Bas-Uele 3.626 25.145
31+
division Bundibugyo 0.708 30.062
3132
division Equateur 0.229 18.914
3233
division Haut-Katanga -10.469 27.836
3334
division Haut-Lomami -8.237 25.429
3435
division Haut-Uele 3.345 28.588
36+
division Isiro 2.468 27.293
3537
division Ituri 1.754 29.497
3638
division Kasai -4.947 21.106
3739
division Kasai-Central -6.226 22.489

phylogenetic/scripts/get_year.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
]
4444

4545

46-
def suggest_colors(years):
46+
def suggest_colors(years, fname):
4747
# A categorical scale looks better and helps understand the different outbreaks (IMO)
4848
# cf a continuous scale, although that would be more technically accurate
4949
c = colors[len(years)] # colors is 1-indexed
@@ -57,13 +57,18 @@ def suggest_colors(years):
5757

5858
print(f"Suggested auspice-config colors entry:")
5959
print(json.dumps(config))
60+
if fname:
61+
with open(fname, 'w') as fh:
62+
json.dump(config, fh, indent=2)
6063

6164

6265
if __name__ == "__main__":
6366
parser = argparse.ArgumentParser(description=__doc__)
6467
parser.add_argument("--metadata", required=True, help="Metadata TSV")
6568
parser.add_argument("--id-columns", nargs="+", help="ID columns in Metadata TSV", default=['accession'])
6669
parser.add_argument("--output", required=True, help="Node Data JSON output")
70+
parser.add_argument("--output-config", required=False, help="JSON coloring entry for an auspice-config JSON")
71+
6772
args = parser.parse_args()
6873

6974
m = read_metadata(args.metadata, id_columns=args.id_columns)
@@ -72,7 +77,9 @@ def suggest_colors(years):
7277
json.dump({"nodes": nodes}, fh)
7378

7479
try:
75-
suggest_colors(sorted(set([x['year'] for x in nodes.values()])))
80+
suggest_colors(sorted(set([x['year'] for x in nodes.values()])), args.output_config)
7681
except Exception:
7782
print("Failed to suggest colours for the auspice config")
83+
if args.output_config:
84+
raise Exception()
7885

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Phylogenetic workflows for BDBV and SUDV
2+
3+
_Work in progress!_
4+
5+
First, run a local ingest build, which fetches the data from Pathoplexus:
6+
7+
```sh
8+
# working directory: ingest
9+
snakemake --cores 4 -pf results/{bdbv,sudv}/{sequences.fasta,metadata.tsv}
10+
```
11+
12+
Then run the phylo workflows:
13+
14+
```sh
15+
# working directory: phylogenetic
16+
snakemake --snakefile species-workflows/bdbv.snakefile --cores 4 -pf
17+
18+
snakemake --snakefile species-workflows/sudv.snakefile --cores 4 -pf
19+
```
20+
21+
22+
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
{
2+
"title": "Bundibugyo ebolavirus genomes",
3+
"maintainers": [
4+
{"name": "Eddy Lusamaki, Institut National de Recherche Biomédicale", "url": "https://inrb.net/"},
5+
{"name": "Nextstrain", "url": "http://nextstrain.org"}
6+
],
7+
"data_provenance": [
8+
{
9+
"name": "GenBank",
10+
"url": "https://www.ncbi.nlm.nih.gov/genbank/"
11+
},
12+
{
13+
"name": "Pathoplexus",
14+
"url": "https://pathoplexus.org"
15+
}
16+
],
17+
"build_url": "https://github.com/nextstrain/ebola",
18+
"colorings": [
19+
{
20+
"key": "gt",
21+
"title": "Genotype",
22+
"type": "categorical"
23+
},
24+
{
25+
"key": "date",
26+
"title": "Sampling Date",
27+
"type": "temporal"
28+
},
29+
{
30+
"key": "author",
31+
"title": "Author",
32+
"type":"categorical"
33+
},
34+
{
35+
"key": "country",
36+
"title": "Country",
37+
"type": "categorical"
38+
},
39+
{
40+
"key": "division",
41+
"title": "Division",
42+
"type": "categorical"
43+
},
44+
{
45+
"key": "dataUseTerms",
46+
"title": "Data use terms",
47+
"type": "categorical"
48+
}
49+
],
50+
"geo_resolutions": [
51+
"country",
52+
"division"
53+
],
54+
"display_defaults": {
55+
"tip_label": "strain",
56+
"color_by": "year",
57+
"geo_resolution": "division"
58+
},
59+
"filters": [
60+
"country",
61+
"division",
62+
"author"
63+
],
64+
"metadata_columns": [
65+
"PPX_accession",
66+
"INSDC_accession",
67+
"restrictedUntil",
68+
"strain"
69+
]
70+
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
{
2+
"title": "Sudan ebolavirus genomes",
3+
"maintainers": [
4+
{"name": "Eddy Lusamaki, Institut National de Recherche Biomédicale", "url": "https://inrb.net/"},
5+
{"name": "Nextstrain", "url": "http://nextstrain.org"}
6+
],
7+
"data_provenance": [
8+
{
9+
"name": "GenBank",
10+
"url": "https://www.ncbi.nlm.nih.gov/genbank/"
11+
},
12+
{
13+
"name": "Pathoplexus",
14+
"url": "https://pathoplexus.org"
15+
}
16+
],
17+
"build_url": "https://github.com/nextstrain/ebola",
18+
"colorings": [
19+
{
20+
"key": "gt",
21+
"title": "Genotype",
22+
"type": "categorical"
23+
},
24+
{
25+
"key": "date",
26+
"title": "Sampling Date",
27+
"type": "temporal"
28+
},
29+
{
30+
"key": "author",
31+
"title": "Author",
32+
"type":"categorical"
33+
},
34+
{
35+
"key": "country",
36+
"title": "Country",
37+
"type": "categorical"
38+
},
39+
{
40+
"key": "division",
41+
"title": "Division",
42+
"type": "categorical"
43+
},
44+
{
45+
"key": "dataUseTerms",
46+
"title": "Data use terms",
47+
"type": "categorical"
48+
}
49+
],
50+
"geo_resolutions": [
51+
"country",
52+
"division"
53+
],
54+
"display_defaults": {
55+
"tip_label": "strain",
56+
"color_by": "year",
57+
"geo_resolution": "country"
58+
},
59+
"filters": [
60+
"country",
61+
"division",
62+
"author"
63+
],
64+
"metadata_columns": [
65+
"PPX_accession",
66+
"INSDC_accession",
67+
"restrictedUntil",
68+
"strain"
69+
]
70+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# The bulk of the rules are generic and are located in the (included) generic.snakefile
2+
# It needs inputs of "results/{species}/sequences.fasta" and "results/{species}/metadata.tsv"
3+
# which we provision in this file
4+
5+
from pathlib import Path
6+
REPO = Path(workflow.current_basedir).parent.parent
7+
8+
config['sequences'] = REPO / "ingest/results/{species}/sequences.fasta",
9+
config['metadata'] = REPO / "ingest/results/{species}/metadata.tsv",
10+
config['exclude'] = REPO / "phylogenetic" / "species-workflows" / "exclude_bdbv.txt"
11+
config['id_column'] = "accession"
12+
config['species'] = ['bdbv']
13+
config['qc_min_length'] = 5_000
14+
# config['treetime_args'] = "--timetree --clock-filter-iqd 0 --root best --precision 3 --max-iter 5",
15+
config['treetime_args'] = "--root mid_point"
16+
config['cds'] = ["NP", "VP35", "VP40", "GP", "GP_003", "VP30", "VP24", "L"]
17+
config['id_column'] = "accession"
18+
config['genbank_reference'] = REPO / "shared" / "bdbv" / "reference.gb"
19+
config['fasta_reference'] = REPO / "shared" / "bdbv" / "reference.fasta"
20+
config['gff_annotation'] = REPO / "shared" / "bdbv" / "annotation.gff"
21+
config['nextclade_pathogen_json'] = REPO / "nextclade" / "dataset_files" / "bdbv" / "pathogen.json"
22+
config['warning'] = "This dataset sources RESTRICTED sequences from [Pathoplexus](https://pathoplexus.org/). Please see [virological](https://virological.org/t/initial-genomes-from-may-2026-bundibugyo-virus-disease-outbreak-in-the-democratic-republic-of-the-congo-and-uganda/1032) for more detail on the ongoing outbreak in DRC & Uganda."
23+
24+
# Define an input function so that species can vary which node-data files are generated/used
25+
def node_data_files(wildcards):
26+
return [
27+
"results/{species}/branch_lengths.json",
28+
"results/{species}/muts.json",
29+
"results/{species}/sampling-year.json",
30+
]
31+
32+
include: "generic.snakefile"
33+
34+
rule all:
35+
input:
36+
tree=expand("auspice/ebola_{species}.json", species=config['species']),
37+

phylogenetic/species-workflows/exclude_bdbv.txt

Whitespace-only changes.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# very diverged (>0.3)
2+
PP_000005Q
3+
4+
# SUDV lab passaged strains
5+
PP_00000CA
6+
PP_00000SF
7+
PP_00000TD

0 commit comments

Comments
 (0)