diff --git a/ingest/README.md b/ingest/README.md index 883d299..ae60a59 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -1,7 +1,7 @@ # Ingest -This workflow ingests public data from NCBI and outputs curated metadata and -sequences that can be used as input for the phylogenetic workflow. +This workflow ingests public data from Pathoplexus and outputs curated metadata +and sequences that can be used as input for the phylogenetic workflow. If you have another data source or private data that needs to be formatted for the phylogenetic workflow, then you can use a similar workflow to curate your @@ -25,18 +25,6 @@ This produces the default outputs of the ingest workflow: - metadata = results/metadata_all.tsv - sequences = results/sequences_all.fasta -### Dumping the full raw metadata from NCBI Datasets - -The workflow has a target for dumping the full raw metadata from NCBI Datasets. - -``` -nextstrain build ingest dump_ncbi_dataset_report -``` - -This will produce the file `ingest/data/ncbi_dataset_report_raw.tsv`, -which you can inspect to determine what fields and data to use if you want to -configure the workflow for your pathogen. - ## Defaults The defaults directory contains all of the default configurations for the ingest workflow. diff --git a/ingest/Snakefile b/ingest/Snakefile index 090f793..95216fa 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -10,6 +10,8 @@ rule all: input: sequences="results/sequences.fasta", metadata="results/metadata.tsv", + sequences_open="results/sequences_open.fasta", + metadata_open="results/metadata_open.tsv", # Shared Snakemake files with generic functions are shared across pathogens include: "../shared/vendored/snakemake/config.smk" @@ -18,7 +20,7 @@ include: "../shared/vendored/snakemake/config.smk" # If there are build-specific customizations, they should be added with the # custom_rules imported below to ensure that the core workflow is not complicated # by build-specific rules. -include: "rules/fetch_from_ncbi.smk" +include: "rules/fetch.smk" include: "rules/curate.smk" include: "rules/nextclade.smk" @@ -35,4 +37,4 @@ include: "rules/nextclade.smk" if "custom_rules" in config: for rule_file in config["custom_rules"]: - include: rule_file \ No newline at end of file + include: rule_file diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv index 45ab88a..d454abf 100644 --- a/ingest/defaults/annotations.tsv +++ b/ingest/defaults/annotations.tsv @@ -272,25 +272,25 @@ ON694341 institution Centre for Biological Threats, Highly Pathogenic Viruses, R ON694342 institution Centre for Biological Threats, Highly Pathogenic Viruses, Robert Koch Institute, Germany ON720848 institution Microbial Genomics, Hospital General Universitario Gregorio Marañón, Madrid, Spain ON720849 institution Microbial Genomics, Hospital General Universitario Gregorio Marañón, Madrid, Spain -KT163243 date 1968-XX-XX -AF260968 date 1951-XX-XX -AF260968 region Africa -AF260968 country Egypt -AF260968 host Homo sapians -AF196835 host Phoenicopterus chilensis -AF196835 date 1999-XX-XX -AY765264 date 1997-XX-XX -AY765264 country Czech Republic -AY765264 region Europe -DQ318020 date 1972-XX-XX -DQ318020 host Culex tigripes -D00246 country Australia -D00246 date 1960-XX-XX -EF631122 date XXXX-XX-XX -EF631123 date XXXX-XX-XX -DQ116961 date 2004-XX-XX -AY603654 date 1976-XX-XX -AM404308 date 1971-XX-XX -AF260968 date 1951-XX-XX -AY660002 date 2003-XX-XX -AY268132 date 2000-XX-XX +PP_0001F2D date 1968-XX-XX +PP_000HJBT date 1951-XX-XX +PP_000HJBT region Africa +PP_000HJBT country Egypt +PP_000HJBT host Homo sapians +PP_000HHL9 host Phoenicopterus chilensis +PP_000HHL9 date 1999-XX-XX +PP_000HY01 date 1997-XX-XX +PP_000HY01 country Czech Republic +PP_000HY01 region Europe +PP_000JBDU date 1972-XX-XX +PP_000JBDU host Culex tigripes +PP_000HZ4S country Australia +PP_000HZ4S date 1960-XX-XX +PP_000JSDD date XXXX-XX-XX +PP_000JSEB date XXXX-XX-XX +PP_000J96A date 2004-XX-XX +PP_000HXJZ date 1976-XX-XX +PP_000HQ6X date 1971-XX-XX +PP_000HJBT date 1951-XX-XX +PP_000HXRK date 2003-XX-XX +PP_000HRSP date 2000-XX-XX diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 486503b..158b96a 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -4,70 +4,38 @@ # Define optional config parameters with their default values here so that users # do not have to dig through the workflows to figure out the default values -# Required to fetch from NCBI Datasets -ncbi_taxon_id: "11082" - -# The list of NCBI Datasets fields to include from NCBI Datasets output -# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields -# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields -# Note: the "accession" field MUST be provided to match with the sequences -ncbi_datasets_fields: - - accession - - sourcedb - - isolate-lineage - - geo-region - - geo-location - - isolate-collection-date - - release-date - - update-date - - length - - host-name - - is-lab-host - - isolate-lineage-source - - bioprojects - - biosample-acc - - sra-accs - - submitter-names - - submitter-affiliation +ppx_fetch: + seqs: https://lapis.pathoplexus.org/west-nile/sample/unalignedNucleotideSequences?versionStatus=LATEST_VERSION + meta: https://lapis.pathoplexus.org/west-nile/sample/details?dataFormat=csv&versionStatus=LATEST_VERSION # Config parameters related to the curate pipeline curate: # The path to the local geolocation rules within the pathogen repo # The path should be relative to the ingest directory. local_geolocation_rules: "defaults/geolocation-rules.tsv" - # The original field names should match the ncbi_datasets_fields provided above. # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names field_map: - accession: accession - accession_version: accession_version - sourcedb: database - isolate-lineage: strain - geo-region: region - geo-location: location - isolate-collection-date: date - release-date: date_released - update-date: date_updated - length: length - host-name: host - is-lab-host: is_lab_host - isolate-lineage-source: sample_type - biosample-acc: biosample_accessions - sra-accs: sra_accessions - submitter-names: full_authors - submitter-affiliation: institution - # Standardized strain name regex - # Currently accepts any characters because we do not have a clear standard for strain names across pathogens - strain_regex: "^.+$" - # Back up strain name field to use if "strain" doesn"t match regex above - strain_backup_fields: ["accession"] + accessionVersion: PPX_accession + insdcAccessionFull: INSDC_accession + insdcRawReadsAccession: sra_accession + displayName: strain + geoLocCountry: country + geoLocAdmin1: division + geoLocAdmin2: location + sampleCollectionDate: date + earliestReleaseDate: date_submitted + hostNameCommon: host + isLabHost: is_lab_host + dataUseTermsRestrictedUntil: restrictedUntil + dataUseTermsUrl: dataUseTerms__url + authors: full_authors + authorAffiliations: institution # List of date fields to standardize to ISO format YYYY-MM-DD - date_fields: ["date", "date_released", "date_updated"] + date_fields: ["date", "date_submitted"] # List of expected date formats that are present in the date fields provided above # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"] - # The expected field that contains the GenBank geo_loc_name - genbank_location_field: location titlecase: # List of string fields to titlecase fields: ["region", "country", "division", "location"] @@ -93,16 +61,19 @@ curate: output_id_field: "accession" # The field in the NDJSON record that contains the actual genomic sequence output_sequence_field: "sequence" - # The field in the NDJSON record that contains the actual GenBank accession - genbank_accession: 'accession' + # The field in the NDJSON record that contains the actual Pathoplexus accession + pathoplexus_accession: 'PPX_accession' + # The field in the NDJSON record that contains the actual INSDC accession + insdc_accession: 'INSDC_accession' # The list of metadata columns to keep in the final output of the curation pipeline. metadata_columns: [ 'accession', - #'genbank_accession_rev', + 'PPX_accession', + 'PPX_accession__url', + 'INSDC_accession', + 'INSDC_accession__url', #'strain', - #'strain_s', - #'viruslineage_ids', 'date', #'updated', 'region', @@ -116,15 +87,11 @@ curate: 'is_lab_host', #'date_submitted', #'sra_accession', - #'full_authors', - #'reverse', 'authors', - #'institution', - #'title', - #'journal', - #'publications', - #'paper_url', - 'url', + 'institution', + 'dataUseTerms', + 'dataUseTerms__url', + 'restrictedUntil', 'length', ] @@ -135,5 +102,72 @@ nextclade: pathoplexus: URL: 'https://lapis.pathoplexus.org/west-nile/sample/details' - fields: 'insdcAccessionBase,lineage' - accession_field: 'insdcAccessionBase' + fields: 'accession,lineage' + accession_field: 'accession' + +ppx_metadata_fields: + - "accessionVersion" + - "accession" + - "version" + - "submitter" + - "groupName" + - "submittedDate" + - "releasedDate" + - "dataUseTerms" + - "dataUseTermsRestrictedUntil" + - "dataUseTermsUrl" + - "assemblyReferenceGenomeAccession" + - "authorAffiliations" + - "authors" + - "bioprojectAccession" + - "biosampleAccession" + - "completeness" + - "displayName" + - "earliestReleaseDate" + - "frameShifts" + - "geoLocAdmin1" + - "geoLocAdmin2" + - "geoLocCity" + - "geoLocCountry" + - "geoLocLatitude" + - "geoLocLongitude" + - "geoLocSite" + - "hostAge" + - "hostAgeBin" + - "hostDisease" + - "hostGender" + - "hostHealthOutcome" + - "hostHealthState" + - "hostNameCommon" + - "hostOriginCountry" + - "hostVaccinationStatus" + - "insdcAccessionBase" + - "insdcAccessionFull" + - "insdcRawReadsAccession" + - "insdcVersion" + - "isLabHost" + - "length" + - "ncbiReleaseDate" + - "ncbiSourceDb" + - "ncbiSubmitterCountry" + - "ncbiUpdateDate" + - "ncbiVirusName" + - "ncbiVirusTaxId" + - "purposeOfSampling" + - "purposeOfSequencing" + - "qualityControlDetails" + - "qualityControlDetermination" + - "qualityControlIssues" + - "qualityControlMethodName" + - "qualityControlMethodVersion" + - "sampleCollectionDate" + - "sampleCollectionDateRangeLower" + - "sampleCollectionDateRangeUpper" + - "sampleType" + - "totalAmbiguousNucs" + - "totalDeletedNucs" + - "totalFrameShifts" + - "totalInsertedNucs" + - "totalSnps" + - "totalUnknownNucs" + - "travelHistory" diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index a3f6ba3..361771b 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -1,15 +1,15 @@ """ -This part of the workflow handles transforming the data into standardized -formats and expects input file +This part of the workflow handles the curation of data from Pathoplexus - sequences_ndjson = "data/sequences_{serotype}.ndjson" +REQUIRED INPUTS: -This will produce output files as + sequences_ndjson = data/sequences.ndjson - metadata = "results/metadata_{serotype}.tsv" - sequences = "results/sequences_{serotype}.fasta" +OUTPUTS: + + metadata = data/subset_metadata.tsv + sequences = results/sequences.fasta -Parameters are expected to be defined in `config.curate`. """ @@ -21,7 +21,7 @@ def format_field_map(field_map: dict[str, str]) -> str: rule curate: input: - sequences_ndjson="data/genbank.ndjson", + sequences_ndjson="data/sequences.ndjson", geolocation_rules=config["curate"]["local_geolocation_rules"], annotations=config["curate"]["annotations"], manual_mapping="defaults/host_hostgenus_hosttype_map.tsv", @@ -34,11 +34,8 @@ rule curate: "benchmarks/curate.txt", params: field_map=format_field_map(config["curate"]["field_map"]), - strain_regex=config["curate"]["strain_regex"], - strain_backup_fields=config["curate"]["strain_backup_fields"], date_fields=config["curate"]["date_fields"], expected_date_formats=config["curate"]["expected_date_formats"], - genbank_location_field=config["curate"]["genbank_location_field"], articles=config["curate"]["titlecase"]["articles"], abbreviations=config["curate"]["titlecase"]["abbreviations"], titlecase_fields=config["curate"]["titlecase"]["fields"], @@ -54,14 +51,9 @@ rule curate: | augur curate rename \ --field-map {params.field_map} \ | augur curate normalize-strings \ - | augur curate transform-strain-name \ - --strain-regex {params.strain_regex} \ - --backup-fields {params.strain_backup_fields} \ | augur curate format-dates \ --date-fields {params.date_fields} \ --expected-date-formats {params.expected_date_formats} \ - | augur curate parse-genbank-location \ - --location-field {params.genbank_location_field} \ | augur curate titlecase \ --titlecase-fields {params.titlecase_fields} \ --articles {params.articles} \ @@ -88,23 +80,34 @@ rule curate: --output-id-field {params.id_field} \ --output-seq-field {params.sequence_field} ) 2>> {log} """ -rule add_metadata_columns: +rule add_accession_urls: """Add columns to metadata Notable columns: - - [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'). + - PPX_accession__url: URL linking to the Pathoplexus record. + - INSDC_accession__url: URL linking to the NCBI GenBank record. + - url: URL linking to the NCBI GenBank record (kept for backwards compatibility). """ input: metadata = "data/all_metadata.tsv" output: metadata = temp("data/all_metadata_added.tsv") params: - accession=config['curate']['genbank_accession'] + pathoplexus_accession=config['curate']['pathoplexus_accession'], + pathoplexus_accession_url=config['curate']['pathoplexus_accession'] + "__url", + insdc_accession=config['curate']['insdc_accession'], + insdc_accession_url=config['curate']['insdc_accession'] + "__url", shell: """ - csvtk mutate2 -t \ - -n url \ - -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession}' \ - {input.metadata} \ + cat {input.metadata} \ + | csvtk mutate2 -t \ + -n {params.pathoplexus_accession_url} \ + -e '"https://pathoplexus.org/seq/" + ${params.pathoplexus_accession}' \ + | csvtk mutate2 -t \ + -n {params.insdc_accession_url} \ + -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.insdc_accession}' \ + | csvtk mutate2 -t \ + -n url \ + -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.insdc_accession}' \ > {output.metadata} """ @@ -121,6 +124,30 @@ rule subset_metadata: {input.metadata} > {output.metadata} """ +rule extract_open_data: + input: + metadata = "results/metadata.tsv", + sequences = "results/sequences.fasta" + output: + metadata = "results/metadata_open.tsv", + sequences = "results/sequences_open.fasta" + benchmark: + "benchmarks/extract_open_data.txt" + log: + "logs/extract_open_data.txt" + shell: + r""" + exec &> >(tee {log:q}) + + augur filter \ + --metadata {input.metadata:q} \ + --sequences {input.sequences:q} \ + --metadata-id-columns accession \ + --exclude-where "dataUseTerms=RESTRICTED" \ + --output-metadata {output.metadata:q} \ + --output-sequences {output.sequences:q} + """ + rule compress: input: file="{a_file}", diff --git a/ingest/rules/fetch.smk b/ingest/rules/fetch.smk new file mode 100644 index 0000000..5d9ce5a --- /dev/null +++ b/ingest/rules/fetch.smk @@ -0,0 +1,68 @@ +""" +This part of the workflow handles fetching sequences and metadata from Pathoplexus. + +REQUIRED INPUTS: + + None + +OUTPUTS: + + ndjson = data/sequences.ndjson + +""" +workflow.global_resources.setdefault("concurrent_deploys", 2) + +rule download_ppx_seqs: + output: + sequences= "data/ppx_sequences.fasta", + params: + sequences_url=config["ppx_fetch"]["seqs"], + # Allow retries in case of network errors + retries: 5 + benchmark: + "benchmarks/download_ppx_seqs.txt" + log: + "logs/download_ppx_seqs.txt" + shell: + """ + curl {params.sequences_url} -o {output.sequences} + """ + +rule download_ppx_meta: + output: + metadata= "data/ppx_metadata.csv" + params: + metadata_url=config["ppx_fetch"]["meta"], + fields = ",".join(config["ppx_metadata_fields"]) + # Allow retries in case of network errors + retries: 5 + benchmark: + "benchmarks/download_ppx_meta.txt" + log: + "logs/download_ppx_meta.txt" + shell: + """ + curl '{params.metadata_url}&fields={params.fields}' -o {output.metadata} + """ + +rule format_ppx_ndjson: + input: + sequences = "data/ppx_sequences.fasta", + metadata = "data/ppx_metadata.csv", + output: + ndjson = "data/sequences.ndjson", + log: + "logs/format_ppx_ndjson.txt" + benchmark: + "benchmarks/format_ppx_ndjson.txt" + shell: + """ + augur curate passthru \ + --metadata {input.metadata} \ + --fasta {input.sequences} \ + --seq-id-column accessionVersion \ + --seq-field sequence \ + --unmatched-reporting warn \ + --duplicate-reporting warn \ + 2> {log} > {output.ndjson} + """ diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk deleted file mode 100644 index d1d4b90..0000000 --- a/ingest/rules/fetch_from_ncbi.smk +++ /dev/null @@ -1,104 +0,0 @@ -""" -This part of the workflow handles fetching sequences from various sources. -Uses `config.sources` to determine which sequences to include in final output. - -Currently only fetches sequences from GenBank, but other sources can be -defined in the config. If adding other sources, add a new rule upstream -of rule `fetch_all_sequences` to create the file `data/{source}.ndjson` or the -file must exist as a static file in the repo. - -Produces final output as - - sequences_ndjson = "data/sequences.ndjson" - -""" -workflow.global_resources.setdefault("concurrent_deploys", 2) - -rule fetch_ncbi_dataset_package: - output: - dataset_package = temp("data/ncbi_dataset.zip") - retries: 5 # Requires snakemake 7.7.0 or later - log: - "logs/fetch_ncbi_dataset_package.txt" - benchmark: - "benchmarks/fetch_ncbi_dataset_package.txt" - params: - ncbi_taxon_id = config["ncbi_taxon_id"] - shell: - """ - datasets download virus genome taxon {params.ncbi_taxon_id} \ - --no-progressbar \ - --filename {output.dataset_package} 2>&1 | tee {log} - """ - -# Note: This rule is not part of the default workflow! -# It is intended to be used as a specific target for users to be able -# to inspect and explore the full raw metadata from NCBI Datasets. -rule dump_ncbi_dataset_report: - input: - dataset_package="data/ncbi_dataset.zip", - output: - ncbi_dataset_tsv="data/ncbi_dataset_report_raw.tsv", - shell: - """ - dataformat tsv virus-genome \ - --package {input.dataset_package} > {output.ncbi_dataset_tsv} - """ - -rule extract_ncbi_dataset_sequences: - input: - dataset_package = "data/ncbi_dataset.zip" - output: - ncbi_dataset_sequences = temp("data/ncbi_dataset_sequences.fasta") - benchmark: - "benchmarks/extract_ncbi_dataset_sequences.txt" - shell: - """ - unzip -jp {input.dataset_package} \ - ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences} - """ - -rule format_ncbi_dataset_report: - input: - dataset_package = "data/ncbi_dataset.zip", - output: - ncbi_dataset_tsv = temp("data/ncbi_dataset_report.tsv") - params: - ncbi_dataset_fields = ",".join(config["ncbi_datasets_fields"]), - benchmark: - "benchmarks/format_ncbi_dataset_report.txt" - shell: - """ - dataformat tsv virus-genome \ - --package {input.dataset_package} \ - --fields {params.ncbi_dataset_fields:q} \ - --elide-header \ - | csvtk fix-quotes -Ht \ - | csvtk add-header -t -n {params.ncbi_dataset_fields} \ - | csvtk rename -t -f accession -n accession_version \ - | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." --at 1 \ - > {output.ncbi_dataset_tsv} - """ - - -rule format_ncbi_datasets_ndjson: - input: - ncbi_dataset_sequences = "data/ncbi_dataset_sequences.fasta", - ncbi_dataset_tsv = "data/ncbi_dataset_report.tsv", - output: - ndjson = "data/genbank.ndjson", - log: - "logs/format_ncbi_datasets_ndjson.txt" - benchmark: - "benchmarks/format_ncbi_datasets_ndjson.txt" - shell: - """ - augur curate passthru \ - --metadata {input.ncbi_dataset_tsv} \ - --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column accession_version \ - --seq-field sequence \ - --unmatched-reporting warn \ - --duplicate-reporting warn \ - 2> {log} > {output.ndjson} - """ diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 982e32c..7f6a308 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -3,16 +3,17 @@ This part of the workflow handles running Nextclade on the curated metadata and sequences. REQUIRED INPUTS: metadata = data/subset_metadata.tsv - sequences = data/sequences_all.fasta - nextclade_datasets = ../nextclade/dataset + sequences = results/sequences.fasta + dataset = (from config) OUTPUTS: - metadata = data/metadata_all.tsv - nextclade = data/nextclade_clades.tsv + metadata = results/metadata.tsv See Nextclade docs for more details on usage, inputs, and outputs if you would like to customize the rules: https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html """ +# TODO: This separate fetch should not be necessary - 'lineage' can be added +# to data/subset_metadata.tsv. rule pathoplexus_classify: """ Pulls global lineage calls from Pathoplexus API @@ -26,7 +27,7 @@ rule pathoplexus_classify: id_field=config["curate"]["output_id_field"], shell: r""" - curl "{params.URL}?dataFormat=TSV&downloadAsFile=false&fields={params.fields}" \ + curl "{params.URL}?versionStatus=LATEST_VERSION&dataFormat=TSV&downloadAsFile=false&fields={params.fields}" \ | tsv-filter -H --not-empty {params.accession_field} \ | uniq \ | csvtk -t rename -f {params.accession_field} -n {params.id_field} \ diff --git a/phylogenetic/defaults/all-lineages/auspice_config.json b/phylogenetic/defaults/all-lineages/auspice_config.json index d406200..269c6e4 100644 --- a/phylogenetic/defaults/all-lineages/auspice_config.json +++ b/phylogenetic/defaults/all-lineages/auspice_config.json @@ -1,6 +1,10 @@ { "title": "Genomic epidemiology of West Nile Virus", "data_provenance": [ + { + "name": "Pathoplexus", + "url": "https://pathoplexus.org" + }, { "name": "GenBank", "url": "https://www.ncbi.nlm.nih.gov/genbank/" @@ -16,6 +20,7 @@ {"key": "lineage", "title": "Lineage", "type": "categorical"}, {"key": "clade_membership", "title": "Clade", "type": "categorical"}, {"key": "author", "title": "Authors", "type": "categorical"}, + {"key": "dataUseTerms", "title": "Data use terms", "type": "categorical"}, {"key": "host", "title": "Host Species", "type": "categorical"}, {"key": "host_genus", "title": "Host Genus", "type": "categorical"}, {"key": "host_type", "title": "Host Type", "type": "categorical"} @@ -52,9 +57,10 @@ "geo_resolution": "country" }, "metadata_columns": [ - "accession", + "PPX_accession", + "INSDC_accession", "division", - "url" + "restrictedUntil" ], "extensions": { "nextclade": { diff --git a/phylogenetic/defaults/all-lineages/include.txt b/phylogenetic/defaults/all-lineages/include.txt index 1e92593..6471a7f 100644 --- a/phylogenetic/defaults/all-lineages/include.txt +++ b/phylogenetic/defaults/all-lineages/include.txt @@ -1,91 +1,91 @@ -AF260968 # Egypt 1951 all-lineages reference -NC_001563 # Lineage 2 reference -NC_009942 # Lineage 1 reference -HM051416 # Isreal 1953 -GQ851607 # Nigeria 1965 -GQ851606 # Senegal 1979 -AF481864 # pre-NY -MH166901 # NY99 -MH166903 # NY99 -MH166904 # NY99 -KX547395 # NY99 -KX547519 # NY99 -KX547602 # NY99 -HM488130 # NY99 -HM488132 # NY99 -HQ671707 # NY99 -AF202541 # NY99 -AF206518 # NY99 -HM488127 # NY99 -HM488126 # NY99 -KX547410 # WN02 -KJ501434 # WN02 -KX547456 # WN02 -KY216155 # WN02 -KX547460 # WN02 -MF175829 # WN02 -KX547482 # WN02 -MF175827 # WN02 -MF175839 # WN02 -KT020853 # WN02 -KX547548 # WN02 -MF175863 # WN02 -KX547286 # WN02 -MF175873 # WN02 -MF175865 # WN02 -MF175831 # WN02 -MF175858 # WN02 -KJ501117 # SW03 -KJ501120 # SW03 -MF175815 # SW03 -MG004533 # SW03 -KF704147 # SW03 -KF704153 # SW03 -KR348940 # SW03 -KR348937 # SW03 -KX547361 # SW03 -JX015523 # SW03 -KR348944 # SW03 -KJ501124 # SW03 -KX547552 # SW03 -KJ145829 # SW03 -KR348981 # SW03 -KJ501118 # SW03 -KR348938 # SW03 -KR348976 # SW03 -KJ501170 # SW03 -KR348993 # SW03 -JQ700438 # SW03 -KR348977 # SW03 -KR348942 # SW03 -KR348941 # SW03 -KJ501121 # SW03 -KJ501122 # SW03 -KX547375 # SW03 -KM012172 # SW03 -KC333375 # SW03 -KJ501222 # SW03 -MG004537 # SW03 -MF175866 # SW03 -MG004540 # SW03 -MW383507 # Lineage 2 -HM147822 # Lineage 2 -GQ903680 # Lineage 2 -DQ176636 # Lineage 2 -KU978767 # Lineage 2 -HM147823 # Lineage 2 -PP445212 # Lineage 3 -AY765264 # Lineage 3 -AY277251 # Lineage 4 -FJ159131 # Lineage 4 -FJ159129 # Lineage 4 -FJ159130 # Lineage 4 -KJ831223 # Lineage 4 -KU978770 # Lineage 5 -DQ256376 # Lineage 5 -JX041632 # Lineage 5 -GQ851604 # Lineage 5 -GQ851605 # Lineage 5 -KY703855 # Lineage 7 -OP846972 # Lineage 7 -KY703856 # Lineage 8 +PP_000HJBT # Egypt 1951 all-lineages reference +PP_0003ASZ # Lineage 2 reference +PP_0003ATX # Lineage 1 reference +PP_0008AWF # Isreal 1953 +PP_000K976 # Nigeria 1965 +PP_000K968 # Senegal 1979 +PP_000HP18 # pre-NY +PP_0002EDQ # NY99 +PP_0002EFL # NY99 +PP_0002EGJ # NY99 +PP_0001RJ4 # NY99 +PP_0001V6R # NY99 +PP_0001XMS # NY99 +PP_0008D7R # NY99 +PP_0008D9M # NY99 +PP_0008M3R # NY99 +PP_000HHM7 # NY99 +PP_000HHXM # NY99 +PP_0008D4X # NY99 +PP_0008D3Z # NY99 +PP_0001RZ8 # WN02 +PP_00012ZX # WN02 +PP_0001TBH # WN02 +PP_0001Z6M # WN02 +PP_0001TF9 # WN02 +PP_0002AES # WN02 +PP_0001U3Y # WN02 +PP_0002ACW # WN02 +PP_0002AQ5 # WN02 +PP_0001F1F # WN02 +PP_0001W10 # WN02 +PP_0002BER # WN02 +PP_0001NBN # WN02 +PP_0002BQ4 # WN02 +PP_0002BGM # WN02 +PP_0002AGN # WN02 +PP_0002B91 # WN02 +PP_0000T23 # SW03 +PP_0000T6V # SW03 +PP_0002A0L # SW03 +PP_0002DFM # SW03 +PP_0000Q26 # SW03 +PP_0000Q8U # SW03 +PP_0001C3E # SW03 +PP_0001BZN # SW03 +PP_0001QJ5 # SW03 +PP_0000FKE # SW03 +PP_0001C76 # SW03 +PP_0000TAM # SW03 +PP_0001W5S # SW03 +PP_0000RRS # SW03 +PP_0001DFP # SW03 +PP_0000T31 # SW03 +PP_0001C0L # SW03 +PP_0001DAZ # SW03 +PP_0000UWD # SW03 +PP_0001DVU # SW03 +PP_0000DT0 # SW03 +PP_0001DBX # SW03 +PP_0001C5A # SW03 +PP_0001C4C # SW03 +PP_0000T7T # SW03 +PP_0000T8R # SW03 +PP_0001QYB # SW03 +PP_00017WY # SW03 +PP_0000HXN # SW03 +PP_0000WPR # SW03 +PP_0002DLA # SW03 +PP_0002BHJ # SW03 +PP_0002DP4 # SW03 +PP_000370M # Lineage 2 +PP_0008CDE # Lineage 2 +PP_000K9BY # Lineage 2 +PP_000JB76 # Lineage 2 +PP_0001H9X # Lineage 2 +PP_0008CEC # Lineage 2 +PP_000RH4S # Lineage 3 +PP_000HY01 # Lineage 3 +PP_000HRWF # Lineage 4 +PP_000JWG3 # Lineage 4 +PP_000JWE7 # Lineage 4 +PP_000JWF5 # Lineage 4 +PP_00017EX # Lineage 4 +PP_0001HCR # Lineage 5 +PP_000JBA0 # Lineage 5 +PP_0000FR2 # Lineage 5 +PP_000K94C # Lineage 5 +PP_000K95A # Lineage 5 +PP_0001ZMQ # Lineage 7 +PP_0003L7U # Lineage 7 +PP_0001ZNN # Lineage 8 diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index b50e603..0d08cd4 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -56,7 +56,7 @@ build_params: lineage-1A: reference: "defaults/lineage-1A/reference.gb" - root: "KX394399" + root: "PP_0001JCQ" subsample: samples: diff --git a/phylogenetic/defaults/lineage-1A/auspice_config.json b/phylogenetic/defaults/lineage-1A/auspice_config.json index 945cdff..a0f3d12 100644 --- a/phylogenetic/defaults/lineage-1A/auspice_config.json +++ b/phylogenetic/defaults/lineage-1A/auspice_config.json @@ -1,6 +1,10 @@ { "title": "Genomic epidemiology of West Nile Virus lineage 1A", "data_provenance": [ + { + "name": "Pathoplexus", + "url": "https://pathoplexus.org" + }, { "name": "GenBank", "url": "https://www.ncbi.nlm.nih.gov/genbank/" @@ -16,6 +20,7 @@ {"key": "lineage", "title": "Lineage", "type": "categorical"}, {"key": "clade_membership", "title": "Clade", "type": "categorical"}, {"key": "author", "title": "Authors", "type": "categorical"}, + {"key": "dataUseTerms", "title": "Data use terms", "type": "categorical"}, {"key": "host", "title": "Host Species", "type": "categorical"}, {"key": "host_genus", "title": "Host Genus", "type": "categorical"}, {"key": "host_type", "title": "Host Type", "type": "categorical"} @@ -53,9 +58,10 @@ "distance_measure": "div" }, "metadata_columns": [ - "accession", + "PPX_accession", + "INSDC_accession", "division", - "url" + "restrictedUntil" ], "extensions": { "nextclade": { diff --git a/phylogenetic/defaults/lineage-1A/include.txt b/phylogenetic/defaults/lineage-1A/include.txt index 2bc4e0b..f4634a9 100644 --- a/phylogenetic/defaults/lineage-1A/include.txt +++ b/phylogenetic/defaults/lineage-1A/include.txt @@ -1,66 +1,66 @@ -KX394399 # Lineage 1B outgroup -NC_009942 # Lineage 1 reference -AF481864 # pre-NY -MH166901 # NY99 -MH166903 # NY99 -MH166904 # NY99 -KX547395 # NY99 -KX547519 # NY99 -KX547602 # NY99 -HM488130 # NY99 -HM488132 # NY99 -HQ671707 # NY99 -AF202541 # NY99 -AF206518 # NY99 -HM488127 # NY99 -HM488126 # NY99 -KX547410 # WN02 -KJ501434 # WN02 -KX547456 # WN02 -KY216155 # WN02 -KX547460 # WN02 -MF175829 # WN02 -KX547482 # WN02 -MF175827 # WN02 -MF175839 # WN02 -KT020853 # WN02 -KX547548 # WN02 -MF175863 # WN02 -KX547286 # WN02 -MF175873 # WN02 -MF175865 # WN02 -MF175831 # WN02 -MF175858 # WN02 -KJ501117 # SW03 -KJ501120 # SW03 -MF175815 # SW03 -MG004533 # SW03 -KF704147 # SW03 -KF704153 # SW03 -KR348940 # SW03 -KR348937 # SW03 -KX547361 # SW03 -JX015523 # SW03 -KR348944 # SW03 -KJ501124 # SW03 -KX547552 # SW03 -KJ145829 # SW03 -KR348981 # SW03 -KJ501118 # SW03 -KR348938 # SW03 -KR348976 # SW03 -KJ501170 # SW03 -KR348993 # SW03 -JQ700438 # SW03 -KR348977 # SW03 -KR348942 # SW03 -KR348941 # SW03 -KJ501121 # SW03 -KJ501122 # SW03 -KX547375 # SW03 -KM012172 # SW03 -KC333375 # SW03 -KJ501222 # SW03 -MG004537 # SW03 -MF175866 # SW03 -MG004540 # SW03 +PP_0001JCQ # Lineage 1B outgroup +PP_0003ATX # Lineage 1 reference +PP_000HP18 # pre-NY +PP_0002EDQ # NY99 +PP_0002EFL # NY99 +PP_0002EGJ # NY99 +PP_0001RJ4 # NY99 +PP_0001V6R # NY99 +PP_0001XMS # NY99 +PP_0008D7R # NY99 +PP_0008D9M # NY99 +PP_0008M3R # NY99 +PP_000HHM7 # NY99 +PP_000HHXM # NY99 +PP_0008D4X # NY99 +PP_0008D3Z # NY99 +PP_0001RZ8 # WN02 +PP_00012ZX # WN02 +PP_0001TBH # WN02 +PP_0001Z6M # WN02 +PP_0001TF9 # WN02 +PP_0002AES # WN02 +PP_0001U3Y # WN02 +PP_0002ACW # WN02 +PP_0002AQ5 # WN02 +PP_0001F1F # WN02 +PP_0001W10 # WN02 +PP_0002BER # WN02 +PP_0001NBN # WN02 +PP_0002BQ4 # WN02 +PP_0002BGM # WN02 +PP_0002AGN # WN02 +PP_0002B91 # WN02 +PP_0000T23 # SW03 +PP_0000T6V # SW03 +PP_0002A0L # SW03 +PP_0002DFM # SW03 +PP_0000Q26 # SW03 +PP_0000Q8U # SW03 +PP_0001C3E # SW03 +PP_0001BZN # SW03 +PP_0001QJ5 # SW03 +PP_0000FKE # SW03 +PP_0001C76 # SW03 +PP_0000TAM # SW03 +PP_0001W5S # SW03 +PP_0000RRS # SW03 +PP_0001DFP # SW03 +PP_0000T31 # SW03 +PP_0001C0L # SW03 +PP_0001DAZ # SW03 +PP_0000UWD # SW03 +PP_0001DVU # SW03 +PP_0000DT0 # SW03 +PP_0001DBX # SW03 +PP_0001C5A # SW03 +PP_0001C4C # SW03 +PP_0000T7T # SW03 +PP_0000T8R # SW03 +PP_0001QYB # SW03 +PP_00017WY # SW03 +PP_0000HXN # SW03 +PP_0000WPR # SW03 +PP_0002DLA # SW03 +PP_0002BHJ # SW03 +PP_0002DP4 # SW03 diff --git a/phylogenetic/defaults/lineage-2/auspice_config.json b/phylogenetic/defaults/lineage-2/auspice_config.json index 3453014..4d2e362 100644 --- a/phylogenetic/defaults/lineage-2/auspice_config.json +++ b/phylogenetic/defaults/lineage-2/auspice_config.json @@ -1,6 +1,10 @@ { "title": "Genomic epidemiology of West Nile Virus lineage 2", "data_provenance": [ + { + "name": "Pathoplexus", + "url": "https://pathoplexus.org" + }, { "name": "GenBank", "url": "https://www.ncbi.nlm.nih.gov/genbank/" @@ -16,6 +20,7 @@ {"key": "lineage", "title": "Lineage", "type": "categorical"}, {"key": "clade_membership", "title": "Clade", "type": "categorical"}, {"key": "author", "title": "Authors", "type": "categorical"}, + {"key": "dataUseTerms", "title": "Data use terms", "type": "categorical"}, {"key": "host", "title": "Host Species", "type": "categorical"}, {"key": "host_genus", "title": "Host Genus", "type": "categorical"}, {"key": "host_type", "title": "Host Type", "type": "categorical"} @@ -53,9 +58,10 @@ "distance_measure": "div" }, "metadata_columns": [ - "accession", + "PPX_accession", + "INSDC_accession", "division", - "url" + "restrictedUntil" ], "extensions": { "nextclade": { diff --git a/phylogenetic/defaults/lineage-2/include.txt b/phylogenetic/defaults/lineage-2/include.txt index e33db5b..23924ce 100644 --- a/phylogenetic/defaults/lineage-2/include.txt +++ b/phylogenetic/defaults/lineage-2/include.txt @@ -1,7 +1,7 @@ -NC_001563 # Lineage 2 reference -MW383507 # Lineage 2 -HM147822 # Lineage 2 -GQ903680 # Lineage 2 -DQ176636 # Lineage 2 -KU978767 # Lineage 2 -HM147823 # Lineage 2 +PP_0003ASZ # Lineage 2 reference +PP_000370M # Lineage 2 +PP_0008CDE # Lineage 2 +PP_000K9BY # Lineage 2 +PP_000JB76 # Lineage 2 +PP_0001H9X # Lineage 2 +PP_0008CEC # Lineage 2