From c1a86d2b59a37c29f69502b4237400b2521dbe4f Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 28 Sep 2023 00:20:47 +0200 Subject: [PATCH 1/2] doc: fix typo `variant_classification` -> `variant_classifications` --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4ca48cc..0eb58bb 100644 --- a/README.md +++ b/README.md @@ -54,10 +54,10 @@ To run pipeline for all available data generated by ingest: nextstrain build . ``` -To run the pipeline for specific data provenance, variant classification and geo resolution (e.g. gisaid, nextstrain_clades and global only): +To run the pipeline for specific data provenance, variant classifications and geo resolution (e.g. gisaid, nextstrain_clades and global only): ```bash -nextstrain build . --configfile config/config.yaml --config data_provenances=gisaid variant_classification=nextstrain_clades geo_resolutions=global +nextstrain build . --configfile config/config.yaml --config data_provenances=gisaid variant_classifications=nextstrain_clades geo_resolutions=global ``` ### Optional uploads From 4f876741d720d0cfac3c0d15f1db77adc0980ce8 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 28 Sep 2023 00:33:46 +0200 Subject: [PATCH 2/2] Include more countries, especially some big ones with force include Relax the location_min_seq_days as many important countries have quite some delay but are still informative/important --- config/config.yaml | 8 +++++--- defaults/global_included_locations.txt | 5 +++++ scripts/prepare-data.py | 23 +++++++++++++++++------ workflow/snakemake_rules/prepare_data.smk | 2 ++ 4 files changed, 29 insertions(+), 9 deletions(-) create mode 100644 defaults/global_included_locations.txt diff --git a/config/config.yaml b/config/config.yaml index 1c1e9fb..298d628 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -16,9 +16,10 @@ prepare_data: nextstrain_clades: global: included_days: 150 - location_min_seq: 100 - 
location_min_seq_days: 30 + location_min_seq: 200 + location_min_seq_days: 70 excluded_locations: "defaults/global_excluded_locations.txt" + included_locations: "defaults/global_included_locations.txt" prune_seq_days: 12 clade_min_seq: 5000 clade_min_seq_days: 150 @@ -26,8 +27,9 @@ prepare_data: global: included_days: 150 location_min_seq: 300 - location_min_seq_days: 30 + location_min_seq_days: 100 excluded_locations: "defaults/global_excluded_locations.txt" + included_locations: "defaults/global_included_locations.txt" prune_seq_days: 12 clade_min_seq: 1 clade_min_seq_days: 150 diff --git a/defaults/global_included_locations.txt b/defaults/global_included_locations.txt new file mode 100644 index 0000000..5e3a6ad --- /dev/null +++ b/defaults/global_included_locations.txt @@ -0,0 +1,5 @@ +India +South Africa +Brazil +Malaysia +Thailand diff --git a/scripts/prepare-data.py b/scripts/prepare-data.py index 6e9823b..ab11ae4 100644 --- a/scripts/prepare-data.py +++ b/scripts/prepare-data.py @@ -54,23 +54,25 @@ def positive_int(value): "This is useful to exclude sequence counts for recent days that are overly enriched for variants.") parser.add_argument("--location-min-seq", type=positive_int, default=1, help="The mininum number of sequences a location must have within the " - "days-min-seq to be included in analysis.\n" + "location-min-seq-days to be included in analysis.\n" "(default: %(default)s)") parser.add_argument("--location-min-seq-days", type=positive_int, help="The number of days (counting back from the cutoff date) to use as the date range " "for counting the number of sequences per location to determine if a location is included in analysis.\n" "If not provided, will count sequences from all dates included in analysis date range.") parser.add_argument("--excluded-locations", - help="File with a list locations to exclude from analysis.") + help="File with a list locations to always exclude from analysis.") + parser.add_argument("--included-locations", + 
help="File with a list locations to always include in analysis.") parser.add_argument("--clade-min-seq", type=positive_int, - help="The minimum number of sequences a clades must have to be included as it's own variant.\n" + help="The minimum number of sequences a clades must have to be included as its own variant.\n" "All clades with less than the minimum will be collapsed as 'other'.") parser.add_argument("--clade-min-seq-days", type=positive_int, - help="The number fo days (counting back from the cutoff date) to use as the date range " + help="The number of days (counting back from the cutoff date) to use as the date range " "for counting the number of sequences per clade to determine if a clade is included as its own variant.\n" "If not provided, will count sequences from all dates included in analysis date range.") parser.add_argument("--force-include-clades", nargs="*", - help="Clades to force include in the output regardless of sequences counts. " + + help="Clades to force include in the output regardless of sequence counts. 
" + "Must be formatted as =") parser.add_argument("--output-seq-counts", required=True, help="Path to output TSV file for the prepared variants data.") @@ -131,6 +133,7 @@ def positive_int(value): # Get a set of locations that meet the location_min_seq requirement locations_with_min_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq, 'location']) + locations_with_min_tenth_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq / 10, 'location']) # Load manually annotated excluded locations if provided excluded_locations = set() @@ -140,8 +143,16 @@ def positive_int(value): print(f"Excluding the following requested locations: {sorted(excluded_locations)}.") + # Load manually annotated included locations if provided + included_locations = set() + if args.included_locations: + with open(args.included_locations, 'r') as f: + included_locations = {line.rstrip() for line in f} & locations_with_min_tenth_seq + + print(f"Including the following requested locations: {sorted(included_locations)}.") + # Remove excluded-locations from the set of locations to include in analysis - locations_to_include = locations_with_min_seq - excluded_locations + locations_to_include = locations_with_min_seq - excluded_locations | included_locations print(f"Locations that will be included: {sorted(locations_to_include)}.") assert len(locations_to_include) > 0, \ diff --git a/workflow/snakemake_rules/prepare_data.smk b/workflow/snakemake_rules/prepare_data.smk index d11f547..96200ba 100644 --- a/workflow/snakemake_rules/prepare_data.smk +++ b/workflow/snakemake_rules/prepare_data.smk @@ -59,6 +59,7 @@ rule prepare_clade_data: included_days = lambda wildcards: _get_prepare_data_option(wildcards, 'included_days'), location_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq'), location_min_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq_days'), + 
included_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'included_locations'), excluded_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'excluded_locations'), prune_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'prune_seq_days'), clade_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'clade_min_seq'), @@ -74,6 +75,7 @@ rule prepare_clade_data: {params.location_min_seq} \ {params.location_min_seq_days} \ {params.excluded_locations} \ + {params.included_locations} \ {params.prune_seq_days} \ {params.clade_min_seq} \ {params.clade_min_seq_days} \