From c1a86d2b59a37c29f69502b4237400b2521dbe4f Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Thu, 28 Sep 2023 00:20:47 +0200
Subject: [PATCH 1/2] doc: fix typo `variant_classification` ->
 `variant_classifications`

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4ca48cc..0eb58bb 100644
--- a/README.md
+++ b/README.md
@@ -54,10 +54,10 @@ To run pipeline for all available data generated by ingest:
 nextstrain build .
 ```
 
-To run the pipeline for specific data provenance, variant classification and geo resolution (e.g. gisaid, nextstrain_clades and global only):
+To run the pipeline for specific data provenances, variant classifications and geo resolutions (e.g. gisaid, nextstrain_clades and global only):
 
 ```bash
-nextstrain build . --configfile config/config.yaml --config data_provenances=gisaid variant_classification=nextstrain_clades geo_resolutions=global
+nextstrain build . --configfile config/config.yaml --config data_provenances=gisaid variant_classifications=nextstrain_clades geo_resolutions=global
 ```
 
 ### Optional uploads

From 4f876741d720d0cfac3c0d15f1db77adc0980ce8 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Thu, 28 Sep 2023 00:33:46 +0200
Subject: [PATCH 2/2] Include more countries, especially some big ones with
 force include

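Relax location_min_seq_days, as many important countries have considerable delay
but are still informative/important.

Force-included locations must still have at least a tenth of location_min_seq
sequences to be included.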
---
 config/config.yaml                        |  8 +++++---
 defaults/global_included_locations.txt    |  5 +++++
 scripts/prepare-data.py                   | 23 +++++++++++++++++------
 workflow/snakemake_rules/prepare_data.smk |  2 ++
 4 files changed, 29 insertions(+), 9 deletions(-)
 create mode 100644 defaults/global_included_locations.txt

diff --git a/config/config.yaml b/config/config.yaml
index 1c1e9fb..298d628 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -16,9 +16,10 @@ prepare_data:
     nextstrain_clades:
       global:
         included_days: 150
-        location_min_seq: 100
-        location_min_seq_days: 30
+        location_min_seq: 200
+        location_min_seq_days: 70
         excluded_locations: "defaults/global_excluded_locations.txt"
+        included_locations: "defaults/global_included_locations.txt"
         prune_seq_days: 12
         clade_min_seq: 5000
         clade_min_seq_days: 150
@@ -26,8 +27,9 @@ prepare_data:
       global:
         included_days: 150
         location_min_seq: 300
-        location_min_seq_days: 30
+        location_min_seq_days: 100
         excluded_locations: "defaults/global_excluded_locations.txt"
+        included_locations: "defaults/global_included_locations.txt"
         prune_seq_days: 12
         clade_min_seq: 1
         clade_min_seq_days: 150
diff --git a/defaults/global_included_locations.txt b/defaults/global_included_locations.txt
new file mode 100644
index 0000000..5e3a6ad
--- /dev/null
+++ b/defaults/global_included_locations.txt
@@ -0,0 +1,5 @@
+India
+South Africa
+Brazil
+Malaysia
+Thailand
diff --git a/scripts/prepare-data.py b/scripts/prepare-data.py
index 6e9823b..ab11ae4 100644
--- a/scripts/prepare-data.py
+++ b/scripts/prepare-data.py
@@ -54,23 +54,25 @@ def positive_int(value):
              "This is useful to exclude sequence counts for recent days that are overly enriched for variants.")
     parser.add_argument("--location-min-seq", type=positive_int, default=1,
         help="The mininum number of sequences a location must have within the "
-             "days-min-seq to be included in analysis.\n"
+             "location-min-seq-days to be included in analysis.\n"
              "(default: %(default)s)")
     parser.add_argument("--location-min-seq-days", type=positive_int,
         help="The number of days (counting back from the cutoff date) to use as the date range "
              "for counting the number of sequences per location to determine if a location is included in analysis.\n"
              "If not provided, will count sequences from all dates included in analysis date range.")
     parser.add_argument("--excluded-locations",
-        help="File with a list locations to exclude from analysis.")
+        help="File with a list locations to always exclude from analysis.")
+    parser.add_argument("--included-locations",
+        help="File with a list locations to always include in analysis.")
     parser.add_argument("--clade-min-seq", type=positive_int,
-        help="The minimum number of sequences a clades must have to be included as it's own variant.\n"
+        help="The minimum number of sequences a clades must have to be included as its own variant.\n"
              "All clades with less than the minimum will be collapsed as 'other'.")
     parser.add_argument("--clade-min-seq-days", type=positive_int,
-        help="The number fo days (counting back from the cutoff date) to use as the date range "
+        help="The number of days (counting back from the cutoff date) to use as the date range "
              "for counting the number of sequences per clade to determine if a clade is included as its own variant.\n"
              "If not provided, will count sequences from all dates included in analysis date range.")
     parser.add_argument("--force-include-clades", nargs="*",
-        help="Clades to force include in the output regardless of sequences counts. " +
+        help="Clades to force include in the output regardless of sequence counts. " +
              "Must be formatted as <clade_name>=<variant_name>")
     parser.add_argument("--output-seq-counts", required=True,
         help="Path to output TSV file for the prepared variants data.")
@@ -131,6 +133,7 @@ def positive_int(value):
 
     # Get a set of locations that meet the location_min_seq requirement
     locations_with_min_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq, 'location'])
+    locations_with_min_tenth_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq / 10, 'location'])
 
     # Load manually annotated excluded locations if provided
     excluded_locations = set()
@@ -140,8 +143,16 @@ def positive_int(value):
 
         print(f"Excluding the following requested locations: {sorted(excluded_locations)}.")
 
+    # Load manually annotated included locations if provided, keeping only those with at least a tenth of location_min_seq
+    included_locations = set()
+    if args.included_locations:
+        with open(args.included_locations, 'r') as f:
+            included_locations = {line.rstrip() for line in f} & locations_with_min_tenth_seq
+
+        print(f"Including the following requested locations: {sorted(included_locations)}.")
+
     # Remove excluded-locations from the set of locations to include in analysis
-    locations_to_include = locations_with_min_seq - excluded_locations
+    locations_to_include = locations_with_min_seq - excluded_locations | included_locations
     print(f"Locations that will be included: {sorted(locations_to_include)}.")
 
     assert len(locations_to_include) > 0, \
diff --git a/workflow/snakemake_rules/prepare_data.smk b/workflow/snakemake_rules/prepare_data.smk
index d11f547..96200ba 100644
--- a/workflow/snakemake_rules/prepare_data.smk
+++ b/workflow/snakemake_rules/prepare_data.smk
@@ -59,6 +59,7 @@ rule prepare_clade_data:
         included_days = lambda wildcards: _get_prepare_data_option(wildcards, 'included_days'),
         location_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq'),
         location_min_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq_days'),
+        included_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'included_locations'),
         excluded_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'excluded_locations'),
         prune_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'prune_seq_days'),
         clade_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'clade_min_seq'),
@@ -74,6 +75,7 @@ rule prepare_clade_data:
             {params.location_min_seq} \
             {params.location_min_seq_days} \
             {params.excluded_locations} \
+            {params.included_locations} \
             {params.prune_seq_days} \
             {params.clade_min_seq} \
             {params.clade_min_seq_days} \