WIP [ingest] GenoFLU workflow for all-influenza ingest

jameshadfield · jameshadfield · commit 439d26eb8da4 · 2025-12-08T13:43:39.000+13:00
This takes the output from our all-influenza curation pipeline
(pre-filtered to avian-flu subtypes) and runs GenoFLU on it. It's a
little strange to have most of the ingest steps in one location and then
the GenoFLU step here; one day we may wish to unify them but that's
quite a big task given that this (avian-flu) ingest pipeline already
exists and is being used on other data sources.
diff --git a/.github/workflows/genoflu-gisaid.yaml b/.github/workflows/genoflu-gisaid.yaml
@@ -1,7 +1,78 @@
-# this workflow is a stub action to allow testing from a branch
-
 name: Run GenoFLU on curated GISAID data
 
+defaults:
+  run:
+    # Spell out the full shell invocation instead of relying on the `bash` keyword.
+    # As of 20 June 2023 this is exactly what GitHub Actions' `bash` keyword means:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Being explicit means GitHub can't change it out from under us, and the
+    # expected behavior can be read here without consulting the docs.
+    shell: 'bash --noprofile --norc -eo pipefail {0}'
+
 on:
+  # Lets other workflows invoke this one as a reusable workflow.
+  workflow_call:
+    inputs:
+      image:
+        type: string
+        required: false
+        description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
+
   workflow_dispatch:
-    
+    inputs:
+      image:
+        description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
+        required: false
+        type: string
+      trial-name:
+        # Keep this text in sync with the S3_DST built in the `ingest` job's run
+        # script and with the default `s3_dst` in ingest/gisaid/config.yaml.
+        description: |
+          Trial name for outputs.
+          If not set, outputs will be uploaded to the default `s3_dst` defined in ingest/gisaid/config.yaml
+          If set, outputs will be uploaded to s3://nextstrain-data-private/files/workflows/avian-flu/trial/<trial_name>/
+        required: false
+        type: string
+
+  # Expose a repository dispatch so that the all-influenza curation pipeline
+  # (currently run via the seasonal-flu repo) can trigger this workflow once
+  # it has finished.
+  repository_dispatch:
+    types: [genoflu-gisaid]
+
+jobs:
+  # Runs the `upload_all` target of ingest/gisaid/Snakefile through the shared
+  # Nextstrain pathogen-repo-build reusable workflow.
+  ingest:
+    permissions:
+      # NOTE(review): presumably needed by the reusable workflow to obtain
+      # cloud credentials via OIDC -- confirm against pathogen-repo-build.
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      # Starting with the default docker runtime
+      # We can migrate to AWS Batch when/if we need to for more resources or if
+      # the job runs longer than the GH Action limit of 6 hours.
+      runtime: docker
+      run: |
+        declare -a config;
+
+        if [[ "$TRIAL_NAME" ]]; then
+          # Create JSON string for the nested upload config.
+          # The whole `key=value` word is double-quoted so that a trial name
+          # containing whitespace or glob characters cannot be split or expanded
+          # by the shell into several separate --config arguments.
+          S3_DST="s3://nextstrain-data-private/files/workflows/avian-flu/trial/$TRIAL_NAME"
+          config+=(
+            "s3_dst=$(jq -cn --arg S3_DST "$S3_DST" '{"gisaid": $S3_DST}')"
+          )
+        fi;
+
+        nextstrain build \
+          ingest \
+            --snakefile gisaid/Snakefile \
+            upload_all \
+            --config "${config[@]}"
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
+        TRIAL_NAME: ${{ inputs.trial-name }}
+      # Explicitly excluding `ingest/gisaid/results` and `ingest/gisaid/data`
+      # since this is private data and should not be available through the public artifacts
+      artifact-name: genoflu-gisaid
+      artifact-paths: |
+        ingest/.snakemake/log/
+        ingest/gisaid/logs/
+        ingest/gisaid/benchmarks/
+        !ingest/gisaid/results
+        !ingest/gisaid/data
diff --git a/ingest/gisaid/Snakefile b/ingest/gisaid/Snakefile
@@ -0,0 +1,53 @@
+# EXPECTED USAGE:
+# Working directory: "avian-flu/ingest"
+# Command: "snakemake --cores 1 -npf --snakefile gisaid/Snakefile"
+
+import os
+configfile: os.path.join(workflow.basedir, "config.yaml")
+
+include: "../../shared/vendored/snakemake/remote_files.smk"
+include: "../rules/genoflu.smk"
+include: "../rules/upload_to_s3.smk"
+
+
+rule all:
+    """
+    Default workflow target. The GenoFLU workflow creates
+    "gisaid/results/metadata.tsv" (metadata with GenoFLU information), so
+    requesting it here forces provisioning of the upstream metadata & sequences.
+    """
+    input:
+        metadata="gisaid/results/metadata.tsv",
+
+rule upload_all:
+    """
+    Aggregate target requesting the ".done" files under "gisaid/s3/" for the
+    metadata and for every segment listed in config["segments"].
+    (Presumably these are written by the rules in ../rules/upload_to_s3.smk
+    once each upload succeeds -- confirm there.)
+    """
+    input:
+        metadata="gisaid/s3/metadata.done",
+        sequences=expand(
+            "gisaid/s3/sequences_{segment}.done",
+            segment=config["segments"],
+        ),
+
+rule get_sequence:
+    """
+    Provisions the curated sequences (ultimately from the seasonal-flu ingest)
+    into the location where both the GenoFLU workflow and the upload rules can access them.
+    (Note: We could use a different location and skip `provision_genoflu_sequences` but
+    we want to upload the sequences at the end of the workflow in order to keep metadata
+    & sequences in-sync.)
+
+    The configured source may be xz-compressed (the default S3 source ends in
+    ".fasta.xz") or plain (e.g. the local development paths in config.yaml).
+    The output is always named ".fasta", so a compressed source is decompressed
+    rather than copied verbatim.
+    """
+    input:
+        path_or_url(config['sequences'])
+    output:
+        "gisaid/results/sequences_{segment}.fasta"
+    params:
+        # Decided from the configured source name so that it does not depend on
+        # how `path_or_url` names the locally provisioned file.
+        read_cmd="xz --decompress --stdout" if config['sequences'].endswith(".xz") else "cat",
+    shell:
+        """
+        {params.read_cmd} {input[0]} > {output[0]}
+        """
+
+rule get_metadata:
+    """
+    Provisions the metadata in the location the GenoFLU workflow expects it.
+
+    The output is a plain ".tsv", so an xz-compressed source (the default S3
+    source ends in ".tsv.xz") is decompressed rather than copied verbatim;
+    uncompressed sources (e.g. local development paths) are passed through
+    unchanged.
+    """
+    input:
+        path_or_url(config['metadata'])
+    output:
+        "gisaid/data/metadata_combined.tsv"
+    params:
+        # Decided from the configured source name so that it does not depend on
+        # how `path_or_url` names the locally provisioned file.
+        read_cmd="xz --decompress --stdout" if config['metadata'].endswith(".xz") else "cat",
+    shell:
+        """
+        {params.read_cmd} {input[0]} > {output[0]}
+        """
diff --git a/ingest/gisaid/config.yaml b/ingest/gisaid/config.yaml
@@ -0,0 +1,37 @@
+
+# The following is useful for development purposes, as it simply copies files from a local seasonal-flu repo
+#
+# sequences: "../../seasonal-flu/ingest/results/avian-flu/{segment}.fasta"
+# metadata: "../../seasonal-flu/ingest/results/avian-flu/metadata.tsv"
+
+# Where are the curated files on S3?
+# The seasonal-flu repo (where the all-influenza curation currently is) puts them in
+# s3://nextstrain-data-private/files/workflows/seasonal-flu/trials/ingest/avian-flu
+# however the avian-flu repo doesn't have permission to read this -- see
+# <https://github.com/nextstrain/infra/blob/bb07ee82fd0bf59fc41846fec7005ce29ca92b81/env/production/aws-iam-policy-NextstrainPathogen%40.tf#L108C55-L108C64>
+# so for manual testing purposes we copy them to a prefix this repo can access:
+#
+# aws s3 cp s3://nextstrain-data-private/files/workflows/seasonal-flu/trials/ingest/avian-flu s3://nextstrain-data-private/files/workflows/avian-flu/trial/all-influenza-curation-pipeline  --recursive
+#
+sequences: "s3://nextstrain-data-private/files/workflows/avian-flu/trial/all-influenza-curation-pipeline/{segment}/sequences.fasta.xz"
+metadata: "s3://nextstrain-data-private/files/workflows/avian-flu/trial/all-influenza-curation-pipeline/metadata.tsv.xz"
+
+
+# Segment names; the Snakefile expands its per-segment upload targets over this list
+segments:
+  - pb2
+  - pb1
+  - pa
+  - ha
+  - np
+  - na
+  - mp
+  - ns
+
+genoflu:
+  gisaid: true
+
+s3_dst:
+  # TODO -- the destination is hardcoded to a trial prefix during testing
+  # gisaid: s3://nextstrain-data-private/files/workflows/avian-flu
+  gisaid: "s3://nextstrain-data-private/files/workflows/avian-flu/trial/genoflu-gisaid"
+