Skip to content

Commit 439d26e

Browse files
committed
WIP [ingest] GenoFLU workflow for all-influenza ingest
This takes the output from our all-influenza curation pipeline (pre-filtered to avian-flu subtypes) and runs GenoFLU on it. It's a little strange to have most of the ingest steps in one location and then the GenoFLU step here; one day we may wish to unify them but that's quite a big task given that this (avian-flu) ingest pipeline already exists and is being used on other data sources.
1 parent 9080b16 commit 439d26e

3 files changed

Lines changed: 164 additions & 3 deletions

File tree

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,78 @@
1-
# this workflow is a stub action to allow testing from a branch
2-
31
name: Run GenoFLU on curated GISAID data
42

3+
defaults:
4+
run:
5+
# This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
6+
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
7+
#
8+
# Completely spelling it out here so that GitHub can't change it out from under us
9+
# and we don't have to refer to the docs to know the expected behavior.
10+
shell: bash --noprofile --norc -eo pipefail {0}
11+
512
on:
13+
workflow_call:
14+
inputs:
15+
image:
16+
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
17+
required: false
18+
type: string
19+
620
workflow_dispatch:
7-
21+
inputs:
22+
image:
23+
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
24+
required: false
25+
type: string
26+
trial-name:
27+
description: |
28+
Trial name for outputs.
29+
If not set, outputs will overwrite files at s3://nextstrain-data-private/files/workflows/avian-flu/
30+
If set, outputs will be uploaded to s3://nextstrain-data-private/files/workflows/avian-flu/trial/<trial_name>/
31+
required: false
32+
type: string
33+
34+
# Expose a repository dispatch so that we can trigger this workflow when the all-influenza
35+
# curation pipeline has finished (currently via the seasonal-flu repo)
36+
repository_dispatch:
37+
types:
38+
- genoflu-gisaid
39+
40+
jobs:
41+
ingest:
42+
permissions:
43+
id-token: write
44+
uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
45+
secrets: inherit
46+
with:
47+
# Starting with the default docker runtime
48+
# We can migrate to AWS Batch when/if we need to for more resources or if
49+
# the job runs longer than the GH Action limit of 6 hours.
50+
runtime: docker
51+
run: |
52+
declare -a config;
53+
54+
if [[ "$TRIAL_NAME" ]]; then
55+
# Create JSON string for the nested upload config
56+
S3_DST="s3://nextstrain-data-private/files/workflows/avian-flu/trial/$TRIAL_NAME"
57+
config+=(
58+
s3_dst=$(jq -cn --arg S3_DST "$S3_DST" '{"gisaid": $S3_DST}')
59+
)
60+
fi;
61+
62+
nextstrain build \
63+
ingest \
64+
--snakefile gisaid/Snakefile \
65+
upload_all \
66+
--config "${config[@]}"
67+
env: |
68+
NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
69+
TRIAL_NAME: ${{ inputs.trial-name }}
70+
# Explicitly excluding `ingest/gisaid/results` and `ingest/gisaid/data`
71+
# since this is private data and should not be available through the public artifacts
72+
artifact-name: genoflu-gisaid
73+
artifact-paths: |
74+
ingest/.snakemake/log/
75+
ingest/gisaid/logs/
76+
ingest/gisaid/benchmarks/
77+
!ingest/gisaid/results
78+
!ingest/gisaid/data

ingest/gisaid/Snakefile

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# EXPECTED USAGE:
2+
# Working directory: "avian-flu/ingest"
3+
# Command: "snakemake --cores 1 -npf --snakefile gisaid/Snakefile"
4+
5+
import os
6+
configfile: os.path.join(workflow.basedir, "config.yaml")
7+
8+
include: "../../shared/vendored/snakemake/remote_files.smk"
9+
include: "../rules/genoflu.smk"
10+
include: "../rules/upload_to_s3.smk"
11+
12+
13+
# The GenoFLU workflow will create "gisaid/results/metadata.tsv" with GenoFLU information
14+
# So make that the default workflow target. This will force provisioning of upstream
15+
# metadata & sequences
16+
rule all:
17+
input:
18+
metadata="gisaid/results/metadata.tsv",
19+
20+
rule upload_all:
21+
input:
22+
metadata="gisaid/s3/metadata.done",
23+
sequences=expand("gisaid/s3/sequences_{segment}.done", segment=config["segments"]),
24+
25+
rule get_sequence:
26+
"""
27+
Provisions the curated sequences (ultimately from the seasonal-flu ingest)
28+
into the location where both the GenoFLU workflow and the upload rules can access them.
29+
(Note: We could use a different location and skip `provision_genoflu_sequences` but
30+
we want to upload the sequences at the end of the workflow in order to keep metadata
31+
& sequences in-sync.)
32+
"""
33+
input:
34+
path_or_url(config['sequences'])
35+
output:
36+
"gisaid/results/sequences_{segment}.fasta"
37+
shell:
38+
"""
39+
cp {input[0]} {output[0]}
40+
"""
41+
42+
rule get_metadata:
43+
"""
44+
Provisions the metadata in the location the genoflu workflow expects it.
45+
"""
46+
input:
47+
path_or_url(config['metadata'])
48+
output:
49+
"gisaid/data/metadata_combined.tsv"
50+
shell:
51+
"""
52+
cp {input[0]} {output[0]}
53+
"""

ingest/gisaid/config.yaml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
2+
# The following is useful for development purposes, as we'll simply copy files from the local seasonal-flu repo
3+
#
4+
# sequences: "../../seasonal-flu/ingest/results/avian-flu/{segment}.fasta"
5+
# metadata: "../../seasonal-flu/ingest/results/avian-flu/metadata.tsv"
6+
7+
# Where are the curated files on S3?
8+
# The seasonal-flu repo (where the all-influenza curation currently is) puts them in
9+
# s3://nextstrain-data-private/files/workflows/seasonal-flu/trials/ingest/avian-flu
10+
# however the avian-flu repo doesn't have permission to read this -- see
11+
# <https://github.com/nextstrain/infra/blob/bb07ee82fd0bf59fc41846fec7005ce29ca92b81/env/production/aws-iam-policy-NextstrainPathogen%40.tf#L108C55-L108C64>
12+
# so for manual testing purposes we copy them to a prefix this repo can access:
13+
#
14+
# aws s3 cp s3://nextstrain-data-private/files/workflows/seasonal-flu/trials/ingest/avian-flu s3://nextstrain-data-private/files/workflows/avian-flu/trial/all-influenza-curation-pipeline --recursive
15+
#
16+
sequences: s3://nextstrain-data-private/files/workflows/avian-flu/trial/all-influenza-curation-pipeline/{segment}/sequences.fasta.xz
17+
metadata: s3://nextstrain-data-private/files/workflows/avian-flu/trial/all-influenza-curation-pipeline/metadata.tsv.xz
18+
19+
20+
segments:
21+
- pb2
22+
- pb1
23+
- pa
24+
- ha
25+
- np
26+
- na
27+
- mp
28+
- ns
29+
30+
genoflu:
31+
gisaid: true
32+
33+
s3_dst:
34+
# TODO -- hardcoded trial destination during testing
35+
# gisaid: s3://nextstrain-data-private/files/workflows/avian-flu
36+
gisaid: s3://nextstrain-data-private/files/workflows/avian-flu/trial/genoflu-gisaid
37+

0 commit comments

Comments
 (0)