
Commit 42ab935

Smw helix dev (#13)
* first try!!!!
* prep helix for launch
* add helix name fix
* update version
1 parent 8e7bcac · commit 42ab935

6 files changed (+169 -12 lines)
@@ -0,0 +1,73 @@
+main:
+  params: [args]
+  steps:
+    - init:
+        assign:
+          - projectId: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
+          - region: "us-central1"
+          - batchApi: "batch.googleapis.com/v1"
+          - batchApiUrl: ${"https://" + batchApi + "/projects/" + projectId + "/locations/" + region + "/jobs"}
+          - jobId: ${"helix-gisaid-" + string(int(sys.now()))}
+          - newFile: ${args.data.name}
+    - logCreateBatchJob:
+        call: sys.log
+        args:
+          data: ${"Creating and running the batch job " + jobId}
+    - createAndRunBatchJob:
+        call: http.post
+        args:
+          url: ${batchApiUrl}
+          query:
+            job_id: ${jobId}
+          headers:
+            Content-Type: application/json
+          auth:
+            type: OAuth2
+          body:
+            taskGroups:
+              - taskSpec:
+                  runnables:
+                    - container:
+                        imageUri: "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-08-2"
+                        entrypoint: "/bin/bash"
+                        volumes:
+                          - "/mnt/disks/cdph_helix_gisaid_staging:/data"
+                          - "/mnt/disks/cdph_helix_gisaid_staging/.config:/.config"
+                        commands:
+                          - "-c"
+                          - ${"bash /data/standard-dashboard.sh -d gs://cdph_helix_gisaid_staging -s no -b /data/input_gisaid -o /data -t gs://cdph_helix_gisaid -g gs://fc-6f47810a-0cc3-4e68-b8f0-12cde24d5893 -r helix_gisaid -p cdc-terrabio-taborda-manual -w dataAnalysis_SARS-CoV-2_Helix -q no -m false -i " + newFile + " -k true -x true"}
+                  volumes:
+                    - gcs:
+                        remotePath: "cdph_helix_gisaid_staging"
+                      mountPath: "/mnt/disks/cdph_helix_gisaid_staging"
+                  taskCount: 1
+            logsPolicy:
+              destination: CLOUD_LOGGING
+        result: createAndRunBatchJobResponse
+    - getJob:
+        call: http.get
+        args:
+          url: ${batchApiUrl + "/" + jobId}
+          auth:
+            type: OAuth2
+        result: getJobResult
+    - logState:
+        call: sys.log
+        args:
+          data: ${"Current job state " + getJobResult.body.status.state}
+    - checkState:
+        switch:
+          - condition: ${getJobResult.body.status.state == "SUCCEEDED"}
+            next: returnResult
+          - condition: ${getJobResult.body.status.state == "FAILED"}
+            next: returnResult
+        next: sleep
+    - sleep:
+        call: sys.sleep
+        args:
+          seconds: 10
+        next: getJob
+    - returnResult:
+        return:
+          jobId: ${jobId}
+          status: "OK"
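The getJob/sleep loop above polls the Batch API every 10 seconds until the job reports SUCCEEDED or FAILED. The same state can be checked from the CLI while debugging; the job ID below is a hypothetical instance of the helix-gisaid-<timestamp> pattern generated in the init step:

# inspect the state of a job the workflow created (job ID is hypothetical)
gcloud batch jobs describe helix-gisaid-1691510400 \
  --location=us-central1 \
  --format="value(status.state)"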
@@ -0,0 +1,7 @@
+gcloud eventarc triggers create helix-gisaid-trigger \
+  --destination-workflow=helix-gisaid \
+  --destination-workflow-location=us-central1 \
+  --event-filters="type=google.cloud.storage.object.v1.finalized" \
+  --event-filters="bucket=cdph_helix_gisaid" \
+  --location=us \
+  --service-account="[email protected]"
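After creating the trigger, a quick sanity check is to describe it and confirm the destination workflow and both event filters; a sketch:

gcloud eventarc triggers describe helix-gisaid-trigger --location=us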
@@ -0,0 +1,2 @@
+gcloud workflows deploy helix-gisaid \
+  --source=helix-gisaid-data-workflow.yaml
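Since the workflow only reads args.data.name from the Eventarc payload, a deploy can be smoke-tested without uploading anything to the bucket by passing a minimal payload by hand; the tar filename here is a made-up example:

gcloud workflows execute helix-gisaid \
  --location=us-central1 \
  --data='{"data":{"name":"gisaid_auspice_input_2023-08-08.tar"}}'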

google_workflows/standard-dashboard.sh

+47 -10
@@ -4,11 +4,11 @@ set -e
 # filename: standard_dashboard.sh
 # authors: Sage Wright, Kevin Libuit, Frank Ambrosio
 
-VERSION="Google Dashboarding v0.1"
+VERSION="Google Dashboarding v0.3"
 
 showHelp() {
 cat << EOF
-Google Dashboarding v0.1
+Google Dashboarding v0.3
 This script is configured to work within a Google Batch job managed by a Google Workflow and Trigger.
 The following variables need to be passed in as input parameters.
 CAUTION: The entire command length must be under 400 characters; using the short version of arguments is recommended
@@ -29,13 +29,14 @@ Usage: ./standard_dashboard.sh
 [ -q | --big-query-table-name ] the name of the big query table to upload to ("sars_cov_2_dashboard.workflow_la_state_gisaid_specimens_test")
 [ -m | --puerto-rico ] apply Puerto Rico-specific changes. available options: true or false
 [ -i | --input-tar-file ] the tar file given to the script by the Google Trigger
-
+[ -k | --skip-bq-load ] skips the bq load step. available options: true or false
+[ -x | --helix ] apply Helix-specific changes. available options: true or false
 Happy dashboarding!
 EOF
 }
 
 # use getopt to parse the input arguments
-PARSED_ARGUMENTS=$(getopt -n "standard-dashboard" -o "hvd:j:s:b:o:t:g:r:p:w:q:m:i:" -l "version,help,dashboard-gcp-uri:,dashboard-newline-json:,dashboard-schema:,gisaid-backup-dir:,output-dir:,trigger-bucket:,terra-gcp-uri:,terra-table-root-entity:,terra-project:,terra-workspace:,big-query-table-name:,puerto-rico:,input-tar-file:" -a -- "$@")
+PARSED_ARGUMENTS=$(getopt -n "standard-dashboard" -o "hvd:s:b:o:t:g:r:p:w:q:m:i:k:x:" -l "version,help,dashboard-gcp-uri:,dashboard-schema:,gisaid-backup-dir:,output-dir:,trigger-bucket:,terra-gcp-uri:,terra-table-root-entity:,terra-project:,terra-workspace:,big-query-table-name:,puerto-rico:,input-tar-file:,skip-bq-load:,helix:" -a -- "$@")
 
 eval set -- "$PARSED_ARGUMENTS"
@@ -47,8 +48,6 @@ while true; do
     showHelp; exit 0;;
   -d|--dashboard-gcp-uri)
     dashboard_gcp_uri=$2; shift 2;;
-  -j|--dashboard-newline-json)
-    dashboard_newline_json=$2; shift 2;;
   -s|--dashboard_schema)
     dashboard_schema=$2; shift 2;;
   -b|--gisaid-backup-dir)
@@ -71,6 +70,10 @@ while true; do
     puerto_rico=$2; shift 2;;
   -i|--input-tar-file)
     input_tar_file=$2; shift 2;;
+  -k|--skip-bq-load)
+    skip_bq_load=$2; shift 2;;
+  -x|--helix)
+    helix=$2; shift 2;;
   --) shift; break ;;
   *) echo "Unexpected option: $1 -- this should not happen."; exit 1;;
 esac
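The two new case branches consume their values the same way as the existing options. A minimal standalone sketch of how getopt hands them through (demo values only, not part of the script):

# demo: parse just the two new options in isolation
PARSED=$(getopt -n "demo" -o "k:x:" -l "skip-bq-load:,helix:" -a -- -k true -x true)
eval set -- "$PARSED"
echo "$@"   # prints: -k true -x true --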
@@ -100,7 +103,7 @@ make_directory ${output_dir}/backup_jsons
 
 # echo the variables that were provided
 echo -e "Dashboarding Automated System initiated at ${date_tag}\n" | tee ${output_dir}/automation_logs/dashboard-${date_tag}.log
-echo -e "Input variables:\ndashboard_gcp_uri: ${dashboard_gcp_uri},\ndashboard_newline_json: ${dashboard_newline_json},\ndashboard_bq_load_schema: ${dashboard_schema},\ngisaid_backup_dir: ${gisaid_backup_dir},\nmounted_output_dir: ${output_dir},\ntrigger_bucket_gcp_uri: ${trigger_bucket},\nterra_gcp_uri: ${terra_gcp_uri},\nterra_table_root_entity: ${terra_table_root_entity},\nterra_project: ${terra_project},\nterra_workspace: ${terra_workspace},\nbig_query_table_name: ${big_query_table_name}\n" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
+echo -e "Input variables:\ndashboard_gcp_uri: ${dashboard_gcp_uri},\ndashboard_bq_load_schema: ${dashboard_schema},\ngisaid_backup_dir: ${gisaid_backup_dir},\nmounted_output_dir: ${output_dir},\ntrigger_bucket_gcp_uri: ${trigger_bucket},\nterra_gcp_uri: ${terra_gcp_uri},\nterra_table_root_entity: ${terra_table_root_entity},\nterra_project: ${terra_project},\nterra_workspace: ${terra_workspace},\nbig_query_table_name: ${big_query_table_name}\n" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
 
 # take in file as input from trigger
 file=${trigger_bucket}/${input_tar_file}
@@ -131,17 +134,17 @@ if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
 \n
 # Create individual fasta files from GISAID multifasta
 \n
-python3 /scripts/gisaid_multifasta_parser.py ${gisaid_dir}/*.sequences.fasta ${gisaid_dir} ${puerto_rico}
+python3 /scripts/gisaid_multifasta_parser.py ${gisaid_dir}/*.sequences.fasta ${gisaid_dir} ${puerto_rico} ${helix}
 \n
 \n
 # Deposit individual fasta files into Terra GCP bucket
 \n
-gsutil -m cp ${gisaid_dir}/individual_gisaid_assemblies_$(date -I)/*.fasta ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_$(date -I)/
+gsutil -m cp ${gisaid_dir}/individual_gisaid_assemblies_${date_tag}/*.fasta ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag}/
 \n
 \n
 # Create and import Terra Data table containing GCP pointers to deposited assemblies
 \n
-/scripts/terra_table_from_gcp_assemblies.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_$(date -I) ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" $(date -I)
+/scripts/terra_table_from_gcp_assemblies.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag} ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" ${date_tag}
 \n
 \n
 # Capture, reformat, and prune GISAID metadata
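The switch from $(date -I) to ${date_tag} above means every path is derived from one timestamp captured when the run started, rather than re-evaluating the date at each command (which could straddle midnight and split the upload and the table import across two directories). A sketch, assuming date_tag is captured once near the top of the script:

# assumption: date_tag is set once at startup
date_tag=$(date -I)                       # e.g. 2023-08-08
dest="gisaid_individual_assemblies_${date_tag}"
echo "${dest}"                            # same value everywhere it is reused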
@@ -154,6 +157,38 @@ if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
 python3 /scripts/import_large_tsv/import_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --tsv ${gisaid_dir}/gisaid_metadata_${date_tag}.tsv
 \n
 \n
+if ${skip_bq_load} ; then
+\n
+# Make a set table
+\n
+/scripts/make_set_table.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag} ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" ${date_tag}
+\n
+\n
+# Run TheiaCoV_FASTA on the set
+\n
+TOKEN=`gcloud auth print-access-token`
+\n
+curl -X 'POST' \
+  'https://api.firecloud.org/api/workspaces/${terra_project}/${terra_workspace}/submissions' \
+  -H 'accept: */*' \
+  -H "Authorization: Bearer ${TOKEN}" \
+  -H 'Content-Type: application/json' \
+  -d \"{
+    \"methodConfigurationNamespace\": \"${terra_project}\",
+    \"methodConfigurationName\": \"TheiaCoV_FASTA_PHB\",
+    \"entityType\": \"${terra_table_root_entity}_set\",
+    \"entityName\": \"${date_tag}-set\",
+    \"expression\": \"this.${terra_table_root_entity}s\",
+    \"useCallCache\": true,
+    \"deleteIntermediateOutputFiles\": false,
+    \"useReferenceDisks\": false,
+    \"memoryRetryMultiplier\": 1,
+    \"workflowFailureMode\": \"NoNewCalls\",
+    \"userComment\": \"${date_tag}-set automatically launched\"
+  }\"
+\n
+\n
+else
 # Capture the entire Terra data table as a tsv
 \n
 python3 /scripts/export_large_tsv/export_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --entity_type ${terra_table_root_entity} --tsv_filename ${gisaid_dir}/full_${terra_table_root_entity}_terra_table_${date_tag}.tsv
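Because the submission request above lives inside an escaped command string, the expanded request is easier to read on its own. A sketch with hypothetical project, workspace, root-entity, and date values substituted in:

TOKEN=$(gcloud auth print-access-token)
curl -X POST \
  'https://api.firecloud.org/api/workspaces/my-terra-project/my-terra-workspace/submissions' \
  -H 'accept: */*' \
  -H "Authorization: Bearer ${TOKEN}" \
  -H 'Content-Type: application/json' \
  -d '{
    "methodConfigurationNamespace": "my-terra-project",
    "methodConfigurationName": "TheiaCoV_FASTA_PHB",
    "entityType": "helix_gisaid_set",
    "entityName": "2023-08-08-set",
    "expression": "this.helix_gisaids",
    "useCallCache": true,
    "deleteIntermediateOutputFiles": false,
    "useReferenceDisks": false,
    "memoryRetryMultiplier": 1,
    "workflowFailureMode": "NoNewCalls",
    "userComment": "2023-08-08-set automatically launched"
  }'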
@@ -175,6 +210,8 @@ if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
 \n
 bq load --ignore_unknown_values=true --replace=true --source_format=NEWLINE_DELIMITED_JSON ${big_query_table_name} ${dashboard_gcp_uri}/${terra_table_root_entity}.json ${dashboard_schema}
 \n
+fi
+\n
 \n
 "
 # write the commands that will be run to the automation log

scripts/gisaid_metadata_cleanser.py

+6 -2
@@ -10,6 +10,7 @@ def get_opts():
     p.add_argument('out_file', help='Output file: required, must be a string.')
     p.add_argument('table_name', help='Terra table name: required, must be a string; do not include entity: or _id.')
     p.add_argument('puertorico', help='Perform Puerto Rico-specific actions')
+    p.add_argument('helix', help='Perform Helix-specific actions')
     args = p.parse_args()
     return args
 arguments = get_opts()
@@ -33,6 +34,11 @@ def get_opts():
 # remove any samples uploaded by PR
 meta_df1 = meta_df1[~meta_df1[table_name].str.contains("PR-CVL")]
 
+# perform Helix-specific actions:
+if arguments.helix == "true":
+    # rename virus names to start after the `hCoV-19/USA/CA-` prefix
+    meta_df1[table_name] = meta_df1[table_name].str.replace('hCoV-19/USA/CA-', '')
+
 # drop extraneous cols
 drop_list = []
 for i in meta_df1.columns.values:
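The rename just strips the constant prefix so the table IDs match Helix sample names; for illustration (hypothetical sample name):

echo "hCoV-19/USA/CA-CDPH-1234567/2023" | sed 's|hCoV-19/USA/CA-||'
# prints: CDPH-1234567/2023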
@@ -60,11 +66,9 @@ def get_opts():
 meta_df1['collection_date'].replace('_', value='-', regex=True, inplace=True)
 meta_df1['date_submitted'].replace('_', value='-', regex=True, inplace=True)
 
-
 # remove the word 'years' from the age column
 meta_df1['age'].replace(' years', value='', regex=True, inplace=True)
 
-
 # age column cleaning
 # replace string inputs of age ranges with individual numerical age equivalent to the bottom of the bins
 age_range_replace_dict = {'0-4': 4, '5-17': 5, '18-49': 18, '50-64': 50}

scripts/make_terra_set.sh

+34
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+gcp_uri=$1
+terra_project=$2
+terra_workspace=$3
+root_entity=$4
+output_dir=$5
+alt_delimiter=$6
+
+date_tag=$7
+set_name=${date_tag}-set
+
+# set default for $alt_delimiter in case user does not specify one
+if [ -z "$alt_delimiter" ]; then
+  alt_delimiter="_"
+fi
+
+assembly_files=$(gsutil ls ${gcp_uri}/*.fasta | awk -F'/' '{ print $NF }')
+
+# make set table header
+echo -e "membership:${root_entity}_set_id\t${root_entity}" > ${output_dir}/${set_name}.tsv
+
+for assembly in $assembly_files; do
+  # capture samplename from assembly filename
+  samplename=$(echo ${assembly} | awk -F "${alt_delimiter}|.fasta" '{ print $1 }')
+  # write samplename to the set
+  echo -e "${set_name}\t${samplename}" >> ${output_dir}/${set_name}.tsv
+done
+
+# remove duplicate rows from the tsv if samplename is not unique (dedupe on the samplename column)
+awk '!a[$2]++' ${output_dir}/${set_name}.tsv > temp.tsv && mv temp.tsv ${output_dir}/${set_name}.tsv
+
+# Import Terra table to specified terra_workspace
+python3 /scripts/import_large_tsv/import_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --tsv ${output_dir}/${set_name}.tsv
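A hypothetical invocation, mirroring the argument order the script reads (all values are placeholders):

./make_terra_set.sh \
  gs://my-terra-bucket/uploads/gisaid_individual_assemblies_2023-08-08 \
  my-terra-project my-terra-workspace helix_gisaid \
  /data/gisaid_dir ".fasta" 2023-08-08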
