52 | 52 | <concurrentBuild>false</concurrentBuild> |
53 | 53 | <builders> |
54 | 54 | <hudson.tasks.Shell> |
55 | | - <command>#!/bin/bash |
| 55 | + <command>#!/usr/bin/env bash |
56 | 56 | set -euo pipefail |
57 | 57 |
58 | | -mkdir beforeRemoval || find beforeRemoval -type f -exec rm -rf {} \; |
59 | | -mkdir data || find data/ -type f -exec rm -rf {} \; |
60 | | -mkdir completed || find processing/ -type f -exec rm -rf {} \; |
61 | | -mkdir processing || find completed -type f -exec rm -rf {} \; |
| 58 | +LOG_TS() { date +"%Y-%m-%dT%H:%M:%S%z"; } |
| 59 | +log() { echo "[$(LOG_TS)] INFO $*" >&2; } |
| 60 | +warn() { echo "[$(LOG_TS)] WARN $*" >&2; } |
| 61 | +error() { echo "[$(LOG_TS)] ERROR $*" >&2; } |
62 | 62 |
| 63 | +cleanup() { |
| 64 | + local rc=$? |
| 65 | + local cmd="${BASH_COMMAND:-}" |
| 66 | + local line="${BASH_LINENO[0]:-}" |
63 | 67 |
64 | | -aws sts assume-role --duration-seconds 3600 --role-arn arn:aws:iam::736265540791:role/dbgap-etl --role-session-name "s3-test" > assume-role-output.txt |
65 | | - |
66 | | - export AWS_ACCESS_KEY_ID=`grep AccessKeyId assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"` |
67 | | - export AWS_SECRET_ACCESS_KEY=`grep SecretAccessKey assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"` |
68 | | - export AWS_SESSION_TOKEN=`grep SessionToken assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"` |
| 68 | + if [[ $rc -eq 0 ]]; then |
| 69 | + log "EXIT rc=0 (success). Cleaning AWS env vars." |
| 70 | + else |
| 71 | + error "EXIT rc=${rc} at line=${line} cmd=${cmd}. Cleaning AWS env vars." |
| 72 | + fi |
69 | 73 |
70 | | -aws s3 cp ${managed_inputs} . |
| 74 | + unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN |
| 75 | +} |
| 76 | +on_err() { |
| 77 | + local rc=$? |
| 78 | + local line="${BASH_LINENO[0]:-}" |
| 79 | + local cmd="${BASH_COMMAND:-}" |
| 80 | + error "ERR rc=${rc} at line=${line} cmd=${cmd}" |
| 81 | + return $rc |
| 82 | +} |
| 83 | +set -o errtrace; trap on_err ERR # errtrace so the ERR trap also fires inside functions |
| 84 | +trap cleanup EXIT |
71 | 85 |
72 | | -aws s3 cp --quiet s3://avillach-73-bdcatalyst-etl/general/data/metadata_new_search.json . |
| 86 | +need() { command -v "$1" >/dev/null 2>&1 || { error "Missing required command: $1"; exit 2; }; } |
73 | 87 |
74 | | -csvcut -c "Study Abbreviated Name","Study Identifier","Study Type","Data is ready to process","Data Processed" Managed_Inputs.csv > inputs.csv |
| 88 | +assume_role() { |
| 89 | + local role_arn="${ROLE_ARN:-arn:aws:iam::736265540791:role/dbgap-etl}" |
| 90 | + local session_name="${ROLE_SESSION_NAME:-s3-test}" |
| 91 | + local duration="${ASSUME_DURATION_SECONDS:-3600}" |
75 | 92 |
| 93 | + log "Assuming role ${role_arn} (duration=${duration}s)" |
| 94 | + local assume_json |
| 95 | + assume_json="$(mktemp)" |
76 | 96 |
77 | | -IFS=',' |
78 | | -[ ! -f inputs.csv ] |
79 | | -while read abv_name stdy_id stdy_type data_ready data_processed |
80 | | -do |
81 | | - if [[ "${data_ready,,}" == "yes" ]]; then |
82 | | - aws s3 cp --no-progress s3://avillach-73-bdcatalyst-etl/${abv_name,,}/completed/${stdy_id}/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv --quiet |
83 | | - if [[ -z beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv ]]; then |
84 | | - echo "No data found for 'ready' study ${abv_name} ${stdy_id}" |
85 | | - exit 255 |
86 | | - fi |
87 | | - split -d --line-bytes=250MB beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv && \ |
88 | | - rm -f beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv && \ |
89 | | - echo "Downloaded and split ${stdy_id}" & |
| 97 | + aws sts assume-role \ |
| 98 | + --duration-seconds "$duration" \ |
| 99 | + --role-arn "$role_arn" \ |
| 100 | + --role-session-name "$session_name" \ |
| 101 | + > "$assume_json" |
| 102 | + |
| 103 | + export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN |
| 104 | + AWS_ACCESS_KEY_ID="$(jq -r '.Credentials.AccessKeyId' "$assume_json")" |
| 105 | + AWS_SECRET_ACCESS_KEY="$(jq -r '.Credentials.SecretAccessKey' "$assume_json")" |
| 106 | + AWS_SESSION_TOKEN="$(jq -r '.Credentials.SessionToken' "$assume_json")" |
| 107 | + rm -f "$assume_json" |
| 108 | + |
| 109 | + log "Role assumed" |
| 110 | +} |
| 111 | + |
| 112 | +reset_dir() { rm -rf "$1"; mkdir -p "$1"; } |
| 113 | + |
| 114 | +# Wait for *this script's* background jobs and fail if any failed. |
| 115 | +wait_all() { |
| 116 | + local pids=("$@") |
| 117 | + local rc=0 pid |
| 118 | + for pid in "${pids[@]}"; do |
| 119 | + if ! wait "$pid"; then |
| 120 | + rc=1 |
| 121 | + warn "Background job failed (pid=$pid)" |
| 122 | + fi |
| 123 | + done |
| 124 | + return "$rc" |
| 125 | +} |
| 126 | + |
| 127 | +# ----------------------------- |
| 128 | +# Prereqs + required inputs |
| 129 | +# ----------------------------- |
| 130 | +need aws |
| 131 | +need jq |
| 132 | +need csvcut |
| 133 | +need csvformat |
| 134 | +need split |
| 135 | +need tail |
| 136 | +need java |
| 137 | + |
| 138 | +: "${managed_inputs:?managed_inputs must be set (s3://.../Managed_Inputs.csv)}" |
| 139 | + |
| 140 | +S3_BUCKET="${S3_BUCKET:-avillach-73-bdcatalyst-etl}" |
| 142 | +CHUNK_SIZE="${CHUNK_SIZE:-250M}" # split chunk size (whole lines per chunk) |
| 142 | +SPLIT_SUFFIX_LEN="${SPLIT_SUFFIX_LEN:-4}" # avoid "suffixes exhausted" |
| 143 | +HEAP_GB="${HEAP_GB:-64}" |
| 144 | +PARALLEL="${PARALLEL:-16}" |
| 145 | + |
| 146 | +# ----------------------------- |
| 147 | +# Workspace |
| 148 | +# ----------------------------- |
| 149 | +reset_dir beforeRemoval |
| 150 | +reset_dir data |
| 151 | +reset_dir completed |
| 152 | +reset_dir processing |
| 153 | + |
| 154 | +# ----------------------------- |
| 155 | +# Auth once |
| 156 | +# ----------------------------- |
| 157 | +assume_role |
90 | 158 |
91 | | - else |
92 | | - echo "$abv_name marked not ready for processing in managed inputs" |
93 | | - fi |
94 | | -done < inputs.csv |
| 159 | +# ----------------------------- |
| 160 | +# Inputs |
| 161 | +# ----------------------------- |
| 162 | +log "Downloading managed inputs: ${managed_inputs}" |
| 163 | +aws s3 cp "$managed_inputs" ./Managed_Inputs.csv --no-progress --only-show-errors |
95 | 164 |
96 | | -aws s3 cp --no-progress s3://avillach-73-bdcatalyst-etl/hrmn/completed/HRMN_allConcepts.csv beforeRemoval/HRMN_allConcepts.csv & |
| 165 | +log "Downloading metadata_new_search.json" |
| 166 | +aws s3 cp "s3://${S3_BUCKET}/general/data/metadata_new_search.json" ./metadata_new_search.json --no-progress --only-show-errors |
| 167 | + |
| 168 | +log "Creating inputs.csv" |
| 169 | +csvcut -c "Study Abbreviated Name","Study Identifier","Study Type","Data is ready to process","Data Processed" \ |
| 170 | + Managed_Inputs.csv > inputs.csv |
| 171 | + |
| 172 | +# ----------------------------- |
| 173 | +# Download + split per study |
| 174 | +# NOTE: no backgrounding here; ensures split completes before downstream jars. |
| 175 | +# ----------------------------- |
| 176 | +download_and_split() { |
| 177 | + local abv_name="$1" stdy_id="$2" |
| 178 | + local abv_lc stdy_lc |
| 179 | + abv_lc="$(printf '%s' "$abv_name" | tr '[:upper:]' '[:lower:]')" |
| 180 | + stdy_lc="$(printf '%s' "$stdy_id" | tr '[:upper:]' '[:lower:]')" |
| 181 | + |
| 182 | + local s3_src="s3://${S3_BUCKET}/${abv_lc}/completed/${stdy_id}/${stdy_lc}_allConcepts_new_search_with_data_analyzer.csv" |
| 183 | + local dst="beforeRemoval/${stdy_lc}_allConcepts_new_search_with_data_analyzer.csv" |
| 184 | + |
| 185 | + log "Downloading ${stdy_id} allConcepts" |
| 186 | + aws s3 cp "$s3_src" "$dst" --no-progress --only-show-errors |
| 187 | + |
| 188 | + if [[ ! -s "$dst" ]]; then |
| 189 | + error "Downloaded file missing/empty for ready study ${abv_name} ${stdy_id}: ${dst}" |
| 190 | + return 255 |
| 191 | + fi |
| 192 | + |
| 193 | + log "Splitting ${stdy_id} (~${CHUNK_SIZE} chunks)" |
| 194 | + # Produces ${dst}.0000, ${dst}.0001, ...; -C keeps CSV rows whole (like the old --line-bytes) |
| 195 | + split -d -a "$SPLIT_SUFFIX_LEN" -C "$CHUNK_SIZE" "$dst" "${dst}." |
| 196 | + rm -f "$dst" |
| 197 | + |
| 198 | + log "Downloaded and split ${stdy_id}" |
| 199 | +} |
| 200 | + |
| 201 | +while IFS=',' read -r abv_name stdy_id stdy_type data_ready data_processed; do |
| 202 | + # Strip the surrounding quotes that csvformat -U 1 adds to every field |
| 203 | + abv_name="${abv_name%\"}"; abv_name="${abv_name#\"}" |
| 204 | + stdy_id="${stdy_id%\"}"; stdy_id="${stdy_id#\"}" |
| 205 | + data_ready="${data_ready%\"}"; data_ready="${data_ready#\"}" |
| 206 | + |
| 207 | + if [[ "${data_ready,,}" == "yes" ]]; then |
| 208 | + download_and_split "$abv_name" "$stdy_id" |
| 209 | + else |
| 210 | + log "${abv_name} ${stdy_id} marked not ready; skipping" |
| 211 | + fi |
| 212 | +done < <(tail -n +2 inputs.csv | csvformat -U 1) |
97 | 213 |
98 | | -aws s3 cp --no-progress s3://avillach-73-bdcatalyst-etl/general/completed/GLOBAL_allConcepts_merged.csv beforeRemoval/GLOBAL_allConcepts.csv & |
| 214 | +# ----------------------------- |
| 215 | +# Download shared inputs (can be parallel) |
| 216 | +# ----------------------------- |
| 217 | +pids=() |
99 | 218 |
100 | | -wait |
| 219 | +log "Downloading HRMN_allConcepts.csv" |
| 220 | +aws s3 cp "s3://${S3_BUCKET}/hrmn/completed/HRMN_allConcepts.csv" \ |
| 221 | + "beforeRemoval/HRMN_allConcepts.csv" --no-progress --only-show-errors & pids+=("$!") |
101 | 222 |
102 | | -java -Xmx64g -DmaximumPoolSize=16 -Djava.util.concurrent.ForkJoinPool.common.parallelism=16 -jar jars/RemoveConsentZeroPatients.jar || exit 255 |
103 | | -java -Xmx64g -jar jars/DbGapDataMerge.jar || exit 255 |
| 223 | +log "Downloading GLOBAL_allConcepts_merged.csv" |
| 224 | +aws s3 cp "s3://${S3_BUCKET}/general/completed/GLOBAL_allConcepts_merged.csv" \ |
| 225 | + "beforeRemoval/GLOBAL_allConcepts.csv" --no-progress --only-show-errors & pids+=("$!") |
104 | 226 |
| 227 | +log "Waiting for shared downloads..." |
| 228 | +wait_all "${pids[@]}" |
105 | 229 |
106 | | -unset AWS_ACCESS_KEY_ID |
107 | | -unset AWS_SECRET_ACCESS_KEY |
108 | | -unset AWS_SESSION_TOKEN |
| 230 | +# ----------------------------- |
| 231 | +# Run jars |
| 232 | +# ----------------------------- |
| 233 | +log "Running RemoveConsentZeroPatients.jar" |
| 234 | +java -Xms32g -Xmx"${HEAP_GB}g" \ |
| 235 | + -DmaximumPoolSize="$PARALLEL" \ |
| 236 | + -Djava.util.concurrent.ForkJoinPool.common.parallelism="$PARALLEL" \ |
| 237 | + -XX:+ExitOnOutOfMemoryError \ |
| 238 | + -XX:+HeapDumpOnOutOfMemoryError \ |
| 239 | + -XX:HeapDumpPath=./oom-heapdump.hprof \ |
| 240 | + -Xlog:gc*,safepoint:file=gc.log:time,level,tags \ |
| 241 | + -jar jars/RemoveConsentZeroPatients.jar |
109 | 242 |
110 | | -aws sts assume-role --duration-seconds 3600 --role-arn arn:aws:iam::736265540791:role/dbgap-etl --role-session-name "s3-test" > assume-role-output.txt |
| 243 | +log "Running DbGapDataMerge.jar" |
| 244 | +java -Xmx"${HEAP_GB}g" -jar jars/DbGapDataMerge.jar |
111 | 245 |
112 | | -export AWS_ACCESS_KEY_ID=`grep AccessKeyId assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"` |
113 | | -export AWS_SECRET_ACCESS_KEY=`grep SecretAccessKey assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"` |
114 | | -export AWS_SESSION_TOKEN=`grep SessionToken assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"` |
115 | | -echo "copying to s3" |
116 | | -aws s3 cp --no-progress completed/allConcepts.csv s3://avillach-73-bdcatalyst-etl/general/completed/allConcepts_data_analyzer.csv |
| 246 | +# ----------------------------- |
| 247 | +# Upload output |
| 248 | +# ----------------------------- |
| 249 | +log "Uploading result to s3" |
| 250 | +aws s3 cp "completed/allConcepts.csv" \ |
| 251 | + "s3://${S3_BUCKET}/general/completed/allConcepts_data_analyzer.csv" \ |
| 252 | + --no-progress --only-show-errors |
117 | 253 |
118 | | -unset AWS_ACCESS_KEY_ID |
119 | | -unset AWS_SECRET_ACCESS_KEY |
120 | | -unset AWS_SESSION_TOKEN</command> |
| 254 | +log "Done"</command> |
121 | 255 | <configuredLocalRules/> |
122 | 256 | </hudson.tasks.Shell> |
123 | 257 | </builders> |