
Commit 66b273b

Author: jenkins (committed)
Jenkins backup: 2025-12-20T01:16:08Z
1 parent 1296b3e, commit 66b273b

File tree

7 files changed: +841 / -317 lines

jenkins-docker/jobs/ETL - All Concept Data Merge with Data Analyzer/config.xml

Lines changed: 181 additions & 47 deletions
@@ -52,72 +52,206 @@
   <concurrentBuild>false</concurrentBuild>
   <builders>
     <hudson.tasks.Shell>
-      <command>#!/bin/bash
+      <command>#!/usr/bin/env bash
 set -euo pipefail

-mkdir beforeRemoval || find beforeRemoval -type f -exec rm -rf {} \;
-mkdir data || find data/ -type f -exec rm -rf {} \;
-mkdir completed || find processing/ -type f -exec rm -rf {} \;
-mkdir processing || find completed -type f -exec rm -rf {} \;
+LOG_TS() { date +"%Y-%m-%dT%H:%M:%S%z"; }
+log()   { echo "[$(LOG_TS)] INFO $*" >&2; }
+warn()  { echo "[$(LOG_TS)] WARN $*" >&2; }
+error() { echo "[$(LOG_TS)] ERROR $*" >&2; }

+cleanup() {
+  local rc=$?
+  local cmd="${BASH_COMMAND:-}"
+  local line="${BASH_LINENO[0]:-}"

-aws sts assume-role --duration-seconds 3600 --role-arn arn:aws:iam::736265540791:role/dbgap-etl --role-session-name "s3-test" > assume-role-output.txt
-
-export AWS_ACCESS_KEY_ID=`grep AccessKeyId assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"`
-export AWS_SECRET_ACCESS_KEY=`grep SecretAccessKey assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"`
-export AWS_SESSION_TOKEN=`grep SessionToken assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"`
+  if [[ $rc -eq 0 ]]; then
+    log "EXIT rc=0 (success). Cleaning AWS env vars."
+  else
+    error "EXIT rc=${rc} at line=${line} cmd=${cmd}. Cleaning AWS env vars."
+  fi

-aws s3 cp ${managed_inputs} .
+  unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN
+}
+on_err() {
+  local rc=$?
+  local line="${BASH_LINENO[0]:-}"
+  local cmd="${BASH_COMMAND:-}"
+  error "ERR rc=${rc} at line=${line} cmd=${cmd}"
+  return $rc
+}
+trap on_err ERR
+trap cleanup EXIT

-aws s3 cp --quiet s3://avillach-73-bdcatalyst-etl/general/data/metadata_new_search.json .
+need() { command -v "$1" >/dev/null 2>&1 || { error "Missing required command: $1"; exit 2; }; }

-csvcut -c "Study Abbreviated Name","Study Identifier","Study Type","Data is ready to process","Data Processed" Managed_Inputs.csv > inputs.csv
+assume_role() {
+  local role_arn="${ROLE_ARN:-arn:aws:iam::736265540791:role/dbgap-etl}"
+  local session_name="${ROLE_SESSION_NAME:-s3-test}"
+  local duration="${ASSUME_DURATION_SECONDS:-3600}"

+  log "Assuming role ${role_arn} (duration=${duration}s)"
+  local assume_json
+  assume_json="$(mktemp)"

-IFS=','
-[ ! -f inputs.csv ]
-while read abv_name stdy_id stdy_type data_ready data_processed
-do
-  if [[ "${data_ready,,}" == "yes" ]]; then
-    aws s3 cp --no-progress s3://avillach-73-bdcatalyst-etl/${abv_name,,}/completed/${stdy_id}/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv --quiet
-    if [[ -z beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv ]]; then
-      echo "No data found for 'ready' study ${abv_name} ${stdy_id}"
-      exit 255
-    fi
-    split -d --line-bytes=250MB beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv && \
-    rm -f beforeRemoval/${stdy_id,,}_allConcepts_new_search_with_data_analyzer.csv && \
-    echo "Downloaded and split ${stdy_id}" &
+  aws sts assume-role \
+    --duration-seconds "$duration" \
+    --role-arn "$role_arn" \
+    --role-session-name "$session_name" \
+    > "$assume_json"
+
+  export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN
+  AWS_ACCESS_KEY_ID="$(jq -r '.Credentials.AccessKeyId' "$assume_json")"
+  AWS_SECRET_ACCESS_KEY="$(jq -r '.Credentials.SecretAccessKey' "$assume_json")"
+  AWS_SESSION_TOKEN="$(jq -r '.Credentials.SessionToken' "$assume_json")"
+  rm -f "$assume_json"
+
+  log "Role assumed"
+}
+
+reset_dir() { rm -rf "$1"; mkdir -p "$1"; }
+
+# Wait for *this script's* background jobs and fail if any failed.
+wait_all() {
+  local pids=("$@")
+  local rc=0 pid
+  for pid in "${pids[@]}"; do
+    if ! wait "$pid"; then
+      rc=1
+      warn "Background job failed (pid=$pid)"
+    fi
+  done
+  return "$rc"
+}
+
+# -----------------------------
+# Prereqs + required inputs
+# -----------------------------
+need aws
+need jq
+need csvcut
+need csvformat
+need split
+need tail
+need java
+
+: "${managed_inputs:?managed_inputs must be set (s3://.../Managed_Inputs.csv)}"
+
+S3_BUCKET="${S3_BUCKET:-avillach-73-bdcatalyst-etl}"
+CHUNK_SIZE="${CHUNK_SIZE:-250m}"          # split byte size
+SPLIT_SUFFIX_LEN="${SPLIT_SUFFIX_LEN:-4}" # avoid "suffixes exhausted"
+HEAP_GB="${HEAP_GB:-64}"
+PARALLEL="${PARALLEL:-16}"
+
+# -----------------------------
+# Workspace
+# -----------------------------
+reset_dir beforeRemoval
+reset_dir data
+reset_dir completed
+reset_dir processing
+
+# -----------------------------
+# Auth once
+# -----------------------------
+assume_role

-  else
-    echo "$abv_name marked not ready for processing in managed inputs"
-  fi
-done < inputs.csv
+# -----------------------------
+# Inputs
+# -----------------------------
+log "Downloading managed inputs: ${managed_inputs}"
+aws s3 cp "$managed_inputs" ./Managed_Inputs.csv --no-progress --only-show-errors

-aws s3 cp --no-progress s3://avillach-73-bdcatalyst-etl/hrmn/completed/HRMN_allConcepts.csv beforeRemoval/HRMN_allConcepts.csv &
+log "Downloading metadata_new_search.json"
+aws s3 cp "s3://${S3_BUCKET}/general/data/metadata_new_search.json" ./metadata_new_search.json --no-progress --only-show-errors
+
+log "Creating inputs.csv"
+csvcut -c "Study Abbreviated Name","Study Identifier","Study Type","Data is ready to process","Data Processed" \
+  Managed_Inputs.csv > inputs.csv
+
+# -----------------------------
+# Download + split per study
+# NOTE: no backgrounding here; ensures split completes before downstream jars.
+# -----------------------------
+download_and_split() {
+  local abv_name="$1" stdy_id="$2"
+  local abv_lc stdy_lc
+  abv_lc="$(printf '%s' "$abv_name" | tr '[:upper:]' '[:lower:]')"
+  stdy_lc="$(printf '%s' "$stdy_id" | tr '[:upper:]' '[:lower:]')"
+
+  local s3_src="s3://${S3_BUCKET}/${abv_lc}/completed/${stdy_id}/${stdy_lc}_allConcepts_new_search_with_data_analyzer.csv"
+  local dst="beforeRemoval/${stdy_lc}_allConcepts_new_search_with_data_analyzer.csv"
+
+  log "Downloading ${stdy_id} allConcepts"
+  aws s3 cp "$s3_src" "$dst" --no-progress --only-show-errors
+
+  if [[ ! -s "$dst" ]]; then
+    error "Downloaded file missing/empty for ready study ${abv_name} ${stdy_id}: ${dst}"
+    return 255
+  fi
+
+  log "Splitting ${stdy_id} (~${CHUNK_SIZE} chunks)"
+  # Produces ${dst}.0000, ${dst}.0001, ... (numeric suffix, plenty of space)
+  split -d -a "$SPLIT_SUFFIX_LEN" -b "$CHUNK_SIZE" "$dst" "${dst}."
+  rm -f "$dst"
+
+  log "Downloaded and split ${stdy_id}"
+}
+
+while IFS=',' read -r abv_name stdy_id stdy_type data_ready data_processed; do
+  # Trim potential quotes/spaces
+  abv_name="${abv_name%\"}"; abv_name="${abv_name#\"}"
+  stdy_id="${stdy_id%\"}"; stdy_id="${stdy_id#\"}"
+  data_ready="${data_ready%\"}"; data_ready="${data_ready#\"}"
+
+  if [[ "${data_ready,,}" == "yes" ]]; then
+    download_and_split "$abv_name" "$stdy_id"
+  else
+    log "${abv_name} ${stdy_id} marked not ready; skipping"
+  fi
+done < <(tail -n +2 inputs.csv | csvformat -U 1)

-aws s3 cp --no-progress s3://avillach-73-bdcatalyst-etl/general/completed/GLOBAL_allConcepts_merged.csv beforeRemoval/GLOBAL_allConcepts.csv &
+# -----------------------------
+# Download shared inputs (can be parallel)
+# -----------------------------
+pids=()

-wait
+log "Downloading HRMN_allConcepts.csv"
+aws s3 cp "s3://${S3_BUCKET}/hrmn/completed/HRMN_allConcepts.csv" \
+  "beforeRemoval/HRMN_allConcepts.csv" --no-progress --only-show-errors & pids+=("$!")

-java -Xmx64g -DmaximumPoolSize=16 -Djava.util.concurrent.ForkJoinPool.common.parallelism=16 -jar jars/RemoveConsentZeroPatients.jar || exit 255
-java -Xmx64g -jar jars/DbGapDataMerge.jar || exit 255
+log "Downloading GLOBAL_allConcepts_merged.csv"
+aws s3 cp "s3://${S3_BUCKET}/general/completed/GLOBAL_allConcepts_merged.csv" \
+  "beforeRemoval/GLOBAL_allConcepts.csv" --no-progress --only-show-errors & pids+=("$!")

+log "Waiting for shared downloads..."
+wait_all "${pids[@]}"

-unset AWS_ACCESS_KEY_ID
-unset AWS_SECRET_ACCESS_KEY
-unset AWS_SESSION_TOKEN
+# -----------------------------
+# Run jars
+# -----------------------------
+log "Running RemoveConsentZeroPatients.jar"
+java -Xms32g -Xmx"${HEAP_GB}g" \
+  -DmaximumPoolSize="$PARALLEL" \
+  -Djava.util.concurrent.ForkJoinPool.common.parallelism="$PARALLEL" \
+  -XX:+ExitOnOutOfMemoryError \
+  -XX:+HeapDumpOnOutOfMemoryError \
+  -XX:HeapDumpPath=./oom-heapdump.hprof \
+  -Xlog:gc*,safepoint:file=gc.log:time,level,tags \
+  -jar jars/RemoveConsentZeroPatients.jar

-aws sts assume-role --duration-seconds 3600 --role-arn arn:aws:iam::736265540791:role/dbgap-etl --role-session-name "s3-test" > assume-role-output.txt
+log "Running DbGapDataMerge.jar"
+java -Xmx"${HEAP_GB}g" -jar jars/DbGapDataMerge.jar

-export AWS_ACCESS_KEY_ID=`grep AccessKeyId assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"`
-export AWS_SECRET_ACCESS_KEY=`grep SecretAccessKey assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"`
-export AWS_SESSION_TOKEN=`grep SessionToken assume-role-output.txt | cut -d ':' -f 2 | sed "s/[ ,\"]//g"`
-echo "copying to s3"
-aws s3 cp --no-progress completed/allConcepts.csv s3://avillach-73-bdcatalyst-etl/general/completed/allConcepts_data_analyzer.csv
+# -----------------------------
+# Upload output
+# -----------------------------
+log "Uploading result to s3"
+aws s3 cp "completed/allConcepts.csv" \
+  "s3://${S3_BUCKET}/general/completed/allConcepts_data_analyzer.csv" \
+  --no-progress --only-show-errors

-unset AWS_ACCESS_KEY_ID
-unset AWS_SECRET_ACCESS_KEY
-unset AWS_SESSION_TOKEN</command>
+log "Done"</command>
       <configuredLocalRules/>
     </hudson.tasks.Shell>
   </builders>
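
The jq-based credential extraction in the new assume_role function can be sanity-checked without touching AWS. A minimal sketch, assuming only that aws sts assume-role returns its usual Credentials object; the JSON below is a made-up placeholder, not a real response:

#!/usr/bin/env bash
set -euo pipefail

# Fake STS response in the same shape `aws sts assume-role` emits (placeholder values only).
sample="$(mktemp)"
cat > "$sample" <<'EOF'
{
  "Credentials": {
    "AccessKeyId": "ASIAEXAMPLEKEYID",
    "SecretAccessKey": "exampleSecretKey",
    "SessionToken": "exampleSessionToken",
    "Expiration": "2025-12-20T02:16:08Z"
  }
}
EOF

# Same extraction the job now performs: one jq path per field, no grep/cut/sed text scraping.
AWS_ACCESS_KEY_ID="$(jq -r '.Credentials.AccessKeyId' "$sample")"
AWS_SECRET_ACCESS_KEY="$(jq -r '.Credentials.SecretAccessKey' "$sample")"
AWS_SESSION_TOKEN="$(jq -r '.Credentials.SessionToken' "$sample")"
export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN

echo "AccessKeyId -> ${AWS_ACCESS_KEY_ID}"   # prints ASIAEXAMPLEKEYID
rm -f "$sample"

Unlike the old grep/cut/sed pipeline, this parses the response as JSON, so it does not depend on the CLI's line layout or on stripping commas and quotes by hand.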

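A related behavioral change: the old script ended its parallel downloads with a bare wait, which reports success regardless of how the background aws s3 cp jobs exited, whereas wait_all waits on each recorded PID and propagates any failure. A standalone sketch of that pattern; false is just a stand-in for a failed download:

#!/usr/bin/env bash

# Collect PIDs of background jobs, then wait on each one individually.
pids=()
sleep 1 & pids+=("$!")   # a job that succeeds
false   & pids+=("$!")   # a job that fails (stand-in for a broken download)

rc=0
for pid in "${pids[@]}"; do
  if ! wait "$pid"; then
    rc=1
    echo "background job failed (pid=$pid)" >&2
  fi
done

echo "overall rc=$rc"    # rc=1: the failure is reported instead of being swallowed
exit "$rc"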