-
Notifications
You must be signed in to change notification settings - Fork 516
Expand file tree
/
Copy pathoozie.sh
More file actions
executable file
·710 lines (627 loc) · 33 KB
/
oozie.sh
File metadata and controls
executable file
·710 lines (627 loc) · 33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
#!/bin/bash
#
# Copyright 2015,2016,2017,2018,2019,2020,2023 Google LLC and contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Initialization action for installing Apache Oozie on a Google Cloud
# Dataproc cluster. This script will install and configure Oozie to run on the
# master node of a Dataproc cluster. The version of Oozie which is installed
# comes from the BigTop repository.
#
# You can find more information about Oozie at http://oozie.apache.org/
# For more information on init actions and Google Cloud Dataproc, see the Cloud
# Dataproc documentation at https://cloud.google.com/dataproc/init-actions
#
# This script should run in under a few minutes
# Strict mode: abort on command failure (-e), on use of unset variables (-u)
# and on failure of any pipeline stage (pipefail); -x traces every command.
# NOTE(review): with -x enabled the trace also contains the expanded values of
# the password variables assigned below - confirm where the trace is stored.
set -euxo pipefail
# Distributor ID reported by lsb_release, lower-cased (compared against
# "rocky", "ubuntu" and "debian" further down).
OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
# ID and VERSION_ID from /etc/os-release, concatenated.
# NOTE(review): not referenced anywhere else in the visible script.
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
readonly OS_NAME
# Values of the dataproc-master and dataproc-role metadata attributes.
readonly master_node=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
# Base URL used for every jar downloaded from Maven Central by this script.
readonly MAVEN_CENTRAL_URI=https://maven-central.storage-download.googleapis.com/maven2
# Use Python from /usr/bin instead of /opt/conda.
export PATH=/usr/bin:$PATH
# Detect dataproc image version from its various names:
# fall back to DATAPROC_VERSION when DATAPROC_IMAGE_VERSION is not set.
if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then
DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
fi
# Select the curator jar version and the directory it is copied from (consumed
# by the curator workaround in install_oozie). Any other image version aborts.
# Note: if neither variable above is set, the expansion below fails under
# 'set -u' before the '*' arm can print its message.
case "${DATAPROC_IMAGE_VERSION}" in
"1.3" | "1.4" | "1.5" | "2.0" )
curator_version="2.13.0"
curator_src="/usr/lib/hadoop/lib"
;;
"2.1" | "2.2")
curator_version="2.13.0"
curator_src="/usr/lib/spark/jars"
;;
*)
echo "unsupported DATAPROC_IMAGE_VERSION: ${DATAPROC_IMAGE_VERSION}" >&2
exit 1
;;
esac
# Use Python from /usr/bin instead of /opt/conda.
# (Same export as above; it prepends /usr/bin a second time.)
export PATH=/usr/bin:$PATH
# Optional settings read from instance metadata. Because each value is
# assigned as part of 'export', a failing lookup does not abort the script;
# lookups without an '|| echo' default simply yield whatever the command
# printed (possibly nothing).
export METADATA_HTTP_PROXY=$(/usr/share/google/get_metadata_value attributes/http-proxy)
export METADATA_EMAIL_SMTP_HOST=$(/usr/share/google/get_metadata_value attributes/email-smtp-host)
export METADATA_EMAIL_FROM_ADDRESS=$(/usr/share/google/get_metadata_value attributes/email-from-address)
export MYSQL_ROOT_USERNAME=$(/usr/share/google/get_metadata_value attributes/mysql-root-username || echo "root")
export OOZIE_DB_NAME=$(/usr/share/google/get_metadata_value attributes/oozie-db-name || echo "oozie")
export OOZIE_DB_USERNAME=$(/usr/share/google/get_metadata_value attributes/oozie-db-username || echo "oozie")
export OOZIE_PASSWORD_SECRET_NAME=$(/usr/share/google/get_metadata_value attributes/oozie-password-secret-name || echo "secret-name")
export OOZIE_PASSWORD_SECRET_VERSION=$(/usr/share/google/get_metadata_value attributes/oozie-password-secret-version || echo 1)
# Oozie database password: read via gcloud from the named secret; when that
# command fails the literal "oozie-password" is used instead.
# This lookup runs before the proxy variables below are exported.
export OOZIE_PASSWORD=$(gcloud secrets versions access --secret ${OOZIE_PASSWORD_SECRET_NAME} ${OOZIE_PASSWORD_SECRET_VERSION} || echo oozie-password)
# Export the http-proxy metadata value (possibly empty) under every common
# proxy variable name, and exclude the metadata server from proxying.
export http_proxy="${METADATA_HTTP_PROXY}"
export https_proxy="${METADATA_HTTP_PROXY}"
export HTTP_PROXY="${METADATA_HTTP_PROXY}"
export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
export no_proxy=metadata.google.internal
export NO_PROXY=metadata.google.internal
export MYSQL_ROOT_PASSWORD_SECRET_NAME=$(/usr/share/google/get_metadata_value attributes/mysql-root-password-secret-name)
export MYSQL_ROOT_PASSWORD_SECRET_VERSION=$(/usr/share/google/get_metadata_value attributes/mysql-root-password-secret-version || echo 1)
# MySQL root password, in order of preference: the named secret, then the
# 'password=' entry of /etc/mysql/my.cnf, then the literal "root-password".
export MYSQL_ROOT_PASSWORD=$(gcloud secrets versions access --secret ${MYSQL_ROOT_PASSWORD_SECRET_NAME} ${MYSQL_ROOT_PASSWORD_SECRET_VERSION} || \
grep 'password=' /etc/mysql/my.cnf | sed 's/^.*=//' || echo root-password)
# Number of live HDFS datanodes; stays 0 until await_hdfs_datanodes updates it.
# HDFS-dependent steps are skipped while it is 0.
NUM_LIVE_DATANODES=0
#######################################
# Blank out references to archived Debian backports repositories.
#
# This script uses 'apt-get update' and is therefore potentially dependent on
# backports repositories which have been archived. To mitigate this, every APT
# source file that mentions a backports suite but none belonging to the current
# 'stable' or 'oldstable' release has all of its backports lines emptied.
# https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
#
# Arguments:
#   $@ - optional files/directories to scan (default: /etc/apt/sources.list*)
# Outputs:
#   Prints the up-to-date backports lines it finds (kept for the install log).
# Returns:
#   0 on success, including when nothing had to be changed.
#######################################
function remove_old_backports {
  local -a targets=("$@")
  if (( ${#targets[@]} == 0 )); then
    targets=(/etc/apt/sources.list*)
  fi

  local oldstable stable
  oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}')
  stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}')

  # Without both codenames we cannot tell old suites from current ones, so
  # leave every file untouched rather than guess.
  if [[ -z "${oldstable}" || -z "${stable}" ]]; then
    echo "could not determine Debian stable/oldstable codenames; backports entries left untouched" >&2
    return 0
  fi

  # One array element per matching file. Reading through process substitution
  # also keeps a "no match" exit status from grep from aborting the script
  # under 'set -e'.
  local -a matched_files=()
  mapfile -t matched_files < <(grep -rsil -e '-backports' -- "${targets[@]}")
  if (( ${#matched_files[@]} == 0 )); then
    return 0
  fi

  local filename
  for filename in "${matched_files[@]}"; do
    # A file that references a current backports suite is kept as it is.
    if ! grep -e "${oldstable}-backports" -e "${stable}-backports" -- "${filename}"; then
      sed -i -e 's/^.*-backports.*$//' -- "${filename}"
    fi
  done
}
#######################################
# Wait for HDFS to report at least one live datanode.
#
# Polls 'hdfs dfsadmin -report -live' up to 10 times. The pause before the
# next poll grows by 5 seconds each round (0s, 5s, 10s, ...). A failing
# dfsadmin call counts as "not online yet" instead of aborting the script.
#
# Globals:
#   NUM_LIVE_DATANODES (read/written) - set to the reported count on success;
#                                       left at 0 when HDFS never comes online
# Outputs:
#   Writes "hdfs did not come online" to stderr on timeout.
# Returns:
#   0 once a live datanode is seen, 1 otherwise.
#######################################
function await_hdfs_datanodes() {
  local -r max_tries=10
  local tryno=0
  local live=''

  NUM_LIVE_DATANODES="${NUM_LIVE_DATANODES:-0}"
  while (( NUM_LIVE_DATANODES <= 0 && tryno < max_tries )); do
    # Extract N from the "Live datanodes (N):" line of the report. The
    # '|| live=' keeps errexit/pipefail from killing the script while HDFS is
    # still starting up.
    live=$(sudo -u hdfs hdfs dfsadmin -report -live |
      sed -n 's/^Live.*(\([0-9][0-9]*\)):.*/\1/p') || live=''
    if [[ "${live}" =~ ^[0-9]+$ ]] && (( 10#${live} > 0 )); then
      NUM_LIVE_DATANODES="${live}"
      break
    fi
    # No point in sleeping after the final attempt.
    if (( tryno + 1 < max_tries )); then
      sleep "$(( tryno * 5 ))s"
    fi
    tryno=$(( tryno + 1 ))
  done

  # Decide on the datanode count, not on the attempt counter, so a success on
  # the last attempt is not reported as a failure.
  if (( NUM_LIVE_DATANODES <= 0 )); then
    echo "hdfs did not come online" >&2
    return 1
  fi
}
#######################################
# Write one property into Oozie's site configuration via bdconfig,
# overwriting any existing value (--clobber).
# Arguments:
#   $1 - property name
#   $2 - property value
#######################################
function set_oozie_property() {
  local -r site_file='/etc/oozie/conf/oozie-site.xml'
  local -r key="$1"
  local -r value="$2"
  /usr/local/bin/bdconfig set_property --configuration_file "${site_file}" --name "${key}" --value "${value}" --clobber
}
#######################################
# Write one property into Hadoop's core-site.xml via bdconfig,
# overwriting any existing value (--clobber).
# Arguments:
#   $1 - property name
#   $2 - property value
#######################################
function set_hadoop_property() {
  local -r site_file='/etc/hadoop/conf/core-site.xml'
  local -r key="$1"
  local -r value="$2"
  /usr/local/bin/bdconfig set_property --configuration_file "${site_file}" --name "${key}" --value "${value}" --clobber
}
#######################################
# Run a shell command line until it succeeds, at most 10 times.
#
# The first retry is immediate; after that the pause grows by 5 seconds per
# failed attempt (0s, 5s, 10s, ...). No pause follows the last attempt.
#
# Arguments:
#   $1 - command line, evaluated with 'eval' (callers pass trusted strings
#        built inside this script)
# Returns:
#   0 as soon as the command succeeds, 1 if every attempt failed.
#######################################
function retry_command() {
  local cmd="$1"
  local -r max_attempts=10
  # Keep the counter local so a caller's own loop variable is not overwritten.
  local attempt
  for (( attempt = 0; attempt < max_attempts; attempt++ )); do
    if eval "$cmd"; then
      return 0
    fi
    if (( attempt + 1 < max_attempts )); then
      sleep "$(( attempt * 5 ))"
    fi
  done
  return 1
}
#######################################
# Print the smaller of two dotted version strings.
# Only the first three dot-separated fields are compared, numerically.
# Arguments:
#   $1, $2 - version strings such as 5.0.0
# Outputs:
#   The lower version on stdout.
#######################################
function min_version() {
  # Sort descending, then take the last line: that is the minimum.
  printf '%b\n%b\n' "$1" "$2" |
    sort -rn -t . -k 1,1 -k 2,2 -k 3,3 |
    tail -n 1
}
#######################################
# Enable HTTPS for the Oozie server and point the Oozie client at it.
#
# On the node whose short hostname equals ${master_node}, a key pair,
# certificate and truststore are created in the oozie user's home directory
# (each only if its file is missing) and, when datanodes are live, uploaded to
# HDFS under /tmp. Every other master downloads those files from HDFS instead.
#
# Globals:
#   master_node, NUM_LIVE_DATANODES (read)
#######################################
function configure_ssl() {
  # Assignments are deliberately combined with 'local' so that a non-zero exit
  # status of the lookup does not abort the script under 'set -e'.
  local home_dir=$(getent passwd oozie home | cut -f 6 -d :)
  local dns_domain=$(hostname -d)
  local store_pass="password"
  local keystore="${home_dir}/.keystore"
  local truststore="${home_dir}/oozie.truststore"
  local cert_file="${home_dir}/certificate.cert"

  if [[ "$(hostname -s)" != "${master_node}" ]]; then
    if [[ ${NUM_LIVE_DATANODES} != 0 ]]; then
      echo "Secondary master; attempting to copy SSL files (truststore, keystore, certificate) from HDFS."
      retry_command "hdfs dfs -get /tmp/oozie.truststore ${truststore}"
      retry_command "hdfs dfs -get /tmp/oozie.keystore ${keystore}"
      retry_command "hdfs dfs -get /tmp/oozie.certificate ${cert_file}"
    fi
  else
    # Generate whichever of the three artifacts does not exist yet.
    if ! test -f ${keystore}; then
      sudo -u oozie keytool -genkeypair -alias jetty -file ${keystore} \
        -keyalg RSA -dname "CN=*.${dns_domain}" \
        -storepass "${store_pass}" -keypass "${store_pass}"
    fi
    if ! test -f ${cert_file}; then
      sudo -u oozie keytool -exportcert -alias jetty -file "${cert_file}" \
        -storepass "${store_pass}"
    fi
    if ! test -f ${truststore}; then
      sudo -u oozie keytool -import -noprompt -alias jetty -file "${cert_file}" \
        -keystore "${truststore}" -storepass "${store_pass}"
    fi
    # Publish the files so the other masters can fetch them.
    if [[ ${NUM_LIVE_DATANODES} != 0 ]]; then
      retry_command "hdfs dfs -put -f ${cert_file} /tmp/oozie.certificate"
      retry_command "hdfs dfs -put -f ${keystore} /tmp/oozie.keystore"
      retry_command "hdfs dfs -put -f ${truststore} /tmp/oozie.truststore"
    fi
  fi

  # Oozie client: trust the generated certificate and use the HTTPS URL.
  {
    echo "export OOZIE_CLIENT_OPTS='-Djavax.net.ssl.trustStore=${truststore}'"
    echo "export OOZIE_URL='https://$(hostname -f):11443/oozie'"
  } >> /usr/lib/oozie/conf/oozie-client-env.sh

  # Oozie server: switch on HTTPS and register the key/trust stores.
  set_oozie_property 'oozie.https.enabled' 'true'
  set_oozie_property 'oozie.https.keystore.file' "${keystore}"
  set_oozie_property 'oozie.https.keystore.pass' "${store_pass}"
  set_oozie_property 'oozie.https.truststore.file' "${truststore}"
}
#######################################
# Install and configure Apache Oozie on the current (master) node.
#
# Steps, in order:
#   1. install the Oozie packages for the detected OS;
#   2. clean up conflicting logging / Jetty jars in /usr/lib/oozie;
#   3. on the first master: stage the web-console ext library and the Oozie
#      sharelib, and upload the sharelib to HDFS when /user/oozie is missing;
#   4. write oozie-site.xml / core-site.xml properties (optionally SSL);
#   5. on the first master: create the MySQL database and user, upload extra
#      Spark/Hive jars to the HDFS sharelib, create the Oozie schema;
#   6. configure ZooKeeper-backed HA when additional masters are present;
#   7. restart ZooKeeper, HDFS, YARN and Oozie and leave HDFS safe mode.
#
# Globals (read):
#   OS_NAME, DATAPROC_IMAGE_VERSION, MAVEN_CENTRAL_URI, master_node,
#   OOZIE_DB_NAME, OOZIE_DB_USERNAME, OOZIE_PASSWORD, MYSQL_ROOT_USERNAME,
#   MYSQL_ROOT_PASSWORD, METADATA_EMAIL_SMTP_HOST, METADATA_EMAIL_FROM_ADDRESS,
#   curator_version, curator_src
# Globals (read, updated through await_hdfs_datanodes):
#   NUM_LIVE_DATANODES
# Returns:
#   Exits the script with status 1 on an unsupported OS or image version.
#######################################
function install_oozie() {
  local enable_ssl
  enable_ssl=$(/usr/share/google/get_metadata_value attributes/oozie-enable-ssl || echo "false")

  # Upgrade the repository and install Oozie
  if [[ ${OS_NAME} == rocky ]]; then
    retry_command "dnf -y -v install oozie"
    # unzip does not come pre-installed on the 2.1-rocky8 image
    if [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 2.1" | bc -l) == 1 ]]; then
      retry_command "dnf -y install unzip"
      find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
      cp /usr/lib/hadoop/lib/hadoop-shaded-guava-1.1.1.jar /usr/lib/oozie/lib
    fi
    # add mysql service dependency on oozie service
    sed -i '/^# Required-Start:/ s/$/ mysqld.service/' /etc/init.d/oozie
    # setup symlinks for hadoop jar dependencies
    ln -sf /usr/lib/hadoop/hadoop-common.jar \
      /usr/lib/hadoop/hadoop-auth.jar \
      /usr/lib/hadoop/hadoop-annotations.jar \
      /usr/lib/hadoop-hdfs/hadoop-hdfs-client.jar \
      /usr/lib/hadoop-yarn/hadoop-yarn-common.jar \
      /usr/lib/hadoop-yarn/hadoop-yarn-client.jar \
      /usr/lib/hadoop-yarn/hadoop-yarn-server-common.jar \
      /usr/lib/hadoop-yarn/hadoop-yarn-api.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-jobclient.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-app.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-common.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-shuffle.jar /usr/lib/oozie/lib/
  elif [[ ${OS_NAME} == ubuntu ]] || [[ ${OS_NAME} == debian ]]; then
    retry_command "apt-get install -y gnupg2 && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys B7B3B788A8D3785C"
    retry_command "apt-get update --allow-releaseinfo-change"
    retry_command "apt-get install -q -y oozie oozie-client"
  else
    echo "Unsupported OS: '${OS_NAME}'" >&2
    exit 1
  fi

  # For Oozie, remove Log4j 2 jar not compatible with Log4j 1 that was brought by Hive 2
  find /usr/lib/oozie/lib -name "log4j-1.2-api*.jar" -delete
  # Delete redundant Slf4j backend implementation
  find /usr/lib/oozie/lib -name "slf4j-simple*.jar" -delete
  find /usr/lib/oozie/lib -name "log4j-slf4j-impl*.jar" -delete

  # Redirect Log4j2 logging to Slf4j backend: derive the Log4j2 version from
  # the log4j-core jar name and fetch the matching log4j-to-slf4j bridge.
  local log4j2_version
  log4j2_version=$(
    find /usr/lib/oozie/lib -name "log4j-core*-2.*.jar" | cut -d '/' -f 6 | cut -d '-' -f 3
  )
  log4j2_version=${log4j2_version/.jar/}
  if [[ -n ${log4j2_version} ]]; then
    local log4j2_to_slf4j=log4j-to-slf4j-${log4j2_version}.jar
    local log4j2_to_slf4j_url=${MAVEN_CENTRAL_URI}/org/apache/logging/log4j/log4j-to-slf4j/${log4j2_version}/${log4j2_to_slf4j}
    wget -nv --timeout=30 --tries=5 --retry-connrefused "${log4j2_to_slf4j_url}" -P /usr/lib/oozie/lib
  fi

  # Delete old versions of Jetty jars brought in by dependencies
  find /usr/lib/oozie/ -name "jetty*-6.*.jar" -delete

  local oozie_version
  oozie_version=$(oozie version 2>&1 |
    sed -n 's/.*Oozie[^:]\+:[[:blank:]]\+\([0-9]\+\.[0-9]\.[0-9]\+\+\).*/\1/p' | head -n1)
  # Oozie 5.0.0 or newer: the Jetty 7 jars are removed as well.
  if [[ $(min_version '5.0.0' "${oozie_version}") == 5.0.0 ]]; then
    find /usr/lib/oozie/ -name "jetty*-7.*.jar" -delete
  fi

  # Staging directory for the web-console library and the extracted sharelib.
  # It is created once here and reused by the Spark/Hive dependency upload
  # further down, so the archive is downloaded and extracted a single time.
  local tmp_dir
  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    tmp_dir=$(mktemp -d -t oozie-install-XXXX)

    # The ext library is needed to enable the Oozie web console
    wget -nv --timeout=30 --tries=5 --retry-connrefused \
      http://archive.cloudera.com/gplextras/misc/ext-2.2.zip -P "${tmp_dir}"
    unzip -o -q "${tmp_dir}/ext-2.2.zip" -d /var/lib/oozie

    # Install share lib
    tar -xzf /usr/lib/oozie/oozie-sharelib.tar.gz -C "${tmp_dir}"

    if [[ $(min_version '5.0.0' "${oozie_version}") != 5.0.0 ]]; then
      # Workaround to issue where jackson 1.8 and 1.9 jars are found on the classpath, causing
      # AbstractMethodError at runtime. We know hadoop/lib has matching versions of jackson.
      rm -f "${tmp_dir}"/share/lib/hive2/jackson-*
      cp /usr/lib/hadoop/lib/jackson-* "${tmp_dir}/share/lib/hive2/"
    fi

    # The sharelib is uploaded only when /user/oozie does not exist yet. This
    # is also the only place NUM_LIVE_DATANODES gets updated, so the later
    # HDFS steps run only after a first-time upload.
    if ! hdfs dfs -test -d "/user/oozie"; then
      await_hdfs_datanodes
      if [[ ${NUM_LIVE_DATANODES} != 0 ]]; then
        hadoop fs -mkdir -p /user/oozie/
        hadoop fs -put -f "${tmp_dir}/share" /user/oozie/
        if grep '^dataproc' /etc/passwd ; then
          local hdfs_username=dataproc
        else
          local hdfs_username=hdfs
        fi
        sudo -u ${hdfs_username} hadoop fs -chown oozie /user/oozie
      fi
    fi
  fi

  # Link the MySQL JDBC driver to the Oozie library directory
  ln -sf /usr/share/java/mysql.jar /usr/lib/oozie/lib/mysql.jar

  # Set JDBC properties
  local mysql_host
  mysql_host=$(/usr/share/google/get_metadata_value attributes/dataproc-master)

  if [[ "${enable_ssl}" == 'true' ]]; then
    configure_ssl
  fi

  # NOTE(review): the passwords used below end up in the 'set -x' trace and on
  # command lines - consider disabling tracing around them.
  set_oozie_property 'oozie.service.JPAService.jdbc.driver' "com.mysql.cj.jdbc.Driver"
  # Use the same database name and user that are created in MySQL below, so
  # that non-default oozie-db-name / oozie-db-username metadata values work.
  set_oozie_property 'oozie.service.JPAService.jdbc.url' "jdbc:mysql://${mysql_host}/${OOZIE_DB_NAME}"
  set_oozie_property 'oozie.service.JPAService.jdbc.username' "${OOZIE_DB_USERNAME}"
  set_oozie_property 'oozie.service.JPAService.jdbc.password' "${OOZIE_PASSWORD}"
  set_oozie_property 'oozie.email.smtp.host' "${METADATA_EMAIL_SMTP_HOST}"
  set_oozie_property 'oozie.email.from.address' "${METADATA_EMAIL_FROM_ADDRESS}"
  set_oozie_property 'oozie.action.max.output.data' "20000"

  # Set hostname to allow connection from other hosts (not only localhost)
  set_oozie_property 'oozie.http.hostname' "$(hostname -s)"

  # Following property was requested in customer case
  set_oozie_property 'oozie.service.WorkflowAppService.WorkflowDefinitionMaxLength' "1500000"

  # Following 2 properties added for customer case
  set_oozie_property 'oozie.service.URIHandlerService.uri.handlers' "org.apache.oozie.dependency.FSURIHandler,org.apache.oozie.dependency.HCatURIHandler"
  set_oozie_property 'oozie.credentials.credentialclasses' \
    "hcat=org.apache.oozie.action.hadoop.HCatCredentials,hive2=org.apache.oozie.action.hadoop.Hive2Credentials,hbase=org.apache.oozie.action.hadoop.HbaseCredentials"

  # Following 4 properties provided by customer platform team for CEAM - Oozie to HCat integration
  set_oozie_property 'oozie.services.ext' "org.apache.oozie.service.PartitionDependencyManagerService,org.apache.oozie.service.HCatAccessorService"
  set_oozie_property 'oozie.service.HCatAccessorService.hcat.configuration' "/etc/hive/conf.dist/hive-site.xml"
  set_oozie_property 'oozie.service.coord.input.check.requeue.interval' "120000"
  set_oozie_property 'oozie.service.coord.push.check.requeue.interval' "120000"

  # Following properties were added for materialization issues observed in the NDL data lake
  set_oozie_property 'oozie.service.PurgeService.purge.interval' "86400"
  set_oozie_property 'oozie.service.CallableQueueService.threads' "100"
  set_oozie_property 'oozie.service.CallableQueueService.callable.concurrency' "50"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.lookup.interval' "300"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.scheduling.interval' "60"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.materialization.window' "1500"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.callable.batch.size' "10"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.materialization.system.limit' "150"
  set_oozie_property 'oozie.service.JPAService.pool.max.active.conn' "50"
  set_oozie_property 'oozie.service.StatusTransitService.backward.support.for.states.without.error' "false"
  set_oozie_property 'oozie.service.ActionCheckerService.action.check.delay' "300"
  set_oozie_property 'oozie.action.retry.policy' "exponential"

  # Hadoop must allow impersonation for Oozie to work properly
  set_hadoop_property 'hadoop.proxyuser.oozie.groups' '*'
  set_hadoop_property 'hadoop.proxyuser.oozie.hosts' '*'
  set_oozie_property 'oozie.service.HadoopAccessorService.supported.filesystems' 'hdfs,gs'
  set_oozie_property 'fs.AbstractFileSystem.gs.impl' 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS'

  # https://biconsult.ru/files/new2/Data%20Lake%20for%20Enterprises.pdf page 784
  set_oozie_property 'oozie.service.ProxyUserService.proxyuser.oozie.hosts' '*'
  set_oozie_property 'oozie.service.ProxyUserService.proxyuser.oozie.groups' '*'

  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    # Create the Oozie user in MySQL. Do this before the copies, since other
    # masters may start up and attempt to connect before the HDFS copies
    # below complete. The other masters need to be able to connect to MySQL.
    retry_command "/usr/bin/mysql -u ${MYSQL_ROOT_USERNAME} --password='${MYSQL_ROOT_PASSWORD}' -e 'use ${OOZIE_DB_NAME}' || /usr/bin/mysqladmin -u ${MYSQL_ROOT_USERNAME} --password='${MYSQL_ROOT_PASSWORD}' create ${OOZIE_DB_NAME}"
    /usr/bin/mysql -u ${MYSQL_ROOT_USERNAME} --password="${MYSQL_ROOT_PASSWORD}" <<EOM
CREATE USER IF NOT EXISTS '${OOZIE_DB_USERNAME}'@'%' IDENTIFIED BY '${OOZIE_PASSWORD}';
GRANT ALL PRIVILEGES ON ${OOZIE_DB_NAME}.* TO '${OOZIE_DB_USERNAME}'@'%';
FLUSH PRIVILEGES;
EOM
  fi

  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    # start - copy spark and hive dependencies
    # ADDITIONAL_JARS is a space-separated list of paths/globs; it is expanded
    # unquoted on purpose when handed to 'hadoop fs -put' below.
    local ADDITIONAL_JARS=""
    if [[ ${OS_NAME} == rocky ]]; then
      if [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 2.1" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/spark-hadoop-cloud*.jar /usr/lib/spark/jars/hadoop-cloud-storage-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.5" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/spark-hadoop-cloud*.jar /usr/lib/spark/jars/hadoop-cloud-storage-*.jar /usr/lib/spark/jars/re2j-1.1.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/htrace-core4-*-incubating.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.4" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/htrace-core4-*-incubating.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
        find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
        wget -P /usr/lib/oozie/lib ${MAVEN_CENTRAL_URI}/com/google/guava/guava/11.0.2/guava-11.0.2.jar
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 1.3" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
        # NOTE(review): this reset discards the list built just above in this
        # branch; kept as found - confirm whether it is intentional.
        ADDITIONAL_JARS=""
        find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
        wget -P /usr/lib/oozie/lib ${MAVEN_CENTRAL_URI}/com/google/guava/guava/11.0.2/guava-11.0.2.jar
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.2" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS=""
        find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
        wget -P /usr/lib/oozie/lib ${MAVEN_CENTRAL_URI}/com/google/guava/guava/11.0.2/guava-11.0.2.jar
      else
        echo "unsupported DATAPROC_IMAGE_VERSION: ${DATAPROC_IMAGE_VERSION}" >&2
        exit 1
      fi
    else
      if [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 2.1" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/spark-hadoop-cloud*.jar /usr/lib/spark/jars/hadoop-cloud-storage-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.5" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/spark-hadoop-cloud*.jar /usr/lib/spark/jars/hadoop-cloud-storage-*.jar /usr/lib/spark/jars/re2j-1.1.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/htrace-core4-*-incubating.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.4" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/htrace-core4-*-incubating.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
        find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
        wget -P /usr/lib/oozie/lib ${MAVEN_CENTRAL_URI}/com/google/guava/guava/11.0.2/guava-11.0.2.jar
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.3" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.2" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS=""
      else
        echo "unsupported DATAPROC_IMAGE_VERSION: ${DATAPROC_IMAGE_VERSION}" >&2
        exit 1
      fi
    fi

    if [[ ${NUM_LIVE_DATANODES} != 0 ]]; then
      hadoop fs -put -f \
        ${tmp_dir}/share/lib/hive/stax-api-*.jar \
        ${tmp_dir}/share/lib/hive/commons-*.jar \
        /usr/lib/spark/python/lib/py*.zip \
        ${ADDITIONAL_JARS} /user/oozie/share/lib/spark
      hadoop fs -put -f /usr/lib/hive/lib/disruptor*.jar /user/oozie/share/lib/hive
      hadoop fs -put -f /usr/lib/hive/lib/hive-service-*.jar /user/oozie/share/lib/hive2
      # end - copy spark and hive dependencies

      # For oozie actions, remove log4j from oozie sharelib to allow log4j api classes loaded to avoid conflicts.
      # A log4j-1.2.* jar is moved aside (copied to <name>-backup, then
      # removed) only when its directory also holds a log4j-1.2-api* jar.
      local log4j1_jars log4j1_jar
      log4j1_jars=$(hadoop fs -find /user/oozie/share/lib/ -name "log4j-1.2.*")
      for log4j1_jar in ${log4j1_jars}; do
        if [[ $(hadoop fs -find "$(dirname "${log4j1_jar}")" -name "log4j-1.2-api*" | wc -l) -gt 0 ]]; then
          hadoop fs -cp -f "${log4j1_jar}" "${log4j1_jar}-backup"
          hadoop fs -rm "${log4j1_jar}"
        fi
      done
    fi

    # Clean up temporary files, whether or not anything was uploaded.
    rm -rf "${tmp_dir}"
  fi

  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    # Create the Oozie database. Since we are using MySQL,
    # only do this on the master node.
    retry_command "sudo -u oozie /usr/lib/oozie/bin/ooziedb.sh create -run"
  fi

  # Make the GCS connector available to the Oozie server itself.
  local gcs_connector_dir="/usr/local/share/google/dataproc/lib"
  if [[ ! -d $gcs_connector_dir ]]; then
    gcs_connector_dir="/usr/lib/hadoop/lib"
  fi
  cp "${gcs_connector_dir}/gcs-connector.jar" /usr/lib/oozie/lib/

  # Detect if current node configuration is HA and then set oozie servers.
  # additional_nodes is the number of comma-separated entries in the
  # dataproc-master-additional metadata attribute.
  local additional_nodes
  additional_nodes=$(/usr/share/google/get_metadata_value attributes/dataproc-master-additional |
    sed 's/,/\n/g' | wc -l)
  if [[ ${additional_nodes} -ge 2 ]]; then
    echo 'Starting configuration for HA'
    # List of servers is used for proper zookeeper configuration.
    # It is needed to replace original ports range with specific one
    local servers
    servers=$(grep 'server\.' /usr/lib/zookeeper/conf/zoo.cfg |
      sed 's/server.//g' |
      sed 's/:2888:3888//g' |
      cut -d'=' -f2- |
      sed 's/\n/,/g' |
      head -n 3 |
      sed 's/$/:2181,/g' |
      xargs -L3 |
      sed 's/.$//g')

    # NOTE(review): --clobber replaces the oozie.services.ext value written
    # earlier in this function (the HCat services) - confirm that is intended.
    # The value is a multi-line literal; its continuation lines are kept
    # unindented so the stored value stays exactly as before.
    /usr/local/bin/bdconfig set_property \
      --configuration_file "/etc/oozie/conf/oozie-site.xml" \
      --name 'oozie.services.ext' --value \
      'org.apache.oozie.service.ZKLocksService,
org.apache.oozie.service.ZKXLogStreamingService,
org.apache.oozie.service.ZKJobsConcurrencyService,
org.apache.oozie.service.ZKUUIDService' \
      --clobber

    /usr/local/bin/bdconfig set_property \
      --configuration_file "/etc/oozie/conf/oozie-site.xml" \
      --name 'oozie.zookeeper.connection.string' --value "${servers}" \
      --clobber
  fi

  # Workaround to avoid classnotfound issues due to old curator jar in Oozie classpath
  if [[ -f "/usr/lib/oozie/lib/curator-framework-2.5.0.jar" ]]; then
    # The -name tests must be grouped: without the parentheses -delete binds
    # only to the last test and the framework/recipes jars are left in place.
    find /usr/lib/oozie/lib \
      \( -name "curator-framework*.jar" -o \
         -name "curator-recipes*.jar" -o \
         -name "curator-client*.jar" \) \
      -delete
    # Copy the replacement jars only if the source directory provides them.
    local -a curator_jars=("${curator_src}"/curator*-"${curator_version}".jar)
    if [[ -e "${curator_jars[0]}" ]]; then
      cp "${curator_jars[@]}" /usr/lib/oozie/lib
    fi
  fi

  # Restart the zookeeper service
  if command -v systemctl > /dev/null && systemctl list-units | grep zookeeper-server > /dev/null ; then
    systemctl restart zookeeper-server
  else
    /usr/lib/zookeeper/bin/zkServer.sh restart
  fi

  # HDFS and YARN must be cycled; restart to clean things up.
  # Only units that exist and are enabled are restarted.
  local service
  for service in hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode hadoop-yarn-resourcemanager oozie; do
    if [[ $(systemctl list-unit-files | grep ${service}) != '' ]] &&
      [[ $(systemctl is-enabled ${service}) == 'enabled' ]]; then
      systemctl restart ${service}
    fi
  done

  # Leave safe mode - HDFS will enter safe mode because of Name Node restart
  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    case "${DATAPROC_IMAGE_VERSION}" in
      "1.3" | "1.4")
        hadoop dfsadmin -safemode leave
        ;;
      *)
        hdfs dfsadmin -safemode leave
        ;;
    esac
  fi
}
#######################################
# Install a fluentd configuration that tails the Oozie log files, then reload
# the fluentd service.
#
# The /etc/google-fluentd/config.d directory is not created if the cluster is
# created with the flag dataproc:dataproc.logging.stackdriver.enable=false, so
# the configuration is written only when the directory exists.
#
# Globals:
#   DATAPROC_IMAGE_VERSION (read) - image versions >= 2.2 reload the
#                                   google-fluentd-docker unit, older ones
#                                   google-fluentd
# Arguments:
#   $1 - optional fluentd config directory
#        (default: /etc/google-fluentd/config.d)
# Outputs:
#   Prints a notice when the configuration is skipped.
#######################################
function install_fluentd_configuration() {
  local -r config_dir="${1:-/etc/google-fluentd/config.d}"

  if [[ ! -d "${config_dir}" ]]; then
    echo "Skipped fluentd configuration for oozie."
    return 0
  fi

  # The delimiter is quoted so the shell copies the text verbatim: the
  # ${...} placeholders below belong to fluentd and must reach the file
  # unexpanded.
  cat <<'EOF' > "${config_dir}/oozie_fluentd.conf"
#################
#
# Oozie
#
# Fluentd config to tail the oozie log files.
# Currently severity is a seperate field from the Cloud Logging log_level.
<source>
@type tail
format none
path /var/log/oozie/*
pos_file /var/tmp/fluentd.dataproc.oozie.pos
refresh_interval 2s
read_from_head true
tag concat.raw.tail.*
</source>
<match concat.raw.tail.**>
@type detect_exceptions
remove_tag_prefix concat
multiline_flush_interval 0.1
</match>
<filter raw.tail.**>
@type parser
key_name message
<parse>
@type multi_format
<pattern>
format /^((?<time>[^ ]* [^ ]*) *(?<severity>[^ ]*) *(?<class>[^ ]*): (?<message>.*))/
time_format %Y-%m-%d %H:%M:%S,%L
</pattern>
<pattern>
format /^((?<time>\S+)\s+(?<severity>\S+)\s+\[(?<thread>[^\]]*)\]\s+(?<class>\S+):\s+(?<message>.*))/
time_format %Y-%m-%dT%H:%M:%S,%L
</pattern>
<pattern>
format none
</pattern>
</parse>
</filter>
<match raw.tail.**>
@type record_reformer
renew_record false
enable_ruby true
auto_typecast true
tag ${tag_parts[-3]}
# The following can be used when turning on jobid re-logging:
# dataproc.googleapis.com/process_id ${job}
filename ${tag_suffix[-2]}
</match>
EOF

  if [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 2.2" | bc -l) == 1 ]]; then
    systemctl reload-or-restart google-fluentd-docker
  else
    systemctl reload-or-restart google-fluentd
  fi
}
#######################################
# Entry point of the init action.
# Globals:
#   OS_NAME, DATAPROC_IMAGE_VERSION, ROLE (read)
#######################################
function main() {
  # On Debian with an image version of at most 2.1, prune backports entries
  # before anything calls 'apt-get update'.
  if [[ ${OS_NAME} == debian && $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then
    remove_old_backports
  fi

  # Oozie is set up on master nodes only; every other role has nothing to do.
  if [[ "${ROLE}" != 'Master' ]]; then
    return 0
  fi
  install_oozie
  install_fluentd_configuration
}
# Run the init action; main takes no arguments.
main