forked from NVIDIA/spark-rapids
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild.sh
More file actions
executable file
·279 lines (246 loc) · 11.9 KB
/
build.sh
File metadata and controls
executable file
·279 lines (246 loc) · 11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
#!/bin/bash
#
# Copyright (c) 2020-2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script installs dependencies required to build RAPIDS Accelerator for Apache Spark on DB.
# All the environments can be overwritten by shell variables:
# SPARKSRCTGZ: Archive file location of the plugin repository. Default is empty.
# BASE_SPARK_VERSION: Spark version [3.2.1, 3.3.0]. Default is pulled from current instance.
# BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS: The version of Spark used when we install the
# Databricks jars in .m2. Default is {BASE_SPARK_VERSION}.
# MVN_OPT: Options to be passed to the MVN commands. Note that "-DskipTests" is hardcoded in the
# build command.
# SKIP_DEP_INSTALL: Skips installation of dependencies when set to 1. Default is 0.
# Usage:
# - build for DB10.4/Spark 3.2.1:
# `BASE_SPARK_VERSION=3.2.1 ./jenkins/databricks/build.sh`
# - Build without dependency installation:
# `BASE_SPARK_VERSION=3.2.1 SKIP_DEP_INSTALL=1 ./jenkins/databricks/build.sh`
# To add support of new runtime:
# 1. Review `install_deps.py` to make sure that the prefix of the jar files is set
# correctly. If not, then add a new if-else block to set the variables as necessary.
# 2. The jar files and their artifacts are defined in `install_deps.py`.
# You may need to add another conditional block because some runtimes may require special
# handling.
# For example, "3.1.2" had different patterns for a few JARs (i.e., HIVE).
# 3. If you had to go beyond the above steps to support the new runtime, then update the
# instructions accordingly.
# Fail on the first error (-e) and trace every command (-x) for CI log debugging.
set -ex
# Map of software versions for each dependency.
# NOTE(review): sw_versions/dep_jars/artifacts are declared but never referenced
# later in this script (the jar/artifact definitions live in install_deps.py per
# the header comments) — possibly kept for scripts that source this file; verify
# before removing.
declare -A sw_versions
# Map of jar file locations of all dependencies
declare -A dep_jars
# Map of string arrays to hold the groupId and the artifactId for each JAR
declare -A artifacts
# Initializes the script and the variables based on the arguments passed to the script.
# Side effects:
#   - refreshes apt sources and installs rsync + OpenJDK 17
#   - bootstraps Maven 3.6.3 (DBFS cache first, archive.apache.org as fallback)
#   - optionally unpacks $SPARKSRCTGZ and cd's into it
#   - exports JAVA_HOME, WORKSPACE, SCALA_BINARY_VER (Spark 4.x), M2DIR
#   - sets BUILDVER, POM_FILE, MVN_CMD, SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS,
#     SPARK_PLUGIN_JAR_VERSION, SCALA_VERSION, CUDA_VERSION, RAPIDS_BUILT_JAR, etc.
initialize()
{
    # Print DB runtime version details
    if [[ -f /databricks/BUILDINFO ]]; then
        echo "DB runtime version details:"
        cat /databricks/BUILDINFO
    else
        echo "No /databricks/BUILDINFO file found"
    fi

    # Switch the Ubuntu package mirrors from http to https.
    # ubuntu22: classic one-line style in /etc/apt/sources.list
    sudo sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \
        -e 's|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g' \
        /etc/apt/sources.list
    # ubuntu24: deb822 *.sources files under /etc/apt/sources.list.d/
    sudo find /etc/apt/sources.list.d/ -name '*.sources' -exec sed -i \
        -e "s|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g" \
        -e "s|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g" {} +
    # force cache refresh
    sudo apt-get clean
    sudo rm -rf /var/lib/apt/lists/*
    sudo apt-get update
    # rsync is used for copying onto the databricks nodes; JDK 17 for the build.
    # (apt-get instead of apt: apt's CLI is not guaranteed stable for scripts.)
    sudo apt-get install -y rsync openjdk-17-jdk

    if [[ ! -d "$HOME/apache-maven-3.6.3" ]]; then
        # DBFS cache for Maven
        DBFS_CACHE_DIR=${DBFS_CACHE_DIR:-"/dbfs/cached_jars"}
        JAR_FILE_NAME=${JAR_FILE_NAME:-"apache-maven-3.6.3-bin.tar.gz"}
        MAVEN_CACHE_FILE=${MAVEN_CACHE_FILE:-"$DBFS_CACHE_DIR/$JAR_FILE_NAME"}
        MAVEN_URL=${MAVEN_URL:-"https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/$JAR_FILE_NAME"}
        # Create cache directory if it doesn't exist
        mkdir -p "$DBFS_CACHE_DIR"
        # Check if file exists in DBFS cache
        if [[ -f "$MAVEN_CACHE_FILE" ]]; then
            echo "Found Maven in DBFS cache, copying to /tmp..."
            cp "$MAVEN_CACHE_FILE" "/tmp/$JAR_FILE_NAME"
        else
            echo "Maven not found in DBFS cache, downloading from archive.apache.org..."
            if wget "$MAVEN_URL" -P /tmp; then
                echo "Download successful, caching to DBFS..."
                # Best effort: a failed cache write must not fail the build.
                cp "/tmp/$JAR_FILE_NAME" "$MAVEN_CACHE_FILE" || true
            else
                echo "Download failed"
                exit 1
            fi
        fi
        tar xf "/tmp/$JAR_FILE_NAME" -C "$HOME"
        rm -f "/tmp/$JAR_FILE_NAME"
        sudo ln -s "$HOME/apache-maven-3.6.3/bin/mvn" /usr/local/bin/mvn
    fi

    # Set JDK 17 as the default for nightly builds across both:
    # scala2.12 and scala2.13 (with maven.compiler.source as 1.8).
    # The glob resolves the versioned JVM directory installed above.
    export JAVA_HOME=$(echo /usr/lib/jvm/java-1.17.0-*)
    mvn -version

    # Archive file location of the plugin repository
    SPARKSRCTGZ=${SPARKSRCTGZ:-''}
    # Version of Apache Spark we are building against
    BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-$(< /databricks/spark/VERSION)}
    ## '-Pfoo=1,-Dbar=2,...' to '-Pfoo=1 -Dbar=2 ...'
    MVN_OPT=${MVN_OPT//','/' '}
    # e.g. 3.2.1 -> 321db (parameter expansion instead of echo | sed)
    BUILDVER=${BASE_SPARK_VERSION//./}db
    # the version of Spark used when we install the Databricks jars in .m2
    BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS:-$BASE_SPARK_VERSION}
    SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS}-databricks

    # Determine Scala version based on Spark version
    # Spark 4.0+ uses Scala 2.13, earlier versions use 2.12
    if [[ "$BASE_SPARK_VERSION" == 4.* ]]; then
        export SCALA_BINARY_VER=2.13
    fi

    DBR_VER=$(< /databricks/DBR_VERSION)
    case "$DBR_VER" in
        # Append the runtime version (e.g. 143, 173) to disambiguate runtimes
        # that share a base Spark version: Databricks 14.3 and 15.4 are both
        # based on Spark 3.5.0; 17.3 is based on Spark 4.0.0.
        14.3|17.3)
            DBR_VER=${DBR_VER//./}
            BUILDVER="$BUILDVER$DBR_VER"
            SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS="$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS-$DBR_VER"
            ;;
    esac

    # pull normal Spark artifacts and ignore errors then install databricks jars, then build again.
    # this should match the databricks init script.
    JARDIR=/databricks/jars
    if [[ -n "$SPARKSRCTGZ" ]]; then
        rm -rf spark-rapids
        mkdir spark-rapids
        echo "tar -zxf $SPARKSRCTGZ -C spark-rapids"
        tar -zxf "$SPARKSRCTGZ" -C spark-rapids
        cd spark-rapids
    fi
    # Now, we can set the WORKSPACE
    export WORKSPACE=$PWD

    # set the retry count for mvn commands; intentionally a plain string so
    # unquoted $MVN_CMD word-splits into the mvn binary plus its flag.
    MVN_CMD="mvn -Dmaven.wagon.http.retryHandler.count=3"
    # Determine which pom to use based on Scala version
    if [[ "$SCALA_BINARY_VER" == "2.13" ]]; then
        POM_FILE="scala2.13/pom.xml"
    else
        POM_FILE="pom.xml"
    fi
    # getting the versions of CUDA, SCALA and SPARK_PLUGIN from the pom
    SPARK_PLUGIN_JAR_VERSION=$($MVN_CMD help:evaluate -q -f $POM_FILE -pl dist -Dexpression=project.version -DforceStdout)
    SCALA_VERSION=$($MVN_CMD help:evaluate -q -f $POM_FILE -pl dist -Dexpression=scala.binary.version -DforceStdout)
    CUDA_VERSION=$($MVN_CMD help:evaluate -q -f $POM_FILE -pl dist -Dexpression=cuda.version -DforceStdout)
    RAPIDS_BUILT_JAR=rapids-4-spark_$SCALA_VERSION-$SPARK_PLUGIN_JAR_VERSION.jar

    # If set to 1, skips installing dependencies into mvn repo.
    SKIP_DEP_INSTALL=${SKIP_DEP_INSTALL:-'0'}
    # export 'M2DIR' so that shims can get the correct Spark dependency info
    export M2DIR=/home/ubuntu/.m2/repository
    # whether to build a two-shim jar with the lowest supported upstream Spark version
    WITH_DEFAULT_UPSTREAM_SHIM=${WITH_DEFAULT_UPSTREAM_SHIM:-1}

    # Print a banner of the build configurations.
    printf '+ %*s +\n' 100 '' | tr ' ' =
    echo "Initializing build for Databricks:"
    echo
    echo "tgz                                         : ${SPARKSRCTGZ}"
    echo "Base Spark version                          : ${BASE_SPARK_VERSION}"
    echo "maven options                               : ${MVN_OPT}"
    echo "BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS : ${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS}"
    echo "workspace                                   : ${WORKSPACE}"
    echo "Scala version                               : ${SCALA_VERSION}"
    echo "CUDA version                                : ${CUDA_VERSION}"
    echo "Rapids build jar                            : ${RAPIDS_BUILT_JAR}"
    echo "Build Version                               : ${BUILDVER}"
    echo "Skip Dependencies                           : ${SKIP_DEP_INSTALL}"
    echo "Include Default Spark Shim                  : ${WITH_DEFAULT_UPSTREAM_SHIM}"
    echo "Extra environments                          : ${EXTRA_ENVS}"
    printf '+ %*s +\n' 100 '' | tr ' ' =
}
# Install dependency jars to the local Maven repository.
# Generates a throwaway pom via install_deps.py, then runs the 'initialize'
# phase on it so maven-install-plugin executions put the Databricks jars in M2DIR.
install_dependencies()
{
    local depsPomXml
    # Split declaration from assignment so a mktemp failure is not masked
    # by 'local' always returning 0 (matters under set -e).
    depsPomXml="$(mktemp /tmp/install-databricks-deps-XXXXXX-pom.xml)"
    python jenkins/databricks/install_deps.py "${BASE_SPARK_VERSION}" "${SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS}" "${SCALA_VERSION}" "${M2DIR}" "${JARDIR}" "${depsPomXml}"
    # $MVN_CMD intentionally unquoted: it word-splits into mvn + its retry flag.
    $MVN_CMD -f "${depsPomXml}" initialize
    echo "Done with installation of Databricks dependencies, removing ${depsPomXml}"
    rm -f -- "${depsPomXml}"
}
##########################
# Main script starts here
##########################
## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...'
if [[ -n "$EXTRA_ENVS" ]]; then
    # Intentionally unquoted so each comma-separated name=value pair becomes
    # its own argument to export.
    export ${EXTRA_ENVS//','/' '}
fi

initialize

# Install required dependencies unless the caller opted out.
if [[ "$SKIP_DEP_INSTALL" == "1" ]]; then
    echo "!!!! SKIP_DEP_INSTALL is set to $SKIP_DEP_INSTALL. Skipping install-file for dependencies."
else
    # Fixes the original line where a '#' comment was glued to the closing
    # quote of this echo (so the comment text was printed to the log) and a
    # "dependendecies" typo in the message.
    echo "!!!! Installing dependencies. Set SKIP_DEP_INSTALL=1 to speed up reruns of build.sh"
    install_dependencies
fi
# Bloop mode: generate Bloop project config during the build. This requires an
# 11+ JDK; probe the Zulu JDK install locations from newest to oldest. The
# trailing "8" entry is a sentinel: reaching it means no 11+ JDK was found.
if [[ "$WITH_BLOOP" == "1" ]]; then
    MVN_OPT="-DbloopInstall $MVN_OPT"
    MVN_PHASES="clean install"
    for jdk_ver in 17 11 8; do
        if [[ $jdk_ver == 8 ]]; then
            echo "WARNING: could not find an 11+ JDK. Bloop Project might not be fully functional" >&2
            exit 1
        fi
        jdk_home="/usr/lib/jvm/zulu${jdk_ver}"
        # [[ -d ]] instead of 'ls >/dev/null': no extra process, no stderr
        # noise when the directory is absent.
        if [[ -d "$jdk_home" ]]; then
            export JAVA_HOME=$jdk_home
            break
        fi
    done
    # The two-shim dist build is not needed when only generating Bloop config.
    WITH_DEFAULT_UPSTREAM_SHIM=0
else
    MVN_PHASES="clean package"
fi
# Build the RAPIDS plugin by running package command for databricks
# NOTE: $MVN_CMD, $MVN_PHASES and $MVN_OPT are intentionally unquoted — each
# must word-split into multiple arguments (binary+flag, goals, extra options).
$MVN_CMD -B -f $POM_FILE -Ddatabricks -Dbuildver=$BUILDVER $MVN_PHASES -DskipTests $MVN_OPT
if [[ "$WITH_DEFAULT_UPSTREAM_SHIM" != "0" ]]; then
echo "Building the default Spark shim and creating a two-shim dist jar"
# Resolve the pom's default (lowest supported upstream) buildver, then rebuild
# only the dist module with both shims included in one jar.
UPSTREAM_BUILDVER=$($MVN_CMD help:evaluate -q -f $POM_FILE -pl dist -Dexpression=buildver -DforceStdout)
$MVN_CMD -B -f $POM_FILE -Dbuildver=$UPSTREAM_BUILDVER package -pl dist -am -DskipTests -Dmaven.scaladoc.skip $MVN_OPT \
-Dincluded_buildvers=$UPSTREAM_BUILDVER,$BUILDVER
fi
# Delete the unused object files to reduce the size of the Spark Rapids built tar.
# Determine the correct dist target directory based on which POM was used
if [[ "$SCALA_BINARY_VER" == "2.13" ]]; then
    DIST_TARGET="scala2.13/dist/target"
else
    DIST_TARGET="dist/target"
fi
rm -rf "$DIST_TARGET/jni-deps/"
# Keep only META-INF under parallel-world; everything else is build residue.
find "$DIST_TARGET/parallel-world/" -mindepth 1 -maxdepth 1 ! -name META-INF -exec rm -rf {} +

cd /home/ubuntu
tar -zcf spark-rapids-built.tgz spark-rapids

# Back up spark rapids built jars for the CI_PART2 job to run integration tests
TEST_MODE=${TEST_MODE:-'DEFAULT'}
# Fall back to the script's first positional argument for the backup location.
PLUGIN_BUILT_TGZ=${PLUGIN_BUILT_TGZ:-"$1"}
if [[ "$TEST_MODE" == "CI_PART1" && -n "$PLUGIN_BUILT_TGZ" ]]; then
    # Quoted dirname/cp so destination paths containing spaces work.
    mkdir -p "$(dirname "$PLUGIN_BUILT_TGZ")"
    cp spark-rapids-built.tgz "$PLUGIN_BUILT_TGZ"
fi