forked from NVIDIA/spark-rapids
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcommon_vars.sh
More file actions
119 lines (104 loc) · 4.73 KB
/
common_vars.sh
File metadata and controls
119 lines (104 loc) · 4.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/bin/bash
#
# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
## Turn 'foo=abc,bar=123,...' into a single 'export foo=abc bar=123 ...'
if [[ -n "${EXTRA_ENVS}" ]]; then
    # Left unquoted on purpose: after the commas become spaces, word splitting
    # hands every 'name=value' pair to export as its own argument.
    export ${EXTRA_ENVS//,/ }
fi
# Keep a caller-supplied SPARK_VER; otherwise read it from /databricks/spark/VERSION.
if [[ -z "${SPARK_VER}" ]]; then
    SPARK_VER=$(< /databricks/spark/VERSION)
fi
# Print DB runtime version details when the build-info file is present.
if [[ ! -f /databricks/BUILDINFO ]]; then
    echo "No /databricks/BUILDINFO file found"
else
    echo "DB runtime version details:"
    cat /databricks/BUILDINFO
fi
# Extract Databricks version from deployed configs.
# spark.databricks.clusterUsageTags.sparkVersion is set automatically on Databricks
# notebooks but not when running Spark manually.
#
# At the OS level the DBR version can be obtained via
# 1. DATABRICKS_RUNTIME_VERSION environment set by Databricks, e.g., 12.2
# 2. File at /databricks/DBR_VERSION created by Databricks, e.g., 12.2
# 3. The value for Spark conf in file /databricks/common/conf/deploy.conf created by Databricks,
# e.g. 12.2.x-gpu-ml-scala2.12
#
# For cases 1 and 2 append '.' for version matching in 3XYdb SparkShimServiceProvider
#
# Locations Databricks uses to record the runtime version on a node.
DBR_VERSION=/databricks/DBR_VERSION
DB_DEPLOY_CONF=/databricks/common/conf/deploy.conf
# Publish the runtime version as PYSP_TEST_spark_databricks_clusterUsageTags_sparkVersion,
# trying the environment first, then the version file, then deploy.conf.
if [[ -n "${DATABRICKS_RUNTIME_VERSION}" ]]; then
    # Environment value, e.g. 12.2; the trailing '.' is appended for version matching.
    export PYSP_TEST_spark_databricks_clusterUsageTags_sparkVersion="${DATABRICKS_RUNTIME_VERSION}."
elif [[ -f "${DBR_VERSION}" || -f "${DB_DEPLOY_CONF}" ]]; then
    if [[ -f "${DBR_VERSION}" && -r "${DBR_VERSION}" ]]; then
        # Version file, e.g. 12.2; append '.' exactly like the environment case.
        DB_VER="$(< "${DBR_VERSION}")."
    else
        # Only consult deploy.conf when the version file cannot be read, so a
        # node that ships just deploy.conf no longer prints a "No such file"
        # error first. -F matches the key literally (its dots are not regex
        # wildcards); sed keeps the text between the double quotes.
        DB_VER=$(grep -F -- 'spark.databricks.clusterUsageTags.sparkVersion' "${DB_DEPLOY_CONF}" \
            | sed -e 's/.*"\(.*\)".*/\1/')
    fi
    # if we did not error out on reads we should have at least four characters "x.y."
    if (( ${#DB_VER} < 4 )); then
        echo >&2 "Unable to determine Databricks version, unexpected length of: ${DB_VER}"
        exit 1
    fi
    export PYSP_TEST_spark_databricks_clusterUsageTags_sparkVersion="${DB_VER}"
else
    cat << EOF
This node does not define
- DATABRICKS_RUNTIME_VERSION environment,
- Files containing version information: $DBR_VERSION, $DB_DEPLOY_CONF
Proceeding assuming a non-Databricks environment.
EOF
fi
#######################################
# Derive the Databricks part of a shim name from a runtime version string.
# Arguments: $1 - version such as '14.3.' or '14.3.x-gpu-ml-scala2.12';
#                 may be empty.
# Outputs:   the leading major and minor numbers concatenated (e.g. '143').
#            A value that does not start with 'major.minor' is printed with
#            its dots removed; an empty value prints nothing.
#######################################
_db_ver_suffix() {
    local version=$1
    if [[ "${version}" =~ ^([0-9]+)\.([0-9]+) ]]; then
        # Keep only major+minor so the long deploy.conf form does not leak
        # '-gpu-ml-scala...' into the shim name.
        printf '%s' "${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
    else
        printf '%s' "${version//./}"
    fi
}

# TODO make this standard going forward
# Only these Spark versions carry a Databricks runtime suffix in the shim name.
if [[ "$SPARK_VER" == '3.5.0' || "$SPARK_VER" == '4.0.0' ]]; then
    DB_VER_SUFFIX=$(_db_ver_suffix "${PYSP_TEST_spark_databricks_clusterUsageTags_sparkVersion}")
else
    DB_VER_SUFFIX=""
fi
# A caller-provided SPARK_SHIM_VER wins; otherwise build 'spark<ver>db<suffix>'
# with the dots removed from SPARK_VER.
export SPARK_SHIM_VER=${SPARK_SHIM_VER:-"spark${SPARK_VER//.}db${DB_VER_SUFFIX}"}
# Set up SPARK_HOME if needed: a caller-provided value is kept, otherwise
# point it at the Spark directory under DB_HOME (Databricks layout).
[[ -n "${SPARK_HOME}" ]] || export SPARK_HOME="${DB_HOME}/spark"
# Set PYSPARK_PYTHON to keep the version of driver/workers python consistent.
# 'command -v' is a shell builtin, so the lookup does not depend on the
# non-standard external 'which' being installed.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(command -v python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB12.2
# The interpreter path is quoted so a path containing spaces stays one word.
PYTHON_VERSION=$("${PYSPARK_PYTHON}" -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of python site-packages, packages were installed here by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES=${PYTHON_SITE_PACKAGES:-"${HOME}/.local/lib/${PYTHON_VERSION}/site-packages"}
# Get the correct py4j file (case-insensitive match on py4j*.zip under Spark's python/lib).
PY4J_FILE=$(find "${SPARK_HOME}/python/lib" -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koala is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
export PYTHONPATH="${PYTHON_SITE_PACKAGES}:${SPARK_HOME}/python:${SPARK_HOME}/python/pyspark/:${PY4J_FILE}"
# Value handed to the Spark cache-serializer conf when the cache tests run.
export PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"
# Only the cache test (named with or without the .py extension) switches the serializer.
case "${TEST}" in
    cache_test | cache_test.py)
        export PYSP_TEST_spark_sql_cache_serializer="${PCBS_CONF}"
        ;;
esac
# Default the test type to nightly unless the caller already chose one.
: "${TEST_TYPE:=nightly}"
export TEST_TYPE
# Export a caller-provided jar path so child processes see it.
[[ -z "${LOCAL_JAR_PATH}" ]] || export LOCAL_JAR_PATH
#######################################
# Publish the comma-separated SPARK_CONF list in the two forms the test
# scripts consume.
# Globals:
#   SPARK_CONF   read as 'spark.foo=1,spark.bar=2,...' and re-exported as
#                '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
#   PYSP_TEST_*  one variable exported per entry, with the dots of the key
#                replaced by underscores (spark.foo=1 -> PYSP_TEST_spark_foo=1)
# Arguments: none
# Returns:   0; nothing is changed when SPARK_CONF is empty or unset.
#######################################
_export_spark_conf() {
    [[ -n "${SPARK_CONF}" ]] || return 0
    local entry key value
    # Left unquoted on purpose: commas become spaces so that word splitting
    # yields one 'key=value' entry per iteration.
    for entry in ${SPARK_CONF//,/ }; do
        key=${entry%%=*}   # text before the first '='
        value=${entry#*=}  # text after the first '=' (may itself contain '=')
        ## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
        # Quoted so the value is exported verbatim, without globbing.
        export "PYSP_TEST_${key//./_}=${value}"
    done
    # '//' replaces EVERY comma; a single '/' only converted the first one and
    # left the remaining confs glued together behind one --conf.
    export SPARK_CONF="--conf ${SPARK_CONF//,/ --conf }"
}
_export_spark_conf