Skip to content

Commit 010152d

Browse files
committed
Merge branch 'branch-25.12' into change_jni_try_catch
2 parents 3ecee0b + 2eecac2 commit 010152d

35 files changed

+897
-88
lines changed

.github/workflows/shell-check.yml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# A workflow to check shell script syntax
16+
name: shell check
17+
18+
on:
19+
pull_request:
20+
types: [opened, synchronize, reopened]
21+
22+
jobs:
23+
shell-check:
24+
runs-on: ubuntu-latest
25+
if: "!contains(github.event.pull_request.title, '[bot]')"
26+
steps:
27+
- name: Checkout code
28+
uses: NVIDIA/spark-rapids-common/checkout@main
29+
30+
- name: Run ShellCheck
31+
uses: NVIDIA/spark-rapids-common/shell-check@main
32+
with:
33+
excluded_codes:
34+
SC3010,
35+
SC2054,
36+
SC2124
37+
# code explanation:
38+
# SC3010: In POSIX sh, [[ ]] is undefined.
39+
# SC2054: Use spaces, not commas, to separate array elements.
40+
# SC2124: Assigning an array to a string! Assign as array, or use * instead of @ to concatenate.
41+

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[submodule "thirdparty/cudf"]
22
path = thirdparty/cudf
33
url = https://github.com/rapidsai/cudf.git
4-
branch = branch-25.10
4+
branch = branch-25.12

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ $ ./build/build-in-docker install ...
165165
```
166166

167167
Now cd to ~/repos/NVIDIA/spark-rapids and build with one of the options from
168-
[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-25.10/CONTRIBUTING.md#building-from-source).
168+
[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-25.12/CONTRIBUTING.md#building-from-source).
169169

170170
```bash
171171
$ ./build/buildall

build/apply-patches

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
#
4-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
4+
# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
55
#
66
# Licensed under the Apache License, Version 2.0 (the "License");
77
# you may not use this file except in compliance with the License.
@@ -51,7 +51,7 @@ fi
5151

5252
CHANGED_FILES=$(git status --porcelain --untracked-files=no)
5353

54-
if [ \( -s "$FULLY_PATCHED_FILE" \) -a \( -n "$CHANGED_FILES" \) ] ; then
54+
if [ -s "$FULLY_PATCHED_FILE" ] && [ -n "$CHANGED_FILES" ] ; then
5555
if git apply -R --check "$FULLY_PATCHED_FILE" ; then
5656
echo "Patches appear to have been applied already"
5757
exit 0
@@ -60,7 +60,7 @@ fi
6060

6161
if [ -n "$CHANGED_FILES" ] ; then
6262
echo "Error: CUDF repository has uncommitted changes. No patches will be applied. Please clean the repository so we can try and add the needed patches"
63-
echo "$CHANGED_FILE"
63+
echo "$CHANGED_FILES"
6464
exit 1
6565
fi
6666

build/run-in-docker

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ if [[ "$HOST_CUDA_PATH" != "" ]]; then
8989
RO_DST+=("/usr/local/cuda")
9090
fi
9191
for (( i=0; i<${#RO_SRC[@]}; i++)); do
92-
MNT_ARGS+=(--mount type=bind,src=${RO_SRC[$i]},dst=${RO_DST[$i]},ro)
92+
MNT_ARGS+=(--mount "type=bind,src=${RO_SRC[$i]},dst=${RO_DST[$i]},ro")
9393
done
9494

9595
RW_SRC=(
@@ -99,11 +99,11 @@ RW_SRC=(
9999
"$LOCAL_MAVEN_REPO"
100100
)
101101
for (( i=0; i<${#RW_SRC[@]}; i++)); do
102-
MNT_ARGS+=(--mount type=bind,src=${RW_SRC[$i]},dst=${RW_SRC[$i]})
102+
MNT_ARGS+=(--mount "type=bind,src=${RW_SRC[$i]},dst=${RW_SRC[$i]}")
103103
done
104104

105105
$DOCKER_CMD run $DOCKER_GPU_OPTS $DOCKER_RUN_EXTRA_ARGS -u $(id -u):$(id -g) --rm \
106-
${MNT_ARGS[@]} \
106+
"${MNT_ARGS[@]}" \
107107
--workdir "$WORKDIR" \
108108
-e CCACHE_DIR="$LOCAL_CCACHE_DIR" \
109109
-e CMAKE_C_COMPILER_LAUNCHER="ccache" \

build/unapply-patches

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
#
4-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
4+
# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
55
#
66
# Licensed under the Apache License, Version 2.0 (the "License");
77
# you may not use this file except in compliance with the License.
@@ -51,7 +51,7 @@ fi
5151

5252
CHANGED_FILES=$(git status --porcelain --untracked-files=no)
5353

54-
if [ \( -s "$FULLY_PATCHED_FILE" \) -a \( -n "$CHANGED_FILES" \) ] ; then
54+
if [ -s "$FULLY_PATCHED_FILE" ] && [ -n "$CHANGED_FILES" ] ; then
5555
if git apply --check -R "$FULLY_PATCHED_FILE"; then
5656
echo "Patches appear to have been applied, so going to remove them"
5757
git apply -R -v "$FULLY_PATCHED_FILE"
@@ -66,14 +66,14 @@ if [ \( -s "$FULLY_PATCHED_FILE" \) -a \( -n "$CHANGED_FILES" \) ] ; then
6666

6767
exit 0
6868
else
69-
echo "Files are changed, but in a way where the full path file does not apply to remove them $FULL_PATCHED_FILE"
69+
echo "Files are changed, but in a way where the full path file does not apply to remove them $FULLY_PATCHED_FILE"
7070
exit 1
7171
fi
7272
fi
7373

7474
if [ -n "$CHANGED_FILES" ] ; then
7575
echo "Error: CUDF repository has uncommitted changes, but does not appear to have been patched. Please clean it and try again."
76-
echo "$CHANGED_FILE"
76+
echo "$CHANGED_FILES"
7777
exit 1
7878
else
7979
echo "No changes in CUDF repository to remove"

ci/Jenkinsfile.premerge

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ import ipp.blossom.*
3131
def githubHelper // blossom github helper
3232
def TEMP_IMAGE_BUILD = true
3333
def IMAGE_PREMERGE_CU12 = "${common.ARTIFACTORY_NAME}/sw-spark-docker/plugin-jni:rockylinux8-cuda12.9.1-blossom"
34-
def IMAGE_PREMERGE_CU13 = "${common.ARTIFACTORY_NAME}/sw-spark-docker/plugin-jni:rockylinux8-cuda13.0.0-blossom"
34+
def IMAGE_PREMERGE_CU13 = "${common.ARTIFACTORY_NAME}/sw-spark-docker/plugin-jni:rockylinux8-cuda13.0.1-blossom"
3535
def cpuImage = pod.getCPUYAML(IMAGE_PREMERGE_CU12)
3636
def PREMERGE_DOCKERFILE = 'ci/Dockerfile'
3737
def PREMERGE_TAG_CU12
@@ -162,7 +162,7 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true"""
162162
"--network=host -f ${PREMERGE_DOCKERFILE} -t $IMAGE_PREMERGE_CU12 .")
163163
uploadDocker(IMAGE_PREMERGE_CU12)
164164
docker.build(IMAGE_PREMERGE_CU13,
165-
"--network=host -f ${PREMERGE_DOCKERFILE} -t $IMAGE_PREMERGE_CU13 --build-arg CUDA_VERSION=13.0.0 .")
165+
"--network=host -f ${PREMERGE_DOCKERFILE} -t $IMAGE_PREMERGE_CU13 --build-arg CUDA_VERSION=13.0.1 .")
166166
uploadDocker(IMAGE_PREMERGE_CU13)
167167
}
168168
}
@@ -214,7 +214,7 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true"""
214214
kubernetes {
215215
label "cu12-${BUILD_TAG}"
216216
cloud "${common.CLOUD_NAME}"
217-
yaml pod.getGPUYAMLwithVolume("${IMAGE_PREMERGE_CU12}", "${env.GPU_RESOURCE}", "${PVC}", "${PVC_MOUNT_PATH}", '10', '38Gi')
217+
yaml pod.getGPUYAMLwithVolume("${IMAGE_PREMERGE_CU12}", "${env.GPU_RESOURCE}", "${PVC}", "${PVC_MOUNT_PATH}", '10', '48Gi')
218218
customWorkspace "${CUSTOM_WORKSPACE}-cu12"
219219
}
220220
}
@@ -247,7 +247,7 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true"""
247247
kubernetes {
248248
label "cu13-${BUILD_TAG}"
249249
cloud "${common.CLOUD_NAME}"
250-
yaml pod.getGPUYAMLwithVolume("${IMAGE_PREMERGE_CU13}", "${env.GPU_RESOURCE}", "${PVC}", "${PVC_MOUNT_PATH}", '10', '38Gi', '580.65.06')
250+
yaml pod.getGPUYAMLwithVolume("${IMAGE_PREMERGE_CU13}", "${env.GPU_RESOURCE}", "${PVC}", "${PVC_MOUNT_PATH}", '10', '48Gi', '580.82.07')
251251
customWorkspace "${CUSTOM_WORKSPACE}-cu13"
252252
}
253253
}

ci/deploy.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22
#
3-
# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
3+
# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
66
# you may not use this file except in compliance with the License.
@@ -82,7 +82,7 @@ if [ "$SIGN_FILE" == true ]; then
8282
*)
8383
echo "Error unsupported sign type : $SIGN_TYPE !"
8484
echo "Please set variable SIGN_TOOL 'nvsec'or 'gpg'"
85-
exit -1
85+
exit 255
8686
;;
8787
esac
8888
else

docs/memory_management.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,11 @@ The selected thread transitions its state to `THREAD_SPLIT_THROW` and throws an
5555

5656
### Dedicated Threads vs. Pool Threads
5757

58-
From the view of the OOM state machine, each task has one or more "dedicated threads", along with zero or more "pool threads". When checking whether a task is blocked, OOM state machine is lenient on dedicated threads (only require any one of the dedicated threads to be blocked), but stringent on pool threads (all pool threads must be blocked). Being treated leniently is not always a good thing, it increases the chance of being mistakenly identified as a block task, thus causing unnecessary deadlock resolution. So we don't want a thread to be treated as a dedicated thread unless it is really necessary. There are two ways of avoiding a thread being treated as a dedicated thread:
58+
From the view of the OOM state machine, each task has one or more "dedicated threads", along with zero or more "pool threads" (background threads). When checking whether a task is blocked, the OOM state machine is lenient on dedicated threads (it only requires any one of the dedicated threads to be blocked), but stringent on pool threads (all pool threads must be blocked). Being treated leniently is not always a good thing: it increases the chance of the task being mistakenly identified as a blocked task, thus causing unnecessary deadlock resolution. So we don't want a thread to be treated as a dedicated thread unless it is really necessary. There are two ways of avoiding a thread being treated as a dedicated thread:
5959

6060
1. Avoid calling TaskContext.setTaskContext() in the current thread, this will prevent OOM state machine connecting the current thread to the task as a dedicated thread.
61-
2. If you have to set TaskContext, then it's also a good idea to proactively register thread itself as a pool thread instead of a dedicated thread. An example can be found [here](https://github.com/NVIDIA/spark-rapids/blob/c39f6a6004b0cf684ca526172e87b2bd4481eb3a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala#L2056) for registering threads.
61+
2. Proactively register thread itself as a pool thread instead of a dedicated thread. An example can be found [here](https://github.com/NVIDIA/spark-rapids/blob/c39f6a6004b0cf684ca526172e87b2bd4481eb3a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala#L2056) for registering threads. (Don't forget to unregister the thread when it is done.)
62+
63+
In most cases, we recommend the second approach, because typically the main task thread and the background thread form a producer-consumer relationship. The main task thread, acting as the consumer, will typically wait for the background thread to produce data. If we choose approach 1, then while the consumer is waiting its Java thread state will be `WAITING`, and even if the producer is actively working (so the whole task should NOT be considered blocked), the OOM state machine will mistakenly consider it a blocked task: because it cannot find any "pool thread" connected with this task, the condition "has at least one dedicated thread blocked on memory allocation, and all of the pool threads working on that task are also blocked" vacuously stands (there are no pool threads that could be non-blocked).
64+
65+
However, if we choose approach 2, then the OOM state machine will find at least one pool thread that is NOT blocked, so the condition "has at least one dedicated thread blocked on memory allocation, and all of the pool threads working on that task are also blocked" does NOT stand, and the task will NOT be considered blocked.

pom.xml

Lines changed: 19 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
<groupId>com.nvidia</groupId>
2323
<artifactId>spark-rapids-jni</artifactId>
24-
<version>25.10.0-SNAPSHOT</version>
24+
<version>25.12.0-SNAPSHOT</version>
2525
<packaging>jar</packaging>
2626
<name>RAPIDS Accelerator JNI for Apache Spark</name>
2727
<description>
@@ -95,6 +95,7 @@
9595
<jni.classifier>${cuda.version}</jni.classifier>
9696
<cudf.path>${project.basedir}/thirdparty/cudf</cudf.path>
9797
<cudf.pin.path>${project.basedir}/thirdparty/cudf-pins/</cudf.pin.path>
98+
<guava.version>14.0.1</guava.version>
9899
<hadoop.version>3.4.0</hadoop.version>
99100
<junit.version>5.8.1</junit.version>
100101
<libcudf.build.path>${project.build.directory}/libcudf/cmake-build/</libcudf.build.path>
@@ -122,26 +123,28 @@
122123

123124
<dependencies>
124125
<dependency>
125-
<groupId>org.apache.arrow</groupId>
126-
<artifactId>arrow-vector</artifactId>
127-
<version>${arrow.version}</version>
128-
<scope>test</scope>
126+
<groupId>org.apache.hadoop</groupId>
127+
<artifactId>hadoop-client-api</artifactId>
128+
<version>${hadoop.version}</version>
129+
<scope>provided</scope>
129130
</dependency>
130131
<dependency>
131132
<groupId>org.apache.hadoop</groupId>
132-
<artifactId>hadoop-common</artifactId>
133+
<artifactId>hadoop-client-runtime</artifactId>
133134
<version>${hadoop.version}</version>
134135
<scope>test</scope>
135-
<exclusions>
136-
<exclusion>
137-
<groupId>org.slf4j</groupId>
138-
<artifactId>slf4j-reload4j</artifactId>
139-
</exclusion>
140-
<exclusion>
141-
<groupId>org.slf4j</groupId>
142-
<artifactId>slf4j-log4j12</artifactId>
143-
</exclusion>
144-
</exclusions>
136+
</dependency>
137+
<dependency>
138+
<groupId>com.google.guava</groupId>
139+
<artifactId>guava</artifactId>
140+
<version>${guava.version}</version>
141+
<scope>test</scope>
142+
</dependency>
143+
<dependency>
144+
<groupId>org.apache.arrow</groupId>
145+
<artifactId>arrow-vector</artifactId>
146+
<version>${arrow.version}</version>
147+
<scope>test</scope>
145148
</dependency>
146149
<dependency>
147150
<groupId>org.apache.parquet</groupId>
@@ -190,22 +193,6 @@
190193
<version>${hilbert.version}</version>
191194
<scope>test</scope>
192195
</dependency>
193-
<dependency>
194-
<groupId>org.apache.hadoop</groupId>
195-
<artifactId>hadoop-mapreduce-client-core</artifactId>
196-
<version>${hadoop.version}</version>
197-
<scope>test</scope>
198-
<exclusions>
199-
<exclusion>
200-
<groupId>org.slf4j</groupId>
201-
<artifactId>*</artifactId>
202-
</exclusion>
203-
<exclusion>
204-
<groupId>ch.qos.reload4j</groupId>
205-
<artifactId>*</artifactId>
206-
</exclusion>
207-
</exclusions>
208-
</dependency>
209196
</dependencies>
210197

211198
<profiles>

0 commit comments

Comments
 (0)