
Commit a379380

drop cuda11 support (#959)

fix #957
- Updated default cuda version to 12.0.1
- TODO: `notebooks/databricks/init-pip-cuda-11.8.sh`

Signed-off-by: Yanxuan Liu <[email protected]>

1 parent 096f5d0

14 files changed: +31 −31 lines changed

ci/Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@
 # limitations under the License.
 #

-ARG CUDA_VERSION=11.8.0
+ARG CUDA_VERSION=12.0.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

 # ubuntu22
@@ -48,5 +48,5 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86

 # install cuML
 ARG CUML_VER=25.06
-RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=11.8 numpy~=1.0 \
+RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=12.0 numpy~=1.0 \
     && conda clean --all -f -y
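If the CI image needs rebuilding against this change, the CUDA version is an ordinary build arg; a minimal sketch, where the image tag is illustrative rather than anything the repo defines:

```bash
# build with the new default (CUDA 12.0.1)
docker build -f ci/Dockerfile -t rapids-ml-ci:cuda12.0.1 .

# or pin another CUDA 12.x release; the value must match an existing
# nvidia/cuda:<version>-devel-ubuntu22.04 tag (note the conda
# cuda-version pin inside the Dockerfile stays at 12.0)
docker build -f ci/Dockerfile --build-arg CUDA_VERSION=12.2.0 \
    -t rapids-ml-ci:cuda12.2.0 .
```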

ci/Jenkinsfile.premerge

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@ import ipp.blossom.*

 def githubHelper // blossom github helper
 def TEMP_IMAGE_BUILD = true
-def IMAGE_PREMERGE = "${common.ARTIFACTORY_NAME}/sw-spark-docker/rapids:ml-ubuntu22-cuda11.8.0-py310"
+def IMAGE_PREMERGE = "${common.ARTIFACTORY_NAME}/sw-spark-docker/rapids:ml-ubuntu22-cuda12.0.1-py310"
 def cpuImage = pod.getCPUYAML("${common.ARTIFACTORY_NAME}/sw-spark-docker/spark:rapids-databricks") // tooling image
 def PREMERGE_DOCKERFILE = 'ci/Dockerfile'
 def PREMERGE_TAG

docker/Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@
 #
 ###

-ARG CUDA_VERSION=11.5.2
+ARG CUDA_VERSION=12.0.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04

 # ubuntu22

docker/Dockerfile.pip

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
 # limitations under the License.
 #

-ARG CUDA_VERSION=11.8.0
+ARG CUDA_VERSION=12.0.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

 ARG PYSPARK_VERSION=3.3.1
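The pip-based image follows the same pattern, exposing both the CUDA and PySpark versions as build args; a hedged sketch of a build command (the tag name is illustrative):

```bash
docker build -f docker/Dockerfile.pip \
    --build-arg CUDA_VERSION=12.0.1 \
    --build-arg PYSPARK_VERSION=3.3.1 \
    -t spark-rapids-ml:pip-cuda12.0.1 .
```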

docker/Dockerfile.python

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@
 # limitations under the License.
 #

-ARG CUDA_VERSION=11.8.0
+ARG CUDA_VERSION=12.0.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

 ARG CUML_VERSION=25.06
@@ -47,7 +47,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linu

 # install cuML

-RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=11.8 cuml=$CUML_VERSION numpy~=1.0 \
+RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=12.0 cuml=$CUML_VERSION numpy~=1.0 \
     && conda clean --all -f -y

 # install python dependencies

docs/site/compatibility.md

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ The following table shows the currently supported algorithms. The goal is to ex

 | Spark Rapids ML | CUDA  | Spark | Python |
 | :-------------- | :---- | :---- | :----- |
-| 1.0.0           | 11.4+ | 3.3+  | 3.10+  |
+| 1.0.0           | 12.0+ | 3.3+  | 3.10+  |


 ## Single vs Double precision inputs
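Since the compatibility floor moves from CUDA 11.4+ to 12.0+, it can be worth verifying a target node before installing; a minimal check, assuming the CUDA toolkit and driver are already present:

```bash
# toolkit version reported by the compiler; expect "release 12.x"
nvcc --version | grep release

# driver version as reported by the GPU driver
nvidia-smi --query-gpu=driver_version --format=csv,noheader
```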

notebooks/databricks/README.md

Lines changed: 3 additions & 3 deletions

@@ -11,16 +11,16 @@ If you already have a Databricks account, you can run the example notebooks on a
 ```bash
 export WS_SAVE_DIR="/path/to/directory/in/workspace"
 databricks workspace mkdirs ${WS_SAVE_DIR} --profile ${PROFILE}
-databricks workspace import --format AUTO --file init-pip-cuda-11.8.sh ${WS_SAVE_DIR}/init-pip-cuda-11.8.sh --profile ${PROFILE}
+databricks workspace import --format AUTO --file init-pip-cuda-12.0.sh ${WS_SAVE_DIR}/init-pip-cuda-12.0.sh --profile ${PROFILE}
 ```
 **Note**: the init script does the following on each Spark node:
-- updates the CUDA runtime to 11.8 (required for Spark Rapids ML dependencies).
+- updates the CUDA runtime to 12.0 (required for Spark Rapids ML dependencies).
 - downloads and installs the [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) plugin for accelerating data loading and Spark SQL.
 - installs various `cuXX` dependencies via pip.
 - if the cluster environment variable `SPARK_RAPIDS_ML_NO_IMPORT_ENABLED=1` is define (see below), the init script also modifies a Databricks notebook kernel startup script to enable no-import change UX for the cluster. See [no-import-change](../README.md#no-import-change).
 - Create a cluster using **Databricks 13.3 LTS ML GPU Runtime** using at least two single-gpu workers and add the following configurations to the **Advanced options**.
   - **Init Scripts**
-    - add the workspace path to the uploaded init script `${WS_SAVE_DIR}/init-pip-cuda-11.8.sh` as set above (but substitute variables manually in the form).
+    - add the workspace path to the uploaded init script `${WS_SAVE_DIR}/init-pip-cuda-12.0.sh` as set above (but substitute variables manually in the form).
   - **Spark**
     - **Spark config**
 ```

notebooks/databricks/init-pip-cuda-11.8.sh renamed to notebooks/databricks/init-pip-cuda-12.0.sh

Lines changed: 8 additions & 8 deletions

@@ -21,24 +21,24 @@ set -ex
 RAPIDS_VERSION=25.6.0
 SPARK_RAPIDS_VERSION=25.04.0

-curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
+curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda12.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar

-# install cudatoolkit 11.8 via runfile approach
-wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
-sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit
+# install cudatoolkit 12.0 via runfile approach
+wget https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run
+sh cuda_12.0.1_525.85.12_linux.run --silent --toolkit

 # reset symlink and update library loading paths
 rm /usr/local/cuda
-ln -s /usr/local/cuda-11.8 /usr/local/cuda
+ln -s /usr/local/cuda-12.0 /usr/local/cuda

 # upgrade pip
 /databricks/python/bin/pip install --upgrade pip

 # install cudf, cuml and their rapids dependencies
 # using ~= pulls in latest micro version patches
-/databricks/python/bin/pip install cudf-cu11~=${RAPIDS_VERSION} \
-  cuml-cu11~=${RAPIDS_VERSION} \
-  cuvs-cu11~=${RAPIDS_VERSION} \
+/databricks/python/bin/pip install cudf-cu12~=${RAPIDS_VERSION} \
+  cuml-cu12~=${RAPIDS_VERSION} \
+  cuvs-cu12~=${RAPIDS_VERSION} \
   --extra-index-url=https://pypi.nvidia.com

 # install spark-rapids-ml
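Because the init script now installs the cu12 wheel family, a quick post-install check on a node can confirm everything resolved against the same CUDA major version; an illustrative addition, not part of the commit:

```bash
# the symlink reset above should now point at the 12.0 toolkit
readlink /usr/local/cuda   # expect: /usr/local/cuda-12.0

# confirm the cu12 packages landed in the Databricks python env
/databricks/python/bin/pip show cudf-cu12 cuml-cu12 cuvs-cu12 \
    | grep -E '^(Name|Version)'
```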

python/README.md

Lines changed: 2 additions & 2 deletions

@@ -18,11 +18,11 @@ This PySpark-compatible API leverages the RAPIDS cuML python API to provide GPU-

 For simplicity, the following instructions just use Spark local mode, assuming a server with at least one GPU.

-First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 11.8:
+First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 12.0:
 ```bash
 conda create -n rapids-25.06 \
     -c rapidsai -c conda-forge -c nvidia \
-    cuml=25.06 cuvs=25.06 python=3.10 cuda-version=11.8 numpy~=1.0
+    cuml=25.06 cuvs=25.06 python=3.10 cuda-version=12.0 numpy~=1.0
 ```

 **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary.
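After creating the environment, a short smoke test verifies that cuML imports cleanly against the CUDA 12 toolkit before bringing Spark into the picture; a minimal sketch:

```bash
conda activate rapids-25.06

# fails fast if the cuML build and the installed CUDA version disagree
python -c "import cuml; print('cuML', cuml.__version__)"
```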

python/benchmark/databricks/gpu_cluster_spec.sh

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -54,7 +54,7 @@ cat <<EOF
     "init_scripts": [
       {
         "workspace": {
-          "destination": "${INIT_SCRIPT_DIR}/init-pip-cuda-11.8.sh"
+          "destination": "${INIT_SCRIPT_DIR}/init-pip-cuda-12.0.sh"
         }
       }
     ],
