
Commit c938b3d

Move GPU JAX runtime to CUDA 13

Parent: 592d930
6 files changed, 603 additions and 478 deletions

docs/tutorials/local-gpu.md

Lines changed: 12 additions & 28 deletions

@@ -8,48 +8,32 @@ Similar steps will let you run Marin on a cloud GPU environment under Iris (the
 
 Make sure you've followed the [installation guide](installation.md) to do the basic installation.
 
-In addition to the prerequisites from the basic installation, we have GPU-specific dependencies:
+In addition to the prerequisites from the basic installation, we have one GPU-specific system dependency:
 
-- CUDA Toolkit (version 12.1 or higher)
-- cuDNN (version 9.1 or higher)
+- NVIDIA driver 580 or newer
 
 We assume you are running Ubuntu 24.04.
 
-## CUDA installation
+## NVIDIA driver and runtime
 
-Install CUDA 12.9.0:
+Install an NVIDIA driver that supports CUDA 13. Verify that the driver is at least 580 and that
+`nvidia-smi` reports CUDA 13.x:
 
 ```bash
-wget https://developer.download.nvidia.com/compute/cuda/12.9.0/local_installers/cuda_12.9.0_575.51.03_linux.run
-sudo sh cuda_12.9.0_575.51.03_linux.run
+nvidia-smi
 ```
 
-Install cuDNN 9.9.0 (Instructions from [NVIDIA's cuDNN download page](https://developer.nvidia.com/cudnn-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=24.04&target_type=deb_local)):
-
-```bash
-wget https://developer.download.nvidia.com/compute/cudnn/9.10.0/local_installers/cudnn-local-repo-ubuntu2404-9.10.0_1.0-1_amd64.deb
-sudo dpkg -i cudnn-local-repo-ubuntu2404-9.10.0_1.0-1_amd64.deb
-sudo cp /var/cudnn-local-repo-ubuntu2404-9.10.0/cudnn-*-keyring.gpg /usr/share/keyrings/
-sudo apt-get update
-sudo apt-get -y install cudnn
-sudo apt-get -y install cudnn-cuda-12
-sudo apt-get -y install nvidia-cuda-toolkit
-```
-
-Verify your setup by checking the CUDA version:
-
-```bash
-nvcc --version
-```
-
-Marin uses [JAX](https://jax.readthedocs.io/en/latest/index.html) as a core library.
-Install Python dependencies for CUDA 12.x via uv:
+Marin uses [JAX](https://docs.jax.dev/en/latest/index.html) as a core library. The `gpu`
+extra installs the CUDA 13 JAX runtime, including CUDA, cuDNN, and NCCL Python wheels:
 
 ```bash
 uv sync --extra=gpu
 ```
 
-See [JAX's installation guide](https://jax.readthedocs.io/en/latest/installation.html) for more options.
+If you install a local CUDA toolkit for custom kernels, use CUDA 13 and keep older CUDA libraries
+out of `LD_LIBRARY_PATH` so they do not override the JAX wheel libraries.
+
+See [JAX's installation guide](https://docs.jax.dev/en/latest/installation.html) for more options.
 
 !!! tip
     If you are using a DGX Spark or similar machine with unified memory, you may need to dramatically reduce the memory that XLA preallocates for itself. You can do this by setting the `XLA_PYTHON_CLIENT_MEM_FRACTION` variable to something like 0.5:
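Aside: a minimal sketch of applying that tip before launching a job. The 0.5 value is the doc's own suggestion; the `python -c` probe is illustrative and not part of this commit:

```bash
# Cap XLA's GPU preallocation at half of memory, per the tip above,
# then confirm JAX still brings up its CUDA backend.
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.5
python -c "import jax; print(jax.devices())"
```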

lib/iris/OPS.md

Lines changed: 4 additions & 0 deletions

@@ -255,6 +255,10 @@ State dir: `gs://marin-us-central2/iris/<cluster>/state/` — contains `bundles/
 ## CoreWeave (GPU) Operations
 
 Use `lib/iris/examples/coreweave-*.yaml` for CoreWeave scale group configurations.
+Marin's `gpu` extra installs the JAX CUDA 13 wheel stack from PyPI. CoreWeave
+GPU nodes must expose NVIDIA driver 580 or newer; `nvidia-smi` should report
+CUDA 13.x. CPU, TPU, and vLLM jobs use separate extras and should not carry the
+CUDA 13 JAX runtime.
 
 ### Connecting
 
lib/levanter/pyproject.toml

Lines changed: 5 additions & 4 deletions

@@ -80,10 +80,11 @@ Homepage = "https://github.com/stanford-crfm/levanter"
 
 [project.optional-dependencies]
 gpu = [
-    "jax[cuda12]>=0.9.2",
-    # JAX 0.9.2 all-to-all fails on CW H100s with NCCL 2.27.x. Keep this floor
-    # until the JAX CUDA deps or the top-level lock exclude the bad NCCL line.
-    "nvidia-nccl-cu12>=2.28.3; sys_platform == 'linux'",
+    "jax[cuda13]>=0.9.2",
+    # B200 emits a cuBLAS warning with older CUDA 13 cuBLAS builds.
+    "nvidia-cublas>=13.2.0.9; sys_platform == 'linux'",
+    # Preserve the CoreWeave H100 all-to-all guard under CUDA 13.
+    "nvidia-nccl-cu13>=2.28.3; sys_platform == 'linux'",
 ]
 tpu = ["jax==0.9.2", "jaxlib==0.9.2", "libtpu==0.0.38"]
 torch_test = [
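To see what these floors actually resolve to, a sketch using `uv export`, the same mechanism the new test below relies on:

```bash
# List the resolved CUDA 13 wheels for the levanter GPU extra.
uv export --package marin-levanter --extra gpu --frozen --no-hashes \
  | grep -E '^(jax-cuda13|nvidia-(cublas|nccl-cu13))'
```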

lib/marin/pyproject.toml

Lines changed: 7 additions & 14 deletions

@@ -119,10 +119,6 @@ conflicts = [
         { extra = "vllm" },
         { extra = "cpu" },
     ],
-    [
-        { extra = "vllm" },
-        { extra = "cuda12" },
-    ],
     [
         # The vllm extra ships vllm-tpu only, so it must use CPU/torch_xla
         # torch rather than the cu128-pinned torch from the gpu extra.
@@ -135,12 +131,11 @@ conflicts = [
 [project.optional-dependencies]
 
 gpu = [
-    "jax[cuda12]==0.9.2",
-    # JAX 0.9.2 all-to-all fails on CW H100s with NCCL 2.27.x. This can be
-    # removed once the resolved GPU stack no longer admits NCCL <2.28.3.
-    "nvidia-nccl-cu12>=2.28.3; sys_platform == 'linux'",
-    # torch 2.10.0+cu128 pins nvidia-nccl-cu12==2.27.5, which reintroduces the
-    # bad all-to-all stack above. torch 2.11.0+cu128 resolves NCCL 2.28.9.
+    "jax[cuda13]==0.9.2",
+    # B200 emits a cuBLAS warning with older CUDA 13 cuBLAS builds.
+    "nvidia-cublas>=13.2.0.9; sys_platform == 'linux'",
+    # Preserve the CoreWeave H100 all-to-all guard under CUDA 13.
+    "nvidia-nccl-cu13>=2.28.3; sys_platform == 'linux'",
     "torch==2.11.0",
     "torchvision==0.26.0",
 ]
@@ -219,13 +214,11 @@ torchvision = [
     { index = "pytorch-cpu", extra = "cpu" },
     { index = "pytorch-cpu", extra = "tpu" },
     { index = "pytorch-cpu", extra = "vllm" },
-    # The GPU extra pins a plain torchvision version so non-Linux platforms can
-    # use PyPI wheels. Only Linux GPU installs should route to PyTorch's cu128
-    # index for the matching CUDA wheel.
+    # The GPU extra uses PyTorch cu128 wheels; JAX CUDA 13 packages come from PyPI.
     { index = "pytorch-cu128", extra = "gpu", marker = "sys_platform == 'linux'" },
 ]
 resiliparse = { index = "marin-resiliparse" }
-# Use CUDA PyTorch for --extra=gpu on Linux, CPU PyTorch for TPU/CPU/vLLM builds.
+# Use PyTorch CUDA 12.8 wheels for --extra=gpu on Linux, CPU PyTorch for TPU/CPU/vLLM builds.
 torch = [
     { index = "pytorch-cu128", extra = "gpu", marker = "sys_platform == 'linux'" },
     { index = "pytorch-cpu", extra = "cpu" },

tests/test_dependency_extras.py

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import re
+import subprocess
+from pathlib import Path
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+REQUIREMENT = re.compile(r"^([A-Za-z0-9_.-]+)==([^ ;]+)")
+CUDA_RUNTIME_PREFIXES = ("jax-cuda", "nvidia-")
+
+
+def export_packages(package: str, extra: str) -> dict[str, str]:
+    result = subprocess.run(
+        [
+            "uv",
+            "export",
+            "--package",
+            package,
+            "--extra",
+            extra,
+            "--no-dev",
+            "--frozen",
+            "--no-emit-project",
+            "--no-emit-workspace",
+            "--no-header",
+            "--no-annotate",
+            "--no-hashes",
+        ],
+        cwd=REPO_ROOT,
+        check=True,
+        capture_output=True,
+        text=True,
+        env={**os.environ, "UV_NO_PROGRESS": "1"},
+    )
+    packages = {}
+    for line in result.stdout.splitlines():
+        match = REQUIREMENT.match(line)
+        if match:
+            packages[match.group(1).lower().replace("_", "-")] = match.group(2)
+    return packages
+
+
+def cuda_runtime_packages(packages: dict[str, str]) -> list[str]:
+    return sorted(name for name in packages if name.startswith(CUDA_RUNTIME_PREFIXES))
+
+
+@pytest.mark.parametrize("package", ["marin", "marin-levanter"])
+def test_gpu_extra_exports_cuda13_jax_runtime(package: str):
+    """The GPU extra is the resolver boundary; this catches accidental reverts to JAX CUDA 12."""
+    packages = export_packages(package, "gpu")
+
+    assert "jax-cuda13-plugin" in packages
+    assert "jax-cuda13-pjrt" in packages
+    assert "jax-cuda12-plugin" not in packages
+    assert "jax-cuda12-pjrt" not in packages
+
+
+@pytest.mark.parametrize("extra", ["cpu", "tpu", "vllm"])
+def test_non_gpu_extras_do_not_export_cuda_runtime(extra: str):
+    """CPU/TPU/vLLM jobs should not inherit JAX CUDA or NVIDIA runtime wheels."""
+    packages = export_packages("marin", extra)
+
+    assert cuda_runtime_packages(packages) == []
+
+
+def test_levanter_tpu_extra_does_not_export_cuda_runtime():
+    """Levanter TPU jobs should not inherit JAX CUDA or NVIDIA runtime wheels."""
+    packages = export_packages("marin-levanter", "tpu")
+
+    assert cuda_runtime_packages(packages) == []
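A plausible local invocation of these guards (each test shells out to `uv export` against the committed lock file, so no GPU is needed):

```bash
uv run pytest tests/test_dependency_extras.py -v
```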
