Commit ff8a967

Use JAX 0.10 CUDA 13 for GPU installs
Switch Marin and Levanter GPU extras from JAX CUDA 12 to JAX 0.10 CUDA 13. JAX 0.9.2 reproduced an H100x8 CUDA 13 profiler crash; JAX 0.10 passed the repros and H100x8 canary. CPU, TPU, and vLLM stay on JAX 0.9.2 until tpu-inference can unpin JAX. Part of #5427
1 parent a92b2a6 commit ff8a967
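The version split in the message can be read as a small pin matrix. A minimal sketch of that matrix, with versions taken from the commit message; the `expected_jax_version` helper is illustrative, not an API in this repo:

```python
# JAX pin matrix this commit establishes, per the commit message.
# The helper name `expected_jax_version` is illustrative only.
PINS = {
    "gpu": ("jax[cuda13]", "0.10.0"),  # H100x8 CUDA 13 profiler crash fixed in 0.10
    "tpu": ("jax", "0.9.2"),           # held back until tpu-inference can unpin JAX
    "vllm": ("jax", "0.9.2"),
    "cpu": ("jax", "0.9.2"),
}

def expected_jax_version(extra: str) -> str:
    """Return the JAX version the commit pins for a given extra."""
    return PINS[extra][1]

assert expected_jax_version("gpu") == "0.10.0"
assert expected_jax_version("tpu") == "0.9.2"
```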

8 files changed: 1944 additions & 1184 deletions

docs/tutorials/local-gpu.md

Lines changed: 12 additions & 28 deletions
@@ -8,48 +8,32 @@ Similar steps will let you run Marin on a cloud GPU environment under Iris (the
 
 Make sure you've followed the [installation guide](installation.md) to do the basic installation.
 
-In addition to the prerequisites from the basic installation, we have GPU-specific dependencies:
+In addition to the prerequisites from the basic installation, we have one GPU-specific system dependency:
 
-- CUDA Toolkit (version 12.1 or higher)
-- cuDNN (version 9.1 or higher)
+- NVIDIA driver 580 or newer
 
 We assume you are running Ubuntu 24.04.
 
-## CUDA installation
+## NVIDIA driver and runtime
 
-Install CUDA 12.9.0:
+Install an NVIDIA driver that supports CUDA 13. Verify that the driver is at least 580 and that
+`nvidia-smi` reports CUDA 13.x:
 
 ```bash
-wget https://developer.download.nvidia.com/compute/cuda/12.9.0/local_installers/cuda_12.9.0_575.51.03_linux.run
-sudo sh cuda_12.9.0_575.51.03_linux.run
+nvidia-smi
 ```
 
-Install cuDNN 9.9.0 (Instructions from [NVIDIA's cuDNN download page](https://developer.nvidia.com/cudnn-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=24.04&target_type=deb_local)):
-
-```bash
-wget https://developer.download.nvidia.com/compute/cudnn/9.10.0/local_installers/cudnn-local-repo-ubuntu2404-9.10.0_1.0-1_amd64.deb
-sudo dpkg -i cudnn-local-repo-ubuntu2404-9.10.0_1.0-1_amd64.deb
-sudo cp /var/cudnn-local-repo-ubuntu2404-9.10.0/cudnn-*-keyring.gpg /usr/share/keyrings/
-sudo apt-get update
-sudo apt-get -y install cudnn
-sudo apt-get -y install cudnn-cuda-12
-sudo apt-get -y install nvidia-cuda-toolkit
-```
-
-Verify your setup by checking the CUDA version:
-
-```bash
-nvcc --version
-```
-
-Marin uses [JAX](https://jax.readthedocs.io/en/latest/index.html) as a core library.
-Install Python dependencies for CUDA 12.x via uv:
+Marin uses [JAX](https://docs.jax.dev/en/latest/index.html) as a core library. The `gpu`
+extra installs the CUDA 13 JAX runtime, including CUDA, cuDNN, and NCCL Python wheels:
 
 ```bash
 uv sync --extra=gpu
 ```
 
-See [JAX's installation guide](https://jax.readthedocs.io/en/latest/installation.html) for more options.
+If you install a local CUDA toolkit for custom kernels, use CUDA 13 and keep older CUDA libraries
+out of `LD_LIBRARY_PATH` so they do not override the JAX wheel libraries.
+
+See [JAX's installation guide](https://docs.jax.dev/en/latest/installation.html) for more options.
 
 !!! tip
     If you are using a DGX Spark or similar machine with unified memory, you may need to dramatically reduce the memory that XLA preallocates for itself. You can do this by setting the `XLA_PYTHON_CLIENT_MEM_FRACTION` variable, to something like 0.5:
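The driver floor in the rewritten docs can be checked programmatically. A minimal sketch of the "driver 580 or newer" test, assuming the first dotted component of the `nvidia-smi` driver version is the major number; the sample version strings are illustrative:

```python
def driver_supports_cuda13(version: str, minimum_major: int = 580) -> bool:
    """Return True if an nvidia-smi driver version string meets the floor."""
    return int(version.split(".")[0]) >= minimum_major

# Illustrative version strings, not taken from a real machine.
assert driver_supports_cuda13("580.65.06")
assert driver_supports_cuda13("590.12.01")
assert not driver_supports_cuda13("550.144.03")
```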

lib/iris/docs/coreweave.md

Lines changed: 5 additions & 0 deletions
@@ -210,6 +210,11 @@ Do not change the GH200 row to `GH200x1`: the RNO2A pool currently accepts
 Before the full GPU canary, run one tiny direct JAX job for each row. It should
 prove `nvidia-smi`, GPU-backed JAX, and a tiny matmul.
 
+Marin's `gpu` extra installs the JAX CUDA 13 wheel stack from PyPI. CoreWeave
+GPU nodes must expose NVIDIA driver 580 or newer; `nvidia-smi` should report
+CUDA 13.x. CPU, TPU, and vLLM jobs use separate extras and should not carry the
+CUDA 13 JAX runtime.
+
 ### KubernetesProvider Operations
 
 On CoreWeave, there are no persistent worker daemons. The controller dispatches
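The "tiny direct JAX job" the added docs describe could look like the following sketch: it asserts a visible device and runs a tiny matmul. This is an illustration, not a script from the repo, and it also runs on CPU when no GPU is present.

```python
import jax
import jax.numpy as jnp

# Prove JAX sees at least one device (GPU on a healthy CoreWeave node).
devices = jax.devices()
assert devices, "no JAX devices visible"

# Tiny matmul canary: every entry of x @ x is 8.0, and there are 64 entries.
x = jnp.ones((8, 8))
result = float((x @ x).sum())
assert result == 512.0
print(devices[0].platform, result)
```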

lib/levanter/pyproject.toml

Lines changed: 5 additions & 4 deletions
@@ -80,10 +80,11 @@ Homepage = "https://github.com/stanford-crfm/levanter"
 
 [project.optional-dependencies]
 gpu = [
-    "jax[cuda12]>=0.9.2",
-    # JAX 0.9.2 all-to-all fails on CW H100s with NCCL 2.27.x. Keep this floor
-    # until the JAX CUDA deps or the top-level lock exclude the bad NCCL line.
-    "nvidia-nccl-cu12>=2.28.3; sys_platform == 'linux'",
+    "jax[cuda13]==0.10.0",
+    # B200 emits a cuBLAS warning with older CUDA 13 cuBLAS builds.
+    "nvidia-cublas>=13.2.0.9; sys_platform == 'linux'",
+    # Preserve the CoreWeave H100 all-to-all guard under CUDA 13.
+    "nvidia-nccl-cu13>=2.28.3; sys_platform == 'linux'",
 ]
 tpu = ["jax==0.9.2", "jaxlib==0.9.2", "libtpu==0.0.38"]
 torch_test = [
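The `nvidia-nccl-cu13>=2.28.3` floor above carries forward the CoreWeave H100 all-to-all guard. A minimal sketch of the version comparison such a floor enforces; this is a pure illustration, not a uv or packaging API:

```python
def nccl_meets_floor(version: str, floor: tuple = (2, 28, 3)) -> bool:
    """True if a dotted NCCL version string is at least the pinned floor."""
    return tuple(int(part) for part in version.split(".")) >= floor

assert not nccl_meets_floor("2.27.5")  # the NCCL line with the bad all-to-all
assert nccl_meets_floor("2.28.3")
assert nccl_meets_floor("2.28.9")
```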

lib/levanter/src/levanter/kernels/pallas/autotune_utils.py

Lines changed: 8 additions & 1 deletion
@@ -9,7 +9,7 @@
 import jax
 from jax import core as jax_core
 from jax._src import mesh as mesh_lib
-from jax.sharding import NamedSharding
+from jax.sharding import AxisType, NamedSharding
 
 
 _AUTOTUNE_THREAD_POOL = ThreadPoolExecutor(max_workers=1, thread_name_prefix="pallas_autotune")
@@ -53,8 +53,15 @@ def hlo_sharding_of(value: jax.Array):
     return None
 
 
+def _named_sharding_uses_manual_axes(sharding: NamedSharding) -> bool:
+    return any(axis_type is AxisType.Manual for axis_type in sharding.mesh.axis_types)
+
+
 def value_uses_manual_sharding(value: jax.Array) -> bool:
     """Detect shard_map-local tracer values that carry manual sharding."""
+    sharding = sharding_of(value)
+    if isinstance(sharding, NamedSharding) and _named_sharding_uses_manual_axes(sharding):
+        return True
     hlo_sharding = hlo_sharding_of(value)
     return hlo_sharding is not None and hlo_sharding.is_manual()
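The new `_named_sharding_uses_manual_axes` helper reduces to an `any(...)` over the mesh's axis types. A self-contained sketch of that predicate, using an enum stand-in for `jax.sharding.AxisType` so it runs without JAX; the real check inspects `sharding.mesh.axis_types`:

```python
from enum import Enum

class AxisType(Enum):
    """Illustrative stand-in for jax.sharding.AxisType."""
    Auto = 0
    Explicit = 1
    Manual = 2

def uses_manual_axes(axis_types) -> bool:
    """Mirror of the commit's check: any mesh axis marked Manual."""
    return any(axis_type is AxisType.Manual for axis_type in axis_types)

assert uses_manual_axes((AxisType.Auto, AxisType.Manual))
assert not uses_manual_axes((AxisType.Auto, AxisType.Explicit))
```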

lib/levanter/tests/kernels/test_pallas_autotune_utils.py

Lines changed: 0 additions & 3 deletions
@@ -4,7 +4,6 @@
 import threading
 
 import jax
-from jax._src import pjit
 import jax.numpy as jnp
 import numpy as np
 import pytest
@@ -85,8 +84,6 @@ def test_shape_dtype_struct_for_benchmark_drops_manual_sharding_from_shard_map_t
     def _capture(local_x):
         seen_manual.append(autotune_utils.value_uses_manual_sharding(local_x))
         seen_shapes.append(local_x.shape)
-        with pytest.raises(AssertionError):
-            pjit.pjit_check_aval_sharding([local_x.aval.sharding], [local_x.aval], ["x"], "arg", False)
         seen_structs.append(autotune_utils.shape_dtype_struct_for_benchmark(local_x))
         return local_x

lib/marin/pyproject.toml

Lines changed: 10 additions & 15 deletions
@@ -26,7 +26,7 @@ dependencies = [
     "google-cloud-storage",
     "google-cloud-storage-transfer",
     "marin-haliax",
-    "jax==0.9.2",
+    "jax>=0.9.2,<0.11",
     "jaxopt>=0.8.3",
     "marin-levanter[serve]",
     "lxml[html_clean]",
@@ -119,10 +119,6 @@ conflicts = [
         { extra = "vllm" },
         { extra = "cpu" },
     ],
-    [
-        { extra = "vllm" },
-        { extra = "cuda12" },
-    ],
     [
         # The vllm extra ships vllm-tpu only, so it must use CPU/torch_xla
         # torch rather than the cu128-pinned torch from the gpu extra.
@@ -135,12 +131,11 @@ conflicts = [
 [project.optional-dependencies]
 
 gpu = [
-    "jax[cuda12]==0.9.2",
-    # JAX 0.9.2 all-to-all fails on CW H100s with NCCL 2.27.x. This can be
-    # removed once the resolved GPU stack no longer admits NCCL <2.28.3.
-    "nvidia-nccl-cu12>=2.28.3; sys_platform == 'linux'",
-    # torch 2.10.0+cu128 pins nvidia-nccl-cu12==2.27.5, which reintroduces the
-    # bad all-to-all stack above. torch 2.11.0+cu128 resolves NCCL 2.28.9.
+    "jax[cuda13]==0.10.0",
+    # B200 emits a cuBLAS warning with older CUDA 13 cuBLAS builds.
+    "nvidia-cublas>=13.2.0.9; sys_platform == 'linux'",
+    # Preserve the CoreWeave H100 all-to-all guard under CUDA 13.
+    "nvidia-nccl-cu13>=2.28.3; sys_platform == 'linux'",
     "torch==2.11.0",
     "torchvision==0.26.0",
 ]
@@ -188,6 +183,8 @@ vizier = [
 ]
 
 vllm = [
+    "jax==0.9.2",
+    "jaxlib==0.9.2",
     "vllm-tpu==0.18.0",
     "tpu-inference==0.18.0",
     "triton==3.6.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
@@ -219,13 +216,11 @@ torchvision = [
     { index = "pytorch-cpu", extra = "cpu" },
     { index = "pytorch-cpu", extra = "tpu" },
     { index = "pytorch-cpu", extra = "vllm" },
-    # The GPU extra pins a plain torchvision version so non-Linux platforms can
-    # use PyPI wheels. Only Linux GPU installs should route to PyTorch's cu128
-    # index for the matching CUDA wheel.
+    # The GPU extra uses PyTorch cu128 wheels; JAX CUDA 13 packages come from PyPI.
     { index = "pytorch-cu128", extra = "gpu", marker = "sys_platform == 'linux'" },
 ]
 resiliparse = { index = "marin-resiliparse" }
-# Use CUDA PyTorch for --extra=gpu on Linux, CPU PyTorch for TPU/CPU/vLLM builds.
+# Use PyTorch CUDA 12.8 wheels for --extra=gpu on Linux, CPU PyTorch for TPU/CPU/vLLM builds.
 torch = [
     { index = "pytorch-cu128", extra = "gpu", marker = "sys_platform == 'linux'" },
     { index = "pytorch-cpu", extra = "cpu" },
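The torch index routing above sends Linux `gpu` installs to the cu128 index and everything else to the CPU index. A minimal model of that routing rule; the function name is illustrative, not part of uv:

```python
def torch_index(extra: str, sys_platform: str) -> str:
    """Model the index selection in [tool.uv.sources]: GPU extra on Linux
    resolves from pytorch-cu128, all other builds from pytorch-cpu."""
    if extra == "gpu" and sys_platform == "linux":
        return "pytorch-cu128"
    return "pytorch-cpu"

assert torch_index("gpu", "linux") == "pytorch-cu128"
assert torch_index("gpu", "darwin") == "pytorch-cpu"
assert torch_index("tpu", "linux") == "pytorch-cpu"
```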

pyproject.toml

Lines changed: 26 additions & 0 deletions
@@ -37,6 +37,32 @@ override-dependencies = [
     "datasets>=3.1.0,<5.0.0",
     "equinox>=0.11.10", # Override vizier's pin for modern JAX compatibility
 ]
+conflicts = [
+    [
+        { package = "marin-levanter", extra = "gpu" },
+        { package = "marin", extra = "cpu" },
+    ],
+    [
+        { package = "marin-levanter", extra = "gpu" },
+        { package = "marin", extra = "tpu" },
+    ],
+    [
+        { package = "marin-levanter", extra = "gpu" },
+        { package = "marin", extra = "vllm" },
+    ],
+    [
+        { package = "marin", extra = "gpu" },
+        { package = "marin-levanter", extra = "tpu" },
+    ],
+    [
+        { package = "marin", extra = "gpu" },
+        { package = "marin-fray", group = "fray_tpu_test" },
+    ],
+    [
+        { package = "marin-levanter", extra = "gpu" },
+        { package = "marin-fray", group = "fray_tpu_test" },
+    ],
+]
 
 [tool.uv.workspace]
 members = [
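The conflict table added here tells uv that certain cross-package extra combinations may not be co-installed. A minimal sketch of what a declared pair means, modeling each conflict as a two-element set that must not be fully contained in the selected extras; the helper name is illustrative:

```python
# Each conflict is a pair of (package, extra-or-group) selections that must
# not both be active at once. Pairs copied from the conflicts table above.
CONFLICTS = [
    {("marin-levanter", "gpu"), ("marin", "cpu")},
    {("marin-levanter", "gpu"), ("marin", "tpu")},
    {("marin-levanter", "gpu"), ("marin", "vllm")},
    {("marin", "gpu"), ("marin-levanter", "tpu")},
    {("marin", "gpu"), ("marin-fray", "fray_tpu_test")},
    {("marin-levanter", "gpu"), ("marin-fray", "fray_tpu_test")},
]

def selection_ok(selected):
    """True if no declared conflict pair is a subset of the selection."""
    return not any(pair <= selected for pair in CONFLICTS)

assert selection_ok({("marin-levanter", "gpu"), ("marin", "gpu")})
assert not selection_ok({("marin-levanter", "gpu"), ("marin", "cpu")})
```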
