ray-project
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.vale/styles/Google/Acronyms.yml‎
Lines changed: 1 addition & 0 deletions b/‎.vale/styles/Google/Acronyms.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ci/docker/llm.build.Dockerfile‎
Lines changed: 34 additions & 2 deletions b/‎ci/docker/llm.build.Dockerfile‎
Lines changed: 34 additions & 2 deletions
diff --git a/‎ci/docker/llm.build.wanda.yaml‎
Lines changed: 1 addition & 0 deletions b/‎ci/docker/llm.build.wanda.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ci/raydepsets/configs/rayllm.depsets.yaml‎
Lines changed: 8 additions & 0 deletions b/‎ci/raydepsets/configs/rayllm.depsets.yaml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎doc/source/data/working-with-llms.rst‎
Lines changed: 9 additions & 10 deletions b/‎doc/source/data/working-with-llms.rst‎
Lines changed: 9 additions & 10 deletions
diff --git a/‎doc/source/serve/llm/troubleshooting.md‎
Lines changed: 8 additions & 9 deletions b/‎doc/source/serve/llm/troubleshooting.md‎
Lines changed: 8 additions & 9 deletions
diff --git a/‎docker/ray-llm/Dockerfile‎
Lines changed: 38 additions & 5 deletions b/‎docker/ray-llm/Dockerfile‎
Lines changed: 38 additions & 5 deletions
diff --git a/‎docker/ray-llm/cuda.wanda.yaml‎
Lines changed: 1 addition & 0 deletions b/‎docker/ray-llm/cuda.wanda.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/deplocks/llm/rayllm_py312_cpu.lock‎
Lines changed: 39 additions & 32 deletions b/‎python/deplocks/llm/rayllm_py312_cpu.lock‎
Lines changed: 39 additions & 32 deletions
@@ -10,6 +10,7 @@ exclude: |
     release/release_logs/|
     rllib/offline/tests/data|
     thirdparty/patches/|
+    python/requirements/llm/patches/|
     src/ray/thirdparty/|
     doc/external/|
     doc/source/
 
@@ -46,6 +46,7 @@ exceptions:
   - MPS
   - NET
   - NFS
+  - NIXL
   - NOTE
   - NVDA
   - OSS
 
@@ -67,7 +67,39 @@ SKIP_PYTHON_PACKAGES=1 ./ci/env/install-dependencies.sh
 PYTHON_CODE="$(python -c "import sys; v=sys.version_info; print(f'py{v.major}{v.minor}')")"
 pip install --no-deps -r python/deplocks/llm/rayllm_test_${PYTHON_CODE}_${RAY_CUDA_CODE}.lock
 
+# Temporarily apply the fixes from https://github.com/vllm-project/vllm/pull/39873
+# until the pinned vLLM release includes them.
+VLLM_IMPORT_UTILS_PATCH="$(pwd)/python/requirements/llm/patches/vllm-trial-import-patch"
+VLLM_SITE_PACKAGES="$(python - <<'PY'
+import site
+import sysconfig
+from pathlib import Path
+
+candidate_dirs = [
+    Path(sysconfig.get_paths()["purelib"]),
+    Path(sysconfig.get_paths()["platlib"]),
+    *(Path(path) for path in site.getsitepackages()),
+]
+
+for base_dir in dict.fromkeys(candidate_dirs):
+    import_utils = base_dir / "vllm" / "utils" / "import_utils.py"
+    if import_utils.exists():
+        print(base_dir)
+        break
+else:
+    raise SystemExit("vLLM import_utils.py not found")
+PY
+)"
+(
+    cd "${VLLM_SITE_PACKAGES}"
+    git apply "${VLLM_IMPORT_UTILS_PATCH}"
+)
+
 EOF
 
-# Use the revamped ray executor backend in vLLM
-ENV VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1
+
+# vLLM 0.21.0 selects the FlashInfer top-k/top-p sampler during engine initialization
+# instead of the previous PyTorch-native/Triton sampling path. The FlashInfer sampler
+# adds a large one-time engine initialization cost. To avoid performance
+# surprises, we disable the FlashInfer sampler by default.
+ENV VLLM_USE_FLASHINFER_SAMPLER=0
@@ -5,6 +5,7 @@ srcs:
   - ci/env/install-dependencies.sh
   - ci/env/install-llvm-binaries.sh
   - ci/suppress_output
+  - python/requirements/llm/patches/vllm-trial-import-patch
   - python/deplocks/llm/rayllm_test_py312_cpu.lock
   - python/deplocks/llm/rayllm_test_py312_cu130.lock
 tags:
 
@@ -13,6 +13,14 @@ build_arg_sets:
   append_flags:
     - --python-version=${PYTHON_VERSION_STR}
     - --unsafe-package ray
+    # Omit the nixl-cu12 binary wheel from the compiled lockfiles. nixl-cu12
+    # and nixl-cu13 1.x wheels both install a top-level nixl_ep/ package with
+    # an identically named nixl_ep_cpp.so but different libcudart
+    # requirements; if both wheels are present the cu12 binary wins the file
+    # race and breaks vLLM's eager `import nixl_ep` on the cu130 image. The
+    # nixl meta-package (pure Python) is still required so that examples like
+    # dp_pd_example can `import nixl` to gate optional features.
+    - --unsafe-package nixl-cu12
     - --python-platform=x86_64-manylinux_2_31
     - --index https://download.pytorch.org/whl/${CUDA_CODE}
   build_arg_sets:
 
@@ -609,21 +609,20 @@ Then reference the remote path in your config:
     :end-before: __s3_config_example_end__
 
 
-C/C++ runtime dependencies incompatibility
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+vLLM NIXL EP dependency incompatibility
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. admonition:: Known issue
 
-   Ray 2.55 installs vLLM 0.18.0. Depending on the conda environment, you may encounter
-   incompatibilities with native runtime libraries (for example, ``libstdc++``, ``CXXABI``, ``ICU``).
+   Users who install Ray and vLLM directly may encounter a NIXL EP incompatibility error such as the following:
 
-   In such cases, override just the ``libstdc++`` library from your conda environment with ``LD_LIBRARY_PATH``:
+   .. code-block:: text
 
-   .. code-block:: shell
+      ImportError: libcudart.so.12: cannot open shared object file: No such file or directory
+
+   Remove the incompatible package or ensure the installed ``nixl_ep`` package is compatible with the CUDA runtime
+   and vLLM build in your environment.
 
-      mkdir -p "${CONDA_PREFIX}/lib-overrides"
-      ln -sf "${CONDA_PREFIX}/lib/libstdc++.so.6" "${CONDA_PREFIX}/lib-overrides/libstdc++.so.6"
-      export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib-overrides${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
 
 **Usage data collection**: Ray collects anonymous usage data to improve Ray Data LLM. To opt out, see :ref:`Ray usage stats <ref-usage-stats>`.
 
@@ -638,4 +637,4 @@ If you encounter issues not covered in this guide:
 - `Ray Discourse Forum <https://discuss.ray.io>`_ - Ask questions and share knowledge
 - `Ray LLM Office Hours <https://docs.google.com/document/d/1n3-Jw_4su8yilo9zdi5OciAduoz6H_VmdL8i9sL4f-E/edit?tab=t.e700ayqsx3v3>`_ - Learn about new features, ask questions, and get guidance from the team
 
-  - `Past Office Hours Recordings <https://youtube.com/playlist?list=PLzTswPQNepXl2IYF8DcV35FdCoVbeL4_6&si=ik81bljIlasYAHKN>`_ - View recordings from previous sessions
+  - `Past Office Hours Recordings <https://youtube.com/playlist?list=PLzTswPQNepXl2IYF8DcV35FdCoVbeL4_6&si=ik81bljIlasYAHKN>`_ - View recordings from previous sessions
@@ -77,18 +77,18 @@ app = build_openai_app({"llm_configs": [llm_config]})
 serve.run(app, blocking=True)
 ```
 
-### C/C++ runtime dependencies incompatibility
+### vLLM NIXL EP dependency incompatibility
 
 :::{admonition} Known issue
-Ray 2.55 installs vLLM 0.18.0. Depending on the conda environment, you may encounter incompatibilities with native runtime libraries (for example, `libstdc++`, `CXXABI`, `ICU`).
+Users who install Ray and vLLM directly may encounter a NIXL EP incompatibility error such as the following:
 
-In such cases, override just the ``libstdc++`` library from your conda environment with `LD_LIBRARY_PATH`:
-
-```shell
-mkdir -p "${CONDA_PREFIX}/lib-overrides"
-ln -sf "${CONDA_PREFIX}/lib/libstdc++.so.6" "${CONDA_PREFIX}/lib-overrides/libstdc++.so.6"
-export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib-overrides${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+```text
+ImportError: libcudart.so.12: cannot open shared object file: No such file or directory
 ```
+
+Remove the incompatible package or ensure the installed ``nixl_ep`` package is compatible with the CUDA runtime
+and vLLM build in your environment.
+
 :::
 
 ## Get help
@@ -105,4 +105,3 @@ If you encounter issues not covered in this guide:
 
 - {doc}`Quickstart examples <quick-start>`
 - {doc}`Examples <examples>`
-
@@ -4,10 +4,11 @@ ARG BASE_IMAGE
 FROM "$BASE_IMAGE"
 
 COPY python/deplocks/llm/rayllm_*.lock ./
+COPY python/requirements/llm/patches/vllm-trial-import-patch ./
 
 # vLLM version tag to use for EP kernel and DeepGEMM install scripts
 # Keep in sync with vllm version in python/requirements/llm/llm-requirements.txt
-ARG VLLM_SCRIPTS_REF="v0.20.0"
+ARG VLLM_SCRIPTS_REF="v0.21.0"
 
 RUN <<EOF
 #!/bin/bash
@@ -35,8 +36,33 @@ uv pip install --system --no-cache-dir --no-deps \
     --no-verify-hashes \
     -r "rayllm_${PYTHON_CODE}_${CUDA_CODE}.lock"
 
-# Export installed packages
-$HOME/anaconda3/bin/pip freeze > /home/ray/pip-freeze.txt
+# Temporarily apply the fixes from https://github.com/vllm-project/vllm/pull/39873
+# until the pinned vLLM release includes them.
+VLLM_IMPORT_UTILS_PATCH="$(pwd)/vllm-trial-import-patch"
+VLLM_SITE_PACKAGES="$(python - <<'PY'
+import site
+import sysconfig
+from pathlib import Path
+
+candidate_dirs = [
+    Path(sysconfig.get_paths()["purelib"]),
+    Path(sysconfig.get_paths()["platlib"]),
+    *(Path(path) for path in site.getsitepackages()),
+]
+
+for base_dir in dict.fromkeys(candidate_dirs):
+    import_utils = base_dir / "vllm" / "utils" / "import_utils.py"
+    if import_utils.exists():
+        print(base_dir)
+        break
+else:
+    raise SystemExit("vLLM import_utils.py not found")
+PY
+)"
+(
+    cd "${VLLM_SITE_PACKAGES}"
+    git apply "${VLLM_IMPORT_UTILS_PATCH}"
+)
 
 sudo apt-get update -y && sudo apt-get install -y curl kmod pkg-config librdmacm-dev cmake
 
@@ -57,10 +83,17 @@ curl -fsSL "${VLLM_RAW}/tools/ep_kernels/install_python_libraries.sh" | \
 # Install DeepGEMM
 curl -fsSL "${VLLM_RAW}/tools/install_deepgemm.sh" | bash
 
+# Export installed packages
+$HOME/anaconda3/bin/pip freeze > /home/ray/pip-freeze.txt
+
 sudo rm -rf /var/lib/apt/lists/*
 sudo apt-get clean
 
 EOF
 
-# Use the revamped ray executor backend in vLLM
-ENV VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1
+
+# vLLM 0.21.0 selects the FlashInfer top-k/top-p sampler during engine initialization
+# instead of the previous PyTorch-native/Triton sampling path. The FlashInfer sampler
+# adds a large one-time engine initialization cost. To avoid performance
+# surprises, we disable the FlashInfer sampler by default.
+ENV VLLM_USE_FLASHINFER_SAMPLER=0
@@ -3,6 +3,7 @@ froms: ["cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base"]
 dockerfile: docker/ray-llm/Dockerfile
 srcs:
   - python/requirements.txt
+  - python/requirements/llm/patches/vllm-trial-import-patch
   - python/deplocks/llm/rayllm_py312_cu130.lock
 build_args:
   - BASE_IMAGE=cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base
 
@@ -240,6 +240,7 @@ apache-tvm-ffi==0.1.9 \
     #   flashinfer-python
     #   quack-kernels
     #   tilelang
+    #   tokenspeed-mla
     #   vllm
     #   xgrammar
 astor==0.8.1 \
@@ -2547,37 +2548,22 @@ ninja==1.13.0 \
     #   -r python/requirements/llm/llm-requirements.txt
     #   flashinfer-python
     #   vllm
-nixl==0.10.1 \
-    --hash=sha256:616465673dae5180d296525a03237af4cd5f2c00c3228d185bc06dbe621509b7
+nixl==1.1.0 \
+    --hash=sha256:f46f65768770fa508eb52921c41b5dc52b754478b0ebb606fff6d80f41375d8b
     # via
     #   -c python/deplocks/llm/rayllm_test_py312_cpu.lock
     #   -r python/requirements/llm/llm-requirements.txt
-nixl-cu12==0.10.1 \
-    --hash=sha256:0bb1b3532f95c2f376e21008e91e8ec5791304a29af19e75d29fd1bcc754c9bc \
-    --hash=sha256:15376c1527c68d77fff5c6bb7cf7466a16dae0ab3bb32de152a602ba9edaaa9d \
-    --hash=sha256:26e59f9841985cf5b547202865036f84ae6dc23184789446fe5833e7499e21a9 \
-    --hash=sha256:277cde28bc45f706df689ed399327d0cce5432382606a5fc1d19fc470fcc57b4 \
-    --hash=sha256:3dde565c9d6e1d5af139a4dca240e902d5dbb32ea622acb31cdca3fb25cb859f \
-    --hash=sha256:48d3d9cc882edaa0a323d0ddfed39e0864b873ef1fa56e774c5a793629bcf083 \
-    --hash=sha256:685a0b8c5cdaa9cdbd826ea54cf46b4b3e46b016ee73a919cd2cf489402c56fd \
-    --hash=sha256:7641bc2bd3aeeefcf2ea3a3fd9f940f54a62d985ce2426dde6b3860d0edce13e \
-    --hash=sha256:b4712c6e0f18f57fee34cd970faac01480f0caf12da33d4a40ef4e9096a4caf7 \
-    --hash=sha256:ba46837abee8e06c8d86bd9b2cd7dab8c3d1e8407e04d52e6db3a9b137478c4b
-    # via
-    #   -c python/deplocks/llm/rayllm_test_py312_cpu.lock
-    #   -r python/requirements/llm/llm-requirements.txt
-    #   nixl
-nixl-cu13==0.10.1 \
-    --hash=sha256:0cbd4ffc25398f565a378e6da09b60d8d2625f1eda96bfe20d6d9c5f3fdba2c0 \
-    --hash=sha256:129f41f6855cf13837b55516319512ee1561e0a8cbbc2e4a9be3d839631ebf36 \
-    --hash=sha256:234752e979465e98aae5866e32777a4d98892c1d9fd2f59f25fef69e1e26716e \
-    --hash=sha256:322e4702606ad498a493d99a065af46c0c16ce84a4bed6495f85efab75670f0a \
-    --hash=sha256:66ad915a090da0b8928d9fea2aaac98c3468bb5e08a1a293ef211615bd49e460 \
-    --hash=sha256:67c913c0345f8703f1b3c96dd5f63c914bc6c173e3f283f62c46c07b5fcc5618 \
-    --hash=sha256:909d00ffc1929ef45cd3cfa0cd3585999274c90a6bcf0799000677cf83a8f0e2 \
-    --hash=sha256:9346f26d4b97088ee23921d567b7836eceb57473638455eac73b2f2b7388cbfc \
-    --hash=sha256:efa8f95ac57b9cf71fd5a0dcaa51ecef8d40510ca5ab3347e046a972905edbf1 \
-    --hash=sha256:f32bfd6f649ef1968e4f6d37d7c3cae61a58fe1f57a987d2fad324f18d5dc6e5
+nixl-cu13==1.1.0 \
+    --hash=sha256:1991d7899603907099f3e3ac3bf59f950de194bfe8d92c01ef9f06ce1639efa4 \
+    --hash=sha256:1c4e8142eff7cabe6107b3b65bc7a09da27ed585efb7972e45b1faabe74726c7 \
+    --hash=sha256:3f623b77fd59199afd71edadebb79ab394ec5e035873efe8a9bc8b4716b34e73 \
+    --hash=sha256:4e6031798b0a123d1821db698b1f9b3a1534c821af860ee0ef23601638c50d8f \
+    --hash=sha256:52b1e33ed9613df277d957cf1282cda14fbdf7b73006d8f45904cc68619e7af9 \
+    --hash=sha256:60cc00b12871d8c7d78c2385ad9380070424d5b07d3fe01680f222d6c4f1f428 \
+    --hash=sha256:6549dcb4f405f70903534a0770970ab95ed9185a8b16522db1ab4e2d0cc60b37 \
+    --hash=sha256:67149f7d2e3d471ca91499e5437d7b718e4e3e7a27f3b5b917f94b8992a4ed5a \
+    --hash=sha256:90c27cfdae0932f8ecb96ce29474249dd25d7f7712b9f43821cdb57699888fcb \
+    --hash=sha256:e8edf4b0d6a7549d8555fe1a99193aebd522b7737c405c3f8760f432d82e11df
     # via
     #   -c python/deplocks/llm/rayllm_test_py312_cpu.lock
     #   -r python/requirements/llm/llm-requirements.txt
@@ -2721,6 +2707,7 @@ nvidia-cutlass-dsl==4.4.2 \
     #   -c python/deplocks/llm/rayllm_test_py312_cpu.lock
     #   flashinfer-python
     #   quack-kernels
+    #   tokenspeed-mla
     #   vllm
 nvidia-cutlass-dsl-libs-base==4.4.2 \
     --hash=sha256:06acb3acff3dcf4bf6630476efac7de94de30b988ded4fa00b647bbcec4224ff \
@@ -4997,6 +4984,22 @@ tokenizers==0.22.2 \
     #   -c python/deplocks/llm/rayllm_test_py312_cpu.lock
     #   transformers
     #   vllm
+tokenspeed-mla==0.1.2 \
+    --hash=sha256:592590f36d85e624ecdc5e357ff35e29e761e6d879900dce8b67a6785c8ce75c \
+    --hash=sha256:c9466a351fe039792e56cf49f3e79744c1dc28c7af10306a02e62b8e92fa5985
+    # via
+    #   -c python/deplocks/llm/rayllm_test_py312_cpu.lock
+    #   vllm
+tokenspeed-triton==3.7.10.post20260505 \
+    --hash=sha256:060f657c78b5cd0c5645f01eb0f73b72cf385589235e3b96ca05f9b3d33a644f \
+    --hash=sha256:06bad3e25ccaba22bb43eb8499f01008f9aaa0bfb3fbfb0cef1b37d2c006c6f0 \
+    --hash=sha256:15e867fbc3dc7f5d1d2ec80b6b783c0e58d6d5c470cbfa99e87a035ec6af6212 \
+    --hash=sha256:19618c7db01a9bd33885f7acbf8945adb2f5534668aa97629b56d481753cbcad \
+    --hash=sha256:7a679e079f98023cf326f299c8150ebc8ef6f1d2cf744d5dc435bc0d9a6f8a5b \
+    --hash=sha256:82c222755095db261e32e3964e009573f3360806088fa493be65404276866344
+    # via
+    #   -c python/deplocks/llm/rayllm_test_py312_cpu.lock
+    #   tokenspeed-mla
 torch==2.11.0+cpu \
     --hash=sha256:1abeaa46fa7532ed35ed79146f4de5d7a9d4b30462c98052ea4ddfe781ea3eca \
     --hash=sha256:2db3ae5404e32cb42b5fcbd94f13607761eaec0cf1687fde95095289d1e26cfb \
@@ -5034,6 +5037,7 @@ torch==2.11.0+cpu \
     #   nixl-cu13
     #   quack-kernels
     #   tilelang
+    #   tokenspeed-mla
     #   torch-c-dlpack-ext
     #   torchvision
     #   vllm
@@ -5310,10 +5314,12 @@ virtualenv==21.2.4 \
     # via
     #   -c python/deplocks/llm/rayllm_test_py312_cpu.lock
     #   -r python/requirements.txt
-vllm==0.20.0 \
-    --hash=sha256:24d28892e210200f6e1bd13f699c42a74cd2bb7364c11248e2348f677c7f6dfb \
-    --hash=sha256:29a135ca0d70650f057f15c7c0b560d24659524c771f70fbddc24597c861c118 \
-    --hash=sha256:a6d50152936ee292455af3ffbe359f7a284ac43bf3b68caccf29f368e196cc72
+vllm==0.21.0 \
+    --hash=sha256:05ff89c3e926b88b77d7878e317a659ffba678afc21c1d48952037aa5457f058 \
+    --hash=sha256:b241b085742cf04a68c82c089d12afe4d9ee729e0c7f81b2b2b9961d36105ee5 \
+    --hash=sha256:d6e63955b595bd2aa364e90f85c0a2e99573e701146db58394da569ddc6f4eea \
+    --hash=sha256:dc62135a50dc4b412b4f79549208e782f1665e49e8c13c2d29d2c3d94ff8ac97 \
+    --hash=sha256:f4a75b1391f44c67dc1ca268f5ffed9f6b7fdbc657c93db64e6892c5d1bc320b
     # via
     #   -c python/deplocks/llm/rayllm_test_py312_cpu.lock
     #   -r python/requirements/llm/llm-requirements.txt
@@ -5692,3 +5698,4 @@ zipp==3.23.1 \
 
 # The following packages were excluded from the output:
 # setuptools
+# nixl-cu12
-Original file line number
+Diff line change
   - MPS
   - NET
   - NFS
 +  - NIXL
   - NOTE
   - NVDA
   - OSS