ROCm
diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
Lines changed: 1 addition & 0 deletions
diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml
Lines changed: 37 additions & 7 deletions
diff --git a/.buildkite/hardware_tests/intel.yaml b/.buildkite/hardware_tests/intel.yaml
Lines changed: 0 additions & 7 deletions
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
Lines changed: 1 addition & 0 deletions
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
Lines changed: 60 additions & 0 deletions
diff --git a/.buildkite/image_build/image_build_arm64.sh b/.buildkite/image_build/image_build_arm64.sh
Lines changed: 37 additions & 0 deletions
diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
Lines changed: 1 addition & 1 deletion
diff --git a/.buildkite/image_build/image_build_xpu.sh b/.buildkite/image_build/image_build_xpu.sh
Lines changed: 2 additions & 2 deletions
diff --git a/.buildkite/intel_jobs/lora_intel.yaml b/.buildkite/intel_jobs/lora_intel.yaml
Lines changed: 14 additions & 9 deletions
diff --git a/.buildkite/intel_jobs/misc_intel.yaml b/.buildkite/intel_jobs/misc_intel.yaml
Lines changed: 1 addition & 1 deletion
@@ -8,6 +8,7 @@ run_all_patterns:
   - "CMakeLists.txt"
   - "requirements/common.txt"
   - "requirements/cuda.txt"
+  - "requirements/kv_connectors.txt"
   - "requirements/build/cuda.txt"
   - "requirements/test/cuda.txt"
   - "setup.py"
 
@@ -12,15 +12,19 @@ steps:
   - vllm/_custom_ops.py
   - tests/kernels/attention/test_cpu_attn.py
   - tests/kernels/moe/test_cpu_fused_moe.py
+  - tests/kernels/moe/test_cpu_quant_fused_moe.py
   - tests/kernels/test_onednn.py
   - tests/kernels/test_awq_int4_to_int8.py
+  - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py
   commands:
     - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
       pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
       pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
+      pytest -x -v -s tests/kernels/moe/test_cpu_quant_fused_moe.py
       pytest -x -v -s tests/kernels/test_onednn.py
-      pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py"
+      pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py
+      pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py"
 
 - label: CPU-Compatibility Tests
   depends_on: []
@@ -50,30 +54,45 @@ steps:
       pytest -x -v -s tests/models/language/generation -m cpu_model
       pytest -x -v -s tests/models/language/pooling -m cpu_model"
 
+- label: CPU-ModelRunnerV2 Tests  # runs test_granite.py with VLLM_USE_V2_MODEL_RUNNER=1
+  depends_on: []
+  device: intel_cpu
+  no_plugin: true
+  soft_fail: true  # NOTE(review): presumably keeps a failure here from failing the build - confirm
+  source_file_dependencies:  # NOTE(review): the test file run below is not listed here - confirm intended
+  - vllm/v1/worker/cpu/
+  - vllm/v1/worker/gpu/
+  commands:  # installs triton-cpu at commit 270e696d, then runs the test
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
+      uv pip install git+https://github.com/triton-lang/triton-cpu.git@270e696d
+      VLLM_USE_V2_MODEL_RUNNER=1 pytest -x -v -s tests/models/language/generation/test_granite.py -m cpu_model"
+
 - label: CPU-Quantization Model Tests
   depends_on: []
   device: intel_cpu
   no_plugin: true
   source_file_dependencies:
   - csrc/cpu/
   - vllm/model_executor/layers/quantization/cpu_wna16.py
-  - vllm/model_executor/layers/quantization/gptq_marlin.py
+  - vllm/model_executor/layers/quantization/auto_gptq.py
   - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
   - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
   - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
+  - vllm/model_executor/layers/fused_moe/experts/cpu_moe.py
   - tests/quantization/test_compressed_tensors.py
   - tests/quantization/test_cpu_wna16.py
   commands:
     - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
       pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
       pytest -x -v -s tests/quantization/test_cpu_wna16.py"
       
-- label: CPU-Distributed Tests
+- label: CPU-Distributed Tests (PP+TP)
   depends_on: []
   device: intel_cpu
   no_plugin: true
-  source_file_dependencies:
+  source_file_dependencies: &cpu_distributed_deps
   - csrc/cpu/shm.cpp
   - vllm/v1/worker/cpu_worker.py
   - vllm/v1/worker/gpu_worker.py
@@ -82,10 +101,21 @@ steps:
   - vllm/platforms/cpu.py
   - vllm/distributed/parallel_state.py
   - vllm/distributed/device_communicators/cpu_communicator.py
+  - .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh tp_pp"
+
+- label: CPU-Distributed Tests (DP+TP)  # distributed smoke test invoked with the dp_tp argument
+  depends_on: []
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies: *cpu_distributed_deps  # alias: same path list as the &cpu_distributed_deps anchor on the PP+TP step
   commands:
     - |
       bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
-      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh dp_tp"
 
 - label: CPU-Multi-Modal Model Tests %N
   depends_on: []
 
@@ -8,10 +8,3 @@ steps:
     commands: 
     - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
 
-  - label: "Intel GPU Test"
-    depends_on: []
-    soft_fail: true
-    device: intel_gpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -192,6 +192,7 @@ export BUILDKITE_COMMIT
 export PARENT_COMMIT
 export IMAGE_TAG
 export IMAGE_TAG_LATEST
+export COMMIT="${COMMIT:-${BUILDKITE_COMMIT}}"
 export CACHE_FROM
 export CACHE_FROM_BASE_BRANCH
 export CACHE_FROM_MAIN
 
@@ -6,6 +6,48 @@ steps:
     timeout_in_minutes: 600
     commands:
     - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
+    # Non-root smoke 1: the default (root) image must still be importable
+    # under a non-root UID via `--user 2000:0`. Validates the `vllm` passwd
+    # entry + group-0-writable /home/vllm + uv path cleanup from #31959.
+    # Uses `import vllm` rather than `vllm serve --help` because the latter
+    # instantiates `VllmConfig` which requires a GPU attached to the
+    # container.
+    - docker run --rm --user 2000:0 --entrypoint python3 "$IMAGE_TAG" -c "import vllm; print(vllm.__version__)"
+    # Non-root smoke 2: assert the non-root enabling invariants are baked into
+    # the image. Runs as UID 2000:0 via a shell so we can verify filesystem perms
+    # + passwd/group file state + wrapper presence without triggering vLLM's
+    # GPU-requiring config-init path. The opt-in `vllm-openai-nonroot` target
+    # adds only `USER vllm`, `WORKDIR /home/vllm`, and an `ENTRYPOINT` override
+    # on top of these invariants; its build correctness is reviewed at the
+    # Dockerfile level. Wrapper logic is covered separately by the pre-commit
+    # hook `test-nonroot-entrypoint` (see .pre-commit-config.yaml). The touch/rm
+    # probes are separate commands: `sh -e` does not abort on `a && b` if a fails.
+    - |
+      docker run --rm --user 2000:0 --entrypoint /bin/sh "$IMAGE_TAG" -ec '
+        if ! getent passwd 2000 | grep -q ^vllm:; then
+          echo FAIL: UID 2000 != vllm
+          exit 1
+        fi
+        if ! id -gn 2>/dev/null | grep -qx root; then
+          echo FAIL: GID 0 not root group
+          exit 1
+        fi
+        touch /home/vllm/.smoke /opt/uv/cache/.smoke
+        rm /home/vllm/.smoke /opt/uv/cache/.smoke
+        if ! test -x /usr/local/bin/vllm-nonroot-entrypoint.sh; then
+          echo FAIL: wrapper missing
+          exit 1
+        fi
+        if ! test -w /etc/passwd; then
+          echo FAIL: /etc/passwd not group-writable
+          exit 1
+        fi
+        if ! test -w /etc/group; then
+          echo FAIL: /etc/group not group-writable
+          exit 1
+        fi
+        echo non-root invariants OK
+      '
     retry:
       automatic:
         - exit_status: -1  # Agent was lost
@@ -56,3 +98,21 @@ steps:
           limit: 2
         - exit_status: -10  # Agent was lost
           limit: 2
+
+  - label: ":docker: Build arm64 image"  # runs image_build_arm64.sh for the current commit
+    key: arm64-image-build  # NOTE(review): presumably referenced by depends_on in other steps - confirm
+    depends_on: []
+    source_file_dependencies:  # NOTE(review): presumably the changed paths that trigger this step - confirm
+      - ".buildkite/image_build/image_build.yaml"
+      - ".buildkite/image_build/image_build_arm64.sh"
+      - "docker/Dockerfile"
+    commands:
+    - .buildkite/image_build/image_build_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"  # quoted so the value stays a string
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Build the arm64 CI image and push it as <registry>/<repo>:<commit>-arm64.
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-arm64"
+
+# authenticate with AWS ECR (best effort: `|| true` keeps a failed login from aborting under set -e)
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
+
+# skip build if image already exists (decided by exit status; inspect output is discarded)
+if docker manifest inspect "$IMAGE" &> /dev/null; then
+  echo "Image found"
+  exit 0
+fi
+echo "Image not found, proceeding with build..."
+
+# build (Grace/GH200 is the arm64 GPU target; sm_90), then push
+docker build --file docker/Dockerfile \
+  --platform linux/arm64 \
+  --build-arg max_jobs=16 \
+  --build-arg nvcc_threads=4 \
+  --build-arg torch_cuda_arch_list="9.0" \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
+  --tag "$IMAGE" \
+  --target test \
+  --progress plain .
+
+docker push "$IMAGE"
@@ -11,7 +11,7 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
 
 # skip build if image already exists
 if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
 
@@ -11,8 +11,8 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
-aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
 
 # skip build if image already exists
 if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then
 
@@ -18,17 +18,18 @@ steps:
     - >-
       bash .buildkite/scripts/hardware_ci/run-intel-test.sh
       'cd tests &&
+      export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
       pytest -v -s lora/test_layers.py &&
       pytest -v -s lora/test_lora_checkpoints.py &&
-      (pytest -v -s lora/test_lora_functions.py --deselect="tests/lora/test_lora_functions.py::test_lora_functions_sync" --deselect="tests/lora/test_lora_functions.py::test_lora_functions_async" || true) &&
+      pytest -v -s lora/test_lora_functions.py &&
       pytest -v -s lora/test_lora_huggingface.py &&
       pytest -v -s lora/test_lora_manager.py &&
       pytest -v -s lora/test_lora_utils.py &&
       pytest -v -s lora/test_peft_helper.py &&
       pytest -v -s lora/test_resolver.py &&
       pytest -v -s lora/test_utils.py &&
-      (pytest -v -s lora/test_add_lora.py --deselect="tests/lora/test_add_lora.py::test_add_lora" || true) &&
-      (pytest -v -s lora/test_worker.py --deselect="tests/lora/test_worker.py::test_worker_apply_lora" || true)'
+      pytest -v -s lora/test_add_lora.py  &&
+      pytest -v -s lora/test_worker.py'
 
 - label: LoRA Fused/MoE Kernels
   timeout_in_minutes: 45
@@ -46,8 +47,9 @@ steps:
     - >-
       bash .buildkite/scripts/hardware_ci/run-intel-test.sh
       'cd tests &&
+      export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
       pytest -v -s lora/test_fused_moe_lora_kernel.py && 
-      pytest -v -s lora/test_moe_lora_align_sum.py'
+      pytest -v -s lora/test_moe_lora_align_sum.py --deselect="tests/lora/test_moe_lora_align_sum.py::test_moe_lora_align_block_size_mixed_base_and_lora[1]"'
 
 - label: LoRA Punica Kernels
   timeout_in_minutes: 45
@@ -65,8 +67,9 @@ steps:
     - >-
       bash .buildkite/scripts/hardware_ci/run-intel-test.sh
       'cd tests &&
+      export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
       set -o pipefail &&
-      pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-2-2049-64-32-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype1-2-64000-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype1-1-102656-32-4-4]"'
+      pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-3-43264-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype1-1-2049-64-128-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]"'
 
 - label: LoRA Punica FP8/XPU Ops
   timeout_in_minutes: 45
@@ -84,6 +87,7 @@ steps:
     - >-
       bash .buildkite/scripts/hardware_ci/run-intel-test.sh
       'cd tests &&
+      export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
       pytest -v -s lora/test_punica_ops_fp8.py &&
       pytest -v -s lora/test_punica_xpu_ops.py'
 
@@ -103,10 +107,12 @@ steps:
     - >-
       bash .buildkite/scripts/hardware_ci/run-intel-test.sh
       'cd tests &&
+      export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
       (pytest -v -s lora/test_mixtral.py --deselect="tests/lora/test_mixtral.py::test_mixtral_lora[4]" || true) &&
       pytest -v -s lora/test_quant_model.py --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model0]" --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model1]" --deselect="tests/lora/test_quant_model.py::test_quant_model_tp_equality[model0]" &&
-      pytest -v -s lora/test_qwen35_densemodel_lora.py &&
-      pytest -v -s lora/test_transformers_model.py'
+      pytest -v -s lora/test_transformers_model.py &&
+      pytest -v -s lora/test_chatglm3_tp.py &&
+      pytest -s -v lora/test_minicpmv_tp.py'
 
 - label: LoRA Multimodal
   timeout_in_minutes: 45
@@ -124,7 +130,6 @@ steps:
     - >-
       bash .buildkite/scripts/hardware_ci/run-intel-test.sh
       'cd tests &&
+      export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
       pytest -v -s lora/test_default_mm_loras.py && 
-      (pytest -v -s lora/test_qwen3_unembed.py || true) &&
-      (pytest -v -s lora/test_qwenvl.py || true) &&
       pytest -v -s lora/test_whisper.py'
@@ -49,7 +49,7 @@ steps:
       bash .buildkite/scripts/hardware_ci/run-intel-test.sh
       'export VLLM_WORKER_MULTIPROC_METHOD=spawn &&
       cd tests &&
-      pytest -v -s v1/logits_processors &&
+      pytest -v -s v1/logits_processors --ignore=v1/logits_processors/test_custom_online.py --ignore=v1/logits_processors/test_custom_offline.py &&
       pytest -v -s v1/test_oracle.py &&
       pytest -v -s v1/test_request.py &&
       pytest -v -s v1/test_outputs.py'