Commit b008acd

Merge branch 'main' into wan22_online
Signed-off-by: Samit <285365963@qq.com>
2 parents ff8b89d + 285d71b commit b008acd

File tree

220 files changed (+10619, -3644 lines)


.buildkite/pipeline.yml

Lines changed: 110 additions & 29 deletions
@@ -20,10 +20,19 @@ steps:
   - label: "Simple Unit Test"
     depends_on: image-build
     commands:
-      - pytest -v -s tests/entrypoints/
-      - pytest -v -s tests/diffusion/cache/
-      - pytest -v -s tests/model_executor/models/qwen2_5_omni/test_audio_length.py
-      - pytest -v -s tests/worker/
+      - |
+        pytest -v -s \
+          tests/entrypoints/ \
+          tests/diffusion/cache/ \
+          tests/diffusion/lora/ \
+          tests/model_executor/models/qwen2_5_omni/test_audio_length.py \
+          tests/worker/ \
+          tests/distributed/omni_connectors/test_kv_flow.py \
+          --cov=vllm_omni \
+          --cov-branch \
+          --cov-report=term-missing \
+          --cov-report=html \
+          --cov-report=xml
     agents:
       queue: "gpu_1_queue"
     plugins:
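
The consolidated step replaces four separate pytest invocations with a single run and adds coverage collection via pytest-cov. As a rough local equivalent (a minimal sketch, assuming pytest and pytest-cov are installed and the command runs from the repository root):

    # Install the test runner and coverage plugin (assumed; versions not pinned by this diff).
    pip install pytest pytest-cov

    # One pytest process over the suites; --cov=vllm_omni measures the vllm_omni
    # package, --cov-branch adds branch coverage, and the three --cov-report flags
    # emit terminal, HTML (htmlcov/ by default), and XML (coverage.xml) reports.
    pytest -v -s \
      tests/entrypoints/ \
      tests/diffusion/cache/ \
      --cov=vllm_omni \
      --cov-branch \
      --cov-report=term-missing \
      --cov-report=html \
      --cov-report=xml

Running everything in one process also means one coverage database, so results from all the suites land in a single report instead of the last run overwriting earlier ones.
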
@@ -75,6 +84,7 @@ steps:
     depends_on: image-build
     commands:
       - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+      - pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
@@ -175,34 +185,13 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"

-  - label: "Omni Model Test"
-    timeout_in_minutes: 15
-    depends_on: image-build
-    commands:
-      - export VLLM_LOGGING_LEVEL=DEBUG
-      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
-    agents:
-      queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
-    plugins:
-      - docker#v5.2.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          always-pull: true
-          propagate-environment: true
-          environment:
-            - "HF_HOME=/fsx/hf_cache"
-          volumes:
-            - "/fsx/hf_cache:/fsx/hf_cache"

-  - label: "Omni Model Test with H100"
-    timeout_in_minutes: 30
+  - label: "Benchmark Test"
+    timeout_in_minutes: 15
     depends_on: image-build
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
-      - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
-      - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
-      - pytest -s -v tests/e2e/online_serving/test_async_omni.py
+      - pytest -s -v tests/benchmarks/test_serve_cli.py
     agents:
       queue: "mithril-h100-pool"
     plugins:
@@ -232,12 +221,69 @@ steps:
             path: /mnt/hf-cache
             type: DirectoryOrCreate

+  - label: "Omni Model Test"
+    timeout_in_minutes: 15
+    depends_on: image-build
+    commands:
+      - export VLLM_LOGGING_LEVEL=DEBUG
+      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+    agents:
+      queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
+    plugins:
+      - docker#v5.2.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+          volumes:
+            - "/fsx/hf_cache:/fsx/hf_cache"
+
+  # - label: "Omni Model Test with H100"
+  #   timeout_in_minutes: 30
+  #   depends_on: image-build
+  #   commands:
+  #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  #     - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+  #     - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
+  #     - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
+  #     - pytest -s -v tests/e2e/online_serving/test_async_omni.py
+  #   agents:
+  #     queue: "mithril-h100-pool"
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           containers:
+  #             - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #               resources:
+  #                 limits:
+  #                   nvidia.com/gpu: 2
+  #               volumeMounts:
+  #                 - name: devshm
+  #                   mountPath: /dev/shm
+  #                 - name: hf-cache
+  #                   mountPath: /root/.cache/huggingface
+  #               env:
+  #                 - name: HF_HOME
+  #                   value: /root/.cache/huggingface
+  #           nodeSelector:
+  #             node.kubernetes.io/instance-type: gpu-h100-sxm
+  #           volumes:
+  #             - name: devshm
+  #               emptyDir:
+  #                 medium: Memory
+  #             - name: hf-cache
+  #               hostPath:
+  #                 path: /mnt/hf-cache
+  #                 type: DirectoryOrCreate
+
   - label: "Diffusion Image Edit Test with H100 (1 GPU)"
     timeout_in_minutes: 20
     depends_on: image-build
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/online_serving/test_i2i_multi_image_input.py
+      - pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
     agents:
       queue: "mithril-h100-pool"
     plugins:
@@ -266,3 +312,38 @@ steps:
         hostPath:
           path: /mnt/hf-cache
           type: DirectoryOrCreate
+
+  # - label: "Bagel Text2Img Model Test with H100"
+  #   timeout_in_minutes: 30
+  #   depends_on: image-build
+  #   commands:
+  #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  #     - pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
+  #   agents:
+  #     queue: "mithril-h100-pool"
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           containers:
+  #             - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #               resources:
+  #                 limits:
+  #                   nvidia.com/gpu: 1
+  #               volumeMounts:
+  #                 - name: devshm
+  #                   mountPath: /dev/shm
+  #                 - name: hf-cache
+  #                   mountPath: /root/.cache/huggingface
+  #               env:
+  #                 - name: HF_HOME
+  #                   value: /root/.cache/huggingface
+  #           nodeSelector:
+  #             node.kubernetes.io/instance-type: gpu-h100-sxm
+  #           volumes:
+  #             - name: devshm
+  #               emptyDir:
+  #                 medium: Memory
+  #             - name: hf-cache
+  #               hostPath:
+  #                 path: /mnt/hf-cache
+  #                 type: DirectoryOrCreate

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 6 additions & 0 deletions
@@ -116,6 +116,9 @@ if [[ $commands == *"--shard-id="* ]]; then
       --shm-size=16gb \
       --group-add "$render_gid" \
       --rm \
+      -e MIOPEN_DEBUG_CONV_DIRECT=0 \
+      -e MIOPEN_DEBUG_CONV_GEMM=0 \
+      -e VLLM_ROCM_USE_AITER=1 \
       -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
       -e AWS_ACCESS_KEY_ID \
@@ -148,6 +151,9 @@ else
       --shm-size=16gb \
       --group-add "$render_gid" \
       --rm \
+      -e MIOPEN_DEBUG_CONV_DIRECT=0 \
+      -e MIOPEN_DEBUG_CONV_GEMM=0 \
+      -e VLLM_ROCM_USE_AITER=1 \
       -e HF_TOKEN \
       -e AWS_ACCESS_KEY_ID \
       -e AWS_SECRET_ACCESS_KEY \
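
Injecting the variables with `-e` at container launch replaces the per-step export lines removed from test-amd.yaml below, so every command run inside the CI container inherits them. A minimal sketch of the pattern (the image name and test path are illustrative stand-ins, not the script's actual values):

    # Hypothetical image/test values; the real script assembles these dynamically.
    # "-e NAME=value" sets the variable in the container; a bare "-e HF_TOKEN"
    # passes the host's value of HF_TOKEN through without echoing it in the command.
    docker run --rm \
      --shm-size=16gb \
      -e MIOPEN_DEBUG_CONV_DIRECT=0 \
      -e MIOPEN_DEBUG_CONV_GEMM=0 \
      -e VLLM_ROCM_USE_AITER=1 \
      -e HF_TOKEN \
      some-rocm-ci-image:latest \
      pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
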

.buildkite/scripts/simple_test.sh

Lines changed: 0 additions & 58 deletions
This file was deleted.

.buildkite/test-amd.yaml

Lines changed: 44 additions & 34 deletions
@@ -8,14 +8,32 @@ steps:
     grade: Blocking
     commands:
       - export GPU_ARCHS=gfx942
-      - export MIOPEN_DEBUG_CONV_DIRECT=0
-      - export MIOPEN_DEBUG_CONV_GEMM=0
-      - export VLLM_ROCM_USE_AITER=1
-      - export VLLM_ROCM_USE_AITER_MHA=1
-      - export VLLM_ROCM_USE_AITER_LINEAR=0
-      - export VLLM_ROCM_USE_AITER_RMSNORM=0
       - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py

+  - label: "Diffusion Images API LoRA E2E"
+    timeout_in_minutes: 20
+    agent_pool: mi325_1
+    depends_on: amd-build
+    mirror_hardwares: [amdproduction]
+    grade: Blocking
+    commands:
+      - export GPU_ARCHS=gfx942
+      - export VLLM_LOGGING_LEVEL=DEBUG
+      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
+
+  - label: "Diffusion Model CPU offloading Test"
+    timeout_in_minutes: 20
+    agent_pool: mi325_1
+    depends_on: amd-build
+    mirror_hardwares: [amdproduction]
+    grade: Blocking
+    commands:
+      - export GPU_ARCHS=gfx942
+      - export VLLM_LOGGING_LEVEL=DEBUG
+      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+
   - label: "Diffusion Cache Backend Test"
     timeout_in_minutes: 15
     agent_pool: mi325_1
@@ -26,34 +44,37 @@ steps:
       - export GPU_ARCHS=gfx942
       - export VLLM_LOGGING_LEVEL=DEBUG
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - export MIOPEN_DEBUG_CONV_DIRECT=0
-      - export MIOPEN_DEBUG_CONV_GEMM=0
-      - export VLLM_ROCM_USE_AITER=1
-      - export VLLM_ROCM_USE_AITER_MHA=1
-      - export VLLM_ROCM_USE_AITER_LINEAR=0
-      - export VLLM_ROCM_USE_AITER_RMSNORM=0
       - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py

-  - label: "Diffusion Parallelism Test"
-    timeout_in_minutes: 15
+  - label: "Diffusion Sequence Parallelism Test"
+    timeout_in_minutes: 20
     agent_pool: mi325_2
     depends_on: amd-build
     mirror_hardwares: [amdproduction]
     grade: Blocking
     commands:
-      - export MIOPEN_DEBUG_CONV_DIRECT=0
-      - export MIOPEN_DEBUG_CONV_GEMM=0
+      - export GPU_ARCHS=gfx942
+      - export VLLM_LOGGING_LEVEL=DEBUG
+      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
       - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py

+  - label: "Diffusion Tensor Parallelism Test"
+    timeout_in_minutes: 20
+    agent_pool: mi325_2
+    depends_on: amd-build
+    commands:
+      - export GPU_ARCHS=gfx942
+      - export VLLM_LOGGING_LEVEL=DEBUG
+      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
+
   - label: "Diffusion GPU Worker Test"
     timeout_in_minutes: 20
     agent_pool: mi325_2
     depends_on: amd-build
     mirror_hardwares: [amdproduction]
     grade: Blocking
     commands:
-      - export MIOPEN_DEBUG_CONV_DIRECT=0
-      - export MIOPEN_DEBUG_CONV_GEMM=0
       - pytest -s -v tests/diffusion/test_diffusion_worker.py

   - label: "Omni Model Test Qwen2-5-Omni"
@@ -66,12 +87,6 @@ steps:
       - export GPU_ARCHS=gfx942
       - export VLLM_LOGGING_LEVEL=DEBUG
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - export MIOPEN_DEBUG_CONV_DIRECT=0
-      - export MIOPEN_DEBUG_CONV_GEMM=0
-      - export VLLM_ROCM_USE_AITER=1
-      - export VLLM_ROCM_USE_AITER_MHA=1
-      - export VLLM_ROCM_USE_AITER_LINEAR=0
-      - export VLLM_ROCM_USE_AITER_RMSNORM=0
       - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py

   - label: "Omni Model Test Qwen3-Omni"
@@ -83,9 +98,10 @@ steps:
     commands:
       - export VLLM_LOGGING_LEVEL=DEBUG
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - export MIOPEN_DEBUG_CONV_DIRECT=0
-      - export MIOPEN_DEBUG_CONV_GEMM=0
-      - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py
+      - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+      - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
+      - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
+      - pytest -s -v tests/e2e/online_serving/test_async_omni.py

   - label: "Diffusion Image Edit Test"
     timeout_in_minutes: 15
@@ -97,10 +113,4 @@ steps:
       - export GPU_ARCHS=gfx942
       - export VLLM_LOGGING_LEVEL=DEBUG
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - export MIOPEN_DEBUG_CONV_DIRECT=0
-      - export MIOPEN_DEBUG_CONV_GEMM=0
-      - export VLLM_ROCM_USE_AITER=1
-      - export VLLM_ROCM_USE_AITER_MHA=1
-      - export VLLM_ROCM_USE_AITER_LINEAR=0
-      - export VLLM_ROCM_USE_AITER_RMSNORM=0
-      - pytest -s -v tests/e2e/online_serving/test_i2i_multi_image_input.py
+      - pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
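
With the MIOPEN_* and VLLM_ROCM_USE_AITER* exports stripped from each step, the remaining per-step environment is just logging and multiprocessing settings; the container-level flags added to run-amd-test.sh supply the rest. To reproduce one of these steps by hand on a ROCm machine, a sketch (assuming the same checkout and a working ROCm vLLM install):

    # Per-step settings kept in test-amd.yaml:
    export GPU_ARCHS=gfx942
    export VLLM_LOGGING_LEVEL=DEBUG
    export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # Formerly exported per step; now injected by run-amd-test.sh at docker run time:
    export MIOPEN_DEBUG_CONV_DIRECT=0
    export MIOPEN_DEBUG_CONV_GEMM=0
    export VLLM_ROCM_USE_AITER=1
    pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
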
