
Commit 90704e9

Merge branch 'vllm-project:main' into add_custom_voice
2 parents: 1f93f21 + 87847a2

28 files changed: +886 / -504 lines

.buildkite/test-nightly.yaml

Lines changed: 45 additions & 0 deletions
````diff
@@ -0,0 +1,45 @@
+steps:
+- label: ":docker: Build image"
+  key: image-build
+  commands:
+  - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+  - "docker build --file docker/Dockerfile.ci -t vllm-omni-ci ."
+  - "docker tag vllm-omni-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
+  - "docker push public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
+  agents:
+    queue: "cpu_queue_premerge"
+
+- label: "Omni Model Test with H100"
+  timeout_in_minutes: 180
+  depends_on: image-build
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py
+  agents:
+    queue: "mithril-h100-pool"
+  plugins:
+  - kubernetes:
+      podSpec:
+        containers:
+        - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          resources:
+            limits:
+              nvidia.com/gpu: 2
+          volumeMounts:
+          - name: devshm
+            mountPath: /dev/shm
+          - name: hf-cache
+            mountPath: /root/.cache/huggingface
+          env:
+          - name: HF_HOME
+            value: /root/.cache/huggingface
+        nodeSelector:
+          node.kubernetes.io/instance-type: gpu-h100-sxm
+        volumes:
+        - name: devshm
+          emptyDir:
+            medium: Memory
+        - name: hf-cache
+          hostPath:
+            path: /mnt/hf-cache
+            type: DirectoryOrCreate
````
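For local reproduction, the H100 test step reduces to its two commands; a minimal sketch, assuming a machine with at least two GPUs and the dependencies baked into the CI image:

```bash
# Rough local equivalent of the "Omni Model Test with H100" step.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py
```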

docker/Dockerfile.npu

Lines changed: 1 addition & 5 deletions
````diff
@@ -7,12 +7,8 @@ WORKDIR ${APP_DIR}
 
 COPY . .
 
-# Remove this replace when the dispatch of requirements is ready
-RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
-    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
-
 # Install vllm-omni with dev dependencies
-RUN pip install --no-cache-dir -e .
+RUN pip install --no-cache-dir -e . --no-build-isolation
 
 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
 
````

docker/Dockerfile.npu.a3

Lines changed: 1 addition & 5 deletions
````diff
@@ -7,12 +7,8 @@ WORKDIR ${APP_DIR}
 
 COPY . .
 
-# Remove this replace when the dispatch of requirements is ready
-RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
-    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
-
 # Install vllm-omni with dev dependencies
-RUN pip install --no-cache-dir -e .
+RUN pip install --no-cache-dir -e . --no-build-isolation
 
 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
 
````

docker/Dockerfile.rocm

Lines changed: 1 addition & 5 deletions
````diff
@@ -15,11 +15,7 @@ RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni
 
 # Step 2: Copy vllm-omni code and install without uv
 COPY . ${COMMON_WORKDIR}/vllm-omni
-RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
-
-# When we are installing onnxruntime-rocm, we need to uninstall the system-installed onnxruntime first.
-# These are the dependencies of Qwen3-TTS.
-RUN uv pip uninstall onnxruntime --system && uv pip install --no-cache-dir onnxruntime-rocm sox --system
+RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]" --no-build-isolation
 
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 
````

docs/getting_started/installation/gpu.md

Lines changed: 2 additions & 0 deletions
````diff
@@ -26,6 +26,8 @@ vLLM-Omni is a Python library that supports the following GPU variants. The libr
 
 ### Pre-built wheels
 
+Note: Pre-built wheels are currently only available for vLLM-Omni 0.11.0rc1, 0.12.0rc1, 0.14.0rc1, 0.14.0. For the latest version, please [build from source](https://docs.vllm.ai/projects/vllm-omni/en/latest/getting_started/installation/gpu/#build-wheel-from-source).
+
 === "NVIDIA CUDA"
 
     --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-wheels"
````
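A pinned install of one of the listed releases would look like the sketch below; the 0.14.0 pin comes from the note above, and the command assumes the wheel is published under the same `vllm-omni` distribution name used elsewhere in these docs:

```bash
# Install a specific pre-built vLLM-Omni release (sketch; version from the note above).
uv pip install vllm-omni==0.14.0
```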

docs/getting_started/installation/gpu/cuda.inc.md

Lines changed: 0 additions & 2 deletions
````diff
@@ -17,8 +17,6 @@ Therefore, it is recommended to install vLLM and vLLM-Omni with a **fresh new**
 # --8<-- [start:pre-built-wheels]
 
 #### Installation of vLLM
-Note: Pre-built wheels are currently only available for vLLM-Omni 0.11.0rc1, 0.12.0rc1, 0.14.0rc1, 0.14.0. For the latest version, please [build from source](https://docs.vllm.ai/projects/vllm-omni/en/latest/getting_started/installation/gpu/#build-wheel-from-source).
-
 
 vLLM-Omni is built based on vLLM. Please install it with command below.
 ```bash
````

docs/getting_started/installation/gpu/rocm.inc.md

Lines changed: 50 additions & 0 deletions
````diff
@@ -9,10 +9,60 @@ vLLM-Omni current recommends the steps in under setup through Docker Images.
 
 # --8<-- [start:pre-built-wheels]
 
+#### Installation of vLLM
+
+vLLM-Omni is built based on vLLM. Please install it with the command below.
+```bash
+uv pip install vllm==0.14.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.14.0/rocm700
+```
+
+#### Installation of vLLM-Omni
+
+```bash
+# We need to add --no-build-isolation since torch is not obtained from PyPI;
+# we have to install using the torch already present in our environment.
+uv pip install vllm-omni
+
+# Optional, if you want to run Qwen3 TTS
+uv pip uninstall onnxruntime  # should be removed before we can install onnxruntime-rocm
+uv pip install onnxruntime-rocm sox
+```
+
 # --8<-- [end:pre-built-wheels]
 
 # --8<-- [start:build-wheel-from-source]
 
+#### Installation of vLLM
+If you do not need to modify the source code of vLLM, you can directly install the stable 0.14.0 release of the library:
+
+```bash
+uv pip install vllm==0.14.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.14.0/rocm700
+```
+
+The 0.14.0 release of vLLM requires a ROCm 7.0 environment.
+
+#### Installation of vLLM-Omni
+Since vllm-omni is rapidly evolving, it is recommended to install it from source:
+```bash
+git clone https://github.com/vllm-project/vllm-omni.git
+cd vllm-omni
+VLLM_OMNI_TARGET_DEVICE=rocm uv pip install -e .
+# OR
+uv pip install -e . --no-build-isolation
+```
+
+<details><summary>(Optional) Installation of vLLM from source</summary>
+If you want to check, modify, or debug the source code of vLLM, install the library from source with the following instructions:
+
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+git checkout v0.14.0
+python3 -m pip install -r requirements/rocm.txt
+python3 setup.py develop
+```
+
 # --8<-- [end:build-wheel-from-source]
 
 # --8<-- [start:build-docker]
````
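As a quick sanity check after either install path above, the installed distribution can be queried by name; a minimal sketch (the `vllm-omni` distribution name is taken from the install commands in the diff):

```bash
# Print the installed vllm-omni version (works for wheel or editable installs).
python3 -c "from importlib.metadata import version; print(version('vllm-omni'))"
```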

docs/getting_started/installation/npu/npu.inc.md

Lines changed: 2 additions & 5 deletions
````diff
@@ -38,12 +38,9 @@ docker run --rm \
 cd /vllm-workspace
 git clone -b v0.14.0 https://github.com/vllm-project/vllm-omni.git
 
-# Remove this replace when the dispatch of requirements is ready
-RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
-    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
-
 cd vllm-omni
-pip install -v -e .
+VLLM_OMNI_TARGET_DEVICE=npu pip install -v -e .
+# OR pip install -v -e . --no-build-isolation
 export VLLM_WORKER_MULTIPROC_METHOD=spawn
 ```
 
````

docs/user_guide/diffusion/parallelism_acceleration.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -49,7 +49,7 @@ The following table shows which models are currently supported by parallelism me
 
 | Model | Model Identifier | Ulysses-SP | Ring-SP | Tensor-Parallel |
 |-------|------------------|------------|---------|-----------------|
-| **Wan2.2** | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | ✅ | ✅ | |
+| **Wan2.2** | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | ✅ | ✅ | ✅ |
 
 ### Tensor Parallelism
 
````

examples/offline_inference/text_to_video/text_to_video.py

Lines changed: 8 additions & 2 deletions
````diff
@@ -109,7 +109,12 @@ def parse_args() -> argparse.Namespace:
         choices=[1, 2],
         help="Number of GPUs used for classifier free guidance parallel size.",
     )
-
+    parser.add_argument(
+        "--tensor_parallel_size",
+        type=int,
+        default=1,
+        help="Number of GPUs used for tensor parallelism (TP) inside the DiT.",
+    )
     return parser.parse_args()
 
 
@@ -141,6 +146,7 @@ def main():
         ulysses_degree=args.ulysses_degree,
         ring_degree=args.ring_degree,
         cfg_parallel_size=args.cfg_parallel_size,
+        tensor_parallel_size=args.tensor_parallel_size,
     )
 
     # Check if profiling is requested via environment variable
@@ -173,7 +179,7 @@
     print(f" Inference steps: {args.num_inference_steps}")
     print(f" Frames: {args.num_frames}")
     print(
-        f" Parallel configuration: ulysses_degree={args.ulysses_degree}, ring_degree={args.ring_degree}, cfg_parallel_size={args.cfg_parallel_size}"
+        f" Parallel configuration: ulysses_degree={args.ulysses_degree}, ring_degree={args.ring_degree}, cfg_parallel_size={args.cfg_parallel_size}, tensor_parallel_size={args.tensor_parallel_size}"
     )
     print(f" Video size: {args.width}x{args.height}")
     print(f"{'=' * 60}\n")
````
