@@ -15,15 +15,15 @@
 # CI Image configuration
 CONTAINER_TYPE: "general"
 FRAMEWORK: "vllm"
-VLLM_VERSION: 0.11.2
+VLLM_VERSION: 0.12.0
 VLLM_RAYSERVE_VERSION: 0.10.2
 PYTHON_VERSION: "py312"
 CUDA_VERSION: "cu129"
 OS_VERSION: "ubuntu22.04"
 # Prod Image configuration
-PROD_EC2_IMAGE: vllm:0.11-gpu-py312-ec2
+PROD_EC2_IMAGE: vllm:0.12-gpu-py312-ec2
 PROD_RAYSERVE_IMAGE: vllm:0.10-gpu-py312-rayserve
-PROD_SAGEMAKER_IMAGE: vllm:0.11-gpu-py312
+PROD_SAGEMAKER_IMAGE: vllm:0.12-gpu-py312
 # CI environment configuration
 FORCE_COLOR: "1"

@@ -343,30 +343,30 @@ jobs:
 docker exec ${CONTAINER_ID} sh -c '
 set -eux
 nvidia-smi
-
-# Examples Test # 30min
-cd /workdir/examples
-pip install tensorizer # for tensorizer test
-python3 offline_inference/basic/generate.py --model facebook/opt-125m
-# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+# for basic
 python3 offline_inference/basic/chat.py
-python3 offline_inference/prefix_caching.py
-python3 offline_inference/llm_engine_example.py
-
-# NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
-# vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
-# python3 offline_inference/audio_language.py --seed 0
-
-python3 offline_inference/vision_language.py --seed 0
-# broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
-# python3 offline_inference/vision_language_pooling.py --seed
-# python3 offline_inference/vision_language_multi_image.py --seed 0
-python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
 python3 offline_inference/basic/classify.py
 python3 offline_inference/basic/embed.py
 python3 offline_inference/basic/score.py
-python3 offline_inference/simple_profiling.py
+
+# for multi-modal models
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+
+# for pooling models
+python3 pooling/pooling/vision_language_pooling.py --seed 0
+
+# for features demo
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 '

 # ===================================================
@@ -919,30 +919,30 @@ jobs:
 docker exec ${CONTAINER_ID} sh -c '
 set -eux
 nvidia-smi
-
-# Examples Test # 30min
-cd /workdir/examples
-pip install tensorizer # for tensorizer test
-python3 offline_inference/basic/generate.py --model facebook/opt-125m
-# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+# for basic
 python3 offline_inference/basic/chat.py
-python3 offline_inference/prefix_caching.py
-python3 offline_inference/llm_engine_example.py
-
-# NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
-# vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
-# python3 offline_inference/audio_language.py --seed 0
-
-python3 offline_inference/vision_language.py --seed 0
-# broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
-# python3 offline_inference/vision_language_pooling.py --seed
-# python3 offline_inference/vision_language_multi_image.py --seed 0
-python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
 python3 offline_inference/basic/classify.py
 python3 offline_inference/basic/embed.py
 python3 offline_inference/basic/score.py
-python3 offline_inference/simple_profiling.py
+
+# for multi-modal models
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+
+# for pooling models
+python3 pooling/pooling/vision_language_pooling.py --seed 0
+
+# for features demo
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 '

 vllm-sagemaker-endpoint-test: