
Commit 4ff3bc1

test vllm
Signed-off-by: Junpu Fan <junpu@amazon.com>
1 parent: 29bfbdd

File tree: 2 files changed (+45, -45 lines)

.github/workflows/pr-vllm.yml

Lines changed: 43 additions & 43 deletions
@@ -15,15 +15,15 @@ env:
   # CI Image configuration
   CONTAINER_TYPE: "general"
   FRAMEWORK: "vllm"
-  VLLM_VERSION: 0.11.2
+  VLLM_VERSION: 0.12.0
   VLLM_RAYSERVE_VERSION: 0.10.2
   PYTHON_VERSION: "py312"
   CUDA_VERSION: "cu129"
   OS_VERSION: "ubuntu22.04"
   # Prod Image configuration
-  PROD_EC2_IMAGE: vllm:0.11-gpu-py312-ec2
+  PROD_EC2_IMAGE: vllm:0.12-gpu-py312-ec2
   PROD_RAYSERVE_IMAGE: vllm:0.10-gpu-py312-rayserve
-  PROD_SAGEMAKER_IMAGE: vllm:0.11-gpu-py312
+  PROD_SAGEMAKER_IMAGE: vllm:0.12-gpu-py312
   # CI environment configuration
   FORCE_COLOR: "1"
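Note: the version bump has to land in two places, the CI pin (VLLM_VERSION) and the prod image tags derived from it. A quick way to confirm the new pin matches the upstream base is a one-liner like the following; this is a minimal sketch, assuming the public vllm/vllm-openai:v0.12.0 tag from the Dockerfile below, and the --entrypoint override assumes that image defaults to launching the API server.

    # Print the vLLM version baked into the upstream base image; expect 0.12.0.
    docker run --rm --entrypoint python3 vllm/vllm-openai:v0.12.0 \
      -c 'import vllm; print(vllm.__version__)'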

@@ -343,30 +343,30 @@ jobs:
 docker exec ${CONTAINER_ID} sh -c '
 set -eux
 nvidia-smi
-
-# Examples Test # 30min
-cd /workdir/examples
-pip install tensorizer # for tensorizer test
-python3 offline_inference/basic/generate.py --model facebook/opt-125m
-# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+# for basic
 python3 offline_inference/basic/chat.py
-python3 offline_inference/prefix_caching.py
-python3 offline_inference/llm_engine_example.py
-
-# NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
-# vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
-# python3 offline_inference/audio_language.py --seed 0
-
-python3 offline_inference/vision_language.py --seed 0
-# broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
-# python3 offline_inference/vision_language_pooling.py --seed
-# python3 offline_inference/vision_language_multi_image.py --seed 0
-python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
 python3 offline_inference/basic/classify.py
 python3 offline_inference/basic/embed.py
 python3 offline_inference/basic/score.py
-python3 offline_inference/simple_profiling.py
+
+# for multi-modal models
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+
+# for pooling models
+python3 pooling/pooling/vision_language_pooling.py --seed 0
+
+# for features demo
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 '
 
 # ===================================================
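Note: the rewritten test block groups the examples by category (basic, multi-modal, pooling, features) instead of one flat list, re-enables the audio_language.py, vision_language_multi_image.py, and Llama-2 CPU-offload runs that were blocked before v0.12.0, and adds the eagle/eagle3 spec_decode runs. To iterate on a single group without a full CI round-trip, a sketch like the one below can work; the bind mount, bash entrypoint, and GPU flags are assumptions about a local setup, not taken from this workflow.

    # Smoke-test just the "basic" group against a local examples checkout.
    docker run --rm --gpus all -v "$PWD:/workdir" \
      --entrypoint bash vllm/vllm-openai:v0.12.0 -c '
        set -eux
        cd /workdir/examples
        python3 offline_inference/basic/chat.py
        python3 offline_inference/basic/generate.py --model facebook/opt-125m
      '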
@@ -919,30 +919,30 @@ jobs:
 docker exec ${CONTAINER_ID} sh -c '
 set -eux
 nvidia-smi
-
-# Examples Test # 30min
-cd /workdir/examples
-pip install tensorizer # for tensorizer test
-python3 offline_inference/basic/generate.py --model facebook/opt-125m
-# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+# for basic
 python3 offline_inference/basic/chat.py
-python3 offline_inference/prefix_caching.py
-python3 offline_inference/llm_engine_example.py
-
-# NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
-# vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
-# python3 offline_inference/audio_language.py --seed 0
-
-python3 offline_inference/vision_language.py --seed 0
-# broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
-# python3 offline_inference/vision_language_pooling.py --seed
-# python3 offline_inference/vision_language_multi_image.py --seed 0
-python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
 python3 offline_inference/basic/classify.py
 python3 offline_inference/basic/embed.py
 python3 offline_inference/basic/score.py
-python3 offline_inference/simple_profiling.py
+
+# for multi-modal models
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+
+# for pooling models
+python3 pooling/pooling/vision_language_pooling.py --seed 0
+
+# for features demo
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 '
 
 vllm-sagemaker-endpoint-test:

docker/vllm/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,14 +1,14 @@
 # Declare the argument as default to use as input
 # base image: https://hub.docker.com/r/vllm/vllm-openai/tags
-ARG BASE_IMAGE=vllm/vllm-openai:v0.11.0
+ARG BASE_IMAGE=vllm/vllm-openai:v0.12.0
 
 # Use input argument as base image
 FROM $BASE_IMAGE AS base
 
 # ====================== common =========================================
 ARG PYTHON="python3"
 LABEL maintainer="Amazon AI"
-ARG EFA_VERSION="1.43.3"
+ARG EFA_VERSION="1.45.1"
 LABEL dlc_major_version="1"
 ENV DEBIAN_FRONTEND=noninteractive \
     LANG=C.UTF-8 \
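Note: with the base image and EFA installer bumped, the image can be rebuilt against the new base. A hedged sketch follows; using the repo root as build context is an assumption, and the output tag simply mirrors PROD_EC2_IMAGE from the workflow above.

    # Rebuild the image on top of the v0.12.0 upstream base with the newer EFA installer.
    docker build \
      --build-arg BASE_IMAGE=vllm/vllm-openai:v0.12.0 \
      --build-arg EFA_VERSION=1.45.1 \
      -f docker/vllm/Dockerfile \
      -t vllm:0.12-gpu-py312-ec2 .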
