@@ -15,15 +15,15 @@
 # CI Image configuration
 CONTAINER_TYPE: "general"
 FRAMEWORK: "vllm"
-VLLM_VERSION: 0.11.2
+VLLM_VERSION: 0.12.0
 VLLM_RAYSERVE_VERSION: 0.10.2
 PYTHON_VERSION: "py312"
 CUDA_VERSION: "cu129"
 OS_VERSION: "ubuntu22.04"
 # Prod Image configuration
-PROD_EC2_IMAGE: vllm:0.11-gpu-py312-ec2
+PROD_EC2_IMAGE: vllm:0.12-gpu-py312-ec2
 PROD_RAYSERVE_IMAGE: vllm:0.10-gpu-py312-rayserve
-PROD_SAGEMAKER_IMAGE: vllm:0.11-gpu-py312
+PROD_SAGEMAKER_IMAGE: vllm:0.12-gpu-py312
 # CI environment configuration
 FORCE_COLOR: "1"

@@ -343,30 +343,30 @@ jobs:
 docker exec ${CONTAINER_ID} sh -c '
 set -eux
 nvidia-smi
-
-# Examples Test # 30min
-cd /workdir/examples
-pip install tensorizer # for tensorizer test
-python3 offline_inference/basic/generate.py --model facebook/opt-125m
-# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+# for basic
 python3 offline_inference/basic/chat.py
-python3 offline_inference/prefix_caching.py
-python3 offline_inference/llm_engine_example.py
-
-# NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
-# vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
-# python3 offline_inference/audio_language.py --seed 0
-
-python3 offline_inference/vision_language.py --seed 0
-# broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
-# python3 offline_inference/vision_language_pooling.py --seed
-# python3 offline_inference/vision_language_multi_image.py --seed 0
-python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
 python3 offline_inference/basic/classify.py
 python3 offline_inference/basic/embed.py
 python3 offline_inference/basic/score.py
-python3 offline_inference/simple_profiling.py
+
+# for multi-modal models
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+
+# for pooling models
+python3 pooling/pooling/vision_language_pooling.py --seed 0
+
+# for features demo
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 '

 # ===================================================
@@ -919,30 +919,30 @@ jobs:
 docker exec ${CONTAINER_ID} sh -c '
 set -eux
 nvidia-smi
-
-# Examples Test # 30min
-cd /workdir/examples
-pip install tensorizer # for tensorizer test
-python3 offline_inference/basic/generate.py --model facebook/opt-125m
-# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+# for basic
 python3 offline_inference/basic/chat.py
-python3 offline_inference/prefix_caching.py
-python3 offline_inference/llm_engine_example.py
-
-# NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
-# vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
-# python3 offline_inference/audio_language.py --seed 0
-
-python3 offline_inference/vision_language.py --seed 0
-# broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
-# python3 offline_inference/vision_language_pooling.py --seed
-# python3 offline_inference/vision_language_multi_image.py --seed 0
-python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
 python3 offline_inference/basic/classify.py
 python3 offline_inference/basic/embed.py
 python3 offline_inference/basic/score.py
-python3 offline_inference/simple_profiling.py
+
+# for multi-modal models
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+
+# for pooling models
+python3 pooling/pooling/vision_language_pooling.py --seed 0
+
+# for features demo
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 '

 vllm-sagemaker-endpoint-test: