Refactor vllm sm endpoint test to use pytest #118

Workflow file for this run

name: PR - vLLM
on:
pull_request:
branches:
- main
paths:
- "**vllm**"
permissions:
contents: read
env:
# CI Image configuration
VLLM_VERSION: 0.11.2
VLLM_RAYSERVE_VERSION: 0.10.2
PYTHON_VERSION: "py312"
CUDA_VERSION: "cu129"
OS_VERSION: "ubuntu22.04"
# Prod Image configuration
PROD_EC2_IMAGE: vllm:0.11-gpu-py312-ec2
PROD_RAYSERVE_IMAGE: vllm:0.10-gpu-py312-rayserve
PROD_SAGEMAKER_IMAGE: vllm:0.11-gpu-py312
# CI environment configuration
FORCE_COLOR: "1"
jobs:
check-changes:
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-check-changes-${{ github.event.pull_request.number }}
cancel-in-progress: true
outputs:
build-change: ${{ steps.changes.outputs.build-change }}
test-change: ${{ steps.changes.outputs.test-change }}
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Setup python
uses: actions/setup-python@v6
with:
python-version: "3.12"
- name: Run pre-commit
uses: pre-commit/action@v3.0.1
with:
extra_args: --all-files
- name: Detect file changes
id: changes
uses: dorny/paths-filter@v3
with:
filters: |
build-change:
- "docker/vllm/**"
- "scripts/vllm/**"
- "scripts/common/**"
- "scripts/telemetry/**"
- ".github/workflows/pr-vllm*"
test-change:
- "test/vllm/**"
# ==============================================
# =============== vLLM EC2 jobs ================
# ==============================================
build-vllm-ec2-image:
needs: [check-changes]
if: needs.check-changes.outputs.build-change == 'true'
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-build-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-build-vllm-ec2-image-${{ github.event.pull_request.number }}
cancel-in-progress: true
outputs:
ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
steps:
- uses: actions/checkout@v5
- run: .github/scripts/buildkitd.sh
- name: ECR login
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
aws-region: ${{ vars.AWS_REGION }}
- name: Resolve image URI for build
id: image-uri-build
run: |
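# Export the URI to the job env (for later steps in this job) and to the step output (for downstream jobs via needs)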
CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-ec2-pr-${{ github.event.pull_request.number }}
echo "Image URI to build: ${CI_IMAGE_URI}"
echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
- name: Build image
run: |
# base image: https://hub.docker.com/r/vllm/vllm-openai/tags
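# --cache-to=type=inline embeds layer-cache metadata in the pushed image so later PR builds can reuse layers via --cache-from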
docker buildx build --progress plain \
--build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
--build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_VERSION }}" \
--cache-to=type=inline \
--cache-from=type=registry,ref=${CI_IMAGE_URI} \
--tag ${CI_IMAGE_URI} \
--target vllm-ec2 \
-f docker/vllm/Dockerfile .
- name: Container push
run: |
docker push ${CI_IMAGE_URI}
docker rmi ${CI_IMAGE_URI}
set-ec2-test-environment:
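# Runs when the build job succeeded or was skipped (test-only change); a skipped build falls back to the prod image below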
needs: [check-changes, build-vllm-ec2-image]
if: |
always() && !failure() && !cancelled() &&
(needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.test-change == 'true')
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-set-ec2-test-environment-${{ github.event.pull_request.number }}
cancel-in-progress: true
outputs:
aws-account-id: ${{ steps.set-env.outputs.AWS_ACCOUNT_ID }}
image-uri: ${{ steps.set-env.outputs.IMAGE_URI }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set test environment
id: set-env
run: |
if [[ "${{ needs.build-vllm-ec2-image.result }}" == "success" ]]; then
AWS_ACCOUNT_ID=${{ vars.CI_AWS_ACCOUNT_ID }}
IMAGE_URI=${{ needs.build-vllm-ec2-image.outputs.ci-image }}
else
AWS_ACCOUNT_ID=${{ vars.PROD_AWS_ACCOUNT_ID }}
IMAGE_URI=${{ vars.PROD_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/${{ env.PROD_EC2_IMAGE }}
fi
echo "Image URI to test: ${IMAGE_URI}"
echo "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" >> ${GITHUB_OUTPUT}
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
vllm-ec2-regression-test:
needs: [build-vllm-ec2-image, set-ec2-test-environment]
if: success()
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-ec2-regression-test-${{ github.event.pull_request.number }}
cancel-in-progress: true
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Container pull
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ needs.set-ec2-test-environment.outputs.aws-account-id }}
aws-region: ${{ vars.AWS_REGION }}
image-uri: ${{ needs.set-ec2-test-environment.outputs.image-uri }}
- name: Checkout vLLM tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v${{ env.VLLM_VERSION }}
path: vllm_source
- name: Start container
run: |
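# Run the image under test detached with GPU access, mounting the runner HF/vLLM caches
# and the vLLM checkout as /workdir; the container ID is saved for the docker exec steps below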
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
${{ needs.set-ec2-test-environment.outputs.image-uri }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Setup for vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
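# Move the checked-out vllm package aside so tests import the vllm installed in the image, not the source tree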
mkdir src
mv vllm src/vllm
'
- name: Run vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi
# Regression Test # 7min
cd /workdir/tests
uv pip install --system modelscope
pytest -v -s test_regression.py
'
vllm-ec2-cuda-test:
needs: [build-vllm-ec2-image, set-ec2-test-environment]
if: success()
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-ec2-cuda-test-${{ github.event.pull_request.number }}
cancel-in-progress: true
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Container pull
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ needs.set-ec2-test-environment.outputs.aws-account-id }}
aws-region: ${{ vars.AWS_REGION }}
image-uri: ${{ needs.set-ec2-test-environment.outputs.image-uri }}
- name: Checkout vLLM tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v${{ env.VLLM_VERSION }}
path: vllm_source
- name: Start container
run: |
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
${{ needs.set-ec2-test-environment.outputs.image-uri }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Setup for vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
mkdir src
mv vllm src/vllm
'
- name: Run vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi
# Platform Tests (CUDA) # 4min
cd /workdir/tests
pytest -v -s cuda/test_cuda_context.py
'
vllm-ec2-example-test:
needs: [build-vllm-ec2-image, set-ec2-test-environment]
if: success()
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-ec2-example-test-${{ github.event.pull_request.number }}
cancel-in-progress: true
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Container pull
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ needs.set-ec2-test-environment.outputs.aws-account-id }}
aws-region: ${{ vars.AWS_REGION }}
image-uri: ${{ needs.set-ec2-test-environment.outputs.image-uri }}
- name: Checkout vLLM tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v${{ env.VLLM_VERSION }}
path: vllm_source
- name: Start container
run: |
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
${{ needs.set-ec2-test-environment.outputs.image-uri }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Setup for vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
mkdir src
mv vllm src/vllm
'
- name: Run vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi
# Examples Test # 30min
cd /workdir/examples
pip install tensorizer # for tensorizer test
python3 offline_inference/basic/generate.py --model facebook/opt-125m
# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
python3 offline_inference/basic/chat.py
python3 offline_inference/prefix_caching.py
python3 offline_inference/llm_engine_example.py
# NOTE: A change in the Ultravox model changed the class of the audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
# vLLM fixed this in https://github.com/vllm-project/vllm/pull/29588, but the fix is not included in vLLM<=0.11
# python3 offline_inference/audio_language.py --seed 0
python3 offline_inference/vision_language.py --seed 0
python3 offline_inference/vision_language_pooling.py --seed 0
python3 offline_inference/vision_language_multi_image.py --seed 0
python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
python3 offline_inference/basic/classify.py
python3 offline_inference/basic/embed.py
python3 offline_inference/basic/score.py
python3 offline_inference/simple_profiling.py
'
# ===================================================
# =============== vLLM RayServe jobs ================
# ===================================================
build-vllm-rayserve-image:
needs: [check-changes]
if: needs.check-changes.outputs.build-change == 'true'
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-build-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-build-vllm-rayserve-image-${{ github.event.pull_request.number }}
cancel-in-progress: true
outputs:
ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
steps:
- uses: actions/checkout@v5
- run: .github/scripts/buildkitd.sh
- name: ECR login
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
aws-region: ${{ vars.AWS_REGION }}
- name: Resolve image URI for build
id: image-uri-build
run: |
CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-rayserve-ec2-pr-${{ github.event.pull_request.number }}
echo "Image URI to build: ${CI_IMAGE_URI}"
echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
- name: Build image
run: |
# base image: https://hub.docker.com/r/vllm/vllm-openai/tags
docker buildx build --progress plain \
--build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
--build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_RAYSERVE_VERSION }}" \
--cache-to=type=inline \
--cache-from=type=registry,ref=${CI_IMAGE_URI} \
--tag ${CI_IMAGE_URI} \
--target vllm-rayserve-ec2 \
-f docker/vllm/Dockerfile .
- name: Container push
run: |
docker push ${CI_IMAGE_URI}
docker rmi ${CI_IMAGE_URI}
set-rayserve-test-environment:
needs: [check-changes, build-vllm-rayserve-image]
if: |
always() && !failure() && !cancelled() &&
(needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.test-change == 'true')
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-set-rayserve-test-environment-${{ github.event.pull_request.number }}
cancel-in-progress: true
outputs:
aws-account-id: ${{ steps.set-env.outputs.AWS_ACCOUNT_ID }}
image-uri: ${{ steps.set-env.outputs.IMAGE_URI }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set test environment
id: set-env
run: |
if [[ "${{ needs.build-vllm-rayserve-image.result }}" == "success" ]]; then
AWS_ACCOUNT_ID=${{ vars.CI_AWS_ACCOUNT_ID }}
IMAGE_URI=${{ needs.build-vllm-rayserve-image.outputs.ci-image }}
else
AWS_ACCOUNT_ID=${{ vars.PROD_AWS_ACCOUNT_ID }}
IMAGE_URI=${{ vars.PROD_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/${{ env.PROD_RAYSERVE_IMAGE }}
fi
echo "Image URI to test: ${IMAGE_URI}"
echo "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" >> ${GITHUB_OUTPUT}
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
vllm-rayserve-regression-test:
needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
if: success()
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-rayserve-regression-test-${{ github.event.pull_request.number }}
cancel-in-progress: true
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Container pull
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ needs.set-rayserve-test-environment.outputs.aws-account-id }}
aws-region: ${{ vars.AWS_REGION }}
image-uri: ${{ needs.set-rayserve-test-environment.outputs.image-uri }}
- name: Checkout vLLM tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v${{ env.VLLM_RAYSERVE_VERSION }}
path: vllm_source
- name: Start container
run: |
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
${{ needs.set-rayserve-test-environment.outputs.image-uri }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Setup for vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
mkdir src
mv vllm src/vllm
'
- name: Run vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi
# Regression Test # 7min
cd /workdir/tests
uv pip install --system modelscope
pytest -v -s test_regression.py
'
vllm-rayserve-cuda-test:
needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
if: success()
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-rayserve-cuda-test-${{ github.event.pull_request.number }}
cancel-in-progress: true
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Container pull
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ needs.set-rayserve-test-environment.outputs.aws-account-id }}
aws-region: ${{ vars.AWS_REGION }}
image-uri: ${{ needs.set-rayserve-test-environment.outputs.image-uri }}
- name: Checkout vLLM tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v${{ env.VLLM_RAYSERVE_VERSION }}
path: vllm_source
- name: Start container
run: |
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
${{ needs.set-rayserve-test-environment.outputs.image-uri }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Setup for vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
mkdir src
mv vllm src/vllm
'
- name: Run vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi
# Platform Tests (CUDA) # 4min
cd /workdir/tests
pytest -v -s cuda/test_cuda_context.py
'
vllm-rayserve-example-test:
needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
if: success()
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-rayserve-example-test-${{ github.event.pull_request.number }}
cancel-in-progress: true
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Container pull
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ needs.set-rayserve-test-environment.outputs.aws-account-id }}
aws-region: ${{ vars.AWS_REGION }}
image-uri: ${{ needs.set-rayserve-test-environment.outputs.image-uri }}
- name: Checkout vLLM tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v${{ env.VLLM_RAYSERVE_VERSION }}
path: vllm_source
- name: Start container
run: |
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
${{ needs.set-rayserve-test-environment.outputs.image-uri }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Setup for vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
mkdir src
mv vllm src/vllm
'
- name: Run vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi
# Examples Test # 30min
cd /workdir/examples
pip install tensorizer # for tensorizer test
python3 offline_inference/basic/generate.py --model facebook/opt-125m
# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
python3 offline_inference/basic/chat.py
python3 offline_inference/prefix_caching.py
python3 offline_inference/llm_engine_example.py
# NOTE: A change in the Ultravox model changed the class of the audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
# vLLM fixed this in https://github.com/vllm-project/vllm/pull/29588, but the fix is not included in vLLM<=0.11
# python3 offline_inference/audio_language.py --seed 0
python3 offline_inference/vision_language.py --seed 0
python3 offline_inference/vision_language_pooling.py --seed 0
python3 offline_inference/vision_language_multi_image.py --seed 0
python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
python3 offline_inference/basic/classify.py
python3 offline_inference/basic/embed.py
python3 offline_inference/basic/score.py
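# VLLM_USE_V1=0 runs the legacy V0 engine for this profiling example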
VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
'
# ====================================================
# =============== vLLM SageMaker jobs ================
# ====================================================
build-vllm-sagemaker-image:
needs: [check-changes]
if: needs.check-changes.outputs.build-change == 'true'
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-build-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-build-vllm-sagemaker-image-${{ github.event.pull_request.number }}
cancel-in-progress: true
outputs:
ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
steps:
- uses: actions/checkout@v5
- run: .github/scripts/buildkitd.sh
- name: ECR login
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
aws-region: ${{ vars.AWS_REGION }}
- name: Resolve image URI for build
id: image-uri-build
run: |
CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
echo "Image URI to build: ${CI_IMAGE_URI}"
echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
- name: Build image
run: |
# base image: https://hub.docker.com/r/vllm/vllm-openai/tags
docker buildx build --progress plain \
--build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
--build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_VERSION }}" \
--cache-to=type=inline \
--cache-from=type=registry,ref=${CI_IMAGE_URI} \
--tag ${CI_IMAGE_URI} \
--target vllm-sagemaker \
-f docker/vllm/Dockerfile .
- name: Container push
run: |
docker push ${CI_IMAGE_URI}
docker rmi ${CI_IMAGE_URI}
set-sagemaker-test-environment:
needs: [check-changes, build-vllm-sagemaker-image]
if: |
always() && !failure() && !cancelled() &&
(needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.test-change == 'true')
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-set-sagemaker-test-environment-${{ github.event.pull_request.number }}
cancel-in-progress: true
outputs:
aws-account-id: ${{ steps.set-env.outputs.AWS_ACCOUNT_ID }}
image-uri: ${{ steps.set-env.outputs.IMAGE_URI }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set test environment
id: set-env
run: |
if [[ "${{ needs.build-vllm-sagemaker-image.result }}" == "success" ]]; then
AWS_ACCOUNT_ID=${{ vars.CI_AWS_ACCOUNT_ID }}
IMAGE_URI=${{ needs.build-vllm-sagemaker-image.outputs.ci-image }}
else
AWS_ACCOUNT_ID=${{ vars.PROD_AWS_ACCOUNT_ID }}
IMAGE_URI=${{ vars.PROD_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/${{ env.PROD_SAGEMAKER_IMAGE }}
fi
echo "Image URI to test: ${IMAGE_URI}"
echo "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" >> ${GITHUB_OUTPUT}
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
vllm-sagemaker-regression-test:
needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
if: success()
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-sagemaker-regression-test-${{ github.event.pull_request.number }}
cancel-in-progress: true
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Container pull
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ needs.set-sagemaker-test-environment.outputs.aws-account-id }}
aws-region: ${{ vars.AWS_REGION }}
image-uri: ${{ needs.set-sagemaker-test-environment.outputs.image-uri }}
- name: Checkout vLLM tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v${{ env.VLLM_VERSION }}
path: vllm_source
- name: Start container
run: |
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
${{ needs.set-sagemaker-test-environment.outputs.image-uri }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Setup for vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
mkdir src
mv vllm src/vllm
'
- name: Run vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi
# Regression Test # 7min
cd /workdir/tests
uv pip install --system modelscope
pytest -v -s test_regression.py
'
vllm-sagemaker-cuda-test:
needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
if: success()
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-sagemaker-cuda-test-${{ github.event.pull_request.number }}
cancel-in-progress: true
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Container pull
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ needs.set-sagemaker-test-environment.outputs.aws-account-id }}
aws-region: ${{ vars.AWS_REGION }}
image-uri: ${{ needs.set-sagemaker-test-environment.outputs.image-uri }}
- name: Checkout vLLM tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v${{ env.VLLM_VERSION }}
path: vllm_source
- name: Start container
run: |
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
${{ needs.set-sagemaker-test-environment.outputs.image-uri }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Setup for vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
mkdir src
mv vllm src/vllm
'
- name: Run vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi
# Platform Tests (CUDA) # 4min
cd /workdir/tests
pytest -v -s cuda/test_cuda_context.py
'
vllm-sagemaker-example-test:
needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
if: success()
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-sagemaker-example-test-${{ github.event.pull_request.number }}
cancel-in-progress: true
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Container pull
uses: ./.github/actions/ecr-authenticate
with:
aws-account-id: ${{ needs.set-sagemaker-test-environment.outputs.aws-account-id }}
aws-region: ${{ vars.AWS_REGION }}
image-uri: ${{ needs.set-sagemaker-test-environment.outputs.image-uri }}
- name: Checkout vLLM tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v${{ env.VLLM_VERSION }}
path: vllm_source
- name: Start container
run: |
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
${{ needs.set-sagemaker-test-environment.outputs.image-uri }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Setup for vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
mkdir src
mv vllm src/vllm
'
- name: Run vLLM tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi
# Examples Test # 30min
cd /workdir/examples
pip install tensorizer # for tensorizer test
python3 offline_inference/basic/generate.py --model facebook/opt-125m
# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
python3 offline_inference/basic/chat.py
python3 offline_inference/prefix_caching.py
python3 offline_inference/llm_engine_example.py
# NOTE: A change in the Ultravox model changed the class of the audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
# vLLM fixed this in https://github.com/vllm-project/vllm/pull/29588, but the fix is not included in vLLM<=0.11
# python3 offline_inference/audio_language.py --seed 0
python3 offline_inference/vision_language.py --seed 0
python3 offline_inference/vision_language_pooling.py --seed 0
python3 offline_inference/vision_language_multi_image.py --seed 0
python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
python3 offline_inference/basic/classify.py
python3 offline_inference/basic/embed.py
python3 offline_inference/basic/score.py
python3 offline_inference/simple_profiling.py
'
vllm-sagemaker-endpoint-test:
needs: [set-sagemaker-test-environment]
if: |
always() && !failure() && !cancelled() &&
needs.set-sagemaker-test-environment.result == 'success'
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:default-runner
buildspec-override:true
concurrency:
group: ${{ github.workflow }}-vllm-sagemaker-endpoint-test-${{ github.event.pull_request.number }}
cancel-in-progress: false
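# presumably kept false so a newer run does not cancel an endpoint deployment mid-flight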
steps:
- name: Checkout DLC source
uses: actions/checkout@v5
- name: Install test dependencies
run: |
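# The endpoint test exercises a SageMaker endpoint rather than running inference on the runner, so a CPU-only default runner is enough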
uv venv
source .venv/bin/activate
uv pip install -r test/requirements.txt
uv pip install -r test/vllm/sagemaker/requirements.txt
- name: Run sagemaker endpoint test
run: |
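# Each run step starts a fresh shell, so the venv created above is re-activated here.
# --image-uri is a custom pytest option (presumably registered in the test suite's conftest) that points the test at the image under test.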
source .venv/bin/activate
cd test/
python3 -m pytest -vs -rA --image-uri ${{ needs.set-sagemaker-test-environment.outputs.image-uri }} vllm/sagemaker