Skip to content
Open
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f93b993
add Dockerfile.rocm_base build
tjtanaa Feb 11, 2026
421c526
Merge remote-tracking branch 'origin/main' into rocmnightly
tjtanaa Feb 24, 2026
a24755a
change queue machine and also grep the gpu arch from dockerfile.rocm_…
tjtanaa Feb 24, 2026
a670248
change from amd-cpu to cpu_post_merge
tjtanaa Feb 24, 2026
4035b5d
automatically extract gpu arch in bootstrap-amd.sh
tjtanaa Feb 24, 2026
806ecc0
extract default python version from dockerfile.rocm_base
tjtanaa Feb 24, 2026
1b6a4c8
push to vllm ecr repo instead
tjtanaa Feb 24, 2026
d326d8a
try amd-cpu again, moving away from cpu_postmerge_queue
tjtanaa Feb 26, 2026
ca9ec6e
noninteractive installation
tjtanaa Feb 26, 2026
729703f
change git history check from amd-cpu to cpu_queue_postmerge
tjtanaa Feb 26, 2026
0f1980e
use amd cpu for ci image
tjtanaa Feb 26, 2026
2f87429
remove sudo
tjtanaa Feb 26, 2026
ba73c39
try enabling sccache when building amd CI image
tjtanaa Feb 26, 2026
72ea91e
fix the aws cli installation command; fix the main branch; avoid down…
tjtanaa Feb 26, 2026
ff2db46
try apt install awscli
tjtanaa Feb 26, 2026
d55a61c
don't install awscli
tjtanaa Feb 27, 2026
2d23606
use smaller agent small_cpu_queue_premerge
tjtanaa Feb 27, 2026
2bb6520
add to use premerge queue
tjtanaa Feb 27, 2026
8c61d7f
always use cpu_queue_postmerge when need to push to ecr
tjtanaa Feb 27, 2026
4dacfc8
don't use sccache
tjtanaa Feb 27, 2026
bcd0474
fix mising aws cli
tjtanaa Feb 27, 2026
ff49fd8
debug environment
tjtanaa Feb 27, 2026
d425ac6
use python3 pip install
tjtanaa Feb 27, 2026
746f44f
use docker image instead
tjtanaa Feb 27, 2026
93662a6
pin to a specific aws-cli version
tjtanaa Feb 28, 2026
b0d07f8
remove ci-infra; remove the nightly docker image and wheel releases
tjtanaa Mar 4, 2026
bcdfd4d
Merge remote-tracking branch 'origin/main' into rocmnightly
tjtanaa Mar 10, 2026
9e04cfa
Merge remote-tracking branch 'origin/main' into rocmnightly
tjtanaa Mar 17, 2026
1282490
remove code related to rocm-base-image.tar.gz and also remove the roc…
tjtanaa Mar 23, 2026
d0a0a06
remove PYTHON and PYTORCH_ARCH from docker image cache key
tjtanaa Mar 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions buildkite/bootstrap-amd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,25 @@ check_run_all_label() {
fi
}

#######################################
# Compute the cache key that identifies a ROCm base image build.
#
# The key has the form "<dockerfile_hash>-<args_hash>":
#   dockerfile_hash - first 16 hex chars of the SHA-256 of
#                     docker/Dockerfile.rocm_base
#   args_hash       - first 8 hex chars of the SHA-256 of the line
#                     "<python version>|<pytorch rocm arch>"
# Globals:
#   ROCM_CI_PYTHON_VERSION    (read, optional) overrides the Dockerfile's
#                             "ARG PYTHON_VERSION=" default
#   ROCM_CI_PYTORCH_ROCM_ARCH (read, optional) overrides the Dockerfile's
#                             "ARG PYTORCH_ROCM_ARCH=" default
# Arguments: none (the Dockerfile path is relative to the current directory)
# Outputs:   the cache key on stdout, or "unknown" when the Dockerfile
#            does not exist
#######################################
compute_rocm_base_cache_key() {
  local DOCKERFILE="docker/Dockerfile.rocm_base"

  # Guard first: grepping a missing file would only spray errors on stderr
  # before we report "unknown" anyway.
  if [[ ! -f "$DOCKERFILE" ]]; then
    echo "unknown"
    return
  fi

  # Defaults come straight from the Dockerfile's ARG lines.
  local DEFAULT_PYTHON
  DEFAULT_PYTHON=$(grep '^ARG PYTHON_VERSION=' "$DOCKERFILE" | sed 's/^ARG PYTHON_VERSION=//')
  local CI_PYTHON_VERSION="${ROCM_CI_PYTHON_VERSION:-$DEFAULT_PYTHON}"

  local DEFAULT_ARCH
  DEFAULT_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' "$DOCKERFILE" | sed 's/^ARG PYTORCH_ROCM_ARCH=//')
  local CI_PYTORCH_ROCM_ARCH="${ROCM_CI_PYTORCH_ROCM_ARCH:-$DEFAULT_ARCH}"

  # Declare and assign separately so a failing pipeline is not hidden behind
  # the always-zero exit status of `local`.
  local dockerfile_hash
  dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)

  local args_string="${CI_PYTHON_VERSION}|${CI_PYTORCH_ROCM_ARCH}"
  local args_hash
  # `echo` (with its trailing newline) is kept on purpose: changing it would
  # change every existing cache key.
  args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)

  echo "${dockerfile_hash}-${args_hash}"
}

# Default COV_ENABLED to 0 when it is unset or empty; a non-empty value
# supplied by the caller is left untouched.
: "${COV_ENABLED:=0}"
Expand Down Expand Up @@ -86,6 +105,9 @@ upload_pipeline() {
echo "Nightly: $NIGHTLY"
echo "AMD Mirror HW: $AMD_MIRROR_HW"

ROCM_BASE_CACHE_KEY=$(compute_rocm_base_cache_key)
echo "ROCm base cache key: $ROCM_BASE_CACHE_KEY"

FAIL_FAST=$(fail_fast)

cd .buildkite
Expand All @@ -103,6 +125,8 @@ upload_pipeline() {
-D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
-D cov_enabled="$COV_ENABLED" \
-D vllm_ci_branch="$VLLM_CI_BRANCH" \
-D rocm_base_cache_key="$ROCM_BASE_CACHE_KEY" \
-D rocm_base_changed="$ROCM_BASE_CHANGED" \
| sed '/^[[:space:]]*$/d' \
> pipeline.yaml
)
Expand Down Expand Up @@ -212,6 +236,15 @@ for file in $file_diff; do
fi
done

# Flag whether this change set touches the ROCm base Dockerfile.
# $file_diff is expanded unquoted on purpose so each listed path becomes one
# loop iteration.
ROCM_BASE_CHANGED=0
for file in $file_diff; do
  case "$file" in
    "docker/Dockerfile.rocm_base")
      ROCM_BASE_CHANGED=1
      echo "Dockerfile.rocm_base changed in this PR"
      # One match is enough; no need to scan the remaining paths.
      break
      ;;
  esac
done

# Check for ready-run-all-tests label
LABEL_RUN_ALL=$(check_run_all_label)
if [[ $LABEL_RUN_ALL == true ]]; then
Expand Down
292 changes: 286 additions & 6 deletions buildkite/test-template-amd.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
{% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %}
{% endif %}
{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %}
{# ECR repository that the ROCm base image tags below are built on. #}
{% set rocm_base_ecr_image = "public.ecr.aws/q9t5s3a7/vllm-release-repo" %}
{# Per-commit tag: "<repo>:$BUILDKITE_COMMIT-<cache key>-rocm-base". #}
{% set rocm_base_ecr_commit_tag = rocm_base_ecr_image ~ ":$BUILDKITE_COMMIT-" ~ rocm_base_cache_key ~ "-rocm-base" %}
{# Commit-independent tag: "<repo>:<cache key>-rocm-base", used to look up an already-built image. #}
{% set rocm_base_ecr_cache_tag = rocm_base_ecr_image ~ ":" ~ rocm_base_cache_key ~ "-rocm-base" %}
{# Fixed tag "<repo>:latest-rocm-base-nightly"; only written by steps rendered for the main branch. #}
{% set rocm_base_ecr_nightly_tag = rocm_base_ecr_image ~ ":latest-rocm-base-nightly" %}
{% set default_working_dir = "/vllm-workspace/tests" %}
{% set hf_home = "/root/.cache/huggingface" %}
{% set hf_home_efs = "/mnt/efs/hf_cache" %}
Expand Down Expand Up @@ -284,17 +288,202 @@ plugins:
- group: "AMD Tests"
depends_on: ~
steps:
- label: "AMD: :docker: build image"
{% if branch != "main" %}
# Rendered only for non-main branches. Fails the build (soft_fail is false)
# when HEAD does not contain the newest commit on origin/main that touched
# docker/Dockerfile.rocm_base. The base image build step waits on this
# step's key.
# NOTE(review): the doubled dollar signs presumably escape Buildkite's
# upload-time interpolation so the shell sees a single one - confirm.
- label: "AMD: :git: Check Dockerfile.rocm_base freshness"
depends_on: ~
key: "amd-rocm-base-check"
commands:
- |
set -euo pipefail
echo "--- Checking Dockerfile.rocm_base git history"
git fetch origin main
# Newest commit on origin/main that modified the base Dockerfile (empty if none).
MAIN_LATEST=$$(git log origin/main -1 --format='%H' -- docker/Dockerfile.rocm_base)
if [ -z "$$MAIN_LATEST" ]; then
echo "No commits found for Dockerfile.rocm_base on origin/main. Skipping check."
exit 0
fi
echo "Latest commit on main for Dockerfile.rocm_base: $$MAIN_LATEST"
# Passes only when that commit is an ancestor of the commit being built.
if git merge-base --is-ancestor "$$MAIN_LATEST" HEAD; then
echo "OK: Current branch contains the latest Dockerfile.rocm_base from main"
else
echo "ERROR: Branch does NOT contain the latest Dockerfile.rocm_base from main."
echo "Please rebase or merge main into your branch."
buildkite-agent annotate --style error \
"Your branch is missing the latest Dockerfile.rocm_base changes from main (commit $$MAIN_LATEST). Please rebase or merge main." \
--context "rocm-base-freshness"
exit 1
fi
agents:
queue: small_cpu_queue_premerge
soft_fail: false
{% endif %}

- label: "AMD: :docker: Build/Reuse ROCm base image"
  # Makes the ROCm base image for this commit available in ECR under the
  # per-commit tag. Lookup order:
  #   1. an ECR image already tagged with the cache key (retagged remotely),
  #   2. an image tarball stored in S3 under the cache key,
  #   3. a full build of docker/Dockerfile.rocm_base (then uploaded to S3,
  #      together with the base wheels).
  # Paths 2 and 3 push the cache tag and the commit tag; on main the nightly
  # tag is published as well.
  # NOTE(review): the doubled dollar signs presumably escape Buildkite's
  # upload-time interpolation so the shell sees a single one - confirm.
  {% if branch != "main" %}
  depends_on:
    - "amd-rocm-base-check"
  {% else %}
  # No prerequisite on main: use the null scalar like the other steps in this
  # file rather than a list whose only entry is null.
  depends_on: ~
  {% endif %}
  key: "amd-rocm-base-build"
  commands:
    - |
      set -euo pipefail

      CACHE_KEY="{{ rocm_base_cache_key }}"
      ECR_CACHE_TAG="{{ rocm_base_ecr_cache_tag }}"
      ECR_COMMIT_TAG="{{ rocm_base_ecr_commit_tag }}"
      {% if branch == "main" %}
      ECR_NIGHTLY_TAG="{{ rocm_base_ecr_nightly_tag }}"
      {% endif %}
      S3_BUCKET="vllm-wheels"
      S3_CACHE_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"

      echo "========================================"
      echo "ROCm Base Image Build/Reuse"
      echo " Cache Key: $${CACHE_KEY}"
      echo " ECR Cache Tag: $${ECR_CACHE_TAG}"
      echo " ECR Commit Tag: $${ECR_COMMIT_TAG}"
      {% if branch == "main" %}
      echo " ECR Nightly Tag: $${ECR_NIGHTLY_TAG}"
      {% endif %}
      echo "========================================"

      python3 -m pip install awscli
      # Login to ECR
      aws ecr-public get-login-password --region us-east-1 | \
        docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7

      # Tier 1: Check ECR cache (fastest)
      if docker manifest inspect "$${ECR_CACHE_TAG}" > /dev/null 2>&1; then
        echo "ECR cache HIT: $${ECR_CACHE_TAG}"
        # Create commit tag directly from cache tag (no pull needed)
        docker buildx imagetools create --tag "$${ECR_COMMIT_TAG}" "$${ECR_CACHE_TAG}"
        echo "Tagged $${ECR_CACHE_TAG} as $${ECR_COMMIT_TAG} in ECR (no pull required)"
      {% if branch == "main" %}
        # On main, also tag as latest nightly (no pull needed)
        docker buildx imagetools create --tag "$${ECR_NIGHTLY_TAG}" "$${ECR_CACHE_TAG}"
        echo "Tagged $${ECR_CACHE_TAG} as $${ECR_NIGHTLY_TAG} in ECR (no pull required)"
      {% endif %}
      else
        # Tier 2: Check S3 cache
        if aws s3 ls "$${S3_CACHE_PATH}/rocm-base-image.tar.gz" > /dev/null 2>&1; then
          echo "S3 cache HIT. Downloading..."
          mkdir -p /tmp/rocm-cache
          aws s3 cp "$${S3_CACHE_PATH}/rocm-base-image.tar.gz" /tmp/rocm-cache/rocm-base-image.tar.gz
          LOAD_OUTPUT=$$(gunzip -c /tmp/rocm-cache/rocm-base-image.tar.gz | docker load)
          echo "$$LOAD_OUTPUT"
          # Recover the tag recorded in the tarball, then normalise it to the
          # local name the push section below expects.
          BASE_TAG=$$(echo "$$LOAD_OUTPUT" | grep "Loaded image:" | sed 's/Loaded image: //')
          docker tag "$$BASE_TAG" "rocm/vllm-dev:base-ci"
          rm -rf /tmp/rocm-cache
        else
          echo "CACHE MISS. Building Dockerfile.rocm_base from scratch..."

          PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//')
          echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}"
          PYTHON_VERSION=$$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//')
          echo "Using PYTHON_VERSION from Dockerfile.rocm_base: $${PYTHON_VERSION}"

          DOCKER_BUILDKIT=1 docker buildx build \
            --file docker/Dockerfile.rocm_base \
            --tag "rocm/vllm-dev:base-ci" \
            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
            --build-arg USE_SCCACHE=1 \
            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
            --build-arg SCCACHE_REGION_NAME=us-west-2 \
            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
            --load \
            --progress plain \
            .

          # Upload to S3 for future cache hits
          docker save rocm/vllm-dev:base-ci | gzip > /tmp/rocm-base-image.tar.gz
          aws s3 cp /tmp/rocm-base-image.tar.gz "$${S3_CACHE_PATH}/rocm-base-image.tar.gz"
          rm -f /tmp/rocm-base-image.tar.gz

          # Also upload base wheels to S3 cache
          DOCKER_BUILDKIT=1 docker buildx build \
            --file docker/Dockerfile.rocm_base \
            --tag rocm-base-debs:ci \
            --target debs_wheel_release \
            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
            --build-arg USE_SCCACHE=1 \
            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
            --build-arg SCCACHE_REGION_NAME=us-west-2 \
            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
            --load \
            .
          mkdir -p artifacts/rocm-base-wheels
          # Copy the wheels out of a created (never started) container.
          cid=$$(docker create rocm-base-debs:ci)
          docker cp "$${cid}:/app/debs/." artifacts/rocm-base-wheels/
          docker rm "$${cid}"
          export PYTHON_VERSION
          export PYTORCH_ROCM_ARCH
          S3_BUCKET=vllm-wheels .buildkite/scripts/cache-rocm-base-wheels.sh upload
        fi

        # Push to ECR (cache tag for future lookups + commit tag for traceability)
        docker tag "rocm/vllm-dev:base-ci" "$${ECR_CACHE_TAG}"
        docker push "$${ECR_CACHE_TAG}"
        docker tag "rocm/vllm-dev:base-ci" "$${ECR_COMMIT_TAG}"
        docker push "$${ECR_COMMIT_TAG}"
      {% if branch == "main" %}
        # On main, also tag and push as latest nightly
        docker tag "rocm/vllm-dev:base-ci" "$${ECR_NIGHTLY_TAG}"
        docker push "$${ECR_NIGHTLY_TAG}"
        echo "Pushed nightly tag: $${ECR_NIGHTLY_TAG}"
      {% endif %}
      fi

      echo "Base image ready: $${ECR_COMMIT_TAG}"
  agents:
    queue: cpu_queue_postmerge
  env:
    DOCKER_BUILDKIT: "1"
  retry:
    automatic:
      - exit_status: -1
        limit: 2
      - exit_status: -10
        limit: 2
      - exit_status: 1
        limit: 1

- label: "AMD: :docker: build image"
depends_on: amd-rocm-base-build
soft_fail: false
commands:
# Handle the introduction of test target in Dockerfile.rocm
# Login to ECR using AWS CLI Docker image (no pip/python needed)
- |
docker run --rm \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-e AWS_SESSION_TOKEN \
-e AWS_DEFAULT_REGION=us-east-1 \
amazon/aws-cli:2.34.0 \
ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
- "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci"
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--build-arg BASE_IMAGE=rocm/vllm-dev:base-ci
--tag {{ docker_image_amd }}
-f docker/Dockerfile.rocm
--target test
Expand All @@ -306,13 +495,13 @@ plugins:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
- exit_status: -1
limit: 2
- exit_status: -10 # Agent was lost
- exit_status: -10
limit: 2
- exit_status: 128 # Git connectivity issues
- exit_status: 128
limit: 2
- exit_status: 1 # Machine occasionally fail
- exit_status: 1
limit: 1
agents:
queue: amd-cpu
Expand Down Expand Up @@ -378,3 +567,94 @@ plugins:
limit: 2
{% endif %}
{% endfor %}

{% if branch == "main" %}
- label: "AMD: :rocket: Build ROCm nightly release image"
  # Builds the vllm-openai target of docker/Dockerfile.rocm on top of this
  # commit's ROCm base image and pushes it to the release repository as
  # "<commit>-rocm". soft_fail is true, so a failure here does not fail the
  # build.
  depends_on: amd-build
  key: "amd-nightly-release-image"
  soft_fail: true
  agents:
    queue: amd-cpu
  commands:
    - |
      set -euo pipefail
      # Log in to ECR through the pinned AWS CLI container, exactly as the
      # "AMD: :docker: build image" step does on this same queue, instead of
      # relying on an aws binary being installed on the agent. With
      # soft_fail enabled a missing CLI would otherwise go unnoticed.
      docker run --rm \
        -e AWS_ACCESS_KEY_ID \
        -e AWS_SECRET_ACCESS_KEY \
        -e AWS_SESSION_TOKEN \
        -e AWS_DEFAULT_REGION=us-east-1 \
        amazon/aws-cli:2.34.0 \
        ecr-public get-login-password --region us-east-1 | \
        docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
      docker pull "{{ rocm_base_ecr_commit_tag }}"
      # Dockerfile.rocm is pointed at the base image through this local name.
      docker tag "{{ rocm_base_ecr_commit_tag }}" rocm/vllm-dev:base-ci

      DOCKER_BUILDKIT=1 docker build \
        --build-arg max_jobs=16 \
        --build-arg BASE_IMAGE=rocm/vllm-dev:base-ci \
        --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' \
        --build-arg USE_SCCACHE=1 \
        --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
        --build-arg SCCACHE_REGION_NAME=us-west-2 \
        --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
        --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$$BUILDKITE_COMMIT-rocm \
        --target vllm-openai \
        --progress plain \
        -f docker/Dockerfile.rocm .
      docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$$BUILDKITE_COMMIT-rocm
  env:
    DOCKER_BUILDKIT: "1"
  retry:
    automatic:
      - exit_status: -1
        limit: 2

- label: "AMD: :python: Build ROCm nightly wheel"
# Builds the vLLM wheel (target export_vllm_wheel_release of
# docker/Dockerfile.rocm) on top of this commit's ROCm base image, writes it
# to rocm-dist, copies it to artifacts/rocm-vllm-wheel and then runs the
# upload script. soft_fail is true, so a failure here does not fail the build.
# NOTE(review): this step calls the aws CLI directly, while the
# "AMD: :docker: build image" step on the same amd-cpu queue logs in through
# the amazon/aws-cli container - confirm the CLI exists on these agents.
# NOTE(review): ARG_PYTORCH_ROCM_ARCH is hard-coded below and is independent
# of the PYTORCH_ROCM_ARCH value read from Dockerfile.rocm_base - confirm
# this is intended.
depends_on: amd-rocm-base-build
key: "amd-nightly-wheel"
soft_fail: true
agents:
queue: amd-cpu
commands:
- |
set -euo pipefail
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull "{{ rocm_base_ecr_commit_tag }}"
docker tag "{{ rocm_base_ecr_commit_tag }}" rocm/vllm-dev:base-ci

# Download base wheels from S3 cache
# The three exports below are presumably read by the cache script - verify
# against .buildkite/scripts/cache-rocm-base-wheels.sh.
PYTHON_VERSION=$$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//')
echo "Using PYTHON_VERSION from Dockerfile.rocm_base: $${PYTHON_VERSION}"
export PYTHON_VERSION
PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//')
echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}"
export PYTORCH_ROCM_ARCH
export S3_BUCKET=vllm-wheels
.buildkite/scripts/cache-rocm-base-wheels.sh download

# Stage the downloaded base wheels inside the Docker build context.
mkdir -p docker/context/base-wheels
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/

git fetch --tags --force origin

# --output type=local exports the target's files to rocm-dist instead of
# producing an image.
DOCKER_BUILDKIT=1 docker build \
--file docker/Dockerfile.rocm \
--target export_vllm_wheel_release \
--output type=local,dest=rocm-dist \
--build-arg BASE_IMAGE=rocm/vllm-dev:base-ci \
--build-arg ARG_PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
--build-arg REMOTE_VLLM=0 \
--build-arg GIT_REPO_CHECK=1 \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
.

mkdir -p artifacts/rocm-vllm-wheel
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
bash .buildkite/scripts/upload-rocm-wheels.sh
env:
DOCKER_BUILDKIT: "1"
ROCM_UPLOAD_WHEELS: "true"
S3_BUCKET: "vllm-wheels"
retry:
automatic:
- exit_status: -1
limit: 2
{% endif %}