Skip to content

[Plugin][CI/CD] establish CI/CD and add workflow for ATOM OOT #3

[Plugin][CI/CD] establish CI/CD and add workflow for ATOM OOT

[Plugin][CI/CD] establish CI/CD and add workflow for ATOM OOT #3

# Workflow header: display name, triggers, concurrency policy and the
# environment variables shared by every job below.
name: ATOM vLLM OOT Test
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]  # Triggers on PRs targeting `main`
    types: [opened, synchronize, reopened, ready_for_review]
    # NOTE(review): this filter is assumed to belong to `pull_request` (it
    # follows that trigger's keys) - confirm against the committed file.
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  schedule:
    # Nightly at 00:00 Beijing time (16:00 UTC)
    - cron: '0 16 * * *'
  workflow_dispatch:
concurrency:
  # One group per workflow + ref; a newer run cancels an in-flight one on every
  # ref except `main`.
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
env:
  # Base image used as the `FROM` of the generated Dockerfiles.
  ATOM_BASE_NIGHTLY_IMAGE: rocm/atom-dev:latest
  # Clone URL of the PR head repository, or the upstream repo for non-PR events.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/ATOM.git' }}
  # `pull_request.head.sha` exists only on PR events and `head_commit.id` only
  # on push events. Fall back to `github.sha` so that schedule and
  # workflow_dispatch runs still get a non-empty SHA for `git checkout` and for
  # the `pre-build-<sha>` image tag.
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}
jobs:
wait-atom-test-success:
  # Gate job: polls the "ATOM Test" workflow for the same commit and event and
  # publishes `atom_test_ok` ("true"/"false") for the downstream jobs.
  name: Wait for ATOM Test success
  runs-on: ubuntu-latest
  # Must stay above the script's polling window (maxAttempts polls, one minute
  # apart, plus API latency). Otherwise the runner kills the job before the
  # script can report its own timeout through `atom_test_ok=false`.
  timeout-minutes: 190
  outputs:
    atom_test_ok: ${{ steps.wait.outputs.atom_test_ok }}
  steps:
    - name: Wait until ATOM Test is completed for this commit
      id: wait
      uses: actions/github-script@v7
      with:
        script: |
          const owner = context.repo.owner;
          const repo = context.repo.repo;
          const eventName = context.eventName;
          // On PR events use the PR head commit; otherwise the commit that triggered this run.
          const headSha = context.payload.pull_request?.head?.sha ?? context.sha;
          const maxAttempts = 180;
          const sleepMs = 60000;
          const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

          // Manual runs are not gated on ATOM Test.
          if (eventName === "workflow_dispatch") {
            core.info("workflow_dispatch detected: bypass ATOM Test gate.");
            core.setOutput("atom_test_ok", "true");
            return;
          }

          let foundCompletedRun = null;
          for (let attempt = 1; attempt <= maxAttempts; attempt++) {
            const resp = await github.rest.actions.listWorkflowRuns({
              owner,
              repo,
              workflow_id: "atom-test.yaml",
              event: eventName,
              head_sha: headSha,
              per_page: 20,
            });
            // Newest matching run first; never consider this workflow run itself.
            const candidates = (resp.data.workflow_runs || [])
              .filter((run) => run.name === "ATOM Test" && run.id !== context.runId)
              .sort((a, b) => new Date(b.created_at) - new Date(a.created_at));

            if (candidates.length > 0) {
              const latest = candidates[0];
              core.info(
                `Attempt ${attempt}/${maxAttempts}: latest ATOM Test run id=${latest.id}, status=${latest.status}, conclusion=${latest.conclusion}`
              );
              if (latest.status === "completed") {
                foundCompletedRun = latest;
                break;
              }
            } else {
              core.info(`Attempt ${attempt}/${maxAttempts}: no ATOM Test run found yet for this sha.`);
            }

            // No point in sleeping after the final poll; it only delays the
            // timeout report below.
            if (attempt < maxAttempts) {
              await sleep(sleepMs);
            }
          }

          if (!foundCompletedRun) {
            core.warning("Timeout waiting for ATOM Test workflow completion. OOT workflow will be skipped.");
            core.setOutput("atom_test_ok", "false");
            return;
          }

          // Only a successful ATOM Test run opens the gate.
          const ok = foundCompletedRun.conclusion === "success";
          core.setOutput("atom_test_ok", ok ? "true" : "false");
          if (!ok) {
            core.warning(
              `Skip OOT workflow: ATOM Test conclusion is '${foundCompletedRun.conclusion}'.`
            );
          }
pre-checks:
  # Calls the repository's reusable pre-checks workflow with both the black and
  # ruff inputs enabled, but only once the ATOM Test gate has reported success.
  needs:
    - wait-atom-test-success
  if: needs.wait-atom-test-success.outputs.atom_test_ok == 'true'
  uses: ./.github/workflows/pre-checks.yaml
  with:
    black: true
    ruff: true
build_atom_image:
  # Produces the per-commit image `rocm/atom-dev:pre-build-<sha>`: reuses it if
  # it can already be pulled, otherwise builds it from the nightly base image
  # and pushes it. Every step is gated on the PR not coming from a fork, so for
  # fork PRs this job succeeds without doing anything.
  if: ${{ needs.wait-atom-test-success.outputs.atom_test_ok == 'true' && needs.pre-checks.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }}
  needs: [wait-atom-test-success, pre-checks]
  name: Build ATOM image
  runs-on: build-only-atom
  steps:
    - name: Docker Login
      if: ${{ !github.event.pull_request.head.repo.fork }}
      # Credentials go through the environment and stdin so they are neither
      # spliced into the script text nor visible in the process argument list.
      env:
        DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
      run: |
        printf '%s' "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin

    - name: Try pull pre-built ATOM image
      if: ${{ !github.event.pull_request.head.repo.fork }}
      id: pull_prebuilt
      run: |
        IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        if docker pull "$IMAGE_TAG"; then
          echo "image_ready=true" >> "$GITHUB_OUTPUT"
          echo "Reusing existing image: $IMAGE_TAG"
        else
          echo "image_ready=false" >> "$GITHUB_OUTPUT"
          echo "Pre-built image not found, will rebuild: $IMAGE_TAG"
        fi

    # The remaining build steps run only when no pre-built image was pulled.
    - name: Checkout ATOM repo
      if: ${{ !github.event.pull_request.head.repo.fork && steps.pull_prebuilt.outputs.image_ready != 'true' }}
      uses: actions/checkout@v4

    - name: Generate Dockerfile
      if: ${{ !github.event.pull_request.head.repo.fork && steps.pull_prebuilt.outputs.image_ready != 'true' }}
      run: |
        # The heredoc is unquoted, so each "\\" below is written to
        # Dockerfile.mod as a single "\" line continuation.
        cat <<EOF > Dockerfile.mod
        FROM ${{ env.ATOM_BASE_NIGHTLY_IMAGE }}
        RUN pip install -U lm-eval[api]
        RUN pip show lm-eval || true
        RUN pip install hf_transfer
        RUN pip show hf_transfer || true
        RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
        RUN pip uninstall -y amd-aiter
        RUN pip install --upgrade "pybind11>=3.0.1"
        RUN pip show pybind11
        RUN wget https://github.com/stedolan/jq/releases/download/jq-1.7/jq-linux64 -O jq
        RUN chmod +x jq
        RUN mv jq /usr/local/bin/jq
        RUN rm -rf /app/aiter-test
        RUN git clone --depth 1 https://github.com/ROCm/aiter.git /app/aiter-test && \\
            cd /app/aiter-test && \\
            git checkout HEAD && \\
            git submodule sync && git submodule update --init --recursive && \\
            MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
        RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
        RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true
        RUN pip uninstall -y atom
        RUN rm -rf /app/ATOM
        RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\
            cd /app/ATOM && \\
            git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
            pip install -e .
        RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true
        EOF

    - name: Build Docker image
      if: ${{ !github.event.pull_request.head.repo.fork && steps.pull_prebuilt.outputs.image_ready != 'true' }}
      run: |
        docker build --pull --network=host \
          --no-cache \
          -t atom_test:ci \
          -f Dockerfile.mod .

    - name: Push Docker image
      if: ${{ !github.event.pull_request.head.repo.fork && steps.pull_prebuilt.outputs.image_ready != 'true' }}
      run: |
        IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        docker tag atom_test:ci "$IMAGE_TAG"
        docker push "$IMAGE_TAG"

    - name: Success message
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        if [ "${{ steps.pull_prebuilt.outputs.image_ready }}" = "true" ]; then
          echo "Successfully reused image: $IMAGE_TAG"
        else
          echo "Successfully rebuilt and pushed image: $IMAGE_TAG"
        fi
atom-vllm-oot:
  # Builds the OOT vLLM image on top of the per-commit ATOM image (or, for fork
  # PRs, on a locally built base), runs the plugin unit tests, then runs the
  # launch + gsm8k accuracy script inside a GPU container and uploads the logs.
  needs: [wait-atom-test-success, pre-checks, build_atom_image]
  if: ${{ needs.wait-atom-test-success.outputs.atom_test_ok == 'true' && needs.pre-checks.result == 'success' && needs.build_atom_image.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }}
  name: ATOM vLLM OOT Test
  strategy:
    fail-fast: false
    matrix:
      include:
        # Keep CI runtime under control: enable only one OOT model for now.
        - model_name: "Kimi-K2-Thinking-MXFP4"
          model_path: "amd/Kimi-K2-Thinking-MXFP4"
          accuracy_test_threshold: "0.90"
          runner: atom-mi355-8gpu.predownload
  runs-on: ${{ matrix.runner }}
  timeout-minutes: 180
  env:
    # Container and image names carry the matrix job index so that matrix
    # entries do not collide with each other.
    CONTAINER_NAME: atom_vllm_oot_${{ strategy.job-index }}
    OOT_IMAGE_TAG: atom_vllm_oot_test:${{ github.sha }}-${{ strategy.job-index }}
    VLLM_COMMIT: b31e9326a7d9394aab8c767f8ebe225c65594b60
    VLLM_VERSION: "0.17"
  steps:
    - name: Clean up containers and workspace
      run: |
        echo "=== Cleaning up containers on $(hostname) ==="
        # Kills every running container on the runner, not only this job's.
        containers=$(docker ps -q)
        if [ -n "$containers" ]; then
          docker kill $containers || true
        fi
        docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
        # Empty the workspace from inside a privileged container.
        docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && rm -rf /workspace/*" || true

    - name: Checkout ATOM repo
      uses: actions/checkout@v4

    - name: Docker Login
      if: ${{ !github.event.pull_request.head.repo.fork }}
      # Credentials go through the environment and stdin so they are neither
      # spliced into the script text nor visible in the process argument list.
      env:
        DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
      run: |
        printf '%s' "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin

    - name: Prepare OOT base image for forked repo
      if: ${{ github.event.pull_request.head.repo.fork }}
      run: |
        # The heredoc is unquoted, so each "\\" below is written to
        # Dockerfile.mod as a single "\" line continuation.
        cat <<EOF > Dockerfile.mod
        FROM ${{ env.ATOM_BASE_NIGHTLY_IMAGE }}
        RUN pip install -U lm-eval[api]
        RUN pip show lm-eval || true
        RUN pip install hf_transfer
        RUN pip show hf_transfer || true
        RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
        RUN pip uninstall -y amd-aiter
        RUN pip install --upgrade "pybind11>=3.0.1"
        RUN pip show pybind11
        RUN rm -rf /app/aiter-test
        RUN git clone https://github.com/ROCm/aiter.git /app/aiter-test && \\
            cd /app/aiter-test && \\
            git checkout HEAD && \\
            git submodule sync && git submodule update --init --recursive && \\
            MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
        RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
        RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true
        # Fork PR fallback: this workflow cannot rely on pre-built images from
        # other workflows, so reinstall ATOM from the current PR commit.
        RUN pip uninstall -y atom
        RUN rm -rf /app/ATOM
        RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\
            cd /app/ATOM && \\
            git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
            pip install -e .
        RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true
        EOF
        docker build --pull --network=host \
          --no-cache \
          -t atom_oot_base:ci \
          -f Dockerfile.mod .
        echo "OOT_BASE_IMAGE=atom_oot_base:ci" >> "$GITHUB_ENV"

    - name: Select OOT base image from pre-built ATOM image
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        echo "OOT_BASE_IMAGE=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}" >> "$GITHUB_ENV"

    - name: Build OOT vLLM image
      run: |
        # The fork base image exists only locally, so it must not be pulled.
        if [ "${{ github.event_name }}" = "pull_request" ] && [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
          pull_base_image=0
        else
          pull_base_image=1
        fi
        chmod +x docker/plugin/build_OOT_vLLM.sh
        IMAGE_TAG="${OOT_IMAGE_TAG}" \
        BASE_IMAGE="${OOT_BASE_IMAGE}" \
        VLLM_COMMIT="${VLLM_COMMIT}" \
        VLLM_VERSION="${VLLM_VERSION}" \
        INSTALL_LM_EVAL=1 \
        PULL_BASE_IMAGE="${pull_base_image}" \
        BUILD_NO_CACHE=1 \
        docker/plugin/build_OOT_vLLM.sh

    - name: Run all plugin unit tests
      run: |
        docker run --rm \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          "$OOT_IMAGE_TAG" \
          bash -lc "pytest -q tests/plugin"

    - name: Start OOT test container
      run: |
        # Use the device flags provided by the pod, if any; otherwise expose /dev/dri.
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        if [ -d "/models" ]; then
          MODEL_MOUNT="-v /models:/models"
        else
          echo "Warning: /models directory not found on runner; skipping /models mount."
          MODEL_MOUNT=""
        fi
        # HF_TOKEN is not set by this workflow for this step; it is forwarded
        # only if the runner's own environment defines it, else it is empty.
        docker run -dt --device=/dev/kfd $DEVICE_FLAG \
          -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \
          $MODEL_MOUNT \
          -w /workspace \
          --ipc=host --group-add video \
          --shm-size=16G \
          --privileged \
          --cap-add=SYS_PTRACE \
          -e HF_TOKEN="${HF_TOKEN:-}" \
          --security-opt seccomp=unconfined \
          --ulimit memlock=-1 \
          --ulimit stack=67108864 \
          --name "$CONTAINER_NAME" \
          "$OOT_IMAGE_TAG"
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Download model if needed
      # The token is handed over through the environment (`-e HF_TOKEN` copies
      # it from the step's environment) instead of being spliced, unquoted,
      # into the command line.
      env:
        HF_TOKEN: ${{ secrets.AMD_HF_TOKEN }}
      run: |
        if [ -d "/models" ] && [ ! -f "/models/${{ matrix.model_path }}/config.json" ]; then
          echo "Downloading model to /models/${{ matrix.model_path }}"
          docker exec -e HF_TOKEN "$CONTAINER_NAME" bash -lc "hf download ${{ matrix.model_path }} --local-dir /models/${{ matrix.model_path }}"
        else
          echo "Skip model download"
        fi

    - name: Run OOT launch and gsm8k accuracy via script (ci mode)
      timeout-minutes: 90
      run: |
        docker exec "$CONTAINER_NAME" bash -lc "
          set -euo pipefail
          bash .github/scripts/atom_oot_test.sh accuracy ci
        "

    - name: Collect OOT accuracy summary
      if: success()
      run: |
        echo "OOT Accuracy Test Summary for ${{ matrix.model_name }}:" >> $GITHUB_STEP_SUMMARY
        # Copy the results table (from the header row to the next blank line) into the summary.
        docker exec "$CONTAINER_NAME" bash -lc "awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' /tmp/oot_accuracy_output.txt" >> $GITHUB_STEP_SUMMARY || true

    - name: Collect OOT logs and results
      if: always()
      run: |
        # Best effort: any of these files may be missing if an earlier step failed.
        docker cp "$CONTAINER_NAME":/tmp/vllm_oot.log ./vllm_oot.log || true
        docker cp "$CONTAINER_NAME":/tmp/oot_accuracy_output.txt ./oot_accuracy_output.txt || true
        docker cp "$CONTAINER_NAME":/tmp/oot_accuracy_results ./oot_accuracy_results || true

    - name: Upload OOT artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: oot-${{ matrix.model_name }}-artifacts
        path: |
          vllm_oot.log
          oot_accuracy_output.txt
          oot_accuracy_results

    - name: Clean up OOT test
      if: always()
      run: |
        docker exec "$CONTAINER_NAME" bash -lc "if [ -f /tmp/vllm_oot.pid ]; then kill \$(cat /tmp/vllm_oot.pid) || true; fi" || true
        docker stop "$CONTAINER_NAME" || true
        docker rm "$CONTAINER_NAME" || true
        # The image tag is unique per commit and job index, so without this
        # every run would leave another image behind on the runner.
        docker rmi -f "$OOT_IMAGE_TAG" || true