[Plugin][CI/CD] establish CI/CD and add workflow for ATOM OOT #3
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ATOM vLLM out-of-tree (OOT) plugin CI.
# Runs on pushes and PRs to `main`, nightly, and on manual dispatch.
name: ATOM vLLM OOT Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main] # Triggers on PRs targeting `main`
    types: [opened, synchronize, reopened, ready_for_review]
    # Documentation-only changes do not trigger this workflow.
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  schedule:
    # Nightly at 00:00 Beijing time (16:00 UTC)
    - cron: '0 16 * * *'
  workflow_dispatch:

# One active run per workflow/ref; superseded runs are cancelled everywhere
# except on `main`.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  ATOM_BASE_NIGHTLY_IMAGE: rocm/atom-dev:latest
  # PR runs clone the PR head repository (which may be a fork); every other
  # event clones the upstream repository.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/ATOM.git' }}
  # Commit under test. `pull_request.head.sha` exists only on PR events and
  # `head_commit.id` only on push events; `schedule` and `workflow_dispatch`
  # payloads carry neither, so fall back to `github.sha` to avoid an empty
  # value (which would produce the image tag `pre-build-` and a bare
  # `git checkout`).
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}

jobs:
| wait-atom-test-success: | |
| name: Wait for ATOM Test success | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 180 | |
| outputs: | |
| atom_test_ok: ${{ steps.wait.outputs.atom_test_ok }} | |
| steps: | |
| - name: Wait until ATOM Test is completed for this commit | |
| id: wait | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| const owner = context.repo.owner; | |
| const repo = context.repo.repo; | |
| const eventName = context.eventName; | |
| const headSha = context.payload.pull_request?.head?.sha ?? context.sha; | |
| const maxAttempts = 180; | |
| const sleepMs = 60000; | |
| const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); | |
| if (eventName === "workflow_dispatch") { | |
| core.info("workflow_dispatch detected: bypass ATOM Test gate."); | |
| core.setOutput("atom_test_ok", "true"); | |
| return; | |
| } | |
| let foundCompletedRun = null; | |
| for (let attempt = 1; attempt <= maxAttempts; attempt++) { | |
| const resp = await github.rest.actions.listWorkflowRuns({ | |
| owner, | |
| repo, | |
| workflow_id: "atom-test.yaml", | |
| event: eventName, | |
| head_sha: headSha, | |
| per_page: 20, | |
| }); | |
| const candidates = (resp.data.workflow_runs || []) | |
| .filter((run) => run.name === "ATOM Test" && run.id !== context.runId) | |
| .sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); | |
| if (candidates.length > 0) { | |
| const latest = candidates[0]; | |
| core.info( | |
| `Attempt ${attempt}/${maxAttempts}: latest ATOM Test run id=${latest.id}, status=${latest.status}, conclusion=${latest.conclusion}` | |
| ); | |
| if (latest.status === "completed") { | |
| foundCompletedRun = latest; | |
| break; | |
| } | |
| } else { | |
| core.info(`Attempt ${attempt}/${maxAttempts}: no ATOM Test run found yet for this sha.`); | |
| } | |
| await sleep(sleepMs); | |
| } | |
| if (!foundCompletedRun) { | |
| core.warning("Timeout waiting for ATOM Test workflow completion. OOT workflow will be skipped."); | |
| core.setOutput("atom_test_ok", "false"); | |
| return; | |
| } | |
| const ok = foundCompletedRun.conclusion === "success"; | |
| core.setOutput("atom_test_ok", ok ? "true" : "false"); | |
| if (!ok) { | |
| core.warning( | |
| `Skip OOT workflow: ATOM Test conclusion is '${foundCompletedRun.conclusion}'.` | |
| ); | |
| } | |
| pre-checks: | |
| needs: [wait-atom-test-success] | |
| if: ${{ needs.wait-atom-test-success.outputs.atom_test_ok == 'true' }} | |
| uses: ./.github/workflows/pre-checks.yaml | |
| with: | |
| black: true | |
| ruff: true | |
| build_atom_image: | |
| if: ${{ needs.wait-atom-test-success.outputs.atom_test_ok == 'true' && needs.pre-checks.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }} | |
| needs: [wait-atom-test-success, pre-checks] | |
| name: Build ATOM image | |
| runs-on: build-only-atom | |
| steps: | |
| - name: Docker Login | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| docker login -u ${{ secrets.DOCKER_USERNAME }} -p ${{ secrets.DOCKER_PASSWORD }} | |
| - name: Try pull pre-built ATOM image | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| id: pull_prebuilt | |
| run: | | |
| IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }} | |
| if docker pull "$IMAGE_TAG"; then | |
| echo "image_ready=true" >> "$GITHUB_OUTPUT" | |
| echo "Reusing existing image: $IMAGE_TAG" | |
| else | |
| echo "image_ready=false" >> "$GITHUB_OUTPUT" | |
| echo "Pre-built image not found, will rebuild: $IMAGE_TAG" | |
| fi | |
| - name: Checkout ATOM repo | |
| if: ${{ !github.event.pull_request.head.repo.fork && steps.pull_prebuilt.outputs.image_ready != 'true' }} | |
| uses: actions/checkout@v4 | |
| - name: Generate Dockerfile | |
| if: ${{ !github.event.pull_request.head.repo.fork && steps.pull_prebuilt.outputs.image_ready != 'true' }} | |
| run: | | |
| cat <<EOF > Dockerfile.mod | |
| FROM ${{ env.ATOM_BASE_NIGHTLY_IMAGE }} | |
| RUN pip install -U lm-eval[api] | |
| RUN pip show lm-eval || true | |
| RUN pip install hf_transfer | |
| RUN pip show hf_transfer || true | |
| RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true | |
| RUN pip uninstall -y amd-aiter | |
| RUN pip install --upgrade "pybind11>=3.0.1" | |
| RUN pip show pybind11 | |
| RUN wget https://github.com/stedolan/jq/releases/download/jq-1.7/jq-linux64 -O jq | |
| RUN chmod +x jq | |
| RUN mv jq /usr/local/bin/jq | |
| RUN rm -rf /app/aiter-test | |
| RUN git clone --depth 1 https://github.com/ROCm/aiter.git /app/aiter-test && \\ | |
| cd /app/aiter-test && \\ | |
| git checkout HEAD && \\ | |
| git submodule sync && git submodule update --init --recursive && \\ | |
| MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop | |
| RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true | |
| RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true | |
| RUN pip uninstall -y atom | |
| RUN rm -rf /app/ATOM | |
| RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\ | |
| cd /app/ATOM && \\ | |
| git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\ | |
| pip install -e . | |
| RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true | |
| EOF | |
| - name: Build Docker image | |
| if: ${{ !github.event.pull_request.head.repo.fork && steps.pull_prebuilt.outputs.image_ready != 'true' }} | |
| run: | | |
| docker build --pull --network=host \ | |
| --no-cache \ | |
| -t atom_test:ci \ | |
| -f Dockerfile.mod . | |
| - name: Push Docker image | |
| if: ${{ !github.event.pull_request.head.repo.fork && steps.pull_prebuilt.outputs.image_ready != 'true' }} | |
| run: | | |
| IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }} | |
| docker tag atom_test:ci $IMAGE_TAG | |
| docker push $IMAGE_TAG | |
| - name: Success message | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }} | |
| if [ "${{ steps.pull_prebuilt.outputs.image_ready }}" = "true" ]; then | |
| echo "Successfully reused image: $IMAGE_TAG" | |
| else | |
| echo "Successfully rebuilt and pushed image: $IMAGE_TAG" | |
| fi | |
| atom-vllm-oot: | |
| needs: [wait-atom-test-success, pre-checks, build_atom_image] | |
| if: ${{ needs.wait-atom-test-success.outputs.atom_test_ok == 'true' && needs.pre-checks.result == 'success' && needs.build_atom_image.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }} | |
| name: ATOM vLLM OOT Test | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: | |
| # Keep CI runtime under control: enable only one OOT model for now. | |
| - model_name: "Kimi-K2-Thinking-MXFP4" | |
| model_path: "amd/Kimi-K2-Thinking-MXFP4" | |
| accuracy_test_threshold: "0.90" | |
| runner: atom-mi355-8gpu.predownload | |
| runs-on: ${{ matrix.runner }} | |
| timeout-minutes: 180 | |
| env: | |
| CONTAINER_NAME: atom_vllm_oot_${{ strategy.job-index }} | |
| OOT_IMAGE_TAG: atom_vllm_oot_test:${{ github.sha }}-${{ strategy.job-index }} | |
| VLLM_COMMIT: b31e9326a7d9394aab8c767f8ebe225c65594b60 | |
| VLLM_VERSION: "0.17" | |
| steps: | |
| - name: Clean up containers and workspace | |
| run: | | |
| echo "=== Cleaning up containers on $(hostname) ===" | |
| containers=$(docker ps -q) | |
| if [ -n "$containers" ]; then | |
| docker kill $containers || true | |
| fi | |
| docker rm -f "$CONTAINER_NAME" 2>/dev/null || true | |
| docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && rm -rf /workspace/*" || true | |
| - name: Checkout ATOM repo | |
| uses: actions/checkout@v4 | |
| - name: Docker Login | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| docker login -u ${{ secrets.DOCKER_USERNAME }} -p ${{ secrets.DOCKER_PASSWORD }} | |
| - name: Prepare OOT base image for forked repo | |
| if: ${{ github.event.pull_request.head.repo.fork }} | |
| run: | | |
| cat <<EOF > Dockerfile.mod | |
| FROM ${{ env.ATOM_BASE_NIGHTLY_IMAGE }} | |
| RUN pip install -U lm-eval[api] | |
| RUN pip show lm-eval || true | |
| RUN pip install hf_transfer | |
| RUN pip show hf_transfer || true | |
| RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true | |
| RUN pip uninstall -y amd-aiter | |
| RUN pip install --upgrade "pybind11>=3.0.1" | |
| RUN pip show pybind11 | |
| RUN rm -rf /app/aiter-test | |
| RUN git clone https://github.com/ROCm/aiter.git /app/aiter-test && \\ | |
| cd /app/aiter-test && \\ | |
| git checkout HEAD && \\ | |
| git submodule sync && git submodule update --init --recursive && \\ | |
| MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop | |
| RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true | |
| RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true | |
| # Fork PR fallback: this workflow cannot rely on pre-built images from | |
| # other workflows, so reinstall ATOM from the current PR commit. | |
| RUN pip uninstall -y atom | |
| RUN rm -rf /app/ATOM | |
| RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\ | |
| cd /app/ATOM && \\ | |
| git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\ | |
| pip install -e . | |
| RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true | |
| EOF | |
| docker build --pull --network=host \ | |
| --no-cache \ | |
| -t atom_oot_base:ci \ | |
| -f Dockerfile.mod . | |
| echo "OOT_BASE_IMAGE=atom_oot_base:ci" >> "$GITHUB_ENV" | |
| - name: Select OOT base image from pre-built ATOM image | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| echo "OOT_BASE_IMAGE=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}" >> "$GITHUB_ENV" | |
| - name: Build OOT vLLM image | |
| run: | | |
| if [ "${{ github.event_name }}" = "pull_request" ] && [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then | |
| pull_base_image=0 | |
| else | |
| pull_base_image=1 | |
| fi | |
| chmod +x docker/plugin/build_OOT_vLLM.sh | |
| IMAGE_TAG="${OOT_IMAGE_TAG}" \ | |
| BASE_IMAGE="${OOT_BASE_IMAGE}" \ | |
| VLLM_COMMIT="${VLLM_COMMIT}" \ | |
| VLLM_VERSION="${VLLM_VERSION}" \ | |
| INSTALL_LM_EVAL=1 \ | |
| PULL_BASE_IMAGE="${pull_base_image}" \ | |
| BUILD_NO_CACHE=1 \ | |
| docker/plugin/build_OOT_vLLM.sh | |
| - name: Run all plugin unit tests | |
| run: | | |
| docker run --rm \ | |
| -v "${{ github.workspace }}:/workspace" \ | |
| -w /workspace \ | |
| "$OOT_IMAGE_TAG" \ | |
| bash -lc "pytest -q tests/plugin" | |
| - name: Start OOT test container | |
| run: | | |
| if [ -f "/etc/podinfo/gha-render-devices" ]; then | |
| DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) | |
| else | |
| DEVICE_FLAG="--device /dev/dri" | |
| fi | |
| if [ -d "/models" ]; then | |
| MODEL_MOUNT="-v /models:/models" | |
| else | |
| echo "Warning: /models directory not found on runner; skipping /models mount." | |
| MODEL_MOUNT="" | |
| fi | |
| docker run -dt --device=/dev/kfd $DEVICE_FLAG \ | |
| -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \ | |
| $MODEL_MOUNT \ | |
| -w /workspace \ | |
| --ipc=host --group-add video \ | |
| --shm-size=16G \ | |
| --privileged \ | |
| --cap-add=SYS_PTRACE \ | |
| -e HF_TOKEN="${HF_TOKEN:-}" \ | |
| --security-opt seccomp=unconfined \ | |
| --ulimit memlock=-1 \ | |
| --ulimit stack=67108864 \ | |
| --name "$CONTAINER_NAME" \ | |
| "$OOT_IMAGE_TAG" | |
| env: | |
| GITHUB_WORKSPACE: ${{ github.workspace }} | |
| - name: Download model if needed | |
| run: | | |
| if [ -d "/models" ] && [ ! -f "/models/${{ matrix.model_path }}/config.json" ]; then | |
| echo "Downloading model to /models/${{ matrix.model_path }}" | |
| docker exec -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} "$CONTAINER_NAME" bash -lc "hf download ${{ matrix.model_path }} --local-dir /models/${{ matrix.model_path }}" | |
| else | |
| echo "Skip model download" | |
| fi | |
| - name: Run OOT launch and gsm8k accuracy via script (ci mode) | |
| timeout-minutes: 90 | |
| run: | | |
| docker exec "$CONTAINER_NAME" bash -lc " | |
| set -euo pipefail | |
| bash .github/scripts/atom_oot_test.sh accuracy ci | |
| " | |
| - name: Collect OOT accuracy summary | |
| if: success() | |
| run: | | |
| echo "OOT Accuracy Test Summary for ${{ matrix.model_name }}:" >> $GITHUB_STEP_SUMMARY | |
| docker exec "$CONTAINER_NAME" bash -lc "awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' /tmp/oot_accuracy_output.txt" >> $GITHUB_STEP_SUMMARY || true | |
| - name: Collect OOT logs and results | |
| if: always() | |
| run: | | |
| docker cp "$CONTAINER_NAME":/tmp/vllm_oot.log ./vllm_oot.log || true | |
| docker cp "$CONTAINER_NAME":/tmp/oot_accuracy_output.txt ./oot_accuracy_output.txt || true | |
| docker cp "$CONTAINER_NAME":/tmp/oot_accuracy_results ./oot_accuracy_results || true | |
| - name: Upload OOT artifacts | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: oot-${{ matrix.model_name }}-artifacts | |
| path: | | |
| vllm_oot.log | |
| oot_accuracy_output.txt | |
| oot_accuracy_results | |
| - name: Clean up OOT test | |
| if: always() | |
| run: | | |
| docker exec "$CONTAINER_NAME" bash -lc "if [ -f /tmp/vllm_oot.pid ]; then kill \$(cat /tmp/vllm_oot.pid) || true; fi" || true | |
| docker stop "$CONTAINER_NAME" || true | |
| docker rm "$CONTAINER_NAME" || true |