CI: Replace nightly image build with S3 aiter wheel installation #1476
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow display name and the events that start it.
name: ATOM Test

on:
  # Direct pushes to main.
  push:
    branches:
      - main
  # Pull requests that target `main`; documentation-only changes are skipped.
  pull_request:
    branches:
      - main
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  schedule:
    # Nightly at 00:00 Beijing time (16:00 UTC)
    - cron: '0 16 * * *'
  # Manual runs from the Actions UI / API.
  workflow_dispatch:
# One active run per workflow + ref. A newer run cancels an in-flight one
# on every ref except main, where runs are never cancelled.
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: "${{ github.ref != 'refs/heads/main' }}"
# Read-only scopes for the workflow's GITHUB_TOKEN.
permissions:
  actions: read
  contents: read
# Workflow-wide environment, visible to every job and step.
env:
  # Base image used for the CI container.
  ATOM_BASE_IMAGE: rocm/pytorch:latest
  # Clone URL of the PR head repository (covers forks); falls back to the upstream repo on non-PR events.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/ATOM.git' }}
  # PR head commit on pull_request, pushed commit on push. `head_commit` is only
  # present in push payloads, so fall back to github.sha to keep this non-empty
  # on schedule and workflow_dispatch runs.
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}
jobs:
  # Formatting / lint gate (reusable workflow); the test job only runs when it succeeds.
  pre-checks:
    uses: ./.github/workflows/pre-checks.yaml
    with:
      black: true
      ruff: true

  # Per-model job: start a ROCm container, install the latest aiter wheel and
  # ATOM, run a smoke inference, then run the accuracy test and gate on a
  # per-model threshold.
  atom:
    needs: [pre-checks]
    name: ATOM Test
    strategy:
      fail-fast: false
      matrix:
        include:
          # run_on_pr: true = run on all events; false = skip on PR (still runs on push/schedule/workflow_dispatch)
          - model_name: "Meta-Llama-3-8B-Instruct"
            model_path: "meta-llama/Meta-Llama-3-8B-Instruct"
            extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
            env_vars: ""
            accuracy_test_threshold: "0.73"
            runner: linux-atom-mi355-1
            run_on_pr: true
          - model_name: "Llama-3.3-70B-Instruct-MXFP4-Preview"
            model_path: "amd/Llama-3.3-70B-Instruct-MXFP4-Preview"
            extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
            env_vars: ""
            accuracy_test_threshold: "0.88"
            runner: linux-atom-mi355-1
            run_on_pr: true
          - model_name: "DeepSeek-R1-0528"
            model_path: "deepseek-ai/DeepSeek-R1-0528"
            extraArgs: "--kv_cache_dtype fp8 -tp 8"
            env_vars: ""
            accuracy_test_threshold: "0.94"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "DeepSeek-R1-0528 MTP"
            model_path: "deepseek-ai/DeepSeek-R1-0528"
            extraArgs: "--kv_cache_dtype fp8 -tp 8 --method mtp"
            env_vars: ""
            accuracy_test_threshold: "0.94"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "DeepSeek-R1-0528-FP4"
            model_path: "amd/DeepSeek-R1-0528-MXFP4"
            extraArgs: "--kv_cache_dtype fp8 -tp 8"
            env_vars: ""
            accuracy_test_threshold: "0.93"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "DeepSeek-R1-0528-FP4 MTP"
            model_path: "amd/DeepSeek-R1-0528-MXFP4"
            extraArgs: "--kv_cache_dtype fp8 -tp 8 --method mtp"
            env_vars: ""
            accuracy_test_threshold: "0.93"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "gpt-oss-120b"
            model_path: "openai/gpt-oss-120b"
            extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
            env_vars: |
              ATOM_GPT_OSS_MODEL=1
            accuracy_test_threshold: "0.38"
            runner: linux-atom-mi355-1
            run_on_pr: true
          - model_name: "gpt-oss-120b (2 GPUs)"
            model_path: "openai/gpt-oss-120b"
            extraArgs: "--kv_cache_dtype fp8 -tp 2 --enable-dp-attention --enable-expert-parallel --gpu-memory-utilization 0.3"
            env_vars: |
              ATOM_GPT_OSS_MODEL=1
            accuracy_test_threshold: "0.38"
            runner: linux-atom-mi355-4
            run_on_pr: true
          - model_name: "Qwen3-235B-A22B-Instruct-2507-FP8"
            model_path: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
            extraArgs: "--kv_cache_dtype fp8 -tp 8 --enable-expert-parallel"
            env_vars: |
              ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1
            accuracy_test_threshold: "0.87"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "Qwen3-235B-A22B-Instruct-2507-MXFP4"
            model_path: "amd/Qwen3-235B-A22B-Instruct-2507-MXFP4"
            extraArgs: "--kv_cache_dtype fp8 -tp 8 --enable-expert-parallel"
            env_vars: |
              ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1
            accuracy_test_threshold: "0.87"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: false
          - model_name: "Qwen3-Next-80B-A3B-Thinking"
            model_path: "Qwen/Qwen3-Next-80B-A3B-Thinking"
            extraArgs: "-tp 8"
            env_vars: ""
            accuracy_test_threshold: "0.65"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
    # Skip draft PRs; non-PR events have no pull_request payload and always pass this gate.
    if: ${{ needs.pre-checks.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }}
    runs-on: ${{ matrix.runner }}
    env:
      CONTAINER_NAME: atom_test_${{ strategy.job-index }}
    # The `matrix` context is not available in a job-level `if`, so the
    # per-model PR gate (`run_on_pr`) is repeated on every step below.
    steps:
      - name: Kill all Docker containers and clean up workspace
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
        run: |
          echo "=== Cleaning up containers on $(hostname) ==="
          containers=$(docker ps -q)
          if [ -n "$containers" ]; then
            # Intentionally unquoted: one argument per container ID.
            docker kill $containers || true
          fi
          # The workspace is wiped from inside a container (see the TODO in the
          # "Clean Up" step about root-owned files). Uses the same image as the
          # test container instead of a second hard-coded tag.
          docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged "$ATOM_BASE_IMAGE" bash -lc "ls -la /workspace/ && rm -rf /workspace/*" || true

      - name: Show Docker containers
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
        run: docker ps -a

      - name: Show ROCm memory usage
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
        run: rocm-smi --showmemuse

      - name: Show ROCm GPU processes
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
        run: rocm-smi --showpidgpus

      - name: Checkout ATOM repo
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        uses: actions/checkout@v4

      - name: Download latest aiter wheel from CI artifacts
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        env:
          # Handed to the script through the environment rather than being
          # interpolated into the script text.
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          echo "=== Finding latest aiter-whl-main artifact from ROCm/aiter ==="
          API_URL="https://api.github.com"
          AUTH_HEADER="Authorization: token ${GH_TOKEN}"
          AITER_TEST_WORKFLOW_ID=179476100
          # Search Aiter Test workflow runs on main branch for one that has an aiter-whl artifact.
          # -f makes an HTTP error fail the step with curl's own message instead of
          # feeding an error payload to jq.
          RUNS=$(curl -fsS -H "$AUTH_HEADER" \
            "$API_URL/repos/ROCm/aiter/actions/workflows/$AITER_TEST_WORKFLOW_ID/runs?per_page=100&branch=main&event=push")
          ARTIFACT_ID=""
          ARTIFACT_NAME=""
          for RUN_ID in $(echo "$RUNS" | jq -r '.workflow_runs[].id'); do
            # First non-expired artifact of this run whose name starts with "aiter-whl-main" ("null" when none).
            ARTIFACT_JSON=$(curl -fsS -H "$AUTH_HEADER" \
              "$API_URL/repos/ROCm/aiter/actions/runs/$RUN_ID/artifacts" \
              | jq '[.artifacts[] | select(.name | startswith("aiter-whl-main")) | select(.expired == false)] | first')
            if [ "$ARTIFACT_JSON" != "null" ] && [ -n "$ARTIFACT_JSON" ]; then
              ARTIFACT_ID=$(echo "$ARTIFACT_JSON" | jq -r '.id')
              ARTIFACT_NAME=$(echo "$ARTIFACT_JSON" | jq -r '.name')
              echo "Found artifact in run $RUN_ID: $ARTIFACT_NAME (ID: $ARTIFACT_ID)"
              break
            fi
          done
          if [ -z "$ARTIFACT_ID" ] || [ "$ARTIFACT_ID" = "null" ]; then
            echo "ERROR: No aiter-whl-main artifact found in recent Aiter Test runs"
            exit 1
          fi
          echo "=== Downloading artifact ==="
          # Start from an empty directory so a wheel left behind by an earlier
          # job on the same runner cannot be picked up below.
          rm -rf /tmp/aiter-whl /tmp/aiter-whl.zip
          mkdir -p /tmp/aiter-whl
          curl -fsSL -H "$AUTH_HEADER" \
            "$API_URL/repos/ROCm/aiter/actions/artifacts/$ARTIFACT_ID/zip" \
            -o /tmp/aiter-whl.zip
          unzip -o /tmp/aiter-whl.zip -d /tmp/aiter-whl
          rm -f /tmp/aiter-whl.zip
          # Glob with nullglob instead of `ls | head`: under `set -o pipefail` a
          # failing `ls` would abort the script before the error message below.
          shopt -s nullglob
          wheels=(/tmp/aiter-whl/amd_aiter*.whl)
          shopt -u nullglob
          AITER_WHL="${wheels[0]:-}"
          if [ -z "$AITER_WHL" ]; then
            echo "ERROR: No amd_aiter wheel found in artifact"
            ls -la /tmp/aiter-whl/
            exit 1
          fi
          echo "Downloaded wheel: $AITER_WHL"
          # Exported for the "Install aiter from wheel" step.
          echo "AITER_WHL_PATH=$AITER_WHL" >> "$GITHUB_ENV"

      - name: Start CI container
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          echo "Clean up containers..."
          (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker stop) || true
          (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker rm) || true

          # Use the device flags supplied by the runner when present; otherwise expose /dev/dri.
          if [ -f "/etc/podinfo/gha-render-devices" ]; then
            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
          else
            DEVICE_FLAG="--device /dev/dri"
          fi

          if [ -d "/models" ]; then
            MODEL_MOUNT="-v /models:/models"
          else
            echo "Warning: /models directory not found on runner; skipping /models mount and disabling model pre-download optimization."
            MODEL_MOUNT=""
          fi

          # Model-specific variables from the matrix, one KEY=VALUE per line.
          cat > /tmp/env_file.txt << 'EOF'
          ${{ matrix.env_vars }}
          EOF

          IMAGE_TAG="${{ env.ATOM_BASE_IMAGE }}"
          echo "Starting container with image: $IMAGE_TAG"
          echo "Model-specific environment variables for ${{ matrix.model_name }}:"
          cat /tmp/env_file.txt

          # DEVICE_FLAG and MODEL_MOUNT are intentionally unquoted: each holds
          # zero or more separate arguments.
          # NOTE(review): HF_TOKEN is not defined in this step's env in this
          # file, so it expands to empty unless the runner provides it.
          docker run -dt --device=/dev/kfd $DEVICE_FLAG \
            -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \
            $MODEL_MOUNT \
            -w /workspace \
            --ipc=host --group-add video \
            --shm-size=16G \
            --privileged \
            --cap-add=SYS_PTRACE \
            -e HF_TOKEN="${HF_TOKEN:-}" \
            --env-file /tmp/env_file.txt \
            --security-opt seccomp=unconfined \
            --ulimit memlock=-1 \
            --ulimit stack=67108864 \
            -e ATOM_DISABLE_MMAP=true \
            --name "$CONTAINER_NAME" \
            "$IMAGE_TAG"
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Check shm size
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          docker exec "$CONTAINER_NAME" df -h /dev/shm

      - name: Install aiter from wheel
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          echo "=== Copying wheel into container ==="
          WHL_NAME=$(basename "${{ env.AITER_WHL_PATH }}")
          docker cp "${{ env.AITER_WHL_PATH }}" "$CONTAINER_NAME:/tmp/$WHL_NAME"
          # $WHL_NAME is expanded by the host shell before the script reaches the container.
          docker exec "$CONTAINER_NAME" bash -lc "
            set -euo pipefail
            echo '=== Uninstalling existing amd-aiter ==='
            pip uninstall -y amd-aiter || true
            echo '=== Installing amd-aiter from wheel ==='
            pip install /tmp/$WHL_NAME
            echo '=== Installed amd-aiter version ==='
            pip show amd-aiter
          "

      - name: Install ATOM and dependencies
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          docker exec "$CONTAINER_NAME" bash -lc "
            set -euo pipefail
            pip install --timeout 60 --retries 10 -U 'lm-eval[api]'
            pip install --timeout 60 --retries 10 hf_transfer
            pip install --timeout 60 --retries 10 --upgrade 'pybind11>=3.0.1'
            echo '=== Installing ATOM ==='
            cd /workspace
            pip install -e .
            echo '=== Installed ATOM version ==='
            pip show atom
          "

      - name: Download models
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        env:
          # Kept out of the script text; passed to `docker exec` as a quoted value.
          HF_TOKEN: ${{ secrets.AMD_HF_TOKEN }}
        run: |
          if [ -d "/models" ]; then
            echo "/models directory found, downloading model to /models/${{ matrix.model_path }}"
            if ! docker exec -e HF_TOKEN="${HF_TOKEN}" "$CONTAINER_NAME" bash -lc "hf download ${{ matrix.model_path }} --local-dir /models/${{ matrix.model_path }}"; then
              echo "Model download failed for '${{ matrix.model_path }}'. Aborting."
              exit 1
            fi
          else
            echo "/models directory not found, skipping model download"
          fi

      - name: Run ATOM simple inference
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        timeout-minutes: 30
        run: |
          # Run the inference and capture output
          set -euo pipefail
          echo ""
          echo "========== Running test =========="
          if [ -d "/models" ]; then
            model_path="/models/${{ matrix.model_path }}"
          else
            model_path="${{ matrix.model_path }}"
          fi
          echo "Model path: $model_path"
          ls -la "$model_path" || true
          # Print debug logs
          echo "========= Runner debug logs ==============="
          ps aux
          rocm-smi --showmemuse
          rocm-smi --showpids
          docker ps -a
          echo "========= End runner debug logs ==============="
          # Only the "Prompt: " / "Completion:" lines are kept in the output file.
          docker exec "$CONTAINER_NAME" bash -lc "
            set -euo pipefail
            python3 -m atom.examples.simple_inference \
              --model \"$model_path\" \
              ${{ matrix.extraArgs }} \
              --temperature 0 \
              | grep -E '^Prompt: |^Completion:'
          " > atom_test_output.txt
          echo ""
          echo "========== Showing test output below =========="
          cat atom_test_output.txt

      - name: Compare output with golden outputs
        # The trailing `&& false` keeps this step disabled.
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && false
        timeout-minutes: 30
        # TODO: skip for all test until it's fixed
        run: |
          echo "========== Comparing output with golden outputs =========="
          if ! diff -u -B -w --strip-trailing-cr \
            atom_test_output.txt \
            ".github/workflows/golden_outputs/${{ matrix.model_name }}_golden_output.txt"; then
            echo "Failed: Output does not match golden outputs."
            exit 1
          else
            echo "Success: Output matches golden outputs."
          fi

      - name: Run ATOM accuracy test
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        timeout-minutes: 30
        run: |
          set -euo pipefail
          echo ""
          echo "========== Launching ATOM server =========="
          if [ -d "/models" ]; then
            model_path="/models/${{ matrix.model_path }}"
          else
            model_path="${{ matrix.model_path }}"
          fi
          docker exec "$CONTAINER_NAME" bash -lc "
            .github/scripts/atom_test.sh launch $model_path ${{ matrix.extraArgs }}
          "
          echo ""
          echo "========== Running accuracy test =========="
          # Output is also saved for the "Collect Test Summary" step.
          docker exec "$CONTAINER_NAME" bash -lc "
            .github/scripts/atom_test.sh accuracy $model_path
          " 2>&1 | tee atom_accuracy_output.txt

      - name: Check accuracy test results
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
        run: |
          # Newest results file wins.
          result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1)
          if [ -z "$result_file" ] || [ ! -f "$result_file" ]; then
            echo "ERROR: No results JSON file found in accuracy_test_results/"
            exit 2
          else
            echo "RESULT_FILE: $result_file"
          fi
          # `// empty` turns a missing/null metric into an empty string.
          flexible_extract_value=$(jq -r '.results.gsm8k["exact_match,flexible-extract"] // empty' "$result_file")
          echo "Flexible extract value: $flexible_extract_value"
          echo "Accuracy test threshold: ${{ matrix.accuracy_test_threshold }}"
          # Reject a missing or non-numeric metric explicitly: awk would compare
          # a value such as "null" as a string and report the test as passed.
          numeric_re='^[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$'
          if ! [[ "$flexible_extract_value" =~ $numeric_re ]]; then
            echo "ERROR: gsm8k exact_match,flexible-extract is missing or not numeric in $result_file"
            exit 2
          fi
          # Compare as float: use awk for decimal value comparison (+ 0 forces numeric context)
          result=$(awk -v val="$flexible_extract_value" -v threshold="${{ matrix.accuracy_test_threshold }}" 'BEGIN {print (val + 0 < threshold + 0) ? 1 : 0}')
          if [ "$result" -eq 1 ]; then
            echo "Accuracy test failed: Flexible extract value $flexible_extract_value is less than the threshold ${{ matrix.accuracy_test_threshold }}."
            exit 1
          else
            echo "Accuracy test passed: Flexible extract value $flexible_extract_value is greater than or equal to the threshold ${{ matrix.accuracy_test_threshold }}."
            exit 0
          fi

      - name: Collect Test Summary
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
        run: |
          echo "Accuracy Test Summary for ${{ matrix.model_name }}:" >> "$GITHUB_STEP_SUMMARY"
          # Copy the results table: from the "|Tasks|Version|" header up to the next blank line.
          awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' atom_accuracy_output.txt >> "$GITHUB_STEP_SUMMARY"

      - name: Upload output
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.model_name }}_atom_test_output.txt
          path: atom_test_output.txt

      - name: Clean Up
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
        run: |
          # TODO: run a separate container for cleanup of the workspace due to permission issue to remove some pyc files under __pycache__ whose owners are root.
          # We should use non-root user to run the test to avoid this issue.
          set -x
          echo "========== Cleaning up workspace =========="
          if [[ "${{ matrix.runner }}" == "atom-mi355-8gpu.predownload" ]]; then
            docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged "$ATOM_BASE_IMAGE" bash -lc "ls -la /workspace/ && rm -rf /workspace/*" || true
          fi
          docker stop "$CONTAINER_NAME" || true
          docker rm "$CONTAINER_NAME" || true