# CI: Set HF_TOKEN from runner env with secret fallback (#1472)
# Workflow file for this run:

# ATOM CI: builds a test image on top of the nightly base, then runs a
# per-model accuracy matrix on AMD MI355 runners.
name: ATOM Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]  # Triggers on PRs targeting `main`
    types: [opened, synchronize, reopened, ready_for_review]
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  schedule:
    # Nightly at 00:00 Beijing time (16:00 UTC)
    - cron: '0 16 * * *'
  workflow_dispatch:

# One run per ref; only non-main refs cancel in-progress runs.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  # NOTE(review): "NIGTHLY" is a typo for "NIGHTLY". Kept as-is because the
  # name is exported into every step's environment and may be read by helper
  # scripts outside this file — confirm before renaming.
  ATOM_BASE_NIGTHLY_IMAGE: rocm/atom-dev:latest
  # Prefer the PR head repo/sha (covers forks); fall back to the upstream repo
  # and the push-commit sha for push/schedule/workflow_dispatch events.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/ATOM.git' }}
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id }}

jobs:
  # Lint/format gate (black + ruff) via a reusable workflow.
  pre-checks:
    uses: ./.github/workflows/pre-checks.yaml
    with:
      black: true
      ruff: true

  # Builds the test image once and pushes it so every matrix job can pull it
  # instead of rebuilding. Skipped entirely for fork PRs (no registry creds);
  # the `atom` job builds the image locally in that case.
  build_atom_image:
    # Run only after a green pre-check, for non-PR events or non-draft PRs.
    if: ${{ needs.pre-checks.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }}
    needs: [pre-checks]
    name: Build ATOM image
    runs-on: build-only-atom
    steps:
      - name: Checkout ATOM repo
        if: ${{ !github.event.pull_request.head.repo.fork }}
        uses: actions/checkout@v4
      - name: Generate Dockerfile
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: |
          cat <<EOF > Dockerfile.mod
          FROM ${{ env.ATOM_BASE_NIGTHLY_IMAGE }}
          RUN pip install -U lm-eval[api]
          RUN pip show lm-eval || true
          RUN pip install hf_transfer
          RUN pip show hf_transfer || true
          RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
          RUN pip uninstall -y amd-aiter
          RUN pip install --upgrade "pybind11>=3.0.1"
          RUN pip show pybind11
          # jq >= 1.7 is released under the jqlang org with asset jq-linux-amd64;
          # stedolan/jq only ships jq-linux64 up to 1.6, so the old URL 404s.
          RUN wget https://github.com/jqlang/jq/releases/download/jq-1.7/jq-linux-amd64 -O jq
          RUN chmod +x jq
          RUN mv jq /usr/local/bin/jq
          RUN rm -rf /app/aiter-test
          # NOTE(review): `git checkout HEAD` is a no-op on a fresh clone.
          RUN git clone --depth 1 https://github.com/ROCm/aiter.git /app/aiter-test && \\
          cd /app/aiter-test && \\
          git checkout HEAD && \\
          git submodule sync && git submodule update --init --recursive && \\
          MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
          RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
          RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true
          RUN pip uninstall -y atom
          RUN rm -rf /app/ATOM
          RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\
          cd /app/ATOM && \\
          git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
          pip install -e .
          RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true
          EOF
      - name: Build Docker image
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: |
          docker build --pull --network=host \
            --no-cache \
            -t atom_test:ci \
            -f Dockerfile.mod .
      - name: Push Docker image
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: |
          IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}
          docker tag atom_test:ci "$IMAGE_TAG"
          # --password-stdin keeps the secret out of argv/process listings.
          echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u "${{ secrets.DOCKER_USERNAME }}" --password-stdin
          docker push "$IMAGE_TAG"
      - name: Success message
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: |
          # Shell variables do not persist across steps, so rebuild the tag
          # here instead of referencing an (empty) $IMAGE_TAG.
          echo "Successfully prepared image: rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}"

  # Per-model accuracy matrix. Each entry pins a model, its server args,
  # optional container env vars, a gsm8k accuracy threshold, and the runner.
  atom:
    needs: [pre-checks, build_atom_image]
    name: ATOM Test
    strategy:
      fail-fast: false
      matrix:
        include:
          # run_on_pr: true = run on all events; false = skip on PR (still
          # runs on push/schedule/workflow_dispatch)
          - model_name: "Meta-Llama-3-8B-Instruct"
            model_path: "meta-llama/Meta-Llama-3-8B-Instruct"
            extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
            env_vars: ""
            accuracy_test_threshold: "0.73"
            runner: linux-atom-mi355-1
            run_on_pr: true
          - model_name: "Llama-3.3-70B-Instruct-MXFP4-Preview"
            model_path: "amd/Llama-3.3-70B-Instruct-MXFP4-Preview"
            extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
            env_vars: ""
            accuracy_test_threshold: "0.88"
            runner: linux-atom-mi355-1
            run_on_pr: true
          - model_name: "DeepSeek-R1-0528"
            model_path: "deepseek-ai/DeepSeek-R1-0528"
            extraArgs: "--kv_cache_dtype fp8 -tp 8"
            env_vars: ""
            accuracy_test_threshold: "0.94"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "DeepSeek-R1-0528 MTP"
            model_path: "deepseek-ai/DeepSeek-R1-0528"
            extraArgs: "--kv_cache_dtype fp8 -tp 8 --method mtp"
            env_vars: ""
            accuracy_test_threshold: "0.94"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "DeepSeek-R1-0528-FP4"
            model_path: "amd/DeepSeek-R1-0528-MXFP4"
            extraArgs: "--kv_cache_dtype fp8 -tp 8"
            env_vars: ""
            accuracy_test_threshold: "0.93"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "DeepSeek-R1-0528-FP4 MTP"
            model_path: "amd/DeepSeek-R1-0528-MXFP4"
            extraArgs: "--kv_cache_dtype fp8 -tp 8 --method mtp"
            env_vars: ""
            accuracy_test_threshold: "0.93"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "gpt-oss-120b"
            model_path: "openai/gpt-oss-120b"
            extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
            env_vars: |
              ATOM_GPT_OSS_MODEL=1
            accuracy_test_threshold: "0.38"
            runner: linux-atom-mi355-1
            run_on_pr: true
          - model_name: "gpt-oss-120b (2 GPUs)"
            model_path: "openai/gpt-oss-120b"
            extraArgs: "--kv_cache_dtype fp8 -tp 2 --enable-dp-attention --enable-expert-parallel --gpu-memory-utilization 0.3"
            env_vars: |
              ATOM_GPT_OSS_MODEL=1
            accuracy_test_threshold: "0.38"
            runner: linux-atom-mi355-4
            run_on_pr: true
          - model_name: "Qwen3-235B-A22B-Instruct-2507-FP8"
            model_path: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
            extraArgs: "--kv_cache_dtype fp8 -tp 8 --enable-expert-parallel"
            env_vars: |
              ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1
            accuracy_test_threshold: "0.87"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
          - model_name: "Qwen3-235B-A22B-Instruct-2507-MXFP4"
            model_path: "amd/Qwen3-235B-A22B-Instruct-2507-MXFP4"
            extraArgs: "--kv_cache_dtype fp8 -tp 8 --enable-expert-parallel"
            env_vars: |
              ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1
            accuracy_test_threshold: "0.87"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: false
          - model_name: "Qwen3-Next-80B-A3B-Thinking"
            model_path: "Qwen/Qwen3-Next-80B-A3B-Thinking"
            extraArgs: "-tp 8"
            env_vars: ""
            accuracy_test_threshold: "0.65"
            runner: atom-mi355-8gpu.predownload
            run_on_pr: true
    if: ${{ needs.pre-checks.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }}
    runs-on: ${{ matrix.runner }}
    env:
      # Unique per matrix slot so parallel jobs on one host don't collide.
      CONTAINER_NAME: atom_test_${{ strategy.job-index }}
    steps:
      # Export HF_TOKEN from the runner environment when present, otherwise
      # fall back to the org secret (PR #1472).
      - name: Set HF_TOKEN
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: echo "HF_TOKEN=${HF_TOKEN:-${{ secrets.AMD_HF_TOKEN }}}" >> "$GITHUB_ENV"
      - name: Kill all Docker containers and clean up workspace
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
        run: |
          echo "=== Cleaning up containers on $(hostname) ==="
          containers=$(docker ps -q)
          if [ -n "$containers" ]; then
            docker kill $containers || true
          fi
          # Root-owned leftovers require a privileged container to delete.
          docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && rm -rf /workspace/*" || true
      - name: Show Docker containers
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
        run: docker ps -a
      - name: Show ROCm memory usage
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
        run: rocm-smi --showmemuse
      - name: Show ROCm GPU processes
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
        run: rocm-smi --showpidgpus
      - name: Checkout ATOM repo
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        uses: actions/checkout@v4
      - name: Docker Login
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && !github.event.pull_request.head.repo.fork
        run: |
          # --password-stdin keeps the secret out of argv/process listings.
          echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u "${{ secrets.DOCKER_USERNAME }}" --password-stdin
      # Fork PRs cannot use the pre-built registry image; rebuild locally.
      # (Intentionally differs from the build job: no jq install, full clone.)
      - name: Generate Dockerfile for forked repo
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && github.event.pull_request.head.repo.fork
        run: |
          cat <<EOF > Dockerfile.mod
          FROM ${{ env.ATOM_BASE_NIGTHLY_IMAGE }}
          RUN pip install -U lm-eval[api]
          RUN pip show lm-eval || true
          RUN pip install hf_transfer
          RUN pip show hf_transfer || true
          RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
          RUN pip uninstall -y amd-aiter
          RUN pip install --upgrade "pybind11>=3.0.1"
          RUN pip show pybind11
          RUN rm -rf /app/aiter-test
          RUN git clone https://github.com/ROCm/aiter.git /app/aiter-test && \\
          cd /app/aiter-test && \\
          git checkout HEAD && \\
          git submodule sync && git submodule update --init --recursive && \\
          MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
          RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
          RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true
          RUN pip uninstall -y atom
          RUN rm -rf /app/ATOM
          RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\
          cd /app/ATOM && \\
          git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
          pip install -e .
          RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true
          EOF
      - name: Build Docker image for forked repo
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && github.event.pull_request.head.repo.fork
        run: |
          docker build --pull --network=host \
            --no-cache \
            -t atom_test:ci \
            -f Dockerfile.mod .
      - name: Start CI container
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        run: |
          echo "Clean up containers..."
          (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker stop) || true
          (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker rm) || true
          if [ -f "/etc/podinfo/gha-render-devices" ]; then
            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
          else
            DEVICE_FLAG="--device /dev/dri"
          fi
          if [ -d "/models" ]; then
            MODEL_MOUNT="-v /models:/models"
          else
            echo "Warning: /models directory not found on runner; skipping /models mount and disabling model pre-download optimization."
            MODEL_MOUNT=""
          fi
          cat > /tmp/env_file.txt << 'EOF'
          ${{ matrix.env_vars }}
          EOF
          echo "Starting container: atom_test:ci"
          echo "Model-specific environment variables for ${{ matrix.model_name }}:"
          cat /tmp/env_file.txt
          if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
            IMAGE_TAG=atom_test:ci
          else
            IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}
          fi
          # NOTE: the workspace mount and -w were previously passed twice;
          # mount once using GITHUB_WORKSPACE (falls back to $PWD).
          docker run -dt --device=/dev/kfd $DEVICE_FLAG \
            -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \
            $MODEL_MOUNT \
            -w /workspace \
            --ipc=host --group-add video \
            --shm-size=16G \
            --privileged \
            --cap-add=SYS_PTRACE \
            -e HF_TOKEN="${HF_TOKEN:-}" \
            --env-file /tmp/env_file.txt \
            --security-opt seccomp=unconfined \
            --ulimit memlock=-1 \
            --ulimit stack=67108864 \
            -e ATOM_DISABLE_MMAP=true \
            --name "$CONTAINER_NAME" \
            $IMAGE_TAG
      - name: Check shm size
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          docker exec "$CONTAINER_NAME" df -h /dev/shm
      - name: Download models
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        run: |
          if [ -d "/models" ]; then
            echo "/models directory found, downloading model to /models/${{ matrix.model_path }}"
            # Use the HF_TOKEN resolved by "Set HF_TOKEN" (runner env with
            # secret fallback) instead of re-inlining the secret here.
            if ! docker exec -e HF_TOKEN="$HF_TOKEN" "$CONTAINER_NAME" bash -lc "hf download ${{ matrix.model_path }} --local-dir /models/${{ matrix.model_path }}"; then
              echo "Model download failed for '${{ matrix.model_path }}'. Aborting."
              exit 1
            fi
          else
            echo "/models directory not found, skipping model download"
          fi
      - name: Run ATOM simple inference
        # Skip simple inference; accuracy test already validates correctness
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && false
        timeout-minutes: 30
        run: |
          # Run the inference and capture output
          set -euo pipefail
          echo ""
          echo "========== Running test =========="
          if [ -d "/models" ]; then
            model_path="/models/${{ matrix.model_path }}"
          else
            model_path="${{ matrix.model_path }}"
          fi
          echo "Model path: $model_path"
          ls -la $model_path || true
          # Print debug logs
          echo "========= Runner debug logs ==============="
          ps aux
          rocm-smi --showmemuse
          rocm-smi --showpids
          docker ps -a
          echo "========= End runner debug logs ==============="
          docker exec "$CONTAINER_NAME" bash -lc "
            set -euo pipefail
            python3 -m atom.examples.simple_inference \
              --model \"$model_path\" \
              ${{ matrix.extraArgs }} \
              --temperature 0 \
              | grep -E '^Prompt: |^Completion:'
          " > atom_test_output.txt
          echo ""
          echo "========== Showing test output below =========="
          cat atom_test_output.txt
      - name: Compare output with golden outputs
        # TODO: skip for all test until it's fixed
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && false
        timeout-minutes: 30
        run: |
          echo "========== Comparing output with golden outputs =========="
          if ! diff -u -B -w --strip-trailing-cr \
            atom_test_output.txt \
            ".github/workflows/golden_outputs/${{ matrix.model_name }}_golden_output.txt"; then
            echo "Failed: Output does not match golden outputs."
            exit 1
          else
            echo "Success: Output matches golden outputs."
          fi
      - name: Run ATOM accuracy test
        if: matrix.run_on_pr == true || github.event_name != 'pull_request'
        timeout-minutes: 30
        run: |
          set -euo pipefail
          echo ""
          echo "========== Launching ATOM server =========="
          if [ -d "/models" ]; then
            model_path="/models/${{ matrix.model_path }}"
          else
            model_path="${{ matrix.model_path }}"
          fi
          docker exec "$CONTAINER_NAME" bash -lc "
            .github/scripts/atom_test.sh launch $model_path ${{ matrix.extraArgs }}
          "
          echo ""
          echo "========== Running accuracy test =========="
          docker exec "$CONTAINER_NAME" bash -lc "
            .github/scripts/atom_test.sh accuracy $model_path
          " 2>&1 | tee atom_accuracy_output.txt
      - name: Check accuracy test results
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
        run: |
          result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1)
          if [ -z "$result_file" ] || [ ! -f "$result_file" ]; then
            echo "ERROR: No results JSON file found in accuracy_test_results/"
            exit 2
          else
            echo "RESULT_FILE: $result_file"
          fi
          flexible_extract_value=$(jq '.results.gsm8k["exact_match,flexible-extract"]' "$result_file")
          echo "Flexible extract value: $flexible_extract_value"
          echo "Accuracy test threshold: ${{ matrix.accuracy_test_threshold }}"
          # Compare as float: use awk for decimal value comparison
          result=$(awk -v val="$flexible_extract_value" -v threshold="${{ matrix.accuracy_test_threshold }}" 'BEGIN {print (val < threshold) ? 1 : 0}')
          if [ "$result" -eq 1 ]; then
            echo "Accuracy test failed: Flexible extract value $flexible_extract_value is less than the threshold ${{ matrix.accuracy_test_threshold }}."
            exit 1
          else
            echo "Accuracy test passed: Flexible extract value $flexible_extract_value is greater than or equal to the threshold ${{ matrix.accuracy_test_threshold }}."
            exit 0
          fi
      - name: Collect Test Summary
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
        run: |
          echo "Accuracy Test Summary for ${{ matrix.model_name }}:" >> "$GITHUB_STEP_SUMMARY"
          awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' atom_accuracy_output.txt >> "$GITHUB_STEP_SUMMARY"
      - name: Upload output
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.model_name }}_atom_test_output.txt
          # atom_test_output.txt only exists if simple inference is re-enabled;
          # also capture the accuracy log and tolerate missing files.
          path: |
            atom_test_output.txt
            atom_accuracy_output.txt
          if-no-files-found: ignore
      - name: Clean Up
        if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
        run: |
          # TODO: run a separate container for cleanup of the workspace due to permission issue to remove some pyc files under __pycache__ whose owners are root.
          # We should use non-root user to run the test to avoid this issue.
          set -x
          echo "========== Cleaning up workspace =========="
          if [[ "${{ matrix.runner }}" == "atom-mi355-8gpu.predownload" ]]; then
            docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && rm -rf /workspace/*" || true
          fi
          docker stop "$CONTAINER_NAME" || true
          docker rm "$CONTAINER_NAME" || true