Add CK-free fallback for fused QKNorm+RoPE+Cache #52

Workflow file for this run

.github/workflows/atom-test.yaml at 0ca5159

	# Workflow name as shown in the Actions UI.
	name: ATOM Test

	on:
	  # Every push to main.
	  push:
	    branches: [main]
	  pull_request:
	    branches: [main]  # Triggers on PRs targeting `main`
	    types: [opened, synchronize, reopened, ready_for_review]
	    # Changes that touch only these paths do not trigger the workflow.
	    # '**/*.md' matches Markdown files at any depth (the previous '*/.md'
	    # only matched a file literally named ".md" one directory down).
	    paths-ignore:
	      - '**/*.md'
	      - 'docs/**'
	      - 'LICENSE'
	      - '.gitignore'
	  schedule:
	    # Nightly at 00:00 Beijing time (16:00 UTC)
	    - cron: '0 16 * * *'
	  # Allows manual runs from the Actions tab.
	  workflow_dispatch:

	# One concurrency group per workflow + ref. A newer run cancels the
	# in-progress one for every ref except main, where runs are never cancelled.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

	# Workflow-wide environment, available to every job as `env.<NAME>`.
	env:
	  # Base image the CI image is built FROM.
	  # NOTE(review): "NIGTHLY" is a misspelling of "NIGHTLY"; the name is kept
	  # as-is because steps reference it by this exact spelling.
	  ATOM_BASE_NIGTHLY_IMAGE: rocm/atom-dev:latest
	  # Clone URL of the PR head repository, or the upstream repo for non-PR events.
	  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/ATOM.git' }}
	  # Commit under test. `pull_request.head.sha` exists only on PR events and
	  # `head_commit.id` only on push events; `github.sha` is the fallback so that
	  # schedule and workflow_dispatch runs do not end up with an empty SHA
	  # (which would break `git checkout` and the image tag).
	  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}

	jobs:
	# Gate job: calls the reusable pre-checks workflow with black and ruff enabled.
	# The other jobs only proceed when this job's result is 'success'.
	pre-checks:
	  uses: ./.github/workflows/pre-checks.yaml
	  with:
	    black: true
	    ruff: true

	# Builds the per-commit CI image (base image + aiter built from source +
	# ATOM at the commit under test) and pushes it to Docker Hub as
	# rocm/atom-dev:pre-build-<sha>.
	#
	# The fork check is on every step rather than on the job, so for fork PRs
	# the job still completes with all steps skipped.
	# NOTE(review): presumably this keeps the dependent `atom` job from being
	# skipped for forks - confirm before hoisting the condition to the job.
	build_atom_image:
	  if: ${{ needs.pre-checks.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }}
	  needs: [pre-checks]
	  name: Build ATOM image
	  runs-on: build-only-atom
	  steps:
	    - name: Checkout ATOM repo
	      if: ${{ !github.event.pull_request.head.repo.fork }}
	      uses: actions/checkout@v4

	    # The heredoc is unquoted, so `\\` below becomes a single `\` line
	    # continuation in the generated Dockerfile.
	    - name: Generate Dockerfile
	      if: ${{ !github.event.pull_request.head.repo.fork }}
	      run: |
	        cat <<EOF > Dockerfile.mod
	        FROM ${{ env.ATOM_BASE_NIGTHLY_IMAGE }}
	        RUN pip install -U lm-eval[api]
	        RUN pip show lm-eval || true
	        RUN pip install hf_transfer
	        RUN pip show hf_transfer || true
	        RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
	        RUN pip uninstall -y amd-aiter
	        RUN pip install --upgrade "pybind11>=3.0.1"
	        RUN pip show pybind11
	        RUN wget https://github.com/stedolan/jq/releases/download/jq-1.7/jq-linux64 -O jq
	        RUN chmod +x jq
	        RUN mv jq /usr/local/bin/jq
	        RUN rm -rf /app/aiter-test
	        RUN git clone --depth 1 https://github.com/ROCm/aiter.git /app/aiter-test && \\
	        cd /app/aiter-test && \\
	        git checkout HEAD && \\
	        git submodule sync && git submodule update --init --recursive && \\
	        MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
	        RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true

	        RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true
	        RUN pip uninstall -y atom
	        RUN rm -rf /app/ATOM
	        RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\
	        cd /app/ATOM && \\
	        git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
	        pip install -e .

	        RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true
	        EOF

	    - name: Build Docker image
	      if: ${{ !github.event.pull_request.head.repo.fork }}
	      run: |
	        docker build --network=host \
	          --no-cache \
	          -t atom_test:ci \
	          -f Dockerfile.mod .

	    # Credentials are passed through the step environment and piped to
	    # `docker login --password-stdin`, so the password never appears on a
	    # command line.
	    - name: Push Docker image
	      if: ${{ !github.event.pull_request.head.repo.fork }}
	      env:
	        DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
	        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
	      run: |
	        IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}
	        docker tag atom_test:ci "$IMAGE_TAG"
	        echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
	        docker push "$IMAGE_TAG"

	    # Each `run` step is a separate shell, so the IMAGE_TAG variable set in
	    # the push step does not exist here; the tag is spelled out again.
	    - name: Success message
	      if: ${{ !github.event.pull_request.head.repo.fork }}
	      run: |
	        echo "Successfully prepared image: rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}"

	# Model test job: one matrix entry per model / launch configuration.
	# Skipped for draft PRs and when pre-checks did not succeed.
	atom:
	  needs: [pre-checks, build_atom_image]
	  name: ATOM Test
	  if: ${{ needs.pre-checks.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }}
	  runs-on: ${{ matrix.runner }}

	  env:
	    # Container name is unique per matrix entry.
	    CONTAINER_NAME: atom_test_${{ strategy.job-index }}

	  strategy:
	    # One failing model does not cancel the remaining matrix entries.
	    fail-fast: false
	    matrix:
	      # Per-entry fields:
	      #   model_name              - display name, also used in artifact and golden-file names
	      #   model_path              - model identifier passed to `hf download` and to ATOM
	      #   extraArgs               - extra CLI arguments for inference and server launch
	      #   env_vars                - KEY=VALUE lines written to the container's env file
	      #   accuracy_test_threshold - minimum gsm8k flexible-extract score
	      #   runner                  - runner label the entry executes on
	      #   run_on_pr               - true = run on all events; false = skip on PR
	      #                             (still runs on push/schedule/workflow_dispatch)
	      include:
	        - model_name: "Meta-Llama-3-8B-Instruct"
	          model_path: "meta-llama/Meta-Llama-3-8B-Instruct"
	          extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
	          env_vars: ""
	          accuracy_test_threshold: "0.73"
	          runner: linux-atom-mi355-1
	          run_on_pr: true

	        - model_name: "Llama-3.3-70B-Instruct-MXFP4-Preview"
	          model_path: "amd/Llama-3.3-70B-Instruct-MXFP4-Preview"
	          extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
	          env_vars: ""
	          accuracy_test_threshold: "0.88"
	          runner: linux-atom-mi355-1
	          run_on_pr: true

	        - model_name: "DeepSeek-R1-0528"
	          model_path: "deepseek-ai/DeepSeek-R1-0528"
	          extraArgs: "--kv_cache_dtype fp8 -tp 8"
	          env_vars: ""
	          accuracy_test_threshold: "0.94"
	          runner: atom-mi355-8gpu.predownload
	          run_on_pr: true

	        - model_name: "DeepSeek-R1-0528 MTP"
	          model_path: "deepseek-ai/DeepSeek-R1-0528"
	          extraArgs: "--kv_cache_dtype fp8 -tp 8 --method mtp"
	          env_vars: ""
	          accuracy_test_threshold: "0.94"
	          runner: atom-mi355-8gpu.predownload
	          run_on_pr: true

	        - model_name: "DeepSeek-R1-0528-FP4"
	          model_path: "amd/DeepSeek-R1-0528-mtp-mxfp4"
	          extraArgs: "--kv_cache_dtype fp8 -tp 8"
	          env_vars: ""
	          accuracy_test_threshold: "0.93"
	          runner: atom-mi355-8gpu.predownload
	          run_on_pr: true

	        - model_name: "DeepSeek-R1-0528-FP4 MTP"
	          model_path: "amd/DeepSeek-R1-0528-mtp-mxfp4"
	          extraArgs: "--kv_cache_dtype fp8 -tp 8 --method mtp"
	          env_vars: ""
	          accuracy_test_threshold: "0.93"
	          runner: atom-mi355-8gpu.predownload
	          run_on_pr: true

	        - model_name: "gpt-oss-120b"
	          model_path: "openai/gpt-oss-120b"
	          extraArgs: "--kv_cache_dtype fp8 --gpu-memory-utilization 0.3"
	          env_vars: |
	            ATOM_GPT_OSS_MODEL=1
	          accuracy_test_threshold: "0.38"
	          runner: linux-atom-mi355-1
	          run_on_pr: true

	        - model_name: "gpt-oss-120b (2 GPUs)"
	          model_path: "openai/gpt-oss-120b"
	          extraArgs: "--kv_cache_dtype fp8 -tp 2 --enable-dp-attention --enable-expert-parallel --gpu-memory-utilization 0.3"
	          env_vars: |
	            ATOM_GPT_OSS_MODEL=1
	          accuracy_test_threshold: "0.38"
	          runner: linux-atom-mi355-4
	          run_on_pr: true

	        - model_name: "Qwen3-235B-A22B-Instruct-2507-FP8"
	          model_path: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
	          extraArgs: "--kv_cache_dtype fp8 -tp 8 --enable-expert-parallel"
	          env_vars: |
	            ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1
	          accuracy_test_threshold: "0.87"
	          runner: atom-mi355-8gpu.predownload
	          run_on_pr: true

	        - model_name: "Qwen3-235B-A22B-Instruct-2507-MXFP4"
	          model_path: "amd/Qwen3-235B-A22B-Instruct-2507-MXFP4"
	          extraArgs: "--kv_cache_dtype fp8 -tp 8 --enable-expert-parallel"
	          env_vars: |
	            ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1
	          accuracy_test_threshold: "0.87"
	          runner: atom-mi355-8gpu.predownload
	          run_on_pr: false

	        - model_name: "Qwen3-Next-80B-A3B-Thinking"
	          model_path: "Qwen/Qwen3-Next-80B-A3B-Thinking"
	          extraArgs: "-tp 8"
	          env_vars: ""
	          accuracy_test_threshold: "0.65"
	          runner: atom-mi355-8gpu.predownload
	          run_on_pr: true

	steps:
	# The next four steps run only on the 'atom-mi355-8gpu.predownload' runner:
	# they kill any running containers, empty the workspace, and print
	# container and GPU state before the test starts.
	- name: Kill all Docker containers and clean up workspace
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
	  run: |
	    echo "=== Cleaning up containers on $(hostname) ==="
	    running_ids=$(docker ps -q)
	    # Intentionally unquoted: one argument per container id.
	    [ -z "$running_ids" ] || docker kill $running_ids || true
	    # The workspace is emptied from inside a privileged container; failures are ignored.
	    docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && rm -rf /workspace/*" || true

	- name: Show Docker containers
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
	  run: docker ps -a

	- name: Show ROCm memory usage
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
	  run: rocm-smi --showmemuse

	- name: Show ROCm GPU processes
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && matrix.runner == 'atom-mi355-8gpu.predownload'
	  run: rocm-smi --showpidgpus

	# Check out the repository into the workspace (mounted into the test container later).
	- name: Checkout ATOM repo
	  if: matrix.run_on_pr == true || github.event_name != 'pull_request'
	  uses: actions/checkout@v4

	# Log in to Docker Hub; skipped for fork PRs.
	# Credentials come from the step environment and the password is piped to
	# `--password-stdin` so it never appears on a command line.
	- name: Docker Login
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && !github.event.pull_request.head.repo.fork
	  env:
	    DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
	    DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
	  run: |
	    echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin

	# Fork PRs build the CI image locally on the test runner as atom_test:ci.
	# The Dockerfile mirrors the one generated in the build_atom_image job
	# (including the jq install and the shallow aiter clone) so fork PRs are
	# tested in the same environment as the pre-built image.
	# The heredoc is unquoted, so `\\` below becomes a single `\` line
	# continuation in the generated Dockerfile.
	- name: Generate Dockerfile for forked repo
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && github.event.pull_request.head.repo.fork
	  run: |
	    cat <<EOF > Dockerfile.mod
	    FROM ${{ env.ATOM_BASE_NIGTHLY_IMAGE }}
	    RUN pip install -U lm-eval[api]
	    RUN pip show lm-eval || true
	    RUN pip install hf_transfer
	    RUN pip show hf_transfer || true
	    RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
	    RUN pip uninstall -y amd-aiter
	    RUN pip install --upgrade "pybind11>=3.0.1"
	    RUN pip show pybind11
	    RUN wget https://github.com/stedolan/jq/releases/download/jq-1.7/jq-linux64 -O jq
	    RUN chmod +x jq
	    RUN mv jq /usr/local/bin/jq
	    RUN rm -rf /app/aiter-test
	    RUN git clone --depth 1 https://github.com/ROCm/aiter.git /app/aiter-test && \\
	    cd /app/aiter-test && \\
	    git checkout HEAD && \\
	    git submodule sync && git submodule update --init --recursive && \\
	    MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
	    RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true

	    RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true
	    RUN pip uninstall -y atom
	    RUN rm -rf /app/ATOM
	    RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\
	    cd /app/ATOM && \\
	    git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
	    pip install -e .

	    RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true
	    EOF

	- name: Build Docker image for forked repo
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && github.event.pull_request.head.repo.fork
	  run: |
	    docker build --network=host \
	      --no-cache \
	      -t atom_test:ci \
	      -f Dockerfile.mod .

	# Starts the detached test container that later steps `docker exec` into.
	# Fork PRs use the locally built atom_test:ci image; everything else uses
	# the image tagged rocm/atom-dev:pre-build-<sha>.
	- name: Start CI container
	  if: matrix.run_on_pr == true || github.event_name != 'pull_request'
	  env:
	    GITHUB_WORKSPACE: ${{ github.workspace }}
	  run: |
	    echo "Clean up containers..."
	    (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker stop) || true
	    (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker rm) || true

	    # GPU render devices: taken from the pod info file when present,
	    # otherwise the whole /dev/dri tree is passed through.
	    if [ -f "/etc/podinfo/gha-render-devices" ]; then
	      DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
	    else
	      DEVICE_FLAG="--device /dev/dri"
	    fi

	    if [ -d "/models" ]; then
	      MODEL_MOUNT="-v /models:/models"
	    else
	      echo "Warning: /models directory not found on runner; skipping /models mount and disabling model pre-download optimization."
	      MODEL_MOUNT=""
	    fi

	    # Quoted heredoc: the matrix value is written verbatim, with no shell expansion.
	    cat > /tmp/env_file.txt << 'EOF'
	    ${{ matrix.env_vars }}
	    EOF

	    if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
	      IMAGE_TAG=atom_test:ci
	    else
	      IMAGE_TAG=rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}
	    fi

	    # Log the image that is actually started (previously always printed atom_test:ci).
	    echo "Starting container: $IMAGE_TAG"
	    echo "Model-specific environment variables for ${{ matrix.model_name }}:"
	    cat /tmp/env_file.txt

	    # $DEVICE_FLAG and $MODEL_MOUNT are intentionally unquoted so they split
	    # into separate arguments (or vanish when empty). The workspace mount and
	    # working directory are given once; the duplicate -v/-w pair was removed.
	    docker run -dt --device=/dev/kfd $DEVICE_FLAG \
	      -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \
	      $MODEL_MOUNT \
	      -w /workspace \
	      --ipc=host --group-add video \
	      --shm-size=16G \
	      --privileged \
	      --cap-add=SYS_PTRACE \
	      -e HF_TOKEN="${HF_TOKEN:-}" \
	      --env-file /tmp/env_file.txt \
	      --security-opt seccomp=unconfined \
	      --ulimit memlock=-1 \
	      --ulimit stack=67108864 \
	      -e ATOM_DISABLE_MMAP=true \
	      --name "$CONTAINER_NAME" \
	      "$IMAGE_TAG"

	# Prints the size of /dev/shm as seen from inside the test container.
	- name: Check shm size
	  if: matrix.run_on_pr == true || github.event_name != 'pull_request'
	  run: docker exec "$CONTAINER_NAME" df -h /dev/shm

	# Downloads the model into the runner's /models directory when that
	# directory exists; otherwise the download is skipped.
	# The Hugging Face token is provided through the step environment and
	# forwarded with `docker exec -e HF_TOKEN` (no value: docker copies it from
	# the calling environment), so the secret is not interpolated into the script.
	- name: Download models
	  if: matrix.run_on_pr == true || github.event_name != 'pull_request'
	  env:
	    HF_TOKEN: ${{ secrets.AMD_HF_TOKEN }}
	  run: |
	    if [ -d "/models" ]; then
	      echo "/models directory found, downloading model to /models/${{ matrix.model_path }}"
	      if ! docker exec -e HF_TOKEN "$CONTAINER_NAME" bash -lc "hf download ${{ matrix.model_path }} --local-dir /models/${{ matrix.model_path }}"; then
	        echo "Model download failed for '${{ matrix.model_path }}'. Aborting."
	        exit 1
	      fi
	    else
	      echo "/models directory not found, skipping model download"
	    fi

	# Runs the simple-inference example inside the container and stores the
	# "Prompt:" / "Completion:" lines in atom_test_output.txt on the runner.
	- name: Run ATOM simple inference
	  if: matrix.run_on_pr == true || github.event_name != 'pull_request'
	  timeout-minutes: 30
	  run: |
	    # Run the inference and capture output
	    set -euo pipefail

	    echo ""
	    echo "========== Running test =========="

	    # Use the copy under /models when that directory exists on the runner.
	    model_path="${{ matrix.model_path }}"
	    if [ -d "/models" ]; then
	      model_path="/models/${{ matrix.model_path }}"
	    fi
	    echo "Model path: $model_path"
	    ls -la $model_path || true

	    # Print debug logs
	    echo "========= Runner debug logs ==============="
	    ps aux
	    rocm-smi --showmemuse
	    rocm-smi --showpids
	    docker ps -a
	    echo "========= End runner debug logs ==============="

	    # pipefail inside the container makes a failing python run fail the
	    # step even though grep is the last command of the pipeline.
	    docker exec "$CONTAINER_NAME" bash -lc "
	      set -euo pipefail
	      python3 -m atom.examples.simple_inference \
	        --model \"$model_path\" \
	        ${{ matrix.extraArgs }} \
	        --temperature 0 \
	        | grep -E '^Prompt: |^Completion:'
	    " > atom_test_output.txt

	    echo ""
	    echo "========== Showing test output below =========="
	    cat atom_test_output.txt

	# Diffs the inference output against the checked-in golden file for the model.
	# TODO: skip for all test until it's fixed (the trailing `&& false` disables the step).
	- name: Compare output with golden outputs
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && false
	  timeout-minutes: 30
	  run: |
	    echo "========== Comparing output with golden outputs =========="
	    golden_file=".github/workflows/golden_outputs/${{ matrix.model_name }}_golden_output.txt"
	    # -B/-w ignore blank-line and whitespace differences.
	    if diff -u -B -w --strip-trailing-cr atom_test_output.txt "$golden_file"; then
	      echo "Success: Output matches golden outputs."
	    else
	      echo "Failed: Output does not match golden outputs."
	      exit 1
	    fi

	# Launches the ATOM server in the container via atom_test.sh, then runs the
	# accuracy test against it; the accuracy output is also saved on the runner
	# as atom_accuracy_output.txt.
	- name: Run ATOM accuracy test
	  if: matrix.run_on_pr == true || github.event_name != 'pull_request'
	  timeout-minutes: 30
	  run: |
	    set -euo pipefail

	    # Use the copy under /models when that directory exists on the runner.
	    model_path="${{ matrix.model_path }}"
	    if [ -d "/models" ]; then
	      model_path="/models/${{ matrix.model_path }}"
	    fi

	    echo ""
	    echo "========== Launching ATOM server =========="
	    docker exec "$CONTAINER_NAME" bash -lc "
	      .github/scripts/atom_test.sh launch $model_path ${{ matrix.extraArgs }}
	    "

	    echo ""
	    echo "========== Running accuracy test =========="
	    docker exec "$CONTAINER_NAME" bash -lc "
	      .github/scripts/atom_test.sh accuracy $model_path
	    " 2>&1 | tee atom_accuracy_output.txt

	# Reads the newest results JSON and fails the job when the gsm8k
	# "exact_match,flexible-extract" score is below the matrix threshold.
	# Exit codes: 1 = score below threshold, 2 = results file or score missing.
	- name: Check accuracy test results
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
	  run: |
	    result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1)
	    if [ -z "$result_file" ] || [ ! -f "$result_file" ]; then
	      echo "ERROR: No results JSON file found in accuracy_test_results/"
	      exit 2
	    else
	      echo "RESULT_FILE: $result_file"
	    fi
	    flexible_extract_value=$(jq -r '.results.gsm8k["exact_match,flexible-extract"]' "$result_file")
	    echo "Flexible extract value: $flexible_extract_value"
	    echo "Accuracy test threshold: ${{ matrix.accuracy_test_threshold }}"

	    # Reject a missing or non-numeric score. Without this check awk falls
	    # back to a string comparison, and "null" < "0.73" is false, so a
	    # missing metric would be reported as a pass.
	    if ! [[ "$flexible_extract_value" =~ ^[0-9]+([.][0-9]+)?([eE][-+]?[0-9]+)?$ ]]; then
	      echo "ERROR: Flexible extract value '$flexible_extract_value' in $result_file is not a number."
	      exit 2
	    fi

	    # Compare as float: use awk for decimal value comparison (+ 0 forces numeric).
	    result=$(awk -v val="$flexible_extract_value" -v threshold="${{ matrix.accuracy_test_threshold }}" 'BEGIN { print ((val + 0 < threshold + 0) ? 1 : 0) }')
	    if [ "$result" -eq 1 ]; then
	      echo "Accuracy test failed: Flexible extract value $flexible_extract_value is less than the threshold ${{ matrix.accuracy_test_threshold }}."
	      exit 1
	    fi
	    echo "Accuracy test passed: Flexible extract value $flexible_extract_value is greater than or equal to the threshold ${{ matrix.accuracy_test_threshold }}."

	# Appends the results table from the accuracy output to the job summary:
	# awk prints the non-empty lines from the "|Tasks|Version|" header row up
	# to the next blank line.
	- name: Collect Test Summary
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && success()
	  run: |
	    echo "Accuracy Test Summary for ${{ matrix.model_name }}:" >> $GITHUB_STEP_SUMMARY
	    awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' atom_accuracy_output.txt >> $GITHUB_STEP_SUMMARY

	# Uploads the inference output as an artifact named after the model.
	# always() keeps the upload running after earlier steps have failed.
	- name: Upload output
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
	  uses: actions/upload-artifact@v4
	  with:
	    name: ${{ matrix.model_name }}_atom_test_output.txt
	    path: atom_test_output.txt

	# Final cleanup; always() makes it run even after a failure. Every command
	# is best-effort (`|| true`).
	- name: Clean Up
	  if: (matrix.run_on_pr == true || github.event_name != 'pull_request') && always()
	  run: |
	    # TODO: run a separate container for cleanup of the workspace due to permission issue to remove some pyc files under __pycache__ whose owners are root.
	    # We should use non-root user to run the test to avoid this issue.
	    set -x
	    echo "========== Cleaning up workspace =========="
	    # Only the predownload runner has its workspace wiped from inside a privileged container.
	    if [[ ${{ matrix.runner }} == atom-mi355-8gpu.predownload ]]; then
	      docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && rm -rf /workspace/*" || true
	    fi
	    docker stop "$CONTAINER_NAME" || true
	    docker rm "$CONTAINER_NAME" || true

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add CK-free fallback for fused QKNorm+RoPE+Cache #52

Workflow file

Add CK-free fallback for fused QKNorm+RoPE+Cache #52

Uh oh!

Workflow file for this run