|
#!/bin/bash
set -euo pipefail

# Launch and accuracy-test vLLM servers for out-of-tree (OOT) supported models.
#
# Usage:
#   .github/scripts/atom_oot_test.sh launch <mode> [model_name]
#   .github/scripts/atom_oot_test.sh accuracy <mode> [model_name]
#
# TYPE:
#   launch   - launch vLLM server and wait until ready
#   accuracy - run gsm8k accuracy test (and threshold check)
#
# MODE:
#   ci   - only Kimi-K2
#   full - all OOT-supported models
#
# Optional model_name can be used to run a single model in full mode.
# Tunables (MAX_WAIT_RETRIES, VLLM_PORT, ...) are env-overridable below.

# Positional arguments with defaults: TYPE defaults to "launch", MODE to
# "ci"; SELECTED_MODEL stays empty unless a single model is requested.
TYPE=${1:-launch}
MODE=${2:-ci}
SELECTED_MODEL=${3:-}
| 21 | + |
# Validate positional arguments early and fail fast with a distinct usage
# exit code (2). Diagnostics belong on stderr, not stdout, so they are not
# swallowed when the script's stdout is captured or piped.
if [[ "$TYPE" != "launch" && "$TYPE" != "accuracy" ]]; then
  echo "Invalid TYPE: $TYPE. Expected: launch or accuracy" >&2
  exit 2
fi

if [[ "$MODE" != "ci" && "$MODE" != "full" ]]; then
  echo "Invalid MODE: $MODE. Expected: ci or full" >&2
  exit 2
fi
| 31 | + |
# Env-overridable tunables; `:=` assigns the default only when the variable
# is unset or empty, so CI callers can override any of these via environment.
: "${MAX_WAIT_RETRIES:=60}"               # readiness poll attempts
: "${WAIT_INTERVAL_SEC:=30}"              # seconds between poll attempts
: "${VLLM_PORT:=8000}"
: "${VLLM_HOST:=0.0.0.0}"
: "${VLLM_PID_FILE:=/tmp/vllm_oot.pid}"
: "${VLLM_LOG_FILE:=/tmp/vllm_oot.log}"
: "${RESULT_DIR:=/tmp/oot_accuracy_results}"
: "${ACCURACY_LOG_FILE:=/tmp/oot_accuracy_output.txt}"
| 40 | + |
# Model tables. One pipe-delimited entry per model:
#   MODEL_NAME|MODEL_PATH|EXTRA_ARGS|THRESHOLD
CI_MODE_MODELS=(
  "Kimi-K2|amd/Kimi-K2-Thinking-MXFP4|--trust-remote-code --kv-cache-dtype fp8 --tensor-parallel-size 8 --enable-expert-parallel|0.90"
)

FULL_MODE_MODELS=(
  "Qwen3 Dense|Qwen/Qwen3-8B|--trust-remote-code --kv-cache-dtype fp8 --tensor-parallel-size 1|0.70"
  "Qwen3 MoE|Qwen/Qwen3-235B-A22B-Instruct-2507-FP8|--trust-remote-code --kv-cache-dtype fp8 --tensor-parallel-size 8 --enable-expert-parallel|0.87"
  "DeepSeek-V3 family|deepseek-ai/DeepSeek-R1-0528|--trust-remote-code --kv-cache-dtype fp8 --tensor-parallel-size 8|0.93"
  "GPT-OSS|openai/gpt-oss-120b|--trust-remote-code --kv-cache-dtype fp8 --tensor-parallel-size 2 --enable-dp-attention --enable-expert-parallel --gpu-memory-utilization 0.3|0.38"
  "Kimi-K2|amd/Kimi-K2-Thinking-MXFP4|--trust-remote-code --kv-cache-dtype fp8 --tensor-parallel-size 8 --enable-expert-parallel|0.90"
)

# Pick the working set for this run based on MODE.
declare -a ACTIVE_MODELS=()
case "$MODE" in
  ci) ACTIVE_MODELS=("${CI_MODE_MODELS[@]}") ;;
  *)  ACTIVE_MODELS=("${FULL_MODE_MODELS[@]}") ;;
esac
| 61 | + |
resolve_model_path() {
  # Prefer a locally mirrored copy under /models when one is present
  # (detected via its config.json); otherwise return the identifier
  # unchanged so vLLM resolves it remotely.
  local candidate="/models/$1"
  if [[ -f "${candidate}/config.json" ]]; then
    printf '%s\n' "${candidate}"
  else
    printf '%s\n' "$1"
  fi
}
| 70 | + |
wait_server_ready() {
  # Poll the OpenAI-compatible /v1/models endpoint until the server answers
  # with an HTTP success, the server process dies, or retries are exhausted.
  #
  # Arguments:
  #   $1 - human-readable model name (logging only)
  # Globals read: MAX_WAIT_RETRIES, WAIT_INTERVAL_SEC, VLLM_PORT,
  #   VLLM_PID_FILE, VLLM_LOG_FILE
  # Returns: 0 when ready; 1 on early process exit or timeout.
  local model_name="$1"
  echo ""
  echo "========== Waiting for vLLM server (${model_name}) =========="
  for ((i=1; i<=MAX_WAIT_RETRIES; i++)); do
    # -f: treat HTTP 4xx/5xx as failure so a server that is up but still
    # loading (e.g. answering 503) is not reported as ready.
    # --max-time: bound each probe so a hung connection cannot stall the
    # loop. Probe noise is silenced; progress is logged below instead.
    if curl -fsS --max-time 10 "http://127.0.0.1:${VLLM_PORT}/v1/models" >/dev/null 2>&1; then
      echo "vLLM server is ready for ${model_name}."
      return 0
    fi

    # Bail out early if the launched process already died, dumping the log
    # tail for debugging instead of waiting out the full retry budget.
    if [[ -f "${VLLM_PID_FILE}" ]]; then
      local pid
      pid=$(cat "${VLLM_PID_FILE}")
      if ! kill -0 "${pid}" 2>/dev/null; then
        echo "vLLM process exited early for ${model_name}."
        tail -n 200 "${VLLM_LOG_FILE}" || true
        return 1
      fi
    fi

    echo "Waiting for vLLM server... (${i}/${MAX_WAIT_RETRIES})"
    sleep "${WAIT_INTERVAL_SEC}"
  done

  echo "vLLM server did not become ready in time for ${model_name}."
  tail -n 200 "${VLLM_LOG_FILE}" || true
  return 1
}
| 99 | + |
stop_server() {
  # Best-effort teardown of the background server recorded in the PID file.
  # Silently a no-op when no PID file exists (nothing was launched, or it
  # was already stopped); never fails, so it is safe inside an EXIT trap.
  local pid
  [[ -f "${VLLM_PID_FILE}" ]] || return 0
  pid=$(<"${VLLM_PID_FILE}")
  kill "${pid}" 2>/dev/null || true
  rm -f "${VLLM_PID_FILE}" || true
}
| 108 | + |
launch_one_model() {
  # Start one vLLM server in the background and block until it is ready.
  #
  # Arguments:
  #   $1 - human-readable model name (logging only)
  #   $2 - model path or repo id (resolved against /models first)
  #   $3 - extra `vllm serve` CLI flags as one space-separated string
  # Returns: wait_server_ready's status (0 ready, 1 died/timed out).
  local model_name="$1"
  local model_path="$2"
  local extra_args="$3"

  local resolved_model_path
  resolved_model_path=$(resolve_model_path "${model_path}")

  echo ""
  echo "========== Launching vLLM server =========="
  echo "Model name: ${model_name}"
  echo "Model path: ${resolved_model_path}"
  echo "Extra args: ${extra_args}"

  # ROCm/vLLM tuning knobs. NOTE(review): the exact semantics of these
  # values (e.g. RPC timeout units) are assumed from upstream vLLM docs —
  # confirm against the CI image before changing any of them.
  export SAFETENSORS_FAST_GPU=1
  export VLLM_ROCM_USE_AITER=1
  export VLLM_RPC_TIMEOUT=1800000
  export VLLM_CACHE_ROOT=/tmp/.cache/vllm
  export TORCHINDUCTOR_CACHE_DIR=/tmp/.cache/inductor
  # Wipe the whole cache root so the run starts from a clean compile/cache
  # state (both vLLM and Inductor caches live under /tmp/.cache above).
  rm -rf /tmp/.cache

  # Drop any stale PID file from a previous launch before forking.
  rm -f "${VLLM_PID_FILE}" || true

  # ${extra_args} is intentionally unquoted: it carries multiple CLI flags
  # that must word-split into separate arguments.
  nohup vllm serve "${resolved_model_path}" \
    --host "${VLLM_HOST}" \
    --port "${VLLM_PORT}" \
    --disable-log-requests \
    --async-scheduling \
    --load-format fastsafetensors \
    --max-model-len 16384 \
    ${extra_args} \
    > "${VLLM_LOG_FILE}" 2>&1 &
  echo $! > "${VLLM_PID_FILE}"
  echo "Server PID: $(cat "${VLLM_PID_FILE}")"

  # Function's exit status is the readiness result.
  wait_server_ready "${model_name}"
}
| 146 | + |
accuracy_one_model() {
  # Run the lm-eval gsm8k benchmark against the already-running vLLM server
  # and fail the script (assert exit status under `set -e`) when accuracy
  # falls below the threshold.
  #
  # Arguments:
  #   $1 - human-readable model name (logs + result filename)
  #   $2 - model path or repo id
  #   $3 - extra server CLI flags (unused here; kept so the signature
  #        mirrors launch_one_model)
  #   $4 - minimum acceptable gsm8k exact_match,flexible-extract score
  local model_name="$1"
  local model_path="$2"
  local extra_args="$3"
  local threshold="$4"

  local resolved_model_path
  resolved_model_path=$(resolve_model_path "${model_path}")

  # Install the evaluation harness on demand so this also works on images
  # that do not bundle it.
  if ! command -v lm_eval >/dev/null 2>&1; then
    echo "========== Installing lm-eval =========="
    pip install 'lm-eval[api]'
  fi

  mkdir -p "${RESULT_DIR}"
  # Declare and assign separately so a failing $(date) is not masked by
  # `local` (SC2155). Spaces in the model name become underscores.
  local result_file
  result_file="${RESULT_DIR}/$(date +%Y%m%d%H%M%S)_${model_name// /_}.json"

  echo ""
  echo "========== Running OOT gsm8k accuracy =========="
  echo "Model name: ${model_name}"
  echo "Threshold: ${threshold}"

  lm_eval --model local-completions \
    --model_args model="${resolved_model_path}",base_url="http://127.0.0.1:${VLLM_PORT}/v1/completions",num_concurrent=65,max_retries=1,tokenized_requests=False \
    --tasks gsm8k \
    --num_fewshot 3 \
    --output_path "${result_file}" 2>&1 | tee -a "${ACCURACY_LOG_FILE}"

  # Pass the result path via argv into a quoted heredoc instead of
  # interpolating shell variables into Python source: interpolation breaks
  # on quotes/backslashes in the path and is an injection hazard.
  local value
  value=$(python - "${result_file}" <<'PY'
import json
import sys

with open(sys.argv[1], "r", encoding="utf-8") as f:
    data = json.load(f)
print(data["results"]["gsm8k"]["exact_match,flexible-extract"])
PY
)

  echo "Result file: ${result_file}"
  echo "Flexible extract value: ${value}"
  echo "Accuracy threshold: ${threshold}"

  # Same argv-over-interpolation pattern for the threshold check; a failed
  # assert exits non-zero, which aborts the script under `set -e`.
  python - "${value}" "${threshold}" <<'PY'
import sys

value = float(sys.argv[1])
threshold = float(sys.argv[2])
assert value >= threshold, f"Accuracy failed: {value} < {threshold}"
print(f"Accuracy passed: {value} >= {threshold}")
PY
}
| 195 | + |
run_for_models() {
  # Drive the requested action over ACTIVE_MODELS, honoring the optional
  # SELECTED_MODEL filter.
  #
  # Arguments:
  #   $1 - "launch" (start the first matching server and stop iterating) or
  #        anything else (launch + accuracy + teardown for each match)
  # Exits 2 when no table entry matches the filter.
  local action="$1"
  local found=0
  local entry model_name model_path extra_args threshold

  for entry in "${ACTIVE_MODELS[@]}"; do
    IFS='|' read -r model_name model_path extra_args threshold <<< "${entry}"

    # Skip entries that do not match the single-model filter, if one is set.
    if [[ -n "${SELECTED_MODEL}" && "${SELECTED_MODEL}" != "${model_name}" ]]; then
      continue
    fi
    found=1

    case "${action}" in
      launch)
        # Launch mode brings up exactly one server and leaves it running.
        launch_one_model "${model_name}" "${model_path}" "${extra_args}"
        break
        ;;
      *)
        # Accuracy mode: launch + evaluate each selected model, then stop
        # the server before moving on to the next entry.
        launch_one_model "${model_name}" "${model_path}" "${extra_args}"
        accuracy_one_model "${model_name}" "${model_path}" "${extra_args}" "${threshold}"
        stop_server
        ;;
    esac
  done

  if (( found == 0 )); then
    echo "No model matched MODE=${MODE}, SELECTED_MODEL=${SELECTED_MODEL}"
    exit 2
  fi
}
| 224 | + |
# Tear the background server down on every exit path (success, failure,
# or signal-driven exit) so CI runners are left clean.
trap 'stop_server' EXIT

# TYPE was validated above to be exactly "launch" or "accuracy", so the
# catch-all arm can only ever mean "accuracy".
case "${TYPE}" in
  launch) run_for_models "launch" ;;
  *)      run_for_models "accuracy" ;;
esac
| 232 | + |
0 commit comments