75 changes: 75 additions & 0 deletions configs/gpqa/run.sh
@@ -0,0 +1,75 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# GPQA reasoning eval — runs inside the NeMo Skills container.
#
# Phase 1: ns prepare_data gpqa (downloads GPQA Diamond from HF; gated dataset
# → set HF_TOKEN in benchmark.env)
# Phase 2: ns eval --benchmarks=gpqa:$REPEAT (NeMo Skills' default multi-choice
# extractor; pass@k via REPEAT)
#
# Server endpoint, model, and dataset can be overridden via env. Tuning knobs
# (max_tokens, repeat, etc.) match the upstream reasoning-eval reference
# (--benchmarks=gpqa:32, temperature=1.0, max_tokens=400000).
#
# Same nemo-run unquoting hazard as AIME applies — do not pass Hydra ++overrides
# with backslash-bearing values (e.g. custom extract_regex). Post-process the
# cached output-rs<seed>.jsonl files in Python if you need a broader extractor.

set -euo pipefail

ENDPOINT="${ENDPOINT:-http://localhost:8000/v1}"
MODEL="${MODEL:-dspro}"
DATASET="${DATASET:-gpqa}"
REPEAT="${REPEAT:-32}"
MAX_TOKENS="${MAX_TOKENS:-400000}"
NUM_THREADS="${NUM_THREADS:-512}"
TEMPERATURE="${TEMPERATURE:-1.0}"
TOP_P="${TOP_P:-1.0}"
SEED="${SEED:-42}"
OUTPUT_DIR="${OUTPUT_DIR:-/logs/accuracy/${DATASET}}"

export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}"

echo "=== Config ==="
echo " endpoint: $ENDPOINT"
echo " model: $MODEL"
echo " dataset: $DATASET"
echo " repeat: $REPEAT"
echo " max_tokens: $MAX_TOKENS"
echo " num_threads: $NUM_THREADS"
echo " temperature: $TEMPERATURE"
echo " top_p: $TOP_P"
echo " seed: $SEED"
echo " output_dir: $OUTPUT_DIR"
echo

if [ -z "${HF_TOKEN:-}" ]; then
echo "WARNING: HF_TOKEN is not set. GPQA Diamond is HF-gated; ns prepare_data"
echo " will fail unless the token is plumbed through benchmark.env."
fi

mkdir -p "$OUTPUT_DIR"

echo "=== Phase 1: prepare_data ==="
ns prepare_data "$DATASET"

echo
echo "=== Phase 2: ns eval ==="
ns eval \
  --server_type=openai \
  --model="$MODEL" \
  --server_address="$ENDPOINT" \
  --benchmarks="${DATASET}:${REPEAT}" \
  --output_dir="$OUTPUT_DIR" \
  --starting_seed="$SEED" \
  "++inference.tokens_to_generate=${MAX_TOKENS}" \
  "++max_concurrent_requests=${NUM_THREADS}" \
  "++inference.temperature=${TEMPERATURE}" \
  "++inference.top_p=${TOP_P}" \
  "++inference.timeout=25000000"

echo
echo "=== Done ==="
echo "Metrics: ${OUTPUT_DIR}/eval-results/${DATASET}/metrics.json"
71 changes: 62 additions & 9 deletions docs/accuracy.md
@@ -1,6 +1,6 @@
# Accuracy Benchmarks

In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa`, `longbenchv2`, and AIME (via the script under `configs/aime/`).
In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `longbenchv2`, AIME (via the script under `configs/aime/`), and GPQA (via the script under `configs/gpqa/`).

## Table of Contents

@@ -154,16 +154,69 @@ MMLU evaluation complete


## GPQA
For GPQA dataset, the benchmark section in yaml file can be modified in the following way:
```bash

GPQA runs in the official **NeMo Skills container** (`nvcr.io/nvidia/eval-factory/nemo-skills:26.03`),
side-by-side with the model server. There is no first-class `type: gpqa` runner —
the eval logic lives in `configs/gpqa/run.sh` and recipes wire it up via
`type: custom` (same pattern as AIME above).

### Recipe shape

```yaml
benchmark:
type: "gpqa"
num_examples: 198 # Number of examples to run
max_tokens: 65536 # We need a larger output token number for GPQA
repeat: 8 # Number of repetition
num_threads: 128 # Number of parallel threads for running benchmark
type: custom
container_image: nemo-skills # alias defined in srtslurm.yaml `containers:`
# or the full nvcr.io URI for Pyxis auto-pull
env:
OPENAI_API_KEY: "EMPTY" # ns/litellm requires it set; value is unused
HF_TOKEN: "${HF_TOKEN}" # REQUIRED: GPQA Diamond is HF-gated
# Optional knob overrides — defaults match the upstream reasoning-eval reference:
# MODEL: "dspro" # must match served-model-name from sglang_config
# DATASET: "gpqa" # ns prepare_data target (gpqa → GPQA Diamond)
# REPEAT: "32" # pass@k samples per problem
# MAX_TOKENS: "400000" # generous ceiling for reasoning traces
# NUM_THREADS: "512" # client-side concurrency
# TEMPERATURE: "1.0"
# TOP_P: "1.0"
# SEED: "42" # --starting_seed for reproducibility
command: |
bash /configs/gpqa/run.sh
```
The `context-length` argument here should be set to a value larger than `max_tokens`.

Container alias setup is identical to AIME — see the AIME section above for the
`srtslurm.yaml` `containers:` entry and the `enroot import` pre-cache step.

### HF gating

GPQA Diamond is gated on Hugging Face. The recipe must propagate `HF_TOKEN`
through `benchmark.env` (already plumbed end-to-end). Without it, `ns
prepare_data gpqa` aborts during dataset download. The script prints a
warning at startup if `HF_TOKEN` is unset.
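
A quick pre-flight check of the token (outside the job) saves a failed `ns prepare_data`
run. The sketch below uses `huggingface_hub`; the `Idavidrein/gpqa` repo id is an
assumption about where GPQA Diamond lives on the Hub; adjust it if NeMo Skills pulls
from a different repo.

```python
# Sanity-check HF_TOKEN before submitting the job (sketch, not part of run.sh).
# Assumption: GPQA Diamond is the gated "Idavidrein/gpqa" dataset repo.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
print("token belongs to:", api.whoami()["name"])  # fails fast on an invalid token
api.dataset_info("Idavidrein/gpqa")               # raises if the token lacks gated access
print("gated GPQA dataset is reachable")
```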

### Reasoning-mode env vars (server side)

For reasoning-capable models, set the same SGLang reasoning env vars as AIME
(see the AIME "Reasoning-mode env vars" section). Without them, GPQA pass@k
drops well below what the model can do.

### What the script does

1. `ns prepare_data $DATASET` — fetches GPQA from HF (gated) into the NeMo
Skills install.
2. `ns eval --benchmarks=${DATASET}:${REPEAT} ...` against
`http://localhost:8000/v1` (the in-job dynamo frontend) with the
upstream reasoning-eval reference's tuning defaults. NeMo Skills' default
multi-choice extractor scores the generations.

Outputs land at `/logs/accuracy/<dataset>/eval-results/<dataset>/metrics.json`
with pass@1, pass@N, and majority@N.
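
To pull the headline numbers without opening the file by hand, a few lines of Python
are enough. This is a sketch; the exact key layout inside `metrics.json` depends on
the NeMo Skills version, so it just pretty-prints whatever `ns eval` wrote.

```python
# Dump metrics.json after a run (sketch; key names vary across NeMo Skills versions).
import json
from pathlib import Path

metrics_file = Path("/logs/accuracy/gpqa/eval-results/gpqa/metrics.json")
metrics = json.loads(metrics_file.read_text())
print(json.dumps(metrics, indent=2, sort_keys=True))
```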

### Custom answer-extraction regex (not currently applied)

Same nemo-run unquoting hazard as AIME — do not pass Hydra `++overrides` with
backslash-bearing values to `ns eval`. Post-process the cached
`output-rs<seed>.jsonl` files with a Python script (raw-string regex, no shell
layers) if you need a broader extractor.
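
A minimal sketch of that post-processing pass is below. The record field names
(`generation`, `expected_answer`), the results directory, and the regex itself are
assumptions; inspect one line of an `output-rs<seed>.jsonl` file and adjust them
before trusting the rescored numbers.

```python
# Re-extract multiple-choice answers from the cached generations with a broader
# regex, entirely outside nemo-run, so no shell/Hydra unquoting can mangle the
# pattern. Field names and paths are assumptions; check one jsonl record first.
import json
import re
from pathlib import Path

ANSWER_RE = re.compile(r"(?:answer\s+is|answer:)\s*\**\(?([ABCD])\)?", re.IGNORECASE)

def rescore(jsonl_path: Path) -> float:
    """Fraction of records whose re-extracted letter matches the expected answer."""
    correct = total = 0
    with jsonl_path.open() as fh:
        for line in fh:
            rec = json.loads(line)
            match = ANSWER_RE.search(rec.get("generation", ""))
            pred = match.group(1).upper() if match else None
            correct += int(pred == rec.get("expected_answer"))
            total += 1
    return correct / max(total, 1)

results_dir = Path("/logs/accuracy/gpqa/eval-results/gpqa")
for path in sorted(results_dir.glob("output-rs*.jsonl")):
    print(path.name, f"{rescore(path):.3f}")
```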


## LongBench-V2
2 changes: 0 additions & 2 deletions docs/architecture.md
@@ -1087,9 +1087,7 @@ src/srtctl/
|   |-- __init__.py           # Registry and exports
|   |-- base.py               # BenchmarkRunner ABC, register_benchmark
|   |-- sa_bench.py           # SA-Bench throughput benchmark
|   |-- aime.py               # AIME math accuracy benchmark
|   |-- mmlu.py               # MMLU accuracy benchmark
|   |-- gpqa.py               # GPQA benchmark
|   |-- longbenchv2.py        # LongBench v2 benchmark
|   |-- router.py             # Router benchmark
|   |-- mooncake_router.py    # Mooncake router benchmark
21 changes: 0 additions & 21 deletions docs/config-reference.md
@@ -420,7 +420,6 @@ Benchmark configuration. The `type` field determines which benchmark runner is u
| `sa-bench` | Throughput/latency serving benchmark |
| `sglang-bench` | SGLang bench_serving benchmark |
| `mmlu` | MMLU accuracy evaluation |
| `gpqa` | GPQA (Graduate-level science QA) evaluation |
| `longbenchv2` | Long-context evaluation benchmark |
| `router` | Router performance with prefix caching |
| `mooncake-router` | KV-aware routing with Mooncake trace |
@@ -498,26 +497,6 @@ benchmark:
| `repeat` | int | No | 8 | Number of repeats |
| `num_threads` | int | No | 512 | Concurrent threads |

### gpqa

Graduate-level science QA evaluation using sglang.test.run_eval.

```yaml
benchmark:
type: "gpqa"
num_examples: 198 # Optional: Number of examples
max_tokens: 32768 # Optional: Max tokens per response
repeat: 8 # Optional: Number of repeats
num_threads: 128 # Optional: Concurrent threads
```

| Field | Type | Required | Default | Description |
| -------------- | ---- | -------- | ------- | ---------------------------- |
| `num_examples` | int | No | 198 | Number of examples to run |
| `max_tokens` | int | No | 32768 | Max tokens per response |
| `repeat` | int | No | 8 | Number of repeats |
| `num_threads` | int | No | 128 | Concurrent threads |

### longbenchv2

Long-context evaluation benchmark.
2 changes: 1 addition & 1 deletion examples/example.yaml
@@ -63,7 +63,7 @@ backend:

# Benchmark configuration
benchmark:
type: "sa-bench" # sa-bench, mmlu, gpqa, custom, or "manual" (no auto-benchmark)
type: "sa-bench" # sa-bench, mmlu, custom, or "manual" (no auto-benchmark)
isl: 1024 # Input sequence length
osl: 1024 # Output sequence length
concurrencies: [256, 512] # Concurrency levels to test
7 changes: 1 addition & 6 deletions recipes/h200/8k1k/bs64-2p3d-mtp.yaml
@@ -117,9 +117,4 @@ benchmark:
  concurrencies: "32x64x128"
  req_rate: "inf"

# benchmark:
# type: "gpqa"
# num_examples: 198
# repeat: 4
# num_threads: 32
# max_tokens: 64000
# See configs/gpqa/run.sh + docs/accuracy.md for the script-based GPQA recipe.
7 changes: 1 addition & 6 deletions recipes/h200/8k1k/bs64-2p3d.yaml
@@ -107,9 +107,4 @@ benchmark:
  concurrencies: "32x64x128"
  req_rate: "inf"

# benchmark:
# type: "gpqa"
# num_examples: 198
# repeat: 4
# num_threads: 32
# max_tokens: 64000
# See configs/gpqa/run.sh + docs/accuracy.md for the script-based GPQA recipe.
@@ -121,8 +121,14 @@ backend:
reasoning-parser: qwen3

benchmark:
type: "gpqa"
num_examples: 198
max_tokens: 65536
repeat: 8
num_threads: 32
type: custom
container_image: nemo-skills
env:
OPENAI_API_KEY: "EMPTY"
HF_TOKEN: "${HF_TOKEN}"
MODEL: "Qwen/Qwen3.5-397B-A17B-FP8"
MAX_TOKENS: "65536"
REPEAT: "8"
NUM_THREADS: "32"
command: |
bash /configs/gpqa/run.sh
@@ -123,8 +123,14 @@ backend:
reasoning-parser: qwen3

benchmark:
type: "gpqa"
num_examples: 198
max_tokens: 65536
repeat: 8
num_threads: 128
type: custom
container_image: nemo-skills
env:
OPENAI_API_KEY: "EMPTY"
HF_TOKEN: "${HF_TOKEN}"
MODEL: "Qwen/Qwen3.5-397B-A17B-FP8"
MAX_TOKENS: "65536"
REPEAT: "8"
NUM_THREADS: "128"
command: |
bash /configs/gpqa/run.sh
@@ -119,8 +119,14 @@ backend:
reasoning-parser: qwen3

benchmark:
type: "gpqa"
num_examples: 198
max_tokens: 65536
repeat: 1
num_threads: 64
type: custom
container_image: nemo-skills
env:
OPENAI_API_KEY: "EMPTY"
HF_TOKEN: "${HF_TOKEN}"
MODEL: "Qwen/Qwen3.5-397B-A17B-FP8"
MAX_TOKENS: "65536"
REPEAT: "1"
NUM_THREADS: "64"
command: |
bash /configs/gpqa/run.sh
@@ -110,8 +110,14 @@ backend:
watchdog-timeout: 1000000

benchmark:
type: "gpqa"
num_examples: 198
max_tokens: 65536
repeat: 8
num_threads: 128
type: custom
container_image: nemo-skills
env:
OPENAI_API_KEY: "EMPTY"
HF_TOKEN: "${HF_TOKEN}"
MODEL: "Qwen/Qwen3.5-397B-A17B-FP8"
MAX_TOKENS: "65536"
REPEAT: "8"
NUM_THREADS: "128"
command: |
bash /configs/gpqa/run.sh
@@ -114,8 +114,14 @@ backend:
watchdog-timeout: 1000000

benchmark:
type: "gpqa"
num_examples: 198
max_tokens: 65536
repeat: 8
num_threads: 128
type: custom
container_image: nemo-skills
env:
OPENAI_API_KEY: "EMPTY"
HF_TOKEN: "${HF_TOKEN}"
MODEL: "Qwen/Qwen3.5-397B-A17B-FP8"
MAX_TOKENS: "65536"
REPEAT: "8"
NUM_THREADS: "128"
command: |
bash /configs/gpqa/run.sh
@@ -109,8 +109,14 @@ backend:
watchdog-timeout: 1000000

benchmark:
type: "gpqa"
num_examples: 198
max_tokens: 65536
repeat: 8
num_threads: 128
type: custom
container_image: nemo-skills
env:
OPENAI_API_KEY: "EMPTY"
HF_TOKEN: "${HF_TOKEN}"
MODEL: "Qwen/Qwen3.5-397B-A17B-FP8"
MAX_TOKENS: "65536"
REPEAT: "8"
NUM_THREADS: "128"
command: |
bash /configs/gpqa/run.sh
2 changes: 0 additions & 2 deletions src/srtctl/benchmarks/__init__.py
@@ -6,7 +6,6 @@
# Import runners to trigger registration
from srtctl.benchmarks import (
    custom,
    gpqa,
    gsm8k,
    longbenchv2,
    mmlu,
@@ -33,7 +32,6 @@
    "sa_bench",
    "sglang_bench",
    "mmlu",
    "gpqa",
    "gsm8k",
    "longbenchv2",
    "router",