256 changes: 256 additions & 0 deletions tests/slurm-tests/nano_30b_eval/check_results.py
@@ -0,0 +1,256 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Check results for nano_30b_eval SLURM benchmark suite."""

import argparse
import json
import re
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent)) # for utils.py
from utils import assert_all, get_nested_value, load_json, soft_assert # noqa: E402

NO_TOOLS_METRICS = {
"aime25": ("pass@1[avg-of-4]", "symbolic_correct", (88.0, 94.0)),
"gpqa": ("pass@1[avg-of-4]", "symbolic_correct", (69.0, 76.0)),
"mmlu-pro": ("pass@1", "symbolic_correct", (74.0, 82.0)),
Collaborator comment: for large datasets like this, I think we can use max_samples=256 or something like this to make it faster. (A hedged sketch of this idea appears after the NO_TOOLS_METRICS dict below.)

"ifbench": ("pass@1[avg-of-5]", "average_score", (66.0, 77.0)),
"livecodebench": ("pass@1[avg-of-4]", "accuracy", (62.0, 72.0)),
"arena-hard-v2": ("pass@1", "score", (61.0, 74.0)),
"arena-hard-v2-hard_prompt": ("pass@1", ("category_hard_prompt", "score"), (66.0, 78.0)),
"arena-hard-v2-creative_writing": ("pass@1", ("category_creative_writing", "score"), (55.0, 72.0)),
"scicode": ("pass@1[avg-of-4]", "subtask_accuracy", (28.0, 38.0)),
"hle": ("pass@1", "judge_correct", (8.0, 14.0)),
"aalcr": ("pass@1[avg-of-3]", "judge_correct", (30.0, 42.0)),
"mmlu-prox": ("pass@1", "symbolic_correct", (54.0, 65.0)),
"wmt24pp": ("en->xx", "bleu", (82.0, 90.0)),
}
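A hedged illustration of the max_samples idea from the collaborator comment above: such a cap would live in whatever launch configuration produces these eval-results, not in this checker script. Only the max_samples name comes from the comment; every other name below is a hypothetical placeholder for illustration.

    # Hedged sketch only: how a per-benchmark sample cap might be expressed in a
    # launch configuration. `max_samples` is taken from the comment above; the
    # surrounding structure is an assumption, not part of this suite.
    LARGE_BENCHMARKS = {"mmlu-pro", "mmlu-prox", "hle"}

    def benchmark_overrides(benchmark: str) -> dict:
        """Return extra generation arguments for large benchmarks (illustrative only)."""
        if benchmark in LARGE_BENCHMARKS:
            return {"max_samples": 256}  # cap large datasets to speed up the run
        return {}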

WITH_TOOLS_METRICS = {
"aime25": ("pass@1[avg-of-4]", "symbolic_correct", (95.0, 100.0)),
"gpqa": ("pass@1[avg-of-4]", "symbolic_correct", (72.0, 78.0)),
"hle": ("pass@1", "judge_correct", (13.0, 19.0)),
}

FORMAL_MATH_METRICS = {
"minif2f_pass1": ("minif2f", "pass@1[avg-of-32]", "lean4_correct", (42.0, 58.0)),
"minif2f_pass32": ("minif2f", "pass@32", "lean4_correct", (72.0, 88.0)),
}

AGENTIC_METRICS = {
"swe-bench": ("pass@1", "issues_resolved", (34.0, 44.0)),
}

TOOL_BENCHMARKS = ["aime25", "gpqa", "hle"]
MIN_TOOL_CALL_FRACTION = 0.05
MAX_TIMEOUTS = {
"aime25": 200,
"gpqa": 1000,
"hle": 4000,
}
TIMEOUT_INDICATORS = [
"execution timed out",
"timed out",
"process_status.*timeout",
"TimeoutError",
]


def load_metrics_block(metrics_path: Path, benchmark: str):
data = load_json(metrics_path)
soft_assert(benchmark in data, f"Missing benchmark {benchmark} in {metrics_path}")
return data[benchmark]


def normalize_percent(value: float) -> float:
return value * 100.0 if 0.0 <= value <= 1.0 else value


def resolve_metrics_entry(eval_dir: Path, benchmark_key: str):
if benchmark_key.startswith("arena-hard-v2-"):
metrics_path = eval_dir / "eval-results" / "arena-hard-v2" / "metrics.json"
metrics = load_metrics_block(metrics_path, "arena-hard-v2")
return metrics_path, metrics, "arena-hard-v2"
metrics_path = eval_dir / "eval-results" / benchmark_key / "metrics.json"
metrics = load_metrics_block(metrics_path, benchmark_key)
return metrics_path, metrics, benchmark_key


def check_metric_group(
eval_dir: Path, metric_config: dict[str, tuple[str, str | tuple[str, ...], tuple[float, float]]]
):
for benchmark, (agg_key, field, (lo, hi)) in metric_config.items():
metrics_path, metrics, benchmark_label = resolve_metrics_entry(eval_dir, benchmark)
soft_assert(agg_key in metrics, f"Missing aggregation key {agg_key} in {metrics_path}")
if agg_key not in metrics:
continue
agg_metrics = metrics[agg_key]
if isinstance(field, tuple):
value = get_nested_value(agg_metrics, field)
field_label = "/".join(field)
else:
soft_assert(field in agg_metrics, f"Missing field {field} in {metrics_path}")
if field not in agg_metrics:
continue
value = agg_metrics[field]
field_label = field
soft_assert(value is not None, f"Missing field {field_label} in {metrics_path}")
if value is None:
continue
value = normalize_percent(float(value))
print(f"{eval_dir.name}/{benchmark_label}/{agg_key}/{field_label}: {value}")
soft_assert(lo <= value <= hi, f"{benchmark}: {field_label}={value} out of range [{lo}, {hi}]")


def iter_output_rows(bench_dir: Path):
output_files = sorted(bench_dir.glob("output-rs*.jsonl"))
soft_assert(len(output_files) > 0, f"No output files found in {bench_dir}")

for output_path in output_files:
with output_path.open("rt", encoding="utf-8") as fin:
for line in fin:
if not line.strip():
continue
yield output_path, json.loads(line)


def check_tool_usage(eval_dir: Path):
total_samples = 0
samples_with_tools = 0
samples_with_tool_messages = 0

for benchmark in TOOL_BENCHMARKS:
bench_dir = eval_dir / "eval-results" / benchmark
for _, row in iter_output_rows(bench_dir):
total_samples += 1
soft_assert("num_tool_calls" in row, f"Missing num_tool_calls in {benchmark} output row")
soft_assert("conversation" in row, f"Missing conversation in {benchmark} output row")
if "num_tool_calls" not in row or "conversation" not in row:
continue
if row["num_tool_calls"] > 0:
samples_with_tools += 1
has_tool_message = False
for msg in row["conversation"]:
soft_assert(isinstance(msg, dict), f"Conversation entry is not a dict in {benchmark} output row")
if not isinstance(msg, dict):
continue
soft_assert("role" in msg, f"Missing role in {benchmark} conversation entry")
if "role" not in msg:
continue
if msg["role"] == "tool":
has_tool_message = True
if has_tool_message:
samples_with_tool_messages += 1

soft_assert(total_samples > 0, "No samples found in with_tools outputs")
tool_fraction = samples_with_tools / total_samples
print(
Comment on lines +158 to +160

Contributor comment:

⚠️ Potential issue | 🟠 Major

Prevent division by zero when tool outputs are missing.

At line 159, samples_with_tools / total_samples can still raise ZeroDivisionError, because soft_assert records the failure and execution continues rather than stopping.

Proposed fix:

     soft_assert(total_samples > 0, "No samples found in with_tools outputs")
+    if total_samples == 0:
+        return
     tool_fraction = samples_with_tools / total_samples

🤖 Prompt for AI agents: verify the finding against the current code, then guard the tool_fraction calculation against total_samples == 0 (check before dividing, default tool_fraction to 0, or return early) and make sure downstream uses of tool_fraction handle the fallback.

(A minimal sketch of the soft_assert pattern this relies on appears after check_tool_usage below.)

f"with_tools/tool_usage: {samples_with_tools}/{total_samples} "
f"samples used tools ({tool_fraction:.1%}), {samples_with_tool_messages} had tool messages"
)
soft_assert(
tool_fraction >= MIN_TOOL_CALL_FRACTION,
f"Too few samples used tools: {tool_fraction:.1%} < {MIN_TOOL_CALL_FRACTION:.0%}",
)
soft_assert(samples_with_tool_messages > 0, "No samples contained tool messages")
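The division-by-zero comment above hinges on soft_assert recording failures and continuing instead of raising. utils.py is not part of this diff, so the following is a minimal sketch of that pattern, assuming assert_all raises once at the end of the run.

    # Minimal sketch of the soft-assert pattern assumed by the review comment above.
    # The real helpers live in tests/slurm-tests/utils.py (not shown in this diff).
    _FAILURES: list[str] = []

    def soft_assert(condition: bool, message: str) -> None:
        """Record a failure and keep going, so later checks still run."""
        if not condition:
            _FAILURES.append(message)

    def assert_all() -> None:
        """Raise a single error at the end if any soft assertion failed."""
        if _FAILURES:
            raise AssertionError("Soft assertion failures:\n" + "\n".join(_FAILURES))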


def check_timeouts(eval_dir: Path):
timeout_pattern = re.compile("|".join(TIMEOUT_INDICATORS), re.IGNORECASE)

for benchmark in TOOL_BENCHMARKS:
bench_dir = eval_dir / "eval-results" / benchmark
bench_timeouts = 0

for output_path in sorted(bench_dir.glob("output-rs*.jsonl")):
file_timeouts = 0
with output_path.open("rt", encoding="utf-8") as fin:
Comment on lines +178 to +180

Contributor comment:

⚠️ Potential issue | 🟠 Major

Assert that timeout input files exist per benchmark.

check_timeouts currently lets a benchmark with no output-rs*.jsonl files pass with zero timeouts, which can mask missing eval outputs.

Proposed fix:

     for benchmark in TOOL_BENCHMARKS:
         bench_dir = eval_dir / "eval-results" / benchmark
         bench_timeouts = 0
+        output_files = sorted(bench_dir.glob("output-rs*.jsonl"))
+        soft_assert(len(output_files) > 0, f"No output files found in {bench_dir}")
+        if not output_files:
+            continue

-        for output_path in sorted(bench_dir.glob("output-rs*.jsonl")):
+        for output_path in output_files:

🤖 Prompt for AI agents: verify the finding against the current code, then collect the glob matches into a list, assert that the list is non-empty for each benchmark with a message naming bench_dir, and skip or fail clearly when no output-rs*.jsonl files are found so missing eval outputs cannot be silently ignored.

(A hedged alternative that reuses iter_output_rows is sketched after check_timeouts below.)

for line in fin:
if not line.strip():
continue
row = json.loads(line)
soft_assert("conversation" in row, f"Missing conversation in {benchmark}/{output_path.name}")
if "conversation" not in row:
continue
for msg in row["conversation"]:
soft_assert(
isinstance(msg, dict),
f"Conversation entry is not a dict in {benchmark}/{output_path.name}",
)
if not isinstance(msg, dict):
continue
soft_assert("role" in msg, f"Missing role in {benchmark}/{output_path.name}")
if "role" not in msg or msg["role"] != "tool":
continue
soft_assert("content" in msg, f"Missing content in {benchmark}/{output_path.name}")
if "content" not in msg:
continue
content = str(msg["content"])
if timeout_pattern.search(content):
file_timeouts += 1
bench_timeouts += file_timeouts
if file_timeouts > 0:
print(f"{benchmark}/{output_path.name}: num_code_timeouts={file_timeouts}")

allowed = MAX_TIMEOUTS[benchmark]
print(f"{benchmark} total code_timeouts: {bench_timeouts} (allowed: {allowed})")
soft_assert(
bench_timeouts <= allowed,
f"{benchmark}: code execution timeouts regressed: observed {bench_timeouts}, allowed <= {allowed}",
)
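A hedged alternative to the proposed fix above: iter_output_rows, defined earlier in this file, already soft-asserts that output files exist, so check_timeouts could reuse it instead of globbing again. This is a sketch of that idea, not the reviewer's exact proposal, and it drops the per-file breakdown for brevity.

    # Sketch: count timeout indicators for one benchmark by reusing iter_output_rows,
    # which already soft-asserts that output-rs*.jsonl files exist in bench_dir.
    def count_bench_timeouts(bench_dir: Path, timeout_pattern: re.Pattern) -> int:
        timeouts = 0
        for _, row in iter_output_rows(bench_dir):
            for msg in row.get("conversation", []):
                if not isinstance(msg, dict) or msg.get("role") != "tool":
                    continue
                if timeout_pattern.search(str(msg.get("content", ""))):
                    timeouts += 1
        return timeouts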


def check_formal_math(eval_dir: Path):
for label, (benchmark, agg_key, field, (lo, hi)) in FORMAL_MATH_METRICS.items():
metrics_path = eval_dir / "eval-results" / benchmark / "metrics.json"
metrics = load_metrics_block(metrics_path, benchmark)
soft_assert(agg_key in metrics, f"Missing aggregation key {agg_key} in {metrics_path}")
soft_assert(field in metrics[agg_key], f"Missing field {field} in {metrics_path}")
value = normalize_percent(float(metrics[agg_key][field]))
print(f"formal_math/{label}/{agg_key}/{field}: {value}")
soft_assert(lo <= value <= hi, f"{label}: {field}={value} out of range [{lo}, {hi}]")


def check_agentic(eval_dir: Path):
check_metric_group(eval_dir, AGENTIC_METRICS)


def main():
ap = argparse.ArgumentParser()
ap.add_argument("--workspace", required=True, help="Workspace directory containing results")
args = ap.parse_args()

eval_root = Path(args.workspace)

print("=== no_tools ===")
check_metric_group(eval_root / "no_tools", NO_TOOLS_METRICS)

print("\n=== with_tools ===")
check_metric_group(eval_root / "with_tools", WITH_TOOLS_METRICS)
check_tool_usage(eval_root / "with_tools")
check_timeouts(eval_root / "with_tools")

print("\n=== formal_math ===")
check_formal_math(eval_root / "formal_math")

print("\n=== agentic ===")
check_agentic(eval_root / "agentic")

assert_all()


if __name__ == "__main__":
main()