update slurm test for nano v3 #1389
@@ -0,0 +1,256 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Check results for nano_30b_eval SLURM benchmark suite."""

import argparse
import json
import re
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent))  # for utils.py
from utils import assert_all, get_nested_value, load_json, soft_assert  # noqa: E402

NO_TOOLS_METRICS = {
    "aime25": ("pass@1[avg-of-4]", "symbolic_correct", (88.0, 94.0)),
    "gpqa": ("pass@1[avg-of-4]", "symbolic_correct", (69.0, 76.0)),
    "mmlu-pro": ("pass@1", "symbolic_correct", (74.0, 82.0)),
    "ifbench": ("pass@1[avg-of-5]", "average_score", (66.0, 77.0)),
    "livecodebench": ("pass@1[avg-of-4]", "accuracy", (62.0, 72.0)),
    "arena-hard-v2": ("pass@1", "score", (61.0, 74.0)),
    "arena-hard-v2-hard_prompt": ("pass@1", ("category_hard_prompt", "score"), (66.0, 78.0)),
    "arena-hard-v2-creative_writing": ("pass@1", ("category_creative_writing", "score"), (55.0, 72.0)),
    "scicode": ("pass@1[avg-of-4]", "subtask_accuracy", (28.0, 38.0)),
    "hle": ("pass@1", "judge_correct", (8.0, 14.0)),
    "aalcr": ("pass@1[avg-of-3]", "judge_correct", (30.0, 42.0)),
    "mmlu-prox": ("pass@1", "symbolic_correct", (54.0, 65.0)),
    "wmt24pp": ("en->xx", "bleu", (82.0, 90.0)),
}

WITH_TOOLS_METRICS = {
    "aime25": ("pass@1[avg-of-4]", "symbolic_correct", (95.0, 100.0)),
    "gpqa": ("pass@1[avg-of-4]", "symbolic_correct", (72.0, 78.0)),
    "hle": ("pass@1", "judge_correct", (13.0, 19.0)),
}

FORMAL_MATH_METRICS = {
    "minif2f_pass1": ("minif2f", "pass@1[avg-of-32]", "lean4_correct", (42.0, 58.0)),
    "minif2f_pass32": ("minif2f", "pass@32", "lean4_correct", (72.0, 88.0)),
}

AGENTIC_METRICS = {
    "swe-bench": ("pass@1", "issues_resolved", (34.0, 44.0)),
}

TOOL_BENCHMARKS = ["aime25", "gpqa", "hle"]
MIN_TOOL_CALL_FRACTION = 0.05
MAX_TIMEOUTS = {
    "aime25": 200,
    "gpqa": 1000,
    "hle": 4000,
}
TIMEOUT_INDICATORS = [
    "execution timed out",
    "timed out",
    "process_status.*timeout",
    "TimeoutError",
]


def load_metrics_block(metrics_path: Path, benchmark: str):
    data = load_json(metrics_path)
    soft_assert(benchmark in data, f"Missing benchmark {benchmark} in {metrics_path}")
    return data[benchmark]

def normalize_percent(value: float) -> float:
    return value * 100.0 if 0.0 <= value <= 1.0 else value


def resolve_metrics_entry(eval_dir: Path, benchmark_key: str):
    if benchmark_key.startswith("arena-hard-v2-"):
        metrics_path = eval_dir / "eval-results" / "arena-hard-v2" / "metrics.json"
        metrics = load_metrics_block(metrics_path, "arena-hard-v2")
        return metrics_path, metrics, "arena-hard-v2"
    metrics_path = eval_dir / "eval-results" / benchmark_key / "metrics.json"
    metrics = load_metrics_block(metrics_path, benchmark_key)
    return metrics_path, metrics, benchmark_key


def check_metric_group(
    eval_dir: Path, metric_config: dict[str, tuple[str, str | tuple[str, ...], tuple[float, float]]]
):
    for benchmark, (agg_key, field, (lo, hi)) in metric_config.items():
        metrics_path, metrics, benchmark_label = resolve_metrics_entry(eval_dir, benchmark)
        soft_assert(agg_key in metrics, f"Missing aggregation key {agg_key} in {metrics_path}")
        if agg_key not in metrics:
            continue
        agg_metrics = metrics[agg_key]
        if isinstance(field, tuple):
            value = get_nested_value(agg_metrics, field)
            field_label = "/".join(field)
        else:
            soft_assert(field in agg_metrics, f"Missing field {field} in {metrics_path}")
            if field not in agg_metrics:
                continue
            value = agg_metrics[field]
            field_label = field
        soft_assert(value is not None, f"Missing field {field_label} in {metrics_path}")
        if value is None:
            continue
        value = normalize_percent(float(value))
        print(f"{eval_dir.name}/{benchmark_label}/{agg_key}/{field_label}: {value}")
        soft_assert(lo <= value <= hi, f"{benchmark}: {field_label}={value} out of range [{lo}, {hi}]")


def iter_output_rows(bench_dir: Path):
    output_files = sorted(bench_dir.glob("output-rs*.jsonl"))
    soft_assert(len(output_files) > 0, f"No output files found in {bench_dir}")

    for output_path in output_files:
        with output_path.open("rt", encoding="utf-8") as fin:
            for line in fin:
                if not line.strip():
                    continue
                yield output_path, json.loads(line)


def check_tool_usage(eval_dir: Path):
    total_samples = 0
    samples_with_tools = 0
    samples_with_tool_messages = 0

    for benchmark in TOOL_BENCHMARKS:
        bench_dir = eval_dir / "eval-results" / benchmark
        for _, row in iter_output_rows(bench_dir):
            total_samples += 1
            soft_assert("num_tool_calls" in row, f"Missing num_tool_calls in {benchmark} output row")
            soft_assert("conversation" in row, f"Missing conversation in {benchmark} output row")
            if "num_tool_calls" not in row or "conversation" not in row:
                continue
            if row["num_tool_calls"] > 0:
                samples_with_tools += 1
            has_tool_message = False
            for msg in row["conversation"]:
                soft_assert(isinstance(msg, dict), f"Conversation entry is not a dict in {benchmark} output row")
                if not isinstance(msg, dict):
                    continue
                soft_assert("role" in msg, f"Missing role in {benchmark} conversation entry")
                if "role" not in msg:
                    continue
                if msg["role"] == "tool":
                    has_tool_message = True
            if has_tool_message:
                samples_with_tool_messages += 1

    soft_assert(total_samples > 0, "No samples found in with_tools outputs")
    tool_fraction = samples_with_tools / total_samples
    print(
        f"with_tools/tool_usage: {samples_with_tools}/{total_samples} "
        f"samples used tools ({tool_fraction:.1%}), {samples_with_tool_messages} had tool messages"
    )
    soft_assert(
        tool_fraction >= MIN_TOOL_CALL_FRACTION,
        f"Too few samples used tools: {tool_fraction:.1%} < {MIN_TOOL_CALL_FRACTION:.0%}",
    )
    soft_assert(samples_with_tool_messages > 0, "No samples contained tool messages")

Contributor comment on lines +158 to +160: Prevent division by zero when tool outputs are missing. At line 159, the division will raise ZeroDivisionError when total_samples is 0, since soft_assert does not abort. Proposed fix:

     soft_assert(total_samples > 0, "No samples found in with_tools outputs")
+    if total_samples == 0:
+        return
     tool_fraction = samples_with_tools / total_samples
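For context on why the early return is needed: utils.py is not part of this diff, but the checks rely on soft_assert being non-fatal. A minimal sketch of the assumed pattern (names and behavior are assumptions, not the repository's actual implementation):

# Hypothetical sketch of the soft-assert pattern assumed here; utils.py is
# not shown in this diff, so these names and behaviors are assumptions.
_failures: list[str] = []

def soft_assert(condition: bool, message: str) -> None:
    # Record the failure instead of raising, so every check still runs.
    if not condition:
        _failures.append(message)

def assert_all() -> None:
    # Raise once at the end, reporting every collected failure.
    if _failures:
        raise AssertionError("\n".join(_failures))
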
def check_timeouts(eval_dir: Path):
    timeout_pattern = re.compile("|".join(TIMEOUT_INDICATORS), re.IGNORECASE)

    for benchmark in TOOL_BENCHMARKS:
        bench_dir = eval_dir / "eval-results" / benchmark
        bench_timeouts = 0

        for output_path in sorted(bench_dir.glob("output-rs*.jsonl")):
            file_timeouts = 0
            with output_path.open("rt", encoding="utf-8") as fin:
                for line in fin:
                    if not line.strip():
                        continue
                    row = json.loads(line)
                    soft_assert("conversation" in row, f"Missing conversation in {benchmark}/{output_path.name}")
                    if "conversation" not in row:
                        continue
                    for msg in row["conversation"]:
                        soft_assert(
                            isinstance(msg, dict),
                            f"Conversation entry is not a dict in {benchmark}/{output_path.name}",
                        )
                        if not isinstance(msg, dict):
                            continue
                        soft_assert("role" in msg, f"Missing role in {benchmark}/{output_path.name}")
                        if "role" not in msg or msg["role"] != "tool":
                            continue
                        soft_assert("content" in msg, f"Missing content in {benchmark}/{output_path.name}")
                        if "content" not in msg:
                            continue
                        content = str(msg["content"])
                        if timeout_pattern.search(content):
                            file_timeouts += 1
            bench_timeouts += file_timeouts
            if file_timeouts > 0:
                print(f"{benchmark}/{output_path.name}: num_code_timeouts={file_timeouts}")

        allowed = MAX_TIMEOUTS[benchmark]
        print(f"{benchmark} total code_timeouts: {bench_timeouts} (allowed: {allowed})")
        soft_assert(
            bench_timeouts <= allowed,
            f"{benchmark}: code execution timeouts regressed: observed {bench_timeouts}, allowed <= {allowed}",
        )

Contributor comment on lines +178 to +180: Assert timeout input files exist per benchmark. Proposed fix:

     for benchmark in TOOL_BENCHMARKS:
         bench_dir = eval_dir / "eval-results" / benchmark
         bench_timeouts = 0
+        output_files = sorted(bench_dir.glob("output-rs*.jsonl"))
+        soft_assert(len(output_files) > 0, f"No output files found in {bench_dir}")
+        if not output_files:
+            continue
-        for output_path in sorted(bench_dir.glob("output-rs*.jsonl")):
+        for output_path in output_files:
def check_formal_math(eval_dir: Path):
    for label, (benchmark, agg_key, field, (lo, hi)) in FORMAL_MATH_METRICS.items():
        metrics_path = eval_dir / "eval-results" / benchmark / "metrics.json"
        metrics = load_metrics_block(metrics_path, benchmark)
        soft_assert(agg_key in metrics, f"Missing aggregation key {agg_key} in {metrics_path}")
        soft_assert(field in metrics[agg_key], f"Missing field {field} in {metrics_path}")
        value = normalize_percent(float(metrics[agg_key][field]))
        print(f"formal_math/{label}/{agg_key}/{field}: {value}")
        soft_assert(lo <= value <= hi, f"{label}: {field}={value} out of range [{lo}, {hi}]")


def check_agentic(eval_dir: Path):
    check_metric_group(eval_dir, AGENTIC_METRICS)


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--workspace", required=True, help="Workspace directory containing results")
    args = ap.parse_args()

    eval_root = Path(args.workspace)

    print("=== no_tools ===")
    check_metric_group(eval_root / "no_tools", NO_TOOLS_METRICS)

    print("\n=== with_tools ===")
    check_metric_group(eval_root / "with_tools", WITH_TOOLS_METRICS)
    check_tool_usage(eval_root / "with_tools")
    check_timeouts(eval_root / "with_tools")

    print("\n=== formal_math ===")
    check_formal_math(eval_root / "formal_math")

    print("\n=== agentic ===")
    check_agentic(eval_root / "agentic")

    assert_all()


if __name__ == "__main__":
    main()
Review comment: for large datasets like this, I think we can use max_samples=256 or something like this to make it faster.
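One way the suggestion could look, as a hypothetical sketch only: cap how many output rows the checks read per benchmark. The max_samples parameter and its default are assumptions, and the reviewer may equally mean limiting samples in the eval job configuration rather than in this checker:

# Hypothetical sketch: cap how many rows iter_output_rows yields per benchmark.
# max_samples and its default value are assumptions, not part of this PR.
# The soft_assert on missing output files is omitted to keep the sketch self-contained.
import json
from pathlib import Path


def iter_output_rows(bench_dir: Path, max_samples: int | None = 256):
    output_files = sorted(bench_dir.glob("output-rs*.jsonl"))
    yielded = 0
    for output_path in output_files:
        with output_path.open("rt", encoding="utf-8") as fin:
            for line in fin:
                if not line.strip():
                    continue
                yield output_path, json.loads(line)
                yielded += 1
                # Stop early once the cap is reached.
                if max_samples is not None and yielded >= max_samples:
                    return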