Skip to content

Commit d9ac92f

Browse files
Lawhy authored and claude committed
feat(cli): save config.json to output directory for reproducibility
- Add to_dict() methods to ModelConfig, EnvConfig, EvalConfig
- Save config.json with benchmark, env_path, and all config settings
- Update docs to show config.json in output files

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent cfe5a2f commit d9ac92f

File tree

3 files changed

+48
-2
lines changed

3 files changed

+48
-2
lines changed

docs/evaluation.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,7 @@ Evaluation results are saved to the output directory:
212212

213213
```
214214
{benchmark}_eval/
215+
├── config.json # CLI configuration for reproducibility
215216
├── results.jsonl # Per-sample results (action, step_result, reward)
216217
└── metrics.json # Aggregated metrics (pass@k, etc.)
217218
```

src/strands_env/cli/__init__.py

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
from __future__ import annotations
1818

1919
import asyncio
20+
import json
2021
import logging
2122
from pathlib import Path
2223
from typing import Literal
@@ -266,10 +267,23 @@ def eval_cmd(
266267
output_dir = eval_config.get_output_dir(benchmark)
267268
results_path = eval_config.get_results_path(benchmark)
268269
metrics_path = eval_config.get_metrics_path(benchmark)
270+
config_path = eval_config.get_config_path(benchmark)
269271

270272
# Create output directory
271273
output_dir.mkdir(parents=True, exist_ok=True)
272274

275+
# Save config for reproducibility
276+
config_data = {
277+
"benchmark": benchmark,
278+
"env_path": str(env_path),
279+
"model": model_config.to_dict(),
280+
"env": env_config.to_dict(),
281+
"eval": eval_config.to_dict(),
282+
}
283+
with open(config_path, "w", encoding="utf-8") as f:
284+
json.dump(config_data, f, indent=2)
285+
click.echo(f"Saved config to {config_path}")
286+
273287
# Create evaluator
274288
evaluator = evaluator_cls(
275289
env_factory=env_factory,
@@ -293,8 +307,6 @@ def eval_cmd(
293307
metrics = evaluator.compute_metrics(results)
294308

295309
# Save metrics to JSON
296-
import json
297-
298310
with open(metrics_path, "w", encoding="utf-8") as f:
299311
json.dump(metrics, f, indent=2)
300312
click.echo(f"Saved metrics to {metrics_path}")

src/strands_env/cli/config.py

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,19 @@ class ModelConfig:
6161
# Sampling
6262
sampling: SamplingConfig = field(default_factory=SamplingConfig)
6363

64+
def to_dict(self) -> dict:
65+
"""Convert to dict for serialization."""
66+
return {
67+
"backend": self.backend,
68+
"base_url": self.base_url,
69+
"tokenizer_path": self.tokenizer_path,
70+
"model_id": self.model_id,
71+
"region": self.region,
72+
"profile_name": self.profile_name,
73+
"role_arn": self.role_arn,
74+
"sampling": self.sampling.to_dict(),
75+
}
76+
6477

6578
@dataclass
6679
class EnvConfig:
@@ -77,6 +90,13 @@ def system_prompt(self) -> str | None:
7790
return None
7891
return self.system_prompt_path.read_text()
7992

93+
def to_dict(self) -> dict:
94+
"""Convert to dict for serialization."""
95+
return {
96+
"system_prompt_path": str(self.system_prompt_path) if self.system_prompt_path else None,
97+
"max_tool_iterations": self.max_tool_iterations,
98+
}
99+
80100

81101
@dataclass
82102
class EvalConfig:
@@ -101,3 +121,16 @@ def get_results_path(self, benchmark_name: str) -> Path:
101121
def get_metrics_path(self, benchmark_name: str) -> Path:
102122
"""Get path for metrics JSON file."""
103123
return self.get_output_dir(benchmark_name) / "metrics.json"
124+
125+
def get_config_path(self, benchmark_name: str) -> Path:
126+
"""Get path for config JSON file."""
127+
return self.get_output_dir(benchmark_name) / "config.json"
128+
129+
def to_dict(self) -> dict:
130+
"""Convert to dict for serialization."""
131+
return {
132+
"n_samples_per_prompt": self.n_samples_per_prompt,
133+
"max_concurrency": self.max_concurrency,
134+
"save_interval": self.save_interval,
135+
"keep_tokens": self.keep_tokens,
136+
}

0 commit comments

Comments (0)