Skip to content

Commit d9ac92f

Browse files
Lawhy authored and claude committed
feat(cli): save config.json to output directory for reproducibility
- Add to_dict() methods to ModelConfig, EnvConfig, EvalConfig
- Save config.json with benchmark, env_path, and all config settings
- Update docs to show config.json in output files

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent cfe5a2f commit d9ac92f

File tree

3 files changed

+48
-2
lines changed

3 files changed

+48
-2
lines changed

docs/evaluation.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,7 @@ Evaluation results are saved to the output directory:
212212

213213
```
214214
{benchmark}_eval/
215+
├── config.json # CLI configuration for reproducibility
215216
├── results.jsonl # Per-sample results (action, step_result, reward)
216217
└── metrics.json # Aggregated metrics (pass@k, etc.)
217218
```

src/strands_env/cli/__init__.py

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
from __future__ import annotations
1818

1919
import asyncio
20+
import json
2021
import logging
2122
from pathlib import Path
2223
from typing import Literal
@@ -266,10 +267,23 @@ def eval_cmd(
266267
output_dir = eval_config.get_output_dir(benchmark)
267268
results_path = eval_config.get_results_path(benchmark)
268269
metrics_path = eval_config.get_metrics_path(benchmark)
270+
config_path = eval_config.get_config_path(benchmark)
269271

270272
# Create output directory
271273
output_dir.mkdir(parents=True, exist_ok=True)
272274

275+
# Save config for reproducibility
276+
config_data = {
277+
"benchmark": benchmark,
278+
"env_path": str(env_path),
279+
"model": model_config.to_dict(),
280+
"env": env_config.to_dict(),
281+
"eval": eval_config.to_dict(),
282+
}
283+
with open(config_path, "w", encoding="utf-8") as f:
284+
json.dump(config_data, f, indent=2)
285+
click.echo(f"Saved config to {config_path}")
286+
273287
# Create evaluator
274288
evaluator = evaluator_cls(
275289
env_factory=env_factory,
@@ -293,8 +307,6 @@ def eval_cmd(
293307
metrics = evaluator.compute_metrics(results)
294308

295309
# Save metrics to JSON
296-
import json
297-
298310
with open(metrics_path, "w", encoding="utf-8") as f:
299311
json.dump(metrics, f, indent=2)
300312
click.echo(f"Saved metrics to {metrics_path}")

src/strands_env/cli/config.py

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,19 @@ class ModelConfig:
6161
# Sampling
6262
sampling: SamplingConfig = field(default_factory=SamplingConfig)
6363

64+
def to_dict(self) -> dict:
65+
"""Convert to dict for serialization."""
66+
return {
67+
"backend": self.backend,
68+
"base_url": self.base_url,
69+
"tokenizer_path": self.tokenizer_path,
70+
"model_id": self.model_id,
71+
"region": self.region,
72+
"profile_name": self.profile_name,
73+
"role_arn": self.role_arn,
74+
"sampling": self.sampling.to_dict(),
75+
}
76+
6477

6578
@dataclass
6679
class EnvConfig:
@@ -77,6 +90,13 @@ def system_prompt(self) -> str | None:
7790
return None
7891
return self.system_prompt_path.read_text()
7992

93+
def to_dict(self) -> dict:
94+
"""Convert to dict for serialization."""
95+
return {
96+
"system_prompt_path": str(self.system_prompt_path) if self.system_prompt_path else None,
97+
"max_tool_iterations": self.max_tool_iterations,
98+
}
99+
80100

81101
@dataclass
82102
class EvalConfig:
@@ -101,3 +121,16 @@ def get_results_path(self, benchmark_name: str) -> Path:
101121
def get_metrics_path(self, benchmark_name: str) -> Path:
102122
"""Get path for metrics JSON file."""
103123
return self.get_output_dir(benchmark_name) / "metrics.json"
124+
125+
def get_config_path(self, benchmark_name: str) -> Path:
126+
"""Get path for config JSON file."""
127+
return self.get_output_dir(benchmark_name) / "config.json"
128+
129+
def to_dict(self) -> dict:
130+
"""Convert to dict for serialization."""
131+
return {
132+
"n_samples_per_prompt": self.n_samples_per_prompt,
133+
"max_concurrency": self.max_concurrency,
134+
"save_interval": self.save_interval,
135+
"keep_tokens": self.keep_tokens,
136+
}

0 commit comments

Comments (0)