Siddhant-K-code
diff --git a/‎src/agent_trace/eval/__init__.py‎
Lines changed: 35 additions & 0 deletions b/‎src/agent_trace/eval/__init__.py‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎src/agent_trace/eval/config.py‎
Lines changed: 227 additions & 0 deletions b/‎src/agent_trace/eval/config.py‎
Lines changed: 227 additions & 0 deletions
diff --git a/‎src/agent_trace/eval/dataset.py‎
Lines changed: 108 additions & 0 deletions b/‎src/agent_trace/eval/dataset.py‎
Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,35 @@
+"""Evaluation framework for agent sessions.
+
+Score, compare, and regression-test agent sessions against configurable
+scorers. All storage is local JSONL — no external service, no database.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+from .runner import cmd_eval_run, cmd_eval_compare, cmd_eval_ci
+from .dataset import cmd_dataset
+
+
+def cmd_eval(args: argparse.Namespace) -> int:
+    """Route ``agent-strace eval`` to the handler for its subcommand.
+
+    Reads ``args.eval_command`` and passes *args* to the matching handler,
+    returning whatever that handler returns. When the subcommand is missing
+    or not recognised, a message is written to stderr and 1 is returned.
+    """
+    subcommand = getattr(args, "eval_command", None)
+    if not subcommand:
+        sys.stderr.write(
+            "Usage: agent-strace eval <run|compare|ci|dataset> ...\n"
+            "Run `agent-strace eval --help` for details.\n"
+        )
+        return 1
+
+    # Subcommand name -> handler, compared in the order listed.
+    handlers = (
+        ("run", cmd_eval_run),
+        ("compare", cmd_eval_compare),
+        ("ci", cmd_eval_ci),
+        ("dataset", cmd_dataset),
+    )
+    for name, handler in handlers:
+        if subcommand == name:
+            return handler(args)
+
+    sys.stderr.write(f"Unknown eval subcommand: {subcommand}\n")
+    return 1
@@ -0,0 +1,227 @@
+"""Eval configuration loader (.agent-evals.yaml).
+
+Parses a YAML-like config file using stdlib only (no PyYAML dependency).
+Supports a minimal subset of YAML: string keys, scalar values, and lists.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class ScorerConfig:
+    # Settings for one scorer entry; load_config() builds one of these from
+    # each item of the config file's "scorers" list.
+    # Scorer identifier string (the built-in default config uses "no_errors").
+    type: str
+    # Threshold for this scorer; 1.0 when not given.
+    # NOTE(review): how the threshold is applied is decided by the code that
+    # consumes this config, which is not visible here.
+    threshold: float = 1.0
+    # Weight for this scorer; 1.0 when not given.
+    weight: float = 1.0
+    # Extra scorer-specific params stored as a dict
+    params: dict = field(default_factory=dict)
+
+
+@dataclass
+class EvalConfig:
+    """Top-level eval settings: the scorers to run plus pass/warn cut-offs."""
+
+    scorers: list[ScorerConfig] = field(default_factory=list)
+    pass_threshold: float = 0.85
+    warn_threshold: float = 0.70
+
+    @classmethod
+    def default(cls) -> "EvalConfig":
+        """Build the fallback config used when no config file is present.
+
+        The result holds a single ``no_errors`` scorer with threshold 1.0,
+        a pass threshold of 0.85 and a warn threshold of 0.70.
+        """
+        config = cls(pass_threshold=0.85, warn_threshold=0.70)
+        config.scorers.append(ScorerConfig(type="no_errors", threshold=1.0))
+        return config
+
+
+# ---------------------------------------------------------------------------
+# Minimal YAML parser (stdlib only)
+# ---------------------------------------------------------------------------
+
+def _parse_yaml_value(raw: str):
+    """Parse a scalar YAML value to a Python type.
+
+    Quoted strings are returned without their quotes and are never converted
+    to another type. For unquoted values a trailing ``# comment`` is removed
+    first, then booleans, nulls, integers and floats are recognised; anything
+    else is returned as the stripped string.
+
+    Args:
+        raw: The text found to the right of ``key:`` or ``- `` on one line.
+
+    Returns:
+        A ``bool``, ``None``, ``int``, ``float`` or ``str``.
+    """
+    raw = raw.strip()
+
+    if raw[:1] in ('"', "'"):
+        # Quoted scalar: take the text up to the matching quote, provided
+        # nothing but an optional comment follows it. A "#" inside the
+        # quotes is ordinary text.
+        closing = raw.find(raw[0], 1)
+        if closing != -1:
+            trailer = raw[closing + 1:].strip()
+            if not trailer or trailer.startswith("#"):
+                return raw[1:closing]
+    else:
+        # Plain scalar: a "#" at the start or preceded by whitespace begins a
+        # comment. A "#" glued to other text (e.g. "a#b") is part of the value.
+        for pos, char in enumerate(raw):
+            if char == "#" and (pos == 0 or raw[pos - 1] in " \t"):
+                raw = raw[:pos].rstrip()
+                break
+
+    if raw in ("true", "True", "yes"):
+        return True
+    if raw in ("false", "False", "no"):
+        return False
+    if raw in ("null", "~", ""):
+        return None
+    try:
+        return int(raw)
+    except ValueError:
+        pass
+    try:
+        return float(raw)
+    except ValueError:
+        pass
+    # Fallback for quoted text with stray inner quotes. The length check
+    # keeps a lone quote character from being reduced to an empty string.
+    if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in ('"', "'"):
+        return raw[1:-1]
+    return raw
+
+
+def _parse_minimal_yaml(text: str) -> dict:
+    """Parse a minimal YAML document into a nested dict/list structure.
+
+    Supports:
+    - key: value          (scalar mapping)
+    - key:                (mapping block — indented children follow)
+    - - key: value        (list of mappings)
+    - - scalar            (list of scalars)
+
+    A list that is the value of ``key:`` may be indented under the key or
+    start at the key's own column; both layouts are valid YAML. A value that
+    consists only of a ``# comment`` is treated as absent, so ``key:  # note``
+    still introduces a child block.
+
+    Blank lines and full-line comments are skipped. Inside a mapping, lines
+    without a ``:`` are ignored.
+
+    Args:
+        text: Full contents of the config file.
+
+    Returns:
+        The top-level mapping, or ``{}`` when the document is empty or its
+        top level is not a mapping.
+    """
+    lines = text.splitlines()
+
+    def _skip_blanks(idx: int) -> int:
+        """Advance *idx* past blank lines and full-line comments."""
+        while idx < len(lines):
+            stripped = lines[idx].strip()
+            if stripped and not stripped.startswith("#"):
+                break
+            idx += 1
+        return idx
+
+    def _indent(line: str) -> int:
+        """Return the number of leading whitespace characters of *line*."""
+        return len(line) - len(line.lstrip())
+
+    def _inline(rest: str) -> str:
+        """Return the text after ``key:``, or an empty string if it is only a comment."""
+        rest = rest.strip()
+        return "" if rest.startswith("#") else rest
+
+    def _parse_block(start: int, base_indent: int) -> tuple[int, object]:
+        """Parse a block starting at *start* with *base_indent* indentation.
+
+        Returns (next_line_index, parsed_value).
+        """
+        i = _skip_blanks(start)
+        if i >= len(lines):
+            return i, {}
+
+        # Determine block type from first non-blank line
+        if lines[i].strip().startswith("- "):
+            # List block
+            result_list: list = []
+            while i < len(lines):
+                i = _skip_blanks(i)
+                if i >= len(lines):
+                    break
+                line = lines[i]
+                ind = _indent(line)
+                content = line.strip()
+                if ind < base_indent:
+                    break
+                if not content.startswith("- "):
+                    break
+
+                item_content = content[2:].strip()
+                if ":" in item_content:
+                    # Mapping item: parse key: value on this line, then sub-keys
+                    item_dict: dict = {}
+                    key, _, rest = item_content.partition(":")
+                    key = key.strip()
+                    rest = _inline(rest)
+                    if rest:
+                        item_dict[key] = _parse_yaml_value(rest)
+                    else:
+                        # value is a sub-block
+                        i += 1
+                        i, sub = _parse_block(i, ind + 2)
+                        item_dict[key] = sub
+                        result_list.append(item_dict)
+                        continue
+
+                    # Collect additional key: value lines at deeper indent
+                    i += 1
+                    while i < len(lines):
+                        i = _skip_blanks(i)
+                        if i >= len(lines):
+                            break
+                        sub_line = lines[i]
+                        sub_ind = _indent(sub_line)
+                        sub_content = sub_line.strip()
+                        if sub_ind <= ind or sub_content.startswith("- "):
+                            break
+                        if ":" in sub_content:
+                            k, _, v = sub_content.partition(":")
+                            item_dict[k.strip()] = _parse_yaml_value(_inline(v))
+                        i += 1
+                    result_list.append(item_dict)
+                else:
+                    result_list.append(_parse_yaml_value(item_content))
+                    i += 1
+
+            return i, result_list
+
+        # Mapping block
+        result_dict: dict = {}
+        while i < len(lines):
+            i = _skip_blanks(i)
+            if i >= len(lines):
+                break
+            line = lines[i]
+            ind = _indent(line)
+            content = line.strip()
+            if ind < base_indent:
+                break
+            if content.startswith("- "):
+                break
+            if ":" not in content:
+                i += 1
+                continue
+
+            key, _, rest = content.partition(":")
+            key = key.strip()
+            rest = _inline(rest)
+            i += 1
+
+            if rest:
+                result_dict[key] = _parse_yaml_value(rest)
+                continue
+
+            # Look ahead for a child block. Besides deeper-indented children,
+            # accept a list whose "- " items sit at the key's own column;
+            # otherwise that list would end this mapping and every later key
+            # would be lost.
+            j = _skip_blanks(i)
+            has_child = False
+            child_indent = ind
+            if j < len(lines):
+                child_indent = _indent(lines[j])
+                has_child = child_indent > ind or (
+                    child_indent == ind and lines[j].strip().startswith("- ")
+                )
+            if has_child:
+                i, child = _parse_block(i, child_indent)
+                result_dict[key] = child
+            else:
+                result_dict[key] = None
+
+        return i, result_dict
+
+    _, result = _parse_block(0, 0)
+    if not isinstance(result, dict):
+        return {}
+    return result
+
+
+def load_config(path: str | Path = ".agent-evals.yaml") -> EvalConfig:
+    """Load eval config from *path*.
+
+    Loading is best-effort: a missing, unreadable or unparsable file yields
+    ``EvalConfig.default()``, and individual malformed values fall back to
+    their defaults instead of raising.
+
+    Args:
+        path: Location of the YAML config file.
+
+    Returns:
+        The parsed configuration. If the file defines no usable scorer, the
+        default scorer list is used together with the file's thresholds.
+    """
+
+    def _to_float(value, fallback: float) -> float:
+        """Convert *value* to float, returning *fallback* if it is not numeric."""
+        try:
+            return float(value)
+        except (TypeError, ValueError):
+            return fallback
+
+    p = Path(path)
+    if not p.exists():
+        return EvalConfig.default()
+
+    try:
+        text = p.read_text(encoding="utf-8")
+        data = _parse_minimal_yaml(text)
+    except Exception:
+        # Deliberately broad: a broken config file must not stop an eval run.
+        return EvalConfig.default()
+
+    # "scorers:" with no children parses to None, so guard before iterating.
+    raw_scorers = data.get("scorers")
+    if not isinstance(raw_scorers, list):
+        raw_scorers = []
+
+    scorers: list[ScorerConfig] = []
+    for s in raw_scorers:
+        if not isinstance(s, dict):
+            continue
+        raw_type = s.get("type")
+        # A null type must be skipped rather than becoming the string "None".
+        scorer_type = "" if raw_type is None else str(raw_type)
+        if not scorer_type:
+            continue
+        # NOTE(review): threshold falls back to "weight" when absent; kept
+        # for backward compatibility, confirm this is intended.
+        threshold = _to_float(s.get("threshold", s.get("weight", 1.0)), 1.0)
+        # Store the configured weight; it is excluded from params below and
+        # would otherwise be discarded.
+        weight = _to_float(s.get("weight"), 1.0)
+        params = {k: v for k, v in s.items() if k not in ("type", "threshold", "weight")}
+        scorers.append(
+            ScorerConfig(type=scorer_type, threshold=threshold, weight=weight, params=params)
+        )
+
+    thresholds = data.get("thresholds")
+    if not isinstance(thresholds, dict):
+        thresholds = {}
+    pass_t = _to_float(thresholds.get("pass", 0.85), 0.85)
+    warn_t = _to_float(thresholds.get("warn", 0.70), 0.70)
+
+    if not scorers:
+        scorers = EvalConfig.default().scorers
+
+    return EvalConfig(scorers=scorers, pass_threshold=pass_t, warn_threshold=warn_t)
@@ -0,0 +1,108 @@
+"""Dataset management for eval sessions.
+
+Datasets are JSONL files stored in .agent-traces/datasets/.
+Each entry records a session ID, label, and scorer configuration.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+import uuid
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+
+from ..store import TraceStore
+
+
+@dataclass
+class DatasetEntry:
+    """One dataset record, stored as a single JSON line."""
+
+    entry_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
+    session_id: str = ""
+    label: str = ""
+    added_at: float = field(default_factory=time.time)
+    scorers: list[dict] = field(default_factory=list)
+
+    def to_json(self) -> str:
+        """Serialise this entry to compact JSON (no spaces after separators)."""
+        payload = asdict(self)
+        return json.dumps(payload, separators=(",", ":"))
+
+    @classmethod
+    def from_json(cls, line: str) -> "DatasetEntry":
+        """Build an entry from one JSON line; its keys are passed as field names."""
+        decoded = json.loads(line)
+        return cls(**decoded)
+
+
+# ---------------------------------------------------------------------------
+# CRUD
+# ---------------------------------------------------------------------------
+
+def _ensure_dir(path: Path) -> None:
+    """Create the directory that contains *path*, including missing ancestors."""
+    parent_dir = path.parent
+    parent_dir.mkdir(exist_ok=True, parents=True)
+
+
+def add_entry(dataset_path: str | Path, entry: DatasetEntry) -> None:
+    """Append *entry* as one JSON line to the dataset file.
+
+    The file's parent directory is created first if it does not exist.
+    """
+    target = Path(dataset_path)
+    _ensure_dir(target)
+    with target.open("a", encoding="utf-8") as handle:
+        handle.write(f"{entry.to_json()}\n")
+
+
+def list_entries(dataset_path: str | Path) -> list[DatasetEntry]:
+    """Read every valid entry from the dataset file.
+
+    Returns an empty list when the file does not exist. Blank lines are
+    skipped, as are lines that fail to decode as JSON or that raise
+    ``TypeError`` while being turned into a ``DatasetEntry``.
+    """
+    source = Path(dataset_path)
+    if not source.exists():
+        return []
+
+    parsed = []
+    for raw_line in source.read_text(encoding="utf-8").splitlines():
+        record = raw_line.strip()
+        if not record:
+            continue
+        try:
+            parsed.append(DatasetEntry.from_json(record))
+        except (json.JSONDecodeError, TypeError):
+            # Skip a bad record rather than failing the whole listing.
+            pass
+    return parsed
+
+
+def export_entries(dataset_path: str | Path, out=None) -> None:
+    """Write every entry of the dataset to *out*, one JSON line per entry.
+
+    Args:
+        dataset_path: Path of the JSONL dataset file to read.
+        out: Writable text stream. When omitted, ``sys.stdout`` is looked up
+            at call time, so output follows any stdout redirection or
+            capture installed after this module was imported.
+    """
+    stream = sys.stdout if out is None else out
+    for entry in list_entries(dataset_path):
+        stream.write(entry.to_json() + "\n")
+
+
+# ---------------------------------------------------------------------------
+# CLI handler
+# ---------------------------------------------------------------------------
+
+def cmd_dataset(args: argparse.Namespace) -> int:
+    """Handle ``agent-strace eval dataset <add|list|export>``.
+
+    Reads ``dataset_command``, ``dataset``, ``session`` and ``label`` from
+    *args*. An attribute that is missing or ``None`` is treated the same way,
+    so options left at argparse's ``None`` default fall back cleanly.
+
+    Returns:
+        0 on success; 1 when ``add`` is called without a session or when the
+        subcommand is missing or unknown.
+    """
+    default_path = ".agent-traces/datasets/default.jsonl"
+    dataset_command = getattr(args, "dataset_command", None)
+    # `or` also covers an attribute that exists but is None.
+    dataset_path = getattr(args, "dataset", None) or default_path
+
+    if dataset_command == "add":
+        session_id = getattr(args, "session", None) or ""
+        label = getattr(args, "label", None) or ""
+        if not session_id:
+            sys.stderr.write("--session is required\n")
+            return 1
+        entry = DatasetEntry(session_id=session_id, label=label)
+        add_entry(dataset_path, entry)
+        sys.stderr.write(f"Added session {session_id} to dataset {dataset_path}\n")
+        return 0
+
+    if dataset_command == "list":
+        entries = list_entries(dataset_path)
+        if not entries:
+            sys.stdout.write(f"No entries in {dataset_path}\n")
+            return 0
+        sys.stdout.write(f"\nDataset: {dataset_path} ({len(entries)} entries)\n")
+        sys.stdout.write(f"{'─' * 60}\n")
+        for e in entries:
+            label = f"  {e.label}" if e.label else ""
+            sys.stdout.write(f"  {e.entry_id}  {e.session_id}{label}\n")
+        sys.stdout.write(f"{'─' * 60}\n\n")
+        return 0
+
+    if dataset_command == "export":
+        export_entries(dataset_path)
+        return 0
+
+    sys.stderr.write("Usage: agent-strace eval dataset <add|list|export>\n")
+    return 1