Add grid-power traces and metric glossary to eval PR comments (#468)

tomquist · claude · web-flow · commit 122a4c47d518 · 2026-06-13T00:16:26.000+02:00
* Add collapsible metric glossary to steering-eval PR comment

* Add per-scenario grid-power chart to steering-eval PR comment

Export a downsampled grid-power trace from each scenario run and render it
as a Mermaid xychart (base vs head overlaid) inside the scenario's details
block in the CI comment.

* Increase steering-eval chart resolution to 1800 points

* Enlarge steering-eval chart canvas for thinner lines and resolution

Mermaid xychart has no line stroke-width config, so set a larger 1800x600
canvas via an init directive; GitHub scales the SVG to comment width,
rendering the line proportionally thinner and giving 1800 points room to
resolve.

---------

Co-authored-by: Claude &lt;noreply@anthropic.com&gt;
diff --git a/src/astrameter/simulator/evaluation.py b/src/astrameter/simulator/evaluation.py
@@ -22,9 +22,10 @@
     uv run python -m astrameter.simulator.evaluation --compare base.json \\
         --input head.json
 
-``--compare`` renders a Markdown before/after table; CI runs the suite on
-the PR base and head and posts that comparison as a sticky PR comment (see
-``.github/workflows/ci.yml``, job ``steering-eval``).
+``--compare`` renders a Markdown before/after table — including a Mermaid
+chart of each scenario's grid-power trace (base vs head) — and CI runs the
+suite on the PR base and head and posts that comparison as a sticky PR
+comment (see ``.github/workflows/ci.yml``, job ``steering-eval``).
 """
 
 from __future__ import annotations
@@ -68,6 +69,16 @@
 HEADROOM_MARGIN_W = 5.0
 SOC_EMPTY = 0.02
 SOC_FULL = 0.98
+# Number of points each scenario's grid-power trace is downsampled to for the
+# Mermaid chart in the CI PR comment. Base and head share this fixed count so
+# the two lines align by index regardless of poll cadence.
+GRAPH_POINTS = 1800
+# Mermaid's plot line has a fixed pixel stroke width with no config knob, so we
+# enlarge the chart canvas instead: GitHub scales the SVG down to the comment
+# width, which renders the line proportionally thinner — and the extra width
+# gives the 1800 points room to resolve instead of blobbing together.
+GRAPH_WIDTH = 1800
+GRAPH_HEIGHT = 600
 
 
 # ---------------------------------------------------------------------------
@@ -341,6 +352,29 @@ def _settle_time(samples: list[_Sample], start: float, end: float) -> float | No
     return None
 
 
+def _downsample_grid(
+    samples: list[_Sample], duration_s: float, n: int = GRAPH_POINTS
+) -> list[float]:
+    """Bucket the grid trace into *n* evenly spaced mean values over the run.
+
+    Empty buckets carry the previous value forward so the chart has no gaps;
+    the fixed length lets base and head overlay by index in the PR comment.
+    """
+    if not samples or duration_s <= 0 or n <= 0:
+        return []
+    buckets: list[list[float]] = [[] for _ in range(n)]
+    for s in samples:
+        idx = min(int(s.t / duration_s * n), n - 1)
+        buckets[idx].append(s.grid)
+    out: list[float] = []
+    last = 0.0
+    for bucket in buckets:
+        if bucket:
+            last = sum(bucket) / len(bucket)
+        out.append(round(last, 1))
+    return out
+
+
 def _compute_metrics(
     scenario: Scenario,
     seed: int,
@@ -452,6 +486,7 @@ def in_transient(t: float) -> bool:
         "avoidable_import_wh": round(avoid_import_wh, 1),
         "avoidable_export_wh": round(avoid_export_wh, 1),
         "battery_travel_w_per_h": round(travel_w / duration_h, 0),
+        "grid_trace": _downsample_grid(samples, scenario.duration_s),
     }
 
 
@@ -694,6 +729,66 @@ def add(s: Scenario) -> None:
     "battery_travel_w_per_h",
 ]
 
+# Short, human-readable description for each metric in `_REPORT_METRICS`,
+# rendered as a collapsible glossary in the CI PR comment. Keep in sync with
+# `_REPORT_METRICS` and the metric computation in `_score()`.
+_METRIC_GLOSSARY = [
+    (
+        "settle_mean_s",
+        f"Mean seconds after a load/PV step for grid power to return inside the "
+        f"±{SETTLE_BAND_W:g} W settle band and hold for {SETTLE_HOLD_S:g} s "
+        f"(reaction speed).",
+    ),
+    (
+        "settle_p95_s",
+        "95th-percentile settle time — the slow tail of reactions.",
+    ),
+    (
+        "unsettled_events",
+        f"Number of disturbance events that never settled within the "
+        f"{EVENT_WINDOW_S / 60:g}-minute measurement window.",
+    ),
+    (
+        "overshoot_mean_w",
+        "Mean overshoot (W): how far grid power swings past zero to the "
+        "opposite sign after an event.",
+    ),
+    (
+        "overshoot_max_w",
+        "Worst-case overshoot (W) across all events.",
+    ),
+    (
+        "band_crossings_per_h",
+        f"Sign flips per hour across the ±{OSC_BAND_W:g} W hysteresis band — "
+        f"oscillation / hunting.",
+    ),
+    (
+        "steady_rms_w",
+        f"RMS grid power (W) during steady state (excluding the "
+        f"{STEADY_EXCLUDE_S:g} s after each event) — residual jitter when "
+        f"nothing is changing.",
+    ),
+    (
+        "mean_abs_grid_w",
+        "Mean absolute grid power (W) over the whole run — overall tracking accuracy.",
+    ),
+    (
+        "avoidable_import_wh",
+        "Energy imported from the grid (Wh) the battery could have supplied "
+        "(it had charge and discharge headroom) — missed self-consumption.",
+    ),
+    (
+        "avoidable_export_wh",
+        "Energy exported to the grid (Wh) an AC-chargeable battery could have "
+        "absorbed (it had room and charge headroom) — missed charging.",
+    ),
+    (
+        "battery_travel_w_per_h",
+        "Total absolute change in battery setpoints per hour (W/h) — control "
+        "effort / actuator wear; lower is smoother.",
+    ),
+]
+
 
 def render_text(results: list[dict]) -> str:
     lines = []
@@ -724,7 +819,15 @@ def render_markdown_compare(base: list[dict], head: list[dict]) -> str:
         "Lower is better for every metric. See "
         "`src/astrameter/simulator/evaluation.py` for definitions.",
         "",
+        "<details><summary><b>What do these metrics mean?</b></summary>",
+        "",
+        "| Metric | Meaning |",
+        "|---|---|",
     ]
+    out.extend(f"| `{key}` | {desc} |" for key, desc in _METRIC_GLOSSARY)
+    out.append("")
+    out.append("</details>")
+    out.append("")
     for res in head:
         b = base_by.get(res["scenario"])
         out.append(
@@ -739,6 +842,7 @@ def render_markdown_compare(base: list[dict], head: list[dict]) -> str:
             delta = _fmt_delta(float(b[key]), float(res[key])) if b else "—"
             out.append(f"| {key} | {bv} | {res[key]} | {delta} |")
         out.append("")
+        out.extend(_grid_chart(b, res))
         out.append("</details>")
     missing = [
         r["scenario"]
@@ -751,6 +855,41 @@ def render_markdown_compare(base: list[dict], head: list[dict]) -> str:
     return "\n".join(out)
 
 
+def _grid_chart(base: dict | None, head: dict) -> list[str]:
+    """Mermaid ``xychart`` of grid power over time, base vs head overlaid.
+
+    Renders nothing when the head result predates the trace (older JSON
+    artifacts), so the comment stays valid across mixed-version runs.
+    """
+    head_trace = head.get("grid_trace") or []
+    if not head_trace:
+        return []
+    base_trace = (base or {}).get("grid_trace") or []
+    duration_min = round(float(head.get("duration_h", 0.0)) * 60)
+    caption = (
+        "Grid power over time (W) — line 1 = base, line 2 = head:"
+        if base_trace
+        else "Grid power over time (W) — head:"
+    )
+    lines = [
+        caption,
+        "",
+        "```mermaid",
+        f'%%{{init: {{"xyChart": {{"width": {GRAPH_WIDTH}, '
+        f'"height": {GRAPH_HEIGHT}}}}}}}%%',
+        "xychart-beta",
+        '    title "grid power (W)"',
+        f'    x-axis "minutes" 0 --> {duration_min}',
+        '    y-axis "W"',
+    ]
+    if base_trace:
+        lines.append(f"    line [{', '.join(f'{v:g}' for v in base_trace)}]")
+    lines.append(f"    line [{', '.join(f'{v:g}' for v in head_trace)}]")
+    lines.append("```")
+    lines.append("")
+    return lines
+
+
 def _summary_line(base: dict | None, head: dict) -> str:
     parts = [
         f"settle {head['settle_mean_s']}s",
diff --git a/tests/test_steering_eval.py b/tests/test_steering_eval.py
@@ -12,6 +12,9 @@
 import pytest
 
 from astrameter.simulator.evaluation import (
+    _METRIC_GLOSSARY,
+    _REPORT_METRICS,
+    GRAPH_POINTS,
     BatterySpec,
     Event,
     Scenario,
@@ -104,6 +107,25 @@ def test_markdown_compare_renders():
     md = render_markdown_compare([base], [res])
     assert "| overshoot_max_w |" in md
     assert "tiny" in md
+    # The collapsible metric glossary is included with a row per metric.
+    assert "What do these metrics mean?" in md
+    for key in _REPORT_METRICS:
+        assert f"| `{key}` |" in md
+    # Each scenario embeds a Mermaid grid-power chart with a base and head line.
+    assert "```mermaid" in md
+    assert "xychart-beta" in md
+    assert md.count("    line [") == 2
+
+
+def test_grid_trace_is_downsampled_to_fixed_length():
+    res = asyncio.run(run_scenario(_tiny_scenario(), seed=3))
+    assert len(res["grid_trace"]) == GRAPH_POINTS
+    assert all(isinstance(v, float) for v in res["grid_trace"])
+
+
+def test_metric_glossary_covers_every_reported_metric():
+    glossary_keys = [key for key, _ in _METRIC_GLOSSARY]
+    assert glossary_keys == _REPORT_METRICS
 
 
 @pytest.mark.parametrize("name", ["single_venus_steps"])