Skip to content

Commit 122a4c4

Browse files
tomquistclaude
andauthored
Add grid-power traces and metric glossary to eval PR comments (#468)
* Add collapsible metric glossary to steering-eval PR comment * Add per-scenario grid-power chart to steering-eval PR comment Export a downsampled grid-power trace from each scenario run and render it as a Mermaid xychart (base vs head overlaid) inside the scenario's details block in the CI comment. * Increase steering-eval chart resolution to 1800 points * Enlarge steering-eval chart canvas for thinner lines and resolution Mermaid xychart has no line stroke-width config, so set a larger 1800x600 canvas via an init directive; GitHub scales the SVG to comment width, rendering the line proportionally thinner and giving 1800 points room to resolve. --------- Co-authored-by: Claude <noreply@anthropic.com>
1 parent d6327db commit 122a4c4

2 files changed

Lines changed: 164 additions & 3 deletions

File tree

src/astrameter/simulator/evaluation.py

Lines changed: 142 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@
2222
uv run python -m astrameter.simulator.evaluation --compare base.json \\
2323
--input head.json
2424
25-
``--compare`` renders a Markdown before/after table; CI runs the suite on
26-
the PR base and head and posts that comparison as a sticky PR comment (see
27-
``.github/workflows/ci.yml``, job ``steering-eval``).
25+
``--compare`` renders a Markdown before/after table — including a Mermaid
26+
chart of each scenario's grid-power trace (base vs head) — and CI runs the
27+
suite on the PR base and head and posts that comparison as a sticky PR
28+
comment (see ``.github/workflows/ci.yml``, job ``steering-eval``).
2829
"""
2930

3031
from __future__ import annotations
@@ -68,6 +69,16 @@
6869
HEADROOM_MARGIN_W = 5.0
6970
SOC_EMPTY = 0.02
7071
SOC_FULL = 0.98
72+
# Number of points each scenario's grid-power trace is downsampled to for the
73+
# Mermaid chart in the CI PR comment. Base and head share this fixed count so
74+
# the two lines align by index regardless of poll cadence.
75+
GRAPH_POINTS = 1800
76+
# Mermaid's plot line has a fixed pixel stroke width with no config knob, so we
77+
# enlarge the chart canvas instead: GitHub scales the SVG down to the comment
78+
# width, which renders the line proportionally thinner — and the extra width
79+
# gives the GRAPH_POINTS-point trace room to resolve instead of blobbing together.
80+
GRAPH_WIDTH = 1800
81+
GRAPH_HEIGHT = 600
7182

7283

7384
# ---------------------------------------------------------------------------
@@ -341,6 +352,29 @@ def _settle_time(samples: list[_Sample], start: float, end: float) -> float | No
341352
return None
342353

343354

355+
def _downsample_grid(
    samples: list[_Sample], duration_s: float, n: int = GRAPH_POINTS
) -> list[float]:
    """Bucket the grid trace into *n* evenly spaced mean values over the run.

    Each sample is assigned to a bucket from the ratio ``s.t / duration_s``;
    timestamps that fall outside the run are clamped to the first or last
    bucket. Empty buckets carry the previous value forward so the chart has
    no gaps, and empty buckets ahead of the first populated one take that
    first value rather than a made-up 0 W reading. The fixed length lets base
    and head overlay by index in the PR comment.

    Returns an empty list when there are no samples or when *duration_s* or
    *n* is not positive; otherwise exactly *n* floats rounded to one decimal.
    """
    if not samples or duration_s <= 0 or n <= 0:
        return []
    totals = [0.0] * n
    counts = [0] * n
    for s in samples:
        # Clamp on both sides: t == duration_s would index one past the end,
        # and a negative t would otherwise wrap around to the tail buckets
        # through Python's negative indexing.
        idx = max(0, min(int(s.t / duration_s * n), n - 1))
        totals[idx] += s.grid
        counts[idx] += 1
    means = [
        total / count if count else None
        for total, count in zip(totals, counts)
    ]
    # ``samples`` is non-empty and every index is clamped into range, so at
    # least one bucket is populated; seed the carry-forward with it.
    last = next(mean for mean in means if mean is not None)
    out: list[float] = []
    for mean in means:
        if mean is not None:
            last = mean
        out.append(round(last, 1))
    return out
376+
377+
344378
def _compute_metrics(
345379
scenario: Scenario,
346380
seed: int,
@@ -452,6 +486,7 @@ def in_transient(t: float) -> bool:
452486
"avoidable_import_wh": round(avoid_import_wh, 1),
453487
"avoidable_export_wh": round(avoid_export_wh, 1),
454488
"battery_travel_w_per_h": round(travel_w / duration_h, 0),
489+
"grid_trace": _downsample_grid(samples, scenario.duration_s),
455490
}
456491

457492

@@ -694,6 +729,66 @@ def add(s: Scenario) -> None:
694729
"battery_travel_w_per_h",
695730
]
696731

732+
# Short, human-readable description for each metric in `_REPORT_METRICS`,
# rendered as a collapsible glossary in the CI PR comment. Keep in sync with
# `_REPORT_METRICS` and the metric computation in `_score()`.
#
# Each entry is a ``(metric_key, description)`` pair. The descriptions are
# built once, when this module-level literal is evaluated, so the thresholds
# they quote (settle band, hold time, event window, ...) are read straight
# from the tuning constants rather than being retyped here.
#
# Order matters: the test suite compares the list of keys against
# `_REPORT_METRICS` with ``==``, so entries must appear in the same order.
# NOTE(review): each description is emitted as a Markdown table cell, so a
# literal ``|`` in the text would presumably split the row — avoid it.
_METRIC_GLOSSARY = [
    (
        "settle_mean_s",
        f"Mean seconds after a load/PV step for grid power to return inside the "
        f"±{SETTLE_BAND_W:g} W settle band and hold for {SETTLE_HOLD_S:g} s "
        f"(reaction speed).",
    ),
    (
        "settle_p95_s",
        "95th-percentile settle time — the slow tail of reactions.",
    ),
    (
        "unsettled_events",
        f"Number of disturbance events that never settled within the "
        f"{EVENT_WINDOW_S / 60:g}-minute measurement window.",
    ),
    (
        "overshoot_mean_w",
        "Mean overshoot (W): how far grid power swings past zero to the "
        "opposite sign after an event.",
    ),
    (
        "overshoot_max_w",
        "Worst-case overshoot (W) across all events.",
    ),
    (
        "band_crossings_per_h",
        f"Sign flips per hour across the ±{OSC_BAND_W:g} W hysteresis band — "
        f"oscillation / hunting.",
    ),
    (
        "steady_rms_w",
        f"RMS grid power (W) during steady state (excluding the "
        f"{STEADY_EXCLUDE_S:g} s after each event) — residual jitter when "
        f"nothing is changing.",
    ),
    (
        "mean_abs_grid_w",
        "Mean absolute grid power (W) over the whole run — overall tracking accuracy.",
    ),
    (
        "avoidable_import_wh",
        "Energy imported from the grid (Wh) the battery could have supplied "
        "(it had charge and discharge headroom) — missed self-consumption.",
    ),
    (
        "avoidable_export_wh",
        "Energy exported to the grid (Wh) an AC-chargeable battery could have "
        "absorbed (it had room and charge headroom) — missed charging.",
    ),
    (
        "battery_travel_w_per_h",
        "Total absolute change in battery setpoints per hour (W/h) — control "
        "effort / actuator wear; lower is smoother.",
    ),
]
791+
697792

698793
def render_text(results: list[dict]) -> str:
699794
lines = []
@@ -724,7 +819,15 @@ def render_markdown_compare(base: list[dict], head: list[dict]) -> str:
724819
"Lower is better for every metric. See "
725820
"`src/astrameter/simulator/evaluation.py` for definitions.",
726821
"",
822+
"<details><summary><b>What do these metrics mean?</b></summary>",
823+
"",
824+
"| Metric | Meaning |",
825+
"|---|---|",
727826
]
827+
out.extend(f"| `{key}` | {desc} |" for key, desc in _METRIC_GLOSSARY)
828+
out.append("")
829+
out.append("</details>")
830+
out.append("")
728831
for res in head:
729832
b = base_by.get(res["scenario"])
730833
out.append(
@@ -739,6 +842,7 @@ def render_markdown_compare(base: list[dict], head: list[dict]) -> str:
739842
delta = _fmt_delta(float(b[key]), float(res[key])) if b else "—"
740843
out.append(f"| {key} | {bv} | {res[key]} | {delta} |")
741844
out.append("")
845+
out.extend(_grid_chart(b, res))
742846
out.append("</details>")
743847
missing = [
744848
r["scenario"]
@@ -751,6 +855,41 @@ def render_markdown_compare(base: list[dict], head: list[dict]) -> str:
751855
return "\n".join(out)
752856

753857

858+
def _grid_chart(base: dict | None, head: dict) -> list[str]:
    """Build the Markdown lines for a Mermaid ``xychart`` of grid power.

    The head trace is always plotted; the base trace is drawn first as an
    extra line when the base result carries one. An empty list is returned
    when the head result has no ``grid_trace`` (for example JSON written
    before the trace existed), so the comment stays valid across
    mixed-version runs.
    """
    head_series = head.get("grid_trace") or []
    if not head_series:
        return []
    base_series = (base or {}).get("grid_trace") or []
    minutes = round(float(head.get("duration_h", 0.0)) * 60)

    def series_row(values: list) -> str:
        # One Mermaid ``line`` statement holding every point of a trace.
        return f" line [{', '.join(f'{v:g}' for v in values)}]"

    if base_series:
        caption = "Grid power over time (W) — line 1 = base, line 2 = head:"
    else:
        caption = "Grid power over time (W) — head:"

    chart = [
        caption,
        "",
        "```mermaid",
        # Init directive that enlarges the canvas; braces are doubled to
        # survive f-string formatting.
        f'%%{{init: {{"xyChart": {{"width": {GRAPH_WIDTH}, '
        f'"height": {GRAPH_HEIGHT}}}}}}}%%',
        "xychart-beta",
        ' title "grid power (W)"',
        f' x-axis "minutes" 0 --> {minutes}',
        ' y-axis "W"',
    ]
    if base_series:
        chart.append(series_row(base_series))
    chart += [series_row(head_series), "```", ""]
    return chart
891+
892+
754893
def _summary_line(base: dict | None, head: dict) -> str:
755894
parts = [
756895
f"settle {head['settle_mean_s']}s",

tests/test_steering_eval.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
import pytest
1313

1414
from astrameter.simulator.evaluation import (
15+
_METRIC_GLOSSARY,
16+
_REPORT_METRICS,
17+
GRAPH_POINTS,
1518
BatterySpec,
1619
Event,
1720
Scenario,
@@ -104,6 +107,25 @@ def test_markdown_compare_renders():
104107
md = render_markdown_compare([base], [res])
105108
assert "| overshoot_max_w |" in md
106109
assert "tiny" in md
110+
# The collapsible metric glossary is included with a row per metric.
111+
assert "What do these metrics mean?" in md
112+
for key in _REPORT_METRICS:
113+
assert f"| `{key}` |" in md
114+
# Each scenario embeds a Mermaid grid-power chart with a base and head line.
115+
assert "```mermaid" in md
116+
assert "xychart-beta" in md
117+
assert md.count(" line [") == 2
118+
119+
120+
def test_grid_trace_is_downsampled_to_fixed_length():
    """A scenario result carries a grid trace of exactly GRAPH_POINTS floats."""
    result = asyncio.run(run_scenario(_tiny_scenario(), seed=3))
    trace = result["grid_trace"]
    assert len(trace) == GRAPH_POINTS
    assert all(isinstance(point, float) for point in trace)
124+
125+
126+
def test_metric_glossary_covers_every_reported_metric():
    """The glossary lists the reported metrics, in the same order."""
    assert [entry[0] for entry in _METRIC_GLOSSARY] == _REPORT_METRICS
107129

108130

109131
@pytest.mark.parametrize("name", ["single_venus_steps"])

0 commit comments

Comments
 (0)