2222 uv run python -m astrameter.simulator.evaluation --compare base.json \\
2323 --input head.json
2424
25- ``--compare`` renders a Markdown before/after table; CI runs the suite on
26- the PR base and head and posts that comparison as a sticky PR comment (see
27- ``.github/workflows/ci.yml``, job ``steering-eval``).
25+ ``--compare`` renders a Markdown before/after table — including a Mermaid
26+ chart of each scenario's grid-power trace (base vs head) — and CI runs the
27+ suite on the PR base and head and posts that comparison as a sticky PR
28+ comment (see ``.github/workflows/ci.yml``, job ``steering-eval``).
2829"""
2930
3031from __future__ import annotations
6869HEADROOM_MARGIN_W = 5.0
6970SOC_EMPTY = 0.02
7071SOC_FULL = 0.98
# Number of points each scenario's grid-power trace is downsampled to for the
# Mermaid chart in the CI PR comment (the default bucket count of
# `_downsample_grid`). Base and head share this fixed count so the two lines
# align by index regardless of poll cadence.
# NOTE(review): every chart inlines up to 2 x GRAPH_POINTS numbers into the
# comment body — confirm the rendered comment stays within GitHub's size limit
# when several scenarios are reported.
GRAPH_POINTS = 1800
# Canvas size passed to Mermaid's `xyChart` init directive by `_grid_chart`.
# Mermaid's plot line has a fixed pixel stroke width with no config knob, so we
# enlarge the chart canvas instead: GitHub scales the SVG down to the comment
# width, which renders the line proportionally thinner — and the extra width
# gives the GRAPH_POINTS points room to resolve instead of blobbing together.
GRAPH_WIDTH = 1800
GRAPH_HEIGHT = 600
7182
7283
7384# ---------------------------------------------------------------------------
@@ -341,6 +352,29 @@ def _settle_time(samples: list[_Sample], start: float, end: float) -> float | No
341352 return None
342353
343354
def _downsample_grid(
    samples: list[_Sample], duration_s: float, n: int = GRAPH_POINTS
) -> list[float]:
    """Bucket the grid trace into *n* evenly spaced mean values over the run.

    Each sample is assigned to a bucket from its timestamp ``t`` relative to
    ``duration_s``; the value reported for a bucket is the mean of the
    ``grid`` readings that landed in it, rounded to one decimal.

    Empty buckets carry the previous value forward so the chart has no gaps
    (buckets before the first populated one report ``0.0``); the fixed length
    lets base and head overlay by index in the PR comment.

    Args:
        samples: Recorded samples; only their ``t`` and ``grid`` attributes
            are read.
        duration_s: Length of the run in seconds, used to scale ``t`` onto
            the bucket range.
        n: Number of buckets, i.e. the length of the returned list.

    Returns:
        A list of exactly *n* floats, or ``[]`` when there are no samples or
        when ``duration_s`` or ``n`` is not positive.
    """
    if not samples or duration_s <= 0 or n <= 0:
        return []
    buckets: list[list[float]] = [[] for _ in range(n)]
    top = n - 1
    for s in samples:
        idx = int(s.t / duration_s * n)
        # Clamp at both ends: t == duration_s maps to n (one past the end),
        # and a sufficiently negative t would otherwise produce a negative
        # index that silently lands in a bucket counted from the end.
        buckets[max(0, min(idx, top))].append(s.grid)
    out: list[float] = []
    last = 0.0
    for bucket in buckets:
        if bucket:
            last = sum(bucket) / len(bucket)
        # An empty bucket repeats the most recent mean (0.0 before any data).
        out.append(round(last, 1))
    return out
376+
377+
344378def _compute_metrics (
345379 scenario : Scenario ,
346380 seed : int ,
@@ -452,6 +486,7 @@ def in_transient(t: float) -> bool:
452486 "avoidable_import_wh" : round (avoid_import_wh , 1 ),
453487 "avoidable_export_wh" : round (avoid_export_wh , 1 ),
454488 "battery_travel_w_per_h" : round (travel_w / duration_h , 0 ),
489+ "grid_trace" : _downsample_grid (samples , scenario .duration_s ),
455490 }
456491
457492
@@ -694,6 +729,66 @@ def add(s: Scenario) -> None:
694729 "battery_travel_w_per_h" ,
695730]
696731
# Short, human-readable description for each metric in `_REPORT_METRICS`,
# rendered as a collapsible glossary in the CI PR comment. Keep in sync with
# `_REPORT_METRICS` and the metric computation in `_score()`.
# NOTE(review): confirm `_score()` is the right name — the metric values
# appear to be assembled in `_compute_metrics`.
#
# Each entry is a `(metric key, description)` pair; `render_markdown_compare`
# emits one Markdown table row per entry. Descriptions that mention a
# threshold are f-strings, so the module-level constant is interpolated when
# this list is built and the wording follows the configured value.
_METRIC_GLOSSARY = [
    (
        "settle_mean_s" ,
        f"Mean seconds after a load/PV step for grid power to return inside the "
        f"±{ SETTLE_BAND_W :g} W settle band and hold for { SETTLE_HOLD_S :g} s "
        f"(reaction speed)." ,
    ),
    (
        "settle_p95_s" ,
        "95th-percentile settle time — the slow tail of reactions." ,
    ),
    (
        "unsettled_events" ,
        f"Number of disturbance events that never settled within the "
        f"{ EVENT_WINDOW_S / 60 :g} -minute measurement window." ,
    ),
    (
        "overshoot_mean_w" ,
        "Mean overshoot (W): how far grid power swings past zero to the "
        "opposite sign after an event." ,
    ),
    (
        "overshoot_max_w" ,
        "Worst-case overshoot (W) across all events." ,
    ),
    (
        "band_crossings_per_h" ,
        f"Sign flips per hour across the ±{ OSC_BAND_W :g} W hysteresis band — "
        f"oscillation / hunting." ,
    ),
    (
        "steady_rms_w" ,
        f"RMS grid power (W) during steady state (excluding the "
        f"{ STEADY_EXCLUDE_S :g} s after each event) — residual jitter when "
        f"nothing is changing." ,
    ),
    (
        "mean_abs_grid_w" ,
        "Mean absolute grid power (W) over the whole run — overall tracking accuracy." ,
    ),
    (
        "avoidable_import_wh" ,
        "Energy imported from the grid (Wh) the battery could have supplied "
        "(it had charge and discharge headroom) — missed self-consumption." ,
    ),
    (
        "avoidable_export_wh" ,
        "Energy exported to the grid (Wh) an AC-chargeable battery could have "
        "absorbed (it had room and charge headroom) — missed charging." ,
    ),
    (
        "battery_travel_w_per_h" ,
        "Total absolute change in battery setpoints per hour (W/h) — control "
        "effort / actuator wear; lower is smoother." ,
    ),
]
791+
697792
698793def render_text (results : list [dict ]) -> str :
699794 lines = []
@@ -724,7 +819,15 @@ def render_markdown_compare(base: list[dict], head: list[dict]) -> str:
724819 "Lower is better for every metric. See "
725820 "`src/astrameter/simulator/evaluation.py` for definitions." ,
726821 "" ,
822+ "<details><summary><b>What do these metrics mean?</b></summary>" ,
823+ "" ,
824+ "| Metric | Meaning |" ,
825+ "|---|---|" ,
727826 ]
827+ out .extend (f"| `{ key } ` | { desc } |" for key , desc in _METRIC_GLOSSARY )
828+ out .append ("" )
829+ out .append ("</details>" )
830+ out .append ("" )
728831 for res in head :
729832 b = base_by .get (res ["scenario" ])
730833 out .append (
@@ -739,6 +842,7 @@ def render_markdown_compare(base: list[dict], head: list[dict]) -> str:
739842 delta = _fmt_delta (float (b [key ]), float (res [key ])) if b else "—"
740843 out .append (f"| { key } | { bv } | { res [key ]} | { delta } |" )
741844 out .append ("" )
845+ out .extend (_grid_chart (b , res ))
742846 out .append ("</details>" )
743847 missing = [
744848 r ["scenario" ]
@@ -751,6 +855,41 @@ def render_markdown_compare(base: list[dict], head: list[dict]) -> str:
751855 return "\n " .join (out )
752856
753857
858+ def _grid_chart (base : dict | None , head : dict ) -> list [str ]:
859+ """Mermaid ``xychart`` of grid power over time, base vs head overlaid.
860+
861+ Renders nothing when the head result predates the trace (older JSON
862+ artifacts), so the comment stays valid across mixed-version runs.
863+ """
864+ head_trace = head .get ("grid_trace" ) or []
865+ if not head_trace :
866+ return []
867+ base_trace = (base or {}).get ("grid_trace" ) or []
868+ duration_min = round (float (head .get ("duration_h" , 0.0 )) * 60 )
869+ caption = (
870+ "Grid power over time (W) — line 1 = base, line 2 = head:"
871+ if base_trace
872+ else "Grid power over time (W) — head:"
873+ )
874+ lines = [
875+ caption ,
876+ "" ,
877+ "```mermaid" ,
878+ f'%%{{init: {{"xyChart": {{"width": { GRAPH_WIDTH } , '
879+ f'"height": { GRAPH_HEIGHT } }}}}}}%%' ,
880+ "xychart-beta" ,
881+ ' title "grid power (W)"' ,
882+ f' x-axis "minutes" 0 --> { duration_min } ' ,
883+ ' y-axis "W"' ,
884+ ]
885+ if base_trace :
886+ lines .append (f" line [{ ', ' .join (f'{ v :g} ' for v in base_trace )} ]" )
887+ lines .append (f" line [{ ', ' .join (f'{ v :g} ' for v in head_trace )} ]" )
888+ lines .append ("```" )
889+ lines .append ("" )
890+ return lines
891+
892+
754893def _summary_line (base : dict | None , head : dict ) -> str :
755894 parts = [
756895 f"settle { head ['settle_mean_s' ]} s" ,
0 commit comments