Skip to content

Commit 707c2a3

Browse files
committed
feat: detect and flag infrastructure failures in trend reports
When Bedrock is unavailable during evaluation (throttling, service outages), runs produce 0 test passes and the trend report incorrectly signals a regression. This change adds infrastructure failure detection so the gate skips unreliable runs instead of firing false regressions.

- Add InfraFailureReason enum and InfraFailure dataclass to models
- Preserve individual error type counts (throttle, service_unavailable, model_error) instead of collapsing to a single error_count
- Read actual server_started/server_error from contract test results instead of hardcoding server_startup_success=True
- Add detect_infra_failure() with conservative detection logic
- Gate passes with annotation when latest run is an infra failure
- Gate falls back to older non-infra run when comparison is infra failure
- Add prominent warning banner to both MD and HTML trend reports
- Expand Section F with per-error-type columns and infra failure flag
- Serialize InfraFailureReason in YAML output via generalized Enum handler
1 parent a756b71 commit 707c2a3

15 files changed

Lines changed: 774 additions & 90 deletions

File tree

scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
from trend_reports.models import (
2020
BaselineMetrics,
2121
GateResult,
22+
InfraFailure,
23+
InfraFailureReason,
2224
RunData,
2325
RunType,
2426
SemVer,
@@ -33,6 +35,8 @@
3335
__all__ = [
3436
"BaselineMetrics",
3537
"GateResult",
38+
"InfraFailure",
39+
"InfraFailureReason",
3640
"RunData",
3741
"RunType",
3842
"SemVer",

scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__main__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,11 @@ def cmd_trend(
217217
# 5. Gate
218218
if gate:
219219
result = check_regressions(trend)
220+
if result.infra_failure_detected:
221+
print(
222+
f"Gate WARNING: {result.infra_failure_summary}",
223+
file=sys.stderr,
224+
)
220225
if result.passed:
221226
print(
222227
f"Gate PASSED: {result.latest_label} vs {result.comparison_label} "

scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/collector.py

Lines changed: 91 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
ContractTestResults,
1919
DocumentScore,
2020
HandoffMetrics,
21+
InfraFailure,
22+
InfraFailureReason,
2123
QualitativeComparison,
2224
RunConfig,
2325
RunData,
@@ -147,14 +149,20 @@ def parse_run_metrics(yaml_path: Path) -> RunMetrics:
147149

148150
hp = raw.get("handoff_patterns", {})
149151
errors = raw.get("errors", {})
152+
throttle_events = errors.get("throttle_events", 0)
153+
timeout_events = errors.get("timeout_events", 0)
154+
failed_tool_calls = errors.get("failed_tool_calls", 0)
155+
model_error_events = errors.get("model_error_events", 0)
156+
service_unavailable_events = errors.get("service_unavailable_events", 0)
157+
validation_error_events = errors.get("validation_error_events", 0)
150158
error_count = sum(
151159
[
152-
errors.get("throttle_events", 0),
153-
errors.get("timeout_events", 0),
154-
errors.get("failed_tool_calls", 0),
155-
errors.get("model_error_events", 0),
156-
errors.get("service_unavailable_events", 0),
157-
errors.get("validation_error_events", 0),
160+
throttle_events,
161+
timeout_events,
162+
failed_tool_calls,
163+
model_error_events,
164+
service_unavailable_events,
165+
validation_error_events,
158166
]
159167
)
160168

@@ -175,6 +183,12 @@ def parse_run_metrics(yaml_path: Path) -> RunMetrics:
175183
handoffs=handoffs,
176184
server_startup_success=True,
177185
error_count=error_count,
186+
throttle_events=throttle_events,
187+
service_unavailable_events=service_unavailable_events,
188+
model_error_events=model_error_events,
189+
timeout_events=timeout_events,
190+
failed_tool_calls=failed_tool_calls,
191+
validation_error_events=validation_error_events,
178192
)
179193

180194

@@ -215,12 +229,17 @@ def parse_contract_tests(yaml_path: Path) -> ContractTestResults:
215229
)
216230
)
217231

232+
server_started = raw.get("server_started", True)
233+
server_error = raw.get("server_error") or ""
234+
218235
return ContractTestResults(
219236
total=total,
220237
passed=passed,
221238
failed=failed,
222239
pass_rate=pass_rate,
223240
failures=failures,
241+
server_started=server_started,
242+
server_error=server_error,
224243
)
225244

226245

@@ -301,6 +320,59 @@ def classify_run(rules_ref: str) -> tuple[RunType, str, SemVer | None, int | Non
301320
return RunType.RELEASE, rules_ref, None, None
302321

303322

323+
# ---------------------------------------------------------------------------
324+
# Infrastructure failure detection
325+
# ---------------------------------------------------------------------------
326+
327+
328+
def detect_infra_failure(
    meta: RunMeta,
    metrics: RunMetrics,
    contract_tests: ContractTestResults,
    has_metrics_file: bool,
) -> InfraFailure:
    """Classify a run as an infrastructure failure based on its recorded signals.

    Four independent signals are inspected, and every one that fires adds a
    reason (in this order):

    1. Non-zero throttle / service-unavailable / model-error counters on
       ``metrics``.
    2. ``meta.status`` containing ``"failed"`` (case-insensitive), or being
       empty / whitespace-only / falsy.
    3. ``has_metrics_file`` being false.
    4. ``contract_tests.server_started`` being false.

    Args:
        meta: Run metadata; only ``status`` is read.
        metrics: Run metrics; only the three error counters above are read.
        contract_tests: Contract test results; only ``server_started`` is read.
        has_metrics_file: Whether a run-metrics file was found for the run.

    Returns:
        ``InfraFailure(is_infra_failure=False)`` when no signal fires;
        otherwise an ``InfraFailure`` carrying the collected reasons and a
        summary string listing their values.
    """
    # NOTE(review): a single throttle or model error event is enough to flag
    # the run, even if the run otherwise completed — confirm this threshold
    # is intended.
    counter_signals = (
        (metrics.throttle_events, InfraFailureReason.THROTTLED),
        (metrics.service_unavailable_events, InfraFailureReason.SERVICE_UNAVAILABLE),
        (metrics.model_error_events, InfraFailureReason.MODEL_ERROR),
    )
    detected: list[InfraFailureReason] = [
        reason for count, reason in counter_signals if count > 0
    ]

    # A falsy status is treated the same as an empty string.
    status_text = meta.status or ""
    if "failed" in status_text.lower():
        detected.append(InfraFailureReason.RUN_FAILED)
    elif not status_text.strip():
        # No status recorded at all: treated as a crash.
        detected.append(InfraFailureReason.RUN_CRASHED)

    if not has_metrics_file:
        detected.append(InfraFailureReason.METRICS_MISSING)

    if not contract_tests.server_started:
        detected.append(InfraFailureReason.SERVER_START_FAILED)

    if detected:
        joined = ", ".join(reason.value for reason in detected)
        return InfraFailure(
            is_infra_failure=True,
            reasons=detected,
            summary=f"Infrastructure failure detected: {joined}",
        )

    return InfraFailure(is_infra_failure=False)
374+
375+
304376
# ---------------------------------------------------------------------------
305377
# Collection pipeline
306378
# ---------------------------------------------------------------------------
@@ -319,11 +391,8 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
319391
meta = parse_run_meta(yaml_files["run-meta"])
320392
run_type, label, semver, pr_number = classify_run(meta.config.rules_ref)
321393

322-
metrics = (
323-
parse_run_metrics(yaml_files["run-metrics"])
324-
if "run-metrics" in yaml_files
325-
else RunMetrics()
326-
)
394+
has_metrics_file = "run-metrics" in yaml_files
395+
metrics = parse_run_metrics(yaml_files["run-metrics"]) if has_metrics_file else RunMetrics()
327396
unit_tests = (
328397
parse_test_results(yaml_files["test-results"])
329398
if "test-results" in yaml_files
@@ -334,6 +403,10 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
334403
if "contract-test-results" in yaml_files
335404
else ContractTestResults()
336405
)
406+
407+
# Propagate actual server_started to metrics
408+
metrics.server_startup_success = contract_tests.server_started
409+
337410
code_quality = (
338411
parse_quality_report(yaml_files["quality-report"])
339412
if "quality-report" in yaml_files
@@ -346,13 +419,18 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
346419
)
347420

348421
# Backfill artifact counts from run-metrics if available
349-
if "run-metrics" in yaml_files:
422+
if has_metrics_file:
350423
raw_metrics = _load_yaml(yaml_files["run-metrics"])
351424
workspace = raw_metrics.get("artifacts", {}).get("workspace", {})
352425
code_quality.source_file_count = workspace.get("source_files", 0)
353426
code_quality.test_file_count = workspace.get("test_files", 0)
354427
code_quality.total_lines_of_code = workspace.get("total_lines_of_code", 0)
355428

429+
# Detect infrastructure failures
430+
infra_failure = detect_infra_failure(meta, metrics, contract_tests, has_metrics_file)
431+
if infra_failure.is_infra_failure:
432+
logger.warning("Infra failure detected in %s: %s", source_label, infra_failure.summary)
433+
356434
return RunData(
357435
label=label,
358436
run_type=run_type,
@@ -364,6 +442,7 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
364442
contract_tests=contract_tests,
365443
code_quality=code_quality,
366444
qualitative=qualitative,
445+
infra_failure=infra_failure,
367446
)
368447

369448

scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/gate.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ def check_regressions(trend: TrendData) -> GateResult:
1212
- Contract test pass rate decreased
1313
- Unit test failures appeared (> 0 when previous had 0)
1414
- Qualitative overall score decreased by more than 0.02
15+
16+
If the latest run is an infrastructure failure, the gate passes with
17+
an annotation — results from infra-failed runs are unreliable.
1518
"""
1619
latest, previous = find_latest_and_previous(trend)
1720
if latest is None or previous is None:
@@ -22,6 +25,36 @@ def check_regressions(trend: TrendData) -> GateResult:
2225
comparison_label=previous.label if previous else "",
2326
)
2427

28+
# If the latest run is an infra failure, skip regression checks
29+
if latest.infra_failure.is_infra_failure:
30+
return GateResult(
31+
passed=True,
32+
regressions=[],
33+
latest_label=latest.label,
34+
comparison_label=previous.label,
35+
infra_failure_detected=True,
36+
infra_failure_summary=(
37+
f"Latest run ({latest.label}) was an infrastructure failure: "
38+
f"{latest.infra_failure.summary}. "
39+
"Regression check skipped — results are unreliable."
40+
),
41+
)
42+
43+
# If the comparison run is an infra failure, find a non-infra alternative
44+
if previous.infra_failure.is_infra_failure:
45+
previous = _find_non_infra_previous(trend, latest)
46+
if previous is None:
47+
return GateResult(
48+
passed=True,
49+
regressions=[],
50+
latest_label=latest.label,
51+
comparison_label="",
52+
infra_failure_detected=True,
53+
infra_failure_summary=(
54+
"No non-infra-failure comparison run available. Regression check skipped."
55+
),
56+
)
57+
2558
regressions: list[str] = []
2659

2760
# Contract test regression
@@ -52,6 +85,27 @@ def check_regressions(trend: TrendData) -> GateResult:
5285
)
5386

5487

88+
def _find_non_infra_previous(trend: TrendData, latest: RunData) -> RunData | None:
    """Find the most recent non-infra-failure run suitable for comparison.

    Walks ``trend.runs`` from the end towards the start, skipping ``latest``
    itself (by identity) and any run flagged as an infrastructure failure.
    The first RELEASE run encountered wins; if there is none, the first
    non-infra run of any type encountered is used instead.

    The release preference applies regardless of ``latest.run_type``.

    Args:
        trend: Trend data whose ``runs`` are searched; presumably ordered
            oldest to newest — verify against the collector.
        latest: The run being gated; never returned as its own comparison.

    Returns:
        The chosen comparison run, or ``None`` when every other run is an
        infrastructure failure (or there are no other runs).
    """
    fallback: RunData | None = None

    for run in reversed(trend.runs):
        if run is latest or run.infra_failure.is_infra_failure:
            continue
        if run.run_type == RunType.RELEASE:
            return run
        if fallback is None:
            # Remember the newest usable non-release run in case no
            # release qualifies.
            fallback = run

    return fallback
107+
108+
55109
def find_latest_and_previous(
56110
trend: TrendData,
57111
) -> tuple[RunData | None, RunData | None]:

scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/models.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,27 @@ class RunType(Enum):
3434
PR = "pr"
3535

3636

37+
class InfraFailureReason(Enum):
    """Closed set of causes for treating a run as an infrastructure failure.

    Each member's value is the string identifier used when the reason is
    rendered, for example in an infra-failure summary.
    """

    # Error counters reported in the run metrics.
    THROTTLED = "bedrock_throttled"
    SERVICE_UNAVAILABLE = "bedrock_service_unavailable"
    MODEL_ERROR = "bedrock_model_error"

    # Run status problems.
    RUN_FAILED = "run_failed"
    RUN_CRASHED = "run_crashed"

    # Missing or failed run artifacts.
    SERVER_START_FAILED = "server_start_failed"
    METRICS_MISSING = "metrics_missing"
47+
48+
49+
@dataclass
class InfraFailure:
    """Outcome of infrastructure-failure detection for a single run.

    A default-constructed instance represents "no infrastructure failure".
    """

    # True when at least one infrastructure signal fired for the run.
    is_infra_failure: bool = False
    # The reasons that fired; each instance gets its own fresh list.
    reasons: list[InfraFailureReason] = field(default_factory=list)
    # Human-readable description; empty when no failure was detected.
    summary: str = ""
56+
57+
3758
@dataclass(frozen=True, order=True)
3859
class SemVer:
3960
"""Semantic version, comparable via tuple ordering."""
@@ -119,6 +140,12 @@ class RunMetrics:
119140
handoffs: list[HandoffMetrics] = field(default_factory=list)
120141
server_startup_success: bool = True
121142
error_count: int = 0
143+
throttle_events: int = 0
144+
service_unavailable_events: int = 0
145+
model_error_events: int = 0
146+
timeout_events: int = 0
147+
failed_tool_calls: int = 0
148+
validation_error_events: int = 0
122149

123150

124151
@dataclass
@@ -152,6 +179,8 @@ class ContractTestResults:
152179
failed: int = 0
153180
pass_rate: float = 0.0
154181
failures: list[ContractTestFailure] = field(default_factory=list)
182+
server_started: bool = True
183+
server_error: str = ""
155184

156185

157186
@dataclass
@@ -210,6 +239,7 @@ class RunData:
210239
contract_tests: ContractTestResults
211240
code_quality: CodeQualityMetrics
212241
qualitative: QualitativeComparison
242+
infra_failure: InfraFailure = field(default_factory=InfraFailure)
213243

214244

215245
@dataclass
@@ -258,3 +288,5 @@ class GateResult:
258288
regressions: list[str] = field(default_factory=list)
259289
latest_label: str = ""
260290
comparison_label: str = ""
291+
infra_failure_detected: bool = False
292+
infra_failure_summary: str = ""

0 commit comments

Comments
 (0)