awslabs
diff --git a/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__main__.py‎
Lines changed: 5 additions & 0 deletions b/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__main__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/collector.py‎
Lines changed: 91 additions & 12 deletions b/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/collector.py‎
Lines changed: 91 additions & 12 deletions
diff --git a/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/gate.py‎
Lines changed: 54 additions & 0 deletions b/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/gate.py‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/models.py‎
Lines changed: 32 additions & 0 deletions b/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/models.py‎
Lines changed: 32 additions & 0 deletions
@@ -19,6 +19,8 @@
 from trend_reports.models import (
     BaselineMetrics,
     GateResult,
+    InfraFailure,
+    InfraFailureReason,
     RunData,
     RunType,
     SemVer,
@@ -33,6 +35,8 @@
 __all__ = [
     "BaselineMetrics",
     "GateResult",
+    "InfraFailure",
+    "InfraFailureReason",
     "RunData",
     "RunType",
     "SemVer",
 
@@ -217,6 +217,11 @@ def cmd_trend(
         # 5. Gate
         if gate:
             result = check_regressions(trend)
+            if result.infra_failure_detected:
+                print(
+                    f"Gate WARNING: {result.infra_failure_summary}",
+                    file=sys.stderr,
+                )
             if result.passed:
                 print(
                     f"Gate PASSED: {result.latest_label} vs {result.comparison_label} "
 
@@ -18,6 +18,8 @@
     ContractTestResults,
     DocumentScore,
     HandoffMetrics,
+    InfraFailure,
+    InfraFailureReason,
     QualitativeComparison,
     RunConfig,
     RunData,
@@ -147,14 +149,20 @@ def parse_run_metrics(yaml_path: Path) -> RunMetrics:
 
     hp = raw.get("handoff_patterns", {})
     errors = raw.get("errors", {})
+    throttle_events = errors.get("throttle_events", 0)
+    timeout_events = errors.get("timeout_events", 0)
+    failed_tool_calls = errors.get("failed_tool_calls", 0)
+    model_error_events = errors.get("model_error_events", 0)
+    service_unavailable_events = errors.get("service_unavailable_events", 0)
+    validation_error_events = errors.get("validation_error_events", 0)
     error_count = sum(
         [
-            errors.get("throttle_events", 0),
-            errors.get("timeout_events", 0),
-            errors.get("failed_tool_calls", 0),
-            errors.get("model_error_events", 0),
-            errors.get("service_unavailable_events", 0),
-            errors.get("validation_error_events", 0),
+            throttle_events,
+            timeout_events,
+            failed_tool_calls,
+            model_error_events,
+            service_unavailable_events,
+            validation_error_events,
         ]
     )
 
@@ -175,6 +183,12 @@ def parse_run_metrics(yaml_path: Path) -> RunMetrics:
         handoffs=handoffs,
         server_startup_success=True,
         error_count=error_count,
+        throttle_events=throttle_events,
+        service_unavailable_events=service_unavailable_events,
+        model_error_events=model_error_events,
+        timeout_events=timeout_events,
+        failed_tool_calls=failed_tool_calls,
+        validation_error_events=validation_error_events,
     )
 
 
@@ -215,12 +229,17 @@ def parse_contract_tests(yaml_path: Path) -> ContractTestResults:
                 )
             )
 
+    server_started = raw.get("server_started", True)
+    server_error = raw.get("server_error") or ""
+
     return ContractTestResults(
         total=total,
         passed=passed,
         failed=failed,
         pass_rate=pass_rate,
         failures=failures,
+        server_started=server_started,
+        server_error=server_error,
     )
 
 
@@ -301,6 +320,59 @@ def classify_run(rules_ref: str) -> tuple[RunType, str, SemVer | None, int | Non
         return RunType.RELEASE, rules_ref, None, None
 
 
+# ---------------------------------------------------------------------------
+# Infrastructure failure detection
+# ---------------------------------------------------------------------------
+
+
+def detect_infra_failure(
+    meta: RunMeta,
+    metrics: RunMetrics,
+    contract_tests: ContractTestResults,
+    has_metrics_file: bool,
+) -> InfraFailure:
+    """Classify a run as an infrastructure failure from its collected signals.
+
+    Conservative by design: only clear infrastructure signals are flagged.
+
+    Args:
+        meta: Parsed run metadata; only ``meta.status`` is read.
+        metrics: Parsed run metrics; the throttle, service-unavailable and
+            model-error event counters are read.
+        contract_tests: Contract test results; only ``server_started`` is read.
+        has_metrics_file: Whether run-metrics.yaml was present for the run.
+
+    Returns:
+        An ``InfraFailure`` with ``is_infra_failure=False`` when no signal
+        fires; otherwise one carrying every triggered reason (in check order)
+        and a summary listing the reason values separated by ", ".
+    """
+    # Normalise once: None, empty and whitespace-only statuses all become "".
+    status = (meta.status or "").strip().lower()
+
+    # (fired, reason) pairs; list order is the order reasons are reported in.
+    signals = [
+        # Bedrock infra errors counted in run-metrics.yaml
+        (metrics.throttle_events > 0, InfraFailureReason.THROTTLED),
+        (metrics.service_unavailable_events > 0, InfraFailureReason.SERVICE_UNAVAILABLE),
+        (metrics.model_error_events > 0, InfraFailureReason.MODEL_ERROR),
+        # run-meta.yaml status: a blank status can never contain "failed",
+        # so these two entries are mutually exclusive.
+        ("failed" in status, InfraFailureReason.RUN_FAILED),
+        (not status, InfraFailureReason.RUN_CRASHED),
+        # run-metrics.yaml missing entirely (swarm crashed before writing)
+        (not has_metrics_file, InfraFailureReason.METRICS_MISSING),
+        # Server failed to start (from contract-test-results.yaml)
+        (not contract_tests.server_started, InfraFailureReason.SERVER_START_FAILED),
+    ]
+    reasons: list[InfraFailureReason] = [reason for fired, reason in signals if fired]
+
+    if reasons:
+        joined = ", ".join(reason.value for reason in reasons)
+        return InfraFailure(
+            is_infra_failure=True,
+            reasons=reasons,
+            summary=f"Infrastructure failure detected: {joined}",
+        )
+
+    return InfraFailure(is_infra_failure=False)
+
+
 # ---------------------------------------------------------------------------
 # Collection pipeline
 # ---------------------------------------------------------------------------
@@ -319,11 +391,8 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
     meta = parse_run_meta(yaml_files["run-meta"])
     run_type, label, semver, pr_number = classify_run(meta.config.rules_ref)
 
-    metrics = (
-        parse_run_metrics(yaml_files["run-metrics"])
-        if "run-metrics" in yaml_files
-        else RunMetrics()
-    )
+    has_metrics_file = "run-metrics" in yaml_files
+    metrics = parse_run_metrics(yaml_files["run-metrics"]) if has_metrics_file else RunMetrics()
     unit_tests = (
         parse_test_results(yaml_files["test-results"])
         if "test-results" in yaml_files
@@ -334,6 +403,10 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
         if "contract-test-results" in yaml_files
         else ContractTestResults()
     )
+
+    # Propagate actual server_started to metrics
+    metrics.server_startup_success = contract_tests.server_started
+
     code_quality = (
         parse_quality_report(yaml_files["quality-report"])
         if "quality-report" in yaml_files
@@ -346,13 +419,18 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
     )
 
     # Backfill artifact counts from run-metrics if available
-    if "run-metrics" in yaml_files:
+    if has_metrics_file:
         raw_metrics = _load_yaml(yaml_files["run-metrics"])
         workspace = raw_metrics.get("artifacts", {}).get("workspace", {})
         code_quality.source_file_count = workspace.get("source_files", 0)
         code_quality.test_file_count = workspace.get("test_files", 0)
         code_quality.total_lines_of_code = workspace.get("total_lines_of_code", 0)
 
+    # Detect infrastructure failures
+    infra_failure = detect_infra_failure(meta, metrics, contract_tests, has_metrics_file)
+    if infra_failure.is_infra_failure:
+        logger.warning("Infra failure detected in %s: %s", source_label, infra_failure.summary)
+
     return RunData(
         label=label,
         run_type=run_type,
@@ -364,6 +442,7 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
         contract_tests=contract_tests,
         code_quality=code_quality,
         qualitative=qualitative,
+        infra_failure=infra_failure,
     )
 
 
 
@@ -12,6 +12,9 @@ def check_regressions(trend: TrendData) -> GateResult:
     - Contract test pass rate decreased
     - Unit test failures appeared (> 0 when previous had 0)
     - Qualitative overall score decreased by more than 0.02
+
+    If the latest run is an infrastructure failure, the gate passes with
+    an annotation — results from infra-failed runs are unreliable.
     """
     latest, previous = find_latest_and_previous(trend)
     if latest is None or previous is None:
@@ -22,6 +25,36 @@ def check_regressions(trend: TrendData) -> GateResult:
             comparison_label=previous.label if previous else "",
         )
 
+    # If the latest run is an infra failure, skip regression checks
+    if latest.infra_failure.is_infra_failure:
+        return GateResult(
+            passed=True,
+            regressions=[],
+            latest_label=latest.label,
+            comparison_label=previous.label,
+            infra_failure_detected=True,
+            infra_failure_summary=(
+                f"Latest run ({latest.label}) was an infrastructure failure: "
+                f"{latest.infra_failure.summary}. "
+                "Regression check skipped — results are unreliable."
+            ),
+        )
+
+    # If the comparison run is an infra failure, find a non-infra alternative
+    if previous.infra_failure.is_infra_failure:
+        previous = _find_non_infra_previous(trend, latest)
+        if previous is None:
+            return GateResult(
+                passed=True,
+                regressions=[],
+                latest_label=latest.label,
+                comparison_label="",
+                infra_failure_detected=True,
+                infra_failure_summary=(
+                    "No non-infra-failure comparison run available. Regression check skipped."
+                ),
+            )
+
     regressions: list[str] = []
 
     # Contract test regression
@@ -52,6 +85,27 @@ def check_regressions(trend: TrendData) -> GateResult:
     )
 
 
+def _find_non_infra_previous(trend: TrendData, latest: RunData) -> RunData | None:
+    """Find the most recent non-infra-failure run suitable for comparison.
+
+    ``trend.runs`` is scanned from last to first, skipping ``latest`` itself
+    (matched by identity). A release run without an infrastructure failure is
+    preferred; if none exists, any run without an infrastructure failure is
+    accepted. The same preference applies whether ``latest`` is a release or
+    not, so ``latest`` is only used to exclude itself from the candidates.
+
+    Args:
+        trend: Trend data whose ``runs`` are searched.
+            NOTE(review): "most recent" assumes ``runs`` is ordered oldest to
+            newest -- confirm against the collector.
+        latest: The run being gated; never returned.
+
+    Returns:
+        The chosen comparison run, or ``None`` when every other run is an
+        infrastructure failure (or there are no other runs).
+    """
+    healthy = [
+        run
+        for run in reversed([r for r in trend.runs if r is not latest])
+        if not run.infra_failure.is_infra_failure
+    ]
+
+    # Prefer the most recent healthy release.
+    for run in healthy:
+        if run.run_type == RunType.RELEASE:
+            return run
+
+    # Fallback: any non-infra run
+    return healthy[0] if healthy else None
+
+
 def find_latest_and_previous(
     trend: TrendData,
 ) -> tuple[RunData | None, RunData | None]:
 
@@ -34,6 +34,27 @@ class RunType(Enum):
     PR = "pr"
 
 
+class InfraFailureReason(Enum):
+    """Reasons why a run is classified as an infrastructure failure.
+
+    The string values are what ``detect_infra_failure`` joins into the
+    human-readable failure summary.
+    """
+
+    # run-metrics reported one or more throttle events.
+    THROTTLED = "bedrock_throttled"
+    # run-metrics reported one or more service-unavailable events.
+    SERVICE_UNAVAILABLE = "bedrock_service_unavailable"
+    # run-metrics reported one or more model error events.
+    MODEL_ERROR = "bedrock_model_error"
+    # The run status contains "failed" (case-insensitive).
+    RUN_FAILED = "run_failed"
+    # The run status is missing, empty, or whitespace-only.
+    RUN_CRASHED = "run_crashed"
+    # Contract test results say the server did not start.
+    SERVER_START_FAILED = "server_start_failed"
+    # No run-metrics file was found for the run.
+    METRICS_MISSING = "metrics_missing"
+
+
+@dataclass
+class InfraFailure:
+    """Details about an infrastructure failure detected in a run.
+
+    A default-constructed instance represents "no infrastructure failure".
+    """
+
+    # True when at least one infrastructure failure signal fired for the run.
+    is_infra_failure: bool = False
+    # Reasons that fired, as populated by detect_infra_failure; left empty
+    # when no failure was detected.
+    reasons: list[InfraFailureReason] = field(default_factory=list)
+    # Human-readable description of the failure; empty when none was detected.
+    summary: str = ""
+
+
 @dataclass(frozen=True, order=True)
 class SemVer:
     """Semantic version, comparable via tuple ordering."""
@@ -119,6 +140,12 @@ class RunMetrics:
     handoffs: list[HandoffMetrics] = field(default_factory=list)
     server_startup_success: bool = True
     error_count: int = 0
+    throttle_events: int = 0
+    service_unavailable_events: int = 0
+    model_error_events: int = 0
+    timeout_events: int = 0
+    failed_tool_calls: int = 0
+    validation_error_events: int = 0
 
 
 @dataclass
@@ -152,6 +179,8 @@ class ContractTestResults:
     failed: int = 0
     pass_rate: float = 0.0
     failures: list[ContractTestFailure] = field(default_factory=list)
+    server_started: bool = True
+    server_error: str = ""
 
 
 @dataclass
@@ -210,6 +239,7 @@ class RunData:
     contract_tests: ContractTestResults
     code_quality: CodeQualityMetrics
     qualitative: QualitativeComparison
+    infra_failure: InfraFailure = field(default_factory=InfraFailure)
 
 
 @dataclass
@@ -258,3 +288,5 @@ class GateResult:
     regressions: list[str] = field(default_factory=list)
     latest_label: str = ""
     comparison_label: str = ""
+    infra_failure_detected: bool = False
+    infra_failure_summary: str = ""