Classify trusted perf soak evidence

durable-workflow-ops · durable-workflow-ops · commit 8b76ac5c2ecb · 2026-04-22T07:36:52.000Z
diff --git a/docs/bounded-growth.md b/docs/bounded-growth.md
@@ -94,7 +94,8 @@ service logs under `build/perf/`. A trusted bounded-growth run must include:
   sample was collected successfully; missing resource samples fail the run
   instead of being recorded as zero-count evidence;
 - GitHub/runner provenance in `summary.json` (`GITHUB_SHA`, `GITHUB_RUN_ID`,
-  runner name/OS/arch, Compose project, and the tested base URL when present);
+  runner name/OS/arch/environment, Compose project, and the tested base URL
+  when present);
 - the SHA-256 digest of `config/dw-bounded-growth.php` so the artifact can be
   tied back to the policy that was active for the run.
 
@@ -110,6 +111,13 @@ bounded cache family produced growth instead of only reporting a total
 `dw_perf_redis_server_keys_by_policy{policy="..."}` for optional remote-write
 alerting.
 
+`summary.json` also includes `evidence.trust` with the
+`trusted_long_soak_v1` profile. Short CI smokes can still pass, but a run is
+eligible as issue-closing trusted evidence only if it runs for at least one
+hour, uses compose-backed resource sampling with no unhealthy samples, runs on
+a self-hosted runner when GitHub exposes the runner environment, meets the
+minimum periodic sample count, and has no bounded-growth assertion failures.
+
 Per-policy limits can be enforced with JSON maps keyed by policy ID:
 `DW_PERF_MAX_SERVER_CACHE_KEYS_BY_POLICY` for maximum observed keys and
 `DW_PERF_MAX_FINAL_SERVER_CACHE_KEYS_BY_POLICY` for post-drain keys. Unknown
diff --git a/scripts/perf/server_soak.py b/scripts/perf/server_soak.py
@@ -654,12 +654,50 @@ def evidence_provenance(base_url: str, compose_project: str) -> dict[str, Any]:
         "runner_name": os.environ.get("RUNNER_NAME", ""),
         "runner_os": os.environ.get("RUNNER_OS", ""),
         "runner_arch": os.environ.get("RUNNER_ARCH", ""),
+        "runner_environment": os.environ.get("RUNNER_ENVIRONMENT", ""),
         "compose_project": compose_project,
         "base_url": base_url,
         "bounded_growth_policy_sha256": file_sha256(policy_path),
     }
 
 
+def evidence_trust_profile(  # classify one run against the trusted_long_soak_v1 profile
+    *,
+    duration_seconds: int,
+    compose_project: str,
+    runner_environment: str,
+    periodic_sample_count: int,
+    minimum_trusted_samples: int,
+    sampling_health: dict[str, Any],
+    failures: list[str],
+) -> dict[str, Any]:
+    minimum_duration_seconds = 3600  # one hour, expressed in seconds
+    reasons = []  # one message per unmet criterion, appended in check order
+
+    if duration_seconds < minimum_duration_seconds:
+        reasons.append(f"duration below trusted long-soak minimum {minimum_duration_seconds}s")
+    if not compose_project:  # an empty project name means compose sampling was not configured
+        reasons.append("compose-backed resource sampling was not configured")
+    if runner_environment and runner_environment != "self-hosted":  # an empty value is not penalized
+        reasons.append(f"runner environment is {runner_environment}, not self-hosted")
+    if periodic_sample_count < minimum_trusted_samples:
+        reasons.append("periodic sample coverage below trusted minimum")
+    if int(sampling_health.get("unhealthy_samples") or 0) > 0:  # missing or None count is treated as 0
+        reasons.append("compose-backed resource sampling has unhealthy samples")
+    if failures:  # any recorded failure message disqualifies the run
+        reasons.append("bounded-growth assertions failed")
+
+    return {
+        "profile": "trusted_long_soak_v1",
+        "eligible": len(reasons) == 0,  # eligible only when none of the checks above added a reason
+        "minimum_duration_seconds": minimum_duration_seconds,
+        "runner_environment": runner_environment,  # echoed back exactly as passed in
+        "requires_self_hosted_runner": True,  # constant profile metadata, not a check result
+        "requires_compose_resource_sampling": True,  # constant profile metadata, not a check result
+        "reasons": reasons,
+    }
+
+
 def main() -> int:
     args = parse_args()
 
@@ -758,6 +796,8 @@ def main() -> int:
         observed_sample_coverage = periodic_sample_count / expected_samples
         sampling_health = sample_health(samples, args.compose_project)
 
+        provenance = evidence_provenance(base_url, args.compose_project)
+
         summary = {
             "duration_seconds": args.duration_seconds,
             "elapsed_seconds": round(elapsed_seconds, 2),
@@ -799,7 +839,7 @@ def main() -> int:
             "evidence": {
                 "started_at": started_at.isoformat().replace("+00:00", "Z"),
                 "finished_at": finished_at.isoformat().replace("+00:00", "Z"),
-                "provenance": evidence_provenance(base_url, args.compose_project),
+                "provenance": provenance,
             },
         }
 
@@ -867,6 +907,16 @@ def main() -> int:
             metrics.mark_assertion_failed()
             summary["failures"] = failures
 
+        summary["evidence"]["trust"] = evidence_trust_profile(
+            duration_seconds=args.duration_seconds,
+            compose_project=args.compose_project,
+            runner_environment=str(provenance.get("runner_environment") or ""),
+            periodic_sample_count=periodic_sample_count,
+            minimum_trusted_samples=min_samples,
+            sampling_health=sampling_health,
+            failures=failures,
+        )
+
         metrics_path.write_text(metrics.prometheus(), encoding="utf-8")
         summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
         print(json.dumps(summary, indent=2, sort_keys=True))
diff --git a/tests/Unit/ServerPerfHarnessContractTest.php b/tests/Unit/ServerPerfHarnessContractTest.php
@@ -44,6 +44,14 @@ public function test_soak_summary_records_trusted_evidence_fields(): void
             'bounded_growth_policy_sha256',
             'GITHUB_RUN_ID',
             'RUNNER_NAME',
+            'RUNNER_ENVIRONMENT',
+            'evidence_trust_profile',
+            'trusted_long_soak_v1',
+            'minimum_duration_seconds',
+            'requires_self_hosted_runner',
+            'requires_compose_resource_sampling',
+            'duration below trusted long-soak minimum',
+            'bounded-growth assertions failed',
         ] as $needle) {
             $this->assertStringContainsString($needle, $source, "Perf soak summary must retain {$needle}");
         }