Require trusted server perf evidence

durable-workflow-ops · durable-workflow-ops · commit 34da30b78307 · 2026-04-22T16:53:23.000Z
diff --git a/.github/workflows/server-perf.yml b/.github/workflows/server-perf.yml
@@ -166,6 +166,7 @@ jobs:
           DW_PERF_MAX_SERVER_CACHE_KEYS_BY_POLICY: '{"workflow_task_poll_requests":2048,"long_poll_signals":2048,"workflow_query_tasks":128,"task_queue_admission_locks":256,"task_queue_dispatch_counters":256,"workflow_task_expired_lease_recovery":256,"history_retention_inline":128,"readiness_probe":32}'
           DW_PERF_MAX_FINAL_SERVER_CACHE_KEYS_BY_POLICY: '{"workflow_task_poll_requests":0,"long_poll_signals":0,"workflow_query_tasks":0,"task_queue_admission_locks":0,"task_queue_dispatch_counters":0,"workflow_task_expired_lease_recovery":0,"history_retention_inline":0,"readiness_probe":0}'
           DW_PERF_MAX_SERVER_MEMORY_SLOPE_MB_HOUR: "128"
+          DW_PERF_REQUIRE_TRUSTED_EVIDENCE: "true"
           DW_PERF_REMOTE_WRITE_ENABLED: ${{ github.event_name != 'workflow_dispatch' || inputs.remote_write }}
           DW_PERF_REMOTE_WRITE_URL: ${{ vars.DW_PERF_REMOTE_WRITE_URL }}
           DW_PERF_REMOTE_WRITE_USERNAME: ${{ vars.DW_PERF_REMOTE_WRITE_USERNAME }}
diff --git a/docs/bounded-growth.md b/docs/bounded-growth.md
@@ -139,7 +139,11 @@ feature-branch workflow can still produce useful artifacts, but it cannot
 satisfy the trusted long-soak evidence profile just by setting
 `RUNNER_ENVIRONMENT=self-hosted`.
 The CI smoke workflow sets `RUNNER_ENVIRONMENT=github-hosted` so those artifacts
-are traceable without being eligible for the trusted long-soak profile.
+are traceable without being eligible for the trusted long-soak profile. The
+self-hosted soak workflow sets `DW_PERF_REQUIRE_TRUSTED_EVIDENCE=true`, so a
+scheduled or manual soak fails if the artifact cannot satisfy
+`trusted_long_soak_v1` even when the load generator itself stayed inside the
+bounded-growth budgets.
 
 Per-policy limits can be enforced with JSON maps keyed by policy ID:
 `DW_PERF_MAX_SERVER_CACHE_KEYS_BY_POLICY` for maximum observed keys and
diff --git a/docs/perf-runner.md b/docs/perf-runner.md
@@ -63,7 +63,9 @@ signal meaningful.
 Both workflow modes pass explicit runner provenance into the artifact. Short
 smokes set `RUNNER_ENVIRONMENT=github-hosted`; long soaks set
 `RUNNER_ENVIRONMENT=self-hosted`, which is required before `summary.json` can be
-classified as trusted long-soak evidence.
+classified as trusted long-soak evidence. The self-hosted job also sets
+`DW_PERF_REQUIRE_TRUSTED_EVIDENCE=true`, so it fails if the run completes but the
+artifact is ineligible for `trusted_long_soak_v1`.
 
 ## Local Run
 
@@ -90,6 +92,8 @@ and requires a checked-out source commit matching `GITHUB_SHA`, so artifacts
 from uncommitted source, policy edits, feature branches, forks, unrelated
 workflows, pull-request smokes, misconfigured checkouts, or ad hoc local runs
 are marked ineligible for the trusted profile.
+When `DW_PERF_REQUIRE_TRUSTED_EVIDENCE=true`, the harness turns that ineligible
+profile into a failed run and records the profile reasons in `summary.json`.
 The harness fails when it cannot collect at least `DW_PERF_MIN_SAMPLE_COVERAGE`
 of the expected periodic samples, which defaults to 80%. The final post-drain
 sample is included in the artifact but does not count toward the periodic sample
diff --git a/scripts/perf/server_soak.py b/scripts/perf/server_soak.py
@@ -39,6 +39,14 @@
 }
 
 
+def env_bool(name: str, default: bool = False) -> bool:  # Read environment variable `name` as a boolean flag.
+    value = os.environ.get(name)
+    if value is None:  # Variable is unset: defer to the caller-supplied default.
+        return default
+
+    return value.strip().lower() in ("1", "true", "yes", "on")  # Trimmed, case-insensitive; any other value (including "") is False.
+
+
 class Metrics:
     def __init__(self) -> None:
         self.lock = threading.Lock()
@@ -204,6 +212,12 @@ def parse_args() -> argparse.Namespace:
         default=float(os.environ.get("DW_PERF_MAX_SERVER_MEMORY_SLOPE_MB_HOUR", "0")),
         help="If positive and duration is at least 10 minutes, fail when post-warmup server memory slope exceeds this value.",
     )
+    parser.add_argument(
+        "--require-trusted-evidence",
+        action="store_true",
+        default=env_bool("DW_PERF_REQUIRE_TRUSTED_EVIDENCE"),
+        help="Fail if the generated summary is not eligible for trusted_long_soak_v1 evidence.",
+    )
     args = parser.parse_args()
     policy_ids = set(SERVER_CACHE_KEY_PATTERNS)
     args.max_server_cache_keys_by_policy = parse_policy_limit_map(
@@ -930,6 +944,7 @@ def main() -> int:
                 "max_final_server_cache_keys_by_policy": args.max_final_server_cache_keys_by_policy,
                 "max_server_memory_slope_mb_hour": args.max_server_memory_slope_mb_hour,
                 "min_sample_coverage": args.min_sample_coverage,
+                "require_trusted_evidence": args.require_trusted_evidence,
             },
             "evidence": {
                 "started_at": started_at.isoformat().replace("+00:00", "Z"),
@@ -1015,6 +1030,14 @@ def main() -> int:
             max_final_server_cache_keys_by_policy=args.max_final_server_cache_keys_by_policy,
             failures=failures,
         )
+        trust_reasons = summary["evidence"]["trust"].get("reasons") or []
+        if args.require_trusted_evidence and not summary["evidence"]["trust"].get("eligible"):
+            failures.append(
+                "trusted evidence profile is ineligible"
+                + (f": {trust_reasons}" if trust_reasons else "")
+            )
+            metrics.mark_assertion_failed()
+            summary["failures"] = failures
 
         metrics_path.write_text(metrics.prometheus(), encoding="utf-8")
         summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
diff --git a/tests/Unit/ServerPerfHarnessContractTest.php b/tests/Unit/ServerPerfHarnessContractTest.php
@@ -79,6 +79,10 @@ public function test_soak_summary_records_trusted_evidence_fields(): void
             'per_policy_threshold_reasons',
             'max_server_cache_keys_by_policy=args.max_server_cache_keys_by_policy',
             'max_final_server_cache_keys_by_policy=args.max_final_server_cache_keys_by_policy',
+            'DW_PERF_REQUIRE_TRUSTED_EVIDENCE',
+            '--require-trusted-evidence',
+            'require_trusted_evidence',
+            'trusted evidence profile is ineligible',
             'duration below trusted long-soak minimum',
             'bounded-growth assertions failed',
         ] as $needle) {
@@ -183,6 +187,30 @@ public function test_ci_perf_jobs_set_runner_environment_provenance(): void
         );
     }
 
+    public function test_self_hosted_perf_soak_requires_trusted_evidence_eligibility(): void
+    {
+        $workflow = file_get_contents(dirname(__DIR__, 2).'/.github/workflows/server-perf.yml');
+        $this->assertNotFalse($workflow, '.github/workflows/server-perf.yml must be readable');
+
+        $this->assertMatchesRegularExpression( // The flag, set to "true", must appear somewhere after the soak job's name.
+            '/name:\s+Self-hosted polling cache soak.*?DW_PERF_REQUIRE_TRUSTED_EVIDENCE:\s+"true"/s',
+            $workflow,
+            'Self-hosted long soaks must fail instead of producing green ineligible trusted evidence.',
+        );
+
+        $this->assertMatchesRegularExpression( // The smoke job's name must be followed later by an indented `soak:` key.
+            '/name:\s+Polling cache bounded-growth smoke(?P<block>.*?)\n\s+soak:/s',
+            $workflow,
+            'Server Perf workflow must keep a distinct short smoke job before the long soak job.',
+        );
+        preg_match('/name:\s+Polling cache bounded-growth smoke(?P<block>.*?)\n\s+soak:/s', $workflow, $smokeMatch); // Same pattern again, this time to capture the text between the smoke name and `soak:`.
+        $this->assertStringNotContainsString( // The captured smoke text must not enable the flag; a missing capture falls back to ''.
+            'DW_PERF_REQUIRE_TRUSTED_EVIDENCE: "true"',
+            (string) ($smokeMatch['block'] ?? ''),
+            'Short perf smokes should remain useful but ineligible artifacts.',
+        );
+    }
+
     public function test_ci_perf_trigger_paths_cover_bounded_growth_runtime_surfaces(): void
     {
         $repoRoot = dirname(__DIR__, 2);