Skip to content

Commit 8cb95fb

Browse files
Publish coordination health in cluster discovery
Publish coordination health in cluster discovery
1 parent 118e322 commit 8cb95fb

6 files changed

Lines changed: 265 additions & 3 deletions

File tree

README.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,7 @@ workflow-task command payload.
493493
### System
494494
- `GET /api/health` — Health check
495495
- `GET /api/ready` — Readiness check for migrations, default namespace, cache, auth config, and workflow v2 rollout-safety health
496-
- `GET /api/cluster/info` — Server capabilities, role topology, and version
496+
- `GET /api/cluster/info` — Server capabilities, role topology, coordination-health summary, and version
497497
- `GET /api/system/metrics` — Server metrics including bounded stuck workflow-task diagnostics
498498
- `GET /api/system/operator-metrics` — Full operator metrics snapshot (runs, tasks, backlog, repair, workers/fleet, backend, structural limits) for rollout-safety coordination health
499499
- `GET /api/system/repair` — Task repair diagnostics
@@ -669,6 +669,15 @@ authority boundary for every role, the expected degraded behavior for each role
669669
failure domain, the scaling axis for each role, and the incremental migration
670670
steps from today's standalone shape to the split control/execution topology.
671671

672+
The same `GET /api/cluster/info` response now includes a versioned
673+
`coordination_health` manifest for rollout-safety coordination risk. It
674+
summarizes the current server-wide workflow v2 health status, warning and error
675+
check names, category counts, and the normalized check list that already powers
676+
the readiness gate. The manifest is intentionally `all_namespaces` scoped so it
677+
describes the server's fleet-wide coordination posture; use
678+
`GET /api/system/operator-metrics` when you need namespace-specific backlog and
679+
worker detail.
680+
672681
The activity-grade external execution surface is published from
673682
`GET /api/cluster/info` at
674683
`worker_protocol.external_execution_surface_contract`. That manifest is the

app/Http/Controllers/Api/HealthController.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77
use App\Support\BridgeAdapterOutcomeContract;
88
use App\Support\ClientCompatibility;
99
use App\Support\ControlPlaneProtocol;
10-
use App\Support\ServerTopology;
10+
use App\Support\CoordinationHealthContract;
1111
use App\Support\ServerReadiness;
12+
use App\Support\ServerTopology;
1213
use App\Support\WorkerProtocol;
1314
use Illuminate\Http\JsonResponse;
1415
use Illuminate\Http\Request;
@@ -119,6 +120,7 @@ public function clusterInfo(Request $request): JsonResponse
119120
],
120121
'structural_limits' => StructuralLimits::snapshot(),
121122
'topology' => ServerTopology::info(),
123+
'coordination_health' => CoordinationHealthContract::manifest($this->readiness->workflowStatus()),
122124
'client_compatibility' => ClientCompatibility::info(),
123125
'auth_composition_contract' => AuthCompositionContract::manifest(),
124126
'control_plane' => ControlPlaneProtocol::info(),
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
<?php
2+
3+
namespace App\Support;
4+
5+
/**
 * Builds the versioned `coordination_health` manifest from a workflow health check.
 *
 * Every field is type-checked and replaced with a safe default when missing or
 * malformed, so the manifest always has the same shape regardless of the input.
 */
final class CoordinationHealthContract
{
    public const SCHEMA = 'durable-workflow.v2.coordination-health.contract';

    public const VERSION = 1;

    /**
     * Statuses that map to HTTP 200 when the check carries no explicit
     * `http_status`; mirrors ServerReadiness::statusAllowsReady().
     */
    private const READY_STATUSES = ['ok', 'warning'];

    /**
     * Produce the manifest for the given workflow health check.
     *
     * A missing or non-string `status` becomes `error`. A missing or non-integer
     * `http_status` is derived from the resolved status (200 for ok/warning,
     * 503 otherwise) so the two fields can never contradict each other.
     *
     * @param array<string, mixed> $workflowCheck
     * @return array<string, mixed>
     */
    public static function manifest(array $workflowCheck): array
    {
        $status = is_string($workflowCheck['status'] ?? null) ? $workflowCheck['status'] : 'error';

        return [
            'schema' => self::SCHEMA,
            'version' => self::VERSION,
            'namespace_scope' => 'all_namespaces',
            'status' => $status,
            'http_status' => is_int($workflowCheck['http_status'] ?? null)
                ? $workflowCheck['http_status']
                : (in_array($status, self::READY_STATUSES, true) ? 200 : 503),
            'generated_at' => is_string($workflowCheck['generated_at'] ?? null) ? $workflowCheck['generated_at'] : null,
            'categories' => is_array($workflowCheck['categories'] ?? null) ? $workflowCheck['categories'] : [],
            'warning_checks' => self::stringList($workflowCheck['warning_checks'] ?? []),
            'error_checks' => self::stringList($workflowCheck['error_checks'] ?? []),
            'checks' => self::checkList($workflowCheck['checks'] ?? []),
        ];
    }

    /**
     * Keep only the non-empty strings of $value, reindexed from zero.
     * Anything that is not an array yields an empty list.
     *
     * @return list<string>
     */
    private static function stringList(mixed $value): array
    {
        if (! is_array($value)) {
            return [];
        }

        return array_values(array_filter(
            $value,
            static fn (mixed $item): bool => is_string($item) && $item !== '',
        ));
    }

    /**
     * Reduce each array entry of $value to the four published check fields.
     *
     * Non-array entries are dropped. A missing or non-string `name` or `status`
     * becomes `unknown`; a missing or non-string `category` or `message` becomes null.
     *
     * @return list<array{name: string, status: string, category: ?string, message: ?string}>
     */
    private static function checkList(mixed $value): array
    {
        if (! is_array($value)) {
            return [];
        }

        $checks = [];

        foreach ($value as $entry) {
            if (! is_array($entry)) {
                continue;
            }

            $checks[] = [
                'name' => is_string($entry['name'] ?? null) ? $entry['name'] : 'unknown',
                'status' => is_string($entry['status'] ?? null) ? $entry['status'] : 'unknown',
                'category' => is_string($entry['category'] ?? null) ? $entry['category'] : null,
                'message' => is_string($entry['message'] ?? null) ? $entry['message'] : null,
            ];
        }

        return $checks;
    }
}

app/Support/ServerReadiness.php

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ public function snapshot(): array
3737
'cache' => $this->cacheCheck(),
3838
'auth' => $this->authCheck(),
3939
];
40-
$checks['workflow_v2'] = $this->workflowCheck($checks);
40+
$checks['workflow_v2'] = $this->workflowStatus($checks);
4141

4242
return [
4343
'ready' => collect($checks)->every(
@@ -47,6 +47,20 @@ public function snapshot(): array
4747
];
4848
}
4949

50+
/**
 * Return the normalized workflow health check.
 *
 * When no prerequisite checks are supplied, the database and migration
 * checks are run here and handed to workflowCheck().
 *
 * @param array<string, array<string, mixed>>|null $checks
 * @return array<string, mixed>
 */
public function workflowStatus(?array $checks = null): array
{
    if ($checks === null) {
        $checks = [
            'database' => $this->databaseCheck(),
            'migrations' => $this->migrationCheck(),
        ];
    }

    $rawCheck = $this->workflowCheck($checks);

    return $this->normalizeWorkflowCheck($rawCheck);
}
63+
5064
private static function statusAllowsReady(mixed $status): bool
5165
{
5266
return in_array($status, ['ok', 'warning'], true);
@@ -333,4 +347,48 @@ private function workflowCheck(array $checks): array
333347
'checks' => $checksList,
334348
];
335349
}
350+
351+
/**
 * Coerce a raw workflow check into a fixed shape.
 *
 * Missing or wrongly typed fields get defaults: `status` falls back to
 * `error`, `http_status` is derived from the status (200 when the status
 * allows readiness, 503 otherwise), and list fields fall back to `[]`.
 * `blocked_by`, `message` and `remediation` are copied only when the raw
 * check has those keys.
 *
 * @param array<string, mixed> $check
 * @return array<string, mixed>
 */
private function normalizeWorkflowCheck(array $check): array
{
    $status = $check['status'] ?? null;
    if (! is_string($status)) {
        $status = 'error';
    }

    $httpStatus = $check['http_status'] ?? null;
    if (! is_int($httpStatus)) {
        $httpStatus = self::statusAllowsReady($status) ? 200 : 503;
    }

    $generatedAt = $check['generated_at'] ?? null;
    $categories = $check['categories'] ?? null;
    $checkEntries = $check['checks'] ?? null;

    $normalized = [
        'status' => $status,
        'generated_at' => is_string($generatedAt) ? $generatedAt : null,
        'http_status' => $httpStatus,
        'categories' => is_array($categories) ? $categories : [],
        'warning_checks' => $this->stringList($check['warning_checks'] ?? []),
        'error_checks' => $this->stringList($check['error_checks'] ?? []),
        'checks' => is_array($checkEntries) ? array_values($checkEntries) : [],
    ];

    // Optional keys are appended in this fixed order, and only when present.
    foreach (['blocked_by', 'message', 'remediation'] as $optionalKey) {
        if (! array_key_exists($optionalKey, $check)) {
            continue;
        }

        $normalized[$optionalKey] = $check[$optionalKey];
    }

    return $normalized;
}
379+
380+
/**
 * Collect the non-empty string items of $value into a zero-indexed list.
 * A non-array $value produces an empty list.
 *
 * @return list<string>
 */
private function stringList(mixed $value): array
{
    $strings = [];

    if (is_array($value)) {
        foreach ($value as $item) {
            if (is_string($item) && $item !== '') {
                $strings[] = $item;
            }
        }
    }

    return $strings;
}
336394
}

tests/Feature/ClusterInfoCompatibilityTest.php

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
use App\Support\ClientCompatibility;
77
use App\Support\ControlPlaneProtocol;
88
use App\Support\ControlPlaneRequestContract;
9+
use App\Support\CoordinationHealthContract;
910
use App\Support\ServerTopology;
1011
use App\Support\WorkerProtocol;
1112
use Illuminate\Foundation\Testing\RefreshDatabase;
@@ -90,6 +91,18 @@ public function test_cluster_info_is_a_versionless_protocol_discovery_contract()
9091
'scaling_boundaries',
9192
'migration_path',
9293
],
94+
'coordination_health' => [
95+
'schema',
96+
'version',
97+
'namespace_scope',
98+
'status',
99+
'http_status',
100+
'generated_at',
101+
'categories',
102+
'warning_checks',
103+
'error_checks',
104+
'checks',
105+
],
93106
'client_compatibility',
94107
'auth_composition_contract',
95108
'control_plane',
@@ -99,6 +112,9 @@ public function test_cluster_info_is_a_versionless_protocol_discovery_contract()
99112
->assertJsonPath('topology.schema', ServerTopology::SCHEMA)
100113
->assertJsonPath('topology.version', ServerTopology::VERSION)
101114
->assertJsonPath('topology.matching_role.task_dispatch_mode', 'poll')
115+
->assertJsonPath('coordination_health.schema', CoordinationHealthContract::SCHEMA)
116+
->assertJsonPath('coordination_health.version', CoordinationHealthContract::VERSION)
117+
->assertJsonPath('coordination_health.namespace_scope', 'all_namespaces')
102118
->assertJsonPath(
103119
'topology.failure_domains.matching_down.effect',
104120
'claim_falls_back_to_direct_ready_task_discovery',

tests/Feature/ClusterInfoTest.php

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22

33
namespace Tests\Feature;
44

5+
use App\Models\WorkflowNamespace;
6+
use App\Support\CoordinationHealthContract;
57
use App\Support\ServerTopology;
68
use Illuminate\Foundation\Testing\RefreshDatabase;
79
use Tests\TestCase;
10+
use Workflow\V2\Support\WorkerCompatibilityFleet;
811

912
class ClusterInfoTest extends TestCase
1013
{
@@ -205,6 +208,108 @@ public function test_it_switches_cluster_topology_execution_mode_when_embedded_d
205208
->assertJsonPath('topology.execution_mode', 'local_queue_worker');
206209
}
207210

211+
/**
 * The cluster info response carries a coordination_health manifest with the
 * published schema, version, scope, and the expected field types.
 */
public function test_it_publishes_a_versioned_coordination_health_manifest(): void
{
    $clusterInfo = $this->getJson('/api/cluster/info');
    $clusterInfo->assertOk();

    $clusterInfo->assertJsonPath('coordination_health.schema', CoordinationHealthContract::SCHEMA);
    $clusterInfo->assertJsonPath('coordination_health.version', CoordinationHealthContract::VERSION);
    $clusterInfo->assertJsonPath('coordination_health.namespace_scope', 'all_namespaces');
    $clusterInfo->assertJsonPath('coordination_health.http_status', 200);

    $allowedStatuses = ['ok', 'warning', 'error', 'blocked', 'unavailable'];
    $this->assertContains($clusterInfo->json('coordination_health.status'), $allowedStatuses);

    // Each collection field must be present and decode to an array.
    foreach (['categories', 'warning_checks', 'error_checks', 'checks'] as $field) {
        $this->assertIsArray($clusterInfo->json('coordination_health.' . $field));
    }
}
231+
/**
 * With fleet validation in `warn` mode, a recorded `build-b` worker (not in
 * the configured supported list) shows up as a `worker_compatibility`
 * warning while the manifest keeps reporting HTTP 200.
 */
public function test_it_surfaces_worker_compatibility_warnings_in_coordination_health(): void
{
    $namespaceAttributes = [
        'name' => 'default',
        'description' => 'Default namespace',
        'retention_days' => 30,
        'status' => 'active',
    ];
    WorkflowNamespace::query()->create($namespaceAttributes);

    $compatibilityConfig = [
        'workflows.v2.compatibility.current' => 'build-a',
        'workflows.v2.compatibility.supported' => ['build-a'],
        'workflows.v2.compatibility.namespace' => 'default',
        'workflows.v2.fleet.validation_mode' => 'warn',
    ];
    config($compatibilityConfig);

    // Start from a clean fleet record; the finally block clears it again.
    WorkerCompatibilityFleet::clear();

    try {
        WorkerCompatibilityFleet::recordForNamespace('default', ['build-b'], 'database', 'default', 'worker-b');

        $clusterInfo = $this->getJson('/api/cluster/info');
        $clusterInfo->assertOk();
        $clusterInfo->assertJsonPath('coordination_health.status', 'warning');
        $clusterInfo->assertJsonPath('coordination_health.http_status', 200);

        $warningChecks = $clusterInfo->json('coordination_health.warning_checks', []);
        $this->assertContains('worker_compatibility', $warningChecks);
    } finally {
        WorkerCompatibilityFleet::clear();
    }
}
272+
/**
 * With fleet validation in `fail` mode, a recorded `build-b` worker (not in
 * the configured supported list) turns the manifest into an error with
 * `http_status` 503, while the cluster info endpoint itself still answers 200.
 */
public function test_it_fails_coordination_health_closed_when_fleet_validation_requires_compatible_workers(): void
{
    $namespaceAttributes = [
        'name' => 'default',
        'description' => 'Default namespace',
        'retention_days' => 30,
        'status' => 'active',
    ];
    WorkflowNamespace::query()->create($namespaceAttributes);

    $compatibilityConfig = [
        'workflows.v2.compatibility.current' => 'build-a',
        'workflows.v2.compatibility.supported' => ['build-a'],
        'workflows.v2.compatibility.namespace' => 'default',
        'workflows.v2.fleet.validation_mode' => 'fail',
    ];
    config($compatibilityConfig);

    // Start from a clean fleet record; the finally block clears it again.
    WorkerCompatibilityFleet::clear();

    try {
        WorkerCompatibilityFleet::recordForNamespace('default', ['build-b'], 'database', 'default', 'worker-b');

        $clusterInfo = $this->getJson('/api/cluster/info');
        $clusterInfo->assertOk();
        $clusterInfo->assertJsonPath('coordination_health.status', 'error');
        $clusterInfo->assertJsonPath('coordination_health.http_status', 503);

        $errorChecks = $clusterInfo->json('coordination_health.error_checks', []);
        $this->assertContains('worker_compatibility', $errorChecks);
    } finally {
        WorkerCompatibilityFleet::clear();
    }
}
208313
public function test_it_publishes_matching_role_wake_ownership_for_dedicated_matching_shape(): void
209314
{
210315
config([

0 commit comments

Comments
 (0)