diff --git a/infra/status-page/README.md b/infra/status-page/README.md index c9f1392be8..642996de25 100644 --- a/infra/status-page/README.md +++ b/infra/status-page/README.md @@ -14,9 +14,9 @@ following the `infra/iris-iap-proxy/` pattern. - **Server** — Node 20 + TypeScript + [Hono](https://hono.dev). Exposes `/api/ferry`, `/api/builds`, `/api/iris`, `/api/workers`, - `/api/control-plane/health`, `/api/workers/history`, `/api/jobs`, - `/api/probes`, `/api/health`, and serves the built web UI from - `web/dist`. + `/api/control-plane/health`, `/api/workers/history`, + `/api/provisioning/history`, `/api/jobs`, `/api/probes`, `/api/health`, + and serves the built web UI from `web/dist`. - **Web** — Vite + React 18 + TypeScript + Jotai + `@tanstack/react-query` + Tailwind. - Single `package.json`, multi-stage Dockerfile, single service account, @@ -28,7 +28,7 @@ following the `infra/iris-iap-proxy/` pattern. server/ main.ts Hono app: routes, sampler, static serving cache.ts TTL cache with in-flight coalesce - history.ts ring buffer for worker-count history + history.ts ring buffers for the in-process iris-ping + control-plane series sources/ github.ts shared REPO + auth header helper githubActions.ts Ferry workflow runs (REST API) @@ -36,6 +36,7 @@ server/ iris.ts iris controller /health caller serviceHealth.ts active env Iris + finelog /health probes (+ finelog URL) workers.ts iris worker counts via the ListWorkers RPC + clusterHistory.ts 24h worker + provisioning history from finelog canary rows jobs.ts iris 24h job-state breakdown via ExecuteRawQuery probes.ts synthetic-canary checks + provisioning from finelog finelogQuery.ts finelog StatsService SQL query → Arrow IPC decode @@ -55,6 +56,7 @@ web/ useControlPlaneHealth.ts useWorkers.ts useWorkersHistory.ts + useProvisioningHistory.ts useJobs.ts useProbes.ts components/ @@ -62,7 +64,8 @@ web/ BuildPanel.tsx GitHub CI, last 100 runs on main IrisPanel.tsx wraps reachability + WorkersPanel + ControlPlanePanel + 
JobsPanel ControlPlanePanel.tsx active env Iris + finelog latency chart - WorkersPanel.tsx + WorkersPanel.tsx live worker counts + side-by-side availability & provisioning history + ProvisioningHistoryChart.tsx per-region + fleet-average provisioning success ratio JobsPanel.tsx ProbesPanel.tsx synthetic-canary health checks + provisioning rollup style.css Tailwind entry @@ -200,7 +203,9 @@ down by in-flight builds. The Probes panel renders the synthetic-canary telemetry the `infra/probes/` daemon writes to the finelog `infra.canary.metrics` namespace (one flat `{metric, value, labels, collected_at}` row per -sample). Two bounded DuckDB queries run against the **active +sample). Two bounded SQL queries (Apache DataFusion, finelog's read +engine — note: no JSON functions, so labels are decoded app-side) run +against the **active environment's** finelog log-server through its `StatsService.Query` Connect RPC — the same JSON-over-HTTP shape the controller's `ExecuteRawQuery` uses, except the result is an Arrow IPC stream, which @@ -257,7 +262,8 @@ plus the dev controller discovery settings. | Iris | 15s | 15s | current only | | Control plane | in-memory | 30s | 24h ring buffer | | Workers | 15s | 30s | current only | -| Workers history | in-memory | 30s | 24h ring buffer | +| Workers history | 60s | 60s | 24h from finelog | +| Provisioning history | 60s | 60s | 24h from finelog | | Jobs | 60s | 60s | 24h window | | Probes | 60s | 60s | latest cycle | @@ -266,10 +272,17 @@ frontend polling can be tuned without affecting upstream. Concurrent backend requests for the same key coalesce into one upstream call via `server/cache.ts`. -The workers history is a 2880-slot ring buffer (`server/history.ts`) -filled by a background sampler on a 30s cadence — 24h worth of points. -The sampler runs on a fixed interval, not off request traffic, so -history keeps ticking even when nobody is looking at the dashboard. 
+The Workers panel renders two finelog-backed history charts side by +side: per-region healthy worker counts (the `worker_healthy` gauge the +canary writes every 60s) and the provisioning create-success ratio +(a fleet average plus per-region lines, derived from the per-pool +`provision_ready` / `provision_outcomes` gauges; zones roll up to +regions). Both query the trailing 24h via `server/sources/clusterHistory.ts` +and survive Cloud Run restarts since the history lives in finelog, not in +process. The remaining in-process ring buffers (`server/history.ts`) back +only the iris-ping and control-plane latency series, filled by a +background sampler on a fixed cadence so they keep ticking even when +nobody is looking at the dashboard. ## Controller data @@ -292,14 +305,12 @@ break** — we'll need to plumb a service-account bearer token. ## Known limitations -- **Workers history is in-process.** The ring buffer is lost on Cloud - Run restart (deploys, migrations), so the chart shows a 24h warm-up - window after each restart. Follow-ups to consider: - 1. Persist samples to a small GCS object (rewrite on each sample). - 2. Bump retention on the controller's `worker_resource_history` table - — currently ~45min — and aggregate from there. - 3. Add a proper `worker_count_history` table in the controller schema - so history lives authoritatively next to the workers table. +- **History depends on the canary.** Worker and provisioning history are + read from the `infra.canary.metrics` finelog namespace the `infra/probes` + daemon writes, so both charts are durable across Cloud Run restarts — but + they only have data for an environment whose canary is running. Point the + dashboard at an environment with no canary and both charts show their + empty state rather than data. - **Iris panel reachability row** is still `/health`-only. Worker counts and job-state breakdowns are surfaced in the Workers and Jobs subsections via `ExecuteRawQuery` SQL. 
Tasks, autoscaler, and detailed diff --git a/infra/status-page/server/history.ts b/infra/status-page/server/history.ts index d798969ffa..b7833b908e 100644 --- a/infra/status-page/server/history.ts +++ b/infra/status-page/server/history.ts @@ -1,15 +1,14 @@ -// Fixed-capacity circular buffer for worker-count history. +// Fixed-capacity circular buffers for the in-process sample histories. // -// Capacity is sized so the buffer holds 24h of samples at a 30s cadence -// (2880 slots). Each append overwrites the oldest slot once full, so -// memory stays bounded regardless of how long the server runs. History -// is in-process and lost on restart — see infra/status-page/README.md -// "Known limitations" for the follow-up plan (persist to GCS or grow a -// worker_count_history table in the controller). +// Capacity is sized so a buffer holds 24h of samples at its sampler's cadence. +// Each append overwrites the oldest slot once full, so memory stays bounded +// regardless of how long the server runs. These histories are in-process and +// lost on restart; worker-count history moved out to finelog (see +// server/sources/clusterHistory.ts), but the iris-ping and control-plane +// latency series are still sampled in-process here. 
import type { IrisPingSample } from "./sources/iris.js"; import type { ServiceHealthHistorySample } from "./sources/serviceHealth.js"; -import type { WorkerSample } from "./sources/workers.js"; export class RingBuffer { private readonly capacity: number; @@ -44,8 +43,6 @@ export class RingBuffer { } } -export class WorkerHistory extends RingBuffer {} - export class IrisPingHistory extends RingBuffer {} export class ServiceHealthHistory extends RingBuffer {} diff --git a/infra/status-page/server/main.ts b/infra/status-page/server/main.ts index 1656e0dc23..a6013196f8 100644 --- a/infra/status-page/server/main.ts +++ b/infra/status-page/server/main.ts @@ -6,7 +6,8 @@ // GET /api/iris — iris controller reachability (15s cache) // GET /api/control-plane/health — active env Iris + finelog health history // GET /api/workers — current iris worker counts (15s cache) -// GET /api/workers/history — in-memory 24h worker count ring buffer +// GET /api/workers/history — 24h per-region worker count from finelog (60s cache) +// GET /api/provisioning/history — 24h provisioning success ratio from finelog (60s cache) // GET /api/jobs — iris job counts for last 24h by state (60s cache) // GET /api/probes — synthetic-canary checks + provisioning from finelog (60s cache) // GET /api/health — liveness probe, no upstream calls @@ -19,7 +20,13 @@ import { serve } from "@hono/node-server"; import { serveStatic } from "@hono/node-server/serve-static"; import { Hono } from "hono"; import { TTLCache } from "./cache.js"; -import { IrisPingHistory, ServiceHealthHistory, WorkerHistory } from "./history.js"; +import { IrisPingHistory, ServiceHealthHistory } from "./history.js"; +import { + provisioningHistory, + workersHistory, + type ProvisioningHistoryResponse, + type WorkersHistoryResponse, +} from "./sources/clusterHistory.js"; import { FERRY_GROUPS, fetchTierStatus, @@ -47,6 +54,11 @@ const ferryCache = new TTLCache(60_000); const buildCache = new TTLCache(60_000); const workersCache = new 
TTLCache(15_000); const jobsCache = new TTLCache(60_000); +// Worker (60s cadence) and provisioning (15min cadence) history come from the +// canary's finelog rows; a 60s shield keeps finelog query load low without +// lagging the worker series. +const workersHistoryCache = new TTLCache(60_000); +const provisioningHistoryCache = new TTLCache(60_000); // Probe metrics turn over slowly — health checks every ≤5min, provisioning // every 15min — so a 60s shield is plenty and keeps finelog query load low. const probesCache = new TTLCache(60_000); @@ -62,34 +74,14 @@ const IRIS_PING_CAPACITY = Math.ceil(IRIS_PING_WINDOW_MS / IRIS_PING_INTERVAL_MS const irisPingHistory = new IrisPingHistory(IRIS_PING_CAPACITY); let lastIrisPing: IrisPingResult | null = null; -// Ring buffer for worker-count history. Sized so the buffer holds 24h of -// samples at the configured cadence. The sampler runs on a fixed interval -// below — not lazily off request traffic — so history keeps ticking even -// when nobody's watching the dashboard. +// In-process sampler cadence + buffer sizing for the control-plane latency +// history (worker-count history now lives in finelog — see clusterHistory.ts). const SAMPLE_INTERVAL_MS = 30_000; const HISTORY_CAPACITY = Math.ceil((24 * 60 * 60 * 1000) / SAMPLE_INTERVAL_MS); -const workerHistory = new WorkerHistory(HISTORY_CAPACITY); const serviceHealthHistory = new ServiceHealthHistory(HISTORY_CAPACITY); const SERVICE_HEALTH_WINDOW_MS = 24 * 60 * 60 * 1000; let lastServiceHealth: ServiceHealthSnapshot[] = []; -async function sampleWorkers(): Promise { - const snapshot = await workersCache.get("workers", () => workerSnapshot()); - if (snapshot.error) { - console.error("worker sampler: snapshot error:", snapshot.error); - // Don't pollute history with zeros when the controller is unreachable. 
- return; - } - const regions: Record = {}; - for (const r of snapshot.byRegion) { - regions[r.region] = r.healthy; - } - workerHistory.push({ - t: Date.parse(snapshot.fetchedAt), - regions, - }); -} - async function sampleIrisPing(): Promise { const result = await pingIris(); lastIrisPing = result; @@ -111,15 +103,6 @@ async function sampleServiceHealth(): Promise { // Kick off immediately, then on a fixed cadence. unref() lets the process // exit cleanly during tests without waiting on the timer. -void sampleWorkers().catch((err) => { - console.error("worker sampler error", err); -}); -setInterval(() => { - void sampleWorkers().catch((err) => { - console.error("worker sampler error", err); - }); -}, SAMPLE_INTERVAL_MS).unref(); - void sampleIrisPing().catch((err) => { console.error("iris ping sampler error", err); }); @@ -180,8 +163,16 @@ app.get("/api/workers", async (c) => { return c.json(snapshot); }); -app.get("/api/workers/history", (c) => { - return c.json({ samples: workerHistory.samples() }); +app.get("/api/workers/history", async (c) => { + const snapshot = await workersHistoryCache.get("workers-history", () => workersHistory()); + return c.json(snapshot); +}); + +app.get("/api/provisioning/history", async (c) => { + const snapshot = await provisioningHistoryCache.get("provisioning-history", () => + provisioningHistory(), + ); + return c.json(snapshot); }); app.get("/api/jobs", async (c) => { diff --git a/infra/status-page/server/sources/clusterHistory.ts b/infra/status-page/server/sources/clusterHistory.ts new file mode 100644 index 0000000000..805bd87776 --- /dev/null +++ b/infra/status-page/server/sources/clusterHistory.ts @@ -0,0 +1,187 @@ +// Finelog-backed time series for the Workers + provisioning history charts. 
+// +// The infra/probes canary writes one flat row per metric per cycle to the +// `infra.canary.metrics` namespace (see infra/probes/src/cluster.py and +// provisioning.py): `worker_healthy` per region (and a fleet total) every 60s, +// and the `provision_*` fleet/per-pool gauges every 15min. This module rolls +// the trailing 24h of those rows into the two chart series the frontend reads. +// +// It replaces the in-process worker-count ring buffer, whose history was lost +// on every Cloud Run restart (see README "Known limitations"). The canary now +// owns this history durably in finelog, so the dashboard just queries it. + +import { + asCanaryRows, + CANARY_METRICS_NAMESPACE, + type CanaryMetricRow, + decodeLabels, + FLEET_SCOPE, + queryFinelog, + sqlTimestampUtc, +} from "./finelogQuery.js"; + +const HISTORY_WINDOW_MS = 24 * 60 * 60 * 1000; + +const METRIC_WORKER_HEALTHY = "worker_healthy"; +const METRIC_PROVISION_READY = "provision_ready"; +const METRIC_PROVISION_OUTCOMES = "provision_outcomes"; + +// Per-region healthy worker count at one sample time. Flat map keyed by region +// so the frontend feeds recharts directly (each region → one line series). +export interface WorkerSample { + t: number; // epoch millis + regions: Record; +} + +export interface WorkersHistoryResponse { + samples: WorkerSample[]; + windowMs: number; + fetchedAt: string; + error?: string; +} + +// One provisioning cycle's create-success ratio (ready / resolved attempts). +// `fleet` is the cluster-wide average (null when the cycle resolved zero +// attempts); `regions` carries the per-region ratio, keyed by region name and +// omitting any region with zero attempts that cycle. Ratios are 0..1. 
+export interface ProvisioningHistorySample { + t: number; // epoch millis + fleet: number | null; + regions: Record; +} + +export interface ProvisioningHistoryResponse { + samples: ProvisioningHistorySample[]; + windowMs: number; + fetchedAt: string; + error?: string; +} + +// SQL is Apache DataFusion (finelog's read engine), NOT DuckDB: no JSON +// functions (labels are decoded in JS), and collected_at is read out as epoch +// micros via arrow_cast(...,'Int64') AS collected_us, which asCanaryRows +// normalizes to millis. +const workersSql = (cutoff: string) => ` + SELECT metric, labels, value, arrow_cast(collected_at, 'Int64') AS collected_us + FROM "${CANARY_METRICS_NAMESPACE}" + WHERE metric = '${METRIC_WORKER_HEALTHY}' + AND collected_at >= TIMESTAMP '${cutoff}' + ORDER BY collected_at +`; + +// Pull both the fleet-scoped and per-pool ready/outcomes counts; the success +// ratio is computed here (ready / outcomes) for the fleet average and rolled +// up per region from the per-pool rows, rather than reading the probe's +// pre-baked fleet-only provision_success_ratio. +const provisioningSql = (cutoff: string) => ` + SELECT metric, value, labels, arrow_cast(collected_at, 'Int64') AS collected_us + FROM "${CANARY_METRICS_NAMESPACE}" + WHERE metric IN ('${METRIC_PROVISION_READY}', '${METRIC_PROVISION_OUTCOMES}') + AND collected_at >= TIMESTAMP '${cutoff}' + ORDER BY collected_at +`; + +// Group the per-region worker_healthy rows by cycle timestamp. The fleet-total +// rows (scope=fleet) are dropped — the chart plots per-region lines, matching +// what the in-process buffer used to feed it. +function parseWorkerSamples(rows: CanaryMetricRow[]): WorkerSample[] { + const byTime = new Map>(); + for (const row of rows) { + const region = decodeLabels(row.labels).region; + if (!region) continue; + const t = row.collectedMs; + const regions = byTime.get(t) ?? 
{}; + regions[region] = row.value; + byTime.set(t, regions); + } + return [...byTime.entries()].sort((a, b) => a[0] - b[0]).map(([t, regions]) => ({ t, regions })); +} + +// GCP zone → region: drop the trailing "-a"-style zone suffix (us-east5-a → us-east5), +// matching the region names the worker_healthy gauge already uses. +function regionOfZone(zone: string): string { + return zone.replace(/-[a-z]$/, ""); +} + +interface Counts { + ready: number; + outcomes: number; +} + +function ratio({ ready, outcomes }: Counts): number | null { + return outcomes > 0 ? ready / outcomes : null; +} + +// Group the per-pool ready/outcomes counts by cycle timestamp into a fleet +// average plus a per-region rollup. provision_outcomes already excludes +// preemptions (it counts resolved create attempts), so ready / outcomes is the +// create-success ratio. The fleet rows (scope=fleet) carry the cluster total; +// per-pool rows are summed across pools sharing a region. +function parseProvisioningSamples(rows: CanaryMetricRow[]): ProvisioningHistorySample[] { + const fleetByTime = new Map(); + const regionByTime = new Map>(); + + for (const row of rows) { + const labels = decodeLabels(row.labels); + const t = row.collectedMs; + if (labels.scope === FLEET_SCOPE) { + const counts = fleetByTime.get(t) ?? { ready: 0, outcomes: 0 }; + addCount(counts, row.metric, row.value); + fleetByTime.set(t, counts); + } else if (labels.zone) { + const region = regionOfZone(labels.zone); + const regions = regionByTime.get(t) ?? new Map(); + const counts = regions.get(region) ?? { ready: 0, outcomes: 0 }; + addCount(counts, row.metric, row.value); + regions.set(region, counts); + regionByTime.set(t, regions); + } + } + + const times = [...new Set([...fleetByTime.keys(), ...regionByTime.keys()])].sort((a, b) => a - b); + return times.map((t) => { + const regions: Record = {}; + for (const [region, counts] of regionByTime.get(t) ?? 
[]) { + const r = ratio(counts); + if (r !== null) regions[region] = r; + } + const fleetCounts = fleetByTime.get(t); + return { t, fleet: fleetCounts ? ratio(fleetCounts) : null, regions }; + }); +} + +function addCount(counts: Counts, metric: string, value: number): void { + if (metric === METRIC_PROVISION_READY) counts.ready += value; + else if (metric === METRIC_PROVISION_OUTCOMES) counts.outcomes += value; +} + +interface HistoryResponse { + samples: T[]; + windowMs: number; + fetchedAt: string; + error?: string; +} + +// Run a trailing-24h canary query and parse it into chart samples, with the +// shared fetchedAt/cutoff/empty-on-error scaffold both history endpoints want. +async function historyResponse( + buildSql: (cutoff: string) => string, + parse: (rows: CanaryMetricRow[]) => T[], +): Promise> { + const fetchedAt = new Date().toISOString(); + const cutoff = sqlTimestampUtc(new Date(Date.now() - HISTORY_WINDOW_MS)); + try { + const rows = await queryFinelog(buildSql(cutoff)); + return { samples: parse(asCanaryRows(rows)), windowMs: HISTORY_WINDOW_MS, fetchedAt }; + } catch (err) { + return { samples: [], windowMs: HISTORY_WINDOW_MS, fetchedAt, error: (err as Error).message }; + } +} + +export function workersHistory(): Promise { + return historyResponse(workersSql, parseWorkerSamples); +} + +export function provisioningHistory(): Promise { + return historyResponse(provisioningSql, parseProvisioningSamples); +} diff --git a/infra/status-page/server/sources/finelogQuery.ts b/infra/status-page/server/sources/finelogQuery.ts index 3386ab3974..93c6ac6267 100644 --- a/infra/status-page/server/sources/finelogQuery.ts +++ b/infra/status-page/server/sources/finelogQuery.ts @@ -34,6 +34,56 @@ export function sqlTimestampUtc(at: Date): string { return at.toISOString().slice(0, 19).replace("T", " "); } +// `arrow_cast(collected_at, 'Int64')` reads finelog's timestamp column out as +// epoch MICROSECONDS (the namespace stores it at microsecond precision), so a +// 
raw cast value fed to `new Date()` lands ~58000 years in the future. Every +// chart/axis here works in millis, so normalize at the query boundary. +export function microsToMillis(micros: number): number { + return Math.round(micros / 1000); +} + +// Decode a finelog `labels` cell (a JSON object string the probes write via +// json.dumps). A parse failure is a real anomaly — schema drift or truncation, +// not an expected case — so log it rather than letting the row's labels +// silently vanish; the empty fallback keeps one bad row from sinking a query. NOTE(review): only a thrown parse error is handled here; valid JSON that is not an object (e.g. `null`) is returned as-is — confirm the probes never write such labels. +export function decodeLabels(raw: string): Record { + try { + return JSON.parse(raw) as Record; + } catch (err) { + console.warn(`finelog: unparseable labels ${JSON.stringify(raw.slice(0, 200))}: ${(err as Error).message}`); + return {}; + } +} + +// The finelog namespace the infra/probes canary writes its flat metric samples +// to (see infra/probes/src/infra_probes.py). Both the probes snapshot +// (probes.ts) and the cluster-history charts (clusterHistory.ts) read from it. +export const CANARY_METRICS_NAMESPACE = "infra.canary.metrics"; + +// Label value marking a fleet-wide aggregate row (no pool/region labels). +export const FLEET_SCOPE = "fleet"; + +// One decoded row from the canary namespace. `collectedMs` is normalized from +// the micros timestamp cast; `labels` stays a JSON object string for per-row +// decode (DataFusion has no JSON functions to slice it server-side). Queries +// feeding asCanaryRows must alias the cast `arrow_cast(collected_at, 'Int64') +// AS collected_us`. +export interface CanaryMetricRow { + metric: string; + value: number; + labels: string; + collectedMs: number; +} + +export function asCanaryRows(rows: Record[]): CanaryMetricRow[] { + return rows.map((r) => ({ + metric: String(r.metric), + value: Number(r.value), + labels: String(r.labels ?? 
"{}"), + collectedMs: microsToMillis(Number(r.collected_us)), + })); +} + export async function queryFinelog(sql: string): Promise[]> { const base = await activeFinelogUrl(); // Hold a strong reference to the controller so the abort timer is not GC'd diff --git a/infra/status-page/server/sources/probes.ts b/infra/status-page/server/sources/probes.ts index 723795ab74..dea4894d52 100644 Binary files a/infra/status-page/server/sources/probes.ts and b/infra/status-page/server/sources/probes.ts differ diff --git a/infra/status-page/server/sources/workers.ts b/infra/status-page/server/sources/workers.ts index a328286bf7..bd2c074425 100644 --- a/infra/status-page/server/sources/workers.ts +++ b/infra/status-page/server/sources/workers.ts @@ -20,8 +20,8 @@ // whole-VM granularity anyway, which made the "available" number a // thin proxy for "idle VMs × resources" on a busy cluster. // -// History lives in a separate ring buffer (server/history.ts); this -// file only ever returns the current snapshot. +// History lives in finelog, queried by server/sources/clusterHistory.ts; +// this file only ever returns the current snapshot. import { getControllerUrl } from "./discovery.js"; @@ -82,14 +82,6 @@ export interface WorkersSnapshot { error?: string; } -export interface WorkerSample { - t: number; // epoch millis - // Per-region healthy worker count at the sample time. Flat map keyed - // by region name so the frontend can feed recharts directly (each - // region becomes a ). 
- regions: Record; -} - function emptyResources(): WorkerResourceTotals { return { cpuTotalMillicores: 0, diff --git a/infra/status-page/web/src/api.ts b/infra/status-page/web/src/api.ts index 97463c11ce..8275a4b224 100644 --- a/infra/status-page/web/src/api.ts +++ b/infra/status-page/web/src/api.ts @@ -162,6 +162,26 @@ export interface WorkerSample { export interface WorkersHistoryResponse { samples: WorkerSample[]; + windowMs: number; + fetchedAt: string; + error?: string; +} + +// Per-cycle create-success ratio over the trailing 24h: a fleet average plus a +// per-region rollup (region omitted for a cycle with zero resolved attempts). +// `fleet` is null for a cycle that resolved zero attempts. Ratios are 0..1. +// Mirrors server/sources/clusterHistory.ts. +export interface ProvisioningHistorySample { + t: number; // epoch millis + fleet: number | null; + regions: Record; +} + +export interface ProvisioningHistoryResponse { + samples: ProvisioningHistorySample[]; + windowMs: number; + fetchedAt: string; + error?: string; } export interface JobStateCount { @@ -248,5 +268,7 @@ export const fetchControlPlaneHealth = () => getJson("/api/control-plane/health"); export const fetchWorkers = () => getJson("/api/workers"); export const fetchWorkersHistory = () => getJson("/api/workers/history"); +export const fetchProvisioningHistory = () => + getJson("/api/provisioning/history"); export const fetchJobs = () => getJson("/api/jobs"); export const fetchProbes = () => getJson("/api/probes"); diff --git a/infra/status-page/web/src/components/ProvisioningHistoryChart.tsx b/infra/status-page/web/src/components/ProvisioningHistoryChart.tsx new file mode 100644 index 0000000000..f7867a35e1 --- /dev/null +++ b/infra/status-page/web/src/components/ProvisioningHistoryChart.tsx @@ -0,0 +1,164 @@ +import { useMemo } from "react"; +import { + CartesianGrid, + Legend, + Line, + LineChart, + Tooltip, + XAxis, + YAxis, +} from "recharts"; +import type { ProvisioningHistorySample } from 
"../api"; +import { useProvisioningHistory } from "../hooks/useProvisioningHistory"; +import { displayedSpanLabel, formatClock, useContainerSize } from "./chartUtils"; + +// dataKey for the fleet-average line; distinct from any region name. +const FLEET_KEY = "__fleet__"; +const FLEET_COLOR = "#e2e8f0"; // slate-200 — brighter than the region lines +const FALLBACK_REGION_COLOR = "#64748b"; // slate-500, for a region not in the shared map + +function pct(ratio: number | null | undefined): number | null { + return ratio === null || ratio === undefined ? null : Math.round(ratio * 1000) / 10; +} + +// Flatten `{t, fleet, regions:{...}}` into rows recharts consumes via a +// dataKey per line, and return the ordered region list (alphabetical, matching +// the worker chart's coloring). Values are percentages so the 0–100% axis and +// tooltip read naturally. +function useChartData(samples: ProvisioningHistorySample[]) { + return useMemo(() => { + const seen = new Set(); + for (const s of samples) { + for (const region of Object.keys(s.regions)) seen.add(region); + } + const regions = [...seen].sort(); + const rows = samples.map((s) => { + const row: Record = { t: s.t, [FLEET_KEY]: pct(s.fleet) }; + for (const region of regions) row[region] = pct(s.regions[region]); + return row; + }); + return { rows, regions }; + }, [samples]); +} + +function RatioTooltip({ + active, + payload, + label, +}: { + active?: boolean; + payload?: { name: string; value: number | null; color: string; dataKey: string }[]; + label?: number; +}) { + if (!active || !payload?.length) return null; + const rows = payload.filter((p) => p.value !== null && p.value !== undefined); + return ( +
+
{new Date(label as number).toLocaleString()}
+ {rows.map((p) => ( +
+ {p.name}: {p.value}% +
+ ))} +
+ ); +} + +// colorByRegion is shared with the sibling worker chart (built off the union +// of both charts' regions) so a region reads the same color in both. +export function ProvisioningHistoryChart({ + colorByRegion, +}: { + colorByRegion: Map; +}) { + const { data, isLoading, error } = useProvisioningHistory(); + const samples = data?.samples ?? []; + const { ref: chartRef, size: chartSize } = useContainerSize(); + const { rows, regions } = useChartData(samples); + + return ( +
+
+

+ Provisioning success +

+ + {samples.length > 1 ? displayedSpanLabel(samples) : "no cycles yet"} + +
+
+ {isLoading &&
loading…
} + {error && ( +
failed to load: {(error as Error).message}
+ )} + {data?.error &&
{data.error}
} + {!isLoading && !error && !data?.error && + (samples.length > 1 && chartSize ? ( + + + + `${v}%`} + stroke="#475569" + fontSize={11} + label={{ + value: "Success", + angle: -90, + position: "insideLeft", + style: { fill: "#64748b", fontSize: 11 }, + }} + /> + } /> + + {regions.map((region) => ( + + ))} + + + ) : ( +
+ waiting for provisioning cycles — the canary rolls one up every 15min +
+ ))} +
+
+ ); +} diff --git a/infra/status-page/web/src/components/WorkersPanel.tsx b/infra/status-page/web/src/components/WorkersPanel.tsx index 56fad8571c..ca381d3cf3 100644 --- a/infra/status-page/web/src/components/WorkersPanel.tsx +++ b/infra/status-page/web/src/components/WorkersPanel.tsx @@ -9,22 +9,16 @@ import { YAxis, } from "recharts"; import type { WorkerResourceTotals, WorkerSample } from "../api"; +import { useProvisioningHistory } from "../hooks/useProvisioningHistory"; import { useWorkers } from "../hooks/useWorkers"; import { useWorkersHistory } from "../hooks/useWorkersHistory"; -import { displayedSpanLabel, formatClock, useContainerSize } from "./chartUtils"; - -// Palette for per-region chart lines. Picked for good contrast on the -// dark background; cycles if there are more regions than entries. -const REGION_COLORS = [ - "#10b981", // emerald-500 - "#06b6d4", // cyan-500 - "#8b5cf6", // violet-500 - "#f59e0b", // amber-500 - "#ec4899", // pink-500 - "#f43f5e", // rose-500 - "#14b8a6", // teal-500 - "#3b82f6", // blue-500 -]; +import { + displayedSpanLabel, + formatClock, + regionColorMap, + useContainerSize, +} from "./chartUtils"; +import { ProvisioningHistoryChart } from "./ProvisioningHistoryChart"; // Turn total_cpu_millicores into a human-readable CPU-core count. // 1000 millicores = 1 full core. k-suffix for anything above ~10k. @@ -141,14 +135,18 @@ export function WorkersPanel() { [data?.byRegion], ); const { rows: chartRows, regions: chartRegions } = useChartData(samples, currentOrder); - // Color is keyed by region name (alphabetical), not display index, so a - // region keeps the same color even when worker counts reorder the legend. + // Share one color map across this chart and the sibling provisioning chart, + // built off the union of both charts' regions so a region reads the same + // color in both. Keyed by sorted name (not display index) so colors don't + // shift when counts reorder a legend. 
+ const provisioningHistory = useProvisioningHistory(); const colorByRegion = useMemo(() => { - const sorted = [...chartRegions].sort(); - const map = new Map(); - sorted.forEach((r, i) => map.set(r, REGION_COLORS[i % REGION_COLORS.length])); - return map; - }, [chartRegions]); + const union = new Set(chartRegions); + for (const sample of provisioningHistory.data?.samples ?? []) { + for (const region of Object.keys(sample.regions)) union.add(region); + } + return regionColorMap([...union]); + }, [chartRegions, provisioningHistory.data]); return (
@@ -156,11 +154,6 @@ export function WorkersPanel() {

Workers

- - {samples.length > 1 - ? `${samples.length} samples · ${displayedSpanLabel(samples)}` - : "history warming up"} -
{isLoading &&
loading…
} @@ -174,8 +167,20 @@ export function WorkersPanel() { {data.healthy}
-
- {samples.length > 1 && chartSize && chartRegions.length > 0 ? ( +
+
+
+

+ Availability by region +

+ + {samples.length > 1 + ? `${samples.length} samples · ${displayedSpanLabel(samples)}` + : "history warming up"} + +
+
+ {samples.length > 1 && chartSize && chartRegions.length > 0 ? ( ))} - ) : ( -
- history warming up — samples collected every 30s, chart appears once - we have two points + ) : ( +
+ history warming up — the canary samples workers every 60s, chart + appears once we have two points +
+ )}
- )} +
+
)} diff --git a/infra/status-page/web/src/components/chartUtils.ts b/infra/status-page/web/src/components/chartUtils.ts index 5aad47f0a1..de6e6df34c 100644 --- a/infra/status-page/web/src/components/chartUtils.ts +++ b/infra/status-page/web/src/components/chartUtils.ts @@ -1,5 +1,29 @@ import { useCallback, useEffect, useState } from "react"; +// Palette for per-region chart lines, picked for contrast on the dark +// background; cycles if there are more regions than entries. Shared by the +// worker-availability and provisioning charts so a region keeps one color +// across both. +export const REGION_COLORS = [ + "#10b981", // emerald-500 + "#06b6d4", // cyan-500 + "#8b5cf6", // violet-500 + "#f59e0b", // amber-500 + "#ec4899", // pink-500 + "#f43f5e", // rose-500 + "#14b8a6", // teal-500 + "#3b82f6", // blue-500 +]; + +// Map region name → stable color. Keyed off the alphabetically-sorted region +// set (not display order) so a region's color doesn't shift when counts +// reorder a legend, and so both charts agree on the same region's color. +export function regionColorMap(regions: string[]): Map { + const map = new Map(); + [...regions].sort().forEach((region, i) => map.set(region, REGION_COLORS[i % REGION_COLORS.length])); + return map; +} + // Human "N{s,m,h,d} ago" for an ISO timestamp. Non-parseable input is // echoed back. Shared by every panel's "updated …" / "collected …" line. 
export function formatRelative(iso: string): string { diff --git a/infra/status-page/web/src/hooks/useProvisioningHistory.ts b/infra/status-page/web/src/hooks/useProvisioningHistory.ts new file mode 100644 index 0000000000..dae0ddd85c --- /dev/null +++ b/infra/status-page/web/src/hooks/useProvisioningHistory.ts @@ -0,0 +1,18 @@ +import { useQuery } from "@tanstack/react-query"; +import { useAtomValue } from "jotai"; +import { fetchProvisioningHistory } from "../api"; +import { autoRefreshAtom } from "../state"; + +// Provisioning gauges turn over every 15min and the server shields finelog +// behind a 60s TTL, so a 60s refetch is plenty. +const REFETCH_INTERVAL_MS = 60_000; + +export function useProvisioningHistory() { + const autoRefresh = useAtomValue(autoRefreshAtom); + return useQuery({ + queryKey: ["provisioning", "history"], + queryFn: fetchProvisioningHistory, + refetchInterval: autoRefresh ? REFETCH_INTERVAL_MS : false, + staleTime: 30_000, + }); +} diff --git a/infra/status-page/web/src/hooks/useWorkersHistory.ts b/infra/status-page/web/src/hooks/useWorkersHistory.ts index cec2e57e00..bd6d93f1c8 100644 --- a/infra/status-page/web/src/hooks/useWorkersHistory.ts +++ b/infra/status-page/web/src/hooks/useWorkersHistory.ts @@ -3,9 +3,9 @@ import { useAtomValue } from "jotai"; import { fetchWorkersHistory } from "../api"; import { autoRefreshAtom } from "../state"; -// Refetches on the same cadence as the server sampler — any faster is -// wasted traffic since the ring buffer only gains a new point every 30s. -const REFETCH_INTERVAL_MS = 30_000; +// The canary writes a new worker_healthy point every 60s and the server +// shields finelog behind a 60s TTL, so anything faster is wasted traffic. +const REFETCH_INTERVAL_MS = 60_000; export function useWorkersHistory() { const autoRefresh = useAtomValue(autoRefreshAtom);