admin/live: cancel sessions by worker id (fix cross-CP cancel) (#855)

fuziontech · claude · web-flow · commit e60fecc51bee · 2026-06-30T20:22:11.000-07:00
* admin/live: cancel sessions by worker id so cross-CP cancel works The Live view's Cancel button failed with "no active session with pid N": KillSession scans only the SERVING replica's stacks, but behind the admin ALB the session usually lives on a different CP. And fanning the pid-based cancel out is unsafe — pids collide across CPs (each CP's counter starts at 1000), so a pid fan-out could kill the wrong replica's session. Fix: cancel by the CLUSTER-UNIQUE worker id (the key the detail view already uses), with fan-out — the same collision-safe pattern. - KillSessionByWorkerID(wid): destroys the session on that worker on this replica (0/1), located via SessionForWorker. - POST /sessions/by-worker/:wid/cancel: kills locally, and only if this replica didn't own it, fans out to peers (?scope=local recursion guard) and sums the killed count; 404 only if no replica owns the worker. The old pid route stays (local-only, documented). - UI: the query-row, session-row, and detail-dialog Cancel buttons all address by worker_id now. - Tests: TestCancelByWorkerFansOut (local hit skips fan-out, peer-owned via fan-out, scope=local no-recursion, unknown→404); harness admin_cancel_by_worker kills a real session by worker id + asserts unknown→404. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01NUq2EVxvKQFq3YEDNLF5HP * admin/live: address review nits on cancel-by-worker - Drop cp_responders/cp_total from the by-worker cancel response: a worker is owned by exactly one CP, so non-owning peers 404 (dropped by the fetcher) and the coverage count would undercount and mislead. It's a single-owner op — return just {killed}. (The per-user kill keeps coverage; it IS an aggregate.) - harness admin_cancel_by_worker: hold the session (sleep) longer than the appear-poll budget so a slow cold-start can't exit the client before the session is observed (still cancels well before the 60s idle timeout). 
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01NUq2EVxvKQFq3YEDNLF5HP --------- Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/controlplane/admin/README.md b/controlplane/admin/README.md
@@ -64,7 +64,8 @@ Added for the console:
 | `GET /api/v1/sessions`, `/workers` | viewer | live sessions / session-holding workers |
 | `GET /api/v1/workers/fleet` | viewer | cluster worker counts by lifecycle state |
 | `GET /api/v1/cluster/instances` | viewer | live CP replicas (self-flagged) |
-| `POST /api/v1/sessions/:pid/cancel` | admin | tear down a session + its worker |
+| `POST /api/v1/sessions/:pid/cancel` | admin | tear down a session by pid — LOCAL only (pid is per-CP); prefer the worker-id form |
+| `POST /api/v1/sessions/by-worker/:wid/cancel` | admin | tear down the session on a cluster-unique worker id; fans out to whichever CP owns it (pid can't be fanned out — it collides across CPs). Returns `{killed}`; 404 if no replica owns the worker |
 | `POST /api/v1/orgs/:id/users/:username/kill` | admin | per-user kill switch (one-shot): tear down ALL of a user's sessions + in-flight queries cluster-wide. Returns `{killed, cp_responders, cp_total}`. Does NOT block reconnects |
 | `POST /api/v1/orgs/:id/users/:username/disable` | admin | persist `disabled=true` (refused at connect on PG wire + Flight), reload the snapshot cluster-wide so the block is immediate, AND kill the user's live sessions. Returns `{disabled, killed, …}` |
 | `POST /api/v1/orgs/:id/users/:username/enable` | admin | persist `disabled=false` + reload cluster-wide so the user can reconnect at once |
diff --git a/controlplane/admin/live.go b/controlplane/admin/live.go
@@ -5,6 +5,7 @@ package admin
 import (
 	"encoding/json"
 	"errors"
+	"fmt"
 	"net/http"
 	"net/url"
 	"strconv"
@@ -102,8 +103,14 @@ type LiveInfo interface {
 	// ControlPlaneInstances returns the live CP replicas.
 	ControlPlaneInstances() ([]CPInstance, error)
 	// KillSession tears down the session (and its exclusive worker) for pid.
-	// Returns an error if no such session exists.
+	// Returns an error if no such session exists. NOTE: pid is per-CP, not
+	// cluster-unique — prefer KillSessionByWorkerID for cross-CP cancel.
 	KillSession(pid int32) error
+	// KillSessionByWorkerID tears down the session bound to the cluster-unique
+	// worker id on THIS replica and returns the number destroyed (0 or 1). The
+	// handler fans it out so a cancel hits whichever replica owns the session —
+	// pid can't be used for the fan-out because pids collide across CPs.
+	KillSessionByWorkerID(workerID int) int
 	// KillUserSessions tears down every active session for (orgID, username) owned
 	// by THIS control-plane replica and returns the count destroyed. The handler
 	// fans it out to peers so the kill is cluster-wide. 0 (not an error) when the
@@ -230,7 +237,9 @@ func registerLiveAPI(r *gin.RouterGroup, live LiveInfo, fetcher PeerFetcher, use
 		c.JSON(http.StatusOK, gin.H{"instances": instances})
 	})
 
-	// Kill a session (admin-only via RoleGate — it's a POST).
+	// Kill a session by pid (admin-only via RoleGate — it's a POST). LOCAL ONLY:
+	// pid is per-CP, so this only finds a session the serving replica owns.
+	// Prefer /sessions/by-worker/:wid/cancel, which fans out.
 	r.POST("/sessions/:pid/cancel", func(c *gin.Context) {
 		pid64, err := strconv.ParseInt(c.Param("pid"), 10, 32)
 		if err != nil {
@@ -244,6 +253,34 @@ func registerLiveAPI(r *gin.RouterGroup, live LiveInfo, fetcher PeerFetcher, use
 		c.JSON(http.StatusOK, gin.H{"killed": pid64})
 	})
 
+	// Kill a session addressed by CLUSTER-UNIQUE worker id (admin-only POST).
+	// A session lives on exactly one CP, so this kills locally and, only if that
+	// found nothing and this isn't a ?scope=local peer call (the recursion guard),
+	// fans out to peer replicas. Worker id — not pid — is the address because pids
+	// collide across CPs, so a pid fan-out could kill the wrong replica's session.
+	r.POST("/sessions/by-worker/:wid/cancel", func(c *gin.Context) {
+		wid, err := strconv.Atoi(c.Param("wid"))
+		if err != nil {
+			c.JSON(http.StatusBadRequest, gin.H{"error": "invalid worker id"})
+			return
+		}
+		killed := live.KillSessionByWorkerID(wid)
+		// A worker hosts exactly one session on exactly one CP: only fan out when
+		// this replica didn't own it (avoids needless peer POSTs on the hit path).
+		// This is a single-owner op, so — unlike the per-user kill — cp coverage
+		// isn't meaningful (non-owning peers 404): the response is just {killed}.
+		if killed == 0 && !localScope(c) && fetcher != nil {
+			bodies, _ := fetcher.PostPeers(c.Request.Context(), "/api/v1/sessions/by-worker/"+strconv.Itoa(wid)+"/cancel")
+			k, _ := sumKilled(bodies)
+			killed += k
+		}
+		if killed == 0 {
+			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("no active session on worker %d", wid)})
+			return
+		}
+		c.JSON(http.StatusOK, gin.H{"killed": killed})
+	})
+
 	// Per-user kill switch (admin-only via RoleGate — all POSTs). A user's
 	// sessions live on whichever CP replica owns each connection, so every action
 	// kills locally and fans out to peers (?scope=local stops the recursion); the
diff --git a/controlplane/admin/live_aggregate_test.go b/controlplane/admin/live_aggregate_test.go
@@ -24,6 +24,9 @@ type fakeLiveInfo struct {
 
 	killedPerUser int            // returned by KillUserSessions (local kill count)
 	killUserCalls []killUserCall // recorded (org, user) of each KillUserSessions
+
+	killByWorkerReturn int   // returned by KillSessionByWorkerID (local kill count)
+	killByWorkerCalls  []int // recorded worker ids
 }
 
 type killUserCall struct{ org, user string }
@@ -38,6 +41,10 @@ func (f *fakeLiveInfo) QueryDetailForWorkerID(wid int) (QueryDetail, bool) {
 func (f *fakeLiveInfo) WorkerFleet() ([]FleetStat, error)            { return nil, nil }
 func (f *fakeLiveInfo) ControlPlaneInstances() ([]CPInstance, error) { return nil, nil }
 func (f *fakeLiveInfo) KillSession(int32) error                      { return nil }
+func (f *fakeLiveInfo) KillSessionByWorkerID(wid int) int {
+	f.killByWorkerCalls = append(f.killByWorkerCalls, wid)
+	return f.killByWorkerReturn
+}
 func (f *fakeLiveInfo) KillUserSessions(org, user string) int {
 	f.killUserCalls = append(f.killUserCalls, killUserCall{org, user})
 	return f.killedPerUser
diff --git a/controlplane/admin/live_test.go b/controlplane/admin/live_test.go
@@ -124,3 +124,56 @@ func TestQueryDetailWorkerIDDistinguishesCollidingPIDs(t *testing.T) {
 		t.Fatalf("worker 22 should resolve to org-b's query despite the shared pid, got %d %+v", w.Code, b)
 	}
 }
+
+// TestCancelByWorkerFansOut covers the collision-safe cross-CP cancel: the
+// serving replica kills locally if it owns the worker, else fans out to peers
+// (scope=local guard), and 404s only if no replica owns it.
+func TestCancelByWorkerFansOut(t *testing.T) {
+	// Owned locally → killed without fan-out.
+	local := &fakeLiveInfo{killByWorkerReturn: 1}
+	fetcher := &fakePeerFetcher{}
+	r := liveTestRouter(local, fetcher)
+	w := httptest.NewRecorder()
+	r.ServeHTTP(w, httptest.NewRequest(http.MethodPost, "/api/v1/sessions/by-worker/77/cancel", nil))
+	if w.Code != http.StatusOK {
+		t.Fatalf("locally-owned cancel: got %d (%s)", w.Code, w.Body.String())
+	}
+	if len(local.killByWorkerCalls) != 1 || local.killByWorkerCalls[0] != 77 {
+		t.Fatalf("KillSessionByWorkerID not called with 77: %v", local.killByWorkerCalls)
+	}
+	if fetcher.postCallCount() != 0 {
+		t.Fatalf("owned locally should not fan out, but PostPeers ran")
+	}
+
+	// Not local → fan out; a peer reports killed:1 → success.
+	local2 := &fakeLiveInfo{killByWorkerReturn: 0}
+	peerBody, _ := json.Marshal(map[string]any{"killed": 1})
+	fetcher2 := &fakePeerFetcher{postByPath: map[string][][]byte{"/api/v1/sessions/by-worker/88/cancel": {peerBody}}}
+	r2 := liveTestRouter(local2, fetcher2)
+	w = httptest.NewRecorder()
+	r2.ServeHTTP(w, httptest.NewRequest(http.MethodPost, "/api/v1/sessions/by-worker/88/cancel", nil))
+	if w.Code != http.StatusOK {
+		t.Fatalf("peer-owned cancel via fan-out: got %d (%s)", w.Code, w.Body.String())
+	}
+
+	// scope=local (a peer answering us): kill locally only, NO recursion.
+	before := fetcher2.postCallCount()
+	w = httptest.NewRecorder()
+	r2.ServeHTTP(w, httptest.NewRequest(http.MethodPost, "/api/v1/sessions/by-worker/88/cancel?scope=local", nil))
+	if w.Code != http.StatusNotFound {
+		t.Fatalf("scope=local with no local session should 404, got %d", w.Code)
+	}
+	if fetcher2.postCallCount() != before {
+		t.Fatalf("scope=local must not fan out")
+	}
+
+	// Nobody owns it anywhere → 404.
+	local3 := &fakeLiveInfo{killByWorkerReturn: 0}
+	fetcher3 := &fakePeerFetcher{}
+	r3 := liveTestRouter(local3, fetcher3)
+	w = httptest.NewRecorder()
+	r3.ServeHTTP(w, httptest.NewRequest(http.MethodPost, "/api/v1/sessions/by-worker/99/cancel", nil))
+	if w.Code != http.StatusNotFound {
+		t.Fatalf("unknown worker cancel should 404, got %d", w.Code)
+	}
+}
diff --git a/controlplane/admin/ui/src/components/QueryDetailDialog.tsx b/controlplane/admin/ui/src/components/QueryDetailDialog.tsx
@@ -34,7 +34,7 @@ export function QueryDetailDialog({
 }: {
   workerId: number | null;
   onClose: () => void;
-  onCancel: (pid: number) => void;
+  onCancel: (workerId: number) => void;
 }) {
   const detail = useQueryDetail(workerId);
   const d = detail.data;
@@ -115,7 +115,7 @@ export function QueryDetailDialog({
                 variant="ghost"
                 size="sm"
                 onClick={() => {
-                  onCancel(d.pid);
+                  onCancel(d.worker_id);
                   onClose();
                 }}
               >
diff --git a/controlplane/admin/ui/src/hooks/useApi.ts b/controlplane/admin/ui/src/hooks/useApi.ts
@@ -267,7 +267,7 @@ export function useSessions() {
 export function useCancelSession() {
   const qc = useQueryClient();
   return useMutation({
-    mutationFn: (pid: number) => api.cancelSession(pid),
+    mutationFn: (workerId: number) => api.cancelSession(workerId),
     onSuccess: () => {
       qc.invalidateQueries({ queryKey: ["sessions"] });
       qc.invalidateQueries({ queryKey: ["queries"] });
diff --git a/controlplane/admin/ui/src/lib/api.ts b/controlplane/admin/ui/src/lib/api.ts
@@ -151,7 +151,10 @@ export const api = {
   listQueries: () => get<{ queries: RunningQuery[] }>("/queries").then((r) => r.queries ?? []),
   // Detail is addressed by cluster-unique worker id (pid is per-org, not unique).
   queryDetail: (workerId: number) => get<QueryDetail>(`/queries/by-worker/${workerId}`),
-  cancelSession: (pid: number) => post<{ killed: number }>(`/sessions/${pid}/cancel`, {}),
+  // Cancel is addressed by cluster-unique worker id (pid is per-CP and collides
+  // across replicas); the server fans out to whichever CP owns the session.
+  cancelSession: (workerId: number) =>
+    post<{ killed: number }>(`/sessions/by-worker/${workerId}/cancel`, {}),
   // Per-user kill switch. killUser terminates all of a user's sessions + queries
   // (one-shot); disableUser also persists a block so new connections are refused
   // until enableUser. All three fan out across CP replicas server-side.
diff --git a/controlplane/admin/ui/src/pages/Live.tsx b/controlplane/admin/ui/src/pages/Live.tsx
@@ -219,7 +219,7 @@ export function Live() {
                               disabled={cancel.isPending}
                               onClick={(e) => {
                                 e.stopPropagation();
-                                cancel.mutate(q.pid);
+                                cancel.mutate(q.worker_id);
                               }}
                             >
                               <Ban className="h-4 w-4 text-destructive" /> Cancel
@@ -272,7 +272,7 @@ export function Live() {
                             size="sm"
                             className="-my-1 h-6"
                             disabled={cancel.isPending}
-                            onClick={() => cancel.mutate(s.pid)}
+                            onClick={() => cancel.mutate(s.worker_id)}
                           >
                             <Ban className="h-4 w-4 text-destructive" /> Cancel
                           </Button>
@@ -289,7 +289,7 @@ export function Live() {
       <QueryDetailDialog
         workerId={detailWid}
         onClose={() => setDetailWid(null)}
-        onCancel={(pid) => cancel.mutate(pid)}
+        onCancel={(workerId) => cancel.mutate(workerId)}
       />
     </>
   );
diff --git a/controlplane/admin_providers.go b/controlplane/admin_providers.go
@@ -177,6 +177,21 @@ func (p *clusterInfoProvider) KillSession(pid int32) error {
 	return fmt.Errorf("no active session with pid %d", pid)
 }
 
+// KillSessionByWorkerID tears down the session bound to the cluster-unique
+// worker id on this replica, returning the count destroyed (0 or 1 — one
+// session per worker). The admin handler fans this out so a cancel reaches
+// whichever CP owns the session; addressing by worker id (not the per-CP pid,
+// which collides across CPs) makes that fan-out collision-safe.
+func (p *clusterInfoProvider) KillSessionByWorkerID(workerID int) int {
+	for _, stack := range p.router.AllStacks() {
+		if s, ok := stack.Sessions.SessionForWorker(workerID); ok {
+			stack.Sessions.DestroySession(s.PID)
+			return 1
+		}
+	}
+	return 0
+}
+
 // KillUserSessions tears down every active session for (orgID, username) owned by
 // THIS control-plane replica and returns the count destroyed. It is the local
 // half of the cluster-wide per-user kill switch: the admin handler fans this out
diff --git a/tests/e2e-mw-dev/harness.sh b/tests/e2e-mw-dev/harness.sh
@@ -1604,6 +1604,52 @@ admin_idle_session_flagged() { # org password
   esac
 }
 
+# ---- admin: cancel a session by (cluster-unique) worker id ------------------
+# The Live view's Cancel button posts /sessions/by-worker/:wid/cancel. Worker id
+# (not the per-CP pid) is the address, and the handler kills locally or fans out
+# to whichever CP owns the session — so cancel works regardless of which replica
+# the request lands on. (The cross-CP fan-out itself is unit-tested in
+# TestCancelByWorkerFansOut; the CI CP is single-replica, so this covers the
+# real kill + the unknown→404 path end-to-end.)
+admin_cancel_by_worker() { # org password
+  org="$1"; pw="$2"
+  log "admin: cancel a live session by worker id on $org"
+  # Hold longer than the appear-poll budget (30×2s) so a slow cold-start can't
+  # exit the client before the session is observed (we cancel it well before
+  # the 60s idle timeout anyway).
+  ( printf 'BEGIN;\n'; sleep 90 ) | PGPASSWORD="$pw" psql \
+      "sslmode=require host=$org$SNI_SUFFIX hostaddr=$CP_IP port=5432 user=root dbname=ducklake" \
+      -v ON_ERROR_STOP=1 -qtA >/dev/null 2>&1 &
+  bg=$!
+  cleanup_cbw() { kill "$bg" 2>/dev/null || true; wait "$bg" 2>/dev/null || true; }
+  wid="" a=0
+  while [ "$a" -lt 30 ]; do
+    kill -0 "$bg" 2>/dev/null || break
+    wid="$(curl -fsS -H "$H" "$API/api/v1/queries" \
+      | jq -r --arg o "$org" 'first(.queries[]? | select(.org==$o and .user=="root")) | .worker_id // empty')"
+    [ -n "$wid" ] && break
+    sleep 2; a=$((a + 1))
+  done
+  [ -n "$wid" ] || { cleanup_cbw; fail "admin_cancel_by_worker: session never appeared for $org"; }
+  # Cancel it by worker id → killed>=1.
+  resp="$(curl -fsS -H "$H" -X POST "$API/api/v1/sessions/by-worker/$wid/cancel")" \
+    || { cleanup_cbw; fail "admin_cancel_by_worker: POST cancel failed for worker $wid"; }
+  echo "$resp" | jq -e '.killed >= 1' >/dev/null \
+    || { cleanup_cbw; fail "admin_cancel_by_worker: cancel did not kill worker $wid: $resp"; }
+  # The session must disappear from /queries.
+  gone="" a=0
+  while [ "$a" -lt 15 ]; do
+    [ "$(curl -fsS -H "$H" "$API/api/v1/queries" | jq -r --argjson w "$wid" 'any(.queries[]?; .worker_id==$w)')" = "false" ] && { gone=1; break; }
+    sleep 2; a=$((a + 1))
+  done
+  cleanup_cbw
+  [ -n "$gone" ] || fail "admin_cancel_by_worker: session on worker $wid still present after cancel"
+  # Unknown worker → 404 (not a 500).
+  code="$(curl -s -o /dev/null -w '%{http_code}' -H "$H" -X POST "$API/api/v1/sessions/by-worker/999999999/cancel")"
+  [ "$code" = "404" ] || fail "admin_cancel_by_worker: unknown worker cancel returned $code, want 404"
+  log "admin: cancel by worker id OK (killed worker $wid, unknown→404) on $org"
+}
+
 admin_query_detail() { # org password
   org="$1"; pw="$2"
   log "admin live: per-query detail round-trip on $org"
@@ -2099,6 +2145,9 @@ main() {
   # ---- admin live-query detail view (phase 1) — cnpg stack is warm now ----
   admin_query_detail "$CNPG" "$cnpg_pw"
 
+  # ---- admin: cancel a live session by worker id (cross-CP addressed) ----
+  admin_cancel_by_worker "$CNPG" "$cnpg_pw"
+
   # ---- admin live: idle-in-transaction session is flagged (state column) ----
   admin_idle_session_flagged "$CNPG" "$cnpg_pw"