Skip to content

Commit 2cfb1d3

Browse files
authored
Merge pull request #7746 from elizaOS/feat/agent-delete-via-job-queue
2 parents 15fca07 + fdb3ac0 commit 2cfb1d3

9 files changed

Lines changed: 663 additions & 118 deletions

File tree

packages/cloud-api/v1/eliza/agents/[agentId]/route.ts

Lines changed: 47 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import { getPreferredElizaAgentWebUiUrl } from "@/lib/eliza-agent-web-ui";
1818
import { adminService } from "@/lib/services/admin";
1919
import { reusesExistingElizaCharacter } from "@/lib/services/eliza-agent-config";
2020
import { elizaSandboxService } from "@/lib/services/eliza-sandbox";
21+
import { provisioningJobService } from "@/lib/services/provisioning-jobs";
2122
import { getStewardAgent } from "@/lib/services/steward-client";
2223
import type {
2324
AgentAdminDetailsDto,
@@ -364,63 +365,63 @@ app.delete("/", async (c) => {
364365
return c.json({ success: false, error: "Agent not found" }, 404);
365366
}
366367

367-
if (existing.node_id && existing.sandbox_id) {
368-
const forwarded = await deleteDockerBackedAgentViaControlPlane(
369-
c,
370-
user,
371-
agentId,
368+
if (existing.status === "provisioning") {
369+
return c.json(
370+
{ success: false, error: "Agent provisioning is in progress" },
371+
409,
372372
);
373-
if (forwarded) return forwarded;
374373
}
375374

376-
const deleted = await elizaSandboxService.deleteAgent(
375+
// Async delete via the same job-queue path agent_provision uses. This
376+
// moves the SSH stop, Neon cleanup, and per-agent key revoke off the
377+
// request thread so a slow / unreachable Hetzner core can no longer
378+
// make the API hang or silently return 200 while the container lives
379+
// on. Idempotent: a second DELETE while a job is in flight reuses
380+
// the existing one.
381+
const enqueueResult = await provisioningJobService.enqueueAgentDeleteOnce({
377382
agentId,
378-
user.organization_id,
379-
);
380-
if (!deleted.success) {
381-
const status =
382-
deleted.error === "Agent not found"
383-
? 404
384-
: deleted.error === "Agent provisioning is in progress"
385-
? 409
386-
: 500;
387-
return c.json({ success: false, error: deleted.error }, status);
388-
}
389-
390-
const characterId = deleted.deletedSandbox.character_id;
391-
const reusesExistingCharacter = reusesExistingElizaCharacter(
392-
deleted.deletedSandbox.agent_config,
393-
);
383+
organizationId: user.organization_id,
384+
userId: user.id,
385+
});
394386

395-
if (characterId && !reusesExistingCharacter) {
396-
try {
397-
await userCharactersRepository.delete(characterId);
398-
logger.info("[agent-api] Cleaned up linked character after delete", {
399-
agentId,
400-
characterId,
401-
});
402-
} catch (characterErr) {
403-
logger.warn(
404-
"[agent-api] Failed to clean up linked character after delete",
405-
{
406-
agentId,
407-
characterId,
408-
error:
409-
characterErr instanceof Error
410-
? characterErr.message
411-
: String(characterErr),
412-
},
413-
);
414-
}
415-
}
387+
// Best-effort wake of the worker so the user does not wait for the
388+
// next cron tick. Same pattern as the provision path.
389+
void provisioningJobService.triggerImmediate(c.env).catch(() => {
390+
// Logged inside the service; nothing actionable here.
391+
});
416392

417-
logger.info("[agent-api] Agent deleted", {
393+
logger.info("[agent-api] Agent delete enqueued", {
418394
agentId,
419395
orgId: user.organization_id,
396+
jobId: enqueueResult.job.id,
397+
created: enqueueResult.created,
420398
});
421399

422-
return c.json({ success: true });
400+
return c.json(
401+
{
402+
success: true,
403+
created: enqueueResult.created,
404+
alreadyInProgress: !enqueueResult.created,
405+
message: enqueueResult.created
406+
? "Delete job created. Poll the job endpoint for status."
407+
: "Delete is already in progress.",
408+
data: {
409+
jobId: enqueueResult.job.id,
410+
agentId,
411+
status: enqueueResult.job.status,
412+
},
413+
polling: {
414+
endpoint: `/api/v1/jobs/${enqueueResult.job.id}`,
415+
intervalMs: 5_000,
416+
expectedDurationMs: 30_000,
417+
},
418+
},
419+
202,
420+
);
423421
} catch (error) {
422+
if (error instanceof Error && error.message === "Agent not found") {
423+
return c.json({ success: false, error: "Agent not found" }, 404);
424+
}
424425
logger.error("[agent-api] DELETE /agents/:agentId error", { error });
425426
return failureResponse(c, error);
426427
}

packages/cloud-shared/src/db/schemas/agent-sandboxes.ts

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,21 @@ export type AgentSandboxStatus =
5454
| "running"
5555
| "stopped"
5656
| "disconnected"
57-
| "error";
57+
| "error"
58+
/**
59+
* Row is queued for async deletion. An `agent_delete` job has been
60+
* enqueued in `jobs`; the provisioning worker will SSH the core, stop
61+
* the container, and then DELETE the row. UI must treat this as
62+
* "soon-to-be-gone" — no mutations should be accepted while in this
63+
* state.
64+
*/
65+
| "deletion_pending"
66+
/**
67+
* Async deletion exhausted retries (e.g. SSH unreachable for the core
68+
* hosting this sandbox). The container may still be running on the
69+
* core; ops must investigate. Row stays so the failure is visible.
70+
*/
71+
| "deletion_failed";
5872

5973
export type AgentBillingStatus = "active" | "warning" | "suspended" | "shutdown_pending" | "exempt";
6074

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/**
2+
* Covers the substring matcher that decides whether a failed `docker stop`
3+
* / `docker rm` error indicates the container is already absent. This is
4+
* the pivot of the prod fix shipped in PR #BIG — without it, both-calls-
5+
* failed used to silently leave zombie containers on the cores; with it,
6+
* both-calls-failed throws only when the failures are unrelated to "gone".
7+
*/
8+
import { describe, expect, test } from "bun:test";
9+
import { isAlreadyGoneMessage } from "../docker-error-classifier";
10+
11+
describe("isAlreadyGoneMessage", () => {
  /**
   * One entry per test: the title shown by the runner, the messages fed to
   * the classifier, and the verdict every one of those messages must get.
   */
  const scenarios: ReadonlyArray<{
    title: string;
    messages: readonly string[];
    expected: boolean;
  }> = [
    {
      title: 'recognizes "No such container" (Docker 24)',
      messages: ["Error response from daemon: No such container: agent-abc123"],
      expected: true,
    },
    {
      title: 'recognizes "not found" (older Docker)',
      messages: ["Container not found: agent-abc"],
      expected: true,
    },
    {
      title: 'recognizes "already gone"',
      messages: ["container already gone before stop"],
      expected: true,
    },
    {
      title: 'recognizes "no longer exists"',
      messages: ["the named container no longer exists on host"],
      expected: true,
    },
    {
      title: "case-insensitive",
      messages: ["NO SUCH CONTAINER: AGENT-1"],
      expected: true,
    },
    {
      title: "returns false for SSH connection failure",
      messages: [
        "ssh: connect to host 138.201.80.125 port 22: Connection timed out",
      ],
      expected: false,
    },
    {
      title: "returns false for Docker daemon down",
      messages: [
        "Cannot connect to the Docker daemon at unix:///var/run/docker.sock",
      ],
      expected: false,
    },
    {
      title: "returns false for permission denied",
      messages: ["Permission denied (publickey)"],
      expected: false,
    },
    {
      title: "returns false for empty / unrelated text",
      messages: ["", "some unrelated error"],
      expected: false,
    },
  ];

  // Register the tests in table order; each message gets its own expectation.
  for (const { title, messages, expected } of scenarios) {
    test(title, () => {
      for (const message of messages) {
        expect(isAlreadyGoneMessage(message)).toBe(expected);
      }
    });
  }
});
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/**
2+
* Tiny dep-free helpers to classify errors returned by `docker` / SSH so
3+
* the rest of the sandbox provider can stay readable. Extracted from
4+
* `docker-sandbox-provider.ts` only so the helpers can be unit-tested
5+
* without pulling in plugin-sql / drizzle / @elizaos/core at import time.
6+
*/
7+
8+
/**
 * Lower-cased stderr shapes that contain "not found" but mean the `docker`
 * executable itself could not be launched. They say nothing about whether
 * the container exists, so they must never be read as "already gone".
 */
const MISSING_BINARY_PATTERNS: readonly RegExp[] = [
  /command not found/, // bash / zsh: "bash: docker: command not found"
  /executable file not found/, // exec lookup: 'exec: "docker": executable file not found in $PATH'
  /\bsh: (?:\d+: )?\S+: not found/, // dash / busybox sh: "sh: 1: docker: not found"
];

/**
 * Matches Docker / SSH error messages that mean "the thing we tried to
 * stop is no longer there". Used by `DockerSandboxProvider.stop()` to
 * treat both-calls-failed as success when the container was already gone
 * before we got the SSH window.
 *
 * Matching is a case-insensitive substring test because docker error
 * formatting drifts across versions. Recognized phrases: "no such
 * container", "not found", "already gone", "no longer exists".
 *
 * The generic "not found" phrase is rejected when the message is a
 * missing-binary error (see `MISSING_BINARY_PATTERNS`); otherwise a host
 * without a usable `docker` CLI would be reported as "container gone" and
 * a live container could be left behind.
 *
 * @param message Raw error text from the failed docker / SSH call.
 * @returns `true` when the text indicates the target is already absent,
 *   `false` for any other failure (including an empty message).
 */
export function isAlreadyGoneMessage(message: string): boolean {
  const normalized = message.toLowerCase();

  // Unambiguous phrases: these only ever describe a missing container.
  if (
    normalized.includes("no such container") ||
    normalized.includes("already gone") ||
    normalized.includes("no longer exists")
  ) {
    return true;
  }

  if (!normalized.includes("not found")) {
    return false;
  }

  // "not found" alone is ambiguous: accept it unless it is the shell or
  // exec layer complaining that the docker binary is missing.
  return !MISSING_BINARY_PATTERNS.some((pattern) => pattern.test(normalized));
}

0 commit comments

Comments
 (0)