Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions packages/cloud-api/v1/eliza/agents/[agentId]/resume/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,16 +173,26 @@ async function __hono_POST(
}

try {
// Use the dedicated `agent_resume` job type instead of
// `agent_provision`. The orchestrator's `executeResume` tries
// `docker start <existing-container>` first (fast path, ~5s) and
// falls back to a full re-provision only if the container is gone
// (daemon scrub, core eviction). Re-provisioning every resume is
// 60s+ and wasteful when the original container is still on disk.
const { job, created } =
await provisioningJobService.enqueueAgentProvisionOnce({
await provisioningJobService.enqueueAgentResumeOnce({
agentId,
organizationId: user.organization_id,
userId: user.id,
agentName: agent.agent_name ?? agentId,
webhookUrl,
expectedUpdatedAt: agent.updated_at,
});

// Best-effort wake of the orchestrator so the user does not wait for
// the next cron tick. Same pattern as provision/delete/suspend.
void provisioningJobService.triggerImmediate().catch(() => {
// Logged inside the service; nothing actionable here.
});

return applyCorsHeaders(
Response.json(
{
Expand All @@ -195,7 +205,7 @@ async function __hono_POST(
jobId: job.id,
status: job.status,
message: created
? "Resume job created. Agent will restore from latest snapshot."
? "Resume job created. Container will be docker-started (fast path) or re-provisioned if gone."
: "Resume is already in progress.",
},
polling: {
Expand Down
69 changes: 41 additions & 28 deletions packages/cloud-api/v1/eliza/agents/[agentId]/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -232,43 +232,56 @@ app.patch("/", async (c) => {
});
}

const result = await elizaSandboxService.shutdown(
agentId,
user.organization_id,
);
if (!result.success) {
const status =
result.error === "Agent not found"
? 404
: result.error === "Agent provisioning is in progress"
? 409
: 400;
// Enqueue `agent_suspend` job — the orchestrator does the docker stop
// via SSH and flips the DB. Workers can't SSH the cores; the previous
// inline `shutdown()` path silently failed to stop the container and
// left a stale DB row claiming `stopped` while the container kept
// running. See suspend/route.ts for the same refactor.
if (agent.status === "provisioning") {
return c.json(
{
success: false,
error: result.error ?? `${parsed.data.action} failed`,
},
status,
{ success: false, error: "Agent provisioning is in progress" },
409,
);
}

logger.info(`[agent-api] Agent ${parsed.data.action} complete`, {
const enqueueResult = await provisioningJobService.enqueueAgentSuspendOnce({
agentId,
orgId: user.organization_id,
organizationId: user.organization_id,
userId: user.id,
});

return c.json({
success: true,
data: {
void provisioningJobService.triggerImmediate().catch(() => {
// Logged inside the service.
});

logger.info(
`[agent-api] Agent ${parsed.data.action} enqueued (suspend job)`,
{
agentId,
action: parsed.data.action,
message:
parsed.data.action === "shutdown"
? "Agent shutdown complete"
: "Agent suspended with snapshot. Use resume or provision to restart.",
previousStatus: agent.status,
orgId: user.organization_id,
jobId: enqueueResult.job.id,
created: enqueueResult.created,
},
});
);

return c.json(
{
success: true,
created: enqueueResult.created,
alreadyInProgress: !enqueueResult.created,
data: {
agentId,
action: parsed.data.action,
jobId: enqueueResult.job.id,
status: enqueueResult.job.status,
message: enqueueResult.created
? `${parsed.data.action} job created. Poll the job endpoint for status.`
: `${parsed.data.action} is already in progress.`,
previousStatus: agent.status,
},
},
202,
);
} catch (error) {
logger.error("[agent-api] PATCH /agents/:agentId error", { error });
return failureResponse(c, error);
Expand Down
83 changes: 52 additions & 31 deletions packages/cloud-api/v1/eliza/agents/[agentId]/suspend/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { Hono } from "hono";
import { errorToResponse } from "@/lib/api/errors";
import { requireAuthOrApiKeyWithOrg } from "@/lib/auth";
import { elizaSandboxService } from "@/lib/services/eliza-sandbox";
import { provisioningJobService } from "@/lib/services/provisioning-jobs";
import { applyCorsHeaders, handleCorsOptions } from "@/lib/services/proxy/cors";
import { logger } from "@/lib/utils/logger";
import type { AppEnv } from "@/types/cloud-worker-env";
Expand All @@ -11,14 +12,20 @@ const CORS_METHODS = "POST, OPTIONS";
/**
* POST /api/v1/eliza/agents/[agentId]/suspend
*
* Gracefully suspend a running agent:
* 1. Takes a pre-shutdown snapshot (backup) of the agent's state
* 2. Stops and removes the Docker container
* 3. Updates status to "stopped" in DB
* Enqueues an `agent_suspend` job. The Hetzner orchestrator (which has SSH
* access to the cores) picks it up, runs `docker stop` on the container,
* flips the DB row to `stopped`, and clears `bridge_url`/`health_url`. The
* `sandbox_id` is retained so a later `agent_resume` job can `docker start`
* the same container without a full re-provision.
*
* The agent can be resumed later via POST /api/v1/eliza/agents/[agentId]/resume
* or POST /api/v1/eliza/agents/[agentId]/provision, which will restore from
* the latest backup automatically. The agent may resume on a different node.
* Previously this route called `elizaSandboxService.shutdown()` inline,
* which only worked from a Node sidecar — Cloudflare Workers can't SSH the
* Hetzner cores, so the inline path silently failed to stop the container
* and the DB row showed `stopped` while the container kept burning RAM.
*
* Returns 202 with the job id; clients poll `/api/v1/jobs/<id>` for the
* final status. Idempotent: a second suspend on the same agent while a job
* is in flight returns the existing job.
*/
async function __hono_POST(
request: Request,
Expand Down Expand Up @@ -62,43 +69,57 @@ async function __hono_POST(
);
}

const result = await elizaSandboxService.shutdown(
agentId,
user.organization_id,
);

if (!result.success) {
const status =
result.error === "Agent not found"
? 404
: result.error === "Agent provisioning is in progress"
? 409
: 500;
if (agent.status === "provisioning") {
return applyCorsHeaders(
Response.json(
{ success: false, error: result.error ?? "Suspend failed" },
{ status },
{
success: false,
error: "Agent provisioning is in progress",
},
{ status: 409 },
),
CORS_METHODS,
);
}

logger.info("[agent-api] Agent suspended", {
const enqueueResult = await provisioningJobService.enqueueAgentSuspendOnce({
agentId,
organizationId: user.organization_id,
userId: user.id,
});

// Best-effort wake of the orchestrator so the user does not wait for the
// next cron tick. Same pattern as provision + delete.
void provisioningJobService.triggerImmediate().catch(() => {
// Logged inside the service; nothing actionable here.
});

logger.info("[agent-api] Agent suspend enqueued", {
agentId,
orgId: user.organization_id,
jobId: enqueueResult.job.id,
created: enqueueResult.created,
});

return applyCorsHeaders(
Response.json({
success: true,
data: {
agentId,
action: "suspend",
message:
"Agent suspended with snapshot. Use resume or provision to restart.",
previousStatus: agent.status,
Response.json(
{
success: true,
created: enqueueResult.created,
alreadyInProgress: !enqueueResult.created,
message: enqueueResult.created
? "Suspend job created. Poll the job endpoint for status."
: "Suspend is already in progress.",
data: {
agentId,
action: "suspend",
jobId: enqueueResult.job.id,
status: enqueueResult.job.status,
previousStatus: agent.status,
},
},
}),
{ status: 202 },
),
CORS_METHODS,
);
} catch (error) {
Expand Down
124 changes: 124 additions & 0 deletions packages/cloud-shared/src/lib/services/eliza-sandbox.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2703,6 +2703,130 @@ export class ElizaSandboxService {
return result;
}

/**
 * Handles an `agent_suspend` job on the daemon side.
 *
 * Everything runs inside one write transaction: the lifecycle lock is
 * taken first, the agent row is re-read under that lock, the sandbox is
 * stopped through the provider, and finally the row is marked `stopped`
 * with `bridge_url` / `health_url` cleared. `sandbox_id` is deliberately
 * left untouched so that a later `agent_resume` can start the same
 * container again.
 *
 * @param agentId - Id of the agent to suspend.
 * @param orgId - Organization the agent must belong to.
 * @returns `success: false` with an `error` when the agent is missing, is
 *   being provisioned (by status or by an active provision job), or the
 *   provider stop fails with a non-ignorable error. Otherwise
 *   `success: true`; `containerStopped` is `true` on every successful
 *   path, including "already stopped" and "no sandbox recorded".
 */
async executeSuspend(
  agentId: string,
  orgId: string,
): Promise<{ success: boolean; containerStopped: boolean; error?: string }> {
  type SuspendOutcome = { success: boolean; containerStopped: boolean; error?: string };

  // Shared shape for every refusal / failure result.
  const refuse = (error: string): SuspendOutcome => ({
    success: false,
    containerStopped: false,
    error,
  });
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  return await dbWrite.transaction(async (tx): Promise<SuspendOutcome> => {
    await this.lockLifecycle(tx, agentId, orgId);

    const agent = await this.getAgentForLifecycleMutation(tx, agentId, orgId);
    if (!agent) return refuse("Agent not found");

    // The job lookup always runs, even when the status alone would refuse.
    const provisionJobActive = await this.hasActiveProvisionJobTx(tx, agentId, orgId);
    if (provisionJobActive || agent.status === "provisioning") {
      return refuse("Agent provisioning is in progress");
    }

    // Nothing to do for an agent that is already stopped.
    if (agent.status === "stopped") return { success: true, containerStopped: true };

    const sandboxId = agent.sandbox_id;
    if (sandboxId) {
      try {
        const provider = await this.getProvider();
        await provider.stop(sandboxId);
      } catch (stopError) {
        // Only errors the service classifies as ignorable are treated as
        // "already stopped"; anything else aborts before the row is touched.
        if (!this.isIgnorableSandboxStopError(stopError)) {
          return refuse(describe(stopError));
        }
        logger.info("[agent-sandbox] Sandbox already absent during suspend", {
          sandboxId,
          error: describe(stopError),
        });
      }
    }

    await tx.execute(sql`
UPDATE ${agentSandboxes}
SET status = 'stopped', bridge_url = NULL, health_url = NULL, updated_at = NOW()
WHERE id = ${agent.id}
`);
    return { success: true, containerStopped: true };
  });
}

/**
 * Handles an `agent_resume` job on the daemon side.
 *
 * Fast path: when the row still has a `sandbox_id` and the provider exposes
 * a `start` method, the existing container is started and the row is
 * flipped to `running`. Slow path: if there is no `sandbox_id`, the
 * provider has no `start`, or starting throws, the agent goes through a
 * full `provision()`.
 *
 * Only a failure to *start the container* triggers the slow path. A failure
 * of the subsequent status write is allowed to propagate: the container is
 * already running at that point, and re-provisioning would risk a second
 * container for the same agent.
 *
 * NOTE(review): the fast path only sets `status` and `updated_at`. It does
 * not repopulate `bridge_url` / `health_url`, which `executeSuspend` sets
 * to NULL — confirm whether something else restores them.
 *
 * @param agentId - Id of the agent to resume.
 * @param orgId - Organization the agent must belong to.
 * @returns `containerStarted` is `true` whenever the agent ends up running;
 *   `reprovisioned` is `true` when the slow path was attempted; `error` is
 *   set for a missing agent or a failed provision.
 */
async executeResume(
  agentId: string,
  orgId: string,
): Promise<{
  success: boolean;
  containerStarted: boolean;
  reprovisioned: boolean;
  error?: string;
}> {
  const rec = await agentSandboxesRepository.findByIdAndOrg(agentId, orgId);
  if (!rec) {
    return {
      success: false,
      containerStarted: false,
      reprovisioned: false,
      error: "Agent not found",
    };
  }
  // Already running: report success without touching the container or row.
  if (rec.status === "running") {
    return { success: true, containerStarted: true, reprovisioned: false };
  }

  // Fast path: start the existing container.
  if (rec.sandbox_id) {
    let started = false;
    try {
      const provider = await this.getProvider();
      // `start` is probed at runtime, so a provider without it simply
      // falls through to the slow path.
      const start = (provider as unknown as { start?: (sandboxId: string) => Promise<void> })
        .start;
      if (typeof start === "function") {
        await start.call(provider, rec.sandbox_id);
        started = true;
      }
    } catch (e) {
      logger.warn(
        "[agent-sandbox] docker start failed during resume, falling back to provision",
        {
          agentId,
          sandboxId: rec.sandbox_id,
          error: e instanceof Error ? e.message : String(e),
        },
      );
    }

    if (started) {
      // Outside the try on purpose: a DB error here must not be mistaken
      // for a start failure and trigger a re-provision.
      await dbWrite.execute(sql`
UPDATE ${agentSandboxes} SET status = 'running', updated_at = NOW() WHERE id = ${rec.id}
`);
      return { success: true, containerStarted: true, reprovisioned: false };
    }
  }

  // Slow path: full re-provision.
  const provisionResult = await this.provision(agentId, orgId);
  if (!provisionResult.success) {
    return {
      success: false,
      containerStarted: false,
      reprovisioned: true,
      error: provisionResult.error,
    };
  }
  return { success: true, containerStarted: true, reprovisioned: true };
}
Comment thread
greptile-apps[bot] marked this conversation as resolved.

// Private helpers

private async lockLifecycle(tx: LifecycleTx, agentId: string, orgId: string): Promise<void> {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
/**
 * String identifiers for the kinds of provisioning job.
 *
 * Declared `as const` so each value keeps its literal string type, which
 * `ProvisioningJobType` below relies on.
 */
export const JOB_TYPES = {
AGENT_PROVISION: "agent_provision",
AGENT_DELETE: "agent_delete",
AGENT_SUSPEND: "agent_suspend",
AGENT_RESUME: "agent_resume",
} as const;

/**
 * Union of the literal values of `JOB_TYPES`:
 * `"agent_provision" | "agent_delete" | "agent_suspend" | "agent_resume"`.
 * Derived from the object, so adding a key there extends this type.
 */
export type ProvisioningJobType = (typeof JOB_TYPES)[keyof typeof JOB_TYPES];
Loading
Loading