Skip to content

Commit c4f2698

Browse files
committed
feat(cloud-shared): executors + machinery for restart/logs/snapshot jobs
Three new daemon-side handlers extend the cores-never-touched-from-Workers architecture started in #7808 (suspend/resume) to cover the rest of the lifecycle:

- executeRestart: shutdown() + provision() atomically on the daemon. Replaces the Worker-side sequence which silently no-op'd the stop step and could leave a stale container running alongside the new one.
- executeLogs: SSH `docker logs --tail N <container>` via the new SandboxProvider.fetchLogs() method. Works for stopped + crashed agents (the Worker-side fetch(bridge_url + /logs) returned empty for anything not actively running).
- executeSnapshot: thin wrapper around the existing snapshot() so the job dispatcher can route through a single contract. Invoked from the daemon so outbound traffic to cores uses the same network identity as every other lifecycle op.

Job machinery in provisioning-jobs.ts mirrors the suspend/resume patterns: data/result shapes, type guards, idempotent enqueue methods that reuse in-flight jobs on duplicate requests.

DockerSandboxProvider.fetchLogs() merges stderr into stdout because agent crash traces tend to land on stderr.
1 parent 38a71ab commit c4f2698

4 files changed

Lines changed: 769 additions & 1 deletion

File tree

packages/cloud-shared/src/lib/services/docker-sandbox-provider.ts

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,6 +1019,33 @@ export class DockerSandboxProvider implements SandboxProvider {
10191019
return output;
10201020
}
10211021

1022+
/**
 * Run `docker logs --tail N <container>` over SSH on the host that
 * `resolveContainer()` reports for the sandbox, and return the combined
 * stdout/stderr. Used by the `agent_logs` job type so the cloud-api
 * Worker doesn't have to reach the container bridge HTTP endpoint
 * (which is unreachable for stopped/crashed agents).
 *
 * @param sandboxId - Sandbox whose container metadata is resolved.
 * @param tail - Requested number of trailing log lines. Floored and
 *   clamped to the range [1, 5000]; `NaN` is treated as the upper bound.
 * @returns The text produced by the remote `docker logs` command.
 */
async fetchLogs(sandboxId: string, tail: number): Promise<string> {
  const meta = await this.resolveContainer(sandboxId);

  const maxTail = 5000;
  // `Math.min`/`Math.max` propagate NaN, so a NaN `tail` would otherwise be
  // interpolated into the command as the literal `NaN` and the cap would
  // not be enforced. Normalise it to the cap before clamping.
  const requestedTail = Number.isNaN(tail) ? maxTail : Math.floor(tail);
  const safeTail = Math.max(1, Math.min(requestedTail, maxTail));

  const ssh = DockerSSHClient.getClient(
    meta.hostname,
    meta.sshPort,
    meta.hostKeyFingerprint,
    meta.sshUser,
  );
  // `2>&1` merges stderr so the user sees boot errors when an agent is
  // crash-looping — Node agents tend to write the interesting failure
  // traces to stderr.
  return await ssh.exec(
    `docker logs --tail ${safeTail} ${shellQuote(meta.containerName)} 2>&1`,
    DOCKER_CMD_TIMEOUT_MS,
  );
}
1048+
10221049
// ------------------------------------------------------------------
10231050
// Helpers
10241051
// ------------------------------------------------------------------

packages/cloud-shared/src/lib/services/eliza-sandbox.ts

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2808,6 +2808,130 @@ export class ElizaSandboxService {
28082808
return { success: true, containerStarted: true, reprovisioned: true };
28092809
}
28102810

2811+
/**
 * Daemon-side handler for the `agent_restart` job: calls `shutdown()`
 * and then `provision()` for the same agent. Replaces the Worker-side
 * sequence which silently no-op'd the SSH stop and left the old
 * container running alongside the new one.
 *
 * An unsuccessful `shutdown()` result is logged as a warning and the
 * restart carries on — same lenience as the legacy restart route (the
 * old container may already be gone or unreachable; the goal is "end up
 * with a running fresh container", not "verify the old one stopped
 * cleanly"). The one exception is an "Agent not found" error, which
 * ends the restart immediately.
 *
 * @param agentId - Agent to restart.
 * @param orgId - Organization the agent belongs to.
 * @returns Outcome flags for the stop and start steps, plus the bridge
 *   and health URLs from `provision()` on success, or `error` on failure.
 */
async executeRestart(
  agentId: string,
  orgId: string,
): Promise<{
  success: boolean;
  containerStopped: boolean;
  containerStarted: boolean;
  bridgeUrl?: string;
  healthUrl?: string;
  error?: string;
}> {
  const stopOutcome = await this.shutdown(agentId, orgId);

  if (!stopOutcome.success) {
    // Nothing to provision against if the agent record does not exist.
    if (stopOutcome.error === "Agent not found") {
      return {
        success: false,
        containerStopped: false,
        containerStarted: false,
        error: "Agent not found",
      };
    }

    // Any other shutdown error is tolerated; record it and keep going.
    logger.warn("[agent-sandbox] Shutdown during restart returned error, continuing", {
      agentId,
      error: stopOutcome.error,
    });
  }

  const startOutcome = await this.provision(agentId, orgId);

  if (startOutcome.success) {
    return {
      success: true,
      containerStopped: stopOutcome.success,
      containerStarted: true,
      bridgeUrl: startOutcome.bridgeUrl,
      healthUrl: startOutcome.healthUrl,
    };
  }

  return {
    success: false,
    containerStopped: stopOutcome.success,
    containerStarted: false,
    error: startOutcome.error,
  };
}
2868+
2869+
/**
 * Daemon-side handler for the `agent_logs` job. Looks up the agent
 * record and asks the sandbox provider's `fetchLogs()` for the last
 * `tail` lines of the agent's container. The daemon path works for
 * stopped/crashed agents (the legacy Worker path hits the bridge HTTP
 * `/logs` endpoint which is gone when the agent isn't running).
 *
 * @param agentId - Agent whose logs are requested.
 * @param orgId - Organization the agent belongs to.
 * @param tail - Number of trailing lines, forwarded to the provider as-is.
 * @returns `logs` on success; a `message` when there is no sandbox yet
 *   or the provider has no `fetchLogs`; `error` when the agent is
 *   missing or the provider call throws.
 */
async executeLogs(
  agentId: string,
  orgId: string,
  tail: number,
): Promise<{
  success: boolean;
  status: string;
  logs?: string;
  message?: string;
  error?: string;
}> {
  const agent = await agentSandboxesRepository.findByIdAndOrg(agentId, orgId);
  if (!agent) {
    return { success: false, status: "missing", error: "Agent not found" };
  }

  const { status, sandbox_id: sandboxId } = agent;

  // No sandbox id means there is no container to read logs from.
  if (!sandboxId) {
    const message = `Agent is ${status} — no container assigned yet.`;
    return { success: true, status, message };
  }

  const provider = await this.getProvider();

  // `fetchLogs` is feature-detected; report its absence instead of failing.
  if (typeof provider.fetchLogs !== "function") {
    const message = "Logs unavailable: sandbox provider does not implement fetchLogs.";
    return { success: true, status, message };
  }

  try {
    const logs = await provider.fetchLogs(sandboxId, tail);
    return { success: true, status, logs };
  } catch (e) {
    // Report provider failures in the result object rather than rethrowing.
    const error = e instanceof Error ? e.message : String(e);
    return { success: false, status, error };
  }
}
2919+
2920+
/**
 * Daemon-side handler for the `agent_snapshot` job. Delegates straight
 * to `snapshot()` with the same arguments; it exists so the operation
 * is invoked from the daemon, where outbound traffic to the agent
 * bridge uses the same network identity as every other cores-bound
 * call.
 *
 * @param agentId - Agent to snapshot.
 * @param orgId - Organization the agent belongs to.
 * @param snapshotType - Snapshot kind; defaults to `"manual"`.
 * @returns Whatever `snapshot()` resolves to.
 */
async executeSnapshot(
  agentId: string,
  orgId: string,
  snapshotType: "manual" | "auto" = "manual",
): Promise<SnapshotResult> {
  const snapshotResult = await this.snapshot(agentId, orgId, snapshotType);
  return snapshotResult;
}
2934+
28112935
// Private helpers
28122936

28132937
private async lockLifecycle(tx: LifecycleTx, agentId: string, orgId: string): Promise<void> {

0 commit comments

Comments
 (0)