Skip to content

Commit d092037

Browse files
committed
fix: Resolve critical audit findings — crash recovery, race conditions, error codes
Four fixes from end-to-end production audit: 1. Crash recovery (C2): loadPlansFromDisk() now scans for plans stuck in EXECUTING state and resets them to FAILED with a recovery message. Prevents permanently wedged plans after process crashes. 2. Concurrent alert dedup race (H2): Alert ingestion now wraps the check-then-create logic inside withCaseLock() to eliminate the TOCTOU race. Concurrent duplicate alerts merge gracefully instead of 500. 3. HTTP error codes (M4): parseJsonBody() errors now carry httpStatus (400 for invalid JSON, 413 for oversized body, 408 for timeout). Catch-all handler respects these instead of returning generic 500. 4. Policy Guard thresholds (M5): Aligned confidence thresholds in policy-guard AGENTS.md and TOOLS.md with policy.yaml per-action values (low=0.7, medium=0.7, high=0.8, critical=0.95). Also added GET /api/plans/:id tests confirming the endpoint exists and works correctly (C3 was already implemented).
1 parent 3624831 commit d092037

File tree

6 files changed

+356
-41
lines changed

6 files changed

+356
-41
lines changed

openclaw/agents/policy-guard/AGENTS.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,9 @@ Use the following confidence thresholds for your policy evaluation. The runtime
116116

117117
| Risk Level | Default Minimum Confidence |
118118
|------------|---------------------------|
119-
| Low | 0.5 |
119+
| Low | 0.7 |
120120
| Medium | 0.7 |
121-
| High | 0.85 |
121+
| High | 0.8 |
122122
| Critical | 0.95 |
123123

124124
## Deny Reason Codes

openclaw/agents/policy-guard/TOOLS.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,9 @@ Use the following confidence thresholds for your evaluation. The runtime service
6969

7070
| Risk Level | Default Minimum Confidence |
7171
|------------|---------------------------|
72-
| Low | 0.5 |
72+
| Low | 0.7 |
7373
| Medium | 0.7 |
74-
| High | 0.85 |
74+
| High | 0.8 |
7575
| Critical | 0.95 |
7676

7777
If below threshold, deny with `LOW_CONFIDENCE`.

runtime/autopilot-service/agent-actions.test.js

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,6 +708,50 @@ describe("GET /api/agent-action/get-agent", () => {
708708
});
709709
});
710710

711+
// ===================================================================
712+
// GET /api/plans/:id — retrieve a single plan by ID
713+
// ===================================================================
714+
715+
describe("GET /api/plans/:id", () => {
716+
let server;
717+
let testPlanId;
718+
719+
before(async () => {
720+
ensureTestDirs();
721+
server = createServer();
722+
await new Promise((resolve) => server.listen(0, "127.0.0.1", resolve));
723+
const alert = await ingestAlert(server, `plan-get-${Date.now()}`);
724+
const actions = JSON.stringify([{ type: "block_ip", target: "9.8.7.6" }]);
725+
const planRes = await request(server, "GET",
726+
`/api/agent-action/create-plan?case_id=${alert.case_id}&title=Get%20Plan%20Test&risk_level=low&actions=${encodeURIComponent(actions)}`);
727+
testPlanId = planRes.body.plan_id;
728+
});
729+
730+
after(() => {
731+
return new Promise((resolve) => server.close(() => { rmTestDir(); resolve(); }));
732+
});
733+
734+
it("returns a plan by valid plan_id", async () => {
735+
const res = await request(server, "GET", `/api/plans/${testPlanId}`);
736+
assert.equal(res.status, 200);
737+
assert.equal(res.body.plan_id, testPlanId);
738+
assert.ok(res.body.actions);
739+
assert.ok(res.body.state);
740+
});
741+
742+
it("returns 404 for non-existent plan_id", async () => {
743+
const res = await request(server, "GET", "/api/plans/PLAN-9999999999999-deadbeef");
744+
assert.equal(res.status, 404);
745+
assert.ok(res.body.error);
746+
});
747+
748+
it("returns 400 for invalid plan_id format", async () => {
749+
const res = await request(server, "GET", "/api/plans/not-a-valid-id");
750+
assert.equal(res.status, 400);
751+
assert.ok(res.body.error);
752+
});
753+
});
754+
711755
// ===================================================================
712756
// Audit fix C2: Action type allowlist enforcement
713757
// ===================================================================

runtime/autopilot-service/index.js

Lines changed: 171 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1478,6 +1478,31 @@ async function loadPlansFromDisk() {
14781478
if (loaded > 0) {
14791479
log("info", "plans", "Loaded plans from disk", { count: loaded });
14801480
}
1481+
1482+
// Crash recovery: reset any plans stuck in EXECUTING state
1483+
// If the process crashed mid-executePlan(), the plan is persisted as EXECUTING
1484+
// but cannot be re-executed (requires APPROVED) or re-approved (requires PROPOSED).
1485+
// Reset them to FAILED so operators can investigate and re-create if needed.
1486+
let recovered = 0;
1487+
for (const [planId, plan] of responsePlans.entries()) {
1488+
if (plan.state === "executing") {
1489+
plan.state = "failed";
1490+
plan.execution_result = {
1491+
success: false,
1492+
reason: "Process crashed during execution — plan state recovered on restart",
1493+
};
1494+
plan.updated_at = new Date().toISOString();
1495+
log("warn", "plans", "Recovered stuck EXECUTING plan on startup", {
1496+
plan_id: planId,
1497+
case_id: plan.case_id,
1498+
});
1499+
await savePlanToDisk(plan);
1500+
recovered++;
1501+
}
1502+
}
1503+
if (recovered > 0) {
1504+
log("warn", "plans", "Crash recovery: reset EXECUTING plans to FAILED", { count: recovered });
1505+
}
14811506
} catch {
14821507
// Plans directory doesn't exist yet — will be created on first plan
14831508
}
@@ -3390,7 +3415,9 @@ function parseJsonBody(req) {
33903415
// Pre-check Content-Length header to reject oversized requests early
33913416
const contentLength = parseInt(req.headers?.["content-length"], 10);
33923417
if (contentLength > 0 && contentLength > MAX_BODY_SIZE) {
3393-
reject(new Error("Request body too large"));
3418+
const err = new Error("Request body too large");
3419+
err.httpStatus = 413;
3420+
reject(err);
33943421
return;
33953422
}
33963423

@@ -3403,7 +3430,9 @@ function parseJsonBody(req) {
34033430
if (!rejected) {
34043431
rejected = true;
34053432
req.destroy();
3406-
reject(new Error("Request body timeout"));
3433+
const err = new Error("Request body timeout");
3434+
err.httpStatus = 408;
3435+
reject(err);
34073436
}
34083437
}, 30000);
34093438

@@ -3416,7 +3445,9 @@ function parseJsonBody(req) {
34163445
rejected = true;
34173446
clearTimeout(bodyTimeout);
34183447
req.destroy(); // Stop receiving data
3419-
reject(new Error("Request body too large"));
3448+
const err = new Error("Request body too large");
3449+
err.httpStatus = 413;
3450+
reject(err);
34203451
return;
34213452
}
34223453
chunks.push(chunk);
@@ -3429,7 +3460,9 @@ function parseJsonBody(req) {
34293460
const body = Buffer.concat(chunks).toString("utf8");
34303461
resolve(body ? JSON.parse(body) : {});
34313462
} catch (err) {
3432-
reject(new Error("Invalid JSON"));
3463+
const parseErr = new Error("Invalid JSON");
3464+
parseErr.httpStatus = 400;
3465+
reject(parseErr);
34333466
}
34343467
});
34353468

@@ -3937,47 +3970,146 @@ function createServer() {
39373970
}],
39383971
};
39393972

3940-
// Check if case already exists (idempotency via hash)
3941-
let existingCase = null;
3942-
try {
3943-
existingCase = await getCase(caseId);
3944-
} catch (e) {
3945-
// Case doesn't exist, which is expected
3946-
}
3947-
3948-
// Entity-based alert grouping: if no exact hash match, check for related cases
3973+
// Entity-based alert grouping: check for related cases (in-memory, no lock needed)
39493974
let groupedCaseId = null;
3950-
if (!existingCase && entities.length > 0) {
3975+
if (entities.length > 0) {
39513976
groupedCaseId = findRelatedCase(entities);
3952-
if (groupedCaseId) {
3953-
try {
3954-
existingCase = await getCase(groupedCaseId);
3955-
} catch (e) {
3956-
groupedCaseId = null; // Related case was deleted
3957-
}
3958-
}
39593977
}
39603978

39613979
const effectiveCaseId = groupedCaseId || caseId;
39623980

3981+
// Use withCaseLock to atomically check-then-create/update, preventing
3982+
// race conditions when concurrent requests arrive for the same alert_id.
3983+
// We inline the file operations instead of calling createCase/updateCase
3984+
// because those functions acquire the same lock (non-reentrant) and would deadlock.
3985+
let existingCase = false;
3986+
await withCaseLock(effectiveCaseId, async () => {
3987+
let caseExists = false;
3988+
try {
3989+
await getCase(effectiveCaseId);
3990+
caseExists = true;
3991+
} catch (e) {
3992+
// Case doesn't exist, which is expected
3993+
}
3994+
3995+
if (caseExists) {
3996+
existingCase = true;
3997+
// Update existing case with new evidence directly (lock already held)
3998+
const caseDir = path.join(config.dataDir, "cases", effectiveCaseId);
3999+
const packPath = path.join(caseDir, "evidence-pack.json");
4000+
const content = await fs.readFile(packPath, "utf8");
4001+
const evidencePack = JSON.parse(content);
4002+
const now = new Date().toISOString();
4003+
evidencePack.updated_at = now;
4004+
4005+
// Merge entities (deduplicate by type+value)
4006+
if (caseData.entities) {
4007+
const existingKeys = new Set(
4008+
(evidencePack.entities || []).map((e) => `${e.type}:${e.value}`)
4009+
);
4010+
for (const entity of caseData.entities) {
4011+
if (!existingKeys.has(`${entity.type}:${entity.value}`)) {
4012+
evidencePack.entities.push(entity);
4013+
existingKeys.add(`${entity.type}:${entity.value}`);
4014+
}
4015+
}
4016+
}
4017+
4018+
// Append timeline entries
4019+
if (caseData.timeline) {
4020+
evidencePack.timeline = (evidencePack.timeline || []).concat(caseData.timeline);
4021+
}
4022+
4023+
// Append evidence refs (deduplicate by ref_id)
4024+
if (caseData.evidence_refs) {
4025+
const existingRefs = new Set(
4026+
(evidencePack.evidence_refs || []).map((r) => r.ref_id)
4027+
);
4028+
for (const ref of caseData.evidence_refs) {
4029+
if (!existingRefs.has(ref.ref_id)) {
4030+
evidencePack.evidence_refs.push(ref);
4031+
existingRefs.add(ref.ref_id);
4032+
}
4033+
}
4034+
}
4035+
4036+
await atomicWriteFile(packPath, JSON.stringify(evidencePack, null, 2));
4037+
} else {
4038+
// Create new case directly (lock already held)
4039+
const caseDir = path.join(config.dataDir, "cases", effectiveCaseId);
4040+
const packPath = path.join(caseDir, "evidence-pack.json");
4041+
4042+
await ensureDir(caseDir);
4043+
4044+
const now = new Date().toISOString();
4045+
4046+
const evidencePack = {
4047+
schema_version: EVIDENCE_PACK_SCHEMA_VERSION,
4048+
case_id: effectiveCaseId,
4049+
created_at: now,
4050+
updated_at: now,
4051+
title: caseData.title || "",
4052+
summary: caseData.summary || "",
4053+
severity: caseData.severity || "medium",
4054+
confidence: caseData.confidence || 0,
4055+
entities: caseData.entities || [],
4056+
timeline: caseData.timeline || [],
4057+
mitre: caseData.mitre || [],
4058+
mcp_calls: [],
4059+
evidence_refs: caseData.evidence_refs || [],
4060+
plans: [],
4061+
approvals: [],
4062+
actions: [],
4063+
status: "open",
4064+
feedback: [],
4065+
};
4066+
4067+
await atomicWriteFile(packPath, JSON.stringify(evidencePack, null, 2));
4068+
4069+
const caseSummary = {
4070+
case_id: effectiveCaseId,
4071+
created_at: now,
4072+
updated_at: now,
4073+
title: caseData.title || "",
4074+
severity: caseData.severity || "medium",
4075+
status: "open",
4076+
};
4077+
4078+
await atomicWriteFile(
4079+
path.join(caseDir, "case.json"),
4080+
JSON.stringify(caseSummary, null, 2),
4081+
);
4082+
4083+
incrementMetric("cases_created_total");
4084+
log("info", "evidence-pack", "Case created", { case_id: effectiveCaseId });
4085+
4086+
// Post to Slack alerts channel (async, don't await)
4087+
if (slack && slack.isInitialized()) {
4088+
slack.postCaseAlert({
4089+
case_id: effectiveCaseId,
4090+
title: caseData.title,
4091+
summary: caseData.summary,
4092+
severity: caseData.severity,
4093+
entities: caseData.entities || [],
4094+
created_at: now,
4095+
}).catch((err) => {
4096+
log("warn", "evidence-pack", "Failed to post case to Slack", { error: err.message, case_id: effectiveCaseId });
4097+
});
4098+
}
4099+
}
4100+
});
4101+
4102+
// Entity indexing and webhook dispatch stay OUTSIDE the lock
4103+
indexCaseEntities(effectiveCaseId, entities, severity);
4104+
39634105
if (existingCase) {
3964-
// Update existing case with new evidence
3965-
await updateCase(effectiveCaseId, {
3966-
entities: caseData.entities,
3967-
timeline: caseData.timeline,
3968-
evidence_refs: caseData.evidence_refs,
3969-
});
3970-
indexCaseEntities(effectiveCaseId, entities, severity);
39714106
log("info", "triage", "Updated existing case with new alert", {
39724107
case_id: effectiveCaseId,
39734108
alert_id: alertId,
39744109
...(groupedCaseId && { grouped_from: caseId }),
39754110
});
39764111
} else {
3977-
// Create new case
3978-
await createCase(caseId, caseData);
3979-
indexCaseEntities(caseId, entities, severity);
3980-
log("info", "triage", "Created new case from alert", { case_id: caseId, alert_id: alertId, severity });
4112+
log("info", "triage", "Created new case from alert", { case_id: effectiveCaseId, alert_id: alertId, severity });
39814113

39824114
// Dispatch to triage agent via OpenClaw gateway
39834115
// NOTE: Callback URLs are NOT included in the webhook message because
@@ -3987,14 +4119,14 @@ function createServer() {
39874119
// callback URL templates. The agent reads case_id from this data and
39884120
// substitutes it into the URL pattern from its system prompt.
39894121
dispatchToGateway("/webhook/wazuh-alert", {
3990-
message: `New triage task. Case ID: ${caseId}. Severity: ${severity}. Title: ${caseData.title}. Entities: ${entities.length} extracted. Follow your AGENTS.md instructions to triage this alert and advance the pipeline.`,
3991-
case_id: caseId,
4122+
message: `New triage task. Case ID: ${effectiveCaseId}. Severity: ${severity}. Title: ${caseData.title}. Entities: ${entities.length} extracted. Follow your AGENTS.md instructions to triage this alert and advance the pipeline.`,
4123+
case_id: effectiveCaseId,
39924124
severity,
39934125
title: caseData.title,
39944126
entities_count: entities.length,
39954127
trigger: "alert_ingestion",
39964128
}).catch((err) => {
3997-
log("warn", "dispatch", "Failed to dispatch alert ingestion webhook", { case_id: caseId, error: err.message });
4129+
log("warn", "dispatch", "Failed to dispatch alert ingestion webhook", { case_id: effectiveCaseId, error: err.message });
39984130
incrementMetric("webhook_dispatch_failures_total");
39994131
});
40004132
}
@@ -5000,8 +5132,10 @@ function createServer() {
50005132
// 404 for unknown routes
50015133
sendJsonError(res, 404, "Not found", requestId);
50025134
} catch (err) {
5003-
log("error", "http", "Request error", { error: err.message });
5004-
sendJsonError(res, 500, "Internal server error", requestId);
5135+
const status = err.httpStatus || 500;
5136+
const message = err.httpStatus ? err.message : "Internal server error";
5137+
log("error", "http", "Request error", { error: err.message, status });
5138+
sendJsonError(res, status, message, requestId);
50055139
}
50065140
});
50075141

0 commit comments

Comments
 (0)