Skip to content

Commit ffa3bd6

Browse files
committed
feat(observability): add Prometheus metrics endpoint
Signed-off-by: Ho Lim <subhoya@gmail.com>
1 parent 0d061d4 commit ffa3bd6

14 files changed

Lines changed: 1162 additions & 24 deletions

File tree

.agents/skills/nemoclaw-user-monitor-sandbox/SKILL.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,38 @@ openshell sandbox download <name> /sandbox/.openclaw/agents/main/sessions/<sessi
7979
Treat exported session logs as sensitive data.
8080
They can contain prompts, tool inputs, tool outputs, file paths, and cost metadata from the agent run.
8181

82+
## Export Prometheus Metrics
83+
84+
NemoClaw can expose lightweight Prometheus-format metrics for blueprint execution, API validation, and sandbox lifecycle operations.
85+
Metrics are disabled by default.
86+
87+
Set the following environment variable before starting the OpenClaw process that loads the NemoClaw plugin:
88+
89+
```bash
90+
export NEMOCLAW_METRICS_ENABLED=true
91+
```
92+
93+
The metrics endpoint listens on `127.0.0.1:9090` by default:
94+
95+
```bash
96+
curl http://127.0.0.1:9090/metrics
97+
```
98+
99+
Warning: the `/metrics` endpoint is unauthenticated.
100+
If `NEMOCLAW_METRICS_HOST` binds beyond loopback, any host or network that can reach `NEMOCLAW_METRICS_HOST:NEMOCLAW_METRICS_PORT/metrics` may scrape operational metadata about blueprint execution, API validation, and sandbox lifecycle activity.
101+
Prefer scraping over a secured network, restrict access with firewall rules, or keep `NEMOCLAW_METRICS_HOST` bound to loopback and expose `/metrics` through a secured proxy.
102+
103+
Use `NEMOCLAW_METRICS_PORT` to select another port, or `NEMOCLAW_METRICS_HOST` to bind to a different interface when your deployment needs remote scraping.
104+
The endpoint serves only `/metrics`; other paths return `404`.
105+
106+
Example metric families include:
107+
108+
```text
109+
blueprint_execution_total{action="apply",status="success"} 1
110+
api_validation_total{kind="endpoint_url",source="blueprint",status="success"} 1
111+
sandbox_lifecycle_total{operation="create",status="success"} 1
112+
```
113+
82114
## Monitor Network Activity in the TUI
83115

84116
Open the OpenShell terminal UI for a live view of sandbox network activity and egress requests:
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
5+
import type fs from "node:fs";
6+
7+
/** A single entry in the in-memory filesystem (`store`) that stands in for `node:fs` in this test file. */
interface FsEntry {
8+
// "dir" entries are created by the mkdirSync mock, "file" entries by the writeFileSync mock.
type: "file" | "dir";
9+
// File text; never set for directories, and read back as "" by the readFileSync mock when absent.
content?: string;
10+
}
11+
12+
// Backing map for the mocked node:fs functions in this file, keyed by the exact path string passed in.
const store = new Map<string, FsEntry>();
13+
// Stand-in for execa; each test sets its resolved value before calling the runner.
const mockExeca = vi.fn();
14+
15+
// Pin the home directory to a fixed fake path.
// NOTE(review): presumably so paths derived from homedir() never touch the real user's home — confirm against runner.ts.
vi.mock("node:os", () => ({
16+
homedir: () => "/fakehome",
17+
}));
18+
19+
// Pin randomUUID to a constant value.
// NOTE(review): presumably so generated run ids are deterministic — confirm against runner.ts.
vi.mock("node:crypto", () => ({
20+
randomUUID: () => "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee",
21+
}));
22+
23+
// Replace node:fs with a partial mock: the five functions below operate on the
// in-memory `store` map; every other export is passed through from the real module.
vi.mock("node:fs", async (importOriginal) => {
24+
const original = await importOriginal<typeof fs>();
25+
return {
26+
...original,
27+
// A path exists only if it was stored under that exact key (files and dirs alike).
existsSync: (p: string) => store.has(p),
28+
// Records only the given path as a directory; parent paths are not created.
mkdirSync: vi.fn((p: string) => {
29+
store.set(p, { type: "dir" });
30+
}),
31+
// Returns the stored text, or throws an ENOENT-style error for missing paths and for directories.
readFileSync: (p: string) => {
32+
const entry = store.get(p);
33+
if (entry?.type !== "file") {
34+
throw new Error(`ENOENT: ${p}`);
35+
}
36+
return entry.content ?? "";
37+
},
38+
// Stores (or overwrites) a file entry; wrapped in vi.fn so calls are recorded.
writeFileSync: vi.fn((p: string, data: string) => {
39+
store.set(p, { type: "file", content: data });
40+
}),
41+
// Lists the immediate children of `p`, derived from the stored keys that start with `p/`.
readdirSync: (p: string) => {
42+
const prefix = p.endsWith("/") ? p : `${p}/`;
43+
const entries = new Set<string>();
44+
for (const key of store.keys()) {
45+
if (key.startsWith(prefix)) {
46+
// Keep only the first path segment below `p`, so nested keys collapse to their top-level child.
const [first] = key.slice(prefix.length).split("/");
47+
if (first) {
48+
entries.add(first);
49+
}
50+
}
51+
}
52+
// No children and no entry for `p` itself: treat the directory as missing.
if (entries.size === 0 && !store.has(p)) {
53+
throw new Error(`ENOENT: ${p}`);
54+
}
55+
return [...entries].sort();
56+
},
57+
};
58+
});
59+
60+
// Route every execa() call to mockExeca so each test decides what subprocess calls resolve to.
vi.mock("execa", () => ({
61+
execa: (...args: unknown[]) => mockExeca(...args),
62+
}));
63+
64+
// Default SSRF validator: accepts any URL and returns it unchanged as both `url` and `pinnedUrl`.
vi.mock("./ssrf.js", () => ({
65+
validateEndpointUrl: vi.fn(async (url: string) => ({ url, pinnedUrl: url })),
66+
}));
67+
68+
// Modules under test are loaded with dynamic import, after the vi.mock registrations above.
const { validateEndpointUrl } = await import("./ssrf.js");
69+
// Typed handle on the mocked validator so individual tests can make it reject.
const mockedValidateEndpoint = vi.mocked(validateEndpointUrl);
70+
const { metrics } = await import("../observability/metrics.js");
71+
const { actionApply, actionPlan } = await import("./runner.js");
72+
73+
// Collects everything written to process.stdout while captureStdout() is active.
const stdoutChunks: string[] = [];
74+
75+
/**
 * Replaces `process.stdout.write` with a spy that appends each chunk (as a string)
 * to `stdoutChunks` and returns `true` instead of printing.
 * The spy stays installed until mocks are restored (done in this file's `afterEach`).
 */
function captureStdout(): void {
76+
vi.spyOn(process.stdout, "write").mockImplementation((chunk: string | Uint8Array) => {
77+
stdoutChunks.push(String(chunk));
78+
// Report the write as accepted, as a successful write would.
return true;
79+
});
80+
}
81+
82+
/**
 * Builds a fresh blueprint object for the runner tests: one inference profile named
 * `default`, a sandbox section, and an empty policy additions map.
 *
 * @returns A new object on every call, so tests never share blueprint state.
 */
function minimalBlueprint(): Record<string, unknown> {
83+
return {
84+
version: "1.0",
85+
components: {
86+
inference: {
87+
profiles: {
88+
// The tests pass "default" as the profile name to actionPlan/actionApply.
default: {
89+
provider_type: "openai",
90+
provider_name: "my-provider",
91+
// This URL is what reaches the mocked validateEndpointUrl when no override is given.
endpoint: "https://api.example.com/v1",
92+
model: "gpt-4",
93+
credential_env: "MY_API_KEY",
94+
},
95+
},
96+
},
97+
sandbox: {
98+
image: "openclaw",
99+
name: "test-sandbox",
100+
forward_ports: [18789],
101+
},
102+
policy: { additions: {} },
103+
},
104+
};
105+
}
106+
107+
// Checks that actionPlan and actionApply record the expected series in the shared
// metrics registry, by asserting on the rendered Prometheus text output.
describe("runner metrics", () => {
108+
beforeEach(() => {
109+
// Start from an empty in-memory filesystem and no captured stdout.
store.clear();
110+
stdoutChunks.length = 0;
111+
// Clear recorded calls on all mocks.
vi.clearAllMocks();
112+
// Metrics are documented as disabled by default; opt in for the duration of each test.
process.env.NEMOCLAW_METRICS_ENABLED = "true";
113+
// Drop anything recorded by a previous test so the exact "... 1" counts below hold.
metrics.reset();
114+
});
115+
116+
afterEach(() => {
117+
// Restores spied methods, including process.stdout.write replaced by captureStdout().
vi.restoreAllMocks();
118+
metrics.reset();
119+
// Removed unconditionally; any value set before the test run is not preserved.
delete process.env.NEMOCLAW_METRICS_ENABLED;
120+
});
121+
122+
it("records blueprint and endpoint validation metrics for successful plans", async () => {
123+
captureStdout();
124+
mockExeca.mockResolvedValue({ exitCode: 0 });
125+
126+
await actionPlan("default", minimalBlueprint());
127+
128+
const output = metrics.renderPrometheus();
129+
// The blueprint series carry only `action` and `status` labels.
expect(output).toContain('blueprint_execution_total{action="plan",status="success"} 1');
130+
expect(output).toContain(
131+
'blueprint_execution_duration_seconds_count{action="plan",status="success"} 1',
132+
);
133+
// No endpoint override was passed, so only the blueprint-sourced validation is expected here.
expect(output).toContain(
134+
'api_validation_total{kind="endpoint_url",source="blueprint",status="success"} 1',
135+
);
136+
});
137+
138+
it("records blueprint and endpoint validation metrics for failed plans", async () => {
139+
captureStdout();
140+
mockExeca.mockResolvedValue({ exitCode: 0 });
141+
// Make the next endpoint validation reject, simulating an SSRF block.
mockedValidateEndpoint.mockRejectedValueOnce(new Error("SSRF blocked"));
142+
143+
// The validation error must still propagate to the caller of actionPlan.
await expect(actionPlan("default", minimalBlueprint())).rejects.toThrow("SSRF blocked");
144+
145+
const output = metrics.renderPrometheus();
146+
// Both the outer plan operation and the inner validation are counted as errors.
expect(output).toContain('blueprint_execution_total{action="plan",status="error"} 1');
147+
expect(output).toContain(
148+
'api_validation_total{kind="endpoint_url",source="blueprint",status="error"} 1',
149+
);
150+
});
151+
152+
it("records sandbox lifecycle metrics during apply", async () => {
153+
captureStdout();
154+
// Every call routed through the execa mock resolves with exit code 0 and empty output.
mockExeca.mockResolvedValue({ exitCode: 0, stdout: "", stderr: "" });
155+
156+
await actionApply("default", minimalBlueprint());
157+
158+
const output = metrics.renderPrometheus();
159+
expect(output).toContain('blueprint_execution_total{action="apply",status="success"} 1');
160+
expect(output).toContain('sandbox_lifecycle_total{operation="create",status="success"} 1');
161+
expect(output).toContain(
162+
'sandbox_lifecycle_duration_seconds_count{operation="create",status="success"} 1',
163+
);
164+
});
165+
});

nemoclaw/src/blueprint/runner.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -620,6 +620,7 @@ describe("runner", () => {
620620
expect(plan.router.enabled).toBe(false);
621621
expect(plan.router.port).toBe(4000);
622622
});
623+
623624
});
624625

625626
describe("actionApply", () => {

nemoclaw/src/blueprint/runner.ts

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import { join, sep } from "node:path";
2020
import { execa } from "execa";
2121
import YAML from "yaml";
2222

23+
import { metrics } from "../observability/metrics.js";
2324
import { validateEndpointUrl } from "./ssrf.js";
2425
import { buildSubprocessEnv } from "../lib/subprocess-env.js";
2526
import { DASHBOARD_PORT } from "../lib/ports.js";
@@ -409,6 +410,15 @@ async function openshellAvailable(): Promise<boolean> {
409410
return result.exitCode === 0;
410411
}
411412

413+
/**
 * Runs `validateEndpointUrl` on `endpointUrl` inside `metrics.observeOperation`,
 * recorded under the "api_validation" operation with labels
 * `kind="endpoint_url"` and the given `source`.
 *
 * @param endpointUrl - URL to validate.
 * @param source - Where the URL came from; becomes the `source` metric label.
 * @returns The result of `validateEndpointUrl`, passed through by `observeOperation`.
 * NOTE(review): the runner metrics tests expect a rejection from the validator to
 * propagate to the caller and to be counted with status="error" — confirm in metrics.ts.
 */
async function validateEndpointForMetrics(
414+
endpointUrl: string,
415+
source: "override" | "blueprint",
416+
): ReturnType<typeof validateEndpointUrl> {
417+
return await metrics.observeOperation("api_validation", { kind: "endpoint_url", source }, () =>
418+
validateEndpointUrl(endpointUrl),
419+
);
420+
}
421+
412422
/**
413423
* Resolve inference config and sandbox config from a blueprint, applying
414424
* endpoint URL override and SSRF validation if provided.
@@ -431,7 +441,7 @@ async function resolveRunConfig(
431441

432442
let inferenceCfg = { ...inferenceProfiles[profile] };
433443
if (endpointUrl) {
434-
const validated = await validateEndpointUrl(endpointUrl);
444+
const validated = await validateEndpointForMetrics(endpointUrl, "override");
435445
// Use DNS-pinned URL for HTTP (full SSRF/rebinding protection). For HTTPS,
436446
// keep the original hostname — TLS certificate validation prevents rebinding
437447
// since the attacker cannot present a valid cert for the target.
@@ -441,7 +451,7 @@ async function resolveRunConfig(
441451

442452
// Validate the final endpoint (whether from CLI override or blueprint profile)
443453
if (inferenceCfg.endpoint) {
444-
const validated = await validateEndpointUrl(inferenceCfg.endpoint);
454+
const validated = await validateEndpointForMetrics(inferenceCfg.endpoint, "blueprint");
445455
const safe = inferenceCfg.endpoint.startsWith("https:") ? validated.url : validated.pinnedUrl;
446456
inferenceCfg = { ...inferenceCfg, endpoint: safe };
447457
}
@@ -655,6 +665,16 @@ export async function actionPlan(
655665
profile: string,
656666
blueprint: Blueprint,
657667
options?: { dryRun?: boolean; endpointUrl?: string },
668+
): Promise<RunPlan> {
669+
return await metrics.observeOperation("blueprint_execution", { action: "plan" }, () =>
670+
actionPlanImpl(profile, blueprint, options),
671+
);
672+
}
673+
674+
async function actionPlanImpl(
675+
profile: string,
676+
blueprint: Blueprint,
677+
options?: { dryRun?: boolean; endpointUrl?: string },
658678
): Promise<RunPlan> {
659679
const rid = emitRunId();
660680
progress(10, "Validating blueprint");
@@ -691,6 +711,16 @@ export async function actionApply(
691711
profile: string,
692712
blueprint: Blueprint,
693713
options?: { planPath?: string; endpointUrl?: string },
714+
): Promise<void> {
715+
await metrics.observeOperation("blueprint_execution", { action: "apply" }, () =>
716+
actionApplyImpl(profile, blueprint, options),
717+
);
718+
}
719+
720+
async function actionApplyImpl(
721+
profile: string,
722+
blueprint: Blueprint,
723+
options?: { planPath?: string; endpointUrl?: string },
694724
): Promise<void> {
695725
if (options?.planPath) {
696726
throw new Error(
@@ -727,14 +757,16 @@ export async function actionApply(
727757
createArgs.push("--forward", String(port));
728758
}
729759

730-
const createResult = await runCmd(createArgs, { reject: false });
731-
if (createResult.exitCode !== 0) {
732-
if (createResult.stderr.includes("already exists")) {
733-
log(`Sandbox '${sandboxName}' already exists, reusing.`);
734-
} else {
735-
throw new Error(`Failed to create sandbox: ${createResult.stderr}`);
760+
await metrics.observeOperation("sandbox_lifecycle", { operation: "create" }, async () => {
761+
const createResult = await runCmd(createArgs, { reject: false });
762+
if (createResult.exitCode !== 0) {
763+
if (createResult.stderr.includes("already exists")) {
764+
log(`Sandbox '${sandboxName}' already exists, reusing.`);
765+
} else {
766+
throw new Error(`Failed to create sandbox: ${createResult.stderr}`);
767+
}
736768
}
737-
}
769+
});
738770

739771
progress(50, "Configuring inference provider");
740772
const providerName = inferenceCfg.provider_name ?? "default";

nemoclaw/src/index.ts

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ import {
1919
describeOnboardProvider,
2020
loadOnboardConfig,
2121
} from "./onboard/config.js";
22+
import { isMetricsEnabled, metrics } from "./observability/metrics.js";
23+
import { startMetricsServer, type MetricsServer } from "./observability/server.js";
2224
import { registerRuntimeContext } from "./runtime-context.js";
2325
import { scanForSecrets, isMemoryPath } from "./security/secret-scanner.js";
2426
import { safeResolvePath } from "./security/safe-resolve-path.js";
@@ -352,7 +354,43 @@ export default function register(api: OpenClawPluginApi): void {
352354
handler: (ctx) => handleSlashCommand(ctx, api),
353355
});
354356

355-
// 2. Register nvidia-nim provider from the active OpenClaw config, falling
357+
// 2. Register optional Prometheus-compatible metrics endpoint (#233)
358+
if (isMetricsEnabled()) {
359+
let metricsServer: MetricsServer | undefined;
360+
api.registerService({
361+
id: "nemoclaw-metrics",
362+
start: async ({ logger }) => {
363+
try {
364+
metricsServer = await startMetricsServer({ registry: metrics, logger });
365+
} catch (error) {
366+
logger.warn(
367+
`[OBSERVABILITY] Could not start NemoClaw metrics endpoint: ${
368+
error instanceof Error ? error.message : String(error)
369+
}`,
370+
);
371+
}
372+
},
373+
stop: async ({ logger }) => {
374+
if (!metricsServer) {
375+
return;
376+
}
377+
try {
378+
await metricsServer.close();
379+
logger.info("NemoClaw metrics endpoint stopped");
380+
} catch (error) {
381+
logger.warn(
382+
`[OBSERVABILITY] Could not stop NemoClaw metrics endpoint cleanly: ${
383+
error instanceof Error ? error.message : String(error)
384+
}`,
385+
);
386+
} finally {
387+
metricsServer = undefined;
388+
}
389+
},
390+
});
391+
}
392+
393+
// 3. Register nvidia-nim provider from the active OpenClaw config, falling
356394
// back to the onboard snapshot and then the NemoClaw default.
357395
const onboardCfg = loadOnboardConfig();
358396
const activeModel = readOpenClawPrimaryModel(api.logger) || onboardCfg?.model || "";

0 commit comments

Comments
 (0)