Skip to content

Commit c3a7144

Browse files
Itay Inbar and claude committed
v1.8.0: auto-detect llama.cpp live context window at startup
llama-cpp-provider now probes the server's /props endpoint for its real n_ctx and registers the model with that window instead of models.json's declared default — so a `-c 131072` server shows 128k with no config edit, and (via v1.7.0) the TUI readout, read-guard, and budgets all follow it. /props is derived from baseUrl by stripping /v1; n_ctx read from default_generation_settings.n_ctx. Best-effort: llamacpp-only, 1.5s timeout, any failure falls back to the declared window — startup never blocks/breaks. Env: LITTLE_CODER_NO_CTX_PROBE, LITTLE_CODER_LLAMACPP_PROPS_URL, LITTLE_CODER_CTX_PROBE_TIMEOUT_MS. New tested propsUrlFor/contextWindowFromProps/probeContextWindow; validated end-to-end against a live -c 131072 server (→ 131072). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 2718503 commit c3a7144

5 files changed

Lines changed: 162 additions & 5 deletions

File tree

.pi/extensions/llama-cpp-provider/config.test.ts

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,16 @@ import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs";
33
import { tmpdir } from "node:os";
44
import { dirname, join, resolve } from "node:path";
55
import { fileURLToPath } from "node:url";
6-
import { applyEnvOverrides, loadProviders, mergeProviders, resolveOverridePath, type ProviderEntry } from "./config.ts";
6+
import {
7+
applyEnvOverrides,
8+
loadProviders,
9+
mergeProviders,
10+
resolveOverridePath,
11+
propsUrlFor,
12+
contextWindowFromProps,
13+
probeContextWindow,
14+
type ProviderEntry,
15+
} from "./config.ts";
716

817
const sampleProvider = (baseUrl: string, modelId: string): ProviderEntry => ({
918
api: "openai-completions",
@@ -185,3 +194,65 @@ describe("shipped models.json", () => {
185194
expect(Object.keys(result.providers).sort()).toEqual(["llamacpp", "lmstudio", "ollama"]);
186195
});
187196
});
197+
198+
describe("propsUrlFor", () => {
  it("strips a trailing /v1 and points at the server root /props", () => {
    // Each pair is [provider baseUrl, expected /props URL].
    const cases: Array<[string, string]> = [
      ["http://127.0.0.1:8888/v1", "http://127.0.0.1:8888/props"],
      ["http://host:8888/v1/", "http://host:8888/props"],
      ["http://host:8888", "http://host:8888/props"],
      ["http://host:8888/", "http://host:8888/props"],
    ];
    for (const [baseUrl, expected] of cases) {
      expect(propsUrlFor(baseUrl)).toBe(expected);
    }
  });
});
206+
207+
describe("contextWindowFromProps", () => {
  it("reads default_generation_settings.n_ctx (real llama.cpp shape)", () => {
    const props = { default_generation_settings: { n_ctx: 131072 } };
    expect(contextWindowFromProps(props)).toBe(131072);
  });

  it("falls back to a top-level n_ctx", () => {
    const props = { n_ctx: 65536 };
    expect(contextWindowFromProps(props)).toBe(65536);
  });

  it("returns undefined when absent or non-positive", () => {
    // Missing field, zero, a non-numeric string, and a null payload all yield undefined.
    const unusable: unknown[] = [
      {},
      { default_generation_settings: { n_ctx: 0 } },
      { default_generation_settings: { n_ctx: "lots" } },
      null,
    ];
    for (const props of unusable) {
      expect(contextWindowFromProps(props)).toBeUndefined();
    }
  });
});
221+
222+
describe("probeContextWindow", () => {
  // Minimal stand-in for a successful fetch Response carrying `payload` as its JSON body.
  function okRes(payload: unknown): Response {
    return { ok: true, json: async () => payload } as Response;
  }

  const BASE_URL = "http://x:8888/v1";

  it("returns the server's n_ctx on success", async () => {
    const fetchImpl = (async () =>
      okRes({ default_generation_settings: { n_ctx: 131072 } })) as unknown as typeof fetch;
    const got = await probeContextWindow(BASE_URL, { fetchImpl });
    expect(got).toBe(131072);
  });

  it("returns undefined on a non-OK response", async () => {
    const fetchImpl = (async () => ({ ok: false }) as Response) as unknown as typeof fetch;
    const got = await probeContextWindow(BASE_URL, { fetchImpl });
    expect(got).toBeUndefined();
  });

  it("returns undefined when fetch throws (server down / unreachable)", async () => {
    const fetchImpl = (async () => {
      throw new Error("ECONNREFUSED");
    }) as unknown as typeof fetch;
    const got = await probeContextWindow(BASE_URL, { fetchImpl });
    expect(got).toBeUndefined();
  });

  it("returns undefined when the response lacks n_ctx", async () => {
    const fetchImpl = (async () => okRes({ total_slots: 1 })) as unknown as typeof fetch;
    const got = await probeContextWindow(BASE_URL, { fetchImpl });
    expect(got).toBeUndefined();
  });

  it("honors an explicit props url override", async () => {
    // Record the URL the probe actually requests.
    let requestedUrl = "";
    const fetchImpl = (async (target: string) => {
      requestedUrl = target;
      return okRes({ default_generation_settings: { n_ctx: 40960 } });
    }) as unknown as typeof fetch;
    const got = await probeContextWindow(BASE_URL, { fetchImpl, url: "http://other/props" });
    expect(requestedUrl).toBe("http://other/props");
    expect(got).toBe(40960);
  });
});

.pi/extensions/llama-cpp-provider/config.ts

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,54 @@ export function loadProviders(pkgRoot: string, env: NodeJS.ProcessEnv = process.
146146
const withEnv = applyEnvOverrides(merged, env);
147147
return { providers: withEnv, sources };
148148
}
149+
150+
// ── live context-window detection (llama.cpp /props) ────────────────────────
151+
// little-coder budgets against the model's registered contextWindow. Rather than
152+
// trust the static value in models.json, we ask a running llama.cpp server for
153+
// its actual n_ctx at startup, so a `-c 131072` server shows 128k instead of the
154+
// declared default. Best-effort: any failure falls back to the declared window.
155+
156+
/**
 * Build the llama.cpp `/props` URL for an OpenAI-style `baseUrl`.
 *
 * llama-server serves `/props` at the server ROOT, not under `/v1` (which
 * 404s), so all trailing slashes and then a single trailing `/v1` segment are
 * removed before `/props` is appended.
 *
 * @param baseUrl Provider base URL, e.g. `http://127.0.0.1:8888/v1`.
 * @returns The URL with `/props` appended to the stripped root.
 */
export function propsUrlFor(baseUrl: string): string {
  // Drop every trailing slash first so "/v1/" and "/v1" are handled alike.
  let end = baseUrl.length;
  while (end > 0 && baseUrl.charAt(end - 1) === "/") {
    end -= 1;
  }
  const trimmed = baseUrl.slice(0, end);

  // Remove at most one trailing "/v1" segment.
  const root = trimmed.endsWith("/v1") ? trimmed.slice(0, trimmed.length - 3) : trimmed;
  return root + "/props";
}
163+
164+
/**
 * Pull the context window (n_ctx) out of a llama.cpp `/props` response.
 *
 * The value lives at `default_generation_settings.n_ctx` (the per-slot window —
 * exactly what one conversation can use); some builds also expose a top-level
 * `n_ctx`. The nested field is tried first, then the top-level one, and the
 * first usable value wins.
 *
 * A value is usable only if it is a number or a numeric string that converts
 * to a finite number greater than zero. Booleans, arrays and objects are
 * rejected rather than coerced (`Number(true)` is 1, `Number([4096])` is 4096).
 *
 * @param json Parsed `/props` body; may be any JSON value, including `null`.
 * @returns The context window, or `undefined` when no usable value is present.
 */
export function contextWindowFromProps(json: unknown): number | undefined {
  if (typeof json !== "object" || json === null) return undefined;

  const props = json as { default_generation_settings?: unknown; n_ctx?: unknown };
  const settings = props.default_generation_settings;
  const nested =
    typeof settings === "object" && settings !== null ? (settings as { n_ctx?: unknown }).n_ctx : undefined;

  // Check candidates in priority order, so an unusable nested value (e.g. 0)
  // no longer hides a valid top-level n_ctx.
  for (const candidate of [nested, props.n_ctx]) {
    if (typeof candidate !== "number" && typeof candidate !== "string") continue;
    const n = Number(candidate);
    if (Number.isFinite(n) && n > 0) return n;
  }
  return undefined;
}
173+
174+
/** Optional knobs and injection points for `probeContextWindow`. */
export interface ProbeDeps {
  /** Explicit `/props` URL; when omitted it is derived from the provider baseUrl. */
  url?: string;
  /** Upper bound on the probe, in milliseconds; the probe defaults to 1500 when omitted. */
  timeoutMs?: number;
  /** Fetch implementation to use; the global `fetch` is used when omitted. */
  fetchImpl?: typeof fetch;
}
179+
180+
/**
 * Ask a llama.cpp server for its live context window via `/props`.
 *
 * Best-effort: resolves to `undefined` on ANY failure (server down, no
 * `/props`, non-OK status, non-JSON body, timeout) so the caller falls back to
 * the declared window. It never rejects and never waits longer than the
 * timeout.
 *
 * @param baseUrl Provider base URL; used to derive the `/props` URL unless
 *   `deps.url` is given.
 * @param deps Optional overrides: `fetchImpl` (defaults to the global `fetch`),
 *   `url` (explicit `/props` URL) and `timeoutMs` (defaults to 1500; values
 *   that are not finite and positive also fall back to 1500).
 * @returns The server's context window, or `undefined` if it could not be
 *   determined in time.
 */
export async function probeContextWindow(baseUrl: string, deps: ProbeDeps = {}): Promise<number | undefined> {
  const DEFAULT_TIMEOUT_MS = 1500;
  // setTimeout stores the delay as a signed 32-bit int; larger values overflow
  // and fire almost immediately.
  const MAX_TIMEOUT_MS = 2_147_483_647;

  const fetchImpl = deps.fetchImpl ?? fetch;
  const requested = deps.timeoutMs;
  const timeoutMs =
    typeof requested === "number" && Number.isFinite(requested) && requested > 0
      ? Math.min(requested, MAX_TIMEOUT_MS)
      : DEFAULT_TIMEOUT_MS;

  const ctrl = new AbortController();
  let timer: ReturnType<typeof setTimeout> | undefined;

  // Hard deadline: aborts the request AND resolves on its own, so the bound
  // holds even if fetchImpl ignores the abort signal.
  const deadline = new Promise<undefined>((resolve) => {
    timer = setTimeout(() => {
      ctrl.abort();
      resolve(undefined);
    }, timeoutMs);
  });

  // URL derivation happens inside the guarded attempt so that a malformed
  // baseUrl degrades to `undefined` instead of rejecting.
  const attempt = (async (): Promise<number | undefined> => {
    const url = deps.url ?? propsUrlFor(baseUrl);
    const res = await fetchImpl(url, { signal: ctrl.signal });
    if (!res.ok) return undefined;
    return contextWindowFromProps(await res.json());
  })().catch(() => undefined);

  try {
    return await Promise.race([attempt, deadline]);
  } finally {
    clearTimeout(timer);
  }
}

.pi/extensions/llama-cpp-provider/index.ts

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { dirname, resolve } from "node:path";
22
import { fileURLToPath } from "node:url";
33
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
4-
import { loadProviders } from "./config.ts";
4+
import { loadProviders, probeContextWindow } from "./config.ts";
55

66
// Data-driven provider registration. Reads:
77
// 1. <pkgRoot>/models.json (shipped default)
@@ -16,7 +16,7 @@ import { loadProviders } from "./config.ts";
1616
const here = dirname(fileURLToPath(import.meta.url));
1717
const pkgRoot = resolve(here, "..", "..", "..");
1818

19-
export default function (pi: ExtensionAPI) {
19+
export default async function (pi: ExtensionAPI) {
2020
const result = loadProviders(pkgRoot);
2121

2222
for (const src of result.sources) {
@@ -33,12 +33,32 @@ export default function (pi: ExtensionAPI) {
3333
return;
3434
}
3535

36+
// Opt-out for offline / CI / no-server launches that don't want a startup probe.
37+
const probeDisabled = process.env.LITTLE_CODER_NO_CTX_PROBE === "1";
38+
3639
for (const [name, entry] of Object.entries(result.providers)) {
40+
let models = entry.models;
41+
42+
// Auto-detect the server's live context window so the model registers with
43+
// the real n_ctx (e.g. a `-c 131072` server) instead of models.json's
44+
// declared default — the TUI readout, read-guard, and context budget all
45+
// follow the registered window. llama.cpp-only (the /props endpoint); any
46+
// failure silently keeps the declared window, so this never breaks startup.
47+
if (!probeDisabled && name === "llamacpp" && entry.models.length > 0) {
48+
const probed = await probeContextWindow(entry.baseUrl, {
49+
url: process.env.LITTLE_CODER_LLAMACPP_PROPS_URL || undefined,
50+
timeoutMs: Number(process.env.LITTLE_CODER_CTX_PROBE_TIMEOUT_MS) || undefined,
51+
});
52+
if (probed) {
53+
models = entry.models.map((m) => ({ ...m, contextWindow: probed }));
54+
}
55+
}
56+
3757
pi.registerProvider(name, {
3858
baseUrl: entry.baseUrl,
3959
apiKey: entry.apiKey,
4060
api: entry.api,
41-
models: entry.models,
61+
models,
4262
});
4363
}
4464
}

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,21 @@
22

33
All notable changes to little-coder are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and little-coder's public interface (CLI, providers, tools, skills) follows semver starting at `v0.0.1` post-rename.
44

5+
## [v1.8.0] — 2026-05-23
6+
7+
little-coder now **auto-detects the llama.cpp server's live context window** at startup and registers the model with it, so a `llama-server -c 131072` shows 128k instead of the declared default — no config edit. This completes [v1.7.0](#v170--2026-05-23): the budget already *followed* the registered window; now the registered window itself comes from the running server.
8+
9+
### Added
10+
- **Live context-window detection for llama.cpp.** On startup `llama-cpp-provider` GETs the server's `/props` endpoint, reads its actual `n_ctx`, and registers the model with that window in place of the static `contextWindow` in `models.json`. The TUI context readout, read-guard's overflow trim, and the skill/knowledge budgets all then track the server's real window — bump `llama-server -c` and little-coder follows, no `models.json` or settings edit. The `/props` URL is derived from the provider baseUrl by stripping `/v1` (llama-server serves it at the root); the value is read from `default_generation_settings.n_ctx`. New tested helpers `propsUrlFor` / `contextWindowFromProps` / `probeContextWindow`, validated end-to-end against a live `-c 131072` server (→ 131072).
11+
- **Best-effort and safe:** 1.5 s timeout, `llamacpp` provider only, and ANY failure (server down, no `/props`, non-JSON, timeout) silently falls back to the declared window — startup is never blocked or broken.
12+
- **Env knobs:** `LITTLE_CODER_NO_CTX_PROBE=1` disables the probe (offline / CI); `LITTLE_CODER_LLAMACPP_PROPS_URL` overrides the `/props` URL for non-standard setups; `LITTLE_CODER_CTX_PROBE_TIMEOUT_MS` tunes the timeout.
13+
14+
### Notes for upgraders
15+
- This adds one best-effort HTTP GET to the llama.cpp `/props` endpoint at launch (only for the `llamacpp` provider). If your server/proxy doesn't expose `/props`, behaviour is unchanged — the declared `models.json` `contextWindow` (default 32768) is used. Set `LITTLE_CODER_NO_CTX_PROBE=1` to skip the probe entirely.
16+
- No CLI-flag or public-API changes.
17+
18+
---
19+
520
## [v1.7.0] — 2026-05-23
621

722
little-coder's context budget now follows the model's **live registered context window** instead of a hardcoded 32 768. Whatever window your provider declares for the active model (`contextWindow` in `models.json`, user-overridable) is what the whole harness budgets against — bump the model once and the TUI's context readout, read-guard's overflow trim, and the skill/knowledge-injection budgets all move together. This closes the common report: *"I bumped llama.cpp to 128k but little-coder still says 33k."*

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "little-coder",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"description": "A pi-based coding agent optimized for small local language models. Reproduces the whitepaper's scaffold-model-fit adaptations as pi extensions.",
55
"homepage": "https://github.com/itayinbarr/little-coder",
66
"repository": {

0 commit comments

Comments
 (0)