v1.8.0: auto-detect llama.cpp live context window at startup

Itay Inbar · claude · Itay Inbar · commit c3a71449102d · 2026-05-23T17:51:49.000+03:00
llama-cpp-provider now probes the server's /props endpoint for its real
n_ctx and registers the model with that window instead of models.json's
declared default — so a `-c 131072` server shows 128k with no config
edit, and (via v1.7.0) the TUI readout, read-guard, and budgets all
follow it. /props is derived from baseUrl by stripping /v1; n_ctx read
from default_generation_settings.n_ctx. Best-effort: llamacpp-only,
1.5s timeout, any failure falls back to the declared window — startup
never blocks/breaks. Env: LITTLE_CODER_NO_CTX_PROBE,
LITTLE_CODER_LLAMACPP_PROPS_URL, LITTLE_CODER_CTX_PROBE_TIMEOUT_MS.
New tested propsUrlFor/contextWindowFromProps/probeContextWindow;
validated end-to-end against a live -c 131072 server (→ 131072).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/.pi/extensions/llama-cpp-provider/config.test.ts b/.pi/extensions/llama-cpp-provider/config.test.ts
@@ -3,7 +3,16 @@ import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs";
 import { tmpdir } from "node:os";
 import { dirname, join, resolve } from "node:path";
 import { fileURLToPath } from "node:url";
-import { applyEnvOverrides, loadProviders, mergeProviders, resolveOverridePath, type ProviderEntry } from "./config.ts";
+import {
+  applyEnvOverrides,
+  loadProviders,
+  mergeProviders,
+  resolveOverridePath,
+  propsUrlFor,
+  contextWindowFromProps,
+  probeContextWindow,
+  type ProviderEntry,
+} from "./config.ts";
 
 const sampleProvider = (baseUrl: string, modelId: string): ProviderEntry => ({
   api: "openai-completions",
@@ -185,3 +194,65 @@ describe("shipped models.json", () => {
     expect(Object.keys(result.providers).sort()).toEqual(["llamacpp", "lmstudio", "ollama"]);
   });
 });
+
+describe("propsUrlFor", () => {
+  it("strips a trailing /v1 and points at the server root /props", () => {
+    // [baseUrl, expected /props URL] — with and without /v1, with and without a trailing slash.
+    const cases: Array<[string, string]> = [
+      ["http://127.0.0.1:8888/v1", "http://127.0.0.1:8888/props"],
+      ["http://host:8888/v1/", "http://host:8888/props"],
+      ["http://host:8888", "http://host:8888/props"],
+      ["http://host:8888/", "http://host:8888/props"],
+    ];
+    for (const [baseUrl, expected] of cases) {
+      expect(propsUrlFor(baseUrl)).toBe(expected);
+    }
+  });
+});
+
+describe("contextWindowFromProps", () => {
+  // Builds the nested shape the parser looks at first.
+  const nested = (n_ctx: unknown) => ({ default_generation_settings: { n_ctx } });
+
+  it("reads default_generation_settings.n_ctx (real llama.cpp shape)", () => {
+    const props = nested(131072);
+    expect(contextWindowFromProps(props)).toBe(131072);
+  });
+  it("falls back to a top-level n_ctx", () => {
+    const props = { n_ctx: 65536 };
+    expect(contextWindowFromProps(props)).toBe(65536);
+  });
+  it("returns undefined when absent or non-positive", () => {
+    // Same four inputs, same order: empty object, zero, non-numeric string, null.
+    const rejected: unknown[] = [{}, nested(0), nested("lots"), null];
+    for (const props of rejected) {
+      expect(contextWindowFromProps(props)).toBeUndefined();
+    }
+  });
+});
+
+describe("probeContextWindow", () => {
+  const BASE_URL = "http://x:8888/v1";
+  // Successful stand-in response carrying `payload` as its JSON body.
+  const respond = (payload: unknown) => ({ ok: true, json: async () => payload }) as Response;
+  // Casts a hand-written stub to the fetch signature the probe accepts.
+  const asFetch = (stub: (u: string) => Promise<Response>) => stub as unknown as typeof fetch;
+
+  it("returns the server's n_ctx on success", async () => {
+    const fetchImpl = asFetch(async () => respond({ default_generation_settings: { n_ctx: 131072 } }));
+    const got = await probeContextWindow(BASE_URL, { fetchImpl });
+    expect(got).toBe(131072);
+  });
+
+  it("returns undefined on a non-OK response", async () => {
+    const fetchImpl = asFetch(async () => ({ ok: false }) as Response);
+    const got = await probeContextWindow(BASE_URL, { fetchImpl });
+    expect(got).toBeUndefined();
+  });
+
+  it("returns undefined when fetch throws (server down / unreachable)", async () => {
+    const fetchImpl = asFetch(async () => {
+      throw new Error("ECONNREFUSED");
+    });
+    const got = await probeContextWindow(BASE_URL, { fetchImpl });
+    expect(got).toBeUndefined();
+  });
+
+  it("returns undefined when the response lacks n_ctx", async () => {
+    const fetchImpl = asFetch(async () => respond({ total_slots: 1 }));
+    const got = await probeContextWindow(BASE_URL, { fetchImpl });
+    expect(got).toBeUndefined();
+  });
+
+  it("honors an explicit props url override", async () => {
+    // Records the URL the stub was actually called with.
+    let requested = "";
+    const fetchImpl = asFetch(async (u) => {
+      requested = u;
+      return respond({ default_generation_settings: { n_ctx: 40960 } });
+    });
+    const got = await probeContextWindow(BASE_URL, { fetchImpl, url: "http://other/props" });
+    expect(requested).toBe("http://other/props");
+    expect(got).toBe(40960);
+  });
+});
diff --git a/.pi/extensions/llama-cpp-provider/config.ts b/.pi/extensions/llama-cpp-provider/config.ts
@@ -146,3 +146,54 @@ export function loadProviders(pkgRoot: string, env: NodeJS.ProcessEnv = process.
   const withEnv = applyEnvOverrides(merged, env);
   return { providers: withEnv, sources };
 }
+
+// ── live context-window detection (llama.cpp /props) ────────────────────────
+// little-coder budgets against the model's registered contextWindow. Rather than
+// trust the static value in models.json, we ask a running llama.cpp server for
+// its actual n_ctx at startup, so a `-c 131072` server shows 128k instead of the
+// declared default. Best-effort: any failure falls back to the declared window.
+
+/** Build the llama.cpp `/props` URL that belongs to an OpenAI-style baseUrl.
+ *  llama-server answers /props at the server ROOT and 404s under /v1, so any
+ *  trailing slashes are dropped first, then a trailing /v1 segment, and only
+ *  then is /props appended. */
+export function propsUrlFor(baseUrl: string): string {
+  const withoutTrailingSlashes = baseUrl.replace(/\/+$/, "");
+  const serverRoot = withoutTrailingSlashes.replace(/\/v1$/, "");
+  return serverRoot + "/props";
+}
+
+/** Pull the context window (n_ctx) out of a llama.cpp /props response. It lives
+ *  at default_generation_settings.n_ctx (the per-slot window — exactly what one
+ *  conversation can use); some builds also expose a top-level n_ctx. The nested
+ *  value is preferred, but when it is missing OR unusable the top-level one is
+ *  tried, so one bad field cannot hide a good one.
+ *  @param json Parsed /props body; any shape (including null) is tolerated.
+ *  @returns The first finite, positive n_ctx found, or undefined if there is none. */
+export function contextWindowFromProps(json: unknown): number | undefined {
+  const j = json as { default_generation_settings?: { n_ctx?: unknown }; n_ctx?: unknown } | null;
+  for (const raw of [j?.default_generation_settings?.n_ctx, j?.n_ctx]) {
+    // Only numbers and numeric strings qualify: Number() would otherwise coerce
+    // `true` to 1 and `[4096]` to 4096 and register a bogus window.
+    if (typeof raw !== "number" && typeof raw !== "string") continue;
+    const n = Number(raw);
+    if (Number.isFinite(n) && n > 0) return n;
+  }
+  return undefined;
+}
+
+/** Optional knobs for probeContextWindow; each one has a fallback when omitted. */
+export interface ProbeDeps {
+  /** fetch implementation to call; the global fetch is used when omitted. */
+  fetchImpl?: typeof fetch;
+  /** Milliseconds before the request is aborted; 1500 when omitted. */
+  timeoutMs?: number;
+  /** Exact URL to request; when omitted it is derived from baseUrl via propsUrlFor. */
+  url?: string;
+}
+
+/** Ask a llama.cpp server for its live context window via /props. Returns
+ *  undefined on ANY failure (server down, no /props, non-JSON, timeout) so the
+ *  caller falls back to the declared window — never throws, and the request is
+ *  aborted once timeoutMs has elapsed.
+ *  @param baseUrl OpenAI-style provider base URL; only used to derive the /props
+ *    URL when deps.url is not given.
+ *  @param deps Optional fetch implementation, timeout and URL override.
+ *  @returns The server's n_ctx, or undefined when it could not be determined. */
+export async function probeContextWindow(baseUrl: string, deps: ProbeDeps = {}): Promise<number | undefined> {
+  const fetchImpl = deps.fetchImpl ?? fetch;
+  const url = deps.url ?? propsUrlFor(baseUrl);
+  // setTimeout coerces NaN, sub-1 ms and > 2^31-1 ms delays to 1 ms, which would
+  // abort the probe almost immediately and make it fail silently. Treat an
+  // unusable timeout as "not set" and cap an oversized one instead.
+  const requestedMs = deps.timeoutMs;
+  const timeoutMs =
+    requestedMs !== undefined && Number.isFinite(requestedMs) && requestedMs >= 1
+      ? Math.min(requestedMs, 2147483647)
+      : 1500;
+  const ctrl = new AbortController();
+  const timer = setTimeout(() => ctrl.abort(), timeoutMs);
+  try {
+    const res = await fetchImpl(url, { signal: ctrl.signal });
+    if (!res.ok) return undefined;
+    return contextWindowFromProps(await res.json());
+  } catch {
+    // Deliberately silent: an unreachable server, an abort, or a non-JSON body
+    // all mean "keep the declared window".
+    return undefined;
+  } finally {
+    clearTimeout(timer);
+  }
+}
diff --git a/.pi/extensions/llama-cpp-provider/index.ts b/.pi/extensions/llama-cpp-provider/index.ts
@@ -1,7 +1,7 @@
 import { dirname, resolve } from "node:path";
 import { fileURLToPath } from "node:url";
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
-import { loadProviders } from "./config.ts";
+import { loadProviders, probeContextWindow } from "./config.ts";
 
 // Data-driven provider registration. Reads:
 //   1. <pkgRoot>/models.json                       (shipped default)
@@ -16,7 +16,7 @@ import { loadProviders } from "./config.ts";
 const here = dirname(fileURLToPath(import.meta.url));
 const pkgRoot = resolve(here, "..", "..", "..");
 
-export default function (pi: ExtensionAPI) {
+export default async function (pi: ExtensionAPI) {
   const result = loadProviders(pkgRoot);
 
   for (const src of result.sources) {
@@ -33,12 +33,32 @@ export default function (pi: ExtensionAPI) {
     return;
   }
 
+  // Opt-out for offline / CI / no-server launches that don't want a startup probe.
+  const probeDisabled = process.env.LITTLE_CODER_NO_CTX_PROBE === "1";
+
   for (const [name, entry] of Object.entries(result.providers)) {
+    let models = entry.models;
+
+    // Auto-detect the server's live context window so the model registers with
+    // the real n_ctx (e.g. a `-c 131072` server) instead of models.json's
+    // declared default — the TUI readout, read-guard, and context budget all
+    // follow the registered window. llama.cpp-only (the /props endpoint); any
+    // failure silently keeps the declared window, so this never breaks startup.
+    if (!probeDisabled && name === "llamacpp" && entry.models.length > 0) {
+      const probed = await probeContextWindow(entry.baseUrl, {
+        url: process.env.LITTLE_CODER_LLAMACPP_PROPS_URL || undefined,
+        timeoutMs: Number(process.env.LITTLE_CODER_CTX_PROBE_TIMEOUT_MS) || undefined,
+      });
+      if (probed) {
+        models = entry.models.map((m) => ({ ...m, contextWindow: probed }));
+      }
+    }
+
     pi.registerProvider(name, {
       baseUrl: entry.baseUrl,
       apiKey: entry.apiKey,
       api: entry.api,
-      models: entry.models,
+      models,
     });
   }
 }
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,21 @@
 
 All notable changes to little-coder are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and little-coder's public interface (CLI, providers, tools, skills) follows semver starting at `v0.0.1` post-rename.
 
+## [v1.8.0] — 2026-05-23
+
+little-coder now **auto-detects the llama.cpp server's live context window** at startup and registers the model with it, so a `llama-server -c 131072` shows 128k instead of the declared default — no config edit. This completes [v1.7.0](#v170--2026-05-23): the budget already *followed* the registered window; now the registered window itself comes from the running server.
+
+### Added
+- **Live context-window detection for llama.cpp.** On startup `llama-cpp-provider` GETs the server's `/props` endpoint, reads its actual `n_ctx`, and registers the model with that window in place of the static `contextWindow` in `models.json`. The TUI context readout, read-guard's overflow trim, and the skill/knowledge budgets all then track the server's real window — bump `llama-server -c` and little-coder follows, no `models.json` or settings edit. The `/props` URL is derived from the provider baseUrl by stripping `/v1` (llama-server serves it at the root); the value is read from `default_generation_settings.n_ctx`. New tested helpers `propsUrlFor` / `contextWindowFromProps` / `probeContextWindow`, validated end-to-end against a live `-c 131072` server (→ 131072).
+  - **Best-effort and safe:** 1.5 s timeout, `llamacpp` provider only, and ANY failure (server down, no `/props`, non-JSON, timeout) silently falls back to the declared window — startup is never blocked or broken.
+  - **Env knobs:** `LITTLE_CODER_NO_CTX_PROBE=1` disables the probe (offline / CI); `LITTLE_CODER_LLAMACPP_PROPS_URL` overrides the `/props` URL for non-standard setups; `LITTLE_CODER_CTX_PROBE_TIMEOUT_MS` tunes the timeout.
+
+### Notes for upgraders
+- This adds one best-effort HTTP GET to the llama.cpp `/props` endpoint at launch (only for the `llamacpp` provider). If your server/proxy doesn't expose `/props`, behaviour is unchanged — the declared `models.json` `contextWindow` (default 32768) is used. Set `LITTLE_CODER_NO_CTX_PROBE=1` to skip the probe entirely.
+- No CLI-flag or public-API changes.
+
+---
+
 ## [v1.7.0] — 2026-05-23
 
 little-coder's context budget now follows the model's **live registered context window** instead of a hardcoded 32 768. Whatever window your provider declares for the active model (`contextWindow` in `models.json`, user-overridable) is what the whole harness budgets against — bump the model once and the TUI's context readout, read-guard's overflow trim, and the skill/knowledge-injection budgets all move together. This closes the common report: *"I bumped llama.cpp to 128k but little-coder still says 33k."*
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "little-coder",
-  "version": "1.7.0",
+  "version": "1.8.0",
   "description": "A pi-based coding agent optimized for small local language models. Reproduces the whitepaper's scaffold-model-fit adaptations as pi extensions.",
   "homepage": "https://github.com/itayinbarr/little-coder",
   "repository": {

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "little-coder",`
`3`		`- "version": "1.7.0",`
	`3`	`+ "version": "1.8.0",`
`4`	`4`	`"description": "A pi-based coding agent optimized for small local language models. Reproduces the whitepaper's scaffold-model-fit adaptations as pi extensions.",`
`5`	`5`	`"homepage": "https://github.com/itayinbarr/little-coder",`
`6`	`6`	`"repository": {`