Skip to content

Commit 8c72568

Browse files
Shaw and claude committed
feat(plugin-local-inference): serialize resizeParallel with growLock
Adds a per-instance async mutex around DesktopLlamaAdapter.resizeParallel() to serialize concurrent callers.

The C-side llama_init_from_model is already thread-safe (Metal registry uses a static std::mutex; CUDA/Vulkan ctx ctors are independent) and bun:ffi calls block the JS thread, so within a single call the for-loop body is already safe. This lock exists so that any `await`s added inside resizeParallel in the future cannot let two callers interleave pool mutations on ctxPool, hasDecodedFlags, and drafterAttached.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c793ccd commit 8c72568

1 file changed

Lines changed: 75 additions & 54 deletions

File tree

plugins/plugin-local-inference/src/services/desktop-llama-adapter.ts

Lines changed: 75 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -602,6 +602,17 @@ export class DesktopLlamaAdapter {
602602
private modelPtr: Pointer | null = null;
603603
/** Pool of `llama_context` instances. Index 0 is allocated by `loadModel()`. */
604604
private ctxPool: Pointer[] = [];
605+
/**
606+
* Serializes concurrent resizeParallel() callers. The C-side
607+
* llama_init_from_model is itself thread-safe (Metal registry uses
608+
* static std::mutex; CUDA/Vulkan ctx ctors are independent) and
609+
* bun:ffi calls block the JS thread, so within one call the for-loop
610+
* inside resizeParallel is already safe. This lock exists so future
611+
* `await`s added inside resizeParallel cannot let two callers
612+
* interleave pool mutations (push/pop on ctxPool, hasDecodedFlags,
613+
* drafterAttached).
614+
*/
615+
private growLock: Promise<unknown> = Promise.resolve();
605616
/** Per-ctx KV-decoded flag (drives the `memory_clear` guard between sessions). */
606617
private hasDecodedFlags: boolean[] = [];
607618
/** Per-ctx attached drafter — `null` when no drafter on that ctx. */
@@ -880,66 +891,76 @@ export class DesktopLlamaAdapter {
880891
* Returns true when the pool size actually changed, false on no-op.
881892
*/
882893
async resizeParallel(target: number): Promise<boolean> {
883-
if (!this.modelPtr || !this.loadOpts) {
884-
throw new Error("[desktop-llama] resizeParallel before model load");
885-
}
886-
if (target < 1) {
887-
throw new Error(
888-
`[desktop-llama] resizeParallel target must be >= 1, got ${target}`,
889-
);
890-
}
891-
const current = this.ctxPool.length;
892-
if (target === current) return false;
893-
if (target < current) {
894-
// Refuse to shrink while sessions are still pinned to outgoing slots.
895-
for (const sess of this.sessions.values()) {
896-
if (sess.ctxIdx >= target) {
894+
const prev = this.growLock;
895+
let release!: () => void;
896+
this.growLock = new Promise<void>((r) => {
897+
release = r;
898+
});
899+
try {
900+
await prev;
901+
if (!this.modelPtr || !this.loadOpts) {
902+
throw new Error("[desktop-llama] resizeParallel before model load");
903+
}
904+
if (target < 1) {
905+
throw new Error(
906+
`[desktop-llama] resizeParallel target must be >= 1, got ${target}`,
907+
);
908+
}
909+
const current = this.ctxPool.length;
910+
if (target === current) return false;
911+
if (target < current) {
912+
// Refuse to shrink while sessions are still pinned to outgoing slots.
913+
for (const sess of this.sessions.values()) {
914+
if (sess.ctxIdx >= target) {
915+
throw new Error(
916+
`[desktop-llama] cannot shrink pool to ${target}: session pinned to ctxIdx=${sess.ctxIdx}`,
917+
);
918+
}
919+
}
920+
for (let i = current - 1; i >= target; i--) {
921+
const ctx = this.ctxPool[i];
922+
if (ctx !== undefined) this.llama.llama_free(ctx);
923+
this.ctxPool.pop();
924+
this.hasDecodedFlags.pop();
925+
this.drafterAttached.pop();
926+
}
927+
return true;
928+
}
929+
// Grow: allocate (target - current) additional ctxs.
930+
for (let i = current; i < target; i++) {
931+
const cp = this.shim.eliza_llama_context_params_default();
932+
let nextCtx: Pointer;
933+
try {
934+
const ctxSize = this.loadOpts.contextSize ?? 4096;
935+
const nBatch = this.loadOpts.nBatch ?? 256;
936+
const threads = this.loadOpts.threads ?? defaultThreads();
937+
this.shim.eliza_llama_context_params_set_n_ctx(cp, ctxSize);
938+
this.shim.eliza_llama_context_params_set_n_batch(cp, nBatch);
939+
this.shim.eliza_llama_context_params_set_n_ubatch(
940+
cp,
941+
this.loadOpts.nUBatch ?? nBatch,
942+
);
943+
this.shim.eliza_llama_context_params_set_n_threads(cp, threads);
944+
this.shim.eliza_llama_context_params_set_n_threads_batch(cp, threads);
945+
this.shim.eliza_llama_context_params_set_embeddings(cp, false);
946+
this.shim.eliza_llama_context_params_set_offload_kqv(cp, true);
947+
nextCtx = this.shim.eliza_llama_init_from_model(this.modelPtr, cp);
948+
} finally {
949+
this.shim.eliza_llama_context_params_free(cp);
950+
}
951+
if (!nextCtx) {
897952
throw new Error(
898-
`[desktop-llama] cannot shrink pool to ${target}: session pinned to ctxIdx=${sess.ctxIdx}`,
953+
`[desktop-llama] llama_init_from_model failed when growing pool to ${target}`,
899954
);
900955
}
901-
}
902-
for (let i = current - 1; i >= target; i--) {
903-
const ctx = this.ctxPool[i];
904-
if (ctx !== undefined) this.llama.llama_free(ctx);
905-
this.ctxPool.pop();
906-
this.hasDecodedFlags.pop();
907-
this.drafterAttached.pop();
956+
this.ctxPool.push(nextCtx);
957+
this.hasDecodedFlags.push(false);
958+
this.drafterAttached.push(false);
908959
}
909960
return true;
961+
} finally {
962+
release();
910963
}
911-
// Grow: allocate (target - current) additional ctxs.
912-
for (let i = current; i < target; i++) {
913-
const cp = this.shim.eliza_llama_context_params_default();
914-
let nextCtx: Pointer;
915-
try {
916-
const ctxSize = this.loadOpts.contextSize ?? 4096;
917-
const nBatch = this.loadOpts.nBatch ?? 256;
918-
const threads = this.loadOpts.threads ?? defaultThreads();
919-
this.shim.eliza_llama_context_params_set_n_ctx(cp, ctxSize);
920-
this.shim.eliza_llama_context_params_set_n_batch(cp, nBatch);
921-
this.shim.eliza_llama_context_params_set_n_ubatch(
922-
cp,
923-
this.loadOpts.nUBatch ?? nBatch,
924-
);
925-
this.shim.eliza_llama_context_params_set_n_threads(cp, threads);
926-
this.shim.eliza_llama_context_params_set_n_threads_batch(cp, threads);
927-
this.shim.eliza_llama_context_params_set_embeddings(cp, false);
928-
this.shim.eliza_llama_context_params_set_offload_kqv(cp, true);
929-
nextCtx = this.shim.eliza_llama_init_from_model(this.modelPtr, cp);
930-
} finally {
931-
this.shim.eliza_llama_context_params_free(cp);
932-
}
933-
if (!nextCtx) {
934-
throw new Error(
935-
`[desktop-llama] llama_init_from_model failed when growing pool to ${target}`,
936-
);
937-
}
938-
this.ctxPool.push(nextCtx);
939-
this.hasDecodedFlags.push(false);
940-
this.drafterAttached.push(false);
941-
}
942-
return true;
943964
}
944965

945966
/**

0 commit comments

Comments
 (0)