Skip to content

Commit a45713e

Browse files
Shaw and claude committed
refactor(local-inference): rewrite vision shim against mtmd ABI
Upstream llama.cpp removed `examples/llava/` and consolidated multimodal under `tools/mtmd/`. Rewrite the desktop dylib build, shim wrappers, and TS adapter bindings against the mtmd surface.

- Build script: drop `LLAMA_BUILD_EXAMPLES=ON` for vision; enable `LLAMA_BUILD_MTMD=ON` instead. Replace the cmake target probe loop (`llava_static`/`llava`/`mtmd`) with a single `--target mtmd` build. Drop static-lib find for `libllava*.a`/`libclip*.a`; stage shared `libmtmd.<ext>` next to libllama and link the shim with `-lmtmd`. Drop `examples/llava` include path; add `tools/mtmd`.
- Shim .h/.c: replace the llava/clip wrappers (eliza_clip_load / _free, eliza_llava_image_embed_load / _free / _eval) with the mtmd pointer-style surface (eliza_mtmd_init / _free, _bitmap_init_rgb / _bitmap_free, _input_chunks_init / _free / _size / _get, _input_chunk_type / _n_tokens, _tokenize, _encode_chunk, _output_embd). All gated behind #ifdef ELIZA_ENABLE_VISION.
- Desktop adapter: replace VisionShimSymbols + bindVision with the mtmd bindings. Rename clip ctx state to mtmd ctx. Stub describeImage with an actionable error citing the two missing pieces (JS-side RGB decode via `sharp` — already in app-core but not this plugin — and an embeddings-batch wrapper for llama_batch). Add u64 to the FFITypeEnum for the size_t mtmd args.
- FFI_BACKEND_WIREUP_PLAN.md: document mtmd ABI status, remaining gaps, and runtime smoke-test gate before defaulting on.

Non-vision builds are unchanged: ELIZA_ENABLE_VISION defaults off, no mtmd target is built, bindVision returns null, and describeImage throws the existing "vision build flag not set" error.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8c72568 commit a45713e

5 files changed

Lines changed: 364 additions & 362 deletions

File tree

packages/app-core/scripts/build-llama-cpp-desktop-dylib.mjs

Lines changed: 76 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -198,21 +198,26 @@ function buildTarget(targetKey) {
198198
fs.mkdirSync(buildDir, { recursive: true });
199199

200200
// ── Step 1: build libllama as a shared library ────────────────────────────
201-
// Vision (mmproj/llava) is opt-in: setting `ELIZA_ENABLE_VISION=1` in the
202-
// build env flips `LLAMA_BUILD_EXAMPLES=ON` so the llava + clip libraries
203-
// get built alongside libllama. The shim then compiles with
204-
// `-DELIZA_ENABLE_VISION=1` and links against them. Default builds skip
205-
// vision entirely (no examples target, no shim vision wrappers).
201+
// Vision (mmproj/mtmd) is opt-in: setting `ELIZA_ENABLE_VISION=1` in the
202+
// build env flips `LLAMA_BUILD_MTMD=ON` so the mtmd shared library
203+
// gets built alongside libllama. The shim then compiles with
204+
// `-DELIZA_ENABLE_VISION=1` and links against `libmtmd`. Default builds
205+
// skip vision entirely (no mtmd target, no shim vision wrappers).
206+
//
207+
// Upstream renamed the multimodal surface from `examples/llava/` to
208+
// `tools/mtmd/` and consolidated llava + clip into a single mtmd target
209+
// built as a shared library when `BUILD_SHARED_LIBS=ON`.
206210
const ENABLE_VISION = process.env.ELIZA_ENABLE_VISION === "1";
207211
const cmakeArgs = [
208212
srcDir,
209213
"-DCMAKE_BUILD_TYPE=Release",
210214
"-DBUILD_SHARED_LIBS=ON",
211215
"-DGGML_NATIVE=OFF",
212216
"-DLLAMA_BUILD_TESTS=OFF",
213-
`-DLLAMA_BUILD_EXAMPLES=${ENABLE_VISION ? "ON" : "OFF"}`,
217+
"-DLLAMA_BUILD_EXAMPLES=OFF",
214218
"-DLLAMA_BUILD_SERVER=OFF",
215219
"-DLLAMA_CURL=OFF",
220+
...(ENABLE_VISION ? ["-DLLAMA_BUILD_MTMD=ON"] : []),
216221
...t.cmakeFlags,
217222
];
218223
log(`cmake configure ${targetKey} (shared libllama)`);
@@ -233,46 +238,27 @@ function buildTarget(targetKey) {
233238
buildDir,
234239
);
235240

236-
// ── Step 1b: build llava + clip when vision is enabled ───────────────────
237-
// llama.cpp's llava lives under `examples/llava/` (older checkouts) or
238-
// `tools/mtmd/` (recent). The `llava` target builds both the llava and
239-
// clip object/static libraries when LLAMA_BUILD_EXAMPLES=ON. We build
240-
// both static libs here and link them into the shim below.
241+
// ── Step 1b: build mtmd when vision is enabled ───────────────────────────
242+
// llama.cpp HEAD exposes multimodal under `tools/mtmd/`. The `mtmd`
243+
// cmake target builds `libmtmd.<ext>` as a shared library (because
244+
// BUILD_SHARED_LIBS=ON). LLAMA_BUILD_MTMD=ON enables the target. The
245+
// shim links against this shared lib via `-lmtmd`.
241246
if (ENABLE_VISION) {
242-
log(`cmake build ${targetKey} (llava + clip for vision)`);
243-
// The target name has historically been `llava` (older) or
244-
// `llava_static`/`mtmd` (newer). Try the common ones; let cmake
245-
// surface a clear error if neither exists.
246-
const visionTargets = ["llava_static", "llava", "mtmd"];
247-
let visionTargetBuilt = false;
248-
for (const vt of visionTargets) {
249-
const probe = spawnSync(
250-
"cmake",
251-
[
252-
"--build",
253-
".",
254-
"--config",
255-
"Release",
256-
"--target",
257-
vt,
258-
"--parallel",
259-
String(os.cpus().length),
260-
],
261-
{ cwd: buildDir, stdio: "inherit" },
262-
);
263-
if (probe.status === 0) {
264-
visionTargetBuilt = true;
265-
log(`built vision target ${vt}`);
266-
break;
267-
}
268-
}
269-
if (!visionTargetBuilt) {
270-
die(
271-
`ELIZA_ENABLE_VISION=1 but could not build any of {${visionTargets.join(", ")}}; ` +
272-
`the llama.cpp checkout may not expose llava/mtmd. Check ` +
273-
`${path.join(srcDir, "examples", "llava")} or ${path.join(srcDir, "tools", "mtmd")}.`,
274-
);
275-
}
247+
log(`cmake build ${targetKey} (mtmd for vision)`);
248+
run(
249+
"cmake",
250+
[
251+
"--build",
252+
".",
253+
"--config",
254+
"Release",
255+
"--target",
256+
"mtmd",
257+
"--parallel",
258+
String(os.cpus().length),
259+
],
260+
buildDir,
261+
);
276262
}
277263

278264
// ── Step 2: locate the built libllama.<ext> and stage it ─────────────────
@@ -304,6 +290,34 @@ function buildTarget(targetKey) {
304290
log(`staging ${libllamaSrcPath}${outDir}`);
305291
fs.copyFileSync(libllamaSrcPath, path.join(outDir, libllamaName));
306292

293+
// ── Step 2b: stage libmtmd.<ext> when vision is enabled ──────────────────
294+
// mtmd is built as a shared lib (BUILD_SHARED_LIBS=ON propagates to all
295+
// targets). The shim's `-lmtmd` link will need it resolvable next to
296+
// libllama at load time via the same rpath.
297+
if (ENABLE_VISION) {
298+
const libmtmdName = `libmtmd.${t.libExt}`;
299+
const mtmdCandidates = [
300+
path.join(buildDir, libmtmdName),
301+
path.join(buildDir, "bin", libmtmdName),
302+
path.join(buildDir, "tools", "mtmd", libmtmdName),
303+
];
304+
let libmtmdSrcPath = mtmdCandidates.find((p) => fs.existsSync(p));
305+
if (!libmtmdSrcPath) {
306+
const found = spawnSync("find", [buildDir, "-name", libmtmdName, "-print"], {
307+
encoding: "utf8",
308+
});
309+
libmtmdSrcPath = found.stdout.split("\n").find((s) => s.trim());
310+
}
311+
if (!libmtmdSrcPath) {
312+
die(
313+
`ELIZA_ENABLE_VISION=1 but ${libmtmdName} not found in ${buildDir}; ` +
314+
`check that -DLLAMA_BUILD_MTMD=ON + -DBUILD_SHARED_LIBS=ON took effect.`,
315+
);
316+
}
317+
log(`staging ${libmtmdSrcPath}${outDir}`);
318+
fs.copyFileSync(libmtmdSrcPath, path.join(outDir, libmtmdName));
319+
}
320+
307321
// ── Step 3: stage headers ────────────────────────────────────────────────
308322
const incDir = path.join(outDir, "include");
309323
fs.mkdirSync(incDir, { recursive: true });
@@ -320,6 +334,15 @@ function buildTarget(targetKey) {
320334
path.join(SHIM_DIR, "eliza_llama_shim.h"),
321335
path.join(incDir, "eliza_llama_shim.h"),
322336
);
337+
// mtmd.h is staged into include/ for debug/reference; the shim compile
338+
// also has `-I${srcDir}/tools/mtmd` so this copy is optional but matches
339+
// how llama.h is staged.
340+
if (ENABLE_VISION) {
341+
const mtmdH = path.join(srcDir, "tools", "mtmd", "mtmd.h");
342+
if (fs.existsSync(mtmdH)) {
343+
fs.copyFileSync(mtmdH, path.join(incDir, "mtmd.h"));
344+
}
345+
}
323346

324347
// ── Step 4: compile the shim and NEEDED-link libllama ────────────────────
325348
const shimOut = path.join(outDir, `libeliza-llama-shim.${t.libExt}`);
@@ -340,52 +363,16 @@ function buildTarget(targetKey) {
340363
"-lllama",
341364
];
342365

343-
// Vision opt-in: link against the static llava + clip libraries built
344-
// in Step 1b. We add their include dirs + static-lib paths to the
345-
// compile invocation, and define ELIZA_ENABLE_VISION so the shim's
346-
// `#ifdef` block compiles.
366+
// Vision opt-in: link against the shared `libmtmd.<ext>` built in
367+
// Step 1b. We add the mtmd include dir to the compile invocation and
368+
// define ELIZA_ENABLE_VISION so the shim's `#ifdef` block compiles.
347369
if (ENABLE_VISION) {
348370
compilerArgs.push("-DELIZA_ENABLE_VISION=1");
349-
// Header search paths for llava.h / clip.h. Older llama.cpp keeps them
350-
// under examples/llava/; newer puts them under tools/mtmd/. Add both —
351-
// missing dirs are harmless.
352-
compilerArgs.push(
353-
`-I${path.join(srcDir, "examples", "llava")}`,
354-
`-I${path.join(srcDir, "tools", "mtmd")}`,
355-
);
356-
// Find the built static libs and link them. They land somewhere
357-
// under buildDir depending on cmake generator; do a shallow scan
358-
// for `libllava*.a` and `libclip*.a` plus `libmtmd*.a`.
359-
const visionLibCandidates = spawnSync(
360-
"find",
361-
[
362-
buildDir,
363-
"-name",
364-
"libllava*.a",
365-
"-o",
366-
"-name",
367-
"libclip*.a",
368-
"-o",
369-
"-name",
370-
"libmtmd*.a",
371-
],
372-
{ encoding: "utf8" },
373-
);
374-
const visionLibs = visionLibCandidates.stdout
375-
.split("\n")
376-
.filter((s) => s.trim());
377-
if (visionLibs.length === 0) {
378-
die(
379-
`ELIZA_ENABLE_VISION=1 but no llava/clip/mtmd static lib found in ${buildDir} ` +
380-
`after the vision target build step. Check Step 1b's cmake output.`,
381-
);
382-
}
383-
for (const lib of visionLibs) {
384-
log(`linking vision lib ${lib}`);
385-
compilerArgs.push(lib);
386-
}
387-
// Linker also needs C++ stdlib for llava (it's C++ under the hood).
388-
compilerArgs.push(platform === "darwin" ? "-lc++" : "-lstdc++");
371+
// Header search path for mtmd.h.
372+
compilerArgs.push(`-I${path.join(srcDir, "tools", "mtmd")}`);
373+
// NEEDED-link against the staged libmtmd next to libllama. The shared
374+
// lib pulls in its own C++ transitive deps, so no -lc++/-lstdc++ here.
375+
compilerArgs.push("-lmtmd");
389376
}
390377

391378
// Set rpath so libeliza-llama-shim resolves libllama from its own dir at

packages/app-core/scripts/desktop-llama-shim/eliza_llama_shim.c

Lines changed: 43 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -607,67 +607,64 @@ void* eliza_llama_sampler_init_prefill_plan(const uint8_t* plan_bytes, size_t pl
607607
return llama_sampler_init(&prefill_plan_sampler_i, s);
608608
}
609609

610-
// ── vision: mmproj-driven image describe (OPT-IN) ────────────────────────────
610+
// ── vision: mmproj-driven image describe (OPT-IN, mtmd ABI) ──────────────────
611611
//
612612
// Gated by `-DELIZA_ENABLE_VISION=1` at shim compile time. The build script
613613
// flips this when `ELIZA_ENABLE_VISION=1` is set in the build env, AND
614-
// supplements the compile invocation with the llava + clip object files
615-
// (or static lib) so the symbols exist to link.
614+
// adds `-I${srcDir}/tools/mtmd` + `-lmtmd` so the symbols below resolve.
616615
//
617-
// llama.cpp's vision API has shifted between releases: older checkouts
618-
// expose llava via `examples/llava/{llava.h, clip.h}`; recent ones moved
619-
// to `tools/mtmd/mtmd.h` with a slightly different surface. The wrappers
620-
// below target the older but more stable API. The build script's vision
621-
// step is responsible for putting those headers on the include path.
616+
// llama.cpp HEAD consolidated the historical `examples/llava/{llava.h,clip.h}`
617+
// path into a single `tools/mtmd/mtmd.h` surface. The wrappers below target
618+
// that ABI and assume `libmtmd.<ext>` is staged next to libllama so the
619+
// runtime loader can resolve it via the shim's rpath.
622620

623621
#ifdef ELIZA_ENABLE_VISION
622+
#include "mtmd.h"
624623

625-
#include "clip.h"
626-
#include "llava.h"
627-
628-
void* eliza_clip_load(const char* path) {
629-
if (!path) return NULL;
630-
// verbosity=0 silences clip_model_load's stderr chatter (it prints
631-
// per-layer load progress otherwise, which spams the host).
632-
return (void*)clip_model_load(path, /*verbosity=*/0);
624+
void* eliza_mtmd_init(const char* mmproj_path, void* text_model, bool use_gpu, int n_threads) {
625+
if (!mmproj_path || !text_model) return NULL;
626+
struct mtmd_context_params p = mtmd_context_params_default();
627+
p.use_gpu = use_gpu;
628+
p.n_threads = n_threads;
629+
p.print_timings = false;
630+
return (void*)mtmd_init_from_file(mmproj_path, (const struct llama_model*)text_model, p);
633631
}
632+
void eliza_mtmd_free(void* ctx) { if (ctx) mtmd_free((mtmd_context*)ctx); }
634633

635-
void eliza_clip_free(void* ctx_clip) {
636-
if (ctx_clip) clip_free((struct clip_ctx*)ctx_clip);
634+
void* eliza_mtmd_bitmap_init_rgb(uint32_t nx, uint32_t ny, const uint8_t* rgb) {
635+
return (!rgb) ? NULL : (void*)mtmd_bitmap_init(nx, ny, rgb);
637636
}
637+
void eliza_mtmd_bitmap_free(void* bm) { if (bm) mtmd_bitmap_free((mtmd_bitmap*)bm); }
638638

639-
void* eliza_llava_image_embed_load(
640-
void* ctx_clip,
641-
int32_t n_threads,
642-
const uint8_t* image_bytes,
643-
int32_t image_bytes_length)
644-
{
645-
if (!ctx_clip || !image_bytes || image_bytes_length <= 0) return NULL;
646-
return (void*)llava_image_embed_make_with_bytes(
647-
(struct clip_ctx*)ctx_clip,
648-
n_threads,
649-
image_bytes,
650-
image_bytes_length);
639+
void* eliza_mtmd_input_chunks_init(void) { return (void*)mtmd_input_chunks_init(); }
640+
void eliza_mtmd_input_chunks_free(void* c) { if (c) mtmd_input_chunks_free((mtmd_input_chunks*)c); }
641+
642+
int32_t eliza_mtmd_tokenize(void* ctx, void* out_chunks,
643+
const char* text, bool add_special, bool parse_special,
644+
void* const* bitmaps, size_t n_bitmaps) {
645+
if (!ctx || !out_chunks || !text) return -1;
646+
struct mtmd_input_text t = { text, add_special, parse_special };
647+
return mtmd_tokenize((mtmd_context*)ctx, (mtmd_input_chunks*)out_chunks,
648+
&t, (const mtmd_bitmap**)bitmaps, n_bitmaps);
651649
}
652650

653-
void eliza_llava_image_embed_free(void* embed) {
654-
if (embed) llava_image_embed_free((struct llava_image_embed*)embed);
651+
size_t eliza_mtmd_input_chunks_size(void* c) { return mtmd_input_chunks_size((const mtmd_input_chunks*)c); }
652+
void* eliza_mtmd_input_chunks_get(void* c, size_t i) {
653+
return (void*)mtmd_input_chunks_get((const mtmd_input_chunks*)c, i);
654+
}
655+
int32_t eliza_mtmd_input_chunk_type(void* ch) {
656+
return (int32_t)mtmd_input_chunk_get_type((const mtmd_input_chunk*)ch);
657+
}
658+
size_t eliza_mtmd_input_chunk_n_tokens(void* ch) {
659+
return mtmd_input_chunk_get_n_tokens((const mtmd_input_chunk*)ch);
655660
}
656661

657-
int32_t eliza_llava_image_embed_eval(
658-
void* ctx_llama,
659-
void* embed,
660-
int32_t n_batch,
661-
int32_t* n_past)
662-
{
663-
if (!ctx_llama || !embed || !n_past) return -1;
664-
// llava_eval_image_embed returns bool; map to int32 for the ABI.
665-
bool ok = llava_eval_image_embed(
666-
(struct llama_context*)ctx_llama,
667-
(const struct llava_image_embed*)embed,
668-
n_batch,
669-
n_past);
670-
return ok ? 0 : -1;
662+
int32_t eliza_mtmd_encode_chunk(void* ctx, void* chunk) {
663+
if (!ctx || !chunk) return -1;
664+
return mtmd_encode_chunk((mtmd_context*)ctx, (const mtmd_input_chunk*)chunk);
671665
}
672666

667+
const float* eliza_mtmd_output_embd(void* ctx) {
668+
return ctx ? mtmd_get_output_embd((mtmd_context*)ctx) : NULL;
669+
}
673670
#endif // ELIZA_ENABLE_VISION

0 commit comments

Comments
 (0)