Skip to content

Commit 9323f4b

Browse files
localai-bot and mudler authored
feat(llama-cpp): video input support (mtmd #24269) (#10216)
* chore(llama-cpp): bump to 8f83d6c for mtmd video input support

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(llama-cpp): forward video input to mtmd (template + non-template paths)

  Wire request->videos() into grpc-server.cpp mirroring the existing image and audio handling: a video_data build + non-template files extraction, and input_video chat chunks on the tokenizer-template path. allow_video is auto-set at model load by the vendored upstream chat_params.

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ui): add video attachment support to the chat UI

  Mirror the image/audio attachment path for video: emit video_url content parts, accept video/* in the picker, keep video files as base64, show a film icon badge, and render attached video inline with a <video> player.

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(llama-cpp): patch mtmd video stdin double-close (heap crash)

  Upstream mtmd video input (ggml-org/llama.cpp#24269) double-fcloses the ffmpeg/ffprobe stdin FILE: feed_stdin() fclose()s the FILE returned by subprocess_stdin() (which is sp->stdin_file), then subprocess_destroy() fclose()s the same pointer again -> heap corruption that aborts the backend on any base64 input_video request (the CLI --video file path is unaffected).

  Vendor a one-line fix (null sp->stdin_file after fclose) via prepare.sh's patches/ until upstream merges it.

  Verified e2e with gemma-4-e2b-it-qat-q4_0: video frames decode via ffmpeg and the model answers correctly (red clip -> 'Red', blue -> 'Blue').

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(llama-cpp): re-pin to upstream #24316, drop vendored stdin patch

  Upstream replaced the ad-hoc video stdin handling with a proper RAII refactor (ggml-org/llama.cpp#24316, "mtmd: refactor video subproc handling"), which includes the same `sp->stdin_file = nullptr` guard our patch added (plus join-before-destroy ordering).

  Re-pin LLAMA_VERSION to that branch head and drop patches/0001 - it's now redundant.

  Verified e2e with gemma-4-e2b-it-qat-q4_0: no crash, video frames decode and the model answers correctly (red clip -> "Red", blue -> "Blue").

  NOTE: #24316 is not yet merged, so this pins to its branch-head commit (28ca1e60). Re-pin to the squash-merge commit on master once it lands, otherwise `git fetch` may lose the commit after the branch is deleted.

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
1 parent c20225f commit 9323f4b

4 files changed

Lines changed: 85 additions & 7 deletions

File tree

backend/cpp/llama-cpp/Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11

2-
LLAMA_VERSION?=9e3b928fd8c9d14dbf15a8768b9fdd7e5c721d66
2+
LLAMA_VERSION?=28ca1e600c5dac1854fb7e09611914013430b037
33
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
44

55
CMAKE_ARGS?=

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 71 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -381,6 +381,15 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
381381
});
382382
}
383383

384+
// for each video in the request, add the video data
385+
for (int i = 0; i < predict->videos_size(); i++) {
386+
data["video_data"].push_back(json
387+
{
388+
{"id", i},
389+
{"data", predict->videos(i)},
390+
});
391+
}
392+
384393
data["stop"] = predict->stopprompts();
385394
// data["n_probs"] = predict->nprobs();
386395
//TODO: images,
@@ -1503,7 +1512,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
15031512
msg_json["role"] = msg.role();
15041513

15051514
bool is_last_user_msg = (i == last_user_msg_idx);
1506-
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
1515+
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);
15071516

15081517
// Handle content - can be string, null, or array
15091518
// For multimodal content, we'll embed images/audio from separate fields
@@ -1554,6 +1563,16 @@ class BackendServiceImpl final : public backend::Backend::Service {
15541563
content_array.push_back(audio_chunk);
15551564
}
15561565
}
1566+
if (request->videos_size() > 0) {
1567+
for (int j = 0; j < request->videos_size(); j++) {
1568+
json video_chunk;
1569+
video_chunk["type"] = "input_video";
1570+
json input_video;
1571+
input_video["data"] = request->videos(j);
1572+
video_chunk["input_video"] = input_video;
1573+
content_array.push_back(video_chunk);
1574+
}
1575+
}
15571576
msg_json["content"] = content_array;
15581577
} else {
15591578
// Use content as-is (already array or not last user message)
@@ -1588,6 +1607,16 @@ class BackendServiceImpl final : public backend::Backend::Service {
15881607
content_array.push_back(audio_chunk);
15891608
}
15901609
}
1610+
if (request->videos_size() > 0) {
1611+
for (int j = 0; j < request->videos_size(); j++) {
1612+
json video_chunk;
1613+
video_chunk["type"] = "input_video";
1614+
json input_video;
1615+
input_video["data"] = request->videos(j);
1616+
video_chunk["input_video"] = input_video;
1617+
content_array.push_back(video_chunk);
1618+
}
1619+
}
15911620
msg_json["content"] = content_array;
15921621
} else if (msg.role() == "tool") {
15931622
// Tool role messages must have content field set, even if empty
@@ -2039,6 +2068,16 @@ class BackendServiceImpl final : public backend::Backend::Service {
20392068
files.push_back(decoded_data);
20402069
}
20412070
}
2071+
2072+
const auto &video_data = data.find("video_data");
2073+
if (video_data != data.end() && video_data->is_array())
2074+
{
2075+
for (const auto &video : *video_data)
2076+
{
2077+
auto decoded_data = base64_decode(video["data"].get<std::string>());
2078+
files.push_back(decoded_data);
2079+
}
2080+
}
20422081
}
20432082

20442083
const bool has_mtmd = ctx_server.impl->mctx != nullptr;
@@ -2291,7 +2330,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
22912330
}
22922331

22932332
bool is_last_user_msg = (i == last_user_msg_idx);
2294-
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
2333+
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);
22952334

22962335
// Handle content - can be string, null, or array
22972336
// For multimodal content, we'll embed images/audio from separate fields
@@ -2344,6 +2383,16 @@ class BackendServiceImpl final : public backend::Backend::Service {
23442383
content_array.push_back(audio_chunk);
23452384
}
23462385
}
2386+
if (request->videos_size() > 0) {
2387+
for (int j = 0; j < request->videos_size(); j++) {
2388+
json video_chunk;
2389+
video_chunk["type"] = "input_video";
2390+
json input_video;
2391+
input_video["data"] = request->videos(j);
2392+
video_chunk["input_video"] = input_video;
2393+
content_array.push_back(video_chunk);
2394+
}
2395+
}
23472396
msg_json["content"] = content_array;
23482397
} else {
23492398
// Use content as-is (already array or not last user message)
@@ -2383,6 +2432,16 @@ class BackendServiceImpl final : public backend::Backend::Service {
23832432
content_array.push_back(audio_chunk);
23842433
}
23852434
}
2435+
if (request->videos_size() > 0) {
2436+
for (int j = 0; j < request->videos_size(); j++) {
2437+
json video_chunk;
2438+
video_chunk["type"] = "input_video";
2439+
json input_video;
2440+
input_video["data"] = request->videos(j);
2441+
video_chunk["input_video"] = input_video;
2442+
content_array.push_back(video_chunk);
2443+
}
2444+
}
23862445
msg_json["content"] = content_array;
23872446
SRV_INF("[CONTENT DEBUG] Predict: Message %d created content array with media\n", i);
23882447
} else if (!msg.tool_calls().empty()) {
@@ -2845,6 +2904,16 @@ class BackendServiceImpl final : public backend::Backend::Service {
28452904
files.push_back(decoded_data);
28462905
}
28472906
}
2907+
2908+
const auto &video_data = data.find("video_data");
2909+
if (video_data != data.end() && video_data->is_array())
2910+
{
2911+
for (const auto &video : *video_data)
2912+
{
2913+
auto decoded_data = base64_decode(video["data"].get<std::string>());
2914+
files.push_back(decoded_data);
2915+
}
2916+
}
28482917
}
28492918

28502919
// process files

core/http/react-ui/src/hooks/useChat.js

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -216,6 +216,12 @@ export function useChat(initialModel = '') {
216216
audio_url: { url: `data:${file.type};base64,${file.base64}` },
217217
})
218218
userFiles.push({ name: file.name, type: 'audio' })
219+
} else if (file.type?.startsWith('video/')) {
220+
messageContent.push({
221+
type: 'video_url',
222+
video_url: { url: `data:${file.type};base64,${file.base64}` },
223+
})
224+
userFiles.push({ name: file.name, type: 'video' })
219225
} else {
220226
// Text/PDF files - append to content
221227
if (file.textContent) {

core/http/react-ui/src/pages/Chat.jsx

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -265,7 +265,7 @@ function UserMessageContent({ content, files }) {
265265
<div className="chat-message-files">
266266
{files.map((f, i) => (
267267
<span key={i} className="chat-file-inline">
268-
<i className={`fas ${f.type === 'image' ? 'fa-image' : f.type === 'audio' ? 'fa-headphones' : 'fa-file'}`} />
268+
<i className={`fas ${f.type === 'image' ? 'fa-image' : f.type === 'audio' ? 'fa-headphones' : f.type === 'video' ? 'fa-film' : 'fa-file'}`} />
269269
{f.name}
270270
</span>
271271
))}
@@ -274,6 +274,9 @@ function UserMessageContent({ content, files }) {
274274
{Array.isArray(content) && content.filter(c => c.type === 'image_url').map((img, i) => (
275275
<img key={i} src={img.image_url.url} alt="attached" className="chat-inline-image" />
276276
))}
277+
{Array.isArray(content) && content.filter(c => c.type === 'video_url').map((vid, i) => (
278+
<video key={i} src={vid.video_url.url} controls className="chat-inline-video" />
279+
))}
277280
</>
278281
)
279282
}
@@ -711,7 +714,7 @@ export default function Chat() {
711714
for (const file of e.target.files) {
712715
const base64 = await fileToBase64(file)
713716
const entry = { name: file.name, type: file.type, base64 }
714-
if (!file.type.startsWith('image/') && !file.type.startsWith('audio/')) {
717+
if (!file.type.startsWith('image/') && !file.type.startsWith('audio/') && !file.type.startsWith('video/')) {
715718
entry.textContent = await file.text().catch(() => '')
716719
}
717720
newFiles.push(entry)
@@ -1244,7 +1247,7 @@ export default function Chat() {
12441247
<div className="chat-files">
12451248
{files.map((f, i) => (
12461249
<span key={i} className="chat-file-badge">
1247-
<i className={`fas ${f.type?.startsWith('image/') ? 'fa-image' : f.type?.startsWith('audio/') ? 'fa-headphones' : 'fa-file'}`} />
1250+
<i className={`fas ${f.type?.startsWith('image/') ? 'fa-image' : f.type?.startsWith('audio/') ? 'fa-headphones' : f.type?.startsWith('video/') ? 'fa-film' : 'fa-file'}`} />
12481251
{f.name}
12491252
<button onClick={() => setFiles(prev => prev.filter((_, idx) => idx !== i))}>
12501253
<i className="fas fa-xmark" />
@@ -1343,7 +1346,7 @@ export default function Chat() {
13431346
ref={fileInputRef}
13441347
type="file"
13451348
multiple
1346-
accept="image/*,audio/*,application/pdf,.txt,.md,.csv,.json"
1349+
accept="image/*,audio/*,video/*,application/pdf,.txt,.md,.csv,.json"
13471350
style={{ display: 'none' }}
13481351
onChange={handleFileChange}
13491352
/>

0 commit comments

Comments
 (0)