mtmd: support "frame merge" for qwen-vl-based models (#21858)

ngxson · andrewmd5 · web-flow · commit 31e82494c0a3 · 2026-06-06T21:17:25.000+02:00
* feat: add video support for Qwen3.5

* various clean up

* revise the design

* fix llava-uhd case

* nits

* nits 2

---------

Co-authored-by: andrewmd5 &lt;1297077+andrewmd5@users.noreply.github.com&gt;
diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h
@@ -37,6 +37,9 @@ struct clip_graph {
     float kq_scale; // TODO: maybe move this to hparams
     const clip_flash_attn_type flash_attn_type;
 
+    // TODO [QWEN_VIDEO]: improve this in the future
+    int n_batch = 1;
+
     ggml_context_ptr ctx0_ptr;
     ggml_context * ctx0;
     ggml_cgraph * gf;
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
@@ -480,10 +480,6 @@ struct clip_image_u8 {
         buf[idx + 2] = rgb[2];
     }
 
-    size_t n_pixels() const {
-        return (size_t) nx * (size_t) ny;
-    }
-
     size_t n_elements() const {
         return n_pixels() * 3;
     }
@@ -492,10 +488,16 @@ struct clip_image_u8 {
     std::vector<uint8_t> buf;
     int nx = 0;
     int ny = 0;
+
+    size_t n_pixels() const {
+        return (size_t) nx * (size_t) ny;
+    }
 };
 
 // For images, buf.size() == nx*ny*3
 //     Memory layout: RGBRGBRGB...
+// For seq, buf.size() == nx*ny*3*nt
+//     Memory layout: RGBRGB...RGBRGB... (nt times)
 // For audio, only one channel is used, buf.size() == nx*ny
 //     nx will be n_frames and ny will be n_mel
 struct clip_image_f32 {
@@ -544,10 +546,6 @@ struct clip_image_f32 {
         }
     }
 
-    size_t n_pixels() const {
-        return (size_t) nx_ * (size_t) ny_;
-    }
-
     size_t n_elements() const {
         return n_pixels() * 3;
     }
@@ -580,6 +578,10 @@ struct clip_image_f32 {
     std::vector<float> buf;
     int nx_ = 0;
     int ny_ = 0;
+
+    size_t n_pixels() const {
+        return (size_t) nx_ * (size_t) ny_;
+    }
 };
 
 //
@@ -627,6 +629,7 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
     va_end(args);
 }
 
+#define LOG_TRC(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
 #define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -527,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
 }
 
 ggml_tensor * clip_graph::build_inp_raw(int channels) {
-    ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
+    ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels, n_batch);
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
     return inp_raw;
@@ -848,8 +848,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
 }
 
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
-    GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
-
     const clip_image_f32 & img = *imgs.entries[0];
     std::unique_ptr<clip_graph> builder;
 
@@ -1009,6 +1007,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ABORT("missing cgraph builder");
     }
 
+    // TODO [QWEN_VIDEO]: improve this in the future
+    builder->n_batch = imgs.entries.size();
+
     return builder->build();
 }
 
@@ -3479,12 +3480,15 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
 
 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
-    int batch_size = imgs.entries.size();
+    int n_batch_cur = imgs.entries.size();
+
+    // maximum supported batch size, usually == 2 for qwen-vl-based models
+    int n_batch_max = clip_model_n_batch_max(ctx);
 
     // TODO @ngxson : implement batch size > 1 as a loop
     //                we don't need true batching support because the cgraph will gonna be big anyway
-    if (batch_size != 1) {
-        return false; // only support batch size of 1
+    if (n_batch_cur > n_batch_max) {
+        return false;
     }
 
     // if buffers are not allocated, we need to do a warmup run to allocate them
@@ -3555,18 +3559,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         // └─────┘ │
         //   ──────┘ x B
 
-        for (size_t i = 0; i < imgs.entries.size(); i++) {
-            const int nx = imgs.entries[i]->nx();
-            const int ny = imgs.entries[i]->ny();
-            const int n = nx * ny;
+        // IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
+        // All entries must have the same spatial size (enforced by can_batch_with() during merging)
+        {
+            const int nx = imgs.entries[0]->nx();
+            const int ny = imgs.entries[0]->ny();
+            const int n  = nx * ny;
 
-            for (int b = 0; b < batch_size; b++) {
+            for (int b = 0; b < n_batch_cur; b++) {
                 const auto & buf = imgs.entries[b]->get_ro_buf();
                 float * batch_entry = inp_raw.data() + b * (3*n);
                 for (int y = 0; y < ny; y++) {
                     for (int x = 0; x < nx; x++) {
-                        size_t base_src = 3*(y * nx + x); // idx of the first channel
-                        size_t base_dst =    y * nx + x;  // idx of the first channel
+                        size_t base_src = 3*(y * nx + x);
+                        size_t base_dst =    y * nx + x;
                         batch_entry[      base_dst] = buf[base_src    ];
                         batch_entry[1*n + base_dst] = buf[base_src + 1];
                         batch_entry[2*n + base_dst] = buf[base_src + 2];
@@ -4549,6 +4555,17 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }
 
+int clip_model_n_batch_max(const struct clip_ctx * ctx) {
+    switch (ctx->proj_type()) {
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_QWEN3VL:
+            return 2;
+        default:
+            return 1;
+    }
+}
+
 //
 // API used internally with mtmd
 //
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
@@ -20,6 +20,12 @@ struct clip_image_size {
     bool operator==(const clip_image_size & other) const {
         return width == other.width && height == other.height;
     }
+    bool operator!=(const clip_image_size & other) const {
+        return !(*this == other);
+    }
+    int area() const {
+        return width * height;
+    }
 };
 
 struct clip_image_f32;
@@ -101,6 +107,8 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 
+int clip_model_n_batch_max(const struct clip_ctx * ctx);
+
 std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
 
 struct clip_cap {
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
@@ -31,10 +31,11 @@ struct clip_graph_pixtral : clip_graph {
 struct clip_graph_qwen2vl : clip_graph {
     clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
+    ggml_tensor * build_inp_with_temporal_merge();
 };
 
-struct clip_graph_qwen3vl : clip_graph {
-    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+struct clip_graph_qwen3vl : clip_graph_qwen2vl {
+    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {}
     ggml_cgraph * build() override;
 };
 
diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp
@@ -1,5 +1,34 @@
 #include "models.h"
 
+ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() {
+    ggml_tensor * inp_raw = build_inp_raw();
+
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
+
+    const size_t nb1 = ggml_row_size(inp_raw->type, img.nx());
+    const size_t nb2 = ggml_row_size(inp_raw->type, img.nx() * img.ny());
+
+    if (n_batch == 1) {
+        // still image input
+        return ggml_add(ctx0,
+            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1),
+            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1));
+    } else if (n_batch == 2) {
+        // 2 frames input (video input)
+        ggml_tensor * inp_0 = ggml_view_3d(ctx0, inp_raw,
+                                    img.nx(), img.ny(), 3, nb1, nb2, 0);
+        ggml_tensor * inp_1 = ggml_view_3d(ctx0, inp_raw,
+                                    img.nx(), img.ny(), 3, nb1, nb2,
+                                    nb2 * 3); // move to the second frame
+        return ggml_add(ctx0,
+            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_0, patch_size, patch_size, 0, 0, 1, 1),
+            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_1, patch_size, patch_size, 0, 0, 1, 1));
+    } else {
+        GGML_ASSERT(false && "n_batch > 2 is not supported");
+    }
+}
+
 ggml_cgraph * clip_graph_qwen2vl::build() {
     GGML_ASSERT(model.patch_bias == nullptr);
     GGML_ASSERT(model.class_embedding == nullptr);
@@ -16,17 +45,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
 
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
-    ggml_tensor * inp_raw = build_inp_raw();
-    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-
-    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
+    ggml_tensor * inp = build_inp_with_temporal_merge();
 
     // second conv dimension
     {
-        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-        inp = ggml_add(ctx0, inp, inp_1);
-
         inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
         inp = ggml_cont_4d(
             ctx0, inp,
diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp
@@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
 
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
-    ggml_tensor * inp_raw = build_inp_raw();
-    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    ggml_tensor * inp = build_inp_with_temporal_merge();
 
-    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
-
-    // second conv dimension
+    // spatial merge
     {
-        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-        inp = ggml_add(ctx0, inp, inp_1);
-
         inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
         inp = ggml_cont_4d(
             ctx0, inp,
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
@@ -1116,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
     static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
     // TODO: support 512 (tiny) and 640 (small) once we have eval data for them
 
-    const int64_t orig_area = static_cast<int64_t>(img.n_pixels());
+    const int64_t orig_area = static_cast<int64_t>(img.get_size().area());
 
     size_t  mode_i   = 0;
     int64_t min_diff = std::numeric_limits<int64_t>::max();
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h