mtmd: llava_uhd should no longer use batch dim

ngxson · ngxson · commit 15cc47f3c07d · 2026-06-17T21:46:49.000+02:00
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
@@ -1105,6 +1105,8 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
         img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
         output.entries.push_back(std::move(res));
     }
+    output.grid_x = inst.grid_size.width;
+    output.grid_y = inst.grid_size.height;
     return true;
 }
 
@@ -1558,3 +1560,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
     output.entries.push_back(std::move(img_f32));
     return true;
 }
+
+bool mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // call super class preprocessor
+    bool ok = mtmd_image_preprocessor_llava_uhd::preprocess(img, output);
+    if (!ok) {
+        return false;
+    }
+    if (output.entries.size() == 1) {
+        // Single-tile (overview only): append one newline row.
+        output.entries[0]->add_newline = true;
+    } else {
+        // Multi-tile: overview gets no newline, grid tiles get one.
+        output.entries[0]->add_newline = false;
+        for (size_t i = 1; i < output.entries.size(); ++i) {
+            output.entries[i]->add_newline = true;
+        }
+    }
+    return true;
+}
diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h
@@ -197,3 +197,9 @@ struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
     mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
     bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
+
+// similar to llava_uhd, but has add_newline
+struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd {
+    mtmd_image_preprocessor_granite(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -639,7 +639,7 @@ struct mtmd_context {
                 {
                     img_beg = "<image>";
                     img_end = "";
-                    image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
                 } break;
             default:
                 throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
@@ -1033,7 +1033,10 @@ struct mtmd_tokenizer {
     int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
         GGML_ASSERT(!bitmaps.empty());
 
-        if (!bitmaps[0]->is_audio) {
+        // note: only one type of media is supported per call, caller should enforce this
+        const bool is_vision = !bitmaps[0]->is_audio;
+
+        if (is_vision) {
             // handle image
 
             if (!ctx->ctx_v) {
@@ -1085,31 +1088,9 @@ struct mtmd_tokenizer {
                 batch_f32.grid_y = tmp_batch.grid_y;
             }
 
-            // Annotate llava-next style tiles so clip_n_output_tokens accounts
-            // for per-tile newline injection.
-            if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
-                if (batch_f32.entries.size() == 1) {
-                    // Single-tile (overview only): append one newline row.
-                    batch_f32.entries[0]->add_newline = true;
-                } else {
-                    // Multi-tile: overview gets no newline, grid tiles get one.
-                    batch_f32.entries[0]->add_newline = false;
-                    for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
-                        batch_f32.entries[i]->add_newline = true;
-                    }
-                }
-            }
-
             // handle llava-uhd style preprocessing
             const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
-            if (
-                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
-                || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
-            ) {
+            if (has_tiling_grid) {
                 // [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now
                 GGML_ASSERT(bitmaps.size() == 1);