Skip to content

Commit 15cc47f

Browse files
committed
mtmd: llava_uhd should no longer use batch dim
1 parent 4b4d13a commit 15cc47f

3 files changed

Lines changed: 33 additions & 25 deletions

File tree

tools/mtmd/mtmd-image.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,6 +1105,8 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
11051105
img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
11061106
output.entries.push_back(std::move(res));
11071107
}
1108+
output.grid_x = inst.grid_size.width;
1109+
output.grid_y = inst.grid_size.height;
11081110
return true;
11091111
}
11101112

@@ -1558,3 +1560,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
15581560
output.entries.push_back(std::move(img_f32));
15591561
return true;
15601562
}
1563+
1564+
bool mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
1565+
// call super class preprocessor
1566+
bool ok = mtmd_image_preprocessor_llava_uhd::preprocess(img, output);
1567+
if (!ok) {
1568+
return false;
1569+
}
1570+
if (output.entries.size() == 1) {
1571+
// Single-tile (overview only): append one newline row.
1572+
output.entries[0]->add_newline = true;
1573+
} else {
1574+
// Multi-tile: overview gets no newline, grid tiles get one.
1575+
output.entries[0]->add_newline = false;
1576+
for (size_t i = 1; i < output.entries.size(); ++i) {
1577+
output.entries[i]->add_newline = true;
1578+
}
1579+
}
1580+
return true;
1581+
}

tools/mtmd/mtmd-image.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,3 +197,9 @@ struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
197197
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
198198
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
199199
};
200+
201+
// similar to llava_uhd, but has add_newline
202+
struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd {
203+
mtmd_image_preprocessor_granite(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
204+
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
205+
};

tools/mtmd/mtmd.cpp

Lines changed: 6 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -639,7 +639,7 @@ struct mtmd_context {
639639
{
640640
img_beg = "<image>";
641641
img_end = "";
642-
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
642+
image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
643643
} break;
644644
default:
645645
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
@@ -1033,7 +1033,10 @@ struct mtmd_tokenizer {
10331033
int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
10341034
GGML_ASSERT(!bitmaps.empty());
10351035

1036-
if (!bitmaps[0]->is_audio) {
1036+
// note: only one type of media is supported per call, caller should enforce this
1037+
const bool is_vision = !bitmaps[0]->is_audio;
1038+
1039+
if (is_vision) {
10371040
// handle image
10381041

10391042
if (!ctx->ctx_v) {
@@ -1085,31 +1088,9 @@ struct mtmd_tokenizer {
10851088
batch_f32.grid_y = tmp_batch.grid_y;
10861089
}
10871090

1088-
// Annotate llava-next style tiles so clip_n_output_tokens accounts
1089-
// for per-tile newline injection.
1090-
if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
1091-
if (batch_f32.entries.size() == 1) {
1092-
// Single-tile (overview only): append one newline row.
1093-
batch_f32.entries[0]->add_newline = true;
1094-
} else {
1095-
// Multi-tile: overview gets no newline, grid tiles get one.
1096-
batch_f32.entries[0]->add_newline = false;
1097-
for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
1098-
batch_f32.entries[i]->add_newline = true;
1099-
}
1100-
}
1101-
}
1102-
11031091
// handle llava-uhd style preprocessing
11041092
const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
1105-
if (
1106-
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
1107-
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
1108-
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
1109-
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
1110-
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
1111-
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
1112-
) {
1093+
if (has_tiling_grid) {
11131094
// [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
11141095
GGML_ASSERT(bitmaps.size() == 1);
11151096

0 commit comments

Comments
 (0)