@@ -639,7 +639,7 @@ struct mtmd_context {
639639 {
640640 img_beg = " <image>" ;
641641 img_end = " " ;
642- image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd >(ctx_v);
642+ image_preproc = std::make_unique<mtmd_image_preprocessor_granite >(ctx_v);
643643 } break ;
644644 default :
645645 throw std::runtime_error (string_format (" %s: unexpected vision projector type %d\n " , __func__, proj));
@@ -1033,7 +1033,10 @@ struct mtmd_tokenizer {
10331033 int32_t add_media (std::vector<const mtmd_bitmap *> & bitmaps) {
10341034 GGML_ASSERT (!bitmaps.empty ());
10351035
1036- if (!bitmaps[0 ]->is_audio ) {
1036+ // note: only one type of media is supported per call, caller should enforce this
1037+ const bool is_vision = !bitmaps[0 ]->is_audio ;
1038+
1039+ if (is_vision) {
10371040 // handle image
10381041
10391042 if (!ctx->ctx_v ) {
@@ -1085,31 +1088,9 @@ struct mtmd_tokenizer {
10851088 batch_f32.grid_y = tmp_batch.grid_y ;
10861089 }
10871090
1088- // Annotate llava-next style tiles so clip_n_output_tokens accounts
1089- // for per-tile newline injection.
1090- if (ctx->proj_type_v () == PROJECTOR_TYPE_GRANITE4_VISION ) {
1091- if (batch_f32.entries .size () == 1 ) {
1092- // Single-tile (overview only): append one newline row.
1093- batch_f32.entries [0 ]->add_newline = true ;
1094- } else {
1095- // Multi-tile: overview gets no newline, grid tiles get one.
1096- batch_f32.entries [0 ]->add_newline = false ;
1097- for (size_t i = 1 ; i < batch_f32.entries .size (); ++i) {
1098- batch_f32.entries [i]->add_newline = true ;
1099- }
1100- }
1101- }
1102-
11031091 // handle llava-uhd style preprocessing
11041092 const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0 ;
1105- if (
1106- ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
1107- || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
1108- || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
1109- || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
1110- || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
1111- || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
1112- ) {
1093+ if (has_tiling_grid) {
11131094 // [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
11141095 GGML_ASSERT (bitmaps.size () == 1 );
11151096
0 commit comments