@@ -527,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
527527}
528528
529529ggml_tensor * clip_graph::build_inp_raw (int channels) {
530- ggml_tensor * inp_raw = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32 , img.nx (), img.ny (), channels);
530+ ggml_tensor * inp_raw = ggml_new_tensor_4d (ctx0, GGML_TYPE_F32 , img.nx (), img.ny (), channels, n_batch );
531531 ggml_set_name (inp_raw, " inp_raw" );
532532 ggml_set_input (inp_raw);
533533 return inp_raw;
@@ -848,8 +848,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
848848}
849849
850850static ggml_cgraph * clip_image_build_graph (clip_ctx * ctx, const clip_image_f32_batch & imgs) {
851- GGML_ASSERT (imgs.entries .size () == 1 && " n_batch > 1 is not supported" );
852-
853851 const clip_image_f32 & img = *imgs.entries [0 ];
854852 std::unique_ptr<clip_graph> builder;
855853
@@ -1009,6 +1007,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
10091007 GGML_ABORT (" missing cgraph builder" );
10101008 }
10111009
1010+ // TODO [QWEN_VIDEO]: improve this in the future
1011+ builder->n_batch = imgs.entries .size ();
1012+
10121013 return builder->build ();
10131014}
10141015
@@ -3479,12 +3480,15 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
34793480
34803481bool clip_image_batch_encode (clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
34813482 const clip_image_f32_batch & imgs = *imgs_c_ptr;
3482- int batch_size = imgs.entries .size ();
3483+ int n_batch_cur = imgs.entries .size ();
3484+
3485+ // maximum supported batch size, usually == 2 for qwen-vl-based models
3486+ int n_batch_max = clip_model_n_batch_max (ctx);
34833487
34843488 // TODO @ngxson : implement batch size > 1 as a loop
34853489 // we don't need true batching support because the cgraph is going to be big anyway
3486- if (batch_size != 1 ) {
3487- return false ; // only support batch size of 1
3490+ if (n_batch_cur > n_batch_max ) {
3491+ return false ;
34883492 }
34893493
34903494 // if buffers are not allocated, we need to do a warmup run to allocate them
@@ -3555,18 +3559,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
35553559 // └─────┘ │
35563560 // ──────┘ x B
35573561
3558- for (size_t i = 0 ; i < imgs.entries .size (); i++) {
3559- const int nx = imgs.entries [i]->nx ();
3560- const int ny = imgs.entries [i]->ny ();
3561- const int n = nx * ny;
3562+ // IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
3563+ // All entries must have the same spatial size (enforced by can_batch_with() during merging)
3564+ {
3565+ const int nx = imgs.entries [0 ]->nx ();
3566+ const int ny = imgs.entries [0 ]->ny ();
3567+ const int n = nx * ny;
35623568
3563- for (int b = 0 ; b < batch_size ; b++) {
3569+ for (int b = 0 ; b < n_batch_cur ; b++) {
35643570 const auto & buf = imgs.entries [b]->get_ro_buf ();
35653571 float * batch_entry = inp_raw.data () + b * (3 *n);
35663572 for (int y = 0 ; y < ny; y++) {
35673573 for (int x = 0 ; x < nx; x++) {
3568- size_t base_src = 3 *(y * nx + x); // idx of the first channel
3569- size_t base_dst = y * nx + x; // idx of the first channel
3574+ size_t base_src = 3 *(y * nx + x);
3575+ size_t base_dst = y * nx + x;
35703576 batch_entry[ base_dst] = buf[base_src ];
35713577 batch_entry[1 *n + base_dst] = buf[base_src + 1 ];
35723578 batch_entry[2 *n + base_dst] = buf[base_src + 2 ];
@@ -4549,6 +4555,17 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
45494555 return ctx->model .modality == CLIP_MODALITY_AUDIO ;
45504556}
45514557
4558+ int clip_model_n_batch_max (const struct clip_ctx * ctx) { // Returns the largest number of batch entries accepted for ctx's projector type.
4559+ switch (ctx->proj_type ()) { // the limit depends only on the projector type
4560+ case PROJECTOR_TYPE_QWEN2VL : // the three Qwen-VL projector types share one limit (fallthrough)
4561+ case PROJECTOR_TYPE_QWEN25VL :
4562+ case PROJECTOR_TYPE_QWEN3VL :
4563+ return 2 ; // NOTE(review): presumably two frames merged along the temporal dim - confirm
4564+ default :
4565+ return 1 ; // every other projector type: a single entry per call
4566+ }
4567+ }
4568+
45524569//
45534570// API used internally with mtmd
45544571//
0 commit comments