@@ -40,6 +40,7 @@ def __init__(
4040 threads : int = 16 , # Threads to use for decoding visuals
4141 trust_remote_code : Optional [bool ] = True ,
4242 chat_template : Optional [str ] = None ,
43+ min_image_pixels : int = 28 , # minimum image dimension, required for Qwen 2/2.5-VL models
4344 ** kwargs ,
4445 ) -> None :
4546 super ().__init__ ()
@@ -50,6 +51,9 @@ def __init__(
5051 self .max_frame_num = max_frame_num
5152 self .threads = threads
5253 self .chat_template = chat_template
54+ self .min_image_pixels = min_image_pixels
55+ # Qwen 2/2.5-VL models enforce minimum image dimensions
56+ self ._enforce_image_resize = self ._is_qwen_vl_model (model_version )
5357
5458 # Convert any string arguments that start with { and end with } to dictionaries
5559 for key , value in kwargs .items ():
@@ -85,13 +89,32 @@ def __init__(
8589 self .device = self .accelerator .device
8690 self .batch_size_per_gpu = int (batch_size )
8791
92+ def _is_qwen_vl_model (self , model_version : str ) -> bool :
93+ qwen_vl_patterns = ["qwen2-vl" , "qwen2.5-vl" ]
94+ return any (pattern in model_version .lower () for pattern in qwen_vl_patterns )
95+
def _maybe_resize_image(self, img: Image.Image) -> Image.Image:
    """Upscale ``img`` so that its shorter side reaches ``self.min_image_pixels``.

    The image is returned untouched when the minimum is disabled
    (``min_image_pixels <= 0``), when resizing is not enforced for this
    instance (``self._enforce_image_resize`` is false), or when the shorter
    side already meets the minimum.

    Args:
        img: Image to check; ``img.size`` is read as its two dimensions.

    Returns:
        ``img`` itself, or a bicubic-resampled copy scaled by a single
        factor so the aspect ratio is kept (up to rounding).

    Raises:
        ValueError: If any image dimension is not positive.
    """
    import math

    min_side = self.min_image_pixels
    if min_side <= 0:
        return img

    shortest = min(img.size)
    if shortest <= 0:
        raise ValueError(f"Invalid image dimensions: {img.size}")

    if not self._enforce_image_resize or shortest >= min_side:
        return img

    scale = min_side / shortest  # one factor for both sides keeps the aspect ratio
    # Round to nearest and clamp: truncating with int() could turn a
    # floating-point result such as 27.999... into 27, leaving the shorter
    # side below the minimum this method is meant to guarantee.
    floor = math.ceil(min_side)
    new_size = tuple(max(floor, round(dim * scale)) for dim in img.size)
    return img.resize(new_size, Image.BICUBIC)
109+
88110 # Function to encode the image
89111 def encode_image (self , image : Union [Image .Image , str ]):
90112 if isinstance (image , str ):
91113 img = Image .open (image ).convert ("RGB" )
92114 else :
93115 img = image .copy ()
94116
117+ img = self ._maybe_resize_image (img )
95118 output_buffer = BytesIO ()
96119 img .save (output_buffer , format = "PNG" )
97120 byte_data = output_buffer .getvalue ()
@@ -115,6 +138,7 @@ def encode_video(self, video_path):
115138 base64_frames = []
116139 for frame in frames :
117140 img = Image .fromarray (frame )
141+ img = self ._maybe_resize_image (img )
118142 output_buffer = BytesIO ()
119143 img .save (output_buffer , format = "PNG" )
120144 byte_data = output_buffer .getvalue ()
0 commit comments