@@ -86,6 +86,25 @@ def _get_number_of_features(self, orig_height: int, orig_width: int, height: int
        pass


+def _cal_max_frames_each_video(durations: list[float], video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
+    """Calculate `max_num_of_frames` for each video based on its duration, and return them as a list. Every `max_num_of_frames` should be in [2, video_maxlen]."""
+    dura_ttl = sum(durations)
+    max_nums_of_frames = [  # 2 <= max_num_of_frames <= video_maxlen
+        min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations
+    ]  # list of `max_num_of_frames`
+    if sum(max_nums_of_frames) > video_maxlen_ttl:  # may exceed the budget if some videos were clamped up to 2
+        delta = sum(max_nums_of_frames) - video_maxlen_ttl
+        for _ in range(delta):  # trim one frame at a time from the largest allocation
+            max_idx = max_nums_of_frames.index(max(max_nums_of_frames))
+            if max(max_nums_of_frames) - 1 >= 2:  # should still be >= 2
+                max_nums_of_frames[max_idx] -= 1
+            else:
+                raise ValueError(
+                    f"Too many videos. Couldn't satisfy the requirement of having at least 2 frames for each video. Please decrease the number of videos or increase `video_maxlen_ttl` (e.g. >={2 * len(max_nums_of_frames)})."
+                )
+    return max_nums_of_frames
+
+
def _concatenate_list(input_list: list[Any]) -> Union[list[Any], "NDArray", "torch.Tensor"]:
    r"""Concatenate a list of lists, numpy arrays or torch tensors.

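A quick sanity check of the allocation logic above (hypothetical numbers, not part of the patch; it only assumes the helper is importable as defined in the hunk):

    durations = [1.0, 1.0, 8.0]  # seconds
    video_maxlen_ttl, video_maxlen = 10, 10
    print(_cal_max_frames_each_video(durations, video_maxlen_ttl, video_maxlen))
    # [2, 2, 6] -- the two short clips are floored at 2 frames each, so the long clip's
    # proportional share (8) is trimmed to 6 to keep the total within the budget of 10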
@@ -247,10 +266,20 @@ def _regularize_images(self, images: list["ImageInput"], **kwargs) -> dict[str,
    def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> dict[str, list[list["ImageObject"]]]:
        r"""Regularizes videos to avoid error. Including reading, resizing and converting."""
        results = []
-        for video in videos:
+        containers, video_streams, durations = [], [], []
+        for video in videos:  # first pass: collect the duration of each video
            container = av.open(video, "r")
            video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(video_stream.duration * video_stream.time_base)  # unit: second
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"])
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream,
+                video_fps=kwargs["video_fps"],
+                video_maxlen=max_frames,
+            )
            frames: list[ImageObject] = []
            container.seek(0)
            for frame_idx, frame in enumerate(container.decode(video_stream)):
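One caveat with the duration computation in this hunk: PyAV expresses `stream.duration` in units of `stream.time_base`, but it can also be `None` for some containers, in which case the multiplication raises a `TypeError`. A defensive variant is sketched below; `_safe_duration_seconds` is a hypothetical helper, not part of the patch, and the fallback assumes PyAV's convention that `container.duration` is given in 1/1_000_000-second units:

    def _safe_duration_seconds(container, video_stream) -> float:
        """Best-effort duration of a PyAV video stream, in seconds."""
        if video_stream.duration is not None:  # expressed in units of stream.time_base
            return float(video_stream.duration * video_stream.time_base)
        if container.duration is not None:  # expressed in 1/1_000_000-second units
            return container.duration / 1_000_000
        return 0.0  # unknown duration; callers may prefer to raise instead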
@@ -340,6 +369,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )["videos"]
            if "videos" in inspect.signature(video_processor.preprocess).parameters:  # for qwen2_vl and video_llava
                mm_inputs.update(video_processor(images=None, videos=videos, return_tensors="pt"))
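Note on the fallback value: `128 * len(videos)` mirrors the stock per-video cap (`video_maxlen` defaults to 128), so when `video_maxlen_ttl` is not configured the total budget equals the sum of the per-video caps and the duration-weighted split can never push any video below its usual limit. For example, three videos under the defaults get a 384-frame budget, whereas setting `video_maxlen_ttl=64` would force the proportional allocation. The same default is applied at every call site below.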
@@ -516,6 +546,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )["videos"]

        if len(images) != 0:
@@ -1055,6 +1086,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )["videos"]
            video_inputs = image_processor(videos, do_pad=True, max_slice_nums=2, return_tensors="pt")
            mm_inputs.update(video_inputs)
@@ -1439,10 +1471,20 @@ def _regularize_videos(
        self, videos: list["VideoInput"], **kwargs
    ) -> dict[str, Union[list[list["ImageObject"]], list[float]]]:
        results, fps_per_video = [], []
+        containers, video_streams, durations = [], [], []
        for video in videos:
            container = av.open(video, "r")
            video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(video_stream.duration * video_stream.time_base)  # unit: second
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"])
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream,
+                video_fps=kwargs["video_fps"],
+                video_maxlen=max_frames,
+            )
            frames: list[ImageObject] = []
            container.seek(0)
            for frame_idx, frame in enumerate(container.decode(video_stream)):
@@ -1486,6 +1528,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )
            mm_inputs.update(image_processor(images=None, videos=video_data["videos"], return_tensors="pt"))
            temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
@@ -1577,6 +1620,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )
            mm_inputs.update(image_processor(images=None, videos=video_dict["videos"], return_tensors="pt"))
            temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)