
support new processor arg video_maxlen_ttl #7810

Status: Draft, wants to merge 6 commits into base branch main.
1 change: 1 addition & 0 deletions src/llamafactory/chat/vllm_engine.py
@@ -189,6 +189,7 @@ async def _generate(
                     image_min_pixels=self.model_args.video_min_pixels,
                     video_fps=self.model_args.video_fps,
                     video_maxlen=self.model_args.video_maxlen,
+                    video_maxlen_ttl=self.model_args.video_maxlen_ttl,
                 )["videos"]
             }
         elif audios is not None:
49 changes: 46 additions & 3 deletions src/llamafactory/data/mm_plugin.py
@@ -85,6 +85,24 @@ class MMProcessor(ProcessorMixin):
     def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
         pass

+def _cal_max_frames_each_video(durations: list[float], video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
+    r"""Allocate a frame budget to each video in proportion to its duration.
+
+    Returns one `max_num_of_frames` per video, each clamped to [2, video_maxlen],
+    with the total not exceeding `video_maxlen_ttl`.
+    """
+    dura_ttl = sum(durations)
+    max_nums_of_frames = [  # clamp each share so that 2 <= max_num_of_frames <= video_maxlen
+        min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations
+    ]
+    if sum(max_nums_of_frames) > video_maxlen_ttl:  # the 2-frame floor can push the sum over budget
+        delta = sum(max_nums_of_frames) - video_maxlen_ttl
+        for _ in range(delta):  # take one frame at a time from the largest allocation
+            max_idx = max_nums_of_frames.index(max(max_nums_of_frames))
+            if max_nums_of_frames[max_idx] - 1 >= 2:  # every video must keep at least 2 frames
+                max_nums_of_frames[max_idx] -= 1
+            else:
+                raise ValueError(
+                    "Too many videos: cannot keep at least 2 frames per video. Decrease the number of "
+                    f"videos or increase `video_maxlen_ttl` (e.g. >= {2 * len(max_nums_of_frames)})."
+                )
+    return max_nums_of_frames
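For intuition, a small worked example of the allocation above (all numbers are made up). With durations of 1 s, 1 s and 98 s, a total budget of 10 frames and a per-video cap of 8, the proportional shares are [0, 0, 9]; clamping to [2, 8] gives [2, 2, 8], whose sum of 12 exceeds the budget, so the loop trims the largest entry twice:

durations = [1.0, 1.0, 98.0]  # seconds; illustrative values
print(_cal_max_frames_each_video(durations, video_maxlen_ttl=10, video_maxlen=8))
# -> [2, 2, 6]: the short clips keep the 2-frame floor, the long clip absorbs the trimming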


def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
r"""Get paligemma token type ids for computing loss.
@@ -233,10 +251,20 @@ def _regularize_images(self, images: list["ImageInput"], **kwargs) -> dict[str,
     def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> dict[str, list[list["ImageObject"]]]:
         r"""Regularizes videos to avoid error. Including reading, resizing and converting."""
         results = []
-        for video in videos:
+        containers, video_streams, durations = [], [], []
+        for video in videos:  # first pass: open each video and collect its duration
             container = av.open(video, "r")
             video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(float(video_stream.duration * video_stream.time_base))  # unit: seconds
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(
+            durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"]
+        )
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream, video_fps=kwargs["video_fps"], video_maxlen=max_frames
+            )
             frames: list[ImageObject] = []
             container.seek(0)
             for frame_idx, frame in enumerate(container.decode(video_stream)):
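A note on the duration arithmetic in the first pass: in PyAV, `stream.duration` is an integer expressed in units of `stream.time_base` (a `Fraction`), so multiplying the two yields seconds. A standalone sketch (the file path is hypothetical, and `stream.duration` can be `None` for some containers, a case the diff does not guard against):

import av

with av.open("clip.mp4") as container:  # hypothetical file
    stream = next(s for s in container.streams if s.type == "video")
    if stream.duration is not None:
        seconds = float(stream.duration * stream.time_base)  # same expression as in the diff
    else:
        # fall back to the container-level duration, which is in av.time_base units
        seconds = container.duration / av.time_base
    print(f"duration: {seconds:.2f}s")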
@@ -326,6 +354,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )["videos"]
         if "videos" in inspect.signature(video_processor.preprocess).parameters:  # for qwen2_vl and video_llava
             mm_inputs.update(video_processor(images=None, videos=videos, return_tensors="pt"))
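On default settings the new cap is effectively inactive: the `getattr` fallback above is `128 * len(videos)`, i.e. the per-video default of 128 times the number of videos, so the total can never bind unless `video_maxlen_ttl` is set explicitly (in the normal flow, `patch_processor` sets it from `ProcessorArguments`, whose default is `128 * 50`). One caveat worth flagging: the fallback hardcodes 128, so raising `video_maxlen` above 128 without also raising `video_maxlen_ttl` makes the total cap start to bind. A quick illustration with made-up values:

videos = ["a.mp4", "b.mp4", "c.mp4"]  # hypothetical inputs
video_maxlen = 128
video_maxlen_ttl = 128 * len(videos)  # the fallback used above
assert video_maxlen * len(videos) <= video_maxlen_ttl  # never binds at defaults

video_maxlen = 256  # per-video limit raised, total cap unchanged
assert video_maxlen * len(videos) > video_maxlen_ttl  # now the total cap kicks in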
@@ -509,6 +538,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )["videos"]

         if len(images) != 0:
@@ -1046,6 +1076,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )["videos"]
         video_inputs = image_processor(videos, do_pad=True, max_slice_nums=2, return_tensors="pt")
         mm_inputs.update(video_inputs)
@@ -1430,10 +1461,20 @@ def _regularize_videos(
         self, videos: list["VideoInput"], **kwargs
     ) -> dict[str, Union[list[list["ImageObject"]], list[float]]]:
         results, fps_per_video = [], []
+        containers, video_streams, durations = [], [], []
         for video in videos:
             container = av.open(video, "r")
             video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(float(video_stream.duration * video_stream.time_base))  # unit: seconds
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(
+            durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"]
+        )
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream, video_fps=kwargs["video_fps"], video_maxlen=max_frames
+            )
             frames: list[ImageObject] = []
             container.seek(0)
             for frame_idx, frame in enumerate(container.decode(video_stream)):
@@ -1477,6 +1518,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )
         mm_inputs.update(image_processor(images=None, videos=video_data["videos"], return_tensors="pt"))
         temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
@@ -1568,6 +1610,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )
         mm_inputs.update(image_processor(images=None, videos=video_dict["videos"], return_tensors="pt"))
         temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
6 changes: 5 additions & 1 deletion src/llamafactory/hparams/model_args.py
@@ -253,7 +253,11 @@ class ProcessorArguments:
     )
     video_maxlen: int = field(
         default=128,
-        metadata={"help": "The maximum number of sampled frames for video inputs."},
+        metadata={"help": "The maximum number of sampled frames for each individual video input."},
     )
+    video_maxlen_ttl: int = field(
+        default=128 * 50,  # assume at most 50 videos in one input
+        metadata={"help": "The maximum total number of sampled frames across all video inputs."},
+    )
     audio_sampling_rate: int = field(
         default=16000,
1 change: 1 addition & 0 deletions src/llamafactory/model/patcher.py
@@ -85,6 +85,7 @@ def patch_processor(
setattr(processor, "video_min_pixels", model_args.video_min_pixels)
setattr(processor, "video_fps", model_args.video_fps)
setattr(processor, "video_maxlen", model_args.video_maxlen)
setattr(processor, "video_maxlen_ttl", model_args.video_maxlen_ttl)
setattr(processor, "audio_sampling_rate", model_args.audio_sampling_rate)
setattr(processor, "use_audio_in_video", model_args.use_audio_in_video)
