@@ -86,6 +86,25 @@ def _get_number_of_features(self, orig_height: int, orig_width: int, height: int
        pass


+def _cal_max_frames_each_video(durations: list[float], video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
+    """Calculate `max_num_of_frames` for each video based on its duration, and return them as a list. Every `max_num_of_frames` should be in [2, video_maxlen]."""
+    dura_ttl = sum(durations)
+    max_nums_of_frames = [  # 2 <= max_num_of_frames <= video_maxlen
+        min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations
+    ]  # list of `max_num_of_frames`
+    if sum(max_nums_of_frames) > video_maxlen_ttl:  # may exceed the budget if some videos were clamped up to 2
+        delta = sum(max_nums_of_frames) - video_maxlen_ttl
+        for _ in range(delta):  # trim one frame at a time from the largest allocation
+            max_idx = max_nums_of_frames.index(max(max_nums_of_frames))
+            if max(max_nums_of_frames) - 1 >= 2:  # should still be >= 2
+                max_nums_of_frames[max_idx] -= 1
+            else:
+                raise ValueError(
+                    f"Too many videos. Couldn't satisfy the requirement of having at least 2 frames for each video. Please decrease the number of videos or increase `video_maxlen_ttl` (e.g. >={2 * len(max_nums_of_frames)})."
+                )
+    return max_nums_of_frames
+
+
def _concatenate_list(input_list: list[Any]) -> Union[list[Any], "NDArray", "torch.Tensor"]:
    r"""Concatenate a list of lists, numpy arrays or torch tensors.

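A quick sanity check of the allocation logic above (hypothetical numbers, not part of the patch; it only assumes the helper is importable as defined in the hunk):

    durations = [1.0, 1.0, 8.0]  # seconds
    video_maxlen_ttl, video_maxlen = 10, 10
    print(_cal_max_frames_each_video(durations, video_maxlen_ttl, video_maxlen))
    # [2, 2, 6] -- the two short clips are floored at 2 frames each, so the long clip's
    # proportional share (8) is trimmed to 6 to keep the total within the budget of 10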
@@ -247,10 +266,20 @@ def _regularize_images(self, images: list["ImageInput"], **kwargs) -> dict[str,
    def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> dict[str, list[list["ImageObject"]]]:
        r"""Regularizes videos to avoid error. Including reading, resizing and converting."""
        results = []
-        for video in videos:
+        containers, video_streams, durations = [], [], []
+        for video in videos:  # first pass: collect the duration of each video
            container = av.open(video, "r")
            video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(video_stream.duration * video_stream.time_base)  # unit: second
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"])
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream,
+                video_fps=kwargs["video_fps"],
+                video_maxlen=max_frames,
+            )
            frames: list[ImageObject] = []
            container.seek(0)
            for frame_idx, frame in enumerate(container.decode(video_stream)):
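One caveat with the duration computation in this hunk: PyAV expresses `stream.duration` in units of `stream.time_base`, but it can also be `None` for some containers, in which case the multiplication raises a `TypeError`. A defensive variant is sketched below; `_safe_duration_seconds` is a hypothetical helper, not part of the patch, and the fallback assumes PyAV's convention that `container.duration` is given in 1/1_000_000-second units:

    def _safe_duration_seconds(container, video_stream) -> float:
        """Best-effort duration of a PyAV video stream, in seconds."""
        if video_stream.duration is not None:  # expressed in units of stream.time_base
            return float(video_stream.duration * video_stream.time_base)
        if container.duration is not None:  # expressed in 1/1_000_000-second units
            return container.duration / 1_000_000
        return 0.0  # unknown duration; callers may prefer to raise instead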
@@ -340,6 +369,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )["videos"]
            if "videos" in inspect.signature(video_processor.preprocess).parameters:  # for qwen2_vl and video_llava
                mm_inputs.update(video_processor(images=None, videos=videos, return_tensors="pt"))
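Note on the fallback value: `128 * len(videos)` mirrors the stock per-video cap (`video_maxlen` defaults to 128), so when `video_maxlen_ttl` is not configured the total budget equals the sum of the per-video caps and the duration-weighted split can never push any video below its usual limit. For example, three videos under the defaults get a 384-frame budget, whereas setting `video_maxlen_ttl=64` would force the proportional allocation. The same default is applied at every call site below.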
@@ -516,6 +546,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )["videos"]

        if len(images) != 0:
@@ -1055,6 +1086,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )["videos"]
            video_inputs = image_processor(videos, do_pad=True, max_slice_nums=2, return_tensors="pt")
            mm_inputs.update(video_inputs)
@@ -1439,10 +1471,20 @@ def _regularize_videos(
        self, videos: list["VideoInput"], **kwargs
    ) -> dict[str, Union[list[list["ImageObject"]], list[float]]]:
        results, fps_per_video = [], []
+        containers, video_streams, durations = [], [], []
        for video in videos:
            container = av.open(video, "r")
            video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(video_stream.duration * video_stream.time_base)  # unit: second
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"])
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream,
+                video_fps=kwargs["video_fps"],
+                video_maxlen=max_frames,
+            )
            frames: list[ImageObject] = []
            container.seek(0)
            for frame_idx, frame in enumerate(container.decode(video_stream)):
@@ -1486,6 +1528,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )
            mm_inputs.update(image_processor(images=None, videos=video_data["videos"], return_tensors="pt"))
            temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
@@ -1577,6 +1620,7 @@ def _get_mm_inputs(
                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                video_fps=getattr(processor, "video_fps", 2.0),
                video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
            )
            mm_inputs.update(image_processor(images=None, videos=video_dict["videos"], return_tensors="pt"))
            temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)