From 351126361481a4ea309d354ec3f7652f8b53ac31 Mon Sep 17 00:00:00 2001
From: Luffy <1074726817@qq.com>
Date: Tue, 22 Apr 2025 10:23:12 +0000
Subject: [PATCH 1/6] support new processor arg video_maxlen_ttl

---
 src/llamafactory/chat/vllm_engine.py   |  1 +
 src/llamafactory/data/mm_plugin.py     | 49 ++++++++++++++++++++++++--
 src/llamafactory/hparams/model_args.py |  6 +++-
 src/llamafactory/model/patcher.py      |  1 +
 4 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py
index 1100fc8a53..c544ccf5af 100644
--- a/src/llamafactory/chat/vllm_engine.py
+++ b/src/llamafactory/chat/vllm_engine.py
@@ -189,6 +189,7 @@ async def _generate(
                     image_min_pixels=self.model_args.video_min_pixels,
                     video_fps=self.model_args.video_fps,
                     video_maxlen=self.model_args.video_maxlen,
+                    video_maxlen_ttl=self.model_args.video_maxlen_ttl,
                 )["videos"]
             }
         elif audios is not None:
diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 5e32fab433..8760ef2e0b 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -86,6 +86,24 @@ def _get_number_of_features(self, orig_height: int, orig_width: int, height: int
             pass
 
 
+def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
+    """Calculate `max_num_of_frames` for each video based on their durations, and return a list of `max_num_of_frames`. Every `max_num_of_frames` should be in [2, video_maxlen]."""
+    dura_ttl = sum(durations)
+    if dura_ttl <= 0:  # no usable duration info: weight all videos equally instead of dividing by zero
+        durations, dura_ttl = [1] * len(durations), len(durations)
+    # proportional share of the total budget, clamped to the inclusive range [2, video_maxlen]
+    max_nums_of_frames = [min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations]
+    if sum(max_nums_of_frames) > video_maxlen_ttl:  # only when the 2-frame floor inflated small shares
+        for _ in range(sum(max_nums_of_frames) - video_maxlen_ttl):  # give back one frame per iteration
+            largest = max(max_nums_of_frames)
+            if largest > 2:  # trimming the largest allocation must still leave it with >= 2 frames
+                max_nums_of_frames[max_nums_of_frames.index(largest)] -= 1
+            else:
+                raise ValueError(
+                    f"Too many videos. Couldn't satisfy the requirement of having at least 2 frames for each video. Please decrease the number of videos or increase `video_maxlen_ttl` (e.g. >={2 * len(max_nums_of_frames)})."
+                )
+    return max_nums_of_frames
+
 def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
     r"""Get paligemma token type ids for computing loss.
 
@@ -233,10 +251,20 @@ def _regularize_images(self, images: list["ImageInput"], **kwargs) -> dict[str,
     def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> dict[str, list[list["ImageObject"]]]:
         r"""Regularizes videos to avoid error. Including reading, resizing and converting."""
         results = []
-        for video in videos:
+        opened_videos, durations = [], []  # pair each stream with the container it must be decoded from
+        for video in videos:  # first pass: gather durations so the frame budget can be split
             container = av.open(video, "r")
             video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(video_stream.duration * video_stream.time_base)  # unit: second
+            opened_videos.append((container, video_stream))
+        # kwargs also carries `video_fps` (and other options) the helper does not accept: pass the budgets explicitly
+        frame_budgets = _cal_max_frames_each_video(durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"])
+        for (container, video_stream), max_frames in zip(opened_videos, frame_budgets):
+            sample_indices = self._get_video_sample_indices(
+                video_stream,
+                video_fps=kwargs["video_fps"],
+                video_maxlen=max_frames,
+            )
             frames: list[ImageObject] = []
             container.seek(0)
             for frame_idx, frame in enumerate(container.decode(video_stream)):
@@ -326,6 +354,7 @@ def _get_mm_inputs(
                 image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                 video_fps=getattr(processor, "video_fps", 2.0),
                 video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
             )["videos"]
             if "videos" in inspect.signature(video_processor.preprocess).parameters:  # for qwen2_vl and video_llava
                 mm_inputs.update(video_processor(images=None, videos=videos, return_tensors="pt"))
@@ -509,6 +538,7 @@ def _get_mm_inputs(
                 image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                 video_fps=getattr(processor, "video_fps", 2.0),
                 video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
             )["videos"]
 
         if len(images) != 0:
@@ -1046,6 +1076,7 @@ def _get_mm_inputs(
                 image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                 video_fps=getattr(processor, "video_fps", 2.0),
                 video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
             )["videos"]
             video_inputs = image_processor(videos, do_pad=True, max_slice_nums=2, return_tensors="pt")
             mm_inputs.update(video_inputs)
@@ -1430,10 +1461,20 @@ def _regularize_videos(
         self, videos: list["VideoInput"], **kwargs
     ) -> dict[str, Union[list[list["ImageObject"]], list[float]]]:
         results, fps_per_video = [], []
+        opened_videos, durations = [], []  # pair each stream with the container it must be decoded from
         for video in videos:
             container = av.open(video, "r")
             video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(video_stream.duration * video_stream.time_base)  # unit: second
+            opened_videos.append((container, video_stream))
+        # kwargs also carries `video_fps` (and other options) the helper does not accept: pass the budgets explicitly
+        frame_budgets = _cal_max_frames_each_video(durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"])
+        for (container, video_stream), max_frames in zip(opened_videos, frame_budgets):
+            sample_indices = self._get_video_sample_indices(
+                video_stream,
+                video_fps=kwargs["video_fps"],
+                video_maxlen=max_frames,
+            )
             frames: list[ImageObject] = []
             container.seek(0)
             for frame_idx, frame in enumerate(container.decode(video_stream)):
@@ -1477,6 +1518,7 @@ def _get_mm_inputs(
                 image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                 video_fps=getattr(processor, "video_fps", 2.0),
                 video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
             )
             mm_inputs.update(image_processor(images=None, videos=video_data["videos"], return_tensors="pt"))
             temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
@@ -1568,6 +1610,7 @@ def _get_mm_inputs(
                 image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
                 video_fps=getattr(processor, "video_fps", 2.0),
                 video_maxlen=getattr(processor, "video_maxlen", 128),
+                video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
             )
             mm_inputs.update(image_processor(images=None, videos=video_dict["videos"], return_tensors="pt"))
             temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index e7a74046c7..64e605a08b 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -253,7 +253,11 @@ class ProcessorArguments:
     )
     video_maxlen: int = field(
         default=128,
-        metadata={"help": "The maximum number of sampled frames for video inputs."},
+        metadata={"help": "The unified maximum number of sampled frames for each video inputs."},
+    )
+    video_maxlen_ttl: int = field(
+        default=128 * 50,  # assume 50 videos at max in 1 input
+        metadata={"help": "The maximum number of total sampled frames of all video inputs."},
     )
     audio_sampling_rate: int = field(
         default=16000,
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index ce1a5a7d8c..95129df37c 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -85,6 +85,7 @@ def patch_processor(
     setattr(processor, "video_min_pixels", model_args.video_min_pixels)
     setattr(processor, "video_fps", model_args.video_fps)
     setattr(processor, "video_maxlen", model_args.video_maxlen)
+    setattr(processor, "video_maxlen_ttl", model_args.video_maxlen_ttl)
     setattr(processor, "audio_sampling_rate", model_args.audio_sampling_rate)
     setattr(processor, "use_audio_in_video", model_args.use_audio_in_video)
 

From 098e5ac473d7c657e58124051671c22023c84209 Mon Sep 17 00:00:00 2001
From: Luffy <1074726817@qq.com>
Date: Tue, 22 Apr 2025 10:23:12 +0000
Subject: [PATCH 2/6] support new processor arg video_maxlen_ttl

---
 src/llamafactory/data/mm_plugin.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 8760ef2e0b..9aeb4b12b1 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -104,6 +104,7 @@ def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_max
                 )
     return max_nums_of_frames
 
+
 def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
     r"""Get paligemma token type ids for computing loss.
 

From 60b7d6b01d25304eba5dc315f745ac340b221907 Mon Sep 17 00:00:00 2001
From: Luffy <1074726817@qq.com>
Date: Tue, 22 Apr 2025 10:23:12 +0000
Subject: [PATCH 3/6] support new processor arg video_maxlen_ttl

---
 src/llamafactory/data/mm_plugin.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 9aeb4b12b1..abe12f6cf7 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -85,7 +85,6 @@ class MMProcessor(ProcessorMixin):
         def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
             pass
 
-
 def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
     """Calculate `max_num_of_frames` for each video based on their durations, and return a list of `max_num_of_frames`. Every `max_num_of_frames` should be in [2, video_maxlen]."""
     dura_ttl = sum(durations)

From 49ce350c80817d6571f1eeff9c28dffc5f42a7f4 Mon Sep 17 00:00:00 2001
From: Luffy <1074726817@qq.com>
Date: Tue, 22 Apr 2025 10:23:12 +0000
Subject: [PATCH 4/6] support new processor arg video_maxlen_ttl

---
 src/llamafactory/data/mm_plugin.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index abe12f6cf7..82bcf14a89 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -104,6 +104,24 @@ def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_max
     return max_nums_of_frames
 
 
+def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
+    """Calculate `max_num_of_frames` for each video based on their durations, and return a list of `max_num_of_frames`. Every `max_num_of_frames` should be in [2, video_maxlen]."""
+    dura_ttl = sum(durations)
+    max_nums_of_frames = [  # 2 < max_num_of_frames < video_maxlen
+        min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations
+    ]  # list of `max_num_of_frames`
+    if sum(max_nums_of_frames) > video_maxlen_ttl:  # may be bigger if some are set 2
+        delta = sum(max_nums_of_frames) - video_maxlen_ttl
+        for _ in range(delta):  #
+            max_idx = max_nums_of_frames.index(max(max_nums_of_frames))
+            if max(max_nums_of_frames) - 1 >= 2:  # should still >= 2
+                max_nums_of_frames[max_idx] -= 1
+            else:
+                raise ValueError(
+                    f"Too many videos. Couldn't satisfy the requirement of having at least 2 frames for each video. Please decrease the number of videos or increase `video_maxlen_ttl` (e.g. >={2 * len(max_nums_of_frames)})."
+                )
+    return max_nums_of_frames
+
 def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
     r"""Get paligemma token type ids for computing loss.
 

From 071939fda49e3fdd14cf7673cc6b153f8414356b Mon Sep 17 00:00:00 2001
From: Luffy <1074726817@qq.com>
Date: Tue, 22 Apr 2025 10:23:12 +0000
Subject: [PATCH 5/6] support new processor arg video_maxlen_ttl

---
 src/llamafactory/data/mm_plugin.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 82bcf14a89..903a3008ef 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -122,6 +122,7 @@ def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_max
                 )
     return max_nums_of_frames
 
+
 def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
     r"""Get paligemma token type ids for computing loss.
 

From 140adb4dd00f4050a71eb2de9b7881e059f797c6 Mon Sep 17 00:00:00 2001
From: Luffy <1074726817@qq.com>
Date: Tue, 22 Apr 2025 10:23:12 +0000
Subject: [PATCH 6/6] support new processor arg video_maxlen_ttl

---
 src/llamafactory/data/mm_plugin.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 903a3008ef..abe12f6cf7 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -104,25 +104,6 @@ def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_max
     return max_nums_of_frames
 
 
-def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
-    """Calculate `max_num_of_frames` for each video based on their durations, and return a list of `max_num_of_frames`. Every `max_num_of_frames` should be in [2, video_maxlen]."""
-    dura_ttl = sum(durations)
-    max_nums_of_frames = [  # 2 < max_num_of_frames < video_maxlen
-        min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations
-    ]  # list of `max_num_of_frames`
-    if sum(max_nums_of_frames) > video_maxlen_ttl:  # may be bigger if some are set 2
-        delta = sum(max_nums_of_frames) - video_maxlen_ttl
-        for _ in range(delta):  #
-            max_idx = max_nums_of_frames.index(max(max_nums_of_frames))
-            if max(max_nums_of_frames) - 1 >= 2:  # should still >= 2
-                max_nums_of_frames[max_idx] -= 1
-            else:
-                raise ValueError(
-                    f"Too many videos. Couldn't satisfy the requirement of having at least 2 frames for each video. Please decrease the number of videos or increase `video_maxlen_ttl` (e.g. >={2 * len(max_nums_of_frames)})."
-                )
-    return max_nums_of_frames
-
-
 def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
     r"""Get paligemma token type ids for computing loss.