From 351126361481a4ea309d354ec3f7652f8b53ac31 Mon Sep 17 00:00:00 2001 From: Luffy <1074726817@qq.com> Date: Tue, 22 Apr 2025 10:23:12 +0000 Subject: [PATCH 1/6] support new processor arg video_maxlen_ttl --- src/llamafactory/chat/vllm_engine.py | 1 + src/llamafactory/data/mm_plugin.py | 49 ++++++++++++++++++++++++-- src/llamafactory/hparams/model_args.py | 6 +++- src/llamafactory/model/patcher.py | 1 + 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index 1100fc8a53..c544ccf5af 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -189,6 +189,7 @@ async def _generate( image_min_pixels=self.model_args.video_min_pixels, video_fps=self.model_args.video_fps, video_maxlen=self.model_args.video_maxlen, + video_maxlen_ttl=self.model_args.video_maxlen_ttl, )["videos"] } elif audios is not None: diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 5e32fab433..8760ef2e0b 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -86,6 +86,24 @@ def _get_number_of_features(self, orig_height: int, orig_width: int, height: int pass +def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_maxlen: int) -> list[int]: + """Calculate `max_num_of_frames` for each video based on their durations, and return a list of `max_num_of_frames`. Every `max_num_of_frames` should be in [2, video_maxlen].""" + dura_ttl = sum(durations) + max_nums_of_frames = [ # 2 < max_num_of_frames < video_maxlen + min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations + ] # list of `max_num_of_frames` + if sum(max_nums_of_frames) > video_maxlen_ttl: # may be bigger if some are set 2 + delta = sum(max_nums_of_frames) - video_maxlen_ttl + for _ in range(delta): # + max_idx = max_nums_of_frames.index(max(max_nums_of_frames)) + if max(max_nums_of_frames) - 1 >= 2: # should still >= 2 + max_nums_of_frames[max_idx] -= 1 + else: + raise ValueError( + f"Too many videos. Couldn't satisfy the requirement of having at least 2 frames for each video. Please decrease the number of videos or increase `video_maxlen_ttl` (e.g. >={2 * len(max_nums_of_frames)})." + ) + return max_nums_of_frames + def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]: r"""Get paligemma token type ids for computing loss. @@ -233,10 +251,20 @@ def _regularize_images(self, images: list["ImageInput"], **kwargs) -> dict[str, def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> dict[str, list[list["ImageObject"]]]: r"""Regularizes videos to avoid error. Including reading, resizing and converting.""" results = [] - for video in videos: + video_streams = [] + durations = [] + for video in videos: # prepare durations first container = av.open(video, "r") video_stream = next(stream for stream in container.streams if stream.type == "video") - sample_indices = self._get_video_sample_indices(video_stream, **kwargs) + durations.append(video_stream.duration * video_stream.time_base) # unit: second + video_streams.append(video_stream) + max_frames_each_video = _cal_max_frames_each_video(durations, **kwargs) + for video_stream, max_frames in zip(video_streams, max_frames_each_video): + sample_indices = self._get_video_sample_indices( + video_stream, + video_fps=kwargs["video_fps"], + video_maxlen=max_frames, + ) frames: list[ImageObject] = [] container.seek(0) for frame_idx, frame in enumerate(container.decode(video_stream)): @@ -326,6 +354,7 @@ def _get_mm_inputs( image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16), video_fps=getattr(processor, "video_fps", 2.0), video_maxlen=getattr(processor, "video_maxlen", 128), + video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)), # disabled by default )["videos"] if "videos" in inspect.signature(video_processor.preprocess).parameters: # for qwen2_vl and video_llava mm_inputs.update(video_processor(images=None, videos=videos, return_tensors="pt")) @@ -509,6 +538,7 @@ def _get_mm_inputs( image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16), video_fps=getattr(processor, "video_fps", 2.0), video_maxlen=getattr(processor, "video_maxlen", 128), + video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)), # disabled by default )["videos"] if len(images) != 0: @@ -1046,6 +1076,7 @@ def _get_mm_inputs( image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16), video_fps=getattr(processor, "video_fps", 2.0), video_maxlen=getattr(processor, "video_maxlen", 128), + video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)), # disabled by default )["videos"] video_inputs = image_processor(videos, do_pad=True, max_slice_nums=2, return_tensors="pt") mm_inputs.update(video_inputs) @@ -1430,10 +1461,20 @@ def _regularize_videos( self, videos: list["VideoInput"], **kwargs ) -> dict[str, Union[list[list["ImageObject"]], list[float]]]: results, fps_per_video = [], [] + video_streams = [] + durations = [] for video in videos: container = av.open(video, "r") video_stream = next(stream for stream in container.streams if stream.type == "video") - sample_indices = self._get_video_sample_indices(video_stream, **kwargs) + durations.append(video_stream.duration * video_stream.time_base) # unit: second + video_streams.append(video_stream) + max_frames_each_video = _cal_max_frames_each_video(durations, **kwargs) + for video_stream, max_frames in zip(video_streams, max_frames_each_video): + sample_indices = self._get_video_sample_indices( + video_stream, + video_fps=kwargs["video_fps"], + video_maxlen=max_frames, + ) frames: list[ImageObject] = [] container.seek(0) for frame_idx, frame in enumerate(container.decode(video_stream)): @@ -1477,6 +1518,7 @@ def _get_mm_inputs( image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16), video_fps=getattr(processor, "video_fps", 2.0), video_maxlen=getattr(processor, "video_maxlen", 128), + video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)), # disabled by default ) mm_inputs.update(image_processor(images=None, videos=video_data["videos"], return_tensors="pt")) temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2) @@ -1568,6 +1610,7 @@ def _get_mm_inputs( image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16), video_fps=getattr(processor, "video_fps", 2.0), video_maxlen=getattr(processor, "video_maxlen", 128), + video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)), # disabled by default ) mm_inputs.update(image_processor(images=None, videos=video_dict["videos"], return_tensors="pt")) temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2) diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index e7a74046c7..64e605a08b 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -253,7 +253,11 @@ class ProcessorArguments: ) video_maxlen: int = field( default=128, - metadata={"help": "The maximum number of sampled frames for video inputs."}, + metadata={"help": "The unified maximum number of sampled frames for each video inputs."}, + ) + video_maxlen_ttl: int = field( + default=128 * 50, # assume 50 videos at max in 1 input + metadata={"help": "The maximum number of total sampled frames of all video inputs."}, ) audio_sampling_rate: int = field( default=16000, diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index ce1a5a7d8c..95129df37c 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -85,6 +85,7 @@ def patch_processor( setattr(processor, "video_min_pixels", model_args.video_min_pixels) setattr(processor, "video_fps", model_args.video_fps) setattr(processor, "video_maxlen", model_args.video_maxlen) + setattr(processor, "video_maxlen_ttl", model_args.video_maxlen_ttl) setattr(processor, "audio_sampling_rate", model_args.audio_sampling_rate) setattr(processor, "use_audio_in_video", model_args.use_audio_in_video) From 098e5ac473d7c657e58124051671c22023c84209 Mon Sep 17 00:00:00 2001 From: Luffy <1074726817@qq.com> Date: Tue, 22 Apr 2025 10:23:12 +0000 Subject: [PATCH 2/6] support new processor arg video_maxlen_ttl --- src/llamafactory/data/mm_plugin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 8760ef2e0b..9aeb4b12b1 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -104,6 +104,7 @@ def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_max ) return max_nums_of_frames + def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]: r"""Get paligemma token type ids for computing loss. From 60b7d6b01d25304eba5dc315f745ac340b221907 Mon Sep 17 00:00:00 2001 From: Luffy <1074726817@qq.com> Date: Tue, 22 Apr 2025 10:23:12 +0000 Subject: [PATCH 3/6] support new processor arg video_maxlen_ttl --- src/llamafactory/data/mm_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 9aeb4b12b1..abe12f6cf7 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -85,7 +85,6 @@ class MMProcessor(ProcessorMixin): def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int: pass - def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_maxlen: int) -> list[int]: """Calculate `max_num_of_frames` for each video based on their durations, and return a list of `max_num_of_frames`. Every `max_num_of_frames` should be in [2, video_maxlen].""" dura_ttl = sum(durations) From 49ce350c80817d6571f1eeff9c28dffc5f42a7f4 Mon Sep 17 00:00:00 2001 From: Luffy <1074726817@qq.com> Date: Tue, 22 Apr 2025 10:23:12 +0000 Subject: [PATCH 4/6] support new processor arg video_maxlen_ttl --- src/llamafactory/data/mm_plugin.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index abe12f6cf7..82bcf14a89 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -104,6 +104,24 @@ def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_max return max_nums_of_frames +def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_maxlen: int) -> list[int]: + """Calculate `max_num_of_frames` for each video based on their durations, and return a list of `max_num_of_frames`. Every `max_num_of_frames` should be in [2, video_maxlen].""" + dura_ttl = sum(durations) + max_nums_of_frames = [ # 2 < max_num_of_frames < video_maxlen + min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations + ] # list of `max_num_of_frames` + if sum(max_nums_of_frames) > video_maxlen_ttl: # may be bigger if some are set 2 + delta = sum(max_nums_of_frames) - video_maxlen_ttl + for _ in range(delta): # + max_idx = max_nums_of_frames.index(max(max_nums_of_frames)) + if max(max_nums_of_frames) - 1 >= 2: # should still >= 2 + max_nums_of_frames[max_idx] -= 1 + else: + raise ValueError( + f"Too many videos. Couldn't satisfy the requirement of having at least 2 frames for each video. Please decrease the number of videos or increase `video_maxlen_ttl` (e.g. >={2 * len(max_nums_of_frames)})." + ) + return max_nums_of_frames + def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]: r"""Get paligemma token type ids for computing loss. From 071939fda49e3fdd14cf7673cc6b153f8414356b Mon Sep 17 00:00:00 2001 From: Luffy <1074726817@qq.com> Date: Tue, 22 Apr 2025 10:23:12 +0000 Subject: [PATCH 5/6] support new processor arg video_maxlen_ttl --- src/llamafactory/data/mm_plugin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 82bcf14a89..903a3008ef 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -122,6 +122,7 @@ def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_max ) return max_nums_of_frames + def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]: r"""Get paligemma token type ids for computing loss. From 140adb4dd00f4050a71eb2de9b7881e059f797c6 Mon Sep 17 00:00:00 2001 From: Luffy <1074726817@qq.com> Date: Tue, 22 Apr 2025 10:23:12 +0000 Subject: [PATCH 6/6] support new processor arg video_maxlen_ttl --- src/llamafactory/data/mm_plugin.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 903a3008ef..abe12f6cf7 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -104,25 +104,6 @@ def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_max return max_nums_of_frames -def _cal_max_frames_each_video(durations: list, video_maxlen_ttl: int, video_maxlen: int) -> list[int]: - """Calculate `max_num_of_frames` for each video based on their durations, and return a list of `max_num_of_frames`. Every `max_num_of_frames` should be in [2, video_maxlen].""" - dura_ttl = sum(durations) - max_nums_of_frames = [ # 2 < max_num_of_frames < video_maxlen - min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations - ] # list of `max_num_of_frames` - if sum(max_nums_of_frames) > video_maxlen_ttl: # may be bigger if some are set 2 - delta = sum(max_nums_of_frames) - video_maxlen_ttl - for _ in range(delta): # - max_idx = max_nums_of_frames.index(max(max_nums_of_frames)) - if max(max_nums_of_frames) - 1 >= 2: # should still >= 2 - max_nums_of_frames[max_idx] -= 1 - else: - raise ValueError( - f"Too many videos. Couldn't satisfy the requirement of having at least 2 frames for each video. Please decrease the number of videos or increase `video_maxlen_ttl` (e.g. >={2 * len(max_nums_of_frames)})." - ) - return max_nums_of_frames - - def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]: r"""Get paligemma token type ids for computing loss.