diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py
index 1100fc8a53..c544ccf5af 100644
--- a/src/llamafactory/chat/vllm_engine.py
+++ b/src/llamafactory/chat/vllm_engine.py
@@ -189,6 +189,7 @@ async def _generate(
                     image_min_pixels=self.model_args.video_min_pixels,
                     video_fps=self.model_args.video_fps,
                     video_maxlen=self.model_args.video_maxlen,
+                    video_maxlen_ttl=self.model_args.video_maxlen_ttl,
                 )["videos"]
             }
         elif audios is not None:
diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 5e32fab433..abe12f6cf7 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -85,6 +85,24 @@ class MMProcessor(ProcessorMixin):
     def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
         pass
 
 
+def _cal_max_frames_each_video(durations: list[float], video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
+    """Split a total budget of `video_maxlen_ttl` frames among videos in proportion to their durations; every video gets between 2 and `video_maxlen` frames."""
+    dura_ttl = sum(durations)
+    max_nums_of_frames = [  # proportional share, clamped to [2, video_maxlen]
+        min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations
+    ]
+    if sum(max_nums_of_frames) > video_maxlen_ttl:  # clamping up to 2 may overshoot the budget
+        delta = sum(max_nums_of_frames) - video_maxlen_ttl
+        for _ in range(delta):  # shave one frame at a time from the largest allocation
+            max_idx = max_nums_of_frames.index(max(max_nums_of_frames))
+            if max(max_nums_of_frames) - 1 >= 2:  # every video must keep at least 2 frames
+                max_nums_of_frames[max_idx] -= 1
+            else:
+                raise ValueError(
+                    f"Too many videos to give each one at least 2 frames. Decrease the number of videos or increase `video_maxlen_ttl` (e.g. >= {2 * len(max_nums_of_frames)})."
+                )
+    return max_nums_of_frames
+
+
 def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
     r"""Get paligemma token type ids for computing loss.
@@ -233,10 +251,20 @@ def _regularize_images(self, images: list["ImageInput"], **kwargs) -> dict[str,
     def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> dict[str, list[list["ImageObject"]]]:
         r"""Regularizes videos to avoid error. Including reading, resizing and converting."""
         results = []
-        for video in videos:
+        containers, video_streams, durations = [], [], []
+        for video in videos:  # first pass: collect durations so the total frame budget can be split
             container = av.open(video, "r")
             video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(float(video_stream.duration * video_stream.time_base))  # unit: second
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(
+            durations, video_maxlen_ttl=kwargs["video_maxlen_ttl"], video_maxlen=kwargs["video_maxlen"]
+        )
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream, video_fps=kwargs["video_fps"], video_maxlen=max_frames
+            )
             frames: list[ImageObject] = []
             container.seek(0)
             for frame_idx, frame in enumerate(container.decode(video_stream)):
@@ -326,6 +354,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )["videos"]
         if "videos" in inspect.signature(video_processor.preprocess).parameters:  # for qwen2_vl and video_llava
             mm_inputs.update(video_processor(images=None, videos=videos, return_tensors="pt"))
@@ -509,6 +538,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )["videos"]
 
         if len(images) != 0:
@@ -1046,6 +1076,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )["videos"]
         video_inputs = image_processor(videos, do_pad=True, max_slice_nums=2, return_tensors="pt")
         mm_inputs.update(video_inputs)
@@ -1430,10 +1461,20 @@ def _regularize_videos(
         self, videos: list["VideoInput"], **kwargs
     ) -> dict[str, Union[list[list["ImageObject"]], list[float]]]:
         results, fps_per_video = [], []
+        containers, video_streams, durations = [], [], []
         for video in videos:
             container = av.open(video, "r")
             video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(float(video_stream.duration * video_stream.time_base))  # unit: second
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(
+            durations, video_maxlen_ttl=kwargs["video_maxlen_ttl"], video_maxlen=kwargs["video_maxlen"]
+        )
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream, video_fps=kwargs["video_fps"], video_maxlen=max_frames
+            )
             frames: list[ImageObject] = []
             container.seek(0)
             for frame_idx, frame in enumerate(container.decode(video_stream)):
@@ -1477,6 +1518,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )
         mm_inputs.update(image_processor(images=None, videos=video_data["videos"], return_tensors="pt"))
         temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
@@ -1568,6 +1610,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )
         mm_inputs.update(image_processor(images=None, videos=video_dict["videos"], return_tensors="pt"))
         temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index e7a74046c7..64e605a08b 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -253,7 +253,11 @@ class ProcessorArguments:
     )
     video_maxlen: int = field(
         default=128,
-        metadata={"help": "The maximum number of sampled frames for video inputs."},
+        metadata={"help": "The maximum number of sampled frames for each video input."},
+    )
+    video_maxlen_ttl: int = field(
+        default=128 * 50,  # assumes at most 50 videos in one input
+        metadata={"help": "The maximum total number of sampled frames across all video inputs."},
     )
     audio_sampling_rate: int = field(
         default=16000,
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index ce1a5a7d8c..95129df37c 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -85,6 +85,7 @@ def patch_processor(
     setattr(processor, "video_min_pixels", model_args.video_min_pixels)
     setattr(processor, "video_fps", model_args.video_fps)
     setattr(processor, "video_maxlen", model_args.video_maxlen)
+    setattr(processor, "video_maxlen_ttl", model_args.video_maxlen_ttl)
     setattr(processor, "audio_sampling_rate", model_args.audio_sampling_rate)
     setattr(processor, "use_audio_in_video", model_args.use_audio_in_video)
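
For intuition, here is how the new budget split behaves: each video receives a share of `video_maxlen_ttl` proportional to its duration, every share is clamped to [2, `video_maxlen`], and if the clamping pushed the total over budget, frames are removed one at a time from the largest share. A minimal standalone sketch follows; the function mirrors the patched helper, and the example values are illustrative only:

def cal_max_frames_each_video(durations: list[float], video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
    # proportional share of the total budget, clamped to [2, video_maxlen]
    dura_ttl = sum(durations)
    budgets = [min(max(int(video_maxlen_ttl * d / dura_ttl), 2), video_maxlen) for d in durations]
    while sum(budgets) > video_maxlen_ttl:  # clamping up to 2 may overshoot the budget
        max_idx = budgets.index(max(budgets))
        if budgets[max_idx] <= 2:  # nothing left to shave without dropping below 2 frames
            raise ValueError(f"Need a budget of at least {2 * len(budgets)} total frames.")
        budgets[max_idx] -= 1
    return budgets

# Worked example: videos of 1 s, 1 s and 98 s sharing a 24-frame budget.
# The raw proportional shares are [0, 0, 23]; clamping lifts the two short
# videos to 2 frames each (total 27), so 3 frames are shaved off the longest.
print(cal_max_frames_each_video([1.0, 1.0, 98.0], video_maxlen_ttl=24, video_maxlen=128))  # [2, 2, 20]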
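
The two-pass structure in the patched `_regularize_videos` is worth calling out: durations for all videos must be known before any frames are decoded, so containers and streams are collected in a first loop and sampling happens in a second one. Below is a minimal sketch of that pattern with PyAV, reusing `cal_max_frames_each_video` from the sketch above; it assumes local file paths and streams that report a duration, and for brevity it keeps the first N frames instead of evenly spaced sample indices:

import av

def decode_with_shared_budget(paths: list[str], video_maxlen_ttl: int, video_maxlen: int):
    containers, streams, durations = [], [], []
    for path in paths:  # pass 1: open each file once and record its duration
        container = av.open(path, "r")
        stream = next(s for s in container.streams if s.type == "video")
        durations.append(float(stream.duration * stream.time_base))  # seconds
        containers.append(container)
        streams.append(stream)

    budgets = cal_max_frames_each_video(durations, video_maxlen_ttl, video_maxlen)

    all_frames = []
    for container, stream, budget in zip(containers, streams, budgets):  # pass 2: decode frames
        container.seek(0)
        frames = []
        for idx, frame in enumerate(container.decode(stream)):
            if idx >= budget:
                break
            frames.append(frame.to_image())  # convert to a PIL image
        all_frames.append(frames)
        container.close()
    return all_frames

Keeping each container paired with its own stream across the two passes matters: seeking and decoding must be performed on the container the stream came from, which is why the patched code carries both lists into the second loop.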