
support new processor arg video_maxlen_ttl #7810

Status: Draft, wants to merge 6 commits into base branch main.
1 change: 1 addition & 0 deletions src/llamafactory/chat/vllm_engine.py
@@ -189,6 +189,7 @@ async def _generate(
                     image_min_pixels=self.model_args.video_min_pixels,
                     video_fps=self.model_args.video_fps,
                     video_maxlen=self.model_args.video_maxlen,
+                    video_maxlen_ttl=self.model_args.video_maxlen_ttl,
                 )["videos"]
             }
         elif audios is not None:
49 changes: 46 additions & 3 deletions src/llamafactory/data/mm_plugin.py
@@ -85,6 +85,24 @@ class MMProcessor(ProcessorMixin):
     def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
         pass

+def _cal_max_frames_each_video(durations: list[float], video_maxlen_ttl: int, video_maxlen: int) -> list[int]:
+    r"""Allocate a frame budget to each video in proportion to its duration.
+
+    Returns one `max_num_of_frames` per video, each clamped to [2, video_maxlen],
+    with the total not exceeding `video_maxlen_ttl`.
+    """
+    dura_ttl = sum(durations)
+    max_nums_of_frames = [  # clamp each share so that 2 <= max_num_of_frames <= video_maxlen
+        min(max(int(video_maxlen_ttl * dura / dura_ttl), 2), video_maxlen) for dura in durations
+    ]
+    if sum(max_nums_of_frames) > video_maxlen_ttl:  # the 2-frame floor can push the sum over budget
+        delta = sum(max_nums_of_frames) - video_maxlen_ttl
+        for _ in range(delta):  # take one frame at a time from the largest allocation
+            max_idx = max_nums_of_frames.index(max(max_nums_of_frames))
+            if max_nums_of_frames[max_idx] - 1 >= 2:  # every video must keep at least 2 frames
+                max_nums_of_frames[max_idx] -= 1
+            else:
+                raise ValueError(
+                    "Too many videos: cannot keep at least 2 frames per video. Decrease the number of "
+                    f"videos or increase `video_maxlen_ttl` (e.g. >= {2 * len(max_nums_of_frames)})."
+                )
+    return max_nums_of_frames
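For intuition, a small worked example of the allocation above (all numbers are made up). With durations of 1 s, 1 s and 98 s, a total budget of 10 frames and a per-video cap of 8, the proportional shares are [0, 0, 9]; clamping to [2, 8] gives [2, 2, 8], whose sum of 12 exceeds the budget, so the loop trims the largest entry twice:

durations = [1.0, 1.0, 98.0]  # seconds; illustrative values
print(_cal_max_frames_each_video(durations, video_maxlen_ttl=10, video_maxlen=8))
# -> [2, 2, 6]: the short clips keep the 2-frame floor, the long clip absorbs the trimming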


def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
r"""Get paligemma token type ids for computing loss.
@@ -233,10 +251,20 @@ def _regularize_images(self, images: list["ImageInput"], **kwargs) -> dict[str,
     def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> dict[str, list[list["ImageObject"]]]:
         r"""Regularizes videos to avoid error. Including reading, resizing and converting."""
         results = []
-        for video in videos:
+        containers, video_streams, durations = [], [], []
+        for video in videos:  # first pass: open each video and collect its duration
             container = av.open(video, "r")
             video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(float(video_stream.duration * video_stream.time_base))  # unit: seconds
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(
+            durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"]
+        )
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream, video_fps=kwargs["video_fps"], video_maxlen=max_frames
+            )
             frames: list[ImageObject] = []
             container.seek(0)
             for frame_idx, frame in enumerate(container.decode(video_stream)):
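A note on the duration arithmetic in the first pass: in PyAV, `stream.duration` is an integer expressed in units of `stream.time_base` (a `Fraction`), so multiplying the two yields seconds. A standalone sketch (the file path is hypothetical, and `stream.duration` can be `None` for some containers, a case the diff does not guard against):

import av

with av.open("clip.mp4") as container:  # hypothetical file
    stream = next(s for s in container.streams if s.type == "video")
    if stream.duration is not None:
        seconds = float(stream.duration * stream.time_base)  # same expression as in the diff
    else:
        # fall back to the container-level duration, which is in av.time_base units
        seconds = container.duration / av.time_base
    print(f"duration: {seconds:.2f}s")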
@@ -326,6 +354,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )["videos"]
         if "videos" in inspect.signature(video_processor.preprocess).parameters:  # for qwen2_vl and video_llava
             mm_inputs.update(video_processor(images=None, videos=videos, return_tensors="pt"))
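On default settings the new cap is effectively inactive: the `getattr` fallback above is `128 * len(videos)`, i.e. the per-video default of 128 times the number of videos, so the total can never bind unless `video_maxlen_ttl` is set explicitly (in the normal flow, `patch_processor` sets it from `ProcessorArguments`, whose default is `128 * 50`). One caveat worth flagging: the fallback hardcodes 128, so raising `video_maxlen` above 128 without also raising `video_maxlen_ttl` makes the total cap start to bind. A quick illustration with made-up values:

videos = ["a.mp4", "b.mp4", "c.mp4"]  # hypothetical inputs
video_maxlen = 128
video_maxlen_ttl = 128 * len(videos)  # the fallback used above
assert video_maxlen * len(videos) <= video_maxlen_ttl  # never binds at defaults

video_maxlen = 256  # per-video limit raised, total cap unchanged
assert video_maxlen * len(videos) > video_maxlen_ttl  # now the total cap kicks in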
@@ -509,6 +538,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )["videos"]

         if len(images) != 0:
@@ -1046,6 +1076,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )["videos"]
         video_inputs = image_processor(videos, do_pad=True, max_slice_nums=2, return_tensors="pt")
         mm_inputs.update(video_inputs)
@@ -1430,10 +1461,20 @@ def _regularize_videos(
         self, videos: list["VideoInput"], **kwargs
     ) -> dict[str, Union[list[list["ImageObject"]], list[float]]]:
         results, fps_per_video = [], []
+        containers, video_streams, durations = [], [], []
         for video in videos:
             container = av.open(video, "r")
             video_stream = next(stream for stream in container.streams if stream.type == "video")
-            sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+            durations.append(float(video_stream.duration * video_stream.time_base))  # unit: seconds
+            containers.append(container)
+            video_streams.append(video_stream)
+        max_frames_each_video = _cal_max_frames_each_video(
+            durations, kwargs["video_maxlen_ttl"], kwargs["video_maxlen"]
+        )
+        for container, video_stream, max_frames in zip(containers, video_streams, max_frames_each_video):
+            sample_indices = self._get_video_sample_indices(
+                video_stream, video_fps=kwargs["video_fps"], video_maxlen=max_frames
+            )
             frames: list[ImageObject] = []
             container.seek(0)
             for frame_idx, frame in enumerate(container.decode(video_stream)):
@@ -1477,6 +1518,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )
         mm_inputs.update(image_processor(images=None, videos=video_data["videos"], return_tensors="pt"))
         temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
@@ -1568,6 +1610,7 @@ def _get_mm_inputs(
             image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
             video_fps=getattr(processor, "video_fps", 2.0),
             video_maxlen=getattr(processor, "video_maxlen", 128),
+            video_maxlen_ttl=getattr(processor, "video_maxlen_ttl", 128 * len(videos)),  # disabled by default
         )
         mm_inputs.update(image_processor(images=None, videos=video_dict["videos"], return_tensors="pt"))
         temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
6 changes: 5 additions & 1 deletion src/llamafactory/hparams/model_args.py
@@ -253,7 +253,11 @@ class ProcessorArguments:
     )
     video_maxlen: int = field(
         default=128,
-        metadata={"help": "The maximum number of sampled frames for video inputs."},
+        metadata={"help": "The maximum number of sampled frames for each individual video input."},
     )
+    video_maxlen_ttl: int = field(
+        default=128 * 50,  # assume at most 50 videos in one input
+        metadata={"help": "The maximum total number of sampled frames across all video inputs."},
+    )
     audio_sampling_rate: int = field(
         default=16000,
1 change: 1 addition & 0 deletions src/llamafactory/model/patcher.py
@@ -85,6 +85,7 @@ def patch_processor(
setattr(processor, "video_min_pixels", model_args.video_min_pixels)
setattr(processor, "video_fps", model_args.video_fps)
setattr(processor, "video_maxlen", model_args.video_maxlen)
setattr(processor, "video_maxlen_ttl", model_args.video_maxlen_ttl)
setattr(processor, "audio_sampling_rate", model_args.audio_sampling_rate)
setattr(processor, "use_audio_in_video", model_args.use_audio_in_video)
