Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions vllm_omni/benchmarks/data_modules/ucf101_multi_modal_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import base64
import logging
import os
from collections.abc import Mapping
from typing import Any

import numpy as np
from vllm.benchmarks.datasets import RandomMultiModalDataset

logger = logging.getLogger(__name__)


def load_ucf101_subset(
    dataset_path: str,
    subset_ratio: float = 0.1,
    random_seed: int = 42,
) -> list[str]:
    """
    Load a subset of the UCF101 dataset following the standard UCF101 directory structure.

    UCF101 directory format: dataset_path/Class_Name/Video_Name.avi/mp4

    Args:
        dataset_path: Root directory containing one subdirectory per action class.
        subset_ratio: Fraction of each class's videos to sample (at least one per class).
        random_seed: Seed for the sampling RNG, for reproducible subsets.

    Returns:
        Paths of the sampled video files across all classes.

    Raises:
        ValueError: If no video files are found or the input path is invalid.
    """
    if not os.path.isdir(dataset_path):
        raise ValueError(f"UCF101 dataset path is not a valid directory: {dataset_path}")

    video_paths: list[str] = []
    rng = np.random.RandomState(random_seed)
    num_classes = 0  # count only class directories, not stray files in dataset_path

    # Sort directory listings: os.listdir order is filesystem-dependent, and
    # without sorting the fixed seed would not give reproducible subsets.
    for class_name in sorted(os.listdir(dataset_path)):
        class_dir = os.path.join(dataset_path, class_name)
        if not os.path.isdir(class_dir):
            continue
        num_classes += 1

        class_videos = [
            os.path.join(class_dir, f) for f in sorted(os.listdir(class_dir)) if f.lower().endswith((".avi", ".mp4"))
        ]
        if not class_videos:
            # Lazy %-args: message is only formatted if the record is emitted.
            logger.warning("No video files found in class directory: %s", class_dir)
            continue

        # Sample at least one video so no class is silently dropped at small ratios.
        subset_size = max(1, int(len(class_videos) * subset_ratio))
        subset_videos = rng.choice(class_videos, size=subset_size, replace=False).tolist()
        video_paths.extend(subset_videos)

    if not video_paths:
        raise ValueError(f"No valid UCF101 video files found in {dataset_path} (support .avi/.mp4)")

    logger.info(
        "Successfully loaded UCF101 subset: %d videos from %d classes", len(video_paths), num_classes
    )
    return video_paths


def process_ucf101_video(video_file_path: str) -> Mapping[str, Any]:
    """
    Process a single UCF101 video file and return a multimedia content dictionary.

    Aligns with the output format of process_video from RandomMultiModalDataset to
    ensure upper-layer logic is unaware of the underlying video source.

    Args:
        video_file_path: Path to a ``.avi`` or ``.mp4`` video file.

    Returns:
        OpenAI-style ``video_url`` content dict carrying a base64 data URL.

    Raises:
        FileNotFoundError: If the video file does not exist.
        IOError: If reading the video file fails.
    """
    if not os.path.exists(video_file_path):
        raise FileNotFoundError(f"UCF101 video file not found: {video_file_path}")

    try:
        with open(video_file_path, "rb") as f:
            video_base64 = base64.b64encode(f.read()).decode("utf-8")
    except OSError as e:
        # Chain the original error so the root cause stays in the traceback.
        raise OSError(f"Failed to read UCF101 video {video_file_path}: {str(e)}") from e

    # Derive the MIME type from the file extension: UCF101 ships mostly .avi
    # files, and labeling them video/mp4 in the data URL can make downstream
    # decoders that trust the header fail. Unknown extensions keep the prior
    # video/mp4 default for backward compatibility.
    ext = os.path.splitext(video_file_path)[1].lower()
    mime_type = {".avi": "video/x-msvideo", ".mp4": "video/mp4"}.get(ext, "video/mp4")

    return {
        "type": "video_url",
        "video_url": {"url": f"data:{mime_type};base64,{video_base64}"},
    }


# -----------------------------------------------------------------------------
# UCF101 MultiModalDataset Implementation
# -----------------------------------------------------------------------------
class UCF101MultiModalDataset(RandomMultiModalDataset):
    """Multimodal benchmark dataset backed by real UCF101 videos.

    Video items are drawn from a locally stored UCF101 subset, while image
    and audio items fall back to the synthetic generation of the parent
    RandomMultiModalDataset.
    """

    def __init__(
        self,
        dataset_path: str,
        subset_ratio: float = 0.1,
        random_seed: int = 42,
        **kwargs,
    ):
        super().__init__(random_seed=random_seed, **kwargs)

        self.dataset_path = dataset_path
        self.subset_ratio = subset_ratio

        # Resolve the sampled video paths once, at construction time.
        self.ucf101_video_paths = load_ucf101_subset(
            dataset_path=dataset_path,
            subset_ratio=subset_ratio,
            random_seed=random_seed,
        )

        # Dedicated RNG so video selection stays reproducible and independent
        # of the parent class's random state.
        self._video_rng = np.random.RandomState(random_seed)

    def sample_ucf101_video(self) -> Mapping[str, Any]:
        """Pick one loaded video at random and convert it to a content dict."""
        chosen = self._video_rng.choice(self.ucf101_video_paths)
        return process_ucf101_video(chosen)

    def generate_mm_item(
        self,
        mm_item_config: tuple[int, int, int],
    ) -> Mapping[str, Any]:
        """
        Create UCF101 video items or synthetic image/audio items.

        Follows the OpenAI API chat completions format:
        https://github.com/openai/openai-python
        """
        modality = self.map_config_to_modality(mm_item_config)

        if modality == "video":
            return self.sample_ucf101_video()
        if modality in ("image", "audio"):
            return super().generate_mm_item(mm_item_config)
        raise ValueError(f"Invalid multimodal item configuration: {mm_item_config}")
23 changes: 23 additions & 0 deletions vllm_omni/benchmarks/patch/patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,29 @@
no_oversample=args.no_oversample,
)
return input_requests
elif args.dataset_name == "hf":
if not args.dataset_path:
raise ValueError("dataset_path must be specified for ucf101-subset dataset.")
dataset = UCF101MultiModalDataset(

Check failure on line 64 in vllm_omni/benchmarks/patch/patch.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (F821)

vllm_omni/benchmarks/patch/patch.py:64:19: F821 Undefined name `UCF101MultiModalDataset`

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Add missing import for UCF101MultiModalDataset

When args.dataset_name == "hf" this branch constructs UCF101MultiModalDataset, but patch.py never imports that symbol, so the benchmark will raise NameError at runtime as soon as the branch is hit. This makes the new dataset path unusable in the openai-chat-omni flow until the class is imported from vllm_omni.benchmarks.data_modules.ucf101_multi_modal_dataset.

Useful? React with 👍 / 👎.

dataset_path=args.dataset_path,
subset_ratio=getattr(args, "ucf101_subset_ratio", 0.1),
random_seed=args.seed,
)
input_requests = dataset.sample(
tokenizer=tokenizer,
num_requests=args.num_prompts,
prefix_len=args.random_prefix_len,
range_ratio=args.random_range_ratio,
input_len=args.random_input_len,
output_len=args.random_output_len,
base_items_per_request=args.random_mm_base_items_per_request,
limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt,
num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
bucket_config=args.random_mm_bucket_config,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
)
return input_requests
else:
return get_samples_old(args, tokenizer)

Expand Down
Loading