Skip to content

Commit 5cf5823

Browse files
committed
kcz/add_mp4_disabling_into_frames
1 parent a77ce48 commit 5cf5823

File tree

4 files changed

+44
-3
lines changed

4 files changed

+44
-3
lines changed

tools/llm_bench/benchmark.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,8 @@ def get_argprser():
219219
help="Path to .bin or .pt file with speaker embeddings for text to speech scenarios")
220220
parser.add_argument("--vocoder_path", type=str, default=None,
221221
help="Path to vocoder for text to speech scenarios")
222+
parser.add_argument("--video_frames", type=int, default=None,
223+
help="number of video frames to process")
222224
return parser.parse_args()
223225

224226

tools/llm_bench/llm_bench_utils/prompt_utils.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
22
# Copyright (C) 2023-2025 Intel Corporation
33
# SPDX-License-Identifier: Apache-2.0
44

5+
6+
import os
7+
import cv2
8+
from PIL import Image
59
from .model_utils import get_param_from_file
610
from .parse_json_data import parse_text_json_data
711

8-
912
def get_text_prompt(args):
1013
text_list = []
1114
output_data_list, is_json_data = get_param_from_file(args, 'prompt')
@@ -17,3 +20,31 @@ def get_text_prompt(args):
1720
else:
1821
text_list.append(output_data_list[0])
1922
return text_list
23+
24+
25+
def split_video_into_frames(video_path, decym_frames=None):
    """Decode a video file into a list of RGB ``PIL.Image`` frames.

    :param video_path: ``pathlib.Path`` to the input video; only ``.mp4``
        is supported.
    :param decym_frames: optional target number of frames; when set and the
        video has more frames, the list is decimated by taking every
        N-th frame (the result may still slightly exceed this count when
        the integer decimation factor rounds down — matches prior behavior).
    :return: list of frames in original playback order.
    :raises FileNotFoundError: if ``video_path`` does not exist.
    :raises ValueError: if the file extension is not supported.
    """
    supported_files = {".mp4"}
    if not os.path.exists(video_path):
        raise FileNotFoundError("no input video file")
    if video_path.suffix.lower() not in supported_files:
        raise ValueError("no supported video file")

    # cv2.VideoCapture expects a string path, not a pathlib.Path
    cap = cv2.VideoCapture(str(video_path))
    output_frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes frames as BGR; convert to RGB for PIL
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            output_frames.append(Image.fromarray(frame_rgb))
    finally:
        cap.release()  # always free the decoder, even if conversion fails

    if decym_frames is None:
        return output_frames

    # Decimation: decym_frames is the required frame number.
    decym_frames = int(decym_frames)
    if len(output_frames) <= decym_frames:
        return output_frames
    # Integer division — a float slice step would raise TypeError.
    decym_factor = len(output_frames) // decym_frames
    if decym_factor >= 2:
        return output_frames[::decym_factor]
    return output_frames

tools/llm_bench/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ pillow
88
torch
99
transformers[sentencepiece]>=4.40.0
1010
diffusers>=0.22.0
11-
#optimum is in dependency list of optimum-intel
11+
#optimum is in dependency list of optimum-intel
1212
optimum-intel[nncf]>=1.25.0
1313
packaging
1414
psutil
@@ -20,3 +20,4 @@ jinja2>=3.1.0
2020
scipy
2121
gguf_parser
2222
gguf>=0.10
23+
opencv-python

tools/llm_bench/task/visual_language_generation.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@
1717
import llm_bench_utils.output_file
1818
import llm_bench_utils.gen_output_data as gen_output_data
1919
import llm_bench_utils.parse_json_data as parse_json_data
20+
import llm_bench_utils.prompt_utils as pu
2021
from pathlib import Path
2122

22-
2323
FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils}
2424

2525
DEFAULT_OUTPUT_TOKEN_SIZE = 512
@@ -44,6 +44,13 @@ def run_visual_language_generation_optimum(
4444
images.append(load_image(str(file)))
4545
else:
4646
images.append(load_image(input_data["media"]))
47+
48+
if input_data.get("video", None):
49+
entry = Path(input_data["video"])
50+
required_frames = args.get('video_frames')
51+
ordered_frames = pu.split_video_into_frames(entry, required_frames)
52+
images.extend(ordered_frames)
53+
4754
prompts.append(input_data["prompt"])
4855
prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
4956
log.info(f'{prefix}[P{prompt_index}] Input image nums:{len(images)}')

0 commit comments

Comments
 (0)