Skip to content

Commit a61bf46

Browse files
committed
kcz/add_mp4_disabling_into_frames
1 parent 29e8b27 commit a61bf46

File tree

5 files changed

+105
-64
lines changed

5 files changed

+105
-64
lines changed

tools/llm_bench/benchmark.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,8 @@ def get_argprser():
219219
help="Path to .bin or .pt file with speaker embeddings for text to speech scenarios")
220220
parser.add_argument("--vocoder_path", type=str, default=None,
221221
help="Path to vocoder for text to speech scenarios")
222+
parser.add_argument("--video_frames", type=int, default=None,
223+
help="number of video frames to process")
222224
return parser.parse_args()
223225

224226

tools/llm_bench/llm_bench_utils/parse_json_data.py

Lines changed: 33 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,78 +2,63 @@
22
# Copyright (C) 2023-2025 Intel Corporation
33
# SPDX-License-Identifier: Apache-2.0
44

5+
def create_base_prompt(json_data, key='prompt'):
    """Extract the mandatory ``key`` entry of one prompt-file record.

    Args:
        json_data: Mapping parsed from a single record of the prompt file.
        key: Name of the mandatory field (``'prompt'`` by default).

    Returns:
        A new single-entry dict ``{key: json_data[key]}``.

    Raises:
        RuntimeError: If ``key`` is absent from ``json_data`` or its value is
            the empty string.
    """
    # Guard clauses keep the two failure modes flat and easy to tell apart.
    if key not in json_data:
        raise RuntimeError(f"== key word '{key}' does not exist ==")
    value = json_data[key]
    if value == "":
        raise RuntimeError(f"== {key} should not be empty string ==")
    return {key: value}
513

614
def parse_text_json_data(json_data_list):
    """Return the prompt text of every record in ``json_data_list``.

    Each record is validated through ``create_base_prompt``, so a record
    whose ``'prompt'`` is missing or empty raises ``RuntimeError``.
    """
    return [create_base_prompt(record)["prompt"] for record in json_data_list]
1720

1821

1922
def parse_vlm_json_data(json_data_list):
    """Build the prompt parameters for visual-language prompt records.

    Args:
        json_data_list: Iterable of mappings, one per prompt-file record.

    Returns:
        A list of dicts. Each dict holds the mandatory ``"prompt"`` plus
        ``"media"`` or ``"video"`` when the record provides one of them.

    Raises:
        RuntimeError: If ``"prompt"`` is missing or empty (raised by
            ``create_base_prompt``), or if a record specifies both
            ``"media"`` and ``"video"``.
    """
    text_param_list = []
    for json_data in json_data_list:
        prompt_data = create_base_prompt(json_data)
        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O`` and give no hint about what is wrong with the record.
        # Records with neither key stay valid (text-only prompt).
        if "media" in json_data and "video" in json_data:
            raise RuntimeError("== 'media' and 'video' cannot be specified in the same prompt ==")
        for source_key in ("media", "video"):
            if source_key in json_data:
                prompt_data[source_key] = json_data[source_key]
        text_param_list.append(prompt_data)
    return text_param_list
3433

3534

3635
def parse_image_json_data(json_data_list):
    """Build the image-generation parameters for every prompt record.

    The mandatory ``'prompt'`` is taken via ``create_base_prompt``; the
    optional fields listed below are copied only when the record has them,
    with the numeric ones converted to ``int`` / ``float``.
    """
    # (field name, converter) pairs; ``None`` means "copy the value as is".
    optional_fields = (
        ('width', int),
        ('height', int),
        ('steps', int),
        ('guidance_scale', float),
        ('media', None),
        ('mask_image', None),
    )
    image_param_list = []
    for record in json_data_list:
        image_param = create_base_prompt(record)
        for field, convert in optional_fields:
            if field not in record:
                continue
            raw_value = record[field]
            image_param[field] = raw_value if convert is None else convert(raw_value)
        image_param_list.append(image_param)
    return image_param_list
6153

6254

6355
def parse_speech_json_data(json_data_list):
    """Build the speech parameters for every prompt record.

    ``"media"`` is mandatory (checked by ``create_base_prompt``);
    ``"language"`` and ``"timestamp"`` are copied when the record has them.
    """
    speech_param_list = []
    for record in json_data_list:
        speech_param = create_base_prompt(record, "media")
        speech_param.update(
            (field, record[field])
            for field in ("language", "timestamp")
            if field in record
        )
        speech_param_list.append(speech_param)
    return speech_param_list

tools/llm_bench/llm_bench_utils/prompt_utils.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@
22
# Copyright (C) 2023-2025 Intel Corporation
33
# SPDX-License-Identifier: Apache-2.0
44

5+
6+
import os
7+
import cv2
8+
from PIL import Image
9+
import logging as log
510
from .model_utils import get_param_from_file
611
from .parse_json_data import parse_text_json_data
712

8-
913
def get_text_prompt(args):
1014
text_list = []
1115
output_data_list, is_json_data = get_param_from_file(args, 'prompt')
@@ -17,3 +21,42 @@ def get_text_prompt(args):
1721
else:
1822
text_list.append(output_data_list[0])
1923
return text_list
24+
25+
26+
def print_frames_number(func):
    """Decorate a frame extractor so that it logs its request and result size.

    Args:
        func: Callable taking ``(video_path, decym_frames)`` and returning a
            sized collection of frames.

    Returns:
        A wrapper with the same name and docstring as ``func`` that logs the
        input path, the requested frame count and the number of frames
        actually returned, then passes the result through unchanged.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def inner(video_path, decym_frames=None):
        # ``decym_frames`` defaults to None so the wrapper accepts the same
        # one-argument call as the function it decorates.
        log.info(f"Input video file: {video_path}")
        log.info(f"Requested to reduce into {decym_frames} frames")
        out_frames = func(video_path, decym_frames)
        log.info(f"Final frames number: {len(out_frames)}")
        return out_frames
    return inner
34+
35+
@print_frames_number
def split_video_into_frames(video_path, decym_frames=None):
    """Decode a video file into a list of RGB PIL images, optionally thinned.

    Args:
        video_path: ``str`` or ``os.PathLike`` location of the video file.
            Only the ``.mp4`` extension is accepted.
        decym_frames: Requested number of frames. ``None``, a non-positive
            value, or a value not smaller than the decoded frame count
            disables decimation.

    Returns:
        The decoded frames as ``PIL.Image`` objects in the order they were
        read. When decimation applies, every ``k``-th frame is kept with
        ``k = len(frames) // decym_frames``; because ``k`` is rounded down the
        result can hold somewhat more than ``decym_frames`` frames, and no
        frame is dropped when ``k`` is below 2.

    Raises:
        FileNotFoundError: If ``video_path`` does not exist.
        ValueError: If the file extension is not supported.
    """
    supported_files = {".mp4"}

    # Normalise once: works for both str and Path, and cv2 gets a plain str.
    video_file = os.fspath(video_path)
    if not os.path.exists(video_file):
        raise FileNotFoundError(f"no input video file: {video_path}")
    if os.path.splitext(video_file)[1].lower() not in supported_files:
        raise ValueError("no supported video file")

    cap = cv2.VideoCapture(video_file)
    output_frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # cv2 yields BGR arrays; convert to RGB before wrapping in PIL.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            output_frames.append(Image.fromarray(frame_rgb))
    finally:
        # Always free the capture handle, even if decoding fails midway.
        cap.release()

    if decym_frames is None:
        return output_frames

    # Decimation procedure: decym_frames is the requested frame number.
    decym_frames = int(decym_frames)
    if decym_frames <= 0 or len(output_frames) <= decym_frames:
        return output_frames
    # Integer division: a slice step must be an int, a float raises TypeError.
    decym_factor = len(output_frames) // decym_frames
    if decym_factor >= 2:
        return output_frames[::decym_factor]
    return output_frames

tools/llm_bench/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ pillow
88
torch
99
transformers[sentencepiece]>=4.40.0
1010
diffusers>=0.22.0
11-
#optimum is in dependency list of optimum-intel
11+
#optimum is in dependency list of optimum-intel
1212
optimum-intel[nncf]>=1.25.0
1313
packaging
1414
psutil
@@ -20,3 +20,4 @@ jinja2>=3.1.0
2020
scipy
2121
gguf_parser
2222
gguf>=0.10
23+
opencv-python

tools/llm_bench/task/visual_language_generation.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,17 @@
1717
import llm_bench_utils.output_file
1818
import llm_bench_utils.gen_output_data as gen_output_data
1919
import llm_bench_utils.parse_json_data as parse_json_data
20+
import llm_bench_utils.prompt_utils as pu
2021
from pathlib import Path
2122

22-
2323
FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils}
2424

2525
DEFAULT_OUTPUT_TOKEN_SIZE = 512
2626

2727

2828
def run_visual_language_generation_optimum(
29-
inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, bench_hook, model_precision, proc_id, mem_consumption
30-
):
29+
inputs, num, model, processor, args, iter_data_list, md5_list,
30+
prompt_index, bench_hook, model_precision, proc_id, mem_consumption):
3131
from optimum.intel.utils.import_utils import is_transformers_version
3232
set_seed(args['seed'])
3333
if args['batch_size'] != 1:
@@ -37,13 +37,17 @@ def run_visual_language_generation_optimum(
3737
prompts = []
3838
inputs = [inputs] if not isinstance(inputs, (list, tuple)) else inputs
3939
for input_data in inputs:
40-
if input_data.get("media", None):
40+
if input_data.get("video", None):
41+
entry = Path(input_data["video"])
42+
required_frames = args.get("video_frames")
43+
ordered_frames = pu.split_video_into_frames(entry, required_frames)
44+
images.extend(ordered_frames)
45+
elif input_data.get("media", None):
4146
entry = Path(input_data["media"])
4247
if entry.is_dir():
4348
for file in sorted(entry.iterdir()):
4449
images.append(load_image(str(file)))
45-
else:
46-
images.append(load_image(input_data["media"]))
50+
else: images.append(load_image(input_data["media"]))
4751
prompts.append(input_data["prompt"])
4852
prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
4953
log.info(f'{prefix}[P{prompt_index}] Input image nums:{len(images)}')
@@ -198,13 +202,17 @@ def run_visual_language_generation_genai(
198202
prompts = []
199203
inputs = [inputs] if not isinstance(inputs, (list, tuple)) else inputs
200204
for input_data in inputs:
201-
if input_data.get("media", None):
205+
if input_data.get("video", None):
206+
entry = Path(input_data["media"])
207+
required_frames = args.get('video_frames')
208+
ordered_frames = pu.split_video_into_frames(entry, required_frames)
209+
images.extend(ordered_frames)
210+
elif input_data.get("media", None):
202211
entry = Path(input_data["media"])
203212
if entry.is_dir():
204213
for file in sorted(entry.iterdir()):
205214
images.append(load_image_genai(str(file)))
206-
else:
207-
images.append(load_image_genai(input_data["media"]))
215+
else: images.append(load_image_genai(input_data["media"]))
208216
prompts.append(input_data["prompt"])
209217
if args["output_dir"] is not None and num == 0:
210218
for bs_index, in_text in enumerate(prompts):
@@ -365,14 +373,16 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
365373

366374
def get_image_text_prompt(args):
    """Collect the visual-language prompt entries described by ``args``.

    For JSON input every record is parsed with ``parse_vlm_json_data`` and,
    when a prompt file was given, its ``'media'`` (or otherwise ``'video'``)
    path is passed through ``model_utils.resolve_media_file_path`` together
    with the first prompt file. For non-JSON input the raw data is returned
    as the single list element.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view; the trailing ``else`` is taken to pair with ``if is_json_data``.
    # NOTE(review): only "media" and "prompt" are requested here although
    # records may carry "video"; confirm get_param_from_file does not need it.
    output_data_list, is_json_data = model_utils.get_param_from_file(args, ["media", "prompt"])
    if not is_json_data:
        return [output_data_list]

    vlm_file_list = []
    for vlm_file in parse_json_data.parse_vlm_json_data(output_data_list):
        prompt_files = args['prompt_file']
        if prompt_files is not None and len(prompt_files) > 0:
            if 'media' in vlm_file:
                if 'video' in vlm_file:
                    log.warning('media and video cannot be specify in a single prompt file')
                vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get('media'), prompt_files[0])
            elif 'video' in vlm_file:
                vlm_file['video'] = model_utils.resolve_media_file_path(vlm_file.get('video'), prompt_files[0])
        vlm_file_list.append(vlm_file)
    return vlm_file_list

0 commit comments

Comments
 (0)