Skip to content

Commit 8c8c93d

Browse files
committed
kcz/add_mp4_disabling_into_frames
1 parent 29e8b27 commit 8c8c93d

File tree

5 files changed

+118
-77
lines changed

5 files changed

+118
-77
lines changed

tools/llm_bench/benchmark.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,8 @@ def get_argprser():
219219
help="Path to .bin or .pt file with speaker embeddings for text to speech scenarios")
220220
parser.add_argument("--vocoder_path", type=str, default=None,
221221
help="Path to vocoder for text to speech scenarios")
222+
parser.add_argument("-vf", "--video_frames", type=int, default=None,
223+
help="number of video frames to process")
222224
return parser.parse_args()
223225

224226

@@ -244,7 +246,6 @@ def main():
244246
**logging_kwargs
245247
)
246248
args = get_argprser()
247-
248249
if args.tokens_len is not None and not args.streaming:
249250
log.error("--tokens_len requires --streaming to be set.")
250251
exit(1)
@@ -305,7 +306,8 @@ def main():
305306
args.num_iters, memory_data_collector)
306307
else:
307308
iter_data_list, pretrain_time, iter_timestamp = CASE_TO_BENCH[model_args['use_case'].task](
308-
model_path, framework, args.device, model_args, args.num_iters, memory_data_collector)
309+
model_path, framework, args.device, model_args, args.num_iters,
310+
memory_data_collector, args.video_frames)
309311
if args.report is not None or args.report_json is not None:
310312
model_precision = ''
311313
if framework == 'ov':

tools/llm_bench/llm_bench_utils/parse_json_data.py

Lines changed: 33 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,78 +2,63 @@
22
# Copyright (C) 2023-2025 Intel Corporation
33
# SPDX-License-Identifier: Apache-2.0
44

5+
def create_base_prompt(json_data, key='prompt'):
6+
prompt_data = {}
7+
if key in json_data:
8+
if json_data[key] != "":
9+
prompt_data[key] = json_data[key]
10+
else: raise RuntimeError(f"== {key} should not be empty string ==")
11+
else: raise RuntimeError(f"== key word '{key}' does not exist ==")
12+
return prompt_data
513

614
def parse_text_json_data(json_data_list):
715
text_param_list = []
816
for json_data in json_data_list:
9-
if 'prompt' in json_data:
10-
if json_data['prompt'] != '':
11-
text_param_list.append(json_data['prompt'])
12-
else:
13-
raise RuntimeError('== prompt should not be empty string ==')
14-
else:
15-
raise RuntimeError('== key word "prompt" does not exist ==')
17+
prompt_data = create_base_prompt(json_data)
18+
text_param_list.append(prompt_data["prompt"])
1619
return text_param_list
1720

1821

1922
def parse_vlm_json_data(json_data_list):
2023
text_param_list = []
2124
for json_data in json_data_list:
22-
prompt_data = {}
23-
if 'prompt' in json_data:
24-
if json_data['prompt'] != '':
25-
prompt_data["prompt"] = json_data['prompt']
26-
else:
27-
raise RuntimeError('== prompt should not be empty string ==')
28-
else:
29-
raise RuntimeError('== key word "prompt" does not exist ==')
25+
prompt_data = create_base_prompt(json_data)
26+
assert ("media" in json_data) ^ ("video" in json_data)
3027
if "media" in json_data:
3128
prompt_data["media"] = json_data["media"]
29+
if "video" in json_data:
30+
prompt_data["video"] = json_data["video"]
3231
text_param_list.append(prompt_data)
3332
return text_param_list
3433

3534

3635
def parse_image_json_data(json_data_list):
3736
image_param_list = []
38-
for data in json_data_list:
39-
image_param = {}
40-
if 'prompt' in data:
41-
if data['prompt'] != '':
42-
image_param['prompt'] = data['prompt']
43-
else:
44-
raise RuntimeError('== prompt should not be empty string ==')
45-
else:
46-
raise RuntimeError('== key word "prompt" does not exist in prompt file ==')
47-
if 'width' in data:
48-
image_param['width'] = int(data['width'])
49-
if 'height' in data:
50-
image_param['height'] = int(data['height'])
51-
if 'steps' in data:
52-
image_param['steps'] = int(data['steps'])
53-
if 'guidance_scale' in data:
54-
image_param['guidance_scale'] = float(data['guidance_scale'])
55-
if 'media' in data:
56-
image_param['media'] = data['media']
57-
if 'mask_image' in data:
58-
image_param['mask_image'] = data['mask_image']
37+
for json_data in json_data_list:
38+
image_param = create_base_prompt(json_data)
39+
if 'width' in json_data:
40+
image_param['width'] = int(json_data['width'])
41+
if 'height' in json_data:
42+
image_param['height'] = int(json_data['height'])
43+
if 'steps' in json_data:
44+
image_param['steps'] = int(json_data['steps'])
45+
if 'guidance_scale' in json_data:
46+
image_param['guidance_scale'] = float(json_data['guidance_scale'])
47+
if 'media' in json_data:
48+
image_param['media'] = json_data['media']
49+
if 'mask_image' in json_data:
50+
image_param['mask_image'] = json_data['mask_image']
5951
image_param_list.append(image_param)
6052
return image_param_list
6153

6254

6355
def parse_speech_json_data(json_data_list):
6456
speech_param_list = []
6557
for json_data in json_data_list:
66-
speech_param = {}
67-
if 'media' in json_data:
68-
if json_data['media'] != '':
69-
speech_param['media'] = json_data['media']
70-
else:
71-
raise RuntimeError('== media path should not be empty string ==')
72-
else:
73-
raise RuntimeError('== key word "media" does not exist ==')
74-
if 'language' in json_data:
75-
speech_param['language'] = json_data['language']
76-
if 'timestamp' in json_data:
77-
speech_param['timestamp'] = json_data['timestamp']
58+
speech_param = create_base_prompt(json_data, "media")
59+
if "language" in json_data:
60+
speech_param["language"] = json_data["language"]
61+
if "timestamp" in json_data:
62+
speech_param["timestamp"] = json_data["timestamp"]
7863
speech_param_list.append(speech_param)
7964
return speech_param_list

tools/llm_bench/llm_bench_utils/prompt_utils.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@
22
# Copyright (C) 2023-2025 Intel Corporation
33
# SPDX-License-Identifier: Apache-2.0
44

5+
6+
import os
7+
import cv2
8+
from PIL import Image
9+
import logging as log
510
from .model_utils import get_param_from_file
611
from .parse_json_data import parse_text_json_data
712

8-
913
def get_text_prompt(args):
1014
text_list = []
1115
output_data_list, is_json_data = get_param_from_file(args, 'prompt')
@@ -17,3 +21,43 @@ def get_text_prompt(args):
1721
else:
1822
text_list.append(output_data_list[0])
1923
return text_list
24+
25+
26+
def print_frames_number(func):
27+
def inner(video_path, decym_frames):
28+
log.info(f"Input video file: {video_path}")
29+
if decym_frames is not None:
30+
log.info(f"Requested to reduce into {decym_frames} frames")
31+
out_frames = func(video_path, decym_frames)
32+
log.info(f"Final frames number: {len(out_frames)}")
33+
return out_frames
34+
return inner
35+
36+
@print_frames_number
37+
def split_video_into_frames(video_path, decym_frames=None):
38+
supported_files = set([".mp4"])
39+
40+
assert os.path.exists(video_path), f"no input video file: {video_path}"
41+
assert video_path.suffix.lower() in supported_files, "unsupported video file"
42+
cap = cv2.VideoCapture(video_path)
43+
44+
output_frames = []
45+
while True:
46+
ret, frame = cap.read()
47+
if not ret: break
48+
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
49+
pil_image = Image.fromarray(frame_rgb)
50+
output_frames.append(pil_image)
51+
if decym_frames is None:
52+
return output_frames
53+
54+
# decimation procedure:
55+
# decym_frames is the required number of output frames
56+
#
57+
decym_frames = int(decym_frames)
58+
if len(output_frames) <= decym_frames:
59+
return output_frames
60+
decym_factor = len(output_frames) / decym_frames
61+
if decym_factor >= 2:
62+
return list(output_frames[::decym_factor])
63+
return output_frames

tools/llm_bench/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ pillow
88
torch
99
transformers[sentencepiece]>=4.40.0
1010
diffusers>=0.22.0
11-
#optimum is in dependency list of optimum-intel
11+
#optimum is in dependency list of optimum-intel
1212
optimum-intel[nncf]>=1.25.0
1313
packaging
1414
psutil
@@ -20,3 +20,4 @@ jinja2>=3.1.0
2020
scipy
2121
gguf_parser
2222
gguf>=0.10
23+
opencv-python

tools/llm_bench/task/visual_language_generation.py

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,17 @@
1717
import llm_bench_utils.output_file
1818
import llm_bench_utils.gen_output_data as gen_output_data
1919
import llm_bench_utils.parse_json_data as parse_json_data
20+
import llm_bench_utils.prompt_utils as pu
2021
from pathlib import Path
2122

22-
2323
FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils}
2424

2525
DEFAULT_OUTPUT_TOKEN_SIZE = 512
2626

2727

2828
def run_visual_language_generation_optimum(
29-
inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, bench_hook, model_precision, proc_id, mem_consumption
30-
):
29+
inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index,
30+
bench_hook, model_precision, proc_id, mem_consumption, required_frames=None):
3131
from optimum.intel.utils.import_utils import is_transformers_version
3232
set_seed(args['seed'])
3333
if args['batch_size'] != 1:
@@ -37,13 +37,16 @@ def run_visual_language_generation_optimum(
3737
prompts = []
3838
inputs = [inputs] if not isinstance(inputs, (list, tuple)) else inputs
3939
for input_data in inputs:
40-
if input_data.get("media", None):
40+
if input_data.get("video", None):
41+
entry = Path(input_data["video"])
42+
ordered_frames = pu.split_video_into_frames(entry, required_frames)
43+
images.extend(ordered_frames)
44+
elif input_data.get("media", None):
4145
entry = Path(input_data["media"])
4246
if entry.is_dir():
4347
for file in sorted(entry.iterdir()):
4448
images.append(load_image(str(file)))
45-
else:
46-
images.append(load_image(input_data["media"]))
49+
else: images.append(load_image(input_data["media"]))
4750
prompts.append(input_data["prompt"])
4851
prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
4952
log.info(f'{prefix}[P{prompt_index}] Input image nums:{len(images)}')
@@ -189,22 +192,25 @@ def load_image_genai(image_path):
189192

190193

191194
def run_visual_language_generation_genai(
192-
inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id, mem_consumption
193-
):
195+
inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index,
196+
streamer, model_precision, proc_id, mem_consumption, required_frames=None):
194197
if args['batch_size'] != 1:
195198
log.warning("Only batch size 1 available for benchmarking")
196199
args["batch_size"] = 1
197200
images = []
198201
prompts = []
199202
inputs = [inputs] if not isinstance(inputs, (list, tuple)) else inputs
200203
for input_data in inputs:
201-
if input_data.get("media", None):
204+
if input_data.get("video", None):
205+
entry = Path(input_data["media"])
206+
ordered_frames = pu.split_video_into_frames(entry, required_frames)
207+
images.extend(ordered_frames)
208+
elif input_data.get("media", None):
202209
entry = Path(input_data["media"])
203210
if entry.is_dir():
204211
for file in sorted(entry.iterdir()):
205212
images.append(load_image_genai(str(file)))
206-
else:
207-
images.append(load_image_genai(input_data["media"]))
213+
else: images.append(load_image_genai(input_data["media"]))
208214
prompts.append(input_data["prompt"])
209215
if args["output_dir"] is not None and num == 0:
210216
for bs_index, in_text in enumerate(prompts):
@@ -304,8 +310,11 @@ def run_visual_language_generation_genai(
304310
metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
305311

306312

307-
def run_visual_language_generation_benchmark(model_path, framework, device, args, num_iters, mem_consumption):
308-
model, processor, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_image_text_gen_model(model_path, device, mem_consumption, **args)
313+
def run_visual_language_generation_benchmark(
314+
model_path, framework, device, args, num_iters,
315+
mem_consumption, required_frames=None):
316+
outs = FW_UTILS[framework].create_image_text_gen_model(model_path, device, mem_consumption, **args)
317+
model, processor, pretrain_time, bench_hook, use_genai = outs
309318
model_precision = model_utils.get_model_precision(model_path.parts)
310319
iter_data_list = []
311320
md5_list = {num : {} for num in range(num_iters + 1)}
@@ -325,10 +334,8 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
325334
log.info(f"Numbeams: {args['num_beams']}, benchmarking iter nums(exclude warm-up): {num_iters}, "
326335
f'prompt nums: {len(image_text_list)}, prompt idx: {prompt_idx_list}')
327336

328-
if not use_genai:
329-
gen_fn = run_visual_language_generation_optimum
330-
else:
331-
gen_fn = run_visual_language_generation_genai
337+
if use_genai: gen_fn = run_visual_language_generation_genai
338+
else: gen_fn = run_visual_language_generation_optimum
332339

333340
proc_id = os.getpid()
334341
iter_timestamp = model_utils.init_timestamp(num_iters, image_text_list, prompt_idx_list)
@@ -341,7 +348,7 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
341348
iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat()
342349
gen_fn(
343350
input_text, num, model, processor, args, iter_data_list, md5_list,
344-
p_idx, bench_hook, model_precision, proc_id, mem_consumption)
351+
p_idx, bench_hook, model_precision, proc_id, mem_consumption, required_frames)
345352
iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat()
346353
prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
347354
log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}")
@@ -353,8 +360,8 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
353360
log.info(f'[warm-up][P{p_idx}] Input text: {input_text}')
354361
iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat()
355362
gen_fn(
356-
input_text, num, model, processor, args, iter_data_list, md5_list,
357-
prompt_idx_list[idx], bench_hook, model_precision, proc_id, mem_consumption)
363+
input_text, num, model, processor, args, iter_data_list, md5_list, prompt_idx_list[idx],
364+
bench_hook, model_precision, proc_id, mem_consumption, required_frames)
358365
iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat()
359366
prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
360367
log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}")
@@ -365,14 +372,16 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
365372

366373
def get_image_text_prompt(args):
367374
vlm_file_list = []
368-
output_data_list, is_json_data = model_utils.get_param_from_file(args, ['media', "prompt"])
375+
output_data_list, is_json_data = model_utils.get_param_from_file(args, ["media", "prompt"])
369376
if is_json_data:
370377
vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list)
371378
if len(vlm_param_list) > 0:
372379
for vlm_file in vlm_param_list:
373-
if args['prompt_file'] is not None and len(args['prompt_file']) > 0:
374-
vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get("media"), args['prompt_file'][0])
380+
if args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'media' in vlm_file:
381+
if 'video' in vlm_file: log.warning('media and video cannot be specified in a single prompt file')
382+
vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get('media'), args['prompt_file'][0])
383+
elif args['prompt_file'] is not None and len(args['prompt_file']) > 0 and 'video' in vlm_file:
384+
vlm_file['video'] = model_utils.resolve_media_file_path(vlm_file.get('video'), args['prompt_file'][0])
375385
vlm_file_list.append(vlm_file)
376-
else:
377-
vlm_file_list.append(output_data_list)
386+
else: vlm_file_list.append(output_data_list)
378387
return vlm_file_list

0 commit comments

Comments
 (0)