1717import llm_bench_utils .output_file
1818import llm_bench_utils .gen_output_data as gen_output_data
1919import llm_bench_utils .parse_json_data as parse_json_data
20+ import llm_bench_utils .prompt_utils as pu
2021from pathlib import Path
2122
22-
2323FW_UTILS = {'pt' : llm_bench_utils .pt_utils , 'ov' : llm_bench_utils .ov_utils }
2424
2525DEFAULT_OUTPUT_TOKEN_SIZE = 512
2626
2727
2828def run_visual_language_generation_optimum (
29- inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index , bench_hook , model_precision , proc_id , mem_consumption
30- ):
29+ inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index ,
30+ bench_hook , model_precision , proc_id , mem_consumption , required_frames = None ):
3131 from optimum .intel .utils .import_utils import is_transformers_version
3232 set_seed (args ['seed' ])
3333 if args ['batch_size' ] != 1 :
@@ -37,13 +37,16 @@ def run_visual_language_generation_optimum(
3737 prompts = []
3838 inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
# Body loop of run_visual_language_generation_optimum: expand each input
# entry into images (video frames, a directory of images, or one image)
# and collect its prompt.
for sample in inputs:
    video_src = sample.get("video", None)
    media_src = sample.get("media", None)
    if video_src:
        # A video entry contributes its frames, in order.
        frames = pu.split_video_into_frames(Path(video_src), required_frames)
        images.extend(frames)
    elif media_src:
        media_path = Path(media_src)
        if media_path.is_dir():
            # A directory contributes every contained file, in sorted order.
            images.extend(load_image(str(child)) for child in sorted(media_path.iterdir()))
        else:
            images.append(load_image(media_src))
    prompts.append(sample["prompt"])
4851 prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
4952 log .info (f'{ prefix } [P{ prompt_index } ] Input image nums:{ len (images )} ' )
@@ -189,22 +192,25 @@ def load_image_genai(image_path):
189192
190193
191194def run_visual_language_generation_genai (
192- inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index , streamer , model_precision , proc_id , mem_consumption
193- ):
195+ inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index ,
196+ streamer , model_precision , proc_id , mem_consumption , required_frames = None ):
194197 if args ['batch_size' ] != 1 :
195198 log .warning ("Only batch size 1 available for benchmarking" )
196199 args ["batch_size" ] = 1
197200 images = []
198201 prompts = []
199202 inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
# Body loop of run_visual_language_generation_genai: expand each input
# entry into images (video frames, a directory of images, or one image)
# and collect its prompt.
for input_data in inputs:
    if input_data.get("video", None):
        # FIX: the video branch must read the "video" key; the patch
        # mistakenly read input_data["media"] here (copy-paste from the
        # media branch), unlike the correct optimum-path counterpart.
        entry = Path(input_data["video"])
        ordered_frames = pu.split_video_into_frames(entry, required_frames)
        images.extend(ordered_frames)
    elif input_data.get("media", None):
        entry = Path(input_data["media"])
        if entry.is_dir():
            # Directory input: load every file in sorted order.
            for file in sorted(entry.iterdir()):
                images.append(load_image_genai(str(file)))
        else:
            images.append(load_image_genai(input_data["media"]))
    prompts.append(input_data["prompt"])
209215 if args ["output_dir" ] is not None and num == 0 :
210216 for bs_index , in_text in enumerate (prompts ):
@@ -304,8 +310,11 @@ def run_visual_language_generation_genai(
304310 metrics_print .print_generated (num , warm_up = (num == 0 ), generated = generated_text [0 ], prompt_idx = prompt_index )
305311
306312
307- def run_visual_language_generation_benchmark (model_path , framework , device , args , num_iters , mem_consumption ):
308- model , processor , pretrain_time , bench_hook , use_genai = FW_UTILS [framework ].create_image_text_gen_model (model_path , device , mem_consumption , ** args )
313+ def run_visual_language_generation_benchmark (
314+ model_path , framework , device , args , num_iters ,
315+ mem_consumption , required_frames = None ):
316+ outs = FW_UTILS [framework ].create_image_text_gen_model (model_path , device , mem_consumption , ** args )
317+ model , processor , pretrain_time , bench_hook , use_genai = outs
309318 model_precision = model_utils .get_model_precision (model_path .parts )
310319 iter_data_list = []
311320 md5_list = {num : {} for num in range (num_iters + 1 )}
@@ -325,10 +334,8 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
325334 log .info (f"Numbeams: { args ['num_beams' ]} , benchmarking iter nums(exclude warm-up): { num_iters } , "
326335 f'prompt nums: { len (image_text_list )} , prompt idx: { prompt_idx_list } ' )
327336
328- if not use_genai :
329- gen_fn = run_visual_language_generation_optimum
330- else :
331- gen_fn = run_visual_language_generation_genai
337+ if use_genai : gen_fn = run_visual_language_generation_genai
338+ else : gen_fn = run_visual_language_generation_optimum
332339
333340 proc_id = os .getpid ()
334341 iter_timestamp = model_utils .init_timestamp (num_iters , image_text_list , prompt_idx_list )
@@ -341,7 +348,7 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
341348 iter_timestamp [num ][p_idx ]['start' ] = datetime .datetime .now ().isoformat ()
342349 gen_fn (
343350 input_text , num , model , processor , args , iter_data_list , md5_list ,
344- p_idx , bench_hook , model_precision , proc_id , mem_consumption )
351+ p_idx , bench_hook , model_precision , proc_id , mem_consumption , required_frames )
345352 iter_timestamp [num ][p_idx ]['end' ] = datetime .datetime .now ().isoformat ()
346353 prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
347354 log .info (f"{ prefix } [P{ p_idx } ] start: { iter_timestamp [num ][p_idx ]['start' ]} , end: { iter_timestamp [num ][p_idx ]['end' ]} " )
@@ -353,8 +360,8 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
353360 log .info (f'[warm-up][P{ p_idx } ] Input text: { input_text } ' )
354361 iter_timestamp [num ][p_idx ]['start' ] = datetime .datetime .now ().isoformat ()
355362 gen_fn (
356- input_text , num , model , processor , args , iter_data_list , md5_list ,
357- prompt_idx_list [ idx ], bench_hook , model_precision , proc_id , mem_consumption )
363+ input_text , num , model , processor , args , iter_data_list , md5_list , prompt_idx_list [ idx ],
364+ bench_hook , model_precision , proc_id , mem_consumption , required_frames )
358365 iter_timestamp [num ][p_idx ]['end' ] = datetime .datetime .now ().isoformat ()
359366 prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
360367 log .info (f"{ prefix } [P{ p_idx } ] start: { iter_timestamp [num ][p_idx ]['start' ]} , end: { iter_timestamp [num ][p_idx ]['end' ]} " )
@@ -365,14 +372,16 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
365372
def get_image_text_prompt(args):
    """Collect VLM prompt entries (prompt text plus media/video paths) from args.

    Reads prompt data via ``model_utils.get_param_from_file``. For JSON input,
    each parsed entry's "media" or "video" path is resolved relative to the
    prompt file; otherwise the raw entries are returned as-is.

    :param args: benchmark argument dict; uses 'prompt_file' for path resolution.
    :return: list of prompt dicts (keys such as "prompt", "media", "video").
    """
    vlm_file_list = []
    output_data_list, is_json_data = model_utils.get_param_from_file(args, ["media", "prompt"])
    if is_json_data:
        vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list)
        # Loop-invariant: whether a prompt file is available to resolve
        # relative media paths against (hoisted out of the per-entry loop).
        has_prompt_file = args['prompt_file'] is not None and len(args['prompt_file']) > 0
        for vlm_file in vlm_param_list:
            if has_prompt_file and 'media' in vlm_file:
                if 'video' in vlm_file:
                    # "media" wins when both are present; tell the user.
                    log.warning('media and video cannot be specified in a single prompt file')
                vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get('media'), args['prompt_file'][0])
            elif has_prompt_file and 'video' in vlm_file:
                vlm_file['video'] = model_utils.resolve_media_file_path(vlm_file.get('video'), args['prompt_file'][0])
            vlm_file_list.append(vlm_file)
    else:
        vlm_file_list.append(output_data_list)
    return vlm_file_list
0 commit comments