1717import llm_bench_utils .output_file
1818import llm_bench_utils .gen_output_data as gen_output_data
1919import llm_bench_utils .parse_json_data as parse_json_data
20+ import llm_bench_utils .prompt_utils as pu
2021from pathlib import Path
2122
22-
2323FW_UTILS = {'pt' : llm_bench_utils .pt_utils , 'ov' : llm_bench_utils .ov_utils }
2424
2525DEFAULT_OUTPUT_TOKEN_SIZE = 512
2626
2727
2828def run_visual_language_generation_optimum (
29- inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index , bench_hook , model_precision , proc_id , mem_consumption
30- ):
29+ inputs , num , model , processor , args , iter_data_list , md5_list ,
30+ prompt_index , bench_hook , model_precision , proc_id , mem_consumption ):
3131 from optimum .intel .utils .import_utils import is_transformers_version
3232 set_seed (args ['seed' ])
3333 if args ['batch_size' ] != 1 :
@@ -37,13 +37,17 @@ def run_visual_language_generation_optimum(
3737 prompts = []
3838 inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
3939 for input_data in inputs :
40- if input_data .get ("media" , None ):
40+ if input_data .get ("video" , None ):
41+ entry = Path (input_data ["video" ])
42+ required_frames = args .get ("video_frames" )
43+ ordered_frames = pu .split_video_into_frames (entry , required_frames )
44+ images .extend (ordered_frames )
45+ elif input_data .get ("media" , None ):
4146 entry = Path (input_data ["media" ])
4247 if entry .is_dir ():
4348 for file in sorted (entry .iterdir ()):
4449 images .append (load_image (str (file )))
45- else :
46- images .append (load_image (input_data ["media" ]))
50+ else : images .append (load_image (input_data ["media" ]))
4751 prompts .append (input_data ["prompt" ])
4852 prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
4953 log .info (f'{ prefix } [P{ prompt_index } ] Input image nums:{ len (images )} ' )
@@ -198,13 +202,17 @@ def run_visual_language_generation_genai(
198202 prompts = []
199203 inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
200204 for input_data in inputs :
201- if input_data .get ("media" , None ):
205+ if input_data .get ("video" , None ):
206+ entry = Path (input_data ["video" ])
207+ required_frames = args .get ('video_frames' )
208+ ordered_frames = pu .split_video_into_frames (entry , required_frames )
209+ images .extend (ordered_frames )
210+ elif input_data .get ("media" , None ):
202211 entry = Path (input_data ["media" ])
203212 if entry .is_dir ():
204213 for file in sorted (entry .iterdir ()):
205214 images .append (load_image_genai (str (file )))
206- else :
207- images .append (load_image_genai (input_data ["media" ]))
215+ else : images .append (load_image_genai (input_data ["media" ]))
208216 prompts .append (input_data ["prompt" ])
209217 if args ["output_dir" ] is not None and num == 0 :
210218 for bs_index , in_text in enumerate (prompts ):
@@ -365,14 +373,16 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
365373
def get_image_text_prompt(args):
    """Build the list of VLM prompt entries (text plus media/video paths).

    Reads prompt data via ``model_utils.get_param_from_file``. When the data
    came from a JSON prompt file, each entry's ``media`` or ``video`` path is
    resolved relative to the first prompt file so relative paths in the JSON
    work regardless of the current working directory.

    Parameters
    ----------
    args : dict
        Benchmark argument dict; this function reads ``args['prompt_file']``
        (a list of prompt-file paths, or None).

    Returns
    -------
    list
        Parsed per-prompt dicts when JSON input was given, otherwise a
        single-element list wrapping the raw output of
        ``get_param_from_file``.
    """
    vlm_file_list = []
    output_data_list, is_json_data = model_utils.get_param_from_file(args, ["media", "prompt"])
    if not is_json_data:
        # Non-JSON input: pass the raw parameter data through unchanged.
        vlm_file_list.append(output_data_list)
        return vlm_file_list
    vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list)
    # Hoist the duplicated prompt-file presence check out of the loop.
    has_prompt_file = args['prompt_file'] is not None and len(args['prompt_file']) > 0
    for vlm_file in vlm_param_list:
        if has_prompt_file:
            base_file = args['prompt_file'][0]
            if 'media' in vlm_file:
                if 'video' in vlm_file:
                    # 'media' wins when both are present; warn about the conflict.
                    log.warning('media and video cannot be specified in a single prompt file')
                vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get('media'), base_file)
            elif 'video' in vlm_file:
                vlm_file['video'] = model_utils.resolve_media_file_path(vlm_file.get('video'), base_file)
        vlm_file_list.append(vlm_file)
    return vlm_file_list
0 commit comments