1717import llm_bench_utils .output_file
1818import llm_bench_utils .gen_output_data as gen_output_data
1919import llm_bench_utils .parse_json_data as parse_json_data
20+ import llm_bench_utils .prompt_utils as pu
2021from pathlib import Path
2122
22-
2323FW_UTILS = {'pt' : llm_bench_utils .pt_utils , 'ov' : llm_bench_utils .ov_utils }
2424
2525DEFAULT_OUTPUT_TOKEN_SIZE = 512
2626
2727
2828def run_visual_language_generation_optimum (
29- inputs , num , model , processor , args , iter_data_list , md5_list , prompt_index , bench_hook , model_precision , proc_id , mem_consumption
30- ):
29+ inputs , num , model , processor , args , iter_data_list , md5_list ,
30+ prompt_index , bench_hook , model_precision , proc_id , mem_consumption ):
3131 from optimum .intel .utils .import_utils import is_transformers_version
3232 set_seed (args ['seed' ])
3333 if args ['batch_size' ] != 1 :
@@ -37,13 +37,19 @@ def run_visual_language_generation_optimum(
3737 prompts = []
3838 inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
3939 for input_data in inputs :
40- if input_data .get ("media" , None ):
40+ if input_data .get ("video" , None ):
41+ entry = Path (input_data ["video" ])
42+ required_frames = args .get ("video_frames" )
43+ ordered_frames = pu .split_video_into_frames (entry , required_frames )
44+ images .extend (ordered_frames )
45+
46+ elif input_data .get ("media" , None ):
4147 entry = Path (input_data ["media" ])
4248 if entry .is_dir ():
4349 for file in sorted (entry .iterdir ()):
4450 images .append (load_image (str (file )))
45- else :
46- images . append ( load_image ( input_data [ "media" ]))
51+ else : images . append ( load_image ( input_data [ "media" ]))
52+
4753 prompts .append (input_data ["prompt" ])
4854 prefix = '[warm-up]' if num == 0 else '[{}]' .format (num )
4955 log .info (f'{ prefix } [P{ prompt_index } ] Input image nums:{ len (images )} ' )
@@ -198,13 +204,17 @@ def run_visual_language_generation_genai(
198204 prompts = []
199205 inputs = [inputs ] if not isinstance (inputs , (list , tuple )) else inputs
200206 for input_data in inputs :
201- if input_data .get ("media" , None ):
207+ if input_data .get ("video" , None ):
208+ entry = Path (input_data ["media" ])
209+ required_frames = args .get ('video_frames' )
210+ ordered_frames = pu .split_video_into_frames (entry , required_frames )
211+ images .extend (ordered_frames )
212+ elif input_data .get ("media" , None ):
202213 entry = Path (input_data ["media" ])
203214 if entry .is_dir ():
204215 for file in sorted (entry .iterdir ()):
205216 images .append (load_image_genai (str (file )))
206- else :
207- images .append (load_image_genai (input_data ["media" ]))
217+ else : images .append (load_image_genai (input_data ["media" ]))
208218 prompts .append (input_data ["prompt" ])
209219 if args ["output_dir" ] is not None and num == 0 :
210220 for bs_index , in_text in enumerate (prompts ):
@@ -365,14 +375,16 @@ def run_visual_language_generation_benchmark(model_path, framework, device, args
365375
def get_image_text_prompt(args):
    """Build the list of VLM prompt entries from the configured prompt source.

    Reads prompt data via ``model_utils.get_param_from_file``. For JSON prompt
    files, each entry may carry a ``media`` (image path/dir) or ``video`` key;
    relative paths are resolved against the prompt file's location. Specifying
    both ``media`` and ``video`` in one entry is flagged with a warning.

    :param args: benchmark argument dict; uses ``prompt_file`` (list of paths
        or None) to resolve relative media/video locations.
    :return: list of prompt entries (dicts for JSON input, otherwise the raw
        output of ``get_param_from_file`` as a single element).
    """
    vlm_file_list = []
    output_data_list, is_json_data = model_utils.get_param_from_file(args, ["media", "prompt"])
    if is_json_data:
        vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list)
        if len(vlm_param_list) > 0:
            # Hoist the loop-invariant "do we have a prompt file to resolve against" check.
            has_prompt_file = args['prompt_file'] is not None and len(args['prompt_file']) > 0
            for vlm_file in vlm_param_list:
                if has_prompt_file and 'media' in vlm_file:
                    if 'video' in vlm_file:
                        # Entry carries both keys; 'media' wins, warn the user.
                        log.warning('media and video cannot both be specified in a single prompt file')
                    # BUGFIX: resolve and store the 'media' key (original stored 'video'
                    # resolved from the — usually absent — 'video' value, losing the media path).
                    vlm_file['media'] = model_utils.resolve_media_file_path(vlm_file.get('media'), args['prompt_file'][0])
                elif has_prompt_file and 'video' in vlm_file:
                    # BUGFIX: store the resolved video path under 'video' (original wrote it
                    # into 'media', so downstream video handling never saw it).
                    vlm_file['video'] = model_utils.resolve_media_file_path(vlm_file.get('video'), args['prompt_file'][0])
                vlm_file_list.append(vlm_file)
    else:
        vlm_file_list.append(output_data_list)
    return vlm_file_list
0 commit comments