Commit c8abea4: "fix sdl"

1 parent 1a6163d commit c8abea4
File tree: 9 files changed, +78 -63 lines changed

tests/python_tests/samples/conftest.py
Lines changed: 2 additions & 1 deletion

@@ -159,7 +159,7 @@
     },
     "tiny-random-ltx-video": {
         "name": "optimum-intel-internal-testing/tiny-random-ltx-video",
-        "convert_args": ["--trust-remote-code"]
+        "convert_args": ["--trust-remote-code"],
    },
 }

@@ -424,6 +424,7 @@ def generate_test_content(request):
         logger.info(f"Removing test content: {file_path}")
         os.remove(file_path)

+
 @pytest.fixture(scope="session")
 def generate_llm_bench_input_generation_jsonl(request):
     """Generate a JSONL file for image generation prompts."""

tests/python_tests/samples/test_tools_llm_benchmark.py
Lines changed: 4 additions & 5 deletions

@@ -337,7 +337,9 @@ def test_python_tool_llm_benchmark_video_prompts(self, download_test_content, co
     @pytest.mark.samples
     @pytest.mark.parametrize("convert_model, sample_args", [
         pytest.param("tiny-random-ltx-video",
-                     ["-d", "cpu", "-n", "1", "--optimum", "--num_steps", "5", "--num_frames", "9", "--frame_rate", 23, "width", 256, "height", 256]),
+                     ["-d", "cpu", "-n", "1", "--optimum", "--num_steps", "5", "--num_frames", "9", "--frame_rate", "23", "width", "256", "height", "256"]),
+        pytest.param("tiny-random-ltx-video",
+                     ["-d", "cpu", "-n", "1", "--num_steps", "4", "--static_reshape", "width", "256", "height", "256", "--optimum"]),
         # pytest.param("tiny-random-ltx-video",
         #              ["-d", "cpu", "-n", "1", "--genai", "--num_steps", "5", "--num_frames", "9", "width", 256, "height", 256]),
     ], indirect=["convert_model"])

@@ -359,10 +361,7 @@ def test_python_tool_llm_benchmark_video_gen(self, convert_model, sample_args):
     @pytest.mark.parametrize("sample_args", [["-d", "cpu", "-n", "1", "--num_steps", "4", "--task", "text-to-video", "--optimum"]])
     @pytest.mark.parametrize("convert_model", ["tiny-random-ltx-video"], indirect=True)
     @pytest.mark.parametrize("generate_llm_bench_input_generation_jsonl", [("video_generation.jsonl", video_generation_json)], indirect=True)
-    def test_python_tool_llm_benchmark_video_gen_json(self, convert_model, download_test_content, generate_llm_bench_input_generation_jsonl, sample_args):
-        # to use the relative media and mask_image paths
-        os.chdir(os.path.dirname(download_test_content))
-
+    def test_python_tool_llm_benchmark_video_gen_json(self, convert_model, generate_llm_bench_input_generation_jsonl, sample_args):
         # Run Python benchmark
         benchmark_script = SAMPLES_PY_DIR / 'llm_bench/benchmark.py'
         benchmark_py_command = [
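The first parametrize fix quotes the bare integers (23, 256): an argv list handed to a subprocess must contain only strings, so an int anywhere in sample_args fails before the benchmark even starts. A minimal standalone illustration of that failure mode (the inline script is just a stand-in for benchmark.py):

    import subprocess
    import sys

    # Works: every element of the argv list is a string.
    subprocess.run([sys.executable, "-c", "import sys; print(sys.argv[1:])", "--frame_rate", "23"])

    # Fails before launching anything:
    # subprocess.run([sys.executable, "-c", "...", "--frame_rate", 23])
    # TypeError: expected str, bytes or os.PathLike object, not int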

tools/llm_bench/benchmark.py
Lines changed: 1 addition & 1 deletion

@@ -199,7 +199,7 @@ def get_argprser():
                         help="Reshape image generation pipeline to specific width & height at pipeline creation time. Applicable for Image Generation.")
     parser.add_argument("--frame_rate", type=float, required=False, help="Frame rate for video generation and saving. Applicable only for Video Generation.")
     parser.add_argument("--num_frames", type=int, required=False, help="Generated guidance scale. Applicable only for Video Generation.")
-    parser.add_argument("--negative_prompt", type=int, required=False, help="Negative prompts for Video Generation.")
+    parser.add_argument("--negative_prompt", type=str, required=False, help="Negative prompts for Video Generation.")
     parser.add_argument('-mi', '--mask_image', default=None,
                         help='Mask image for Inpainting pipelines. Can be directory or path to single image. Applicable for Image Generation.')
     parser.add_argument('-t', '--task', default=None,
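The type fix matters because argparse applies type to the raw string before storing it: with type=int, any real negative prompt would be rejected at parse time. A minimal standalone check:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--negative_prompt", type=str, required=False,
                        help="Negative prompts for Video Generation.")
    args = parser.parse_args(["--negative_prompt", "blurry, low quality"])
    assert args.negative_prompt == "blurry, low quality"
    # With type=int the same invocation exits with:
    # error: argument --negative_prompt: invalid int value: 'blurry, low quality'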

tools/llm_bench/llm_bench_utils/config_class.py
Lines changed: 31 additions & 30 deletions

@@ -12,7 +12,7 @@
     SpeechT5HifiGan,
     AutoModelForSequenceClassification
 )
-from diffusers.pipelines import DiffusionPipeline, LDMSuperResolutionPipeline
+from diffusers.pipelines import DiffusionPipeline, LDMSuperResolutionPipeline, LTXPipeline
 from optimum.intel.openvino import (
     OVModelForCausalLM,
     OVModelForSeq2SeqLM,

@@ -23,15 +23,16 @@
     OVPipelineForImage2Image,
     OVModelForFeatureExtraction,
     OVModelForTextToSpeechSeq2Seq,
-    OVModelForSequenceClassification
+    OVModelForSequenceClassification,
+    OVLTXPipeline,
 )
 from llm_bench_utils.ov_model_classes import OVMPTModel, OVLDMSuperResolutionPipeline, OVChatGLMModel
 from dataclasses import dataclass, field


 @dataclass
 class UseCase:
-    task = ''
+    task = ""
     model_types: list[str] = field(default_factory=list)
     ov_cls: type | None = None
     pt_cls: type | None = AutoModel

@@ -45,17 +46,17 @@ class UseCaseImageGen(UseCase):
     pt_cls: type | None = DiffusionPipeline

 TASK = {
-    "text2img": {"name": 'text-to-image', "ov_cls": OVDiffusionPipeline},
-    "img2img": {"name": 'image-to-image', "ov_cls": OVPipelineForImage2Image},
-    "inpainting": {"name": 'inpainting', "ov_cls": OVPipelineForInpainting}
+    "text2img": {"name": "text-to-image", "ov_cls": OVDiffusionPipeline},
+    "img2img": {"name": "image-to-image", "ov_cls": OVPipelineForImage2Image},
+    "inpainting": {"name": "inpainting", "ov_cls": OVPipelineForInpainting}
 }


 @dataclass
 class UseCaseVideoGen(UseCase):
     task = "video_gen"
-    ov_cls: type | None = OVDiffusionPipeline
-    pt_cls: type | None = DiffusionPipeline
+    ov_cls: type | None = OVLTXPipeline
+    pt_cls: type | None = LTXPipeline


 @dataclass

@@ -81,35 +82,35 @@ class UseCaseTextGen(UseCase):

 @dataclass
 class UseCaseCodeGen(UseCase):
-    task = 'code_gen'
+    task = "code_gen"
     ov_cls: type | None = OVModelForCausalLM
     pt_cls: type | None = AutoModelForCausalLM


 @dataclass
 class UseCaseImageCls(UseCase):
-    task = 'image_cls'
+    task = "image_cls"
     ov_cls: type | None = OVModelForCausalLM
     pt_cls: type | None = AutoModelForCausalLM


 @dataclass
 class UseCaseLDMSuperResolution(UseCase):
-    task = 'ldm_super_resolution'
+    task = "ldm_super_resolution"
     ov_cls: type | None = OVLDMSuperResolutionPipeline
     pt_cls: type | None = LDMSuperResolutionPipeline


 @dataclass
 class UseCaseTextEmbeddings(UseCase):
-    task = 'text_embed'
+    task = "text_embed"
     ov_cls: type | None = OVModelForFeatureExtraction
     pt_cls: type | None = AutoModel


 @dataclass
 class UseCaseTextReranker(UseCase):
-    task = 'text_rerank'
+    task = "text_rerank"
     ov_cls: type | None = OVModelForSequenceClassification
     pt_cls: type | None = AutoModelForSequenceClassification


@@ -125,36 +126,36 @@ def is_qwen_causallm_arch(config):

 @dataclass
 class UseCaseTextToSpeech(UseCase):
-    task = 'text_to_speech'
+    task = "text_to_speech"
     ov_cls: type | None = OVModelForTextToSpeechSeq2Seq
     pt_cls: type | None = SpeechT5ForTextToSpeech
     tokenizer_cls: type = SpeechT5Processor
     vocoder_cls: type = SpeechT5HifiGan


 USE_CASES = {
-    'image_gen': [UseCaseImageGen(['stable-diffusion-', 'ssd-', 'tiny-sd', 'small-sd', 'lcm-', 'sdxl', 'dreamlike', "flux"])],
-    'video_gen': [UseCaseVideoGen('ltx')],
+    "image_gen": [UseCaseImageGen(["stable-diffusion-", "ssd-", "tiny-sd", "small-sd", "lcm-", "sdxl", "dreamlike", "flux"])],
+    "video_gen": [UseCaseVideoGen(["ltx"])],
     "visual_text_gen": [UseCaseVLM(["llava", "llava-next", "qwen2-vl", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v",
                                     "minicpm-v", "minicpmo", "maira2", "qwen2-5-vl", "smolvlm"])],
-    'speech_to_text': [UseCaseSpeech2Text(['whisper'])],
-    'image_cls': [UseCaseImageCls(['vit'])],
-    'code_gen': [UseCaseCodeGen(["codegen", "codegen2", "stable-code"]),
-                 UseCaseCodeGen(['replit'], ov_cls=OVMPTModel),
-                 UseCaseCodeGen(['codet5'], ov_cls=OVModelForSeq2SeqLM)],
-    'text_gen': [UseCaseTextGen(['arcee', "decoder", "falcon", "glm", "aquila", "gpt", "gpt-", "gpt2", "open-llama", "openchat", "neural-chat", "llama",
+    "speech_to_text": [UseCaseSpeech2Text(["whisper"])],
+    "image_cls": [UseCaseImageCls(["vit"])],
+    "code_gen": [UseCaseCodeGen(["codegen", "codegen2", "stable-code"]),
+                 UseCaseCodeGen(["replit"], ov_cls=OVMPTModel),
+                 UseCaseCodeGen(["codet5"], ov_cls=OVModelForSeq2SeqLM)],
+    "text_gen": [UseCaseTextGen(["arcee", "decoder", "falcon", "glm", "aquila", "gpt", "gpt-", "gpt2", "open-llama", "openchat", "neural-chat", "llama",
                                  "tiny-llama", "tinyllama", "opt", "opt-", "pythia", "pythia-", "stablelm", "stablelm-", "stable-zephyr-", "rocket-",
                                  "vicuna", "dolly", "bloom", "red-pajama", "xgen", "longchat", "jais", "orca-mini", "baichuan", "qwen", "zephyr",
                                  "mistral", "mixtral", "phi", "phi2-", "minicpm", "gemma", "deci", "phi3", "internlm", "olmo", "starcoder", "instruct-gpt",
                                  "granite", "granitemoe", "gptj", "yi-"]),
-                 UseCaseTextGen(['t5'], ov_cls=OVModelForSeq2SeqLM, pt_cls=T5ForConditionalGeneration),
-                 UseCaseTextGen(['mpt'], OVMPTModel),
-                 UseCaseTextGen(['blenderbot'], ov_cls=OVModelForSeq2SeqLM, pt_cls=BlenderbotForConditionalGeneration),
-                 UseCaseTextGen(['chatglm'], ov_cls=OVChatGLMModel, pt_cls=AutoModel)],
-    'ldm_super_resolution': [UseCaseLDMSuperResolution(['ldm-super-resolution'])],
-    'text_embed': [UseCaseTextEmbeddings(["qwen3", "bge", "bert", "albert", "roberta", "xlm-roberta"])],
-    'text_rerank': [UseCaseTextReranker(["qwen3", "bge", "bert", "albert", "roberta", "xlm-roberta"])],
-    'text_to_speech': [UseCaseTextToSpeech(['speecht5'])],
+                 UseCaseTextGen(["t5"], ov_cls=OVModelForSeq2SeqLM, pt_cls=T5ForConditionalGeneration),
+                 UseCaseTextGen(["mpt"], OVMPTModel),
+                 UseCaseTextGen(["blenderbot"], ov_cls=OVModelForSeq2SeqLM, pt_cls=BlenderbotForConditionalGeneration),
+                 UseCaseTextGen(["chatglm"], ov_cls=OVChatGLMModel, pt_cls=AutoModel)],
+    "ldm_super_resolution": [UseCaseLDMSuperResolution(["ldm-super-resolution"])],
+    "text_embed": [UseCaseTextEmbeddings(["qwen3", "bge", "bert", "albert", "roberta", "xlm-roberta"])],
+    "text_rerank": [UseCaseTextReranker(["qwen3", "bge", "bert", "albert", "roberta", "xlm-roberta"])],
+    "text_to_speech": [UseCaseTextToSpeech(["speecht5"])],
 }

 PA_ATTENTION_BACKEND = "PA"

tools/llm_bench/llm_bench_utils/model_utils.py
Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ def get_param_from_file(args, input_key):
         elif args['use_case'].task == 'image_gen':
             data_list.append('sailing ship in storm by Leonardo da Vinci')
         elif args['use_case'].task == 'video_gen':
-            data_dict["prompt"] = 'cat plays with ball on the christmas tree'
+            data_list["prompt"] = 'cat plays with ball on the christmas tree'
         else:
             raise RuntimeError(f'== {input_key} and prompt file is empty ==')
     elif args[input_key] is not None and args['prompt_file'] is not None:
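A standalone note on the two container types appearing in this hunk (plain-Python behavior, not specific to this repo): data_list is built with .append() in the branches above, while string-keyed assignment is only defined for mappings.

    data_list = []
    data_list.append("sailing ship in storm by Leonardo da Vinci")  # list: positional append
    data_dict = {}
    data_dict["prompt"] = "cat plays with ball on the christmas tree"  # dict: keyed assignment
    # data_list["prompt"] = "..." would raise:
    # TypeError: list indices must be integers or slices, not str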

tools/llm_bench/llm_bench_utils/ov_utils.py
Lines changed: 3 additions & 3 deletions

@@ -1271,11 +1271,11 @@ def create_video_gen_model(model_path, device, memory_data_collector, **kwargs):
     start = time.perf_counter()
     if kwargs.get("static_reshape", False):
         ov_model = model_class.from_pretrained(model_path, device=device, ov_config=ov_config, compile=False)
-        num_images_per_prompt = kwargs.get("batch_size", 1)
         height = kwargs.get("height", 512)
         width = kwargs.get("width", 512)
-        log.info(f"Video Pipeline reshape(batch_size=1, height={height}, width={width}, num_images_per_prompt={num_images_per_prompt})")
-        ov_model.reshape(batch_size=1, height=height, width=width, num_images_per_prompt=num_images_per_prompt)
+        num_frames = kwargs.get("num_frames", 25)
+        log.info(f"Video Pipeline reshape(batch_size=1, height={height}, width={width}, num_frames={num_frames})")
+        ov_model.reshape(batch_size=1, height=height, width=width, num_frames=num_frames)
         ov_model.compile()
     else:
         ov_model = model_class.from_pretrained(model_path, device=device, ov_config=ov_config)
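A usage sketch of the static-reshape path above; the model path and shape values are placeholders, while the from_pretrained/reshape/compile calls mirror the hunk:

    from optimum.intel.openvino import OVLTXPipeline

    # Defer compilation so the dynamic dimensions can be fixed first.
    pipe = OVLTXPipeline.from_pretrained("./tiny-random-ltx-video-ov", compile=False)
    # Video pipelines are reshaped over num_frames; the removed code reused the
    # image pipelines' num_images_per_prompt, which does not apply here.
    pipe.reshape(batch_size=1, height=256, width=256, num_frames=9)
    pipe.compile()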

tools/llm_bench/task/image_generation.py
Lines changed: 1 addition & 1 deletion

@@ -235,7 +235,7 @@ def run_image_generation_benchmark(model_path, framework, device, args, num_iter

     if framework == "ov" and not use_genai:
         stable_diffusion_hook.new_text_encoder(pipe)
-        stable_diffusion_hook.new_unet(pipe)
+        stable_diffusion_hook.new_main_model(pipe)
         stable_diffusion_hook.new_vae_decoder(pipe)

     log.info(f'Benchmarking iter nums(exclude warm-up): {num_iters}, prompt nums: {len(image_list)}, prompt idx: {prompt_idx_list}')
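The rename from new_unet to new_main_model reflects that not every diffusion pipeline denoises with a UNet; DiT-style pipelines such as Flux and LTX expose a transformer instead. A hypothetical, standalone illustration of the generalization (not the repo's hook implementation):

    def get_main_model(pipe):
        # SD-family pipelines carry pipe.unet; DiT-family ones carry pipe.transformer.
        for attr in ("unet", "transformer"):
            model = getattr(pipe, attr, None)
            if model is not None:
                return model
        raise AttributeError("pipeline exposes neither 'unet' nor 'transformer'")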

tools/llm_bench/task/pipeline_utils.py
Lines changed: 2 additions & 1 deletion

@@ -228,7 +228,8 @@ def collect_prompts_step(args, get_prompt_fn):
     return text_list, prompt_idx_list


-def launch(pipeline: CommonPipeline, iter_num: int, prompt_idx: int, iter_timestamp: dict, input_item: str|dict, proc_id: int, bench_hook: object | None) -> dict:
+def launch(pipeline: CommonPipeline, iter_num: int, prompt_idx: int, iter_timestamp: dict, input_item: str | dict,
+           proc_id: int, bench_hook: object | None) -> dict:
     iter_timestamp[iter_num][prompt_idx]["start"] = datetime.datetime.now().isoformat()
     iter_data, _ = pipeline.run(input_item, iter_num, prompt_idx, proc_id, bench_hook)
     iter_timestamp[iter_num][prompt_idx]["end"] = datetime.datetime.now().isoformat()

tools/llm_bench/task/video_generation.py
Lines changed: 33 additions & 20 deletions

@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2023-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-import copy
-# import hashlib
 import logging as log

 from typing import Any

@@ -20,7 +18,7 @@
 from llm_bench_utils.prompt_utils import get_video_gen_prompt
 from task.pipeline_utils import CommonPipeline, execution_time_in_sec, collect_prompts_step, iteration_step

-FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils}
+FW_UTILS = {"pt": llm_bench_utils.pt_utils, "ov": llm_bench_utils.ov_utils}

 DEFAULT_NUM_FRAMES = 25
 DEFAULT_INFERENCE_STEPS = 25

@@ -29,28 +27,41 @@
 DEFAULT_FRAME_RATE = 25


-def collect_input_args(input_param: dict, width: int = None, height: int = None, num_steps: int = None, num_frames: int = None, frame_rate: int = None):
+def collect_input_args(
+    input_param: dict,
+    width: int = None,
+    height: int = None,
+    num_steps: int = None,
+    num_frames: int = None,
+    frame_rate: int = None,
+):
     input_args = {}
+    input_args["width"] = input_param.get("width", width or DEFAULT_IMAGE_WIDTH)
+    input_args["height"] = input_param.get("height", height or DEFAULT_IMAGE_HEIGHT)
+    input_args["num_inference_steps"] = input_param.get("num_steps", num_steps or DEFAULT_INFERENCE_STEPS)
+    input_args["num_frames"] = input_param.get("num_frames", num_frames or DEFAULT_NUM_FRAMES)
+    input_args["frame_rate"] = input_param.get("frame_rate", frame_rate or DEFAULT_FRAME_RATE)

-    input_args["width"] = input_param.get('width', width or DEFAULT_IMAGE_WIDTH)
-    input_args["height"] = input_param.get('height', height or DEFAULT_IMAGE_HEIGHT)
-    input_args["num_inference_steps"] = input_param.get('num_steps', num_steps or DEFAULT_INFERENCE_STEPS)
-    input_args["num_frames"] = input_param.get('num_frames', num_frames or DEFAULT_NUM_FRAMES)
-    input_args["frame_rate"] = input_param.get('frame_rate', frame_rate or DEFAULT_FRAME_RATE)
-
-    guidance_scale = input_param.get('guidance_scale')
+    guidance_scale = input_param.get("guidance_scale")
     if guidance_scale is not None:
         input_args["guidance_scale"] = guidance_scale
-    guidance_rescale = input_param.get('guidance_scale')
+    guidance_rescale = input_param.get("guidance_scale")
     if guidance_rescale is not None:
         input_args["guidance_rescale"] = guidance_rescale

     return input_args


 class TextToVideoOptimum(CommonPipeline):
-    def __init__(self, model: object, tokenizer: object | None, args: dict, model_path: Path,
-                 mem_consumption_meter: MemMonitorWrapper, time_collection_hook: StableDiffusionHook):
+    def __init__(
+        self,
+        model: object,
+        tokenizer: object | None,
+        args: dict,
+        model_path: Path,
+        mem_consumption_meter: MemMonitorWrapper,
+        time_collection_hook: StableDiffusionHook,
+    ):
         super().__init__(model, tokenizer, args, model_path, mem_consumption_meter)
         self.genai = False

@@ -69,12 +80,14 @@ def generate(self, input_data: Any, **kwargs):

     def print_batch_size_info(self, iter_num: int, input_args: dict):
         out_str = "[warm-up]" if iter_num == 0 else "[{}]".format(iter_num)
-        out_str = f"Input params: Batch_size={self.batch_size}, " \
-                  f"steps={self.num_steps}, width={input_args['width']}, " \
-                  f"height={input_args['height']}, frame number={input_args['num_frames']}"
-        if input_args.get('guidance_scale'):
+        out_str = (
+            f"Input params: Batch_size={self.batch_size}, "
+            f"steps={self.num_steps}, width={input_args['width']}, "
+            f"height={input_args['height']}, frame number={input_args['num_frames']}"
+        )
+        if input_args.get("guidance_scale"):
             out_str += f", guidance_scale={input_args['guidance_scale']}"
-        if input_args.get('guidance_rescale'):
+        if input_args.get("guidance_rescale"):
             out_str += f", guidance_rescale={input_args['guidance_rescale']}"
         log.info(out_str)

@@ -196,5 +209,5 @@ def run_video_generation_benchmark(model_path, framework, device, args, num_iter

     iter_data_list, iter_timestamp = iteration_step(image_gen_pipeline, num_iters, text_list, prompt_idx_list, bench_hook=None, subsequent=args['subsequent'])

-    metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], False)
+    metrics_print.print_average(iter_data_list, prompt_idx_list, args["batch_size"], False)
     return iter_data_list, pretrain_time, iter_timestamp
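The reordered collect_input_args keeps a three-level precedence for each generation parameter: a value in the per-prompt input dict wins, then the CLI argument, then the module default. A worked example of the num_frames chain, using the same expression shape as the function:

    DEFAULT_NUM_FRAMES = 25

    # Prompt-file entry present, CLI flag absent:
    input_param, cli_num_frames = {"num_frames": 9}, None
    assert input_param.get("num_frames", cli_num_frames or DEFAULT_NUM_FRAMES) == 9

    # No prompt entry, CLI flag present:
    input_param, cli_num_frames = {}, 16
    assert input_param.get("num_frames", cli_num_frames or DEFAULT_NUM_FRAMES) == 16

    # Neither: the module default applies.
    input_param, cli_num_frames = {}, None
    assert input_param.get("num_frames", cli_num_frames or DEFAULT_NUM_FRAMES) == 25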
