Skip to content
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions samples/cpp/video_generation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,29 @@ install(TARGETS text2video
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)

# create LoRA sample executable
# Reuses imwrite_video.cpp from the plain text2video sample for AVI output.
add_executable(lora_text2video lora_text2video.cpp imwrite_video.cpp)

# CMAKE_BINARY_DIR: generated headers; image_generation: progress_bar.hpp; src/cpp/src: internal headers.
target_include_directories(lora_text2video PRIVATE ${CMAKE_BINARY_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/../image_generation/" "${CMAKE_CURRENT_SOURCE_DIR}/../../../src/cpp/src/")
ov_genai_link_opencv(lora_text2video core imgproc videoio imgcodecs)
target_link_libraries(lora_text2video PRIVATE openvino::genai indicators::indicators)

# Make the installed binary find openvino_genai libs relative to its own location.
if(UNIX AND NOT APPLE)
set_target_properties(lora_text2video PROPERTIES
INSTALL_RPATH "$ORIGIN/../lib"
)
elseif(APPLE)
set_target_properties(lora_text2video PROPERTIES
INSTALL_RPATH "@loader_path/../lib"
)
endif()

set_target_properties(lora_text2video PROPERTIES
# Ensure out of box LC_RPATH on macOS with SIP
INSTALL_RPATH_USE_LINK_PATH ON)

# Installed only as part of the optional samples_bin component (EXCLUDE_FROM_ALL).
install(TARGETS lora_text2video
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
19 changes: 19 additions & 0 deletions samples/cpp/video_generation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,25 @@ GPUs usually provide better performance compared to CPUs. Modify the source code
./text2video ltx_video_ov/INT8 "A woman with long brown hair and light skin smiles at another woman with long blonde hair"
```

### LoRA Text to Video Sample (`lora_text2video.cpp`)

- **Description:**
Video generation with LoRA adapters using a text-to-video model. This sample demonstrates how to generate videos from text prompts while applying multiple LoRA adapters.

Recommended models: Lightricks/LTX-Video

- **Main Feature:** Apply LoRA adapters to a text-to-video pipeline for customized generation.

- **Run Command:**
```bash
./lora_text2video model_dir prompt [lora_adapter_path] [alpha] ...
```

Example:
```bash
./lora_text2video ltx_video_ov/INT8 "A woman with long brown hair and light skin smiles at another woman with long blonde hair" adapter1.safetensors 1.0 adapter2.safetensors 0.5
```

The sample will generate two video files in the current directory: `lora_video.avi` (with the LoRA adapters applied) and `baseline_video.avi` (generated without adapters, for comparison).

Users can modify the source code to experiment with different generation parameters:
Expand Down
78 changes: 78 additions & 0 deletions samples/cpp/video_generation/lora_text2video.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright (C) 2025-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <cstdlib>
#include <filesystem>
#include <iostream>
#include <memory>
#include <random>
#include <string>

#include "progress_bar.hpp"
#include "imwrite_video.hpp"

#include <openvino/genai/video_generation/text2video_pipeline.hpp>

// Entry point: <MODEL_DIR> '<PROMPT>' followed by zero or more (LoRA path, alpha) pairs.
// Generates two videos from the same prompt: one with the adapters applied
// (lora_video.avi) and one without (baseline_video.avi) for easy comparison.
int main(int argc, char* argv[]) try {
    // Fixed: usage string previously ended with a stray extra ']'.
    OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>' [<LORA_SAFETENSORS> <ALPHA> ...]");

    std::filesystem::path models_dir = argv[1];
    std::string prompt = argv[2];

    const std::string device = "CPU"; // GPU can be used as well
    const float frame_rate = 25.0f;

    ov::genai::AdapterConfig adapter_config;
    // Multiple LoRA adapters applied simultaneously are supported, parse them all and corresponding alphas from cmd parameters:
    for(int i = 0; i < (argc - 3) / 2; ++i) {
        ov::genai::Adapter adapter(argv[3 + 2*i]);
        // std::stof throws on a malformed alpha (caught below) instead of
        // silently yielding 0.0 like std::atof would.
        float alpha = std::stof(argv[3 + 2*i + 1]);
        adapter_config.add(adapter, alpha);
    }

    // LoRA adapters passed to the constructor will be activated by default in next generates
    ov::genai::Text2VideoPipeline pipe(models_dir, device, ov::genai::adapters(adapter_config));

    std::cout << "Generating video with LoRA adapters applied, resulting video will be in lora_video.avi\n";
    auto output = pipe.generate(
        prompt,
        ov::genai::negative_prompt("worst quality, inconsistent motion, blurry, jittery, distorted"),
        ov::genai::height(480),
        ov::genai::width(704),
        ov::genai::num_frames(161),
        ov::genai::num_inference_steps(25),
        ov::genai::num_videos_per_prompt(1),
        ov::genai::callback(progress_bar),
        ov::genai::frame_rate(frame_rate),
        ov::genai::guidance_scale(3)
    );

    save_video("lora_video.avi", output.video, frame_rate);

    std::cout << "Generating video without LoRA adapters applied, resulting video will be in baseline_video.avi\n";
    output = pipe.generate(
        prompt,
        ov::genai::adapters(), // passing adapters in generate overrides adapters set in the constructor; adapters() means no adapters
        ov::genai::negative_prompt("worst quality, inconsistent motion, blurry, jittery, distorted"),
        ov::genai::height(480),
        ov::genai::width(704),
        ov::genai::num_frames(161),
        ov::genai::num_inference_steps(25),
        ov::genai::num_videos_per_prompt(1),
        ov::genai::callback(progress_bar),
        ov::genai::frame_rate(frame_rate),
        ov::genai::guidance_scale(3)
    );

    save_video("baseline_video.avi", output.video, frame_rate);

    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
}
21 changes: 20 additions & 1 deletion samples/python/video_generation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,26 @@ pip install --upgrade-strategy eager -r ../../deployment-requirements.txt

Example:
```bash
python text2video.py ./ltx_video_ov/INT8 "A woman with long brown hair and light skin smiles at another woman with long blonde hair"
python text2video.py ./ltx_video_ov/INT8 "A cute golden retriever puppy running in a green grassy field on a sunny day, high quality, photorealistic"
```

### LoRA Text to Video Sample (`lora_text2video.py`)

- **Description:**
Video generation with LoRA adapters using a text-to-video model. This sample demonstrates how to generate videos from text prompts while applying a LoRA adapter.

Recommended models: Lightricks/LTX-Video

- **Main Feature:** Apply a LoRA adapter to a text-to-video pipeline for customized generation.

- **Run Command:**
```bash
python lora_text2video.py model_dir prompt lora_adapter_path
```

Example:
```bash
python lora_text2video.py ./ltx_video_ov/INT8 "A cute golden retriever puppy running in a green grassy field on a sunny day, high quality, photorealistic" adapter.safetensors
```

The sample will generate a video file `genai_video.avi` in the current directory.
Expand Down
77 changes: 77 additions & 0 deletions samples/python/video_generation/lora_text2video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
# Copyright (C) 2025-2026 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import cv2
import openvino_genai


def save_video(filename: str, video_tensor, fps: int = 25):
    """Write a generated video tensor to disk as Motion-JPEG AVI file(s).

    Args:
        filename: Output file name. For batch sizes greater than one, a
            ``_b<index>`` suffix is inserted before the extension per item.
        video_tensor: Tensor shaped (batch, frames, height, width, channels)
            holding RGB frames (e.g. ``output.video`` from the pipeline).
        fps: Playback frame rate written into the container.

    Raises:
        RuntimeError: If an output file cannot be opened for writing.
    """
    batch_size, num_frames, height, width, _ = video_tensor.shape
    video_data = video_tensor.data

    for b in range(batch_size):
        if batch_size == 1:
            output_path = filename
        else:
            base, ext = filename.rsplit(".", 1) if "." in filename else (filename, "avi")
            output_path = f"{base}_b{b}.{ext}"

        fourcc = cv2.VideoWriter_fourcc(*"MJPG")
        writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        # VideoWriter silently produces no output when it fails to open;
        # fail loudly instead of writing an empty/missing file.
        if not writer.isOpened():
            raise RuntimeError(f"Failed to open {output_path} for writing")

        try:
            for f in range(num_frames):
                # Pipeline frames are RGB; OpenCV expects BGR ordering.
                frame_bgr = cv2.cvtColor(video_data[b, f], cv2.COLOR_RGB2BGR)
                writer.write(frame_bgr)
        finally:
            # Release even if conversion/writing raises mid-loop.
            writer.release()
        print(f"Wrote {output_path} ({num_frames} frames, {width}x{height} @ {fps} fps)")


def main():
    """Parse CLI arguments, generate a video from a text prompt with a LoRA
    adapter applied, save it to ``genai_video.avi`` and print perf metrics."""
    parser = argparse.ArgumentParser(
        description="Generate video from text prompt using OpenVINO GenAI with LoRA adapters"
    )
    parser.add_argument("model_dir", help="Path to the model directory")
    parser.add_argument("prompt", help="Text prompt for video generation")
    parser.add_argument("lora_adapter", help="Path to the LoRA adapter file (.safetensors)")
    args = parser.parse_args()

    # Load adapter (default alpha is used since none is passed to add())
    adapter_config = openvino_genai.AdapterConfig()
    adapter_config.add(openvino_genai.Adapter(args.lora_adapter))

    pipe = openvino_genai.Text2VideoPipeline(args.model_dir, "CPU", adapters=adapter_config)  # GPU can be used as well

    frame_rate = 25

    def callback(step, num_steps, latent):
        # Progress reporting only; returning False tells the pipeline to continue.
        print(f"Generation step {step + 1} / {num_steps}")
        return False

    output = pipe.generate(
        args.prompt,
        negative_prompt="worst quality, inconsistent motion, blurry, jittery, distorted",
        height=480,
        width=704,
        num_frames=161,
        num_inference_steps=25,
        num_videos_per_prompt=1,
        callback=callback,
        frame_rate=frame_rate,
        guidance_scale=3,
        adapters=adapter_config,
    )

    save_video("genai_video.avi", output.video, frame_rate)

    # Plain string: the former f-prefix had no placeholders and was a no-op.
    print("\nPerformance metrics:")
    print(f"  Load time: {output.perf_metrics.get_load_time():.2f} ms")
    print(f"  Generate duration: {output.perf_metrics.get_generate_duration():.2f} ms")
    print(f"  Transformer duration: {output.perf_metrics.get_transformer_infer_duration().mean:.2f} ms")
    print(f"  VAE decoder duration: {output.perf_metrics.get_vae_decoder_infer_duration():.2f} ms")


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <optional>

#include "openvino/genai/image_generation/generation_config.hpp"

#include "openvino/genai/lora_adapter.hpp"

namespace ov::genai {
/**
Expand Down Expand Up @@ -56,6 +56,9 @@ struct VideoGenerationConfig {
/// Video frame rate. Affects rope_interpolation_scale. Any value can be used although positive
/// non-infinity makes the most sense. NaN corresponds to model default which is 25.0f for LTX-Video.
std::optional<float> frame_rate = std::nullopt;

/// LoRA adapters applied during generation.
std::optional<AdapterConfig> adapters = std::nullopt;
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "openvino/runtime/infer_request.hpp"
#include "openvino/runtime/properties.hpp"
#include "openvino/runtime/tensor.hpp"
#include "openvino/genai/lora_adapter.hpp"
#include "openvino/genai/visibility.hpp"

namespace ov::genai {
Expand Down Expand Up @@ -46,6 +47,8 @@ class OPENVINO_GENAI_EXPORTS LTXVideoTransformer3DModel {

void set_hidden_states(const std::string& tensor_name, const ov::Tensor& encoder_hidden_states);

void set_adapters(const std::optional<AdapterConfig>& adapters);

ov::Tensor infer(const ov::Tensor& latent, const ov::Tensor& timestep);

LTXVideoTransformer3DModel& reshape(int64_t batch_size, int64_t num_frames, int64_t height, int64_t width, int64_t tokenizer_model_max_length);
Expand All @@ -58,6 +61,7 @@ class OPENVINO_GENAI_EXPORTS LTXVideoTransformer3DModel {
std::shared_ptr<Inference> m_impl;

Config m_config;
AdapterController m_adapter_controller;
ov::InferRequest m_request;
std::shared_ptr<ov::Model> m_model;
size_t m_expected_batch_size = 0;
Expand Down
2 changes: 2 additions & 0 deletions src/cpp/src/video_generation/generation_config_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ void update_generation_config(VideoGenerationConfig& config, const ov::AnyMap& p
read_anymap_param(properties, "num_inference_steps", config.num_inference_steps);
read_anymap_param(properties, "max_sequence_length", config.max_sequence_length);

read_anymap_param(properties, "adapters", config.adapters);

// 'generator' has higher priority than 'seed' parameter
const bool have_generator_param =
properties.find(ov::genai::generator.name()) != properties.end();
Expand Down
8 changes: 5 additions & 3 deletions src/cpp/src/video_generation/ltx_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,9 +422,9 @@ class Text2VideoPipeline::LTXPipeline {
}

void rebuild_models() {
m_t5_text_encoder = std::make_shared<T5EncoderModel>(m_models_dir / "text_encoder");
m_transformer = std::make_shared<LTXVideoTransformer3DModel>(m_models_dir / "transformer");
m_vae = std::make_shared<AutoencoderKLLTXVideo>(m_models_dir / "vae_decoder");
m_t5_text_encoder = std::make_shared<T5EncoderModel>(m_models_dir / "text_encoder", m_text_encode_device, m_compile_properties);
m_transformer = std::make_shared<LTXVideoTransformer3DModel>(m_models_dir / "transformer", m_denoise_device, m_compile_properties);
m_vae = std::make_shared<AutoencoderKLLTXVideo>(m_models_dir / "vae_decoder", m_vae_device, m_compile_properties);
}

void reshape_models(const VideoGenerationConfig& generation_config, size_t batch_size_multiplier) {
Expand Down Expand Up @@ -499,6 +499,8 @@ class Text2VideoPipeline::LTXPipeline {
const auto& transformer_config = m_transformer->get_config();
check_inputs(merged_generation_config, vae_scale_factor);

m_transformer->set_adapters(merged_generation_config.adapters);

// use callback if defined
std::shared_ptr<ThreadedCallbackWrapper> callback_ptr = nullptr;
auto callback_iter = properties.find(ov::genai::callback.name());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ LTXVideoTransformer3DModel& LTXVideoTransformer3DModel::compile(const std::strin
OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
std::optional<AdapterConfig> adapters;
auto filtered_properties = extract_adapters_from_properties(properties, &adapters);
OPENVINO_ASSERT(!adapters, "Adapters are not currently supported for Video Generation Pipeline.");
if (adapters) {
adapters->set_tensor_name_prefix(adapters->get_tensor_name_prefix().value_or("transformer"));
m_adapter_controller = AdapterController(m_model, *adapters, device);
}
ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, *filtered_properties);
ov::genai::utils::print_compiled_model_properties(compiled_model, "LTX Video Transformer 3D model");
m_request = compiled_model.create_infer_request();
Expand All @@ -87,6 +90,15 @@ void LTXVideoTransformer3DModel::set_hidden_states(const std::string& tensor_nam
m_request.set_tensor(tensor_name, encoder_hidden_states);
}

// Applies the given LoRA adapter configuration to the compiled transformer's
// infer request. When `adapters` is empty (std::nullopt) this is a no-op:
// any previously applied adapter state on the request is left untouched.
// Requires that adapters were supplied at construction/compilation time so
// that m_adapter_controller was initialized.
void LTXVideoTransformer3DModel::set_adapters(const std::optional<AdapterConfig>& adapters) {
    OPENVINO_ASSERT(m_request, "Transformer model must be compiled first");
    if (adapters) {
        // The controller is only created when adapters were passed to compile();
        // applying adapters to a model compiled without them is an error.
        OPENVINO_ASSERT(m_adapter_controller,
            "Adapter controller is not initialized. Adapters must be provided during model construction or compilation to enable adapter support.");
        m_adapter_controller.apply(m_request, *adapters);
    }
}

ov::Tensor LTXVideoTransformer3DModel::infer(const ov::Tensor& latent_model_input, const ov::Tensor& timestep) {
OPENVINO_ASSERT(m_request, "Transformer model must be compiled first. Cannot infer non-compiled model");

Expand Down
2 changes: 2 additions & 0 deletions src/python/openvino_genai/py_openvino_genai.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2009,6 +2009,8 @@ class LTXVideoTransformer3DModel:
width (int): Video width.
tokenizer_model_max_length (int): Maximum sequence length for tokenizer.
"""
def set_adapters(self, adapters: openvino_genai.py_openvino_genai.AdapterConfig | None) -> None:
...
def set_hidden_states(self, tensor_name: str, encoder_hidden_states: openvino._pyopenvino.Tensor) -> None:
"""
Sets encoder hidden states tensor.
Expand Down
3 changes: 3 additions & 0 deletions src/python/py_video_generation_models.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ void init_ltx_video_transformer_3d_model(py::module_& m) {
tensor_name (str): Name of the tensor input.
encoder_hidden_states (ov.Tensor): Hidden states from text encoder.
)")
.def("set_adapters",
&ov::genai::LTXVideoTransformer3DModel::set_adapters,
py::arg("adapters"))
.def("infer",
&ov::genai::LTXVideoTransformer3DModel::infer,
py::call_guard<py::gil_scoped_release>(),
Expand Down
1 change: 1 addition & 0 deletions tests/python_tests/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ librosa==0.11.0
soundfile==0.13.1
datasets==4.1.1; sys_platform == "linux"
datasets==3.6.0; sys_platform != "linux"
sentence-transformers>=2.2.2,<=5.2.2
torchcodec==0.7.0; sys_platform == "linux"
rouge==1.0.1
# - microsoft/Phi-4-multimodal-instruct
Expand Down
Loading