diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
index c9689b2ea1..afa9dc3178 100644
--- a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
@@ -3,100 +3,14 @@
 
 #pragma once
 
-#include <filesystem>
-#include <string>
-
-#include "openvino/genai/visibility.hpp"
-#include "openvino/genai/tokenizer.hpp"
-#include "openvino/genai/lora_adapter.hpp"
-
-#include "openvino/core/any.hpp"
-#include "openvino/runtime/tensor.hpp"
-#include "openvino/runtime/infer_request.hpp"
-#include "openvino/runtime/properties.hpp"
+#include "openvino/genai/image_generation/clip_text_model.hpp"
 
 namespace ov {
 namespace genai {
 
-class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection {
+class CLIPTextModelWithProjection : public CLIPTextModel {
 public:
-    struct OPENVINO_GENAI_EXPORTS Config {
-        size_t max_position_embeddings = 77;
-        size_t num_hidden_layers = 32;
-
-        explicit Config(const std::filesystem::path& config_path);
-    };
-
-    explicit CLIPTextModelWithProjection(const std::filesystem::path& root_dir);
-
-    CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
-                                const std::string& device,
-                                const ov::AnyMap& properties = {});
-
-    CLIPTextModelWithProjection(const std::string& model,
-                                const Tensor& weights,
-                                const Config& config,
-                                const Tokenizer& clip_tokenizer);
-
-    CLIPTextModelWithProjection(const std::string& model,
-                                const Tensor& weights,
-                                const Config& config,
-                                const Tokenizer& clip_tokenizer,
-                                const std::string& device,
-                                const ov::AnyMap& properties = {});
-
-    template <typename... Properties,
-              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
-    CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
-                                const std::string& device,
-                                Properties&&... properties)
-        : CLIPTextModelWithProjection(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
-
-    template <typename... Properties,
-              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
-    CLIPTextModelWithProjection(const std::string& model,
-                                const Tensor& weights,
-                                const Config& config,
-                                const Tokenizer& clip_tokenizer,
-                                const std::string& device,
-                                Properties&&... properties)
-        : CLIPTextModelWithProjection(model,
-                                      weights,
-                                      config,
-                                      clip_tokenizer,
-                                      device,
-                                      ov::AnyMap{std::forward<Properties>(properties)...}) { }
-
-    CLIPTextModelWithProjection(const CLIPTextModelWithProjection&);
-
-    const Config& get_config() const;
-
-    CLIPTextModelWithProjection& reshape(int batch_size);
-
-    CLIPTextModelWithProjection& compile(const std::string& device, const ov::AnyMap& properties = {});
-
-    template <typename... Properties>
-    ov::util::EnableIfAllStringAny<CLIPTextModelWithProjection&, Properties...> compile(
-            const std::string& device,
-            Properties&&... properties) {
-        return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
-    }
-
-    void set_adapters(const std::optional<AdapterConfig>& adapters);
-
-    ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance);
-
-    ov::Tensor get_output_tensor(const size_t idx);
-
-private:
-    Config m_config;
-    AdapterController m_adapter_controller;
-    ov::InferRequest m_request;
-    std::shared_ptr<ov::Model> m_model;
-
-    Tokenizer m_clip_tokenizer;
-
-    bool m_slice_batch1_output = false;
+    using CLIPTextModel::CLIPTextModel;
 };
 
 } // namespace genai
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
deleted file mode 100644
index d47ad6ed02..0000000000
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-// Copyright (C) 2023-2025 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp"
-
-#include <fstream>
-
-#include "lora_helper.hpp"
-#include "json_utils.hpp"
-#include "utils.hpp"
-
-namespace ov {
-namespace genai {
-
-std::filesystem::path get_tokenizer_path_by_text_encoder(const std::filesystem::path& text_encoder_path);
-
-CLIPTextModelWithProjection::Config::Config(const std::filesystem::path& config_path) {
-    std::ifstream file(config_path);
-    OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path);
-
-    nlohmann::json data = nlohmann::json::parse(file);
-    using utils::read_json_param;
-
-    read_json_param(data, "max_position_embeddings", max_position_embeddings);
-    read_json_param(data, "num_hidden_layers", num_hidden_layers);
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir) :
-    m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)),
-    m_config(root_dir / "config.json") {
-    m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
-                                                         const std::string& device,
-                                                         const ov::AnyMap& properties) :
-    CLIPTextModelWithProjection(root_dir) {
-    compile(device, properties);
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model,
-                                                         const Tensor& weights,
-                                                         const Config& config,
-                                                         const Tokenizer& clip_tokenizer) :
-    m_clip_tokenizer(clip_tokenizer), m_config(config) {
-    m_model = utils::singleton_core().read_model(model, weights);
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model,
-                                                         const Tensor& weights,
-                                                         const Config& config,
-                                                         const Tokenizer& clip_tokenizer,
-                                                         const std::string& device,
-                                                         const ov::AnyMap& properties) :
-    CLIPTextModelWithProjection(model, weights, config, clip_tokenizer) {
-    compile(device, properties);
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const CLIPTextModelWithProjection&) = default;
-
-const CLIPTextModelWithProjection::Config& CLIPTextModelWithProjection::get_config() const {
-    return m_config;
-}
-
-CLIPTextModelWithProjection& CLIPTextModelWithProjection::reshape(int batch_size) {
-    OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model");
-
-    ov::PartialShape input_shape = m_model->input(0).get_partial_shape();
-    input_shape[0] = batch_size;
-    input_shape[1] = m_config.max_position_embeddings;
-    std::map<size_t, ov::PartialShape> idx_to_shape{{0, input_shape}};
-    m_model->reshape(idx_to_shape);
-
-    return *this;
-}
-
-CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::string& device, const ov::AnyMap& properties) {
-    OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
-    ov::Core core = utils::singleton_core();
-    std::optional<AdapterConfig> adapters;
-    auto filtered_properties = extract_adapters_from_properties(properties, &adapters);
-    if (adapters) {
-        adapters->set_tensor_name_prefix(adapters->get_tensor_name_prefix().value_or("lora_te"));
-        m_adapter_controller = AdapterController(m_model, *adapters, device);
-    }
-    ov::CompiledModel compiled_model = core.compile_model(m_model, device, *filtered_properties);
-    ov::genai::utils::print_compiled_model_properties(compiled_model, "Clip Text with projection model");
-    m_request = compiled_model.create_infer_request();
-    // release the original model
-    m_model.reset();
-
-    return *this;
-}
-
-void CLIPTextModelWithProjection::set_adapters(const std::optional<AdapterConfig>& adapters) {
-    if (adapters) {
-        m_adapter_controller.apply(m_request, *adapters);
-    }
-}
-
-ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) {
-    OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. Cannot infer non-compiled model");
-
-    const int32_t pad_token_id = m_clip_tokenizer.get_pad_token_id();
-    const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
-
-    auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-
-        if (input_ids.get_element_type() == ov::element::i32) {
-            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int32_t>());
-        } else {
-            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
-            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int64_t>());
-        }
-    };
-
-    ov::PartialShape compiled_input_partial_shape = m_request.get_compiled_model().inputs()[0].get_partial_shape();
-
-    ov::Tensor input_ids = m_request.get_input_tensor();
-
-    if (compiled_input_partial_shape.is_dynamic()) {
-        input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
-    } else {
-        auto compiled_input_shape = input_ids.get_shape();
-        OPENVINO_ASSERT(compiled_input_shape.size() == 2, "CLIP text encoder model input must have rank of 2");
-        OPENVINO_ASSERT(text_embedding_batch_size <= compiled_input_shape[0],
-                        "text_embedding_batch_size (", text_embedding_batch_size,
-                        ") > CLIP text encoder model batch size (", compiled_input_shape[0], ").");
-        OPENVINO_ASSERT(m_config.max_position_embeddings == compiled_input_shape[1],
-                        "max_position_embeddings (", m_config.max_position_embeddings,
-                        ") != what CLIP text encoder model was compiled for (", compiled_input_shape[1], ").");
-    }
-
-    size_t current_batch_idx = 0;
-
-    if (input_ids.get_shape()[0] == 2) {
-        perform_tokenization(neg_prompt,
-                             ov::Tensor(input_ids, {current_batch_idx    , 0},
-                                                   {current_batch_idx + 1, m_config.max_position_embeddings}));
-        ++current_batch_idx;
-    } else {
-        // Negative prompt is ignored when --guidanceScale < 1.0
-    }
-
-    perform_tokenization(pos_prompt,
-                         ov::Tensor(input_ids, {current_batch_idx    , 0},
-                                               {current_batch_idx + 1, m_config.max_position_embeddings}));
-
-    // text embeddings
-    m_request.infer();
-
-    // This is true when text_embedding_batch_size is 1, but model was reshaped / compiled as batch size 2.
-    m_slice_batch1_output = (text_embedding_batch_size != input_ids.get_shape()[0]);
-
-    return get_output_tensor(0);
-}
-
-ov::Tensor CLIPTextModelWithProjection::get_output_tensor(const size_t idx) {
-    auto infer_out_tensor = m_request.get_output_tensor(idx);
-    if (m_slice_batch1_output) {
-        // Slice and return batch index 1 output.
-        auto out_shape = infer_out_tensor.get_shape();
-        auto begin_coord = ov::Coordinate(out_shape.size(), 0);
-        begin_coord[0] = 1;
-        auto end_coord = ov::Coordinate(out_shape);
-        auto sliced_out_tensor = ov::Tensor(infer_out_tensor, begin_coord, end_coord);
-        return sliced_out_tensor;
-    } else {
-        return infer_out_tensor;
-    }
-}
-
-} // namespace genai
-} // namespace ov
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index def2719cdc..22988d0181 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -271,55 +271,10 @@ class CLIPTextModel:
         ...
     def set_adapters(self, adapters: AdapterConfig | None) -> None:
         ...
-class CLIPTextModelWithProjection:
+class CLIPTextModelWithProjection(CLIPTextModel):
     """
     CLIPTextModelWithProjection class.
     """
-    class Config:
-        """
-        This class is used for storing CLIPTextModelWithProjection config.
-        """
-        max_position_embeddings: int
-        num_hidden_layers: int
-        def __init__(self, config_path: os.PathLike) -> None:
-            ...
-    @typing.overload
-    def __init__(self, root_dir: os.PathLike) -> None:
-        """
-        CLIPTextModelWithProjection class
-        root_dir (os.PathLike): Model root directory.
-        """
-    @typing.overload
-    def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None:
-        """
-        CLIPTextModelWithProjection class
-        root_dir (os.PathLike): Model root directory.
-        device (str): Device on which inference will be done.
-        kwargs: Device properties.
-        """
-    @typing.overload
-    def __init__(self, model: CLIPTextModelWithProjection) -> None:
-        """
-        CLIPTextModelWithProjection model
-        CLIPTextModelWithProjection class
-        model (CLIPTextModelWithProjection): CLIPTextModelWithProjection model
-        """
-    def compile(self, device: str, **kwargs) -> None:
-        """
-        Compiles the model.
-        device (str): Device to run the model on (e.g., CPU, GPU).
-        kwargs: Device properties.
-        """
-    def get_config(self) -> CLIPTextModelWithProjection.Config:
-        ...
-    def get_output_tensor(self, idx: int) -> openvino._pyopenvino.Tensor:
-        ...
-    def infer(self, pos_prompt: str, neg_prompt: str, do_classifier_free_guidance: bool) -> openvino._pyopenvino.Tensor:
-        ...
-    def reshape(self, batch_size: int) -> CLIPTextModelWithProjection:
-        ...
-    def set_adapters(self, adapters: AdapterConfig | None) -> None:
-        ...
 class CacheEvictionConfig:
     """
diff --git a/src/python/py_image_generation_models.cpp b/src/python/py_image_generation_models.cpp
index 0b5f4fee48..32248f011c 100644
--- a/src/python/py_image_generation_models.cpp
+++ b/src/python/py_image_generation_models.cpp
@@ -98,76 +98,7 @@ void init_clip_text_model(py::module_& m) {
 }
 
 void init_clip_text_model_with_projection(py::module_& m) {
-    auto clip_text_model_with_projection = py::class_<ov::genai::CLIPTextModelWithProjection>(m, "CLIPTextModelWithProjection", "CLIPTextModelWithProjection class.")
-        .def(py::init([](const std::filesystem::path& root_dir) {
-            ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ov::genai::CLIPTextModelWithProjection>(root_dir);
-        }),
-        py::arg("root_dir"), "Model root directory",
-        R"(
-            CLIPTextModelWithProjection class
-            root_dir (os.PathLike): Model root directory.
-        )")
-        .def(py::init([](
-            const std::filesystem::path& root_dir,
-            const std::string& device,
-            const py::kwargs& kwargs
-        ) {
-            ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ov::genai::CLIPTextModelWithProjection>(root_dir, device, pyutils::kwargs_to_any_map(kwargs));
-        }),
-        py::arg("root_dir"), "Model root directory",
-        py::arg("device"), "Device on which inference will be done",
-        R"(
-            CLIPTextModelWithProjection class
-            root_dir (os.PathLike): Model root directory.
-            device (str): Device on which inference will be done.
-            kwargs: Device properties.
-        )")
-        .def(py::init([](const ov::genai::CLIPTextModelWithProjection& model) {
-            return std::make_unique<ov::genai::CLIPTextModelWithProjection>(model);
-        }),
-        py::arg("model"), "CLIPTextModelWithProjection model"
-        R"(
-            CLIPTextModelWithProjection class
-            model (CLIPTextModelWithProjection): CLIPTextModelWithProjection model
-        )");
-
-    py::class_<ov::genai::CLIPTextModelWithProjection::Config>(clip_text_model_with_projection, "Config", "This class is used for storing CLIPTextModelWithProjection config.")
-        .def(py::init([](const std::filesystem::path& config_path) {
-            return std::make_unique<ov::genai::CLIPTextModelWithProjection::Config>(config_path);
-        }),
-        py::arg("config_path"))
-        .def_readwrite("max_position_embeddings", &ov::genai::CLIPTextModelWithProjection::Config::max_position_embeddings)
-        .def_readwrite("num_hidden_layers", &ov::genai::CLIPTextModelWithProjection::Config::num_hidden_layers);
-
-    clip_text_model_with_projection.def("reshape", &ov::genai::CLIPTextModelWithProjection::reshape, py::arg("batch_size"))
-        .def("infer", &ov::genai::CLIPTextModelWithProjection::infer,
-            py::call_guard<py::gil_scoped_release>(),
-            py::arg("pos_prompt"),
-            py::arg("neg_prompt"),
-            py::arg("do_classifier_free_guidance"))
-        .def("get_config", &ov::genai::CLIPTextModelWithProjection::get_config)
-        .def("get_output_tensor", &ov::genai::CLIPTextModelWithProjection::get_output_tensor, py::arg("idx"))
-        .def("set_adapters", &ov::genai::CLIPTextModelWithProjection::set_adapters, py::arg("adapters"))
-        .def(
-            "compile",
-            [](ov::genai::CLIPTextModelWithProjection& self,
-                const std::string& device,
-                const py::kwargs& kwargs
-            ) {
-                auto map = pyutils::kwargs_to_any_map(kwargs);
-                {
-                    py::gil_scoped_release rel;
-                    self.compile(device, map);
-                }
-            },
-            py::arg("device"), "device on which inference will be done",
-            R"(
-                Compiles the model.
-                device (str): Device to run the model on (e.g., CPU, GPU).
-                kwargs: Device properties.
-            )");
+    auto clip_text_model_with_projection = py::class_<ov::genai::CLIPTextModelWithProjection, ov::genai::CLIPTextModel>(m, "CLIPTextModelWithProjection", "CLIPTextModelWithProjection class.");
 }
 
 void init_t5_encoder_model(py::module_& m) {
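
A minimal usage sketch of the consolidated API (the model directory, device name, and prompts below are illustrative, not taken from this PR). With this change, CLIPTextModelWithProjection inherits every constructor and method from CLIPTextModel via `using CLIPTextModel::CLIPTextModel;`, so existing call sites keep the same reshape/compile/infer flow:

    #include "openvino/genai/image_generation/clip_text_model_with_projection.hpp"

    int main() {
        // Hypothetical SDXL-style text encoder export directory.
        ov::genai::CLIPTextModelWithProjection text_encoder("sdxl/text_encoder_2");
        text_encoder.reshape(2);      // static batch of 2 for classifier-free guidance; must precede compile()
        text_encoder.compile("CPU");  // illustrative device name
        ov::Tensor embeds = text_encoder.infer("a photo of a cat",     // positive prompt
                                               "blurry, low quality",  // negative prompt
                                               /*do_classifier_free_guidance=*/true);
        return 0;
    }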