From 699c569958b58913afdc418ab05d4c78cf6a80ff Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 26 Feb 2025 08:30:31 +0100 Subject: [PATCH 1/2] Alias CLIPTextModelWithProjection as CLIPTextModel --- .../clip_text_model_with_projection.hpp | 79 +--------- .../clip_text_model_with_projection.cpp | 148 ------------------ 2 files changed, 1 insertion(+), 226 deletions(-) delete mode 100644 src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp index cb74d8a214..fc78c7e914 100644 --- a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp +++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp @@ -18,84 +18,7 @@ namespace ov { namespace genai { -class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection { -public: - struct OPENVINO_GENAI_EXPORTS Config { - size_t max_position_embeddings = 77; - size_t num_hidden_layers = 32; - - explicit Config(const std::filesystem::path& config_path); - }; - - explicit CLIPTextModelWithProjection(const std::filesystem::path& root_dir); - - CLIPTextModelWithProjection(const std::filesystem::path& root_dir, - const std::string& device, - const ov::AnyMap& properties = {}); - - CLIPTextModelWithProjection(const std::string& model, - const Tensor& weights, - const Config& config, - const Tokenizer& clip_tokenizer); - - CLIPTextModelWithProjection(const std::string& model, - const Tensor& weights, - const Config& config, - const Tokenizer& clip_tokenizer, - const std::string& device, - const ov::AnyMap& properties = {}); - - template ::value, bool>::type = true> - CLIPTextModelWithProjection(const std::filesystem::path& root_dir, - const std::string& device, - Properties&&... properties) - : CLIPTextModelWithProjection(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } - - template ::value, bool>::type = true> - CLIPTextModelWithProjection(const std::string& model, - const Tensor& weights, - const Config& config, - const Tokenizer& clip_tokenizer, - const std::string& device, - Properties&&... properties) - : CLIPTextModelWithProjection(model, - weights, - config, - clip_tokenizer, - device, - ov::AnyMap{std::forward(properties)...}) { } - - CLIPTextModelWithProjection(const CLIPTextModelWithProjection&); - - const Config& get_config() const; - - CLIPTextModelWithProjection& reshape(int batch_size); - - CLIPTextModelWithProjection& compile(const std::string& device, const ov::AnyMap& properties = {}); - - template - ov::util::EnableIfAllStringAny compile( - const std::string& device, - Properties&&... properties) { - return compile(device, ov::AnyMap{std::forward(properties)...}); - } - - void set_adapters(const std::optional& adapters); - - ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance); - - ov::Tensor get_output_tensor(const size_t idx); - -private: - Config m_config; - AdapterController m_adapter_controller; - ov::InferRequest m_request; - std::shared_ptr m_model; - - Tokenizer m_clip_tokenizer; -}; +using CLIPTextModelWithProjection = CLIPTextModel; } // namespace genai } // namespace ov diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp deleted file mode 100644 index 3bc8deeec9..0000000000 --- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (C) 2023-2025 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" - -#include - -#include "lora_helper.hpp" -#include "json_utils.hpp" -#include "utils.hpp" - -namespace ov { -namespace genai { - -std::filesystem::path get_tokenizer_path_by_text_encoder(const std::filesystem::path& text_encoder_path); - -CLIPTextModelWithProjection::Config::Config(const std::filesystem::path& config_path) { - std::ifstream file(config_path); - OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); - - nlohmann::json data = nlohmann::json::parse(file); - using utils::read_json_param; - - read_json_param(data, "max_position_embeddings", max_position_embeddings); - read_json_param(data, "num_hidden_layers", num_hidden_layers); -} - -CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir) : - m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)), - m_config(root_dir / "config.json") { - m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml"); -} - -CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir, - const std::string& device, - const ov::AnyMap& properties) : - CLIPTextModelWithProjection(root_dir) { - compile(device, properties); -} - -CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model, - const Tensor& weights, - const Config& config, - const Tokenizer& clip_tokenizer) : - m_clip_tokenizer(clip_tokenizer), m_config(config) { - m_model = utils::singleton_core().read_model(model, weights); -} - -CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model, - const Tensor& weights, - const Config& config, - const Tokenizer& clip_tokenizer, - const std::string& device, - const ov::AnyMap& properties) : - CLIPTextModelWithProjection(model, weights, config, clip_tokenizer) { - compile(device, properties); -} - -CLIPTextModelWithProjection::CLIPTextModelWithProjection(const CLIPTextModelWithProjection&) = default; - -const CLIPTextModelWithProjection::Config& CLIPTextModelWithProjection::get_config() const { - return m_config; -} - -CLIPTextModelWithProjection& CLIPTextModelWithProjection::reshape(int batch_size) { - OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model"); - - ov::PartialShape input_shape = m_model->input(0).get_partial_shape(); - input_shape[0] = batch_size; - input_shape[1] = m_config.max_position_embeddings; - std::map idx_to_shape{{0, input_shape}}; - m_model->reshape(idx_to_shape); - - return *this; -} - -CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::string& device, const ov::AnyMap& properties) { - OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); - ov::Core core = utils::singleton_core(); - std::optional adapters; - auto filtered_properties = extract_adapters_from_properties(properties, &adapters); - if (adapters) { - adapters->set_tensor_name_prefix(adapters->get_tensor_name_prefix().value_or("lora_te")); - m_adapter_controller = AdapterController(m_model, *adapters, device); - } - ov::CompiledModel compiled_model = core.compile_model(m_model, device, *filtered_properties); - ov::genai::utils::print_compiled_model_properties(compiled_model, "Clip Text with projection model"); - m_request = compiled_model.create_infer_request(); - // release the original model - m_model.reset(); - - return *this; -} - -void CLIPTextModelWithProjection::set_adapters(const std::optional& adapters) { - if (adapters) { - m_adapter_controller.apply(m_request, *adapters); - } -} - -ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) { - OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. Cannot infer non-compiled model"); - - const int32_t pad_token_id = m_clip_tokenizer.get_pad_token_id(); - const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1; - - auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) { - ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids; - - if (input_ids.get_element_type() == ov::element::i32) { - std::fill_n(input_ids.data(), input_ids.get_size(), pad_token_id); - std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); - } else { - std::fill_n(input_ids.data(), input_ids.get_size(), pad_token_id); - std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); - } - }; - - ov::Tensor input_ids = m_request.get_input_tensor(); - input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings}); - - size_t current_batch_idx = 0; - - if (do_classifier_free_guidance) { - perform_tokenization(neg_prompt, - ov::Tensor(input_ids, {current_batch_idx , 0}, - {current_batch_idx + 1, m_config.max_position_embeddings})); - ++current_batch_idx; - } else { - // Negative prompt is ignored when --guidanceScale < 1.0 - } - - perform_tokenization(pos_prompt, - ov::Tensor(input_ids, {current_batch_idx , 0}, - {current_batch_idx + 1, m_config.max_position_embeddings})); - - // text embeddings - m_request.infer(); - - return m_request.get_output_tensor(0); -} - -ov::Tensor CLIPTextModelWithProjection::get_output_tensor(const size_t idx) { - return m_request.get_output_tensor(idx); -} - -} // namespace genai -} // namespace ov From 136a48a2c8253eae6f8ee30b233cdfcf67d1364f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 29 Apr 2025 15:35:54 +0200 Subject: [PATCH 2/2] Fixed python --- .../clip_text_model_with_projection.hpp | 5 +- .../openvino_genai/py_openvino_genai.pyi | 47 +----------- src/python/py_image_generation_models.cpp | 71 +------------------ 3 files changed, 6 insertions(+), 117 deletions(-) diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp index e62fa3097f..afa9dc3178 100644 --- a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp +++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp @@ -8,7 +8,10 @@ namespace ov { namespace genai { -using CLIPTextModelWithProjection = CLIPTextModel; +class CLIPTextModelWithProjection : public CLIPTextModel { +public: + using CLIPTextModel::CLIPTextModel; +}; } // namespace genai } // namespace ov diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index def2719cdc..22988d0181 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -271,55 +271,10 @@ class CLIPTextModel: ... def set_adapters(self, adapters: AdapterConfig | None) -> None: ... -class CLIPTextModelWithProjection: +class CLIPTextModelWithProjection(CLIPTextModel): """ CLIPTextModelWithProjection class. """ - class Config: - """ - This class is used for storing CLIPTextModelWithProjection config. - """ - max_position_embeddings: int - num_hidden_layers: int - def __init__(self, config_path: os.PathLike) -> None: - ... - @typing.overload - def __init__(self, root_dir: os.PathLike) -> None: - """ - CLIPTextModelWithProjection class - root_dir (os.PathLike): Model root directory. - """ - @typing.overload - def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: - """ - CLIPTextModelWithProjection class - root_dir (os.PathLike): Model root directory. - device (str): Device on which inference will be done. - kwargs: Device properties. - """ - @typing.overload - def __init__(self, model: CLIPTextModelWithProjection) -> None: - """ - CLIPTextModelWithProjection model - CLIPTextModelWithProjection class - model (CLIPTextModelWithProjection): CLIPTextModelWithProjection model - """ - def compile(self, device: str, **kwargs) -> None: - """ - Compiles the model. - device (str): Device to run the model on (e.g., CPU, GPU). - kwargs: Device properties. - """ - def get_config(self) -> CLIPTextModelWithProjection.Config: - ... - def get_output_tensor(self, idx: int) -> openvino._pyopenvino.Tensor: - ... - def infer(self, pos_prompt: str, neg_prompt: str, do_classifier_free_guidance: bool) -> openvino._pyopenvino.Tensor: - ... - def reshape(self, batch_size: int) -> CLIPTextModelWithProjection: - ... - def set_adapters(self, adapters: AdapterConfig | None) -> None: - ... class CacheEvictionConfig: """ diff --git a/src/python/py_image_generation_models.cpp b/src/python/py_image_generation_models.cpp index 0b5f4fee48..32248f011c 100644 --- a/src/python/py_image_generation_models.cpp +++ b/src/python/py_image_generation_models.cpp @@ -98,76 +98,7 @@ void init_clip_text_model(py::module_& m) { } void init_clip_text_model_with_projection(py::module_& m) { - auto clip_text_model_with_projection = py::class_(m, "CLIPTextModelWithProjection", "CLIPTextModelWithProjection class.") - .def(py::init([](const std::filesystem::path& root_dir) { - ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(root_dir); - }), - py::arg("root_dir"), "Model root directory", - R"( - CLIPTextModelWithProjection class - root_dir (os.PathLike): Model root directory. - )") - .def(py::init([]( - const std::filesystem::path& root_dir, - const std::string& device, - const py::kwargs& kwargs - ) { - ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(root_dir, device, pyutils::kwargs_to_any_map(kwargs)); - }), - py::arg("root_dir"), "Model root directory", - py::arg("device"), "Device on which inference will be done", - R"( - CLIPTextModelWithProjection class - root_dir (os.PathLike): Model root directory. - device (str): Device on which inference will be done. - kwargs: Device properties. - )") - .def(py::init([](const ov::genai::CLIPTextModelWithProjection& model) { - return std::make_unique(model); - }), - py::arg("model"), "CLIPTextModelWithProjection model" - R"( - CLIPTextModelWithProjection class - model (CLIPTextModelWithProjection): CLIPTextModelWithProjection model - )"); - - py::class_(clip_text_model_with_projection, "Config", "This class is used for storing CLIPTextModelWithProjection config.") - .def(py::init([](const std::filesystem::path& config_path) { - return std::make_unique(config_path); - }), - py::arg("config_path")) - .def_readwrite("max_position_embeddings", &ov::genai::CLIPTextModelWithProjection::Config::max_position_embeddings) - .def_readwrite("num_hidden_layers", &ov::genai::CLIPTextModelWithProjection::Config::num_hidden_layers); - - clip_text_model_with_projection.def("reshape", &ov::genai::CLIPTextModelWithProjection::reshape, py::arg("batch_size")) - .def("infer", &ov::genai::CLIPTextModelWithProjection::infer, - py::call_guard(), - py::arg("pos_prompt"), - py::arg("neg_prompt"), - py::arg("do_classifier_free_guidance")) - .def("get_config", &ov::genai::CLIPTextModelWithProjection::get_config) - .def("get_output_tensor", &ov::genai::CLIPTextModelWithProjection::get_output_tensor, py::arg("idx")) - .def("set_adapters", &ov::genai::CLIPTextModelWithProjection::set_adapters, py::arg("adapters")) - .def( - "compile", - [](ov::genai::CLIPTextModelWithProjection& self, - const std::string& device, - const py::kwargs& kwargs - ) { - auto map = pyutils::kwargs_to_any_map(kwargs); - { - py::gil_scoped_release rel; - self.compile(device, map); - } - }, - py::arg("device"), "device on which inference will be done", - R"( - Compiles the model. - device (str): Device to run the model on (e.g., CPU, GPU). - kwargs: Device properties. - )"); + auto clip_text_model_with_projection = py::class_(m, "CLIPTextModelWithProjection", "CLIPTextModelWithProjection class."); } void init_t5_encoder_model(py::module_& m) {