From 699c569958b58913afdc418ab05d4c78cf6a80ff Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 26 Feb 2025 08:30:31 +0100
Subject: [PATCH 1/2] Alias CLIPTextModelWithProjection as CLIPTextModel

---
 .../clip_text_model_with_projection.hpp       |  79 +---------
 .../clip_text_model_with_projection.cpp       | 148 ------------------
 2 files changed, 1 insertion(+), 226 deletions(-)
 delete mode 100644 src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
index cb74d8a214..fc78c7e914 100644
--- a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
@@ -18,84 +18,7 @@
 namespace ov {
 namespace genai {
 
-class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection {
-public:
-    struct OPENVINO_GENAI_EXPORTS Config {
-        size_t max_position_embeddings = 77;
-        size_t num_hidden_layers = 32;
-
-        explicit Config(const std::filesystem::path& config_path);
-    };
-
-    explicit CLIPTextModelWithProjection(const std::filesystem::path& root_dir);
-
-    CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
-                                const std::string& device,
-                                const ov::AnyMap& properties = {});
-
-    CLIPTextModelWithProjection(const std::string& model,
-                                const Tensor& weights,
-                                const Config& config,
-                                const Tokenizer& clip_tokenizer);
-
-    CLIPTextModelWithProjection(const std::string& model,
-                                const Tensor& weights,
-                                const Config& config,
-                                const Tokenizer& clip_tokenizer,
-                                const std::string& device,
-                                const ov::AnyMap& properties = {});
-
-    template <typename... Properties,
-              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
-    CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
-                                const std::string& device,
-                                Properties&&... properties)
-        : CLIPTextModelWithProjection(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
-
-    template <typename... Properties,
-              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
-    CLIPTextModelWithProjection(const std::string& model,
-                                const Tensor& weights,
-                                const Config& config,
-                                const Tokenizer& clip_tokenizer,
-                                const std::string& device,
-                                Properties&&... properties)
-        : CLIPTextModelWithProjection(model,
-                                      weights,
-                                      config,
-                                      clip_tokenizer,
-                                      device,
-                                      ov::AnyMap{std::forward<Properties>(properties)...}) { }
-
-    CLIPTextModelWithProjection(const CLIPTextModelWithProjection&);
-
-    const Config& get_config() const;
-
-    CLIPTextModelWithProjection& reshape(int batch_size);
-
-    CLIPTextModelWithProjection& compile(const std::string& device, const ov::AnyMap& properties = {});
-
-    template <typename... Properties>
-    ov::util::EnableIfAllStringAny<CLIPTextModelWithProjection&, Properties...> compile(
-            const std::string& device,
-            Properties&&... properties) {
-        return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
-    }
-
-    void set_adapters(const std::optional<AdapterConfig>& adapters);
-
-    ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance);
-
-    ov::Tensor get_output_tensor(const size_t idx);
-
-private:
-    Config m_config;
-    AdapterController m_adapter_controller;
-    ov::InferRequest m_request;
-    std::shared_ptr<ov::Model> m_model;
-
-    Tokenizer m_clip_tokenizer;
-};
+using CLIPTextModelWithProjection = CLIPTextModel;
 
 } // namespace genai
 } // namespace ov
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
deleted file mode 100644
index 3bc8deeec9..0000000000
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright (C) 2023-2025 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp"
-
-#include <fstream>
-
-#include "lora_helper.hpp"
-#include "json_utils.hpp"
-#include "utils.hpp"
-
-namespace ov {
-namespace genai {
-
-std::filesystem::path get_tokenizer_path_by_text_encoder(const std::filesystem::path& text_encoder_path);
-
-CLIPTextModelWithProjection::Config::Config(const std::filesystem::path& config_path) {
-    std::ifstream file(config_path);
-    OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path);
-
-    nlohmann::json data = nlohmann::json::parse(file);
-    using utils::read_json_param;
-
-    read_json_param(data, "max_position_embeddings", max_position_embeddings);
-    read_json_param(data, "num_hidden_layers", num_hidden_layers);
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir) :
-    m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)),
-    m_config(root_dir / "config.json") {
-    m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
-                const std::string& device,
-                const ov::AnyMap& properties) :
-    CLIPTextModelWithProjection(root_dir) {
-    compile(device, properties);
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model,
-                                                         const Tensor& weights,
-                                                         const Config& config,
-                                                         const Tokenizer& clip_tokenizer) :
-    m_clip_tokenizer(clip_tokenizer), m_config(config) {
-    m_model = utils::singleton_core().read_model(model, weights);
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model,
-                                                         const Tensor& weights,
-                                                         const Config& config,
-                                                         const Tokenizer& clip_tokenizer,
-                                                         const std::string& device,
-                                                         const ov::AnyMap& properties) :
-    CLIPTextModelWithProjection(model, weights, config, clip_tokenizer) {
-    compile(device, properties);
-}
-
-CLIPTextModelWithProjection::CLIPTextModelWithProjection(const CLIPTextModelWithProjection&) = default;
-
-const CLIPTextModelWithProjection::Config& CLIPTextModelWithProjection::get_config() const {
-    return m_config;
-}
-
-CLIPTextModelWithProjection& CLIPTextModelWithProjection::reshape(int batch_size) {
-    OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model");
-
-    ov::PartialShape input_shape = m_model->input(0).get_partial_shape();
-    input_shape[0] = batch_size;
-    input_shape[1] = m_config.max_position_embeddings;
-    std::map<size_t, ov::PartialShape> idx_to_shape{{0, input_shape}};
-    m_model->reshape(idx_to_shape);
-
-    return *this;
-}
-
-CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::string& device, const ov::AnyMap& properties) {
-    OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
-    ov::Core core = utils::singleton_core();
-    std::optional<AdapterConfig> adapters;
-    auto filtered_properties = extract_adapters_from_properties(properties, &adapters);
-    if (adapters) {
-        adapters->set_tensor_name_prefix(adapters->get_tensor_name_prefix().value_or("lora_te"));
-        m_adapter_controller = AdapterController(m_model, *adapters, device);
-    }
-    ov::CompiledModel compiled_model = core.compile_model(m_model, device, *filtered_properties);
-    ov::genai::utils::print_compiled_model_properties(compiled_model, "Clip Text with projection model");
-    m_request = compiled_model.create_infer_request();
-    // release the original model
-    m_model.reset();
-
-    return *this;
-}
-
-void CLIPTextModelWithProjection::set_adapters(const std::optional<AdapterConfig>& adapters) {
-    if (adapters) {
-        m_adapter_controller.apply(m_request, *adapters);
-    }
-}
-
-ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) {
-    OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. Cannot infer non-compiled model");
-
-    const int32_t pad_token_id = m_clip_tokenizer.get_pad_token_id();
-    const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
-
-    auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-
-        if (input_ids.get_element_type() == ov::element::i32) {
-            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int32_t>());
-        } else {
-            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
-            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int64_t>());
-        }
-    };
-
-    ov::Tensor input_ids = m_request.get_input_tensor();
-    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
-
-    size_t current_batch_idx = 0;
-
-    if (do_classifier_free_guidance) {
-        perform_tokenization(neg_prompt,
-                             ov::Tensor(input_ids, {current_batch_idx    , 0},
-                                                   {current_batch_idx + 1, m_config.max_position_embeddings}));
-        ++current_batch_idx;
-    } else {
-        // Negative prompt is ignored when --guidanceScale < 1.0
-    }
-
-    perform_tokenization(pos_prompt,
-                         ov::Tensor(input_ids, {current_batch_idx    , 0},
-                                               {current_batch_idx + 1, m_config.max_position_embeddings}));
-
-    // text embeddings
-    m_request.infer();
-
-    return m_request.get_output_tensor(0);
-}
-
-ov::Tensor CLIPTextModelWithProjection::get_output_tensor(const size_t idx) {
-    return m_request.get_output_tensor(idx);
-}
-
-} // namespace genai
-} // namespace ov

From 136a48a2c8253eae6f8ee30b233cdfcf67d1364f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 29 Apr 2025 15:35:54 +0200
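Subject: [PATCH 2/2] Fix Python bindings for CLIPTextModelWithProjection

Make CLIPTextModelWithProjection a class derived from CLIPTextModel (with
inherited constructors) instead of a `using` alias, and register it in the
Python bindings and type stubs as a subclass of CLIPTextModel rather than as a
standalone class with its own copies of the methods.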

---
 .../clip_text_model_with_projection.hpp       |  5 +-
 .../openvino_genai/py_openvino_genai.pyi      | 47 +-----------
 src/python/py_image_generation_models.cpp     | 71 +------------------
 3 files changed, 6 insertions(+), 117 deletions(-)

diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
index e62fa3097f..afa9dc3178 100644
--- a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
@@ -8,7 +8,17 @@
 namespace ov {
 namespace genai {
 
-using CLIPTextModelWithProjection = CLIPTextModel;
+/**
+ * @brief CLIP text encoder with projection, implemented on top of CLIPTextModel.
+ *
+ * Adds no members of its own: every constructor is inherited from CLIPTextModel.
+ * It is a distinct type rather than an alias so that it can be registered as a
+ * separate class in the Python bindings.
+ */
+class CLIPTextModelWithProjection : public CLIPTextModel {
+public:
+    using CLIPTextModel::CLIPTextModel;
+};
 
 } // namespace genai
 } // namespace ov
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index def2719cdc..22988d0181 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -271,55 +271,10 @@ class CLIPTextModel:
         ...
     def set_adapters(self, adapters: AdapterConfig | None) -> None:
         ...
-class CLIPTextModelWithProjection:
+class CLIPTextModelWithProjection(CLIPTextModel):
     """
     CLIPTextModelWithProjection class.
     """
-    class Config:
-        """
-        This class is used for storing CLIPTextModelWithProjection config.
-        """
-        max_position_embeddings: int
-        num_hidden_layers: int
-        def __init__(self, config_path: os.PathLike) -> None:
-            ...
-    @typing.overload
-    def __init__(self, root_dir: os.PathLike) -> None:
-        """
-                    CLIPTextModelWithProjection class
-                    root_dir (os.PathLike): Model root directory.
-        """
-    @typing.overload
-    def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None:
-        """
-                    CLIPTextModelWithProjection class
-                    root_dir (os.PathLike): Model root directory.
-                    device (str): Device on which inference will be done.
-                    kwargs: Device properties.
-        """
-    @typing.overload
-    def __init__(self, model: CLIPTextModelWithProjection) -> None:
-        """
-        CLIPTextModelWithProjection model
-                    CLIPTextModelWithProjection class
-                    model (CLIPTextModelWithProjection): CLIPTextModelWithProjection model
-        """
-    def compile(self, device: str, **kwargs) -> None:
-        """
-                        Compiles the model.
-                        device (str): Device to run the model on (e.g., CPU, GPU).
-                        kwargs: Device properties.
-        """
-    def get_config(self) -> CLIPTextModelWithProjection.Config:
-        ...
-    def get_output_tensor(self, idx: int) -> openvino._pyopenvino.Tensor:
-        ...
-    def infer(self, pos_prompt: str, neg_prompt: str, do_classifier_free_guidance: bool) -> openvino._pyopenvino.Tensor:
-        ...
-    def reshape(self, batch_size: int) -> CLIPTextModelWithProjection:
-        ...
-    def set_adapters(self, adapters: AdapterConfig | None) -> None:
-        ...
 class CacheEvictionConfig:
     """
     
diff --git a/src/python/py_image_generation_models.cpp b/src/python/py_image_generation_models.cpp
index 0b5f4fee48..32248f011c 100644
--- a/src/python/py_image_generation_models.cpp
+++ b/src/python/py_image_generation_models.cpp
@@ -98,76 +98,43 @@ void init_clip_text_model(py::module_& m) {
 }
 
 void init_clip_text_model_with_projection(py::module_& m) {
-    auto clip_text_model_with_projection = py::class_<ov::genai::CLIPTextModelWithProjection>(m, "CLIPTextModelWithProjection", "CLIPTextModelWithProjection class.")
+    // pybind11 does not make constructors bound on a registered C++ base class
+    // usable for a separately registered derived class, so the __init__ overloads
+    // stay here; the remaining methods are inherited from the CLIPTextModel binding.
+    py::class_<ov::genai::CLIPTextModelWithProjection, ov::genai::CLIPTextModel>(m, "CLIPTextModelWithProjection", "CLIPTextModelWithProjection class.")
         .def(py::init([](const std::filesystem::path& root_dir) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return std::make_unique<ov::genai::CLIPTextModelWithProjection>(root_dir);
         }),
         py::arg("root_dir"), "Model root directory",
         R"(
             CLIPTextModelWithProjection class
             root_dir (os.PathLike): Model root directory.
         )")
         .def(py::init([](
             const std::filesystem::path& root_dir,
             const std::string& device,
             const py::kwargs& kwargs
         ) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return std::make_unique<ov::genai::CLIPTextModelWithProjection>(root_dir, device,  pyutils::kwargs_to_any_map(kwargs));
         }),
         py::arg("root_dir"), "Model root directory",
         py::arg("device"), "Device on which inference will be done",
         R"(
             CLIPTextModelWithProjection class
             root_dir (os.PathLike): Model root directory.
             device (str): Device on which inference will be done.
             kwargs: Device properties.
         )")
         .def(py::init([](const ov::genai::CLIPTextModelWithProjection& model) {
             return std::make_unique<ov::genai::CLIPTextModelWithProjection>(model);
         }),
         py::arg("model"), "CLIPTextModelWithProjection model"
         R"(
             CLIPTextModelWithProjection class
             model (CLIPTextModelWithProjection): CLIPTextModelWithProjection model
         )");
-
-    py::class_<ov::genai::CLIPTextModelWithProjection::Config>(clip_text_model_with_projection, "Config", "This class is used for storing CLIPTextModelWithProjection config.")
-        .def(py::init([](const std::filesystem::path& config_path) {
-            return std::make_unique<ov::genai::CLIPTextModelWithProjection::Config>(config_path);
-        }),
-        py::arg("config_path"))
-        .def_readwrite("max_position_embeddings", &ov::genai::CLIPTextModelWithProjection::Config::max_position_embeddings)
-        .def_readwrite("num_hidden_layers", &ov::genai::CLIPTextModelWithProjection::Config::num_hidden_layers);
-
-    clip_text_model_with_projection.def("reshape", &ov::genai::CLIPTextModelWithProjection::reshape, py::arg("batch_size"))
-        .def("infer", &ov::genai::CLIPTextModelWithProjection::infer, 
-            py::call_guard<py::gil_scoped_release>(), 
-            py::arg("pos_prompt"), 
-            py::arg("neg_prompt"), 
-            py::arg("do_classifier_free_guidance"))
-        .def("get_config", &ov::genai::CLIPTextModelWithProjection::get_config)
-        .def("get_output_tensor", &ov::genai::CLIPTextModelWithProjection::get_output_tensor, py::arg("idx"))
-        .def("set_adapters", &ov::genai::CLIPTextModelWithProjection::set_adapters, py::arg("adapters"))
-        .def(
-            "compile",
-            [](ov::genai::CLIPTextModelWithProjection& self,
-                const std::string& device,
-                const py::kwargs& kwargs
-            ) {
-                auto map = pyutils::kwargs_to_any_map(kwargs);
-                {
-                    py::gil_scoped_release rel;
-                    self.compile(device, map);
-                }
-            },
-            py::arg("device"), "device on which inference will be done",
-            R"(
-                Compiles the model.
-                device (str): Device to run the model on (e.g., CPU, GPU).
-                kwargs: Device properties.
-            )");
 }
 
 void init_t5_encoder_model(py::module_& m) {