From dc0ca1bf372035bcca8ea2c67d6da6c4b683731f Mon Sep 17 00:00:00 2001
From: Maciej Smiatacz <maciej.smiatacz@intel.com>
Date: Wed, 17 Dec 2025 14:45:16 +0100
Subject: [PATCH] Add support for exporting CLIP submodels in OpenVINO

---
 docs/source/openvino/export.mdx        | 25 ++++++++++++++++++
 optimum/commands/export/openvino.py    | 11 ++++++++
 optimum/exporters/openvino/__main__.py | 15 +++++++++++
 tests/openvino/test_export.py          | 36 ++++++++++++++++++++++++++
 4 files changed, 87 insertions(+)

diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx
index f152dc711d..c933b1e92e 100644
--- a/docs/source/openvino/export.mdx
+++ b/docs/source/openvino/export.mdx
@@ -41,6 +41,7 @@ usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt}
                                    [--quantization-statistics-path QUANTIZATION_STATISTICS_PATH]
                                    [--num-samples NUM_SAMPLES] [--disable-stateful] [--disable-convert-tokenizer]
                                    [--smooth-quant-alpha SMOOTH_QUANT_ALPHA]
+                                   [--submodel {vision,text,full}]
                                    output
 
 optional arguments:
@@ -165,6 +166,10 @@ Optional arguments:
   --smooth-quant-alpha SMOOTH_QUANT_ALPHA
                         SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers
                         and reduces quantization error. Valid only when activations quantization is enabled.
+  --submodel {vision,text,full}
+                        For CLIP models (Transformers) with `feature-extraction`, export only the specified
+                        submodel. Use `vision` to export the vision encoder, `text` to export the text
+                        encoder, or `full` to export the complete combined model. Default is `full`.
 ```
 
 You can also apply fp16, 8-bit or 4-bit weight-only quantization on the Linear, Convolutional and Embedding layers when exporting your model by setting `--weight-format` to respectively `fp16`, `int8` or `int4`.
@@ -220,6 +225,26 @@ or
 optimum-cli export openvino -m openai/clip-vit-base-patch16 --quant-mode int8 ./clip-vit-base-patch16
 ```
 
+### CLIP submodules
+
+For CLIP models loaded via Transformers and exported for `feature-extraction`, you can export only the vision or only the text encoder using `--submodel`:
+
+- Vision encoder only:
+
+```bash
+optimum-cli export openvino --model openai/clip-vit-base-patch32 \
+  --task feature-extraction --submodel vision ov_clip_vision/
+```
+
+- Text encoder only:
+
+```bash
+optimum-cli export openvino --model openai/clip-vit-base-patch32 \
+  --task feature-extraction --submodel text ov_clip_text/
+```
+
+If `--submodel` is omitted or set to `full`, the default combined CLIP export is used.
+
 
 ### Decoder models
 
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 31687c5e1e..90c3377923 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -288,6 +288,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "reduces quantization error. Valid only when activations quantization is enabled."
         ),
     )
+    optional_group.add_argument(
+        "--submodel",
+        type=str,
+        choices=["vision", "text", "full"],
+        default="full",
+        help=(
+            "For Transformers CLIP models with the feature-extraction task, export only the specified submodel: "
+            "'vision' for the vision encoder, 'text' for the text encoder, or 'full' (default) for the whole model."
+        ),
+    )
     optional_group.add_argument(
         "--model-kwargs",
         type=json.loads,
@@ -480,6 +490,7 @@ def run(self):
                 library_name=library_name,
                 variant=self.args.variant,
                 model_kwargs=self.args.model_kwargs,
+                submodel=self.args.submodel,
                 # **input_shapes,
             )
             if apply_main_quantize:
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index 91e4abfe08..d3292f87d1 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -184,6 +184,7 @@ def main_export(
     library_name: Optional[str] = None,
     model_loading_kwargs: Optional[Dict[str, Any]] = None,
     variant: Optional[str] = None,
+    submodel: Optional[str] = None,
     **kwargs_shapes,
 ):
     """
@@ -485,6 +486,20 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs):
                 **loading_kwargs,
             )
 
+        # If the user asks for a specific CLIP submodel, swap to that submodule so
+        # TasksManager sees the right model_type (e.g. clip_vision_model) and config.
+        if submodel in {"vision", "text"}:
+            submodule = getattr(model, f"{submodel}_model", None)
+            if library_name == "transformers" and task == "feature-extraction" and submodule is not None:
+                logger.info(f"Exporting CLIP {submodel} submodel via TasksManager registry.")
+                model = submodule
+            else:
+                # Do not drop the request silently: tell the user the full model is exported.
+                logger.warning(
+                    f"Ignoring `submodel={submodel}`: it requires a Transformers model exposing "
+                    f"`{submodel}_model` and the `feature-extraction` task. Exporting the full model instead."
+                )
+
         needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None
 
         if needs_pad_token_id:
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index 776196adb7..a34e3c9d47 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -347,3 +347,39 @@ def test_export_custom_model(self):
         ov_outputs = ov_model(**tokens)
         self.assertTrue(torch.allclose(ov_outputs.token_embeddings, model_outputs.token_embeddings, atol=1e-4))
         self.assertTrue(torch.allclose(ov_outputs.sentence_embedding, model_outputs.sentence_embedding, atol=1e-4))
+
+
+class CLIPSubmodelExportTest(unittest.TestCase):
+    """Exports a single CLIP encoder through `main_export` and inspects the reloaded model inputs."""
+
+    def _export_submodel_and_get_inputs(self, submodel):
+        """Export the CLIP test checkpoint restricted to `submodel`, reload it and return its input names."""
+        with TemporaryDirectory() as export_dir:
+            main_export(
+                model_name_or_path=MODEL_NAMES["clip"],
+                library_name="transformers",
+                output=Path(export_dir),
+                task="feature-extraction",
+                submodel=submodel,
+            )
+
+            reloaded = OVModelForFeatureExtraction.from_pretrained(export_dir, device=OPENVINO_DEVICE)
+            self.assertIsInstance(reloaded, OVBaseModel)
+            # Read the names before the temporary directory is removed.
+            return reloaded.input_names
+
+    def test_export_clip_vision_submodel_feature_extraction(self):
+        """`submodel="vision"` exposes `pixel_values` and no `input_ids`."""
+        input_names = self._export_submodel_and_get_inputs("vision")
+
+        # Vision submodel should accept image tensors, not text inputs
+        self.assertIn("pixel_values", input_names)
+        self.assertNotIn("input_ids", input_names)
+
+    def test_export_clip_text_submodel_feature_extraction(self):
+        """`submodel="text"` exposes `input_ids` and no `pixel_values`."""
+        input_names = self._export_submodel_and_get_inputs("text")
+
+        # Text submodel should accept token ids, not image tensors
+        self.assertIn("input_ids", input_names)
+        self.assertNotIn("pixel_values", input_names)