From dc0ca1bf372035bcca8ea2c67d6da6c4b683731f Mon Sep 17 00:00:00 2001 From: Maciej Smiatacz Date: Wed, 17 Dec 2025 14:45:16 +0100 Subject: [PATCH] Add support for exporting CLIP submodels in OpenVINO --- docs/source/openvino/export.mdx | 25 ++++++++++++++++++ optimum/commands/export/openvino.py | 11 ++++++++ optimum/exporters/openvino/__main__.py | 15 +++++++++++ tests/openvino/test_export.py | 36 ++++++++++++++++++++++++++ 4 files changed, 87 insertions(+) diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index f152dc711d..c933b1e92e 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -41,6 +41,7 @@ usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt} [--quantization-statistics-path QUANTIZATION_STATISTICS_PATH] [--num-samples NUM_SAMPLES] [--disable-stateful] [--disable-convert-tokenizer] [--smooth-quant-alpha SMOOTH_QUANT_ALPHA] + [--submodel {vision,text,full}] output optional arguments: @@ -165,6 +166,10 @@ Optional arguments: --smooth-quant-alpha SMOOTH_QUANT_ALPHA SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and reduces quantization error. Valid only when activations quantization is enabled. + --submodel {vision,text,full} + For CLIP models (Transformers) with `feature-extraction`, export only the specified + submodule. Use `vision` to export the vision encoder, `text` for the text encoder, + or `full` to export the full combined model. Default is `full`. ``` You can also apply fp16, 8-bit or 4-bit weight-only quantization on the Linear, Convolutional and Embedding layers when exporting your model by setting `--weight-format` to respectively `fp16`, `int8` or `int4`. 
@@ -220,6 +225,26 @@ or optimum-cli export openvino -m openai/clip-vit-base-patch16 --quant-mode int8 ./clip-vit-base-patch16 ``` +### CLIP submodules + +For CLIP models loaded via Transformers and exported for `feature-extraction`, you can export only the vision or only the text encoder using `--submodel`: + +- Vision encoder only: + +```bash +optimum-cli export openvino --model openai/clip-vit-base-patch32 \ + --task feature-extraction --submodel vision ov_clip_vision/ +``` + +- Text encoder only: + +```bash +optimum-cli export openvino --model openai/clip-vit-base-patch32 \ + --task feature-extraction --submodel text ov_clip_text/ +``` + +If `--submodel` is omitted or set to `full`, the default combined CLIP export is used. + ### Decoder models diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 31687c5e1e..90c3377923 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -288,6 +288,16 @@ def parse_args_openvino(parser: "ArgumentParser"): "reduces quantization error. Valid only when activations quantization is enabled." ), ) + optional_group.add_argument( + "--submodel", + type=str, + choices=["vision", "text", "full"], + default="full", + help=( + "For CLIP/OpenCLIP feature-extraction, export only the specified submodule. " + "Use 'vision' to export the vision encoder, 'text' for the text encoder, or 'full' for the combined/default behavior." 
+ ), + ) optional_group.add_argument( "--model-kwargs", type=json.loads, @@ -480,6 +490,7 @@ def run(self): library_name=library_name, variant=self.args.variant, model_kwargs=self.args.model_kwargs, + submodel=self.args.submodel, # **input_shapes, ) if apply_main_quantize: diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 91e4abfe08..d3292f87d1 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -184,6 +184,7 @@ def main_export( library_name: Optional[str] = None, model_loading_kwargs: Optional[Dict[str, Any]] = None, variant: Optional[str] = None, + submodel: Optional[str] = None, **kwargs_shapes, ): """ @@ -485,6 +486,20 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs): **loading_kwargs, ) + # If user asks for a specific CLIP submodel, swap to that submodule so + # TasksManager sees the right model_type (e.g. clip_vision_model) and config. + if ( + submodel in {"vision", "text"} + and library_name == "transformers" + and task == "feature-extraction" + ): + if submodel == "vision" and hasattr(model, "vision_model"): + logger.info("Exporting CLIP vision submodel via TasksManager registry.") + model = model.vision_model + elif submodel == "text" and hasattr(model, "text_model"): + logger.info("Exporting CLIP text submodel via TasksManager registry.") + model = model.text_model + needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None if needs_pad_token_id: diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 776196adb7..a34e3c9d47 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -347,3 +347,39 @@ def test_export_custom_model(self): ov_outputs = ov_model(**tokens) self.assertTrue(torch.allclose(ov_outputs.token_embeddings, model_outputs.token_embeddings, atol=1e-4)) self.assertTrue(torch.allclose(ov_outputs.sentence_embedding, 
model_outputs.sentence_embedding, atol=1e-4)) + + +class CLIPSubmodelExportTest(unittest.TestCase): + def test_export_clip_vision_submodel_feature_extraction(self): + model_id = MODEL_NAMES["clip"] + with TemporaryDirectory() as tmpdirname: + main_export( + model_name_or_path=model_id, + library_name="transformers", + output=Path(tmpdirname), + task="feature-extraction", + submodel="vision", + ) + + ov_model = OVModelForFeatureExtraction.from_pretrained(tmpdirname, device=OPENVINO_DEVICE) + self.assertIsInstance(ov_model, OVBaseModel) + # Vision submodel should accept image tensors, not text inputs + self.assertIn("pixel_values", ov_model.input_names) + self.assertNotIn("input_ids", ov_model.input_names) + + def test_export_clip_text_submodel_feature_extraction(self): + model_id = MODEL_NAMES["clip"] + with TemporaryDirectory() as tmpdirname: + main_export( + model_name_or_path=model_id, + library_name="transformers", + output=Path(tmpdirname), + task="feature-extraction", + submodel="text", + ) + + ov_model = OVModelForFeatureExtraction.from_pretrained(tmpdirname, device=OPENVINO_DEVICE) + self.assertIsInstance(ov_model, OVBaseModel) + # Text submodel should accept token ids, not image tensors + self.assertIn("input_ids", ov_model.input_names) + self.assertNotIn("pixel_values", ov_model.input_names)