Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions docs/source/openvino/export.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt}
[--quantization-statistics-path QUANTIZATION_STATISTICS_PATH]
[--num-samples NUM_SAMPLES] [--disable-stateful] [--disable-convert-tokenizer]
[--smooth-quant-alpha SMOOTH_QUANT_ALPHA]
[--submodel {vision,text,full}]
output

optional arguments:
Expand Down Expand Up @@ -165,6 +166,10 @@ Optional arguments:
--smooth-quant-alpha SMOOTH_QUANT_ALPHA
SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers
and reduces quantization error. Valid only when activations quantization is enabled.
--submodel {vision,text,full}
For Transformers CLIP models exported with the `feature-extraction` task, export only the
specified submodel. Use `vision` to export the vision encoder, `text` for the text encoder,
or `full` to use the default combined export. Default is `full`.
```

You can also apply fp16, 8-bit or 4-bit weight-only quantization on the Linear, Convolutional and Embedding layers when exporting your model by setting `--weight-format` to respectively `fp16`, `int8` or `int4`.
Expand Down Expand Up @@ -220,6 +225,26 @@ or
optimum-cli export openvino -m openai/clip-vit-base-patch16 --quant-mode int8 ./clip-vit-base-patch16
```

### CLIP submodules

For CLIP models loaded via Transformers and exported for `feature-extraction`, you can export only the vision or only the text encoder using `--submodel`:

- Vision encoder only:

```bash
optimum-cli export openvino --model openai/clip-vit-base-patch32 \
--task feature-extraction --submodel vision ov_clip_vision/
```

- Text encoder only:

```bash
optimum-cli export openvino --model openai/clip-vit-base-patch32 \
--task feature-extraction --submodel text ov_clip_text/
```

If `--submodel` is omitted or set to `full`, the default combined CLIP export is used.


### Decoder models

Expand Down
11 changes: 11 additions & 0 deletions optimum/commands/export/openvino.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
"reduces quantization error. Valid only when activations quantization is enabled."
),
)
optional_group.add_argument(
"--submodel",
type=str,
choices=["vision", "text", "full"],
default="full",
help=(
"For CLIP/OpenCLIP feature-extraction, export only the specified submodule. "
"Use 'vision' to export the vision encoder, 'text' for the text encoder, or 'full' for the combined/default behavior."
),
)
optional_group.add_argument(
"--model-kwargs",
type=json.loads,
Expand Down Expand Up @@ -480,6 +490,7 @@ def run(self):
library_name=library_name,
variant=self.args.variant,
model_kwargs=self.args.model_kwargs,
submodel=self.args.submodel,
# **input_shapes,
)
if apply_main_quantize:
Expand Down
15 changes: 15 additions & 0 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ def main_export(
library_name: Optional[str] = None,
model_loading_kwargs: Optional[Dict[str, Any]] = None,
variant: Optional[str] = None,
submodel: Optional[str] = None,
**kwargs_shapes,
):
"""
Expand Down Expand Up @@ -485,6 +486,20 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs):
**loading_kwargs,
)

# If user asks for a specific CLIP submodel, swap to that submodule so
# TasksManager sees the right model_type (e.g. clip_vision_model) and config.
if (
submodel in {"vision", "text"}
and library_name == "transformers"
and task == "feature-extraction"
):
if submodel == "vision" and hasattr(model, "vision_model"):
logger.info("Exporting CLIP vision submodel via TasksManager registry.")
model = model.vision_model
elif submodel == "text" and hasattr(model, "text_model"):
logger.info("Exporting CLIP text submodel via TasksManager registry.")
model = model.text_model

needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None

if needs_pad_token_id:
Expand Down
36 changes: 36 additions & 0 deletions tests/openvino/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,3 +347,39 @@ def test_export_custom_model(self):
ov_outputs = ov_model(**tokens)
self.assertTrue(torch.allclose(ov_outputs.token_embeddings, model_outputs.token_embeddings, atol=1e-4))
self.assertTrue(torch.allclose(ov_outputs.sentence_embedding, model_outputs.sentence_embedding, atol=1e-4))


class CLIPSubmodelExportTest(unittest.TestCase):
    """Tests for exporting a single CLIP encoder via ``main_export(..., submodel=...)``.

    Each test exports one submodel ("vision" or "text") for the
    ``feature-extraction`` task and checks that the resulting OpenVINO model
    exposes only the inputs belonging to that encoder.
    """

    def _export_and_get_input_names(self, submodel: str) -> set:
        """Export the given CLIP submodel to a temporary directory and return
        the set of input names of the reloaded OpenVINO model.

        Args:
            submodel: Which encoder to export — "vision" or "text".

        Returns:
            The set of input tensor names exposed by the exported model.
        """
        model_id = MODEL_NAMES["clip"]
        with TemporaryDirectory() as tmpdirname:
            main_export(
                model_name_or_path=model_id,
                library_name="transformers",
                output=Path(tmpdirname),
                task="feature-extraction",
                submodel=submodel,
            )
            # Reload inside the `with` block — the directory is deleted on exit.
            ov_model = OVModelForFeatureExtraction.from_pretrained(tmpdirname, device=OPENVINO_DEVICE)
            self.assertIsInstance(ov_model, OVBaseModel)
            return set(ov_model.input_names)

    def test_export_clip_vision_submodel_feature_extraction(self):
        input_names = self._export_and_get_input_names("vision")
        # Vision submodel should accept image tensors, not text inputs
        self.assertIn("pixel_values", input_names)
        self.assertNotIn("input_ids", input_names)

    def test_export_clip_text_submodel_feature_extraction(self):
        input_names = self._export_and_get_input_names("text")
        # Text submodel should accept token ids, not image tensors
        self.assertIn("input_ids", input_names)
        self.assertNotIn("pixel_values", input_names)