Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
3b366e6
[OpenVINO] Support openbmb/MiniCPM-o-2_6 for image-text-to-text task
rkazants Sep 29, 2025
66416ab
Fix export for minicpmo
rkazants Sep 30, 2025
bc2ca88
Add max version of transformers for support
rkazants Sep 30, 2025
8574228
Update documentation
rkazants Sep 30, 2025
b485b83
Update docs/source/openvino/models.mdx
rkazants Oct 1, 2025
faa8378
Add minimal version for transformers
rkazants Oct 2, 2025
4533a70
Revert changes with import of VisionRotaryEmbedding
rkazants Oct 2, 2025
0c99715
Add custom prepare_generation_inputs for MiniCPMO
rkazants Oct 2, 2025
6b50d6d
Add vocos dependency for minicpmo validation
rkazants Oct 3, 2025
14e4309
Add additional vector_quantize_pytorch deps
rkazants Oct 3, 2025
3810295
Adjust tests for minicpmo
rkazants Oct 3, 2025
3e642d2
Correct trust_remote_code option for test
rkazants Oct 3, 2025
0e8c00e
Use additional_inputs for tokenizer
rkazants Oct 3, 2025
72a3052
Update optimum/exporters/openvino/model_configs.py
rkazants Oct 3, 2025
72c7a97
Update tests/openvino/test_seq2seq.py
rkazants Oct 3, 2025
977647e
Added tests for quantization
rkazants Oct 3, 2025
3297035
Update tests/openvino/test_exporters_cli.py
rkazants Oct 3, 2025
d6f63d0
Merge remote-tracking branch 'remotes/upstream/main' into support_min…
rkazants Oct 3, 2025
e9e7e68
Update util tests with minicpmo
rkazants Oct 4, 2025
91239a9
Fix test_filtered_architectures for latest transformers
rkazants Oct 4, 2025
2bd642a
Update tests/openvino/test_exporters_cli.py
rkazants Oct 6, 2025
69571e9
Update tests/openvino/test_exporters_cli.py
rkazants Oct 6, 2025
d873661
Update optimum/exporters/openvino/model_configs.py
rkazants Oct 6, 2025
d14820a
Adjust reference for quantization tests
rkazants Oct 6, 2025
92c6325
Update tests/openvino/utils_tests.py
rkazants Oct 6, 2025
2c5861e
Update tests/openvino/test_exporters_cli.py
rkazants Oct 6, 2025
f70dfa0
Update docs/source/openvino/models.mdx
rkazants Oct 6, 2025
348ac8b
Update docs/source/openvino/models.mdx
rkazants Oct 6, 2025
8b4afa3
Update tests/openvino/test_quantization.py
rkazants Oct 6, 2025
34ad21d
Apply suggestion from @rkazants
rkazants Oct 7, 2025
681fb2d
Apply suggestion from @rkazants
rkazants Oct 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/openvino/models.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ Here is the list of the supported architectures :
- Marian
- MiniCPM
- MiniCPM3
- MiniCPM-o
- MiniCPMV
- Mistral
- Mixtral
Expand Down
2 changes: 1 addition & 1 deletion optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,7 @@ def export_from_model(
# some model configs may have issues with loading without parameters initialization
try:
misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
except (KeyError, TypeError):
except (AttributeError, KeyError, TypeError):
misplaced_generation_parameters = {}
if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
logger.warning(
Expand Down
6 changes: 6 additions & 0 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2682,6 +2682,12 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[
return super().patch_model_for_export(model, model_kwargs)


@register_in_tasks_manager("minicpmo", "image-text-to-text", library_name="transformers")
class MiniCPMOOpenVINOConfig(MiniCPMVOpenVINOConfig):
    """OpenVINO export config registered for the ``minicpmo`` model type.

    Registered for the ``image-text-to-text`` task only. All export behavior is
    inherited from ``MiniCPMVOpenVINOConfig``; this subclass only overrides the
    transformers version bounds below.
    """

    # Lower and upper transformers version bounds declared for this architecture.
    MIN_TRANSFORMERS_VERSION = "4.43.0"
    MAX_TRANSFORMERS_VERSION = "4.51.99"


class Phi3VisionConfigBehavior(str, enum.Enum):
LANGUAGE = "language"
VISION_PROJECTION = "vision_projection"
Expand Down
1 change: 1 addition & 0 deletions optimum/exporters/openvino/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ def get_submodels(model):
"phi4mm",
"phi4_multimodal",
"llama4",
"minicpmo",
]

SSM_MODELS = ["mamba", "falcon_mamba"]
Expand Down
31 changes: 31 additions & 0 deletions optimum/intel/openvino/modeling_visual_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -2114,6 +2114,36 @@ def preprocess_inputs(
return inputs


class _OVMiniCPMOForCausalLM(_OVMiniCPMVForCausalLM):
    """MiniCPM-o variant of the MiniCPM-V OpenVINO model that rejects audio inputs."""

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        image_sizes=None,
        attention_mask=None,
        audio_bounds=None,
        spk_bounds=None,
        audio_features=None,
        audio_feature_lens=None,
        **kwargs,
    ):
        """Validate that no audio is supplied, then delegate to the parent implementation.

        The audio-related arguments (``audio_bounds``, ``spk_bounds``,
        ``audio_features``, ``audio_feature_lens``) are accepted by name so they
        do not end up in ``kwargs``; none of them is forwarded to the parent.

        Raises:
            ValueError: if ``audio_features`` is not ``None`` and is non-empty.
        """
        # Audio modality is not supported for MiniCPMO
        audio_provided = audio_features is not None and len(audio_features) > 0
        if audio_provided:
            raise ValueError("Audio input is not supported for MiniCPMO")

        # Only the text/vision arguments are passed on to the parent class.
        forwarded_inputs = {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "inputs_embeds": inputs_embeds,
            "pixel_values": pixel_values,
            "image_sizes": image_sizes,
            "attention_mask": attention_mask,
        }
        return super().prepare_inputs_for_generation(**forwarded_inputs, **kwargs)


class _OVNanoLlavaForCausalLM(OVModelForVisualCausalLM):
def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
if input_ids is not None and input_ids.shape[1] == 1:
Expand Down Expand Up @@ -4355,4 +4385,5 @@ def preprocess_inputs(
"phi4mm": _OVPhi4MMForCausalLM,
"phi4_multimodal": _OVPhi4MMForCausalLM,
"llama4": _OVLlama4ForCausalLM,
"minicpmo": _OVMiniCPMOForCausalLM,
}
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
"langchain-huggingface",
"hf_xet",
"num2words",
"vocos",
"vector_quantize_pytorch",
]

QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]
Expand Down
14 changes: 13 additions & 1 deletion tests/openvino/test_exporters_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,18 @@ class OVCLIExportTestCase(unittest.TestCase):
"vision_embeddings_model": {"int8": 16},
},
),
(
"image-text-to-text",
"minicpmo",
'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
"--dataset contextual --num-samples 1 --trust-remote-code",
{
"lm_model": {"int8": 6, "int4": 10},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 8},
"resampler_model": {"int8": 6},
},
),
]

# filter models type depending on min max transformers version
Expand All @@ -754,7 +766,7 @@ def test_filtered_architectures(cls):
elif is_transformers_version("<", "4.52"):
expected = set()
else:
expected = {"llava-qwen2", "phi3_v", "phi4mm"}
expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"}

all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS}
Expand Down
26 changes: 25 additions & 1 deletion tests/openvino/test_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,27 @@ class OVWeightCompressionTest(unittest.TestCase):
"vision_embeddings_model": {"int8": 16},
},
),
(
OVModelForVisualCausalLM,
"minicpmo",
True,
dict(
bits=4,
group_size=16,
dataset="contextual",
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
num_samples=1,
processor=MODEL_NAMES["minicpmo"],
trust_remote_code=True,
),
{
"lm_model": {"int8": 4, "int4": 0},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 8},
"resampler_model": {"int8": 6},
},
),
]

# filter models type depending on min max transformers version
Expand Down Expand Up @@ -1037,6 +1058,9 @@ class OVWeightCompressionTest(unittest.TestCase):
if is_transformers_version("<", "4.54.0"):
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "llava-qwen2", True))

if is_transformers_version("<", "4.52.0"):
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmo", True))

SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [
(OVStableDiffusionPipeline, "stable-diffusion", 72, 195),
(OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331),
Expand All @@ -1058,7 +1082,7 @@ def test_filtered_architectures(cls):
elif is_transformers_version("<", "4.52"):
expected = set()
else:
expected = {"llava-qwen2", "phi3_v"}
expected = {"llava-qwen2", "phi3_v", "minicpmo"}

all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE}
Expand Down
21 changes: 16 additions & 5 deletions tests/openvino/test_seq2seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,13 +498,15 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"]
if is_transformers_version(">=", "4.51"):
SUPPORTED_ARCHITECTURES += ["llama4"]
if is_transformers_version("<", "4.52"):
SUPPORTED_ARCHITECTURES += ["minicpmo"]

if is_transformers_version(">=", "4.54.0"):
# remote code models differs after transformers v4.54
SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"}

TASK = "image-text-to-text"
REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "llava-qwen2", "phi3_v", "maira2", "phi4mm"]
REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"]

IMAGE = Image.open(
requests.get(
Expand Down Expand Up @@ -609,7 +611,7 @@ def test_compare_to_transformers(self, model_arch):
self._check_device_and_request(ov_model, test_device, False)

# pytorch minicpmv and internvl_chat are not designed to be used via forward
if model_arch not in ["minicpmv", "internvl_chat"]:
if model_arch not in ["minicpmv", "minicpmo", "internvl_chat"]:
set_seed(SEED)
ov_outputs = ov_model(**inputs)
set_seed(SEED)
Expand Down Expand Up @@ -654,12 +656,21 @@ def test_compare_to_transformers(self, model_arch):
transformers_inputs["past_key_values"] = DynamicCache()

with torch.no_grad():
if model_arch in ["minicpmo"]:
# `generate` method for minicpmo requires tokenizer
tokenizer = AutoTokenizer.from_pretrained(
model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
)
additional_inputs["tokenizer"] = tokenizer
transformers_outputs = transformers_model.generate(
**transformers_inputs, generation_config=gen_config, **additional_inputs
)
if model_arch in ["minicpmo"]:
# retrieve decoded tokens for comparison
transformers_outputs = transformers_outputs[1].sequences

# original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
if model_arch in ["minicpmv", "internvl_chat"]:
if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
self.assertTrue(
torch.equal(ov_outputs, transformers_outputs),
Expand All @@ -685,7 +696,7 @@ def test_compare_to_transformers(self, model_arch):
transformers_inputs = copy.deepcopy(inputs)
ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
# original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
if model_arch in ["minicpmv", "internvl_chat"]:
if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
with torch.no_grad():
transformers_outputs = transformers_model.generate(
Expand All @@ -703,7 +714,7 @@ def test_compare_to_transformers(self, model_arch):
transformers_inputs = copy.deepcopy(inputs)
ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
# original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
if model_arch in ["minicpmv", "internvl_chat"]:
if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
with torch.no_grad():
transformers_outputs = transformers_model.generate(
Expand Down
7 changes: 7 additions & 0 deletions tests/openvino/utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
"minicpm": "katuni4ka/tiny-random-minicpm",
"minicpm3": "katuni4ka/tiny-random-minicpm3",
"minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
"minicpmo": "rkazants/tiny-random-MiniCPM-o-2_6",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this model will slow down our ci greatly, it is 400MB 🫨
https://huggingface.co/rkazants/tiny-random-MiniCPM-o-2_6/tree/main

Copy link
Collaborator Author

@rkazants rkazants Oct 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is the smallest size I managed to achieve. minicpmv is about ~300MB and it is tested: https://huggingface.co/katuni4ka/tiny-random-minicpmv-2_6/tree/main

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be reduced as well

Copy link
Collaborator Author

@rkazants rkazants Oct 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reduced to 144MB. Minimal hidden_size is 128 for llm part: https://huggingface.co/rkazants/tiny-random-MiniCPM-o-2_6/blob/main/modeling_minicpmo.py#L209
That also impacts apm and tts module size.

@IlyasMoutawwakil, @echarlaix, I propose doing further reduction in follow-up PR(s) if there are any ideas. My other colleagues are waiting for this PR to be merged, so let us not block the merge because of the tiny model's size. We know that the implemented logic passes the tests in GHA.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Completely agree with @IlyasMoutawwakil's comment: we should be super careful with the size of our tiny random models so as not to slow down the CI. Could you expand on the constraints on the different model parameters, @rkazants? https://huggingface.co/rkazants/tiny-random-MiniCPM-o-2_6/blob/main/config.json#L20 For example, I see d_model / decoder_ffn_dim / encoder_ffn_dim set to 1024, 1024 and 4096 respectively.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, if the PR really needs to be merged asap, I'm ok with keeping this model, but I would like to have a follow-up PR to change it to a smaller model; or, if that cannot be done due to modeling constraints, I would like to have more information on what the constraints are / why it cannot be done. Would that sound reasonable, @rkazants?

Copy link
Collaborator Author

@rkazants rkazants Oct 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline with @echarlaix to proceed with the merge.
I will take this AR for further optimization. Indeed, there is room for optimization in parameters such as d_model and encoder_ffn_dim, but it will take some time because varying these parameter values requires adjusting several parameters from other modalities. It requires a somewhat deeper understanding of the model.
Thanks!

"mistral": "echarlaix/tiny-random-mistral",
"mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
"mixtral": "TitanML/tiny-mixtral",
Expand Down Expand Up @@ -327,6 +328,12 @@
"clip": {"model": 130},
"mamba": {"model": 386},
"falcon-mamba": {"model": 194},
"minicpmo": {
"lm_model": 16,
"text_embeddings_model": 1,
"vision_embeddings_model": 8,
"resampler_model": 6,
},
}

TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"
Expand Down
Loading