16 changes: 13 additions & 3 deletions optimum/exporters/openvino/model_configs.py
@@ -2625,12 +2625,22 @@ def with_behavior(
         """
         if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
             behavior = MiniCPMVConfigBehavior(behavior)
 
+        model_mapping = {2.6: "qwen2", 4.0: "llama", 4.5: "qwen3"}
         if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
-            return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
+            return get_vlm_text_embeddings_config(
+                model_mapping[self._orig_config.version],
+                self._orig_config,
+                self.int_dtype,
+                self.float_dtype,
+            )
 
         if behavior == MiniCPMVConfigBehavior.LANGUAGE:
-            return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
+            return get_vlm_text_generation_config(
+                model_mapping[self._orig_config.version],
+                self._orig_config,
+                self.int_dtype,
+                self.float_dtype,
+            )
 
         if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
             return self.__class__(
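The dispatch keys off the `version` field of the original MiniCPM-V config: 2.6 is built on a Qwen2 LLM (matching the previously hard-coded "qwen2"), 4.5 on Qwen3, and 4.0 on the llama-style MiniCPM4 backbone. A minimal sketch of the lookup, using a hypothetical stand-in for `self._orig_config`:

```python
from types import SimpleNamespace

# Version -> base-LLM export config, as in the hunk above.
model_mapping = {2.6: "qwen2", 4.0: "llama", 4.5: "qwen3"}

# Hypothetical stand-in for self._orig_config; the real object is the
# model's original Hugging Face config.
orig_config = SimpleNamespace(version=4.5)

base_llm = model_mapping.get(orig_config.version)
if base_llm is None:
    raise ValueError(f"Unsupported MiniCPM-V version: {orig_config.version}")
print(base_llm)  # -> "qwen3"
```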
22 changes: 22 additions & 0 deletions tests/openvino/test_exporters_cli.py
@@ -635,6 +635,28 @@ class OVCLIExportTestCase(unittest.TestCase):
                 "resampler_model": {"int8": 6},
             },
         ),
+        (
+            "image-text-to-text",
+            "minicpmv4",
+            "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+            {
+                "lm_model": {"int8": 12, "int4": 18},
+                "text_embeddings_model": {"int8": 1},
+                "vision_embeddings_model": {"int8": 14},
+                "resampler_model": {"int8": 6},
+            },
+        ),
+        (
+            "image-text-to-text",
+            "minicpmv4_5",
+            "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+            {
+                "lm_model": {"int8": 12, "int4": 18},
+                "text_embeddings_model": {"int8": 1},
+                "vision_embeddings_model": {"int8": 14},
+                "resampler_model": {"int8": 6},
+            },
+        ),
         (
             "image-text-to-text",
             "internvl_chat",
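Each tuple above parameterizes one `optimum-cli export openvino` run; a rough sketch of the command behind the `minicpmv4` entry (the output directory name is arbitrary, and the model ID is the tiny test checkpoint registered in `utils_tests.py`):

```python
import subprocess

# Approximate CLI invocation exercised by the "minicpmv4" test entry; the
# option string in the tuple maps onto --weight-format and related flags.
cmd = [
    "optimum-cli", "export", "openvino",
    "--model", "snake7gun/minicpm-v-4-tiny",
    "--task", "image-text-to-text",
    "--weight-format", "int4",
    "--group-size", "4",
    "--ratio", "0.8",
    "--trust-remote-code",
    "minicpmv4_ov",  # arbitrary output directory
]
subprocess.run(cmd, check=True)
```

The expected-ops dict in each tuple then records how many int8/int4 weight nodes every exported submodel should contain after compression.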
44 changes: 44 additions & 0 deletions tests/openvino/test_quantization.py
@@ -1027,6 +1027,48 @@ class OVWeightCompressionTest(unittest.TestCase):
                 "resampler_model": {"int8": 6},
             },
         ),
+        (
+            OVModelForVisualCausalLM,
+            "minicpmv4",
+            True,
+            dict(
+                bits=4,
+                group_size=16,
+                dataset="contextual",
+                ratio=0.8,
+                sensitivity_metric="mean_activation_magnitude",
+                num_samples=1,
+                processor=MODEL_NAMES["minicpmv"],
+                trust_remote_code=True,
+            ),
+            {
+                "lm_model": {"int8": 8, "int4": 22},
+                "text_embeddings_model": {"int8": 1},
+                "vision_embeddings_model": {"int8": 26},
+                "resampler_model": {"int8": 6},
+            },
+        ),
+        (
+            OVModelForVisualCausalLM,
+            "minicpmv4_5",
+            True,
+            dict(
+                bits=4,
+                group_size=16,
+                dataset="contextual",
+                ratio=0.8,
+                sensitivity_metric="mean_activation_magnitude",
+                num_samples=1,
+                processor=MODEL_NAMES["minicpmv"],
+                trust_remote_code=True,
+            ),
+            {
+                "lm_model": {"int8": 8, "int4": 22},
+                "text_embeddings_model": {"int8": 1},
+                "vision_embeddings_model": {"int8": 26},
+                "resampler_model": {"int8": 6},
+            },
+        ),
     ]
 
     # filter models type depending on min max transformers version
@@ -1053,6 +1095,8 @@ class OVWeightCompressionTest(unittest.TestCase):
         (OVModelForVisualCausalLM, "llava_next_video", False),
         (OVModelForVisualCausalLM, "minicpmv", True),
         (OVModelForVisualCausalLM, "qwen2_vl", False),
+        (OVModelForVisualCausalLM, "minicpmv4", False),
+        (OVModelForVisualCausalLM, "minicpmv4_5", False),
     ]
 
     if is_transformers_version("<", "4.54.0"):
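The `dict(...)` in each entry mirrors the keyword arguments of `OVWeightQuantizationConfig`, so the API-level equivalent of the `minicpmv4` case looks roughly like this sketch (checkpoint IDs are the tiny test models; nothing here is part of the test itself):

```python
from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

# Data-aware 4-bit compression matching the "minicpmv4" entry: a built-in
# "contextual" calibration set, a mixed int4/int8 ratio, and an
# activation-based sensitivity metric.
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    group_size=16,
    dataset="contextual",
    ratio=0.8,
    sensitivity_metric="mean_activation_magnitude",
    num_samples=1,
    processor="katuni4ka/tiny-random-minicpmv-2_6",  # MODEL_NAMES["minicpmv"]
    trust_remote_code=True,
)
model = OVModelForVisualCausalLM.from_pretrained(
    "snake7gun/minicpm-v-4-tiny",  # MODEL_NAMES["minicpmv4"]
    export=True,
    quantization_config=quantization_config,
    trust_remote_code=True,
)
```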
12 changes: 6 additions & 6 deletions tests/openvino/test_seq2seq.py
@@ -497,7 +497,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     if is_transformers_version(">", "4.49"):
         SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"]
     if is_transformers_version(">=", "4.51"):
-        SUPPORTED_ARCHITECTURES += ["llama4"]
+        SUPPORTED_ARCHITECTURES += ["llama4", "minicpmv4", "minicpmv4_5"]
     if is_transformers_version("<", "4.52"):
         SUPPORTED_ARCHITECTURES += ["minicpmo"]
 
@@ -506,7 +506,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
         SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"}
 
     TASK = "image-text-to-text"
-    REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"]
+    REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"]
 
     IMAGE = Image.open(
         requests.get(
@@ -611,7 +611,7 @@ def test_compare_to_transformers(self, model_arch):
         self._check_device_and_request(ov_model, test_device, False)
 
         # pytorch minicpmv and internvl_chat are not designed to be used via forward
-        if model_arch not in ["minicpmv", "minicpmo", "internvl_chat"]:
+        if model_arch not in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
             set_seed(SEED)
             ov_outputs = ov_model(**inputs)
             set_seed(SEED)
@@ -670,7 +670,7 @@ def test_compare_to_transformers(self, model_arch):
             transformers_outputs = transformers_outputs[1].sequences
 
         # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
+        if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         self.assertTrue(
             torch.equal(ov_outputs, transformers_outputs),
@@ -696,7 +696,7 @@ def test_compare_to_transformers(self, model_arch):
             transformers_inputs = copy.deepcopy(inputs)
             ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
             # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-            if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
+            if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
                 ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
             with torch.no_grad():
                 transformers_outputs = transformers_model.generate(
@@ -714,7 +714,7 @@ def test_compare_to_transformers(self, model_arch):
             transformers_inputs = copy.deepcopy(inputs)
             ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
             # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-            if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
+            if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
                 ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
             with torch.no_grad():
                 transformers_outputs = transformers_model.generate(
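The recurring `ov_outputs[:, inputs["input_ids"].shape[1] :]` slice reconciles two output conventions: the remote-code minicpmv-family and internvl models return only the newly generated tokens, while the transformers-based path returns the prompt followed by the generated tokens. A self-contained illustration of the slice:

```python
import torch

prompt_ids = torch.tensor([[101, 102, 103]])            # 3 prompt tokens
full_output = torch.tensor([[101, 102, 103, 7, 8, 9]])  # prompt + 3 new tokens

# Strip the echoed prompt so both conventions compare token-for-token.
generated_only = full_output[:, prompt_ids.shape[1] :]
assert torch.equal(generated_only, torch.tensor([[7, 8, 9]]))
```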
14 changes: 14 additions & 0 deletions tests/openvino/utils_tests.py
@@ -116,6 +116,8 @@
     "minicpm": "katuni4ka/tiny-random-minicpm",
     "minicpm3": "katuni4ka/tiny-random-minicpm3",
     "minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
+    "minicpmv4": "snake7gun/minicpm-v-4-tiny",
+    "minicpmv4_5": "tiny-random/minicpm-v-4_5",
     "minicpmo": "rkazants/tiny-random-MiniCPM-o-2_6",
     "mistral": "echarlaix/tiny-random-mistral",
     "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
@@ -285,6 +287,18 @@
         "vision_embeddings_model": 26,
         "resampler_model": 6,
     },
+    "minicpmv4": {
+        "lm_model": 30,
+        "text_embeddings_model": 1,
+        "vision_embeddings_model": 14,
+        "resampler_model": 6,
+    },
+    "minicpmv4_5": {
+        "lm_model": 30,
+        "text_embeddings_model": 1,
+        "vision_embeddings_model": 14,
+        "resampler_model": 6,
+    },
     "llava_next_video": {
         "lm_model": 30,
         "text_embeddings_model": 1,
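For reference, the tiny checkpoints registered in `MODEL_NAMES` are what the tests feed to the exporter; loading one directly would look roughly like the sketch below (assuming the checkpoint also exports cleanly outside the test harness).

```python
from optimum.intel import OVModelForVisualCausalLM

# MODEL_NAMES["minicpmv4"]; exporting yields the four submodels whose op
# counts are listed above: lm, text embeddings, vision embeddings, resampler.
model = OVModelForVisualCausalLM.from_pretrained(
    "snake7gun/minicpm-v-4-tiny",
    export=True,
    trust_remote_code=True,
)
```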