Skip to content

Commit ce32380

Browse files
authored
[wwb] Add custom processor for qwen3_vl (openvinotoolkit#3487)
## Description The new model should avoid using the preprocessing from optimum-intel. <!-- Jira ticket number (e.g., 123). Delete if there's no ticket. --> as part of the task [CVS-175205](https://jira.devtools.intel.com/browse/CVS-175205) ## Checklist: - [x] This PR follows [GenAI Contributing guidelines](https://github.com/openvinotoolkit/openvino.genai?tab=contributing-ov-file#contributing). <!-- Always follow them. If there are deviations, explain what and why. --> - [ ] Tests have been updated or added to cover the new code. <!-- Specify exactly which tests were added or updated. If the change isn't maintenance related, update the tests at https://github.com/openvinotoolkit/openvino.genai/tree/master/tests or explain in the description why the tests don't need an update. --> - [ ] This PR fully addresses the ticket. <!--- If not, explain clearly what is covered and what is not. If follow-up pull requests are needed, specify in the description. --> - [ ] I have made corresponding changes to the documentation. <!-- Run github.com/\<username>/openvino.genai/actions/workflows/deploy_gh_pages.yml on your fork with your branch as a parameter to deploy a test version with the updated content. Replace this comment with the link to the built docs. If the documentation is updated in a separate PR, clearly specify it. -->
1 parent 589d6e3 commit ce32380

2 files changed

Lines changed: 108 additions & 18 deletions

File tree

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import numpy as np
2+
from transformers import (
3+
AutoImageProcessor,
4+
PretrainedConfig,
5+
PreTrainedTokenizer,
6+
)
7+
from abc import ABC, abstractmethod
8+
from packaging.version import Version
9+
from typing import TYPE_CHECKING, Optional
10+
11+
from transformers import __version__
12+
13+
if TYPE_CHECKING:
14+
from PIL.Image import Image
15+
from transformers.image_utils import VideoInput
16+
17+
18+
TRANSFORMERS_VERSION = Version(__version__)
19+
20+
21+
def fix_phi3_v_eos_token_id(model_type, tokenizer):
    """Return a generation-config override for phi3_v models.

    phi3_v configs aren't consistent, so for that model type the
    tokenizer's eos_token_id is used instead of the config default,
    mirroring the example at
    https://huggingface.co/microsoft/Phi-3.5-vision-instruct
    Any other model type gets an empty override dict.
    """
    if model_type != "phi3_v":
        return {}
    return {"eos_token_id": tokenizer.eos_token_id}
32+
33+
34+
class VLMInputsPreprocessor(ABC):
    """Base class for model-specific VLM input preprocessing.

    Holds per-session state (images, videos, chat history) and defines
    the interface that concrete preprocessors must implement.
    """

    def __init__(self, chat_mode: bool = False):
        # Session state for subclasses; chat_mode toggles multi-turn handling.
        self.chat_mode = chat_mode
        self.chat_history = []
        self.videos = []
        self.images = None

    @abstractmethod
    def preprocess_inputs(
        self,
        text: str,
        image: Optional["Image"] = None,
        processor: Optional[AutoImageProcessor] = None,
        tokenizer: Optional[PreTrainedTokenizer] = None,
        config: Optional[PretrainedConfig] = None,
        video: Optional["VideoInput"] = None,
        audio: Optional[np.ndarray] = None,
    ):
        """Build model-ready inputs from text plus optional media."""
        return None

    @abstractmethod
    def update_chat_history_with_answer(self, answer):
        """Record the assistant's answer in the chat history."""
        pass
57+
58+
59+
class Qwen3VLInputsPreprocessor(VLMInputsPreprocessor):
    """Input preprocessor for qwen3_vl models using the HF chat template."""

    def __init__(self, chat_mode: bool = False):
        super().__init__(chat_mode)

    def update_chat_history_with_answer(self, answer):
        """Append the assistant's reply to the tracked chat history."""
        self.chat_history.append({"role": "assistant", "content": answer})

    def preprocess_inputs(
        self,
        text: str,
        image: Optional["Image"] = None,
        processor: Optional[AutoImageProcessor] = None,
        tokenizer: Optional[PreTrainedTokenizer] = None,
        config: Optional[PretrainedConfig] = None,
        video: Optional["VideoInput"] = None,
        audio: Optional[np.ndarray] = None,
    ):
        """Tokenize *text* plus optional image/video via the processor's chat template.

        Raises ValueError when no processor is given or audio is supplied
        (audio is unsupported for this model).
        """
        if processor is None:
            raise ValueError("Processor is required.")
        if audio is not None:
            raise ValueError("Audio input is not supported")

        # Media entries go before the text entry, matching the expected
        # content ordering of the chat template.
        content = [{"type": "text", "text": text}]
        if image is not None:
            content.insert(0, {"type": "image", "image": image})
        if video is not None:
            content.insert(0, {"type": "video", "video": video})
        conversation = [{"role": "user", "content": content}]

        return processor.apply_chat_template(
            conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
        )
97+
98+
99+
# Model types with a custom preprocessor; callers fall back to the
# optimum-intel mapping for model types not listed here.
MODEL_TYPE_TO_CLS_MAPPING = {"qwen3_vl": Qwen3VLInputsPreprocessor}

tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,11 @@
1212
from .registry import register_evaluator
1313
from .text_evaluator import TextEvaluator
1414
from .utils import get_ignore_parameters_flag, prepare_default_data_image, prepare_default_data_video
15+
from .visual_utils import fix_phi3_v_eos_token_id, MODEL_TYPE_TO_CLS_MAPPING
1516

1617
DEF_VIDEO_FRAMES_AMOUNT = 10
1718

1819

19-
def fix_phi3_v_eos_token_id(model_type, tokenizer):
20-
"""
21-
phi3_v configs aren't consistent. Override the default
22-
eos_token_id with the one from a tokenizer similar to
23-
an example in
24-
https://huggingface.co/microsoft/Phi-3.5-vision-instruct
25-
"""
26-
if 'phi3_v' == model_type:
27-
return {"eos_token_id": tokenizer.eos_token_id}
28-
else:
29-
return dict()
30-
31-
3220
@register_evaluator("visual-text", "visual-video-text")
3321
class VisualTextEvaluator(TextEvaluator):
3422
def __init__(
@@ -128,12 +116,15 @@ def default_gen_answer(
128116
pruning_ratio,
129117
relevance_weight,
130118
):
119+
if model.config.model_type in MODEL_TYPE_TO_CLS_MAPPING and "transformers" in str(type(model)):
120+
inputs_processor = MODEL_TYPE_TO_CLS_MAPPING[model.config.model_type]()
121+
preprocess_inputs = inputs_processor.preprocess_inputs
122+
else:
123+
from optimum.intel.openvino.modeling_visual_language import (
124+
MODEL_TYPE_TO_CLS_MAPPING as MODEL_TYPE_TO_CLS_MAPPING_OPT,
125+
)
131126

132-
from optimum.intel.openvino.modeling_visual_language import \
133-
MODEL_TYPE_TO_CLS_MAPPING
134-
preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING[
135-
model.config.model_type
136-
].preprocess_inputs
127+
preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING_OPT[model.config.model_type].preprocess_inputs
137128
inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config, video=video)
138129
tokens = model.generate(
139130
**inputs,

0 commit comments

Comments
 (0)