Skip to content

Commit 58c9af8

Browse files
authored
[wwb] Add possibility to check video inputs for VLM (openvinotoolkit#3074)
# Description WWB is updated to run VLM pipeline with video inputs. - New model type has been added for this purpose, so to configure the video input it's needed to run wwb with `--model-type visual-video-text`. - Default video data is data from dataset: lmms-lab/LLaVA-Video-178K , subset 30_60_s_academic_v0_1 , video from archive 30_60_s_academic_v0_1_videos_10.tar.gz (was chosen because of its weight - 274 MB). Archive includes 56 videos from different datasets: youcook2, NextQA, ego4d, Charades and activitynet. how to use: `optimum-cli export openvino -m Qwen/Qwen2-VL-7B-Instruct --weight-format int8 qwen2-vl-7b-Instruct` `python whowhatbench/wwb.py --base-model qwen2-vl-7b-Instruct --model-type visual-video-text --gt-data vlm_video_gt.csv` Ticket: CVS-173847 ## Checklist: - [X] Tests have been updated or added to cover the new code. <!-- If the change isn't maintenance related, update the tests at https://github.com/openvinotoolkit/openvino.genai/tree/master/tests or explain in the description why the tests don't need an update. --> - [X] This patch fully addresses the ticket. <!--- If follow-up pull requests are needed, specify in description. --> - [X] I have made corresponding changes to the documentation. <!-- Run github.com/\<username>/openvino.genai/actions/workflows/deploy_gh_pages.yml on your fork with your branch as a parameter to deploy a test version with the updated content. Replace this comment with the link to the built docs. -->
1 parent 1d5dde7 commit 58c9af8

File tree

7 files changed

+157
-43
lines changed

7 files changed

+157
-43
lines changed

tools/who_what_benchmark/README.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text --genai
5252

5353
> **NOTE**: use --verbose option for debug to see the outputs with the largest difference.
5454
55-
### Compare Visual Language Models (VLMs)
55+
### Compare Visual Language Models with image inputs (VLMs)
5656
```sh
5757
# Export FP16 model to OpenVINO
5858
optimum-cli export openvino -m llava-hf/llava-v1.6-mistral-7b-hf --weight-format int8 llava-int8
@@ -64,6 +64,18 @@ wwb --base-model llava-hf/llava-v1.6-mistral-7b-hf --gt-data llava_test/gt.csv -
6464
wwb --target-model llava-int8 --gt-data llava_test/gt.csv --model-type visual-text --genai
6565
```
6666

67+
### Compare Visual Language Models with video inputs (VLMs)
68+
```sh
69+
# Export FP16 model to OpenVINO
70+
optimum-cli export openvino -m Qwen/Qwen2-VL-7B-Instruct --weight-format int8 qwen2-vl-7b-Instruct
71+
# Collect the references and save the mapping in the .csv file.
72+
# Reference images will be stored in the "reference" subfolder under the same path with .csv.
73+
wwb --base-model Qwen/Qwen2-VL-7B-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --hf
74+
# Compute the metric
75+
# Target images will be stored in the "target" subfolder under the same path with .csv.
76+
wwb --target-model qwen2-vl-7b-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --genai
77+
```
78+
6779
### Compare Text-to-image models
6880
```sh
6981
# Export model with 8-bit quantized weights to OpenVINO

tools/who_what_benchmark/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ auto-gptq; sys_platform == "linux"
1212
autoawq<0.2.8; sys_platform == "linux"
1313
sentencepiece
1414
jinja2>=3.1.0
15-
scipy
15+
scipy
16+
opencv-python

tools/who_what_benchmark/tests/test_cli_vlm.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,3 +112,14 @@ def test_vlm_basic(model_id, model_type, tmp_path):
112112
)
113113
def test_vlm_nanollava(model_id, model_type, optimum_threshold, genai_threshold, tmp_path):
114114
run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path)
115+
116+
117+
@pytest.mark.parametrize(
118+
("model_id", "model_type"),
119+
[
120+
("katuni4ka/tiny-random-qwen2vl", "visual-video-text"),
121+
("katuni4ka/tiny-random-llava-next-video", "visual-video-text"),
122+
],
123+
)
124+
def test_vlm_video(model_id, model_type, tmp_path):
125+
run_test(model_id, model_type, 0.8, 0.8, tmp_path)

tools/who_what_benchmark/whowhatbench/model_loaders.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __init__(self, model, model_dir, model_type):
2424
self.model = model
2525
self.model_type = model_type
2626

27-
if model_type in ["text", "visual-text", "text-embedding", "text-reranking"]:
27+
if model_type in ["text", "visual-text", "visual-video-text", "text-embedding", "text-reranking"]:
2828
try:
2929
self.config = AutoConfig.from_pretrained(model_dir)
3030
except Exception:
@@ -321,7 +321,7 @@ def load_visual_text_genai_pipeline(model_dir, device="CPU", ov_config=None, **k
321321
return GenAIModelWrapper(
322322
pipeline,
323323
model_dir,
324-
"visual-text"
324+
kwargs.get("model_type", "visual-text")
325325
)
326326

327327

@@ -641,7 +641,8 @@ def load_model(
641641
return load_text2image_model(
642642
model_id, device, ov_options, use_hf, use_genai, **kwargs
643643
)
644-
elif model_type == "visual-text":
644+
elif model_type == "visual-text" or model_type == "visual-video-text":
645+
kwargs["model_type"] = model_type
645646
return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
646647
elif model_type == "image-to-image":
647648
return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)

tools/who_what_benchmark/whowhatbench/utils.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,25 @@
11
from typing import Union, Optional
22
from packaging.version import Version
3+
4+
import os
35
import json
46
import torch
7+
import random
58
import logging
9+
import tarfile
10+
import datasets
611
import itertools
712
import transformers
813

14+
import numpy as np
915
import pyarrow as pa
1016
import pyarrow.parquet as pq
1117

1218
from pathlib import Path
19+
from transformers import set_seed
1320
from contextlib import contextmanager
1421
from datasets.utils.file_utils import xopen
22+
from transformers.image_utils import load_image
1523

1624
logging.basicConfig(level=logging.INFO)
1725
logger = logging.getLogger(__name__)
@@ -138,6 +146,78 @@ def get_json_config(config):
138146
return json_config
139147

140148

149+
# prepare default dataset for visual-text (VLM) evaluator
150+
def preprocess_fn(example):
151+
return {
152+
"prompts": example["instruction"],
153+
"images": load_image(example["image_url"]),
154+
"videos": None,
155+
}
156+
157+
158+
def prepare_default_data_image(num_samples=None):
159+
DATASET_NAME = "ucla-contextual/contextual_test"
160+
NUM_SAMPLES = 24 if num_samples is None else num_samples
161+
set_seed(42)
162+
default_dataset = datasets.load_dataset(
163+
DATASET_NAME, split="test", streaming=True
164+
).shuffle(42).take(NUM_SAMPLES)
165+
return default_dataset.map(
166+
lambda x: preprocess_fn(x), remove_columns=default_dataset.column_names
167+
)
168+
169+
170+
def prepare_default_data_video(num_samples=None, num_frames=10):
171+
from huggingface_hub import hf_hub_download
172+
from transformers.video_utils import load_video
173+
174+
DATASET_NAME = "lmms-lab/LLaVA-Video-178K"
175+
SUBSET = "30_60_s_academic_v0_1"
176+
NUM_SAMPLES = 24 if num_samples is None else num_samples
177+
178+
questions_per_video_set = datasets.load_dataset(DATASET_NAME, SUBSET,
179+
split="open_ended",
180+
data_files={"open_ended": f"{SUBSET}/30_60_s_academic_oe_v0_1_qa_processed.json"})
181+
questions_per_video = {val['video']: val for val in questions_per_video_set}
182+
183+
# 30_60_s_academic_v0_1_videos_10.tar.gz - just the most lightweight chunk among subset
184+
# https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K/tree/main/30_60_s_academic_v0_1
185+
# the archive contains 56 videos
186+
videos_arc_path = hf_hub_download(repo_id="lmms-lab/LLaVA-Video-178K",
187+
filename=f"{SUBSET}/{SUBSET}_videos_10.tar.gz",
188+
repo_type="dataset")
189+
190+
video_samples = []
191+
extract_dir = "./videos"
192+
os.makedirs(extract_dir, exist_ok=True)
193+
with tarfile.open(videos_arc_path, "r:gz") as tar:
194+
all_videos = tar.getnames()
195+
196+
if len(all_videos) < NUM_SAMPLES:
197+
logger.warning(f"The required number of samples {NUM_SAMPLES} exceeds the available amount of data {len(all_videos)}."
198+
f"num-samples will be updated to max available: {len(all_videos)}.")
199+
NUM_SAMPLES = len(all_videos)
200+
201+
video_samples = random.Random(42).sample(all_videos, NUM_SAMPLES) # nosec
202+
for sample in video_samples:
203+
tar.extract(sample, path=extract_dir)
204+
205+
# if num_frames < total_num_frames, sample each total_num_frames/num_frames frames or sample all frames
206+
def default_sample_indices_fn(metadata, **kwargs):
207+
total_num_frames = metadata.total_num_frames
208+
if num_frames < total_num_frames:
209+
return np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
210+
return np.arange(0, total_num_frames, dtype=int)
211+
212+
data = []
213+
for video_rel_path in video_samples:
214+
video_tensor = load_video(os.path.join(extract_dir, video_rel_path), backend="opencv", sample_indices_fn=default_sample_indices_fn)
215+
prompt = questions_per_video[video_rel_path]['conversations'][0]['value'].replace("<image>\n", "")
216+
data.append({'prompts': prompt, "images": None, 'videos': video_tensor[0]})
217+
218+
return data
219+
220+
141221
# for patching function datasets.packaged_modules.parquet.parquet.Parquet._generate_tables
142222
# according to code: https://github.com/huggingface/datasets/issues/7357#issuecomment-3354047772
143223
def parquet_generate_tables(self, files, *args, **kwargs):

tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,16 @@
1-
from typing import Any, Union
2-
31
import os
4-
import datasets
2+
53
import pandas as pd
6-
from transformers.image_utils import load_image
4+
75
from tqdm import tqdm
8-
from transformers import set_seed
6+
from itertools import zip_longest
7+
from typing import Literal, Any, Union
98

109
from .registry import register_evaluator
1110
from .text_evaluator import TextEvaluator
12-
from .utils import get_ignore_parameters_flag
13-
14-
15-
def preprocess_fn(example):
16-
return {
17-
"prompts": example["instruction"],
18-
"images": load_image(example["image_url"]),
19-
}
20-
11+
from .utils import get_ignore_parameters_flag, prepare_default_data_image, prepare_default_data_video
2112

22-
def prepare_default_data(num_samples=None):
23-
DATASET_NAME = "ucla-contextual/contextual_test"
24-
NUM_SAMPLES = 24 if num_samples is None else num_samples
25-
set_seed(42)
26-
default_dataset = datasets.load_dataset(
27-
DATASET_NAME, split="test", streaming=True
28-
).shuffle(42).take(NUM_SAMPLES)
29-
return default_dataset.map(
30-
lambda x: preprocess_fn(x), remove_columns=default_dataset.column_names
31-
)
13+
DEF_VIDEO_FRAMES_AMOUNT = 10
3214

3315

3416
def fix_phi3_v_eos_token_id(model_type, tokenizer):
@@ -44,7 +26,7 @@ def fix_phi3_v_eos_token_id(model_type, tokenizer):
4426
return dict()
4527

4628

47-
@register_evaluator("visual-text")
29+
@register_evaluator("visual-text", "visual-video-text")
4830
class VisualTextEvaluator(TextEvaluator):
4931
def __init__(
5032
self,
@@ -60,8 +42,12 @@ def __init__(
6042
gen_answer_fn=None,
6143
generation_config=None,
6244
seqs_per_request=None,
45+
task_type: Literal['visual-text', 'visual-video-text'] = "visual-text",
46+
frames_num: int | None = None,
6347
) -> None:
6448
self.processor = processor
49+
self.is_image_input = (task_type == "visual-text")
50+
self.frames_num = frames_num or DEF_VIDEO_FRAMES_AMOUNT
6551
super().__init__(
6652
base_model=base_model,
6753
tokenizer=tokenizer,
@@ -124,15 +110,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
124110

125111
def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
126112
def default_gen_answer(
127-
model, prompt, image, processor, tokenizer, max_new_tokens, crop_question
113+
model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question
128114
):
129115

130116
from optimum.intel.openvino.modeling_visual_language import \
131117
MODEL_TYPE_TO_CLS_MAPPING
132118
preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING[
133119
model.config.model_type
134120
].preprocess_inputs
135-
inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config)
121+
inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config, video=video)
136122
tokens = model.generate(
137123
**inputs,
138124
**fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
@@ -160,24 +146,29 @@ def default_gen_answer(
160146
if isinstance(self.test_data, dict):
161147
assert "prompts" in self.test_data
162148
assert "images" in self.test_data
149+
assert "videos" in self.test_data
163150
data = dict(self.test_data)
164151
data = pd.DataFrame.from_dict(data)
165152
else:
166-
data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
153+
input_data = prepare_default_data_image(self.num_samples) if self.is_image_input else prepare_default_data_video(self.num_samples, self.frames_num)
154+
data = pd.DataFrame.from_dict(input_data)
167155

168156
prompt_data = data["prompts"]
169157
image_data = data["images"]
158+
videos_data = data["videos"]
170159

171160
answers = []
172161
prompts = prompt_data.values
173162
images = image_data.values
163+
videos = videos_data.values
174164

175-
for p, i in tqdm(zip(prompts, images), desc="Evaluate pipeline"):
165+
for p, i, v in tqdm(zip_longest(prompts, images, videos), desc="Evaluate pipeline"):
176166
answers.append(
177167
gen_answer_fn(
178168
model,
179169
p,
180170
i,
171+
v,
181172
self.processor,
182173
self.tokenizer,
183174
self.max_new_tokens,

tools/who_what_benchmark/whowhatbench/wwb.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,11 @@ def parse_args():
6363
parser.add_argument(
6464
"--model-type",
6565
type=str,
66-
choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
66+
choices=["text", "text-to-image", "visual-text", "visual-video-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
6767
default="text",
6868
help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, "
69-
"visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt "
69+
"visual-text - for Visual Language Models with image inputs, visual-video-text - for Visual Language Models with video inputs, "
70+
"image-to-image - for image generation based on image and prompt "
7071
"image-inpainting - for image generation based on image, mask and prompt, text-reranking - for reranking a list of texts based on relevance to query",
7172
)
7273
parser.add_argument(
@@ -267,6 +268,14 @@ def parse_args():
267268
default=None,
268269
help="Config option assistant_confidence_threshold for Speculative decoding.",
269270
)
271+
parser.add_argument(
272+
"--video-frames-num",
273+
type=int,
274+
default=None,
275+
help="The number of frames that will be taken from video for input, the frames will be taken evenly across the entire length, "
276+
"applicable for Visual Language Models with video inputs",
277+
)
278+
270279
return parser.parse_args()
271280

272281

@@ -508,15 +517,22 @@ def genai_gen_inpainting(model, prompt, image, mask, num_inference_steps, genera
508517
return image
509518

510519

511-
def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_tokens, crop_question):
512-
image_data = ov.Tensor(np.array(image)[None])
520+
def genai_gen_visual_text(model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question):
521+
kwargs = {
522+
"do_sample": False,
523+
"max_new_tokens": max_new_tokens
524+
}
525+
if image is not None:
526+
kwargs['image'] = ov.Tensor(np.array(image)[None])
527+
if video is not None:
528+
kwargs['videos'] = [ov.Tensor(np.array(video))]
529+
513530
out = model.generate(
514531
prompt,
515532
**fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
516-
image=image_data,
517-
do_sample=False,
518-
max_new_tokens=max_new_tokens
533+
**kwargs
519534
)
535+
520536
return out.texts[0]
521537

522538

@@ -589,7 +605,7 @@ def create_evaluator(base_model, args):
589605
is_genai=args.genai,
590606
seed=args.seed,
591607
)
592-
elif task == "visual-text":
608+
elif task == "visual-text" or task == "visual-video-text":
593609
processor, config = load_processor(args)
594610
tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else load_tokenizer(args)
595611
if config and is_model_with_automatic_crop(config) and args.hf:
@@ -606,6 +622,8 @@ def create_evaluator(base_model, args):
606622
gen_answer_fn=genai_gen_visual_text if args.genai else None,
607623
processor=processor,
608624
crop_question=crop_question,
625+
task_type=task,
626+
frames_num=args.video_frames_num
609627
)
610628
elif task == "image-to-image":
611629
return EvaluatorCLS(
@@ -828,7 +846,7 @@ def main():
828846
evaluator.dump_predictions(os.path.join(args.output, "target.csv"))
829847

830848
if args.verbose and (args.target_model or args.target_data):
831-
if args.model_type == "text" or args.model_type == "visual-text":
849+
if args.model_type in ["text", "visual-text", "visual-video-text"]:
832850
print_text_results(evaluator)
833851
elif "text-to-image" in args.model_type or "image-to-image" in args.model_type:
834852
print_image_results(evaluator)

0 commit comments

Comments
 (0)