forked from openvinotoolkit/openvino.genai
-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathbenchmark_vlm.py
More file actions
executable file
·140 lines (115 loc) · 5.82 KB
/
benchmark_vlm.py
File metadata and controls
executable file
·140 lines (115 loc) · 5.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
# Copyright (C) 2023-2026 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import sys
import argparse
import openvino_genai as ov_genai
from PIL import Image
from openvino import Tensor
from pathlib import Path
import numpy as np
from openvino import get_version
def read_image(path: str) -> Tensor:
    '''
    Load a single image file and wrap it in an OpenVINO tensor.

    Args:
        path: The path to the image.
    Returns: the ov.Tensor containing the image (RGB pixel data).
    '''
    rgb_image = Image.open(path).convert("RGB")
    return Tensor(np.array(rgb_image))
def read_images(path: str) -> list[Tensor]:
    '''
    Load one image, or every file in a directory, as ov.Tensor objects.

    If *path* is a directory, each contained file is read in sorted order;
    otherwise *path* itself is read as a single image.
    '''
    source = Path(path)
    if not source.is_dir():
        return [read_image(path)]
    return [read_image(str(child)) for child in sorted(source.iterdir())]
def ratio_type(value):
    '''argparse type: parse *value* as an integer percentage in [0, 100].'''
    parsed = int(value)
    if 0 <= parsed <= 100:
        return parsed
    raise argparse.ArgumentTypeError(f"pruning_ratio must be between 0 and 100, got {value}")
def weight_0_1(value):
    '''argparse type: parse *value* as a float in the closed range [0, 1].'''
    parsed = float(value)
    if parsed < 0.0 or parsed > 1.0:
        raise argparse.ArgumentTypeError(f"relevance_weight must be between 0 and 1, got {value}")
    return parsed
def main():
    '''
    Benchmark an OpenVINO GenAI VLM pipeline.

    Parses command-line options, builds a VLMPipeline for the chosen device,
    runs warmup plus measured generate() iterations, and prints aggregated
    performance metrics (load time, TTFT, TPOT, throughput, ...).

    Raises:
        RuntimeError: if both --prompt and --prompt_file are given, or the
            resulting prompt is empty.
    '''
    parser = argparse.ArgumentParser(description="Help command")
    parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
    parser.add_argument("-p", "--prompt", type=str, default=None, help="Prompt")
    parser.add_argument("-pf", "--prompt_file", type=str, help="Read prompt from file")
    parser.add_argument("-i", "--image", type=str, default="image.jpg", help="Image")
    parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
    parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
    parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
    parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")
    parser.add_argument(
        "--pruning_ratio",
        type=ratio_type,
        default=0,
        help="(optional): Percentage of visual tokens to prune (valid range: 0-100). If this option is not provided, pruning is disabled.",
    )
    parser.add_argument(
        "--relevance_weight",
        type=weight_0_1,
        help="(optional): Float value from 0 to 1, control the trade-off between diversity and relevance for visual tokens pruning, "
        "a value of 0 disables relevance weighting, while higher values (up to 1.0) emphasize relevance, making pruning more conservative on borderline tokens.",
    )
    args = parser.parse_args()

    # The two prompt sources are mutually exclusive.
    if args.prompt is not None and args.prompt_file is not None:
        raise RuntimeError('Prompt and prompt file should not exist together!')
    if args.prompt_file is not None:
        with open(args.prompt_file, 'r', encoding='utf-8') as f:
            prompt = f.read()
    else:
        prompt = 'What is on the image?' if args.prompt is None else args.prompt
    if len(prompt) == 0:
        raise RuntimeError('Prompt is empty!')

    print(f'openvino runtime version: {get_version()}, genai version: {ov_genai.__version__}')

    # Perf metrics is stored in VLMDecodedResults.
    # In order to get VLMDecodedResults instead of a string input should be a list.
    models_path = args.model
    images = read_images(args.image)
    device = args.device
    num_warmup = args.num_warmup
    num_iter = args.num_iter

    config = ov_genai.GenerationConfig()
    config.max_new_tokens = args.max_new_tokens
    # pruning_ratio defaults to 0 (never None), so it is always forwarded;
    # relevance_weight is only set when explicitly requested on the CLI.
    if args.pruning_ratio is not None:
        config.pruning_ratio = args.pruning_ratio
    if args.relevance_weight is not None:
        config.relevance_weight = args.relevance_weight

    if device == "NPU":
        pipe = ov_genai.VLMPipeline(models_path, device)
    else:
        # Setting of Scheduler config will trigger usage of ContinuousBatching pipeline, which is not default for Qwen2VL, Qwen2.5VL, Gemma3 due to accuracy issues.
        scheduler_config = ov_genai.SchedulerConfig()
        scheduler_config.enable_prefix_caching = False
        scheduler_config.max_num_batched_tokens = sys.maxsize
        pipe = ov_genai.VLMPipeline(models_path, device, scheduler_config=scheduler_config)

    input_data = pipe.get_tokenizer().encode(prompt)
    prompt_token_size = input_data.input_ids.get_shape()[1]
    print(f"Number of images:{len(images)}, Prompt token size: {prompt_token_size}")

    # Warmup runs are not measured; their metrics are discarded.
    for _ in range(num_warmup):
        pipe.generate(prompt, images=images, generation_config=config)

    # First measured run seeds the accumulator; remaining runs are folded in
    # via perf_metrics.__iadd__ so means/stds cover all num_iter iterations.
    res = pipe.generate(prompt, images=images, generation_config=config)
    perf_metrics = res.perf_metrics
    for _ in range(num_iter - 1):
        res = pipe.generate(prompt, images=images, generation_config=config)
        perf_metrics += res.perf_metrics

    print(f"Output token size: {res.perf_metrics.get_num_generated_tokens()}")
    print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
    print(
        f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
    print(
        f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
    print(
        f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms")
    print(
        f"Embeddings preparation time: {perf_metrics.get_prepare_embeddings_duration().mean:.2f} ± {perf_metrics.get_prepare_embeddings_duration().std:.2f} ms")
    print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
    print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms")
    print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()