forked from openvinotoolkit/openvino.genai
-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathbenchmark_vlm.py
More file actions
executable file
·140 lines (115 loc) · 5.82 KB
/
benchmark_vlm.py
File metadata and controls
executable file
·140 lines (115 loc) · 5.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
# Copyright (C) 2023-2026 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import sys
import argparse
import openvino_genai as ov_genai
from PIL import Image
from openvino import Tensor
from pathlib import Path
import numpy as np
from openvino import get_version
def read_image(path: str) -> Tensor:
    '''
    Load a single image file and wrap it in an OpenVINO tensor.

    Args:
        path: The path to the image.
    Returns: the ov.Tensor containing the image (RGB pixel data).
    '''
    rgb_image = Image.open(path).convert("RGB")
    return Tensor(np.array(rgb_image))
def read_images(path: str) -> list[Tensor]:
    '''
    Load one image, or every file in a directory, as ov.Tensor objects.

    If *path* is a directory, each contained file is read in sorted order;
    otherwise *path* itself is read as a single image.
    '''
    source = Path(path)
    if not source.is_dir():
        return [read_image(path)]
    return [read_image(str(child)) for child in sorted(source.iterdir())]
def ratio_type(value):
    '''argparse type: parse *value* as an integer percentage in [0, 100].'''
    parsed = int(value)
    if 0 <= parsed <= 100:
        return parsed
    raise argparse.ArgumentTypeError(f"pruning_ratio must be between 0 and 100, got {value}")
def weight_0_1(value):
    '''argparse type: parse *value* as a float in the closed range [0, 1].'''
    parsed = float(value)
    if parsed < 0.0 or parsed > 1.0:
        raise argparse.ArgumentTypeError(f"relevance_weight must be between 0 and 1, got {value}")
    return parsed
def main():
    '''
    Benchmark an OpenVINO GenAI VLM pipeline.

    Parses command-line options, builds a VLMPipeline for the chosen device,
    runs warmup plus measured generate() iterations, and prints aggregated
    performance metrics (load time, TTFT, TPOT, throughput, ...).

    Raises:
        RuntimeError: if both --prompt and --prompt_file are given, or the
            resulting prompt is empty.
    '''
    parser = argparse.ArgumentParser(description="Help command")
    parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
    parser.add_argument("-p", "--prompt", type=str, default=None, help="Prompt")
    parser.add_argument("-pf", "--prompt_file", type=str, help="Read prompt from file")
    parser.add_argument("-i", "--image", type=str, default="image.jpg", help="Image")
    parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
    parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
    parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
    parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")
    parser.add_argument(
        "--pruning_ratio",
        type=ratio_type,
        default=0,
        help="(optional): Percentage of visual tokens to prune (valid range: 0-100). If this option is not provided, pruning is disabled.",
    )
    parser.add_argument(
        "--relevance_weight",
        type=weight_0_1,
        help="(optional): Float value from 0 to 1, control the trade-off between diversity and relevance for visual tokens pruning, "
        "a value of 0 disables relevance weighting, while higher values (up to 1.0) emphasize relevance, making pruning more conservative on borderline tokens.",
    )
    args = parser.parse_args()

    # The two prompt sources are mutually exclusive.
    if args.prompt is not None and args.prompt_file is not None:
        raise RuntimeError('Prompt and prompt file should not exist together!')
    if args.prompt_file is not None:
        with open(args.prompt_file, 'r', encoding='utf-8') as f:
            prompt = f.read()
    else:
        prompt = 'What is on the image?' if args.prompt is None else args.prompt
    if len(prompt) == 0:
        raise RuntimeError('Prompt is empty!')

    print(f'openvino runtime version: {get_version()}, genai version: {ov_genai.__version__}')

    # Perf metrics is stored in VLMDecodedResults.
    # In order to get VLMDecodedResults instead of a string input should be a list.
    models_path = args.model
    images = read_images(args.image)
    device = args.device
    num_warmup = args.num_warmup
    num_iter = args.num_iter

    config = ov_genai.GenerationConfig()
    config.max_new_tokens = args.max_new_tokens
    # pruning_ratio defaults to 0 (never None), so it is always forwarded;
    # relevance_weight is only set when explicitly requested on the CLI.
    if args.pruning_ratio is not None:
        config.pruning_ratio = args.pruning_ratio
    if args.relevance_weight is not None:
        config.relevance_weight = args.relevance_weight

    if device == "NPU":
        pipe = ov_genai.VLMPipeline(models_path, device)
    else:
        # Setting of Scheduler config will trigger usage of ContinuousBatching pipeline, which is not default for Qwen2VL, Qwen2.5VL, Gemma3 due to accuracy issues.
        scheduler_config = ov_genai.SchedulerConfig()
        scheduler_config.enable_prefix_caching = False
        scheduler_config.max_num_batched_tokens = sys.maxsize
        pipe = ov_genai.VLMPipeline(models_path, device, scheduler_config=scheduler_config)

    input_data = pipe.get_tokenizer().encode(prompt)
    prompt_token_size = input_data.input_ids.get_shape()[1]
    print(f"Number of images:{len(images)}, Prompt token size: {prompt_token_size}")

    # Warmup runs are not measured; their metrics are discarded.
    for _ in range(num_warmup):
        pipe.generate(prompt, images=images, generation_config=config)

    # First measured run seeds the accumulator; remaining runs are folded in
    # via perf_metrics.__iadd__ so means/stds cover all num_iter iterations.
    res = pipe.generate(prompt, images=images, generation_config=config)
    perf_metrics = res.perf_metrics
    for _ in range(num_iter - 1):
        res = pipe.generate(prompt, images=images, generation_config=config)
        perf_metrics += res.perf_metrics

    print(f"Output token size: {res.perf_metrics.get_num_generated_tokens()}")
    print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
    print(
        f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
    print(
        f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
    print(
        f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms")
    print(
        f"Embeddings preparation time: {perf_metrics.get_prepare_embeddings_duration().mean:.2f} ± {perf_metrics.get_prepare_embeddings_duration().std:.2f} ms")
    print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
    print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms")
    print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()