### Your current environment

The output of the above commands

### How would you like to use vllm on ascend
This script runs DeepSeek-OCR, PaddleOCR-VL, and HunyuanOCR:

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""

import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode
from vllm.utils.argparse_utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
    stop_token_ids: list[int] | None = None
    lora_requests: list[LoRARequest] | None = None
    sampling_params: list[SamplingParams] | None = None


def run_deepseek_ocr(questions: list[str], modality: str) -> ModelRequestData:
    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

    assert modality == "image"
    model_name = "deepseek-ai/DeepSeek-OCR"

    engine_args = EngineArgs(
        model=model_name,
        limit_mm_per_prompt={modality: 1},
        logits_processors=[NGramPerReqLogitsProcessor],
    )

    # DeepSeek-OCR uses a plain prompt template.
    prompts = [f"<image>\n{question}" for question in questions]

    # The following sampling params config is taken from
    # the official DeepSeek-OCR inference example.
    # (IMPORTANT) Use the custom logits processor and avoid skipping
    # special tokens for this model for optimal OCR performance.
    sampling_params = [
        SamplingParams(
            temperature=0.0,
            max_tokens=8192,
            # ngram logit processor args
            extra_args=dict(
                ngram_size=30,
                window_size=90,
                # whitelist: <td>, </td>
                whitelist_token_ids={128821, 128822},
            ),
            skip_special_tokens=False,
        )
        for _ in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        sampling_params=sampling_params,
    )


# PaddleOCR-VL
def run_paddleocr_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "PaddlePaddle/PaddleOCR-VL"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    placeholder = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
    prompts = [
        f"<|begin_of_sentence|>User: {question}{placeholder}\nAssistant: "
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# HunyuanOCR
def run_hunyuan_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "Tencent-Hunyuan/HunyuanOCR"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
    prompts = [
        f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )


model_example_map = {
    "paddleocr_vl": run_paddleocr_vl,
    "deepseek_ocr": run_deepseek_ocr,
    "hunyuan_vl": run_hunyuan_vl,
}

MODELS_NEED_VIDEO_METADATA = [
    "glm4_1v",
    "glm4_5v",
    "glm4_5v_fp8",
    "qwen3_vl",
    "qwen3_vl_moe",
]


def get_multi_modal_input(args):
    """
    return {
        "data": image or video,
        "questions": [question, ...],
    }
    """
    if args.modality == "image":
        # Input image and questions
        image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
        img_questions = [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ]

        return {
            "data": image,
            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and question
        needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA
        video_asset = VideoAsset(name="baby_reading", num_frames=args.num_frames)
        video = video_asset.np_ndarrays
        metadata = video_asset.metadata
        vid_questions = ["Why is this video funny?"]

        return {
            "data": ([(video, metadata)] if needs_metadata else video),
            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


def apply_image_repeat(
    image_repeat_prob, num_prompts, data, prompts: list[str], modality
):
    """Repeats images with provided probability of "image_repeat_prob".
    Used to simulate hit/miss for the MM preprocessor cache.
    """
    assert 0 <= image_repeat_prob <= 1.0
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    inputs_with_empty_media = []
    cur_image = data
    for i in range(num_prompts):
        if image_repeat_prob is not None:
            res = random.choices(no_yes, probs)[0]
            if res == 0:
                # No repeat => modify one pixel
                cur_image = cur_image.copy()
                new_val = (i // 256 // 256, i // 256, i % 256)
                cur_image.putpixel((0, 0), new_val)

        uuid = "uuid_{}".format(i)
        inputs.append(
            {
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {modality: cur_image},
                "multi_modal_uuids": {modality: uuid},
            }
        )
        inputs_with_empty_media.append(
            {
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {modality: None},
                "multi_modal_uuids": {modality: uuid},
            }
        )

    return inputs, inputs_with_empty_media


@contextmanager
def time_counter(enable: bool):
    if enable:
        import time

        start_time = time.time()
        yield
        elapsed_time = time.time() - start_time
        print("-" * 50)
        print("-- generate time = {}".format(elapsed_time))
        print("-" * 50)
    else:
        yield


def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models for text generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="paddleocr_vl",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--num-prompts", type=int, default=4, help="Number of prompts to run."
    )
    parser.add_argument(
        "--modality",
        type=str,
        default="image",
        choices=["image", "video"],
        help="Modality of the input.",
    )
    parser.add_argument(
        "--num-frames",
        type=int,
        default=16,
        help="Number of frames to extract from the video.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    parser.add_argument(
        "--image-repeat-prob",
        type=float,
        default=None,
        help="Simulates the hit ratio for the multi-modal preprocessor cache "
        "(if enabled).",
    )
    parser.add_argument(
        "--disable-mm-processor-cache",
        action="store_true",
        help="If True, disables caching of the multi-modal processor.",
    )
    parser.add_argument(
        "--time-generate",
        action="store_true",
        help="If True, print the total generate() call time.",
    )
    parser.add_argument(
        "--use-different-prompt-per-request",
        action="store_true",
        help="If True, use a different prompt (with the same multi-modal "
        "data) for each request.",
    )
    parser.add_argument(
        "--verify-mm-cache-hit-with-uuids",
        action="store_true",
        help="If True, send all requests in a second batch with empty mm "
        "data to verify cache hits with UUIDs.",
    )
    parser.add_argument(
        "--tensor-parallel-size",
        "-tp",
        type=int,
        default=None,
        help="Tensor parallel size to override the model's default setting.",
    )
    return parser.parse_args()


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
        raise ValueError(
            f"tensor_parallel_size must be a positive integer, "
            f"got {args.tensor_parallel_size}"
        )

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
    }
    if args.tensor_parallel_size is not None:
        engine_args["tensor_parallel_size"] = args.tensor_parallel_size

    llm = LLM(**engine_args)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    prompts = (
        req_data.prompts
        if args.use_different_prompt_per_request
        else [req_data.prompts[0]]
    )

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = (
        SamplingParams(
            temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
        )
        if req_data.sampling_params is None
        else req_data.sampling_params
    )

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        uuid = "uuid_0"
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {modality: data},
            "multi_modal_uuids": {modality: uuid},
        }
        inputs_with_empty_media = {
            "prompt": prompts[0],
            "multi_modal_data": {modality: None},
            "multi_modal_uuids": {modality: uuid},
        }
    else:
        # Batch inference
        if args.image_repeat_prob is not None:
            # Repeat images with the specified probability of "image_repeat_prob"
            inputs, inputs_with_empty_media = apply_image_repeat(
                args.image_repeat_prob,
                args.num_prompts,
                data,
                prompts,
                modality,
            )
        else:
            # Use the same image for all prompts
            inputs = []
            inputs_with_empty_media = []
            for i in range(args.num_prompts):
                uuid = "uuid_{}".format(i)
                inputs.append(
                    {
                        "prompt": prompts[i % len(prompts)],
                        "multi_modal_data": {modality: data},
                        "multi_modal_uuids": {modality: uuid},
                    }
                )
                inputs_with_empty_media.append(
                    {
                        "prompt": prompts[i % len(prompts)],
                        "multi_modal_data": {modality: None},
                        "multi_modal_uuids": {modality: uuid},
                    }
                )

    # Add LoRA request if applicable
    lora_request = (
        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
    )

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)

    if args.verify_mm_cache_hit_with_uuids:
        try:
            # Verify cache hits with UUIDs
            print(
                "Sending a second batch of requests with empty media"
                " and matching UUIDs."
            )
            outputs = llm.generate(
                inputs_with_empty_media,
                sampling_params=sampling_params,
                lora_request=lora_request,
            )
            print("-" * 50)
            for o in outputs:
                generated_text = o.outputs[0].text
                print(generated_text)
                print("-" * 50)
        except Exception as e:
            print(f"Failed to verify cache hits with UUIDs. Error: {e}")


if __name__ == "__main__":
    args = parse_args()
    main(args)
```

For example:

```bash
export VLLM_USE_MODELSCOPE=true
python run_vision_language.py --model-type paddleocr_vl
```
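
The script also accepts the other model types registered in `model_example_map`, plus a few flags for exercising the multi-modal processor cache. A couple of illustrative invocations (the flag values below are arbitrary examples, not recommendations):

```bash
# Run the other two OCR models
python run_vision_language.py --model-type deepseek_ocr
python run_vision_language.py --model-type hunyuan_vl

# Batch run that exercises the MM processor cache and verifies
# cache hits via UUIDs (values chosen only for illustration)
python run_vision_language.py --model-type paddleocr_vl \
    --num-prompts 8 --image-repeat-prob 0.5 \
    --verify-mm-cache-hit-with-uuids --time-generate
```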