Skip to content

Commit 12ebda1

Browse files
authored
Support for qwen2_5_vl (#108)
* Initial support for qwen2_5_vl * Add Qwen2_5_VLProcessingInfo * Add support for Qwen2_5_vl * Add package * Fix bugs * Fix bugs * Fix input messages * clean up code * Fix format of examples * Fix extract_modify_mm * Simlify and fix weigths loading for vl * Simplify weights loading for glm * Update images in examples * Fix pure text chat for qwen_vl * Fix memory overflow for embedding cache * Set top_k 1 for qwen_vl * Update readme
1 parent e134c70 commit 12ebda1

20 files changed

Lines changed: 1877 additions & 125 deletions

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ Global Balanced Pipeline Parallelism System for Distributed LLM Serving with Tok
1818
Integrated with features like **continuous batching**, **paged attention**, **chunked prefill**, **prefix caching**, **token throttling**, **pipeline parallelism**, **expert parallelism** and **tensor parallelism**, gLLM provides basic functionality (**offline/online inference and interactive chat**) for deploying distributed inference of LLMs (**supported in huggingface**). gLLM provides **equivalent or superior** offline/online inference speed compared with mainstream inference engines, with a **minimal** (~6k loc) code base. You can also see gLLM as an LLM inference playground for running experiments or doing academic research.
1919

2020
*Latest News* :fire:
21+
- [2025/08/15]: Qwen2.5 VL is supported :hugs:
2122
- [2025/08/01]: DeepSeek V2/3 is supported :clap:
2223
- [2025/07/12]: FP8 quantization for Qwen3/2.5 is supported :tada:
2324
- [2025/06/27]: gLLM is accepted by SC'25. Congratulations :smiling_face_with_three_hearts:
@@ -26,12 +27,18 @@ Integreted with features like **continuous batching**, **paged attention**, **ch
2627
- [2025/05/05]: MoE architecture is supported. Try Qwen2/3 MoE models :star_struck:
2728
- [2025/04/29]: Qwen3 day 1 support. Come and try Qwen3 :tada:
2829
- [2025/04/27]: gLLM is open sourced :earth_asia:
30+
31+
<details>
32+
<summary>Previous News</summary>
33+
2934
- [2025/04/27]: We support multi-node deployments. You can serve your model across different machines :blush:
3035
- [2025/04/21]: We release our paper on [arXiv:2504.14775](https://arxiv.org/abs/2504.14775) :partying_face:
3136
- [2025/03/15]: Chunked prefill has been integrated. You can input any length of text you want :hugs:
3237
- [2025/03/01]: Pipeline parallelism has been integrated. You can run any size of model you want :laughing:
3338
- [2025/02/27]: We apply numerous optimizations that lower CPU overhead a lot :clap:
3439

40+
</details>
41+
3542
## Token Throttling
3643

3744
### Prefill Token Throttling
@@ -147,7 +154,7 @@ python benchmarks/evaluate_MMLU_pro.py --model $MODEL
147154
## Supported Models
148155

149156
- DeepSeek Series: DeepSeek V2/3 (MLA)
150-
- Qwen Series: Qwen3, Qwen2.5, Qwen2
157+
- Qwen Series: Qwen3, Qwen2.5 VL, Qwen2.5, Qwen2
151158
- Llama Series: Llama3.2, Llama3.1, Llama3, Llama2 and deepseek-coder
152159
- Mixtral Series: Mixtral-8x7B, Mixtral-8x22B
153160
- ChatGLM Series: Glm4 and Chatglm3

examples/mm_chat.py

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
"""An example showing how to use vLLM to serve multimodal models
4+
and run online serving with OpenAI client.
5+
6+
Launch the vLLM server with the following command:
7+
8+
(single image inference with Llava)
9+
vllm serve llava-hf/llava-1.5-7b-hf
10+
11+
(multi-image inference with Phi-3.5-vision-instruct)
12+
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
13+
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
14+
15+
(audio inference with Ultravox)
16+
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
17+
--max-model-len 4096 --trust-remote-code
18+
19+
run the script with
20+
python examples/mm_chat.py --chat-type single-image
21+
"""
22+
23+
import base64
24+
25+
import requests
26+
from openai import OpenAI
27+
from openai import APIConnectionError, OpenAI
28+
from openai.pagination import SyncPage
29+
from openai.types.model import Model
30+
from argparse import ArgumentParser
31+
32+
def get_first_model(client: OpenAI) -> str:
    """Return the id of the first model listed by the server.

    Args:
        client: OpenAI client pointed at the server to query.

    Returns:
        The ``id`` of the first entry in the server's model list.

    Raises:
        RuntimeError: If the server cannot be reached, or if it reports
            an empty model list.
    """
    try:
        listing = client.models.list()
    except APIConnectionError as e:
        # Re-raise with a checklist so the user knows what to verify.
        raise RuntimeError(
            "Failed to get the list of models from the vLLM server at "
            f"{client.base_url} with API key {client.api_key}. Check\n"
            "1. the server is running\n"
            "2. the server URL is correct\n"
            "3. the API key is correct"
        ) from e

    available = listing.data
    if len(available) == 0:
        raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")

    return available[0].id
51+
52+
53+
def encode_base64_content_from_url(content_url: str, timeout: float = 60.0) -> str:
    """Download the content at a URL and return it base64-encoded.

    Args:
        content_url: URL of the remote content to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without it a stalled server would block the
            script indefinitely.

    Returns:
        The response body, base64-encoded and decoded to a UTF-8 string.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    with requests.get(content_url, timeout=timeout) as response:
        response.raise_for_status()
        return base64.b64encode(response.content).decode("utf-8")
61+
62+
63+
# Text-only inference
64+
def run_text_only(model: str, client) -> None:
    """Send one text-only question to the model and print its answer.

    Args:
        model: Identifier of the model to query.
        client: Client object exposing ``chat.completions.create``.
    """
    conversation = [{"role": "user", "content": "What's the capital of France?"}]
    response = client.chat.completions.create(
        messages=conversation,
        model=model,
        max_completion_tokens=64,
    )

    print("Chat completion output:", response.choices[0].message.content)
73+
74+
75+
# Single-image input inference
76+
def run_single_image(model: str, client) -> None:
    """Ask the model to describe one image referenced by URL and print the reply.

    Args:
        model: Identifier of the model to query.
        client: Client object exposing ``chat.completions.create``.
    """
    image_url = 'https://2026.eurosys.org/img/EuroSys-2026-logo.png'

    # The image is passed by URL in the payload.
    user_content = [
        {"type": "text", "text": "描述下这个图片?"},
        {"type": "image_url", "image_url": {'url': image_url}},
    ]
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": user_content}],
        model=model,
        max_tokens=512,
    )

    answer = response.choices[0].message.content
    print("Chat completion output from image url:", answer)
98+
99+
# Multi-image input inference
100+
def run_multi_image(model: str, client) -> None:
    """Send two image URLs in a single user message and print the reply.

    Args:
        model: Identifier of the model to query.
        client: Client object exposing ``chat.completions.create``.
    """
    image_urls = (
        "https://www.sigops.org/wp-content/uploads/2025/05/ChatGPT-Image-May-4-2025-09_43_00-PM-980x653-1.png",
        "https://www.sigops.org/wp-content/uploads/2025/02/Picture1-1204x904.jpg",
    )

    # One text part followed by one image part per URL, in order.
    user_content = [{"type": "text", "text": "What's in this image?"}]
    for url in image_urls:
        user_content.append({"type": "image_url", "image_url": {'url': url}})

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": user_content}],
        model=model,
        max_tokens=1024,
    )

    answer = response.choices[0].message.content
    print("Chat completion output:", answer)
126+
127+
128+
# Video input inference
129+
def run_video(model: str, client) -> None:
    """Query the model about a video twice: by URL, then by base64 data URL.

    The video is downloaded and base64-encoded up front, so a download
    failure aborts before any request is sent. Each answer is printed
    with a label naming how the video was supplied.

    Args:
        model: Identifier of the model to query.
        client: Client object exposing ``chat.completions.create``.
    """
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
    video_base64 = encode_base64_content_from_url(video_url)

    def ask_about_video(url: str) -> str:
        # Both requests are identical except for the video reference.
        completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this video?"},
                        {"type": "video_url", "video_url": {"url": url}},
                    ],
                }
            ],
            model=model,
            max_completion_tokens=64,
        )
        return completion.choices[0].message.content

    # Use the video url in the payload.
    result = ask_about_video(video_url)
    print("Chat completion output from video url:", result)

    # Use the base64-encoded video in the payload.
    result = ask_about_video(f"data:video/mp4;base64,{video_base64}")
    print("Chat completion output from base64 encoded video:", result)
174+
175+
176+
177+
# Maps each accepted --chat-type value to the example function that runs it.
example_function_map = dict(
    (
        ("text-only", run_text_only),
        ("single-image", run_single_image),
        ("multi-image", run_multi_image),
        ("video", run_video),
    )
)
183+
184+
185+
def parse_args():
    """Parse the command-line options of this demo.

    Returns:
        The parsed namespace with ``chat_type`` (one of the keys of
        ``example_function_map``, default ``"single-image"``) and
        ``port`` (int, default 8000).
    """
    chat_types = list(example_function_map.keys())
    summary = (
        "Demo on using OpenAI client for online serving with "
        "multimodal language models served with vLLM."
    )

    cli = ArgumentParser(description=summary)
    cli.add_argument(
        "--chat-type",
        "-c",
        type=str,
        default="single-image",
        choices=chat_types,
        help="Conversation type with multimodal data.",
    )
    cli.add_argument('--port', '-p', type=int, default=8000)
    return cli.parse_args()
205+
206+
207+
def main(args) -> None:
    """Connect to the local server and run the selected example.

    Args:
        args: Parsed options; ``args.port`` selects the localhost port
            and ``args.chat_type`` selects the example to run.
    """
    # Point the OpenAI client at the local API server; the key is a
    # fixed placeholder string.
    client = OpenAI(
        api_key="EMPTY",
        base_url=f"http://localhost:{args.port}/v1",
    )

    selected = args.chat_type
    served_model = get_first_model(client)
    example_function_map[selected](served_model, client)
221+
222+
223+
if __name__ == "__main__":
224+
args = parse_args()
225+
main(args)

gllm/async_llm_engine.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,11 @@ def __init__(self, *args, **kwargs):
6060
self.schedule_engine = None
6161

6262
async def add_requests_async(self, raw_request: Request, token_ids: List[int], output_len: int, ignore_eos: bool,
63-
temperature: float, top_p: float, top_k: float, repetition_penalty: float):
63+
temperature: float, top_p: float, top_k: float, repetition_penalty: float,
64+
mm_contents=None):
6465
seq = self.allocate_seq(token_ids, output_len, ignore_eos,
65-
temperature, top_p, top_k, repetition_penalty)
66+
temperature, top_p, top_k, repetition_penalty,
67+
mm_contents)
6668
stream = AsyncStream(raw_request)
6769
assert seq.seq_id not in self.async_streams
6870
self.async_streams[seq.seq_id] = stream

gllm/entrypoints/api_server.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,12 @@ async def show_available_models():
2828

2929
@router.post("/v1/chat/completions")
3030
async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
31-
token_ids = await make_async(llm.model_runner.encode)(request.messages, chat=True)
31+
mm_contents = await make_async(llm.model_runner.extract_modify_mm)(request.messages)
32+
token_ids = await make_async(llm.model_runner.encode)(request.messages, chat=True, has_mm=mm_contents is not None)
3233
if llm.check_seq_length(token_ids, request.max_tokens):
3334
stream = await llm.add_requests_async(raw_request, token_ids, request.max_tokens, request.ignore_eos,
34-
request.temperature, request.top_p, request.top_k, request.repetition_penalty)
35+
request.temperature, request.top_p, request.top_k, request.repetition_penalty,
36+
mm_contents)
3537
else:
3638
return ErrorResponse(message="seq length exceeds max model length",
3739
type="BadRequestError",

0 commit comments

Comments
 (0)