gty111
diff --git a/‎README.md‎
Lines changed: 8 additions & 1 deletion b/‎README.md‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎examples/mm_chat.py‎
Lines changed: 225 additions & 0 deletions b/‎examples/mm_chat.py‎
Lines changed: 225 additions & 0 deletions
diff --git a/‎gllm/async_llm_engine.py‎
Lines changed: 4 additions & 2 deletions b/‎gllm/async_llm_engine.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎gllm/entrypoints/api_server.py‎
Lines changed: 4 additions & 2 deletions b/‎gllm/entrypoints/api_server.py‎
Lines changed: 4 additions & 2 deletions
@@ -18,6 +18,7 @@ Global Balanced Pipeline Parallelism System for Distributed LLM Serving with Tok
 Integrated with features like **continuous batching**, **paged attention**, **chunked prefill**, **prefix caching**, **token throttling**, **pipeline parallelism**, **expert parallelism** and **tensor parallelism**, gLLM provides basic functionality (**offline/online inference and interactive chat**) to deploy distributed LLMs (**supported in huggingface**) inference. gLLM provides **equivalent or superior** offline/online inference speed compared with mainstream inference engines and a **minimal** (~6k loc) code base. You can also see gLLM as an LLM inference playground for doing experiments or academic research.
 
 *Latest News* :fire:
+- [2025/08/15]: Qwen2.5 VL is supported :hugs:
 - [2025/08/01]: DeepSeek V2/3 is supported :clap:
 - [2025/07/12]: FP8 quantization for Qwen3/2.5 is supported :tada:
 - [2025/06/27]: gLLM is accepted by SC'25. Congratulations :smiling_face_with_three_hearts:
@@ -26,12 +27,18 @@ Integreted with features like **continuous batching**, **paged attention**, **ch
 - [2025/05/05]: MoE architecture is supported. Try Qwen2/3 MoE models :star_struck:
 - [2025/04/29]: Qwen3 day 1 support. Come and try Qwen3 :tada:
 - [2025/04/27]: gLLM is open sourced :earth_asia:
+
+<details>
+<summary>Previous News</summary>
+
 - [2025/04/27]: We support multi-node deployments. You can serve your model across different machines :blush:
 - [2025/04/21]: We release our paper on [arXiv:2504.14775](https://arxiv.org/abs/2504.14775) :partying_face:
 - [2025/03/15]: Chunked prefill has been integrated. You can input any length of text you want :hugs:
 - [2025/03/01]: Pipeline parallelism has been integrated. You can run any size of model you want :laughing: 
 - [2025/02/27]: We apply numerous optimizations which lowers CPU overhead a lot :clap: 
 
+</details>
+
 ## Token Throttling
 
 ### Prefill Token Throttling
@@ -147,7 +154,7 @@ python benchmarks/evaluate_MMLU_pro.py --model $MODEL
 ## Supported Models
 
 - DeepSeek Series: DeepSeek V2/3 (MLA)
-- Qwen Series: Qwen3, Qwen2.5, Qwen2
+- Qwen Series: Qwen3, Qwen2.5 VL, Qwen2.5, Qwen2
 - Llama Series: Llama3.2, Llama3.1, Llama3, Llama2 and deepseek-coder
 - Mixtral Series: Mixtral-8x7B, Mixtral-8x22B
 - ChatGLM Series: Glm4 and Chatglm3 
 
@@ -0,0 +1,225 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""An example showing how to use vLLM to serve multimodal models
+and run online serving with OpenAI client.
+
+Launch the vLLM server with the following command:
+
+(single image inference with Llava)
+vllm serve llava-hf/llava-1.5-7b-hf
+
+(multi-image inference with Phi-3.5-vision-instruct)
+vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
+    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
+
+(audio inference with Ultravox)
+vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
+    --max-model-len 4096 --trust-remote-code
+
+Run the script with
+python examples/mm_chat.py --chat-type single-image --port 8000
+"""
+
+import base64
+
+import requests
+from openai import OpenAI
+from openai import APIConnectionError, OpenAI
+from openai.pagination import SyncPage
+from openai.types.model import Model
+from argparse import ArgumentParser
+
+def get_first_model(client: OpenAI) -> str:
+    """
+    Get the first model from the vLLM server.
+    """
+    try:
+        models: SyncPage[Model] = client.models.list()
+    except APIConnectionError as e:
+        raise RuntimeError(
+            "Failed to get the list of models from the vLLM server at "
+            f"{client.base_url} with API key {client.api_key}. Check\n"
+            "1. the server is running\n"
+            "2. the server URL is correct\n"
+            "3. the API key is correct"
+        ) from e
+
+    if len(models.data) == 0:
+        raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")
+
+    return models.data[0].id
+
+
+def encode_base64_content_from_url(content_url: str) -> str:
+    """Encode a content retrieved from a remote url to base64 format."""
+
+    with requests.get(content_url) as response:
+        response.raise_for_status()
+        result = base64.b64encode(response.content).decode("utf-8")
+
+    return result
+
+
+# Text-only inference
+def run_text_only(model: str, client) -> None:
+    chat_completion = client.chat.completions.create(
+        messages=[{"role": "user", "content": "What's the capital of France?"}],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion.choices[0].message.content
+    print("Chat completion output:", result)
+
+
+# Single-image input inference
+def run_single_image(model: str, client) -> None:
+    ## Use image url in the payload
+    image_url = 'https://2026.eurosys.org/img/EuroSys-2026-logo.png'
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "描述下这个图片?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {'url': image_url},
+                    },
+                ],
+            }
+        ],
+        model=model,
+        max_tokens=512,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from image url:", result)
+
+# Multi-image input inference
+def run_multi_image(model: str, client) -> None:
+    image1 = "https://www.sigops.org/wp-content/uploads/2025/05/ChatGPT-Image-May-4-2025-09_43_00-PM-980x653-1.png"
+    image2 = "https://www.sigops.org/wp-content/uploads/2025/02/Picture1-1204x904.jpg"
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {'url':image1},
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {'url':image2},
+                    },
+                ],
+            }
+        ],
+        model=model,
+        max_tokens=1024,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output:", result)
+
+
+# Video input inference
+def run_video(model: str, client) -> None:
+    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+    video_base64 = encode_base64_content_from_url(video_url)
+
+    ## Use video url in the payload
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this video?"},
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": video_url},
+                    },
+                ],
+            }
+        ],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from image url:", result)
+
+    ## Use base64 encoded video in the payload
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this video?"},
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
+                    },
+                ],
+            }
+        ],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from base64 encoded image:", result)
+
+
+
+example_function_map = {
+    "text-only": run_text_only,
+    "single-image": run_single_image,
+    "multi-image": run_multi_image,
+    "video": run_video,
+}
+
+
+def parse_args():
+    parser = ArgumentParser(
+        description="Demo on using OpenAI client for online serving with "
+        "multimodal language models served with vLLM."
+    )
+    parser.add_argument(
+        "--chat-type",
+        "-c",
+        type=str,
+        default="single-image",
+        choices=list(example_function_map.keys()),
+        help="Conversation type with multimodal data.",
+    )
+    parser.add_argument(
+        '--port',
+        '-p',
+        type=int,
+        default=8000
+    )
+    return parser.parse_args()
+
+
+def main(args) -> None:
+    # Modify OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
+    openai_api_base = f"http://localhost:{args.port}/v1"
+
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    
+    chat_type = args.chat_type
+    model = get_first_model(client)
+    example_function_map[chat_type](model, client)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
@@ -60,9 +60,11 @@ def __init__(self, *args, **kwargs):
         self.schedule_engine = None
 
     async def add_requests_async(self, raw_request: Request, token_ids: List[int], output_len: int, ignore_eos: bool,
-                                 temperature: float, top_p: float, top_k: float, repetition_penalty: float):
+                                 temperature: float, top_p: float, top_k: float, repetition_penalty: float,
+                                 mm_contents=None):
         seq = self.allocate_seq(token_ids, output_len, ignore_eos,
-                                temperature, top_p, top_k, repetition_penalty)
+                                temperature, top_p, top_k, repetition_penalty,
+                                mm_contents)
         stream = AsyncStream(raw_request)
         assert seq.seq_id not in self.async_streams
         self.async_streams[seq.seq_id] = stream
 
@@ -28,10 +28,12 @@ async def show_available_models():
 
 @router.post("/v1/chat/completions")
 async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
-    token_ids = await make_async(llm.model_runner.encode)(request.messages, chat=True)
+    mm_contents = await make_async(llm.model_runner.extract_modify_mm)(request.messages)
+    token_ids = await make_async(llm.model_runner.encode)(request.messages, chat=True, has_mm=mm_contents is not None)
     if llm.check_seq_length(token_ids, request.max_tokens):
         stream = await llm.add_requests_async(raw_request, token_ids, request.max_tokens, request.ignore_eos,
-                                              request.temperature, request.top_p, request.top_k, request.repetition_penalty)
+                                              request.temperature, request.top_p, request.top_k, request.repetition_penalty,
+                                              mm_contents)
     else:
         return ErrorResponse(message="seq length exceeds max model length",
                              type="BadRequestError",