Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,33 @@ The benchmark measures:
lemonade-eval -i Qwen3-4B-Instruct-2507-GGUF load bench --iterations 5 --warmup-iterations 2 --output-tokens 128
```

### VLM (Vision-Language Model) Benchmarking

Benchmark Vision-Language Models by providing an image with the `--image` flag:

```bash
# Benchmark a VLM with an image
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg -p "Describe this image in detail" --output-tokens 128
```

Use `--image-size` to resize the image before sending, which reduces visual token count to fit within the model's context window:

```bash
# Resize to exact dimensions (WIDTHxHEIGHT)
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg --image-size 1024x800 --output-tokens 128

# Resize by capping the longest side (preserves aspect ratio)
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg --image-size 384 --output-tokens 128
```

The `-p` flag controls the text portion of the prompt. Pass an integer for a synthetic prompt of that token length, or a string for a custom prompt. Image tokens are additional and reported by the server in the total input token count.

Full example with all options:

```bash
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg --image-size 1024x800 -p 128 --output-tokens 128 --warmup-iterations 2 --iterations 3
```

## Exporting a Finetuned Model

To prepare your own fine-tuned model for OGA:
Expand Down
65 changes: 61 additions & 4 deletions src/lemonade/tools/server_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ class ServerBench(Bench):
/api/v1/stats endpoint to collect performance metrics. It follows the
same benchmarking methodology as other *-bench tools.

Supports both text-only LLMs and Vision-Language Models (VLMs). For VLMs,
use the --image flag to provide an image file or URL.

Required input state:
- model: ServerAdapter instance (set by the `load` tool)
- tokenizer: ServerTokenizerAdapter instance
Expand All @@ -27,13 +30,15 @@ class ServerBench(Bench):
- Performance statistics including TTFT, tokens/second, etc.

Example usage:
lemonade-eval -i Qwen3-0.6B-GGUF load --server-url http://localhost:8000 bench
lemonade-eval -i Qwen3-0.6B-GGUF load bench
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg
"""

unique_name = "bench"

def __init__(self):
    # The --image argument is cached here before full parsing so that
    # get_prompt_str() can pick a VLM-specific prompt prefix later on.
    self._image = None
    super().__init__(monitor_message="Benchmarking model on Lemonade Server")

@staticmethod
def parser(add_help: bool = True) -> argparse.ArgumentParser:
Expand All @@ -44,6 +49,27 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:

parser = Bench.parser(parser)

parser.add_argument(
"--image",
type=str,
default=None,
help="Path to an image file or URL for VLM (Vision-Language Model) "
"benchmarking. When provided, each benchmark iteration sends a "
"multimodal prompt containing both the image and text. "
"The -p flag controls the text portion of the prompt.",
)

parser.add_argument(
"--image-size",
type=str,
default=None,
help="Resize the image before sending to the server. Accepts "
"WIDTHxHEIGHT (e.g. --image-size 1024x800) to resize to exact "
"dimensions, or a single integer (e.g. --image-size 384) to cap "
"the longest side while preserving aspect ratio. Reduces visual "
"token count for VLM models. Only applies when --image is set.",
)

return parser

# Prefix to encourage long model responses for benchmarking
Expand All @@ -52,20 +78,47 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
"but goes from there: "
)

# Prefix for VLM benchmarking that encourages long responses about the image
VLM_PROMPT_PREFIX = (
"Describe this image in extreme detail, covering every single element, "
"color, texture, shape, and spatial relationship you can observe. "
"Then tell an extremely long creative story inspired by the image: "
)

def parse(self, state, args, known_only=True):
    """
    Peek at --image ahead of the regular argument parsing so that
    get_prompt_str() can choose a VLM-appropriate prompt prefix.
    """
    # Lightweight pre-parser: recognizes only the image-related flags and
    # ignores everything else on the command line.
    peek = argparse.ArgumentParser(add_help=False)
    peek.add_argument("--image", type=str, default=None)
    peek.add_argument("--image-size", type=str, default=None)
    peeked, _ = peek.parse_known_args(args)
    self._image = peeked.image

    return super().parse(state, args, known_only)

def get_prompt_str(self, state, token_length):
"""
Returns a string with approximately the prescribed token length.

The prompt includes a prefix that encourages long responses, followed
by synthetic "word" tokens. We use calibration via the server's actual
token count to match the target length.

For VLM models (when --image is set), uses a VLM-appropriate prefix
that encourages detailed image description. The -p token count controls
only the text portion; image tokens are additional.
"""
model: ServerAdapter = state.model

# Choose prefix based on whether this is a VLM benchmark
prefix = self.VLM_PROMPT_PREFIX if self._image else self.PROMPT_PREFIX

# Start with an initial estimate: prefix + "word " repeated
# Assume prefix is ~20 tokens, so start with (token_length - 20) words
initial_word_count = max(1, token_length - 20)
test_prompt = self.PROMPT_PREFIX + "word " * initial_word_count
test_prompt = prefix + "word " * initial_word_count

# Make a calibration request to get the actual token count
try:
Expand All @@ -83,7 +136,7 @@ def get_prompt_str(self, state, token_length):

# Calculate adjusted word count
adjusted_words = max(1, initial_word_count - delta)
return self.PROMPT_PREFIX + "word " * adjusted_words
return prefix + "word " * adjusted_words

except Exception: # pylint: disable=broad-exception-caught
# If calibration fails, use initial estimation
Expand All @@ -109,8 +162,10 @@ def run_prompt(
iterations: Number of benchmark iterations
warmup_iterations: Number of warmup iterations (not counted in results)
output_tokens: Target number of output tokens
**kwargs: Additional arguments (ignored)
**kwargs: Additional arguments, including 'image' for VLM benchmarking
"""
image = kwargs.get("image", None)
Comment thread
praveen-iyer marked this conversation as resolved.
image_size = kwargs.get("image_size", None)
if self.first_run_prompt:
if not hasattr(state, "model"):
raise ValueError(
Expand Down Expand Up @@ -147,6 +202,8 @@ def run_prompt(
prompt,
max_new_tokens=output_tokens,
save_max_memory_used=self.save_max_memory_used,
image=image,
image_size=image_size,
)

# Check that we got valid metrics
Expand Down
106 changes: 105 additions & 1 deletion src/lemonade/tools/server_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
"""

import argparse
import base64
import io
import mimetypes
import platform
from typing import Optional

Expand Down Expand Up @@ -88,6 +91,92 @@ def __init__(
self.timeout = timeout
self.type = "server"

@staticmethod
def _resize_image(image_path: str, width: int, height: int) -> bytes:
    """
    Resize an image to the given width and height.

    Args:
        image_path: Path to the image file.
        width: Target width in pixels.
        height: Target height in pixels.

    Returns:
        JPEG-encoded bytes of the resized image.
    """
    from PIL import Image  # pylint: disable=import-outside-toplevel

    # Context manager ensures the underlying file handle is released.
    with Image.open(image_path) as img:
        resized = img.resize(
            (width, height),
            Image.Resampling.LANCZOS,  # pylint: disable=no-member
        )

    # JPEG cannot encode alpha or palette modes (e.g. RGBA/LA/P images
    # such as transparent PNGs); Pillow raises OSError on save. Convert
    # to RGB first so any input image can be encoded.
    if resized.mode != "RGB":
        resized = resized.convert("RGB")

    buf = io.BytesIO()
    resized.save(buf, format="JPEG", quality=85)
    return buf.getvalue()

@staticmethod
def _parse_image_size(image_size: str):
"""
Parse an image size string into (width, height) or a single max dimension.

Args:
image_size: Either "WIDTHxHEIGHT" (e.g. "1024x800") for exact
dimensions, or a single integer string (e.g. "384") to cap
the longest side while preserving aspect ratio.

Returns:
Tuple of (width, height) for exact resize, or (max_dim, None) for
aspect-ratio-preserving resize.
"""
if "x" in image_size.lower():
parts = image_size.lower().split("x")
return int(parts[0]), int(parts[1])
return int(image_size), None

@staticmethod
def _prepare_image_url(image_path: str, image_size: str = None) -> str:
"""
Convert an image file path to a base64 data URL, or return a URL as-is.

When image_size is provided, the image is resized client-side to reduce
the number of visual tokens the VLM needs to process.

Args:
image_path: Local file path or HTTP(S) URL to an image.
image_size: Optional resize spec -- "WIDTHxHEIGHT" for exact
dimensions, or a single integer for max longest side.

Returns:
A data URL (base64-encoded) or the original URL.
"""
if image_path.startswith(("http://", "https://")):
return image_path
Comment thread
praveen-iyer marked this conversation as resolved.

if image_size is not None:
width, height = ServerAdapter._parse_image_size(image_size)
Comment thread
praveen-iyer marked this conversation as resolved.
if height is None:
# Single value: cap longest side, preserve aspect ratio
from PIL import Image # pylint: disable=import-outside-toplevel

img = Image.open(image_path)
orig_w, orig_h = img.size
scale = width / max(orig_w, orig_h)
height = int(orig_h * scale)
width = int(orig_w * scale)

image_bytes = ServerAdapter._resize_image(image_path, width, height)
mime_type = "image/jpeg"
else:
mime_type, _ = mimetypes.guess_type(image_path)
if mime_type is None:
mime_type = "image/jpeg"
with open(image_path, "rb") as f:
image_bytes = f.read()

image_data = base64.b64encode(image_bytes).decode("utf-8")
return f"data:{mime_type};base64,{image_data}"

def generate(
self,
input_ids,
Expand All @@ -97,6 +186,8 @@ def generate(
top_k: int = None,
repeat_penalty: float = None,
save_max_memory_used: bool = False,
image: str = None,
image_size: str = None,
**kwargs, # pylint: disable=unused-argument
):
"""
Expand All @@ -110,17 +201,30 @@ def generate(
top_k: Top-k sampling parameter
repeat_penalty: Repetition penalty
save_max_memory_used: If True, capture wrapped server memory usage
image: Optional path or URL to an image for VLM models
image_size: Optional resize spec ("WIDTHxHEIGHT" or single int string)
**kwargs: Additional arguments (ignored)

Returns:
Generated text response
"""
prompt = input_ids # PassthroughTokenizer passes text directly

# Build message content (multimodal if image is provided)
if image is not None:
image_url = self._prepare_image_url(image, image_size=image_size)
content = [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": prompt},
]
messages = [{"role": "user", "content": content}]
else:
messages = [{"role": "user", "content": prompt}]

# Build request payload using chat/completions format
payload = {
"model": self.model_name,
"messages": [{"role": "user", "content": prompt}],
"messages": messages,
"max_tokens": max_new_tokens,
"stream": False,
"cache_prompt": False, # Disable prompt caching for accurate benchmarking
Expand Down