Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,33 @@ The benchmark measures:
lemonade-eval -i Qwen3-4B-Instruct-2507-GGUF load bench --iterations 5 --warmup-iterations 2 --output-tokens 128
```

### VLM (Vision-Language Model) Benchmarking

Benchmark Vision-Language Models by providing an image with the `--image` flag:

```bash
# Benchmark a VLM with an image
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg -p "Describe this image in detail" --output-tokens 128
```

Use `--image-size` to resize the image before sending, which reduces visual token count to fit within the model's context window:

```bash
# Resize to exact dimensions (WIDTHxHEIGHT)
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg --image-size 1024x800 --output-tokens 128

# Resize by capping the longest side (preserves aspect ratio)
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg --image-size 384 --output-tokens 128
```

The `-p` flag controls the text portion of the prompt. Pass an integer for a synthetic prompt of that token length, or a string for a custom prompt. Image tokens are additional and reported by the server in the total input token count.

Full example with all options:

```bash
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg --image-size 1024x800 -p 128 --output-tokens 128 --warmup-iterations 2 --iterations 3
```

## Exporting a Finetuned Model

To prepare your own fine-tuned model for OGA:
Expand Down
65 changes: 61 additions & 4 deletions src/lemonade/tools/server_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ class ServerBench(Bench):
/api/v1/stats endpoint to collect performance metrics. It follows the
same benchmarking methodology as other *-bench tools.

Supports both text-only LLMs and Vision-Language Models (VLMs). For VLMs,
use the --image flag to provide an image file or URL.

Required input state:
- model: ServerAdapter instance (set by the `load` tool)
- tokenizer: ServerTokenizerAdapter instance
Expand All @@ -27,13 +30,15 @@ class ServerBench(Bench):
- Performance statistics including TTFT, tokens/second, etc.

Example usage:
lemonade-eval -i Qwen3-0.6B-GGUF load --server-url http://localhost:8000 bench
lemonade-eval -i Qwen3-0.6B-GGUF load bench
lemonade-eval -i Qwen3-4B-VL-FLM load bench --image photo.jpg
"""

unique_name = "bench"

def __init__(self):
    # The --image argument is cached here before full parsing so that
    # get_prompt_str() can pick a VLM-specific prompt prefix later on.
    self._image = None
    super().__init__(monitor_message="Benchmarking model on Lemonade Server")

@staticmethod
def parser(add_help: bool = True) -> argparse.ArgumentParser:
Expand All @@ -44,6 +49,27 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:

parser = Bench.parser(parser)

parser.add_argument(
"--image",
type=str,
default=None,
help="Path to an image file or URL for VLM (Vision-Language Model) "
"benchmarking. When provided, each benchmark iteration sends a "
"multimodal prompt containing both the image and text. "
"The -p flag controls the text portion of the prompt.",
)

parser.add_argument(
"--image-size",
type=str,
default=None,
help="Resize the image before sending to the server. Accepts "
"WIDTHxHEIGHT (e.g. --image-size 1024x800) to resize to exact "
"dimensions, or a single integer (e.g. --image-size 384) to cap "
"the longest side while preserving aspect ratio. Reduces visual "
"token count for VLM models. Only applies when --image is set.",
)

return parser

# Prefix to encourage long model responses for benchmarking
Expand All @@ -52,20 +78,47 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
"but goes from there: "
)

# Prefix for VLM benchmarking that encourages long responses about the image
VLM_PROMPT_PREFIX = (
"Describe this image in extreme detail, covering every single element, "
"color, texture, shape, and spatial relationship you can observe. "
"Then tell an extremely long creative story inspired by the image: "
)

def parse(self, state, args, known_only=True):
    """
    Peek at --image ahead of the regular argument parsing so that
    get_prompt_str() can choose a VLM-appropriate prompt prefix.
    """
    # Lightweight pre-parser: recognizes only the image-related flags and
    # ignores everything else on the command line.
    peek = argparse.ArgumentParser(add_help=False)
    peek.add_argument("--image", type=str, default=None)
    peek.add_argument("--image-size", type=str, default=None)
    peeked, _ = peek.parse_known_args(args)
    self._image = peeked.image

    return super().parse(state, args, known_only)

def get_prompt_str(self, state, token_length):
"""
Returns a string with approximately the prescribed token length.

The prompt includes a prefix that encourages long responses, followed
by synthetic "word" tokens. We use calibration via the server's actual
token count to match the target length.

For VLM models (when --image is set), uses a VLM-appropriate prefix
that encourages detailed image description. The -p token count controls
only the text portion; image tokens are additional.
"""
model: ServerAdapter = state.model

# Choose prefix based on whether this is a VLM benchmark
prefix = self.VLM_PROMPT_PREFIX if self._image else self.PROMPT_PREFIX

# Start with an initial estimate: prefix + "word " repeated
# Assume prefix is ~20 tokens, so start with (token_length - 20) words
initial_word_count = max(1, token_length - 20)
test_prompt = self.PROMPT_PREFIX + "word " * initial_word_count
test_prompt = prefix + "word " * initial_word_count

# Make a calibration request to get the actual token count
try:
Expand All @@ -83,7 +136,7 @@ def get_prompt_str(self, state, token_length):

# Calculate adjusted word count
adjusted_words = max(1, initial_word_count - delta)
return self.PROMPT_PREFIX + "word " * adjusted_words
return prefix + "word " * adjusted_words

except Exception: # pylint: disable=broad-exception-caught
# If calibration fails, use initial estimation
Expand All @@ -109,8 +162,10 @@ def run_prompt(
iterations: Number of benchmark iterations
warmup_iterations: Number of warmup iterations (not counted in results)
output_tokens: Target number of output tokens
**kwargs: Additional arguments (ignored)
**kwargs: Additional arguments, including 'image' for VLM benchmarking
"""
image = kwargs.get("image", None)
Comment thread
praveen-iyer marked this conversation as resolved.
image_size = kwargs.get("image_size", None)
if self.first_run_prompt:
if not hasattr(state, "model"):
raise ValueError(
Expand Down Expand Up @@ -147,6 +202,8 @@ def run_prompt(
prompt,
max_new_tokens=output_tokens,
save_max_memory_used=self.save_max_memory_used,
image=image,
image_size=image_size,
)

# Check that we got valid metrics
Expand Down
106 changes: 105 additions & 1 deletion src/lemonade/tools/server_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
"""

import argparse
import base64
import io
import mimetypes
import platform
from typing import Optional

Expand Down Expand Up @@ -88,6 +91,92 @@ def __init__(
self.timeout = timeout
self.type = "server"

@staticmethod
def _resize_image(image_path: str, width: int, height: int) -> bytes:
    """
    Resize an image to the given width and height.

    Args:
        image_path: Path to the image file.
        width: Target width in pixels.
        height: Target height in pixels.

    Returns:
        JPEG-encoded bytes of the resized image.
    """
    from PIL import Image  # pylint: disable=import-outside-toplevel

    # Context manager ensures the underlying file handle is released.
    with Image.open(image_path) as img:
        resized = img.resize(
            (width, height),
            Image.Resampling.LANCZOS,  # pylint: disable=no-member
        )

    # JPEG cannot encode alpha or palette modes (e.g. RGBA/LA/P images
    # such as transparent PNGs); Pillow raises OSError on save. Convert
    # to RGB first so any input image can be encoded.
    if resized.mode != "RGB":
        resized = resized.convert("RGB")

    buf = io.BytesIO()
    resized.save(buf, format="JPEG", quality=85)
    return buf.getvalue()

@staticmethod
def _parse_image_size(image_size: str):
"""
Parse an image size string into (width, height) or a single max dimension.

Args:
image_size: Either "WIDTHxHEIGHT" (e.g. "1024x800") for exact
dimensions, or a single integer string (e.g. "384") to cap
the longest side while preserving aspect ratio.

Returns:
Tuple of (width, height) for exact resize, or (max_dim, None) for
aspect-ratio-preserving resize.
"""
if "x" in image_size.lower():
parts = image_size.lower().split("x")
return int(parts[0]), int(parts[1])
return int(image_size), None

@staticmethod
def _prepare_image_url(image_path: str, image_size: str = None) -> str:
"""
Convert an image file path to a base64 data URL, or return a URL as-is.

When image_size is provided, the image is resized client-side to reduce
the number of visual tokens the VLM needs to process.

Args:
image_path: Local file path or HTTP(S) URL to an image.
image_size: Optional resize spec -- "WIDTHxHEIGHT" for exact
dimensions, or a single integer for max longest side.

Returns:
A data URL (base64-encoded) or the original URL.
"""
if image_path.startswith(("http://", "https://")):
return image_path
Comment thread
praveen-iyer marked this conversation as resolved.

if image_size is not None:
width, height = ServerAdapter._parse_image_size(image_size)
Comment thread
praveen-iyer marked this conversation as resolved.
if height is None:
# Single value: cap longest side, preserve aspect ratio
from PIL import Image # pylint: disable=import-outside-toplevel

img = Image.open(image_path)
orig_w, orig_h = img.size
scale = width / max(orig_w, orig_h)
height = int(orig_h * scale)
width = int(orig_w * scale)

image_bytes = ServerAdapter._resize_image(image_path, width, height)
mime_type = "image/jpeg"
else:
mime_type, _ = mimetypes.guess_type(image_path)
if mime_type is None:
mime_type = "image/jpeg"
with open(image_path, "rb") as f:
image_bytes = f.read()

image_data = base64.b64encode(image_bytes).decode("utf-8")
return f"data:{mime_type};base64,{image_data}"

def generate(
self,
input_ids,
Expand All @@ -97,6 +186,8 @@ def generate(
top_k: int = None,
repeat_penalty: float = None,
save_max_memory_used: bool = False,
image: str = None,
image_size: str = None,
**kwargs, # pylint: disable=unused-argument
):
"""
Expand All @@ -110,17 +201,30 @@ def generate(
top_k: Top-k sampling parameter
repeat_penalty: Repetition penalty
save_max_memory_used: If True, capture wrapped server memory usage
image: Optional path or URL to an image for VLM models
image_size: Optional resize spec ("WIDTHxHEIGHT" or single int string)
**kwargs: Additional arguments (ignored)

Returns:
Generated text response
"""
prompt = input_ids # PassthroughTokenizer passes text directly

# Build message content (multimodal if image is provided)
if image is not None:
image_url = self._prepare_image_url(image, image_size=image_size)
content = [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": prompt},
]
messages = [{"role": "user", "content": content}]
else:
messages = [{"role": "user", "content": prompt}]

# Build request payload using chat/completions format
payload = {
"model": self.model_name,
"messages": [{"role": "user", "content": prompt}],
"messages": messages,
"max_tokens": max_new_tokens,
"stream": False,
"cache_prompt": False, # Disable prompt caching for accurate benchmarking
Expand Down