diff --git a/examples/01_standalone_sdk/17_image_input.py b/examples/01_standalone_sdk/17_image_input.py index 8ec5d55292..976a2cb943 100644 --- a/examples/01_standalone_sdk/17_image_input.py +++ b/examples/01_standalone_sdk/17_image_input.py @@ -2,10 +2,17 @@ This script mirrors the basic setup from ``examples/01_hello_world.py`` but adds vision support by sending an image to the agent alongside text instructions. + +It also demonstrates multi-image input with base64-encoded images that exercise +the Anthropic many-image resizing path (>20 images are automatically downscaled +to 2000×2000 px). """ +import base64 +import io import os +from PIL import Image from pydantic import SecretStr from openhands.sdk import ( @@ -27,6 +34,16 @@ logger = get_logger(__name__) + +def _make_png_data_url(width: int, height: int, color: str = "red") -> str: + """Create a base64 PNG data URL with the given dimensions and colour.""" + image = Image.new("RGB", (width, height), color=color) + buffer = io.BytesIO() + image.save(buffer, format="PNG") + encoded = base64.b64encode(buffer.getvalue()).decode("ascii") + return f"data:image/png;base64,{encoded}" + + # Configure LLM (vision-capable model) api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." @@ -65,6 +82,7 @@ def conversation_callback(event: Event) -> None: agent=agent, callbacks=[conversation_callback], workspace=cwd ) +# ── Part 1: single URL image ────────────────────────────────────────────── IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png" conversation.send_message( @@ -88,6 +106,54 @@ def conversation_callback(event: Event) -> None: ) conversation.run() +# ── Part 2: many oversized base64 images (exercises Anthropic resize) ───── +# Generate 21 base64 images at 2500×100 px — just above the 20-image threshold +# that triggers Anthropic's many-image limit (2000×2000 px per image). +# The SDK will automatically downscale these before sending to the provider. +COLORS = [ + "red", + "green", + "blue", + "yellow", + "cyan", + "magenta", + "orange", + "purple", + "pink", + "brown", + "gray", + "white", + "navy", + "teal", + "olive", + "maroon", + "lime", + "aqua", + "coral", + "gold", + "indigo", +] +oversized_data_urls = [ + _make_png_data_url(2500, 100, color=COLORS[i % len(COLORS)]) for i in range(21) +] + +conversation.send_message( + Message( + role="user", + content=[ + TextContent( + text=( + "I'm sending you 21 solid-colour test images. " + "List the dominant colour of each image in order, " + "one per line." + ) + ), + ImageContent(image_urls=oversized_data_urls), + ], + ) +) +conversation.run() + print("=" * 100) print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index e3410551d1..94d5466b9c 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -93,6 +93,7 @@ from openhands.sdk.llm.streaming import ( TokenCallbackType, ) +from openhands.sdk.llm.utils.image_resize import maybe_resize_messages_for_provider from openhands.sdk.llm.utils.litellm_provider import infer_litellm_provider from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot from openhands.sdk.llm.utils.model_features import get_features @@ -1101,6 +1102,14 @@ def _infer_litellm_provider(self) -> str | None: self._litellm_provider = provider return provider + def _infer_model_info_provider(self) -> str | None: + if self._model_info is not None: + provider = self._model_info.get("litellm_provider") + if isinstance(provider, str) and provider: + return provider + + return self._infer_litellm_provider() + def _get_litellm_api_key_value(self) -> str | None: api_key_value: str | None = None if self.api_key: @@ -1414,6 +1423,12 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]: ) send_reasoning_content = model_features.send_reasoning_content + messages = maybe_resize_messages_for_provider( + messages, + provider=self._infer_model_info_provider(), + vision_enabled=vision_enabled, + ) + formatted_messages = [ message.to_chat_dict( cache_enabled=cache_enabled, diff --git a/openhands-sdk/openhands/sdk/llm/utils/image_resize.py b/openhands-sdk/openhands/sdk/llm/utils/image_resize.py new file mode 100644 index 0000000000..69ed52f031 --- /dev/null +++ b/openhands-sdk/openhands/sdk/llm/utils/image_resize.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +import base64 +import copy +import io + +from PIL import Image + +from openhands.sdk.llm.message import ImageContent, Message +from openhands.sdk.logger import get_logger + + +logger = get_logger(__name__) + +# Anthropic vision docs: requests with more than 20 images cap each image at +# 2000x2000 pixels. Requests with 20 or fewer images cap each image at +# 8000x8000 pixels. +# https://docs.anthropic.com/en/docs/build-with-claude/vision +ANTHROPIC_MANY_IMAGE_THRESHOLD = 20 +ANTHROPIC_MANY_IMAGE_MAX_DIMENSION = 2000 +ANTHROPIC_STANDARD_IMAGE_MAX_DIMENSION = 8000 + + +def maybe_resize_messages_for_provider( + messages: list[Message], *, provider: str | None, vision_enabled: bool +) -> list[Message]: + """Return a detached message list with provider-specific image resizing.""" + max_dimension = _get_image_max_dimension( + messages=messages, + provider=provider, + vision_enabled=vision_enabled, + ) + if max_dimension is None: + return messages + + resized_messages = copy.deepcopy(messages) + for message in resized_messages: + for content_item in message.content: + if isinstance(content_item, ImageContent): + content_item.image_urls = [ + _resize_base64_data_url(url, max_dimension=max_dimension) + for url in content_item.image_urls + ] + return resized_messages + + +def _get_image_max_dimension( + messages: list[Message], *, provider: str | None, vision_enabled: bool +) -> int | None: + if not vision_enabled or provider != "anthropic": + return None + + total_images = sum( + len(content_item.image_urls) + for message in messages + for content_item in message.content + if isinstance(content_item, ImageContent) + ) + if total_images == 0: + return None + if total_images <= ANTHROPIC_MANY_IMAGE_THRESHOLD: + return ANTHROPIC_STANDARD_IMAGE_MAX_DIMENSION + + return ANTHROPIC_MANY_IMAGE_MAX_DIMENSION + + +def _resize_base64_data_url(url: str, *, max_dimension: int) -> str: + if not url.startswith("data:image/"): + return url + + header, sep, encoded = url.partition(";base64,") + if not sep: + return url + + mime_type = header.removeprefix("data:") + + try: + raw_bytes = base64.b64decode(encoded) + with Image.open(io.BytesIO(raw_bytes)) as image: + if max(image.size) <= max_dimension: + return url + + image.thumbnail( + (max_dimension, max_dimension), + Image.Resampling.LANCZOS, + ) + image_format = image.format or mime_type.split("/", 1)[1].upper() + + if image_format == "JPG": + image_format = "JPEG" + + output_image = image + if image_format == "JPEG" and image.mode not in ("RGB", "L"): + output_image = image.convert("RGB") + + buffer = io.BytesIO() + output_image.save(buffer, format=image_format) + except Exception: + logger.warning( + "Failed to resize base64 data image for outgoing LLM request", + exc_info=True, + ) + return url + + resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii") + return f"data:{mime_type};base64,{resized_encoded}" diff --git a/openhands-sdk/pyproject.toml b/openhands-sdk/pyproject.toml index b5d9e78a60..8ab48a706a 100644 --- a/openhands-sdk/pyproject.toml +++ b/openhands-sdk/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "filelock>=3.20.1", "httpx[socks]>=0.27.0", "litellm>=1.83.7", + "pillow>=12.1.1", "pydantic>=2.12.5", "python-frontmatter>=1.1.0", "python-json-logger>=3.3.0", diff --git a/pyproject.toml b/pyproject.toml index d92b3cac55..5d5b46781e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ openhands-agent-server = { workspace = true } dev = [ "pre-commit>=4.3.0", "packaging>=24.2", + "pillow>=12.1.1", "psutil>=7.0.0", "pyright[nodejs]>=1.1.405", "pytest>=9.0.3", diff --git a/tests/sdk/llm/test_llm_image_resizing.py b/tests/sdk/llm/test_llm_image_resizing.py new file mode 100644 index 0000000000..795ea2d140 --- /dev/null +++ b/tests/sdk/llm/test_llm_image_resizing.py @@ -0,0 +1,225 @@ +import base64 +import io +from unittest.mock import patch + +from PIL import Image +from pydantic import SecretStr + +from openhands.sdk.llm import LLM, ImageContent, Message, TextContent +from openhands.sdk.llm.utils.image_resize import maybe_resize_messages_for_provider + + +def _make_png_data_url(width: int, height: int) -> str: + image = Image.new("RGB", (width, height), color="red") + buffer = io.BytesIO() + image.save(buffer, format="PNG") + encoded = base64.b64encode(buffer.getvalue()).decode("ascii") + return f"data:image/png;base64,{encoded}" + + +def _data_url_dimensions(url: str) -> tuple[int, int]: + _header, _sep, encoded = url.partition(";base64,") + image_bytes = base64.b64decode(encoded) + with Image.open(io.BytesIO(image_bytes)) as image: + return image.size + + +def _image_urls_from_chat_message(chat_message: dict) -> list[str]: + return [ + item["image_url"]["url"] + for item in chat_message["content"] + if item.get("type") == "image_url" + ] + + +def _format_for_provider( + llm: LLM, messages: list[Message], *, provider: str +) -> list[dict]: + with ( + patch.object(LLM, "vision_is_active", return_value=True), + patch.object(LLM, "_infer_litellm_provider", return_value=provider), + ): + return llm.format_messages_for_llm(messages) + + +def test_maybe_resize_messages_for_provider_does_not_mutate_inputs(): + original_url = _make_png_data_url(2400, 1200) + original_message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 21), + ], + ) + + resized_messages = maybe_resize_messages_for_provider( + [original_message], provider="anthropic", vision_enabled=True + ) + + resized_content = resized_messages[0].content[1] + assert isinstance(resized_content, ImageContent) + assert resized_messages[0] is not original_message + assert _data_url_dimensions(resized_content.image_urls[0]) == (2000, 1000) + + original_content = original_message.content[1] + assert isinstance(original_content, ImageContent) + assert original_content.image_urls[0] == original_url + + +def test_anthropic_many_image_requests_resize_base64_images(): + original_url = _make_png_data_url(2400, 1200) + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 21), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-many-image", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert len(image_urls) == 21 + assert _data_url_dimensions(image_urls[0]) == (2000, 1000) + original_content = message.content[1] + assert isinstance(original_content, ImageContent) + assert original_content.image_urls[0] == original_url + + +def test_proxy_anthropic_many_image_requests_use_model_info_provider(): + original_url = _make_png_data_url(2400, 1200) + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 21), + ], + ) + llm = LLM( + model="litellm_proxy/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-proxy-anthropic-many-image", + ) + llm._model_info = {"litellm_provider": "anthropic"} + + with ( + patch.object(LLM, "vision_is_active", return_value=True), + patch.object(LLM, "_infer_litellm_provider", return_value="litellm_proxy"), + ): + formatted = llm.format_messages_for_llm([message]) + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert len(image_urls) == 21 + assert _data_url_dimensions(image_urls[0]) == (2000, 1000) + + +def test_anthropic_exactly_twenty_images_use_standard_limit(): + original_url = _make_png_data_url(8001, 400) + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 20), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-twenty-images", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert len(image_urls) == 20 + assert _data_url_dimensions(image_urls[0]) == (8000, 400) + + +def test_anthropic_single_image_requests_do_not_resize(): + original_url = _make_png_data_url(2400, 2400) + message = Message( + role="user", + content=[ + TextContent(text="Describe this image."), + ImageContent(image_urls=[original_url]), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-single-image", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert image_urls == [original_url] + assert _data_url_dimensions(image_urls[0]) == (2400, 2400) + + +def test_anthropic_single_image_requests_resize_above_standard_limit(): + original_url = _make_png_data_url(8001, 400) + message = Message( + role="user", + content=[ + TextContent(text="Describe this image."), + ImageContent(image_urls=[original_url]), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-single-image-large", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert _data_url_dimensions(image_urls[0]) == (8000, 400) + + +def test_anthropic_many_image_requests_leave_url_images_unchanged(): + image_url = "https://example.com/image.png" + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[image_url] * 21), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-url-images", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + assert _image_urls_from_chat_message(formatted[0]) == [image_url] * 21 + + +def test_non_anthropic_many_image_requests_do_not_resize(): + original_url = _make_png_data_url(2400, 1200) + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 25), + ], + ) + llm = LLM( + model="gpt-4o", + api_key=SecretStr("test-key"), + usage_id="test-openai-many-image", + ) + + formatted = _format_for_provider(llm, [message], provider="openai") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert len(image_urls) == 25 + assert _data_url_dimensions(image_urls[0]) == (2400, 1200) diff --git a/uv.lock b/uv.lock index 8b433dc050..f02e2a6005 100644 --- a/uv.lock +++ b/uv.lock @@ -8,7 +8,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-27T19:20:35.138318429Z" +exclude-newer = "2026-04-28T15:16:34.692638Z" exclude-newer-span = "P7D" [manifest] @@ -33,6 +33,7 @@ constraints = [ dev = [ { name = "griffe", extras = ["pypi"], specifier = ">=2.0.0" }, { name = "packaging", specifier = ">=24.2" }, + { name = "pillow", specifier = ">=12.1.1" }, { name = "pre-commit", specifier = ">=4.3.0" }, { name = "psutil", specifier = ">=7.0.0" }, { name = "pycodestyle", specifier = ">=2.12.0" }, @@ -2492,6 +2493,7 @@ dependencies = [ { name = "httpx", extra = ["socks"] }, { name = "litellm" }, { name = "lmnr" }, + { name = "pillow" }, { name = "pydantic" }, { name = "python-frontmatter" }, { name = "python-json-logger" }, @@ -2515,6 +2517,7 @@ requires-dist = [ { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, { name = "litellm", specifier = ">=1.83.7" }, { name = "lmnr", specifier = ">=0.7.47" }, + { name = "pillow", specifier = ">=12.1.1" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" },