From 19c998b9f85107f22e9a26514aa8210a27f9c8c9 Mon Sep 17 00:00:00 2001 From: Zheng Lu Date: Mon, 23 Mar 2026 22:39:33 +0000 Subject: [PATCH 01/10] fix(sdk): resize Anthropic many-image inputs Co-authored-by: openhands --- openhands-sdk/openhands/sdk/llm/llm.py | 85 +++++++++++++++ openhands-sdk/pyproject.toml | 1 + tests/sdk/llm/test_llm_image_resizing.py | 130 +++++++++++++++++++++++ 3 files changed, 216 insertions(+) create mode 100644 tests/sdk/llm/test_llm_image_resizing.py diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index c80ffcf0d1..4eb59a5b78 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -1,6 +1,8 @@ from __future__ import annotations +import base64 import copy +import io import json import os import warnings @@ -73,6 +75,7 @@ supports_vision, token_counter, ) +from PIL import Image from openhands.sdk.llm.exceptions import ( LLMContextWindowTooSmallError, @@ -83,6 +86,7 @@ # OpenHands utilities from openhands.sdk.llm.llm_response import LLMResponse from openhands.sdk.llm.message import ( + ImageContent, Message, ) from openhands.sdk.llm.mixins.non_native_fc import NonNativeToolCallingMixin @@ -127,6 +131,8 @@ # This cap prevents requesting output that exceeds the context window. # 16384 is a safe default that works for most models (GPT-4o: 16k, Claude: 8k). DEFAULT_MAX_OUTPUT_TOKENS_CAP: Final[int] = 16384 +ANTHROPIC_MANY_IMAGE_THRESHOLD: Final[int] = 20 +ANTHROPIC_MANY_IMAGE_MAX_DIMENSION: Final[int] = 2000 class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): @@ -1268,6 +1274,83 @@ def _apply_prompt_caching(self, messages: list[Message]) -> None: ].cache_prompt = True # Last item inside the message content break + def _apply_outgoing_image_resize( + self, messages: list[Message], *, vision_enabled: bool + ) -> None: + max_dimension = self._get_outgoing_image_max_dimension( + messages=messages, vision_enabled=vision_enabled + ) + if max_dimension is None: + return + + for message in messages: + for content_item in message.content: + if isinstance(content_item, ImageContent): + content_item.image_urls = [ + self._resize_base64_data_image_url( + url, max_dimension=max_dimension + ) + for url in content_item.image_urls + ] + + def _get_outgoing_image_max_dimension( + self, messages: list[Message], *, vision_enabled: bool + ) -> int | None: + if not vision_enabled or self._infer_litellm_provider() != "anthropic": + return None + + total_images = sum( + len(content_item.image_urls) + for message in messages + for content_item in message.content + if isinstance(content_item, ImageContent) + ) + if total_images <= ANTHROPIC_MANY_IMAGE_THRESHOLD: + return None + + return ANTHROPIC_MANY_IMAGE_MAX_DIMENSION + + @staticmethod + def _resize_base64_data_image_url(url: str, *, max_dimension: int) -> str: + if not url.startswith("data:image/"): + return url + + header, sep, encoded = url.partition(";base64,") + if not sep: + return url + + mime_type = header.removeprefix("data:") + + try: + raw_bytes = base64.b64decode(encoded) + with Image.open(io.BytesIO(raw_bytes)) as image: + if max(image.size) <= max_dimension: + return url + + resized_image = image.copy() + resized_image.thumbnail( + (max_dimension, max_dimension), Image.Resampling.LANCZOS + ) + image_format = image.format or mime_type.split("/", 1)[1].upper() + + if image_format == "JPG": + image_format = "JPEG" + + if image_format == "JPEG" and resized_image.mode not in ("RGB", "L"): + resized_image = resized_image.convert("RGB") + + buffer = io.BytesIO() + resized_image.save(buffer, format=image_format) + except Exception: + logger.warning( + "Failed to resize base64 data image for outgoing LLM request", + exc_info=True, + ) + return url + + resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii") + return f"data:{mime_type};base64,{resized_encoded}" + def format_messages_for_llm(self, messages: list[Message]) -> list[dict]: """Formats Message objects for LLM consumption.""" @@ -1286,6 +1369,8 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]: ) send_reasoning_content = model_features.send_reasoning_content + self._apply_outgoing_image_resize(messages, vision_enabled=vision_enabled) + formatted_messages = [ message.to_chat_dict( cache_enabled=cache_enabled, diff --git a/openhands-sdk/pyproject.toml b/openhands-sdk/pyproject.toml index 248921c6e2..ff8d0f3f1f 100644 --- a/openhands-sdk/pyproject.toml +++ b/openhands-sdk/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "filelock>=3.20.1", "httpx>=0.27.0", "litellm>=1.80.10", + "pillow>=12.1.1", "pydantic>=2.12.5", "python-frontmatter>=1.1.0", "python-json-logger>=3.3.0", diff --git a/tests/sdk/llm/test_llm_image_resizing.py b/tests/sdk/llm/test_llm_image_resizing.py new file mode 100644 index 0000000000..fe2d094158 --- /dev/null +++ b/tests/sdk/llm/test_llm_image_resizing.py @@ -0,0 +1,130 @@ +import base64 +import io +from unittest.mock import patch + +from PIL import Image +from pydantic import SecretStr + +from openhands.sdk.llm import LLM, ImageContent, Message, TextContent + + +def _make_png_data_url(width: int, height: int) -> str: + image = Image.new("RGB", (width, height), color="red") + buffer = io.BytesIO() + image.save(buffer, format="PNG") + encoded = base64.b64encode(buffer.getvalue()).decode("ascii") + return f"data:image/png;base64,{encoded}" + + +def _data_url_dimensions(url: str) -> tuple[int, int]: + _header, _sep, encoded = url.partition(";base64,") + image_bytes = base64.b64decode(encoded) + with Image.open(io.BytesIO(image_bytes)) as image: + return image.size + + +def _image_urls_from_chat_message(chat_message: dict) -> list[str]: + return [ + item["image_url"]["url"] + for item in chat_message["content"] + if item.get("type") == "image_url" + ] + + +def _format_for_provider( + llm: LLM, messages: list[Message], *, provider: str +) -> list[dict]: + with ( + patch.object(LLM, "vision_is_active", return_value=True), + patch.object(LLM, "_infer_litellm_provider", return_value=provider), + ): + return llm.format_messages_for_llm(messages) + + +def test_anthropic_many_image_requests_resize_base64_images(): + original_url = _make_png_data_url(2400, 1200) + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 21), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-many-image", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert len(image_urls) == 21 + assert _data_url_dimensions(image_urls[0]) == (2000, 1000) + original_content = message.content[1] + assert isinstance(original_content, ImageContent) + assert original_content.image_urls[0] == original_url + + +def test_anthropic_exactly_twenty_images_do_not_resize(): + original_url = _make_png_data_url(2400, 1200) + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 20), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-twenty-images", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert len(image_urls) == 20 + assert _data_url_dimensions(image_urls[0]) == (2400, 1200) + + +def test_anthropic_single_image_requests_do_not_resize(): + original_url = _make_png_data_url(2400, 2400) + message = Message( + role="user", + content=[ + TextContent(text="Describe this image."), + ImageContent(image_urls=[original_url]), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-single-image", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert image_urls == [original_url] + assert _data_url_dimensions(image_urls[0]) == (2400, 2400) + + +def test_anthropic_many_image_requests_leave_url_images_unchanged(): + image_url = "https://example.com/image.png" + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[image_url] * 21), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-url-images", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + assert _image_urls_from_chat_message(formatted[0]) == [image_url] * 21 From 4d047ed08adcdf90c2283fce4366751457a4b171 Mon Sep 17 00:00:00 2001 From: Zheng Lu Date: Sat, 28 Mar 2026 00:26:28 +0000 Subject: [PATCH 02/10] fix(sdk): handle Anthropic single-image limits Co-authored-by: openhands --- openhands-sdk/openhands/sdk/llm/llm.py | 90 +------------ .../openhands/sdk/llm/utils/image_resize.py | 122 ++++++++++++++++++ openhands-sdk/pyproject.toml | 2 +- pyproject.toml | 1 + tests/sdk/llm/test_llm_image_resizing.py | 49 ++++++- 5 files changed, 176 insertions(+), 88 deletions(-) create mode 100644 openhands-sdk/openhands/sdk/llm/utils/image_resize.py diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index 927933307f..90d8f3316e 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -1,8 +1,6 @@ from __future__ import annotations -import base64 import copy -import io import json import os import warnings @@ -76,7 +74,6 @@ supports_vision, token_counter, ) -from PIL import Image from openhands.sdk.llm.exceptions import ( LLMContextWindowTooSmallError, @@ -87,7 +84,6 @@ # OpenHands utilities from openhands.sdk.llm.llm_response import LLMResponse from openhands.sdk.llm.message import ( - ImageContent, Message, ) from openhands.sdk.llm.mixins.non_native_fc import NonNativeToolCallingMixin @@ -96,6 +92,7 @@ from openhands.sdk.llm.streaming import ( TokenCallbackType, ) +from openhands.sdk.llm.utils.image_resize import maybe_resize_messages_for_provider from openhands.sdk.llm.utils.litellm_provider import infer_litellm_provider from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot from openhands.sdk.llm.utils.model_features import get_features @@ -132,8 +129,6 @@ # This cap prevents requesting output that exceeds the context window. # 16384 is a safe default that works for most models (GPT-4o: 16k, Claude: 8k). DEFAULT_MAX_OUTPUT_TOKENS_CAP: Final[int] = 16384 -ANTHROPIC_MANY_IMAGE_THRESHOLD: Final[int] = 20 -ANTHROPIC_MANY_IMAGE_MAX_DIMENSION: Final[int] = 2000 class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): @@ -1350,83 +1345,6 @@ def _apply_prompt_caching(self, messages: list[Message]) -> None: ].cache_prompt = True # Last item inside the message content break - def _apply_outgoing_image_resize( - self, messages: list[Message], *, vision_enabled: bool - ) -> None: - max_dimension = self._get_outgoing_image_max_dimension( - messages=messages, vision_enabled=vision_enabled - ) - if max_dimension is None: - return - - for message in messages: - for content_item in message.content: - if isinstance(content_item, ImageContent): - content_item.image_urls = [ - self._resize_base64_data_image_url( - url, max_dimension=max_dimension - ) - for url in content_item.image_urls - ] - - def _get_outgoing_image_max_dimension( - self, messages: list[Message], *, vision_enabled: bool - ) -> int | None: - if not vision_enabled or self._infer_litellm_provider() != "anthropic": - return None - - total_images = sum( - len(content_item.image_urls) - for message in messages - for content_item in message.content - if isinstance(content_item, ImageContent) - ) - if total_images <= ANTHROPIC_MANY_IMAGE_THRESHOLD: - return None - - return ANTHROPIC_MANY_IMAGE_MAX_DIMENSION - - @staticmethod - def _resize_base64_data_image_url(url: str, *, max_dimension: int) -> str: - if not url.startswith("data:image/"): - return url - - header, sep, encoded = url.partition(";base64,") - if not sep: - return url - - mime_type = header.removeprefix("data:") - - try: - raw_bytes = base64.b64decode(encoded) - with Image.open(io.BytesIO(raw_bytes)) as image: - if max(image.size) <= max_dimension: - return url - - resized_image = image.copy() - resized_image.thumbnail( - (max_dimension, max_dimension), Image.Resampling.LANCZOS - ) - image_format = image.format or mime_type.split("/", 1)[1].upper() - - if image_format == "JPG": - image_format = "JPEG" - - if image_format == "JPEG" and resized_image.mode not in ("RGB", "L"): - resized_image = resized_image.convert("RGB") - - buffer = io.BytesIO() - resized_image.save(buffer, format=image_format) - except Exception: - logger.warning( - "Failed to resize base64 data image for outgoing LLM request", - exc_info=True, - ) - return url - - resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii") - return f"data:{mime_type};base64,{resized_encoded}" - def format_messages_for_llm(self, messages: list[Message]) -> list[dict]: """Formats Message objects for LLM consumption.""" @@ -1445,7 +1363,11 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]: ) send_reasoning_content = model_features.send_reasoning_content - self._apply_outgoing_image_resize(messages, vision_enabled=vision_enabled) + maybe_resize_messages_for_provider( + messages, + provider=self._infer_litellm_provider(), + vision_enabled=vision_enabled, + ) formatted_messages = [ message.to_chat_dict( diff --git a/openhands-sdk/openhands/sdk/llm/utils/image_resize.py b/openhands-sdk/openhands/sdk/llm/utils/image_resize.py new file mode 100644 index 0000000000..2a07fbcc1a --- /dev/null +++ b/openhands-sdk/openhands/sdk/llm/utils/image_resize.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +import base64 +import io +from typing import Any + +from openhands.sdk.llm.message import ImageContent, Message +from openhands.sdk.logger import get_logger + + +logger = get_logger(__name__) + +# Anthropic vision docs: requests with more than 20 images cap each image at +# 2000x2000 pixels. Requests with 20 or fewer images cap each image at +# 8000x8000 pixels. +# https://docs.anthropic.com/en/docs/build-with-claude/vision +ANTHROPIC_MANY_IMAGE_THRESHOLD = 20 +ANTHROPIC_MANY_IMAGE_MAX_DIMENSION = 2000 +ANTHROPIC_STANDARD_IMAGE_MAX_DIMENSION = 8000 + + +def maybe_resize_messages_for_provider( + messages: list[Message], *, provider: str | None, vision_enabled: bool +) -> None: + max_dimension = _get_image_max_dimension( + messages=messages, + provider=provider, + vision_enabled=vision_enabled, + ) + if max_dimension is None: + return + + image_module = _load_pillow_image_module() + if image_module is None: + return + + for message in messages: + for content_item in message.content: + if isinstance(content_item, ImageContent): + content_item.image_urls = [ + resize_base64_data_url( + url, + max_dimension=max_dimension, + image_module=image_module, + ) + for url in content_item.image_urls + ] + + +def _get_image_max_dimension( + messages: list[Message], *, provider: str | None, vision_enabled: bool +) -> int | None: + if not vision_enabled or provider != "anthropic": + return None + + total_images = sum( + len(content_item.image_urls) + for message in messages + for content_item in message.content + if isinstance(content_item, ImageContent) + ) + if total_images == 0: + return None + if total_images <= ANTHROPIC_MANY_IMAGE_THRESHOLD: + return ANTHROPIC_STANDARD_IMAGE_MAX_DIMENSION + + return ANTHROPIC_MANY_IMAGE_MAX_DIMENSION + + +def _load_pillow_image_module() -> Any | None: + try: + from PIL import Image + except ImportError: + logger.warning( + "pillow is not installed; skipping Anthropic image resizing. " + "Install openhands-sdk[pillow] to enable base64 image downscaling." + ) + return None + + return Image + + +def resize_base64_data_url(url: str, *, max_dimension: int, image_module: Any) -> str: + if not url.startswith("data:image/"): + return url + + header, sep, encoded = url.partition(";base64,") + if not sep: + return url + + mime_type = header.removeprefix("data:") + + try: + raw_bytes = base64.b64decode(encoded) + with image_module.open(io.BytesIO(raw_bytes)) as image: + if max(image.size) <= max_dimension: + return url + + image.thumbnail( + (max_dimension, max_dimension), + image_module.Resampling.LANCZOS, + ) + image_format = image.format or mime_type.split("/", 1)[1].upper() + + if image_format == "JPG": + image_format = "JPEG" + + output_image = image + if image_format == "JPEG" and image.mode not in ("RGB", "L"): + output_image = image.convert("RGB") + + buffer = io.BytesIO() + output_image.save(buffer, format=image_format) + except Exception: + logger.warning( + "Failed to resize base64 data image for outgoing LLM request", + exc_info=True, + ) + return url + + resized_encoded = base64.b64encode(buffer.getvalue()).decode("ascii") + return f"data:{mime_type};base64,{resized_encoded}" diff --git a/openhands-sdk/pyproject.toml b/openhands-sdk/pyproject.toml index 2710de1d7e..d58445eb82 100644 --- a/openhands-sdk/pyproject.toml +++ b/openhands-sdk/pyproject.toml @@ -12,7 +12,6 @@ dependencies = [ "filelock>=3.20.1", "httpx[socks]>=0.27.0", "litellm==1.80.10", - "pillow>=12.1.1", "pydantic>=2.12.5", "python-frontmatter>=1.1.0", "python-json-logger>=3.3.0", @@ -29,6 +28,7 @@ Documentation = "https://docs.openhands.dev/sdk" [project.optional-dependencies] boto3 = ["boto3>=1.35.0"] +pillow = ["pillow>=12.1.1"] [build-system] requires = ["setuptools>=61.0", "wheel"] diff --git a/pyproject.toml b/pyproject.toml index a2106c91d1..f70837c53f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ openhands-agent-server = { workspace = true } dev = [ "pre-commit>=4.3.0", "packaging>=24.2", + "pillow>=12.1.1", "psutil>=7.0.0", "pyright[nodejs]>=1.1.405", "pytest>=8.4.1", diff --git a/tests/sdk/llm/test_llm_image_resizing.py b/tests/sdk/llm/test_llm_image_resizing.py index fe2d094158..c713042c21 100644 --- a/tests/sdk/llm/test_llm_image_resizing.py +++ b/tests/sdk/llm/test_llm_image_resizing.py @@ -66,8 +66,8 @@ def test_anthropic_many_image_requests_resize_base64_images(): assert original_content.image_urls[0] == original_url -def test_anthropic_exactly_twenty_images_do_not_resize(): - original_url = _make_png_data_url(2400, 1200) +def test_anthropic_exactly_twenty_images_use_standard_limit(): + original_url = _make_png_data_url(8001, 400) message = Message( role="user", content=[ @@ -85,7 +85,7 @@ def test_anthropic_exactly_twenty_images_do_not_resize(): image_urls = _image_urls_from_chat_message(formatted[0]) assert len(image_urls) == 20 - assert _data_url_dimensions(image_urls[0]) == (2400, 1200) + assert _data_url_dimensions(image_urls[0]) == (8000, 400) def test_anthropic_single_image_requests_do_not_resize(): @@ -110,6 +110,27 @@ def test_anthropic_single_image_requests_do_not_resize(): assert _data_url_dimensions(image_urls[0]) == (2400, 2400) +def test_anthropic_single_image_requests_resize_above_standard_limit(): + original_url = _make_png_data_url(8001, 400) + message = Message( + role="user", + content=[ + TextContent(text="Describe this image."), + ImageContent(image_urls=[original_url]), + ], + ) + llm = LLM( + model="anthropic/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-anthropic-single-image-large", + ) + + formatted = _format_for_provider(llm, [message], provider="anthropic") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert _data_url_dimensions(image_urls[0]) == (8000, 400) + + def test_anthropic_many_image_requests_leave_url_images_unchanged(): image_url = "https://example.com/image.png" message = Message( @@ -128,3 +149,25 @@ def test_anthropic_many_image_requests_leave_url_images_unchanged(): formatted = _format_for_provider(llm, [message], provider="anthropic") assert _image_urls_from_chat_message(formatted[0]) == [image_url] * 21 + + +def test_non_anthropic_many_image_requests_do_not_resize(): + original_url = _make_png_data_url(2400, 1200) + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 25), + ], + ) + llm = LLM( + model="gpt-4o", + api_key=SecretStr("test-key"), + usage_id="test-openai-many-image", + ) + + formatted = _format_for_provider(llm, [message], provider="openai") + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert len(image_urls) == 25 + assert _data_url_dimensions(image_urls[0]) == (2400, 1200) From e2ed9aa0b5fff7124ea49634cdc32d4edaf4c836 Mon Sep 17 00:00:00 2001 From: Zheng Lu Date: Sun, 19 Apr 2026 00:08:00 +0100 Subject: [PATCH 03/10] fix(sdk): simplify image resize utilities Co-authored-by: openhands --- openhands-sdk/openhands/sdk/llm/llm.py | 2 +- .../openhands/sdk/llm/utils/image_resize.py | 49 ++++++++----------- tests/sdk/llm/test_llm_image_resizing.py | 25 ++++++++++ 3 files changed, 47 insertions(+), 29 deletions(-) diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index c80b8d246c..b0468ced88 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -1393,7 +1393,7 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]: ) send_reasoning_content = model_features.send_reasoning_content - maybe_resize_messages_for_provider( + messages = maybe_resize_messages_for_provider( messages, provider=self._infer_litellm_provider(), vision_enabled=vision_enabled, diff --git a/openhands-sdk/openhands/sdk/llm/utils/image_resize.py b/openhands-sdk/openhands/sdk/llm/utils/image_resize.py index 2a07fbcc1a..e742e23b9d 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/image_resize.py +++ b/openhands-sdk/openhands/sdk/llm/utils/image_resize.py @@ -1,8 +1,8 @@ from __future__ import annotations import base64 +import copy import io -from typing import Any from openhands.sdk.llm.message import ImageContent, Message from openhands.sdk.logger import get_logger @@ -21,30 +21,34 @@ def maybe_resize_messages_for_provider( messages: list[Message], *, provider: str | None, vision_enabled: bool -) -> None: +) -> list[Message]: + """Return a detached message list with provider-specific image resizing.""" max_dimension = _get_image_max_dimension( messages=messages, provider=provider, vision_enabled=vision_enabled, ) if max_dimension is None: - return + return messages - image_module = _load_pillow_image_module() - if image_module is None: - return + try: + from PIL import Image # noqa: F401 + except ImportError: + logger.warning( + "pillow is not installed; skipping Anthropic image resizing. " + "Install openhands-sdk[pillow] to enable base64 image downscaling." + ) + return messages - for message in messages: + resized_messages = copy.deepcopy(messages) + for message in resized_messages: for content_item in message.content: if isinstance(content_item, ImageContent): content_item.image_urls = [ - resize_base64_data_url( - url, - max_dimension=max_dimension, - image_module=image_module, - ) + _resize_base64_data_url(url, max_dimension=max_dimension) for url in content_item.image_urls ] + return resized_messages def _get_image_max_dimension( @@ -67,20 +71,7 @@ def _get_image_max_dimension( return ANTHROPIC_MANY_IMAGE_MAX_DIMENSION -def _load_pillow_image_module() -> Any | None: - try: - from PIL import Image - except ImportError: - logger.warning( - "pillow is not installed; skipping Anthropic image resizing. " - "Install openhands-sdk[pillow] to enable base64 image downscaling." - ) - return None - - return Image - - -def resize_base64_data_url(url: str, *, max_dimension: int, image_module: Any) -> str: +def _resize_base64_data_url(url: str, *, max_dimension: int) -> str: if not url.startswith("data:image/"): return url @@ -91,14 +82,16 @@ def resize_base64_data_url(url: str, *, max_dimension: int, image_module: Any) - mime_type = header.removeprefix("data:") try: + from PIL import Image + raw_bytes = base64.b64decode(encoded) - with image_module.open(io.BytesIO(raw_bytes)) as image: + with Image.open(io.BytesIO(raw_bytes)) as image: if max(image.size) <= max_dimension: return url image.thumbnail( (max_dimension, max_dimension), - image_module.Resampling.LANCZOS, + Image.Resampling.LANCZOS, ) image_format = image.format or mime_type.split("/", 1)[1].upper() diff --git a/tests/sdk/llm/test_llm_image_resizing.py b/tests/sdk/llm/test_llm_image_resizing.py index c713042c21..26c4ded770 100644 --- a/tests/sdk/llm/test_llm_image_resizing.py +++ b/tests/sdk/llm/test_llm_image_resizing.py @@ -6,6 +6,7 @@ from pydantic import SecretStr from openhands.sdk.llm import LLM, ImageContent, Message, TextContent +from openhands.sdk.llm.utils.image_resize import maybe_resize_messages_for_provider def _make_png_data_url(width: int, height: int) -> str: @@ -41,6 +42,30 @@ def _format_for_provider( return llm.format_messages_for_llm(messages) +def test_maybe_resize_messages_for_provider_does_not_mutate_inputs(): + original_url = _make_png_data_url(2400, 1200) + original_message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 21), + ], + ) + + resized_messages = maybe_resize_messages_for_provider( + [original_message], provider="anthropic", vision_enabled=True + ) + + resized_content = resized_messages[0].content[1] + assert isinstance(resized_content, ImageContent) + assert resized_messages[0] is not original_message + assert _data_url_dimensions(resized_content.image_urls[0]) == (2000, 1000) + + original_content = original_message.content[1] + assert isinstance(original_content, ImageContent) + assert original_content.image_urls[0] == original_url + + def test_anthropic_many_image_requests_resize_base64_images(): original_url = _make_png_data_url(2400, 1200) message = Message( From 64f0104b1542af7dca467a9fb00ac3a13617b2a3 Mon Sep 17 00:00:00 2001 From: Zheng Lu Date: Sun, 19 Apr 2026 01:18:05 +0100 Subject: [PATCH 04/10] fix(sdk): require pillow for image resizing Co-authored-by: openhands --- .../openhands/sdk/llm/utils/image_resize.py | 13 ++----------- openhands-sdk/pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/openhands-sdk/openhands/sdk/llm/utils/image_resize.py b/openhands-sdk/openhands/sdk/llm/utils/image_resize.py index e742e23b9d..69ed52f031 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/image_resize.py +++ b/openhands-sdk/openhands/sdk/llm/utils/image_resize.py @@ -4,6 +4,8 @@ import copy import io +from PIL import Image + from openhands.sdk.llm.message import ImageContent, Message from openhands.sdk.logger import get_logger @@ -31,15 +33,6 @@ def maybe_resize_messages_for_provider( if max_dimension is None: return messages - try: - from PIL import Image # noqa: F401 - except ImportError: - logger.warning( - "pillow is not installed; skipping Anthropic image resizing. " - "Install openhands-sdk[pillow] to enable base64 image downscaling." - ) - return messages - resized_messages = copy.deepcopy(messages) for message in resized_messages: for content_item in message.content: @@ -82,8 +75,6 @@ def _resize_base64_data_url(url: str, *, max_dimension: int) -> str: mime_type = header.removeprefix("data:") try: - from PIL import Image - raw_bytes = base64.b64decode(encoded) with Image.open(io.BytesIO(raw_bytes)) as image: if max(image.size) <= max_dimension: diff --git a/openhands-sdk/pyproject.toml b/openhands-sdk/pyproject.toml index 6aa2c32e4f..1867b20a35 100644 --- a/openhands-sdk/pyproject.toml +++ b/openhands-sdk/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "filelock>=3.20.1", "httpx[socks]>=0.27.0", "litellm>=1.82.6,!=1.82.7,!=1.82.8", + "pillow>=12.1.1", "pydantic>=2.12.5", "python-frontmatter>=1.1.0", "python-json-logger>=3.3.0", @@ -28,7 +29,6 @@ Documentation = "https://docs.openhands.dev/sdk" [project.optional-dependencies] boto3 = ["boto3>=1.35.0"] -pillow = ["pillow>=12.1.1"] [build-system] requires = ["setuptools>=61.0", "wheel"] From 4303819484c367eb6f81cf97b55a1631637bccda Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 27 Apr 2026 14:08:49 +0000 Subject: [PATCH 05/10] feat(example): add many-image send_message to exercise Anthropic resize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a second send_message in 17_image_input.py that sends 21 oversized base64 images (2500×100 px each), triggering the Anthropic many-image threshold (>20 images → 2000 px cap). This exercises the image resizing path introduced in this PR end-to-end. Co-authored-by: openhands --- examples/01_standalone_sdk/17_image_input.py | 66 ++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/examples/01_standalone_sdk/17_image_input.py b/examples/01_standalone_sdk/17_image_input.py index 8ec5d55292..976a2cb943 100644 --- a/examples/01_standalone_sdk/17_image_input.py +++ b/examples/01_standalone_sdk/17_image_input.py @@ -2,10 +2,17 @@ This script mirrors the basic setup from ``examples/01_hello_world.py`` but adds vision support by sending an image to the agent alongside text instructions. + +It also demonstrates multi-image input with base64-encoded images that exercise +the Anthropic many-image resizing path (>20 images are automatically downscaled +to 2000×2000 px). """ +import base64 +import io import os +from PIL import Image from pydantic import SecretStr from openhands.sdk import ( @@ -27,6 +34,16 @@ logger = get_logger(__name__) + +def _make_png_data_url(width: int, height: int, color: str = "red") -> str: + """Create a base64 PNG data URL with the given dimensions and colour.""" + image = Image.new("RGB", (width, height), color=color) + buffer = io.BytesIO() + image.save(buffer, format="PNG") + encoded = base64.b64encode(buffer.getvalue()).decode("ascii") + return f"data:image/png;base64,{encoded}" + + # Configure LLM (vision-capable model) api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." @@ -65,6 +82,7 @@ def conversation_callback(event: Event) -> None: agent=agent, callbacks=[conversation_callback], workspace=cwd ) +# ── Part 1: single URL image ────────────────────────────────────────────── IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png" conversation.send_message( @@ -88,6 +106,54 @@ def conversation_callback(event: Event) -> None: ) conversation.run() +# ── Part 2: many oversized base64 images (exercises Anthropic resize) ───── +# Generate 21 base64 images at 2500×100 px — just above the 20-image threshold +# that triggers Anthropic's many-image limit (2000×2000 px per image). +# The SDK will automatically downscale these before sending to the provider. +COLORS = [ + "red", + "green", + "blue", + "yellow", + "cyan", + "magenta", + "orange", + "purple", + "pink", + "brown", + "gray", + "white", + "navy", + "teal", + "olive", + "maroon", + "lime", + "aqua", + "coral", + "gold", + "indigo", +] +oversized_data_urls = [ + _make_png_data_url(2500, 100, color=COLORS[i % len(COLORS)]) for i in range(21) +] + +conversation.send_message( + Message( + role="user", + content=[ + TextContent( + text=( + "I'm sending you 21 solid-colour test images. " + "List the dominant colour of each image in order, " + "one per line." + ) + ), + ImageContent(image_urls=oversized_data_urls), + ], + ) +) +conversation.run() + print("=" * 100) print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): From 5095dd0f66fef9da8e8715a392aebac32f2dbc21 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 27 Apr 2026 14:58:13 +0000 Subject: [PATCH 06/10] fix(ci): use head SHA for wait-on-check-action in fork PRs The wait-on-check-action was using github.event.pull_request.head.ref (branch name) to look up check runs. For fork PRs, the branch name doesn't exist in the upstream repo, causing a 422 error: 'No commit found for SHA: fix/2467-image-downscale' Switch to github.event.pull_request.head.sha which always resolves correctly regardless of whether the PR is from a fork. Co-authored-by: openhands --- .github/workflows/run-examples.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml index 79a25d750d..1913b32f42 100644 --- a/.github/workflows/run-examples.yml +++ b/.github/workflows/run-examples.yml @@ -30,7 +30,7 @@ jobs: if: github.event_name == 'pull_request' uses: lewagon/wait-on-check-action@v1.7.0 with: - ref: ${{ github.event.pull_request.head.ref }} + ref: ${{ github.event.pull_request.head.sha }} check-name: Build & Push (python-amd64) repo-token: ${{ secrets.GITHUB_TOKEN }} wait-interval: 10 From f339ec1ba0df64602e7bdac1a1c4dba1255782df Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 27 Apr 2026 15:01:53 +0000 Subject: [PATCH 07/10] fix(ci): handle missing Build & Push check for fork PRs The Build & Push workflow doesn't run for fork PRs, so the wait-on-check-action was timing out and failing with 'The requested check was never run against this ref'. Set fail-on-no-checks: false so the wait step is a no-op when the check doesn't exist (fork PRs) while still waiting properly when it does (non-fork PRs). Co-authored-by: openhands --- .github/workflows/run-examples.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml index 1913b32f42..bcbf226316 100644 --- a/.github/workflows/run-examples.yml +++ b/.github/workflows/run-examples.yml @@ -34,6 +34,7 @@ jobs: check-name: Build & Push (python-amd64) repo-token: ${{ secrets.GITHUB_TOKEN }} wait-interval: 10 + fail-on-no-checks: false - name: Checkout uses: actions/checkout@v6 From 76aec9c4512d78d05bcc1e2181995208f503e3f8 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 27 Apr 2026 15:05:26 +0000 Subject: [PATCH 08/10] fix(ci): make PR comment posting non-fatal for fork PRs Fork PRs have restricted GITHUB_TOKEN permissions and cannot write comments on the upstream repo's PR. The update_comment function was calling exit 1 on failure, which aborted the entire test run. Change to emit warnings and continue when comment creation fails, clearing API_URL to skip subsequent comment attempts. Co-authored-by: openhands --- .github/workflows/run-examples.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml index bcbf226316..b8a2544311 100644 --- a/.github/workflows/run-examples.yml +++ b/.github/workflows/run-examples.yml @@ -110,13 +110,15 @@ jobs: -H "Content-Type: application/json" \ "${API_URL}" \ -d "$payload"); then - echo "::error::Failed to create PR comment." - exit 1 + echo "::warning::Failed to create PR comment (expected for fork PRs). Continuing without comments." + API_URL="" + return fi COMMENT_ID=$(echo "$response" | jq -r '.id // ""') if [ -z "$COMMENT_ID" ]; then - echo "::error::GitHub API response did not include a comment id: $response" - exit 1 + echo "::warning::GitHub API response did not include a comment id. Continuing without comments." + API_URL="" + return fi echo "Created comment with ID: $COMMENT_ID" else @@ -127,8 +129,8 @@ jobs: -H "Content-Type: application/json" \ "https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/comments/${COMMENT_ID}" \ -d "$payload" > /dev/null; then - echo "::error::Failed to update PR comment (ID: $COMMENT_ID)." - exit 1 + echo "::warning::Failed to update PR comment (ID: $COMMENT_ID). Continuing." + return fi fi } From 75390e7c0e7b2fb1e28fa9393de0fe2311b870e7 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 5 May 2026 11:17:18 -0400 Subject: [PATCH 09/10] fix(sdk): resize proxy Anthropic images --- openhands-sdk/openhands/sdk/llm/llm.py | 10 ++++++++- tests/sdk/llm/test_llm_image_resizing.py | 27 ++++++++++++++++++++++++ uv.lock | 5 ++++- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index 38c6e679d8..94d5466b9c 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -1102,6 +1102,14 @@ def _infer_litellm_provider(self) -> str | None: self._litellm_provider = provider return provider + def _infer_model_info_provider(self) -> str | None: + if self._model_info is not None: + provider = self._model_info.get("litellm_provider") + if isinstance(provider, str) and provider: + return provider + + return self._infer_litellm_provider() + def _get_litellm_api_key_value(self) -> str | None: api_key_value: str | None = None if self.api_key: @@ -1417,7 +1425,7 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]: messages = maybe_resize_messages_for_provider( messages, - provider=self._infer_litellm_provider(), + provider=self._infer_model_info_provider(), vision_enabled=vision_enabled, ) diff --git a/tests/sdk/llm/test_llm_image_resizing.py b/tests/sdk/llm/test_llm_image_resizing.py index 26c4ded770..795ea2d140 100644 --- a/tests/sdk/llm/test_llm_image_resizing.py +++ b/tests/sdk/llm/test_llm_image_resizing.py @@ -91,6 +91,33 @@ def test_anthropic_many_image_requests_resize_base64_images(): assert original_content.image_urls[0] == original_url +def test_proxy_anthropic_many_image_requests_use_model_info_provider(): + original_url = _make_png_data_url(2400, 1200) + message = Message( + role="user", + content=[ + TextContent(text="Describe these images."), + ImageContent(image_urls=[original_url] * 21), + ], + ) + llm = LLM( + model="litellm_proxy/claude-opus-4-6", + api_key=SecretStr("test-key"), + usage_id="test-proxy-anthropic-many-image", + ) + llm._model_info = {"litellm_provider": "anthropic"} + + with ( + patch.object(LLM, "vision_is_active", return_value=True), + patch.object(LLM, "_infer_litellm_provider", return_value="litellm_proxy"), + ): + formatted = llm.format_messages_for_llm([message]) + + image_urls = _image_urls_from_chat_message(formatted[0]) + assert len(image_urls) == 21 + assert _data_url_dimensions(image_urls[0]) == (2000, 1000) + + def test_anthropic_exactly_twenty_images_use_standard_limit(): original_url = _make_png_data_url(8001, 400) message = Message( diff --git a/uv.lock b/uv.lock index 8b433dc050..f02e2a6005 100644 --- a/uv.lock +++ b/uv.lock @@ -8,7 +8,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-27T19:20:35.138318429Z" +exclude-newer = "2026-04-28T15:16:34.692638Z" exclude-newer-span = "P7D" [manifest] @@ -33,6 +33,7 @@ constraints = [ dev = [ { name = "griffe", extras = ["pypi"], specifier = ">=2.0.0" }, { name = "packaging", specifier = ">=24.2" }, + { name = "pillow", specifier = ">=12.1.1" }, { name = "pre-commit", specifier = ">=4.3.0" }, { name = "psutil", specifier = ">=7.0.0" }, { name = "pycodestyle", specifier = ">=2.12.0" }, @@ -2492,6 +2493,7 @@ dependencies = [ { name = "httpx", extra = ["socks"] }, { name = "litellm" }, { name = "lmnr" }, + { name = "pillow" }, { name = "pydantic" }, { name = "python-frontmatter" }, { name = "python-json-logger" }, @@ -2515,6 +2517,7 @@ requires-dist = [ { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, { name = "litellm", specifier = ">=1.83.7" }, { name = "lmnr", specifier = ">=0.7.47" }, + { name = "pillow", specifier = ">=12.1.1" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" }, From 1e420e2a5a972410e38aa37791033463cba992ac Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 5 May 2026 11:35:19 -0400 Subject: [PATCH 10/10] chore: remove run examples workflow changes --- .github/workflows/run-examples.yml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml index b8a2544311..79a25d750d 100644 --- a/.github/workflows/run-examples.yml +++ b/.github/workflows/run-examples.yml @@ -30,11 +30,10 @@ jobs: if: github.event_name == 'pull_request' uses: lewagon/wait-on-check-action@v1.7.0 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ github.event.pull_request.head.ref }} check-name: Build & Push (python-amd64) repo-token: ${{ secrets.GITHUB_TOKEN }} wait-interval: 10 - fail-on-no-checks: false - name: Checkout uses: actions/checkout@v6 @@ -110,15 +109,13 @@ jobs: -H "Content-Type: application/json" \ "${API_URL}" \ -d "$payload"); then - echo "::warning::Failed to create PR comment (expected for fork PRs). Continuing without comments." - API_URL="" - return + echo "::error::Failed to create PR comment." + exit 1 fi COMMENT_ID=$(echo "$response" | jq -r '.id // ""') if [ -z "$COMMENT_ID" ]; then - echo "::warning::GitHub API response did not include a comment id. Continuing without comments." - API_URL="" - return + echo "::error::GitHub API response did not include a comment id: $response" + exit 1 fi echo "Created comment with ID: $COMMENT_ID" else @@ -129,8 +126,8 @@ jobs: -H "Content-Type: application/json" \ "https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/comments/${COMMENT_ID}" \ -d "$payload" > /dev/null; then - echo "::warning::Failed to update PR comment (ID: $COMMENT_ID). Continuing." - return + echo "::error::Failed to update PR comment (ID: $COMMENT_ID)." + exit 1 fi fi }