Enable Gemini prompt cache markers (#3090)

mindbomber · xingyaoww · web-flow · commit 12ec83fda65c · 2026-05-07T16:03:01.000Z
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py
@@ -1356,8 +1356,8 @@ def is_caching_prompt_active(self) -> bool:
         """
         if not self.caching_prompt:
             return False
-        # We don't need to look-up model_info, because
-        # only Anthropic models need explicit caching breakpoints
+        # We don't need to look up model_info because explicit caching
+        # breakpoint support is tracked in the local feature table.
         return (
             self.caching_prompt
             and get_features(self._model_name_for_capabilities()).supports_prompt_cache
@@ -1397,7 +1397,8 @@ def _apply_prompt_caching(self, messages: list[Message]) -> None:
                 # Single block: mark it for caching
                 sys_content[0].cache_prompt = True
 
-        # NOTE: this is only needed for anthropic
+        # Anthropic and Gemini both use these cache_control markers. LiteLLM
+        # performs the provider-specific cache setup for Gemini downstream.
         for message in reversed(messages):
             if message.role in ("user", "tool"):
                 message.content[
diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -112,6 +112,10 @@ def _supports_reasoning_effort(model: str | None) -> bool:
     "claude-opus-4-6",
     "claude-opus-4-7",
     "claude-sonnet-4-6",
+    # Gemini uses the same cache_control marker format as Anthropic; LiteLLM
+    # handles Vertex/Gemini context-cache creation when markers are present.
+    "gemini-2.5",
+    "gemini-3",
 ]
 
 # Models that support a top-level prompt_cache_retention parameter
diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py
@@ -123,6 +123,10 @@ def test_extended_thinking_support(model, expected_extended_thinking):
         ("anthropic.claude-3-5-sonnet-20241022", True),
         ("anthropic.claude-3-haiku-20240307", True),
         ("anthropic.claude-3-opus-20240229", True),
+        # Gemini explicit context caching through LiteLLM.
+        ("gemini-2.5-pro", True),
+        ("gemini-3.1-pro-preview", True),
+        ("litellm_proxy/gemini-3.1-pro-preview", True),
         ("gpt-4o", False),  # OpenAI doesn't support explicit prompt caching
         ("gemini-1.5-pro", False),
         ("unknown-model", False),
diff --git a/tests/sdk/llm/test_prompt_caching_cross_conversation.py b/tests/sdk/llm/test_prompt_caching_cross_conversation.py
@@ -144,6 +144,36 @@ def on_event(event):
     assert messages[1].content[-1].cache_prompt is True
 
 
+def test_gemini_prompt_caching_marks_formatted_messages():
+    """Gemini models should emit cache_control markers when caching is enabled."""
+    llm = LLM(
+        model="litellm_proxy/gemini-3.1-pro-preview",
+        usage_id="test",
+        caching_prompt=True,
+    )
+    messages = [
+        Message(
+            role="system",
+            content=[
+                TextContent(text="Static system prompt"),
+                TextContent(text="Dynamic context"),
+            ],
+        ),
+        Message(
+            role="user",
+            content=[TextContent(text="Hello")],
+        ),
+    ]
+
+    formatted_messages = llm.format_messages_for_llm(messages)
+
+    system_content = formatted_messages[0]["content"]
+    user_content = formatted_messages[1]["content"]
+    assert system_content[0]["cache_control"] == {"type": "ephemeral"}  # static block
+    assert "cache_control" not in system_content[1]  # dynamic block stays unmarked
+    assert user_content[-1]["cache_control"] == {"type": "ephemeral"}  # last user block
+
+
 @pytest.mark.parametrize(
     ("first_suffix", "second_suffix"),
     [