feat(vlm-clients): pass cache_control to the correct part of an Anthropic prompt

liamlaverty · liamlaverty · commit b203e919653c · 2026-04-22T10:53:45.000+01:00
diff --git a/src/paint_by_language_model/services/clients/stroke_vlm_client.py b/src/paint_by_language_model/services/clients/stroke_vlm_client.py
@@ -219,25 +219,34 @@ def suggest_strokes(
 
         # Query VLM
         try:
-            images: list[tuple[bytes, str]] = []
+            # Static stroke sample images: routed to ``cached_images`` so they
+            # form the cached prefix of the user message on Anthropic. These
+            # are byte-identical across all iterations of a run and are
+            # therefore ideal cache content. ``cache_control`` is placed by
+            # ``VLMClient`` on the last cached image block.
+            cached_images: list[tuple[bytes, str]] = []
             allowed_lower = (
                 [t.lower() for t in self.allowed_stroke_types]
                 if self.allowed_stroke_types
                 else None
             )
             for stroke_type, sample_bytes in self._stroke_samples.items():
                 if allowed_lower is None or stroke_type.lower() in allowed_lower:
-                    images.append((sample_bytes, f"{stroke_type.upper()} stroke sample"))
-            images.append((canvas_image, "Current canvas"))
+                    cached_images.append((sample_bytes, f"{stroke_type.upper()} stroke sample"))
+
+            # Dynamic per-iteration content: just the current canvas.
+            images: list[tuple[bytes, str]] = [(canvas_image, "Current canvas")]
             logger.debug(
-                f"Attaching {len(images) - 1} stroke sample image(s) "
+                f"Attaching {len(cached_images)} stroke sample image(s) as cached prefix "
+                f"and 1 canvas image as dynamic content "
                 f"(allowed: {self.allowed_stroke_types or 'all'})"
             )
 
             response_text = self.client.query_multimodal_multi_image(
                 prompt=user_prompt,
                 images=images,
                 system_prompt=system_prompt,
+                cached_images=cached_images,
             )
 
             # Store raw response immediately so it is always available,
diff --git a/src/paint_by_language_model/vlm_client.py b/src/paint_by_language_model/vlm_client.py
@@ -87,10 +87,13 @@ def _log_request(
             payload (dict): Request body sent to the API
             response (requests.Response): The HTTP response received
         """
-        log_dir = Path(GLOBAL_PROMPT_LOG_DIR)
+        now = datetime.now()
+        log_dir = (
+            Path(GLOBAL_PROMPT_LOG_DIR) / f"{now.year:04d}" / f"{now.month:02d}" / f"{now.day:02d}"
+        )
         log_dir.mkdir(parents=True, exist_ok=True)
 
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        timestamp = now.strftime("%Y%m%d_%H%M%S_%f")
         log_path = log_dir / f"{timestamp}-request.log"
 
         # Mask sensitive header values
@@ -475,27 +478,71 @@ def _build_multi_image_payload(
         max_tokens: int,
         *,
         system_prompt: str,
+        cached_images: list[tuple[bytes, str]] | None = None,
     ) -> dict:
         """
         Build request payload for a multi-image multimodal query.
 
         Each image is preceded by a text label block. A final text block
         containing the main prompt is appended after all image blocks.
 
+        When ``cached_images`` is provided, those (image_bytes, label) pairs are
+        prepended to the user message before the dynamic ``images``. On
+        Anthropic, ``cache_control: ephemeral`` is placed on the **last cached
+        image block**, telling Anthropic to cache the entire prompt prefix up
+        to and including that block (system prompt + all cached images). The
+        dynamic per-request ``images`` and the final prompt text follow and are
+        not cached.
+
         Args:
             prompt (str): The main text prompt appended after all images
-            images (list[tuple[bytes, str]]): List of (image_bytes, label) pairs
+            images (list[tuple[bytes, str]]): Dynamic per-request
+                (image_bytes, label) pairs that change between calls.
             max_tokens (int): Maximum tokens in the response
             system_prompt (str): System-level instructions; provider-agnostic.
                 Anthropic: placed in top-level ``system`` field as a content
                 block array with block-level ``cache_control``. OpenAI-compatible
                 providers: prepended as a ``role: system`` message.
+            cached_images (list[tuple[bytes, str]] | None): Optional list of
+                static (image_bytes, label) pairs that are byte-identical
+                across requests. Prepended to the user message before the
+                dynamic ``images``. On Anthropic, the **last** of these image
+                blocks carries ``cache_control: ephemeral`` to mark the cache
+                prefix boundary. On OpenAI-compatible providers, they are
+                still prepended (no cache marker — explicit markers are Anthropic-only).
 
         Returns:
             dict: Request payload structure for the API
         """
         message_content: list[dict] = []
 
+        # Prepend cached static images (with cache_control on the last image
+        # block for Anthropic). Anthropic caches everything up to and
+        # including the marked block, so the system prefix + all of these
+        # images become the cache prefix.
+        cached_list = cached_images or []
+        for idx, (image_bytes, label) in enumerate(cached_list):
+            base64_image = base64.b64encode(image_bytes).decode("utf-8")
+            message_content.append({"type": "text", "text": label})
+            is_last_cached = idx == len(cached_list) - 1
+            if self.provider == "anthropic":
+                image_block: dict = {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/png",
+                        "data": base64_image,
+                    },
+                }
+                if is_last_cached:
+                    image_block["cache_control"] = {"type": "ephemeral"}
+                message_content.append(image_block)
+            else:
+                # OpenAI-compatible: use data URL format, no cache support
+                data_url = f"data:image/png;base64,{base64_image}"
+                message_content.append({"type": "image_url", "image_url": {"url": data_url}})
+
+        # Append dynamic per-request images
         for image_bytes, label in images:
             base64_image = base64.b64encode(image_bytes).decode("utf-8")
             # Label block before each image
@@ -523,6 +570,11 @@ def _build_multi_image_payload(
         payload["model"] = self.model
 
         if self.provider == "anthropic":
+            # When cached_images are present, the cache breakpoint lives on
+            # the last cached image block in the user message. The system
+            # block keeps its own breakpoint for text-only / fallback cases,
+            # which is harmless (Anthropic allows up to 4 breakpoints per
+            # request and caches the longest matching prefix).
             payload["system"] = [
                 {
                     "type": "text",
@@ -549,6 +601,7 @@ def query_multimodal_multi_image(
         max_tokens: int = MAX_TOKENS,
         *,
         system_prompt: str,
+        cached_images: list[tuple[bytes, str]] | None = None,
     ) -> str:
         """
         Send multiple labelled images and a text prompt to the VLM in one request.
@@ -559,13 +612,20 @@ def query_multimodal_multi_image(
 
         Args:
             prompt (str): The main text prompt sent after all images
-            images (list[tuple[bytes, str]]): List of (image_bytes, label) pairs
+            images (list[tuple[bytes, str]]): Dynamic per-request
+                (image_bytes, label) pairs that change between calls.
             max_tokens (int): Maximum tokens in response (default from config)
             system_prompt (str): System-level instructions sent to the model.
                 Required keyword-only argument. Anthropic: placed in the
                 top-level ``system`` field as a content block with
                 ``cache_control``. OpenAI-compatible providers: prepended as
                 the first ``role: system`` message.
+            cached_images (list[tuple[bytes, str]] | None): Optional list of
+                static images to include in the cached prompt prefix.
+                Prepended to the user message. On Anthropic, ``cache_control``
+                is placed on the last cached image block to mark the cache
+                boundary. On OpenAI-compatible providers no cache marker is
+                emitted (the images are still prepended for content parity).
 
         Returns:
             str: The VLM's response text
@@ -575,14 +635,22 @@ def query_multimodal_multi_image(
             ValueError: If image encoding fails
             requests.RequestException: For other HTTP errors
         """
+        cached_count = len(cached_images) if cached_images else 0
         total_bytes = sum(len(img_bytes) for img_bytes, _ in images)
+        cached_bytes = sum(len(img_bytes) for img_bytes, _ in cached_images) if cached_images else 0
         logger.info(
-            f"Sending multi-image query to VLM ({len(images)} images, {total_bytes} total bytes)"
+            f"Sending multi-image query to VLM "
+            f"({len(images)} dynamic images / {total_bytes} bytes; "
+            f"{cached_count} cached images / {cached_bytes} bytes)"
         )
 
         try:
             payload = self._build_multi_image_payload(
-                prompt, images, max_tokens, system_prompt=system_prompt
+                prompt,
+                images,
+                max_tokens,
+                system_prompt=system_prompt,
+                cached_images=cached_images,
             )
 
             # Retry loop for rate limiting
@@ -624,6 +692,7 @@ def query_multimodal_multi_image(
             response.raise_for_status()
 
             response_data = response.json()
+            self.last_usage = response_data.get("usage")
             response_text: str = self._extract_response_text(response_data)
 
             logger.info(f"Received VLM response ({len(response_text)} characters)")
diff --git a/tests/test_stroke_vlm_client.py b/tests/test_stroke_vlm_client.py
@@ -66,13 +66,12 @@ def test_sample_generator_initialized_at_init() -> None:
 
 
 def test_suggest_strokes_sends_sample_images() -> None:
-    """suggest_strokes() calls query_multimodal_multi_image with canvas + 9 sample images.
+    """suggest_strokes() routes stroke samples to cached_images and canvas to images.
 
     Verifies:
     - ``query_multimodal_multi_image`` is called (not ``query_multimodal``)
-        - The ``images`` argument contains exactly 11 entries (1 canvas + 10 samples)
-    - The first image label is ``"Current canvas""
-    - The remaining labels match the expected stroke sample names
+    - The ``images`` argument contains exactly 1 entry (the current canvas)
+    - The ``cached_images`` argument contains the 10 stroke sample entries
     - ``system_prompt`` keyword argument is passed
     """
     client = StrokeVLMClient()
@@ -107,22 +106,18 @@ def test_suggest_strokes_sends_sample_images() -> None:
     assert isinstance(call_kwargs["system_prompt"], str)
     assert len(call_kwargs["system_prompt"]) > 0
 
-    # Inspect the images argument (passed as keyword argument)
-    images: list[tuple[bytes, str]] = (
-        call_kwargs.get("images") or mock_multi.call_args.args[1]
-    )
-
-    assert len(images) == 11, (
-        f"Expected 11 images (1 canvas + 10 samples), got {len(images)}"
-    )
+    # images = dynamic per-iteration content (just the canvas)
+    images: list[tuple[bytes, str]] = call_kwargs["images"]
+    assert len(images) == 1, f"Expected 1 image (canvas only), got {len(images)}"
+    assert images[0][1] == "Current canvas"
+    assert images[0][0] == b"fake_canvas_bytes"
 
-    # Last entry must be the current canvas
-    assert images[-1][1] == "Current canvas", (
-        f"Last image label should be 'Current canvas', got '{images[-1][1]}'"
+    # cached_images = static prefix content (the 10 stroke samples)
+    cached_images: list[tuple[bytes, str]] = call_kwargs["cached_images"]
+    assert len(cached_images) == 10, (
+        f"Expected 10 stroke samples in cached_images, got {len(cached_images)}"
     )
-
-    # First 10 labels must be the stroke sample labels
-    sample_labels = {label for _, label in images[:-1]}
+    sample_labels = {label for _, label in cached_images}
     assert sample_labels == _EXPECTED_SAMPLE_LABELS, (
         f"Sample labels mismatch. Expected {_EXPECTED_SAMPLE_LABELS}, got {sample_labels}"
     )
@@ -234,9 +229,8 @@ def test_suggest_strokes_filters_samples_to_allowed_type() -> None:
     """suggest_strokes() only attaches sample images for allowed stroke types.
 
     When ``allowed_stroke_types=["line"]`` is set, exactly one sample image
-    (the LINE sample) should be appended beyond the canvas image, giving a total
-    of 2 entries in the ``images`` argument passed to
-    ``query_multimodal_multi_image``.
+    (the LINE sample) should appear in ``cached_images``. ``images`` always
+    contains only the current canvas.
     """
     client = StrokeVLMClient(allowed_stroke_types=["line"])
 
@@ -253,27 +247,25 @@ def test_suggest_strokes_filters_samples_to_allowed_type() -> None:
         )
 
     mock_multi.assert_called_once()
-    call_kwargs = mock_multi.call_args
-    images: list[tuple[bytes, str]] = (
-        call_kwargs.kwargs.get("images") or call_kwargs.args[1]
-    )
+    call_kwargs = mock_multi.call_args.kwargs
 
-    assert len(images) == 2, (
-        f"Expected 2 images (1 canvas + 1 allowed sample), got {len(images)}"
-    )
-    assert images[0][1] == "LINE stroke sample", (
-        f"First image label should be 'LINE stroke sample', got '{images[0][1]}'"
-    )
-    assert images[-1][1] == "Current canvas", (
-        f"Last image label should be 'Current canvas', got '{images[-1][1]}'"
+    images: list[tuple[bytes, str]] = call_kwargs["images"]
+    assert len(images) == 1, f"Expected 1 image (canvas), got {len(images)}"
+    assert images[0][1] == "Current canvas"
+
+    cached_images: list[tuple[bytes, str]] = call_kwargs["cached_images"]
+    assert len(cached_images) == 1, (
+        f"Expected 1 sample in cached_images (LINE only), got {len(cached_images)}"
     )
+    assert cached_images[0][1] == "LINE stroke sample"
 
 
 def test_suggest_strokes_sends_all_samples_when_allowed_none() -> None:
-    """suggest_strokes() attaches all sample images when allowed_stroke_types is None.
+    """suggest_strokes() attaches all sample images to cached_images when allowed is None.
 
     When no ``allowed_stroke_types`` restriction is set (the default), all ten
-    stroke sample images should be attached giving 11 total (canvas + 10 samples).
+    stroke sample images should appear in ``cached_images``. ``images`` contains
+    only the current canvas.
     """
     client = StrokeVLMClient()  # allowed_stroke_types defaults to None
 
@@ -290,15 +282,17 @@ def test_suggest_strokes_sends_all_samples_when_allowed_none() -> None:
         )
 
     mock_multi.assert_called_once()
-    call_kwargs = mock_multi.call_args
-    images: list[tuple[bytes, str]] = (
-        call_kwargs.kwargs.get("images") or call_kwargs.args[1]
-    )
+    call_kwargs = mock_multi.call_args.kwargs
+
+    images: list[tuple[bytes, str]] = call_kwargs["images"]
+    assert len(images) == 1
+    assert images[0][1] == "Current canvas"
 
-    assert len(images) == 11, (
-        f"Expected 11 images (1 canvas + 10 samples), got {len(images)}"
+    cached_images: list[tuple[bytes, str]] = call_kwargs["cached_images"]
+    assert len(cached_images) == 10, (
+        f"Expected 10 stroke samples in cached_images, got {len(cached_images)}"
     )
-    sample_labels = {label for _, label in images[:-1]}
+    sample_labels = {label for _, label in cached_images}
     assert sample_labels == _EXPECTED_SAMPLE_LABELS, (
         f"Sample labels mismatch. Expected {_EXPECTED_SAMPLE_LABELS}, got {sample_labels}"
     )
diff --git a/tests/test_vlm_client.py b/tests/test_vlm_client.py