fix(actions): extract text from multimodal events in colang history (#1636)

Pouyanpi · web-flow · commit ef1dba542a89 · 2026-05-11T13:29:13.000+02:00
- Fix multimodal (text + image) user messages causing context overflow in Colang 1.0 history
- Root cause: `get_colang_history()` interpolates `event["text"]` directly via an f-string. When the message is multimodal, `text` is a list of content dicts (including base64 image blobs), so Python's `str()` representation of the entire list gets embedded into every downstream LLM prompt
- Add `_extract_user_text_from_event()` helper that joins the text parts and appends a single `[+ image]` marker when one or more image parts are present
- Apply the same extraction in `get_last_user_utterance()` so the `mask_prev_user_message` path matches correctly
diff --git a/nemoguardrails/actions/llm/utils.py b/nemoguardrails/actions/llm/utils.py
@@ -420,6 +420,47 @@ def warn_if_truncated(response: LLMResponse, task: str) -> bool:
     return truncated
 
 
+def _extract_user_text_from_event(event_text: Union[str, List[Dict[str, Any]]]) -> str:
+    """Flatten a multimodal user-message payload into a string for colang history.
+
+    Multimodal user events carry ``event_text`` as a list of OpenAI-style
+    content parts (``[{"type": "text", "text": "..."}, {"type": "image_url",
+    "image_url": {...}}, ...]``). Embedding that list in the colang history
+    would bloat the context with raw base64 data, so non-empty text parts are
+    joined with single spaces and one ``[+ image]`` marker is appended when
+    at least one ``image_url`` part is present (never one marker per image).
+
+    Text parts whose ``text`` is empty or not a string (e.g. ``None``), parts
+    of any other ``type``, and non-dict items are skipped, so the
+    ``" ".join(...)`` step cannot crash. An image-only message yields exactly
+    ``"[+ image]"``; an empty list yields ``""``.
+
+    Args:
+        event_text: Either a string (already flat) or a list of multimodal
+            content parts.
+
+    Returns:
+        The flattened text for a list input. Any non-list input (normally a
+        string) is returned unchanged, without conversion.
+    """
+    if isinstance(event_text, list):
+        text_parts = []
+        has_images = False
+        for item in event_text:
+            if isinstance(item, dict):  # non-dict items are ignored
+                if item.get("type") == "text":
+                    text = item.get("text")
+                    if isinstance(text, str) and text:
+                        text_parts.append(text)
+                elif item.get("type") == "image_url":
+                    has_images = True  # a flag, not a count: one marker in total
+        text = " ".join(text_parts)
+        if has_images:
+            text = f"{text} [+ image]".strip() if text else "[+ image]"
+        return text
+    return event_text  # non-list input passes through as-is
+
+
 def get_colang_history(
     events: List[dict],
     include_texts: bool = True,
@@ -463,7 +504,7 @@ def get_colang_history(
 
         for idx, event in enumerate(events):
             if event["type"] == "UserMessage" and include_texts:
-                history += f'user "{event["text"]}"\n'
+                history += f'user "{_extract_user_text_from_event(event["text"])}"\n'
             elif event["type"] == "UserIntent":
                 if include_texts:
                     history += f"  {event['intent']}\n"
@@ -636,7 +677,7 @@ def get_last_user_utterance(events: List[dict]) -> Optional[str]:
     """Returns the last user utterance from the events."""
     for event in reversed(events):
         if event["type"] == "UserMessage":
-            return event["text"]
+            return _extract_user_text_from_event(event["text"])
 
     return None
 
diff --git a/tests/test_actions_llm_utils_multimodal.py b/tests/test_actions_llm_utils_multimodal.py
@@ -0,0 +1,132 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemoguardrails.actions.llm.utils import (
+    get_colang_history,
+    get_last_user_utterance,
+)
+
+FAKE_BASE64 = "iVBORw0KGgoAAAANSUhEUg" * 5000  # large stand-in for a base64 image payload
+
+
+def _multimodal_content(text=None, image_b64=None):  # builds a list of OpenAI-style content parts
+    parts = []
+    if text is not None:  # an empty string still produces a text part
+        parts.append({"type": "text", "text": text})
+    if image_b64 is not None:  # payload is wrapped in a PNG data URL
+        parts.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}})
+    return parts  # empty list when both arguments are None
+
+
+class TestGetColangHistoryMultimodal:  # get_colang_history must not embed raw image payloads
+    def test_text_only_message_unchanged(self):
+        events = [{"type": "UserMessage", "text": "Hello there"}]
+        result = get_colang_history(events)
+        assert 'user "Hello there"' in result  # plain-string messages keep their text verbatim
+
+    def test_multimodal_text_and_image(self):
+        events = [{"type": "UserMessage", "text": _multimodal_content("Describe this", FAKE_BASE64)}]
+        result = get_colang_history(events)
+        assert FAKE_BASE64 not in result  # the image payload must not leak into the history
+        assert "Describe this [+ image]" in result
+
+    def test_multimodal_image_only(self):
+        events = [{"type": "UserMessage", "text": _multimodal_content(image_b64=FAKE_BASE64)}]
+        result = get_colang_history(events)
+        assert FAKE_BASE64 not in result
+        assert 'user "[+ image]"' in result  # no leading space before the marker
+
+    def test_multimodal_multiple_text_parts(self):
+        content = [
+            {"type": "text", "text": "First part"},
+            {"type": "text", "text": "Second part"},
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{FAKE_BASE64}"}},
+        ]
+        events = [{"type": "UserMessage", "text": content}]
+        result = get_colang_history(events)
+        assert FAKE_BASE64 not in result
+        assert "First part Second part [+ image]" in result  # text parts joined by one space
+
+    def test_multimodal_does_not_bloat_history(self):
+        events = [{"type": "UserMessage", "text": _multimodal_content("Describe this", FAKE_BASE64)}]
+        result = get_colang_history(events)
+        assert FAKE_BASE64 not in result
+        assert len(result) < 1000  # FAKE_BASE64 alone is far longer, so any leak fails here
+
+    def test_mixed_text_and_multimodal_conversation(self):
+        events = [
+            {"type": "UserMessage", "text": "Hi"},
+            {"type": "UserIntent", "intent": "express greeting"},
+            {"type": "BotIntent", "intent": "express greeting"},
+            {"type": "StartUtteranceBotAction", "script": "Hello!"},
+            {"type": "UserMessage", "text": _multimodal_content("What is this?", FAKE_BASE64)},
+        ]
+        result = get_colang_history(events)
+        assert 'user "Hi"' in result  # the earlier plain-text turn is still rendered
+        assert FAKE_BASE64 not in result
+        assert 'user "What is this? [+ image]"' in result
+
+
+class TestGetLastUserUtteranceMultimodal:  # get_last_user_utterance returns flattened text
+    def test_text_returns_string(self):
+        events = [{"type": "UserMessage", "text": "Plain text"}]
+        result = get_last_user_utterance(events)
+        assert result == "Plain text"  # plain strings come back unchanged
+        assert isinstance(result, str)
+
+    def test_multimodal_returns_string(self):
+        events = [{"type": "UserMessage", "text": _multimodal_content("Describe this", FAKE_BASE64)}]
+        result = get_last_user_utterance(events)
+        assert isinstance(result, str)  # a list payload must be flattened, not returned as a list
+        assert FAKE_BASE64 not in result
+        assert "[+ image]" in result
+
+    def test_multimodal_image_only(self):
+        events = [{"type": "UserMessage", "text": _multimodal_content(image_b64=FAKE_BASE64)}]
+        result = get_last_user_utterance(events)
+        assert isinstance(result, str)
+        assert FAKE_BASE64 not in result
+        assert result == "[+ image]"  # exact match: no surrounding whitespace
+
+    def test_multimodal_none_text_part_does_not_crash(self):
+        events = [
+            {
+                "type": "UserMessage",
+                "text": [
+                    {"type": "text", "text": None},  # non-string text must be skipped, not joined
+                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{FAKE_BASE64}"}},
+                ],
+            }
+        ]
+        result = get_last_user_utterance(events)
+        assert isinstance(result, str)
+        assert FAKE_BASE64 not in result
+        assert result == "[+ image]"
+
+    def test_empty_list_returns_empty_string(self):
+        events = [{"type": "UserMessage", "text": []}]
+        assert get_last_user_utterance(events) == ""  # no parts at all -> empty string
+
+    def test_multiple_images_single_placeholder(self):
+        events = [
+            {
+                "type": "UserMessage",
+                "text": [
+                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAA"}},
+                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,BBB"}},
+                ],
+            }
+        ]
+        assert get_last_user_utterance(events) == "[+ image]"  # one marker regardless of image count