Skip to content

Commit ef1dba5

Browse files
authored
fix(actions): extract text from multimodal events in colang history (#1636)
- Fix multimodal (text + image) user messages causing context overflow in the Colang 1.0 history.
- Root cause: `get_colang_history()` interpolates `event["text"]` directly via an f-string. When the message is multimodal, `text` is a list of content dicts (including base64 image blobs), so Python's `str()` representation of the entire list gets embedded into every downstream LLM prompt.
- Add an `_extract_user_text_from_event()` helper that joins the text parts and replaces image entries with a single trailing `[+ image]` marker.
- Apply the same extraction in `get_last_user_utterance()` so that the `mask_prev_user_message` path matches correctly.
1 parent 17973d5 commit ef1dba5

2 files changed

Lines changed: 175 additions & 2 deletions

File tree

nemoguardrails/actions/llm/utils.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,47 @@ def warn_if_truncated(response: LLMResponse, task: str) -> bool:
420420
return truncated
421421

422422

423+
def _extract_user_text_from_event(event_text: Union[str, List[Dict[str, Any]]]) -> str:
424+
"""Flatten a multimodal user-message payload into a string for colang history.
425+
426+
Multimodal user events carry ``event_text`` as a list of OpenAI-style
427+
content parts (``[{"type": "text", "text": "..."}, {"type": "image_url",
428+
"image_url": {...}}, ...]``). Including the full list in the colang
429+
history bloats the context with raw base64 data; this helper extracts the
430+
visible text parts and appends a ``[+ image]`` marker when one or more
431+
image parts were present.
432+
433+
Non-string text fields (``None`` or other types) inside a content part
434+
are skipped so the ``" ".join(...)`` step cannot crash. If the message
435+
is image-only, the result is just ``"[+ image]"`` without a leading
436+
space.
437+
438+
Args:
439+
event_text: Either a string (already flat) or a list of multimodal
440+
content parts.
441+
442+
Returns:
443+
The flattened text. A list input always produces a string; a string
444+
input is returned unchanged.
445+
"""
446+
if isinstance(event_text, list):
447+
text_parts = []
448+
has_images = False
449+
for item in event_text:
450+
if isinstance(item, dict):
451+
if item.get("type") == "text":
452+
text = item.get("text")
453+
if isinstance(text, str) and text:
454+
text_parts.append(text)
455+
elif item.get("type") == "image_url":
456+
has_images = True
457+
text = " ".join(text_parts)
458+
if has_images:
459+
text = f"{text} [+ image]".strip() if text else "[+ image]"
460+
return text
461+
return event_text
462+
463+
423464
def get_colang_history(
424465
events: List[dict],
425466
include_texts: bool = True,
@@ -463,7 +504,7 @@ def get_colang_history(
463504

464505
for idx, event in enumerate(events):
465506
if event["type"] == "UserMessage" and include_texts:
466-
history += f'user "{event["text"]}"\n'
507+
history += f'user "{_extract_user_text_from_event(event["text"])}"\n'
467508
elif event["type"] == "UserIntent":
468509
if include_texts:
469510
history += f" {event['intent']}\n"
@@ -636,7 +677,7 @@ def get_last_user_utterance(events: List[dict]) -> Optional[str]:
636677
"""Returns the last user utterance from the events."""
637678
for event in reversed(events):
638679
if event["type"] == "UserMessage":
639-
return event["text"]
680+
return _extract_user_text_from_event(event["text"])
640681

641682
return None
642683

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
from nemoguardrails.actions.llm.utils import (
17+
get_colang_history,
18+
get_last_user_utterance,
19+
)
20+
21+
# Oversized stand-in for a base64 image payload (22 characters repeated 5000
# times = 110,000 characters). The tests in this module assert that this
# string never shows up in the rendered history or the returned utterance.
FAKE_BASE64 = "iVBORw0KGgoAAAANSUhEUg" * 5000
22+
23+
24+
def _multimodal_content(text=None, image_b64=None):
25+
parts = []
26+
if text is not None:
27+
parts.append({"type": "text", "text": text})
28+
if image_b64 is not None:
29+
parts.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}})
30+
return parts
31+
32+
33+
class TestGetColangHistoryMultimodal:
    """Checks how ``get_colang_history`` renders plain and multimodal user messages."""

    def test_text_only_message_unchanged(self):
        """A plain string message appears verbatim on its ``user "..."`` line."""
        history = get_colang_history([{"type": "UserMessage", "text": "Hello there"}])
        assert 'user "Hello there"' in history

    def test_multimodal_text_and_image(self):
        """Text plus image renders the text followed by the image marker."""
        user_message = {"type": "UserMessage", "text": _multimodal_content("Describe this", FAKE_BASE64)}
        history = get_colang_history([user_message])
        assert FAKE_BASE64 not in history
        assert "Describe this [+ image]" in history

    def test_multimodal_image_only(self):
        """An image-only message renders as the bare image marker."""
        user_message = {"type": "UserMessage", "text": _multimodal_content(image_b64=FAKE_BASE64)}
        history = get_colang_history([user_message])
        assert FAKE_BASE64 not in history
        assert 'user "[+ image]"' in history

    def test_multimodal_multiple_text_parts(self):
        """Several text parts are joined with single spaces before the marker."""
        image_part = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{FAKE_BASE64}"}}
        parts = [
            {"type": "text", "text": "First part"},
            {"type": "text", "text": "Second part"},
            image_part,
        ]
        history = get_colang_history([{"type": "UserMessage", "text": parts}])
        assert FAKE_BASE64 not in history
        assert "First part Second part [+ image]" in history

    def test_multimodal_does_not_bloat_history(self):
        """The rendered history stays short even though the image payload is huge."""
        user_message = {"type": "UserMessage", "text": _multimodal_content("Describe this", FAKE_BASE64)}
        history = get_colang_history([user_message])
        assert FAKE_BASE64 not in history
        assert len(history) < 1000

    def test_mixed_text_and_multimodal_conversation(self):
        """Plain and multimodal user turns in one conversation both render correctly."""
        conversation = [
            {"type": "UserMessage", "text": "Hi"},
            {"type": "UserIntent", "intent": "express greeting"},
            {"type": "BotIntent", "intent": "express greeting"},
            {"type": "StartUtteranceBotAction", "script": "Hello!"},
            {"type": "UserMessage", "text": _multimodal_content("What is this?", FAKE_BASE64)},
        ]
        history = get_colang_history(conversation)
        assert 'user "Hi"' in history
        assert FAKE_BASE64 not in history
        assert 'user "What is this? [+ image]"' in history
80+
81+
82+
class TestGetLastUserUtteranceMultimodal:
    """Checks that ``get_last_user_utterance`` returns a flat string for any payload shape."""

    def test_text_returns_string(self):
        """A plain string message is returned as-is."""
        utterance = get_last_user_utterance([{"type": "UserMessage", "text": "Plain text"}])
        assert utterance == "Plain text"
        assert isinstance(utterance, str)

    def test_multimodal_returns_string(self):
        """Text plus image yields a string carrying the marker, not the payload."""
        user_message = {"type": "UserMessage", "text": _multimodal_content("Describe this", FAKE_BASE64)}
        utterance = get_last_user_utterance([user_message])
        assert isinstance(utterance, str)
        assert FAKE_BASE64 not in utterance
        assert "[+ image]" in utterance

    def test_multimodal_image_only(self):
        """An image-only message yields exactly the image marker."""
        user_message = {"type": "UserMessage", "text": _multimodal_content(image_b64=FAKE_BASE64)}
        utterance = get_last_user_utterance([user_message])
        assert isinstance(utterance, str)
        assert FAKE_BASE64 not in utterance
        assert utterance == "[+ image]"

    def test_multimodal_none_text_part_does_not_crash(self):
        """A text part whose ``text`` is ``None`` is skipped rather than raising."""
        parts = [
            {"type": "text", "text": None},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{FAKE_BASE64}"}},
        ]
        utterance = get_last_user_utterance([{"type": "UserMessage", "text": parts}])
        assert isinstance(utterance, str)
        assert FAKE_BASE64 not in utterance
        assert utterance == "[+ image]"

    def test_empty_list_returns_empty_string(self):
        """An empty content-part list yields an empty string."""
        utterance = get_last_user_utterance([{"type": "UserMessage", "text": []}])
        assert utterance == ""

    def test_multiple_images_single_placeholder(self):
        """Two image parts still produce a single image marker."""
        parts = [
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAA"}},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,BBB"}},
        ]
        utterance = get_last_user_utterance([{"type": "UserMessage", "text": parts}])
        assert utterance == "[+ image]"

0 commit comments

Comments
 (0)