mem0ai · haosenwang1018 · Apr 29, 2026
diff --git a/mem0/memory/utils.py b/mem0/memory/utils.py
@@ -179,6 +179,15 @@ def parse_vision_messages(messages, llm=None, vision_details="auto"):
 
         # Handle message content
         if isinstance(msg["content"], list):
+            if llm is None:
+                text_parts = [
+                    part.get("text", "")
+                    for part in msg["content"]
+                    if isinstance(part, dict) and part.get("type") == "text" and part.get("text")
+                ]
+                returned_messages.append({"role": msg["role"], "content": "\n".join(text_parts)})
+                continue
+
             # Multiple image URLs in content
             description = get_image_description(msg, llm, vision_details)
             returned_messages.append({"role": msg["role"], "content": description})
@@ -292,4 +301,3 @@ def remove_spaces_from_entities(
         item["destination"] = item["destination"].lower().replace(" ", "_")
         cleaned.append(item)
     return cleaned
-
diff --git a/tests/memory/test_memory_utils.py b/tests/memory/test_memory_utils.py
@@ -1,5 +1,5 @@
 import pytest
-from mem0.memory.utils import remove_spaces_from_entities, sanitize_relationship_for_cypher
+from mem0.memory.utils import parse_vision_messages, remove_spaces_from_entities, sanitize_relationship_for_cypher
 
 
 class TestRemoveSpacesFromEntities:
@@ -55,3 +55,36 @@ def test_sanitize_true_vs_false_slash_in_relationship(self):
         f = remove_spaces_from_entities([dict(base)], sanitize_relationship=False)[0]["relationship"]
         assert t == sanitize_relationship_for_cypher("a/b")
         assert f == "a/b"
+
+
+class TestParseVisionMessages:
+    """Exercise parse_vision_messages on list-valued content, with and without an LLM."""
+
+    def test_flattens_openai_text_parts_without_vision_llm(self):
+        """Without an LLM, text parts of list content are expected joined by newlines."""
+        plain = {"role": "user", "content": "你好"}
+        text_parts = [
+            {"type": "text", "text": "For context:"},
+            {"type": "text", "text": "xxxxxxx"},
+        ]
+        multipart = {"role": "user", "content": text_parts}
+
+        parsed = parse_vision_messages([plain, multipart])
+
+        assert parsed == [
+            {"role": "user", "content": "你好"},
+            {"role": "user", "content": "For context:\nxxxxxxx"},
+        ]
+
+    def test_uses_llm_for_list_content_when_vision_enabled(self, mocker):
+        """With an LLM given, its mocked response is expected as the content, after one call."""
+        image_part = {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}
+        prompt_part = {"type": "text", "text": "Describe this"}
+        message = {"role": "user", "content": [prompt_part, image_part]}
+        vision_llm = mocker.Mock()
+        vision_llm.generate_response.return_value = "image description"
+
+        parsed = parse_vision_messages([message], llm=vision_llm)
+
+        assert parsed == [{"role": "user", "content": "image description"}]
+        vision_llm.generate_response.assert_called_once()