fix: prepend text message to content blocks in multimodal agent loop

Cairn-2001 · Axiom · commit fd0675a4c92f · 2026-04-12T11:36:07.000Z
When a user sends a message with image attachments via the upload API,
the agent loop receives both `user_message` (text) and
`user_content_blocks` (images). Previously, when content blocks were
present, only the blocks were pushed to the session — the text message
was silently dropped. The LLM received the images but not the user's
question or context.

This fix prepends the text message (when it is non-empty) as a
ContentBlock::Text at the front of the blocks vector before pushing to
the session, so the LLM sees both the user's text AND any attached
images in a single turn.

Both the non-streaming (`run_agent_loop`) and streaming
(`run_agent_loop_streaming`) agent loop paths are fixed.

Before:
  User: "What color is this?" + [image of blue square]
  LLM receives: [image only, no text]
  Response: "I can't see the image directly"

After:
  User: "What color is this?" + [image of blue square]
  LLM receives: [text: "What color is this?", image: blue square]
  Response: "Blue"

Tested with Qwen 3.5 Plus and Gemini 2.5 Flash via OpenRouter.
Images up to 1.3MB confirmed working through the full pipeline.

Signed-off-by: Cairn-2001 <Cairn-2001@smoothcurves.nexus>
diff --git a/crates/openfang-runtime/src/agent_loop.rs b/crates/openfang-runtime/src/agent_loop.rs
@@ -279,7 +279,18 @@ pub async fn run_agent_loop(
     // Add the user message to session history.
     // When content blocks are provided (e.g. text + image from a channel),
     // use multimodal message format so the LLM receives the image for vision.
-    if let Some(blocks) = user_content_blocks {
+    // The text message is prepended to the blocks so the LLM sees both the
+    // user's question AND any attached images in a single turn.
+    if let Some(mut blocks) = user_content_blocks {
+        if !user_message.is_empty() {
+            blocks.insert(
+                0,
+                ContentBlock::Text {
+                    text: user_message.to_string(),
+                    provider_metadata: None,
+                },
+            );
+        }
         session.messages.push(Message::user_with_blocks(blocks));
     } else {
         session.messages.push(Message::user(user_message));
@@ -1448,7 +1459,18 @@ pub async fn run_agent_loop_streaming(
     // Add the user message to session history.
     // When content blocks are provided (e.g. text + image from a channel),
     // use multimodal message format so the LLM receives the image for vision.
-    if let Some(blocks) = user_content_blocks {
+    // The text message is prepended to the blocks so the LLM sees both the
+    // user's question AND any attached images in a single turn.
+    if let Some(mut blocks) = user_content_blocks {
+        if !user_message.is_empty() {
+            blocks.insert(
+                0,
+                ContentBlock::Text {
+                    text: user_message.to_string(),
+                    provider_metadata: None,
+                },
+            );
+        }
         session.messages.push(Message::user_with_blocks(blocks));
     } else {
         session.messages.push(Message::user(user_message));