mistralai · abdelhadi703 · Mar 3, 2026
diff --git a/eval/models.py b/eval/models.py
@@ -47,9 +47,8 @@ def _wait_till_healthy(self) -> bool:
                 # Ignore exception
                 pass
             else:
-                if (
-                    req.status_code == 200
-                    and req.content == b""
+                if req.status_code == 200 and (
+                    req.content == b""
                     or req.json() == {"status": "OK"}
                 ):
                     return True

diff --git a/eval/tasks/mm_mt_bench.py b/eval/tasks/mm_mt_bench.py
@@ -1,4 +1,6 @@
 import ast
+import base64
+import io
 import json
 import re
 import time
@@ -30,6 +32,38 @@ class Judgement:
     grade: float
 
 
+def _convert_image_chunks(content: Any) -> Any:
+    """Return `content` with inline image chunks rewritten as base64 `image_url` chunks.
+
+    Non-list content and non-image chunks pass through; a missing format means PNG.
+    """
+    if not isinstance(content, list):
+        return content
+    result = []
+    for item in content:
+        if not (isinstance(item, dict) and item.get("type") == "image" and "image" in item):
+            result.append(item)
+            continue
+        picture, buffer = item["image"], io.BytesIO()
+        fmt = picture.format or "PNG"
+        picture.save(buffer, format=fmt)
+        payload = base64.b64encode(buffer.getvalue()).decode("ascii")
+        url = f"data:image/{fmt.lower()};base64,{payload}"
+        result.append({"type": "image_url", "image_url": {"url": url}})
+    return result
+
+
+def _extract_text_content(content: Any) -> str:
+    """Flatten message content to a string: strings as-is, lists via their text chunks."""
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        # Neither a string nor a chunk list: fall back to its str() form.
+        return str(content)
+    texts = [part["text"] for part in content if isinstance(part, dict) and part.get("type") == "text"]
+    return "".join(texts)
+
+
 class MultimodalLLMJudge:
     API_MAX_RETRY: int = 3
     JUDGE_DEFAULT_TEMPERATURE: float = 0.0
@@ -53,7 +87,8 @@ def _add_or_append_chunk(
         self, prompt: list[dict[str, Any]], chunk: str | dict[str, Any]
     ):
         if isinstance(chunk, dict) and chunk["type"] == "image_url":
-            return chunk
+            prompt.append(chunk)
+            return
 
         text: str = chunk["text"] if isinstance(chunk, dict) else chunk
         assert isinstance(text, str)
@@ -140,10 +175,12 @@ def _query_judge(self, prompt):
                     raise e
 
     def get_judgement(self, interaction: Interaction):
-        questions = [m for m in interaction.request["messages"] if m["role"] == "user"]
+        questions = [_convert_image_chunks(m.get("content", "")) for m in interaction.request["messages"] if m["role"] == "user"]
         ref_answers = [
-            m for m in interaction.request["messages"] if m["role"] == "assistant"
-        ] + [interaction.reference_answer]
+            _extract_text_content(m.get("content", ""))
+            for m in interaction.request["messages"]
+            if m["role"] == "assistant"
+        ] + [_extract_text_content(interaction.reference_answer)]
         assert interaction.model_answer is not None
         prompt = self._get_judge_prompt(
             questions, ref_answers, interaction.model_answer