Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions eval/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,8 @@ def _wait_till_healthy(self) -> bool:
# Ignore exception
pass
else:
if (
req.status_code == 200
and req.content == b""
if req.status_code == 200 and (
req.content == b""
or req.json() == {"status": "OK"}
):
return True
Expand Down
45 changes: 41 additions & 4 deletions eval/tasks/mm_mt_bench.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import ast
import base64
import io
import json
import re
import time
Expand Down Expand Up @@ -30,6 +32,38 @@ class Judgement:
grade: float


def _convert_image_chunks(content: Any) -> Any:
    """Rewrite ``image`` chunks as base64 ``image_url`` chunks.

    Args:
        content: Message content. Anything that is not a list is returned
            unchanged.

    Returns:
        A new list in which every dict chunk that has ``type == "image"``
        and an ``"image"`` key is replaced by
        ``{"type": "image_url", "image_url": {"url": "data:image/<fmt>;base64,..."}}``.
        All other chunks are carried over as the same objects, in order.

    The image object must expose a ``format`` attribute and a
    ``save(stream, format=...)`` method (e.g. a PIL image).
    """
    if not isinstance(content, list):
        return content

    def _as_image_url(item: Any) -> Any:
        # Only dict chunks explicitly tagged as images and carrying a payload
        # are converted; everything else passes through untouched.
        is_image_chunk = (
            isinstance(item, dict)
            and item.get("type") == "image"
            and "image" in item
        )
        if not is_image_chunk:
            return item

        picture = item["image"]
        # Images without a recorded source format are serialized as PNG.
        fmt = picture.format or "PNG"
        buffer = io.BytesIO()
        picture.save(buffer, format=fmt)
        payload = base64.b64encode(buffer.getvalue()).decode("ascii")
        data_url = f"data:image/{fmt.lower()};base64,{payload}"
        return {"type": "image_url", "image_url": {"url": data_url}}

    return [_as_image_url(item) for item in content]


def _extract_text_content(content: Any) -> str:
    """Flatten message content into a plain string.

    Args:
        content: A string, a list of chunks, or any other object.

    Returns:
        * For a list: the ``"text"`` values of every dict chunk whose
          ``type`` is ``"text"``, concatenated in order with no separator.
          Other chunks are ignored.
        * For a string: the string itself.
        * For anything else: ``str(content)``.
    """
    if isinstance(content, list):
        pieces = []
        for part in content:
            # Non-dict entries and non-text chunks (e.g. images) contribute nothing.
            if isinstance(part, dict) and part.get("type") == "text":
                pieces.append(part["text"])
        return "".join(pieces)
    return content if isinstance(content, str) else str(content)


class MultimodalLLMJudge:
API_MAX_RETRY: int = 3
JUDGE_DEFAULT_TEMPERATURE: float = 0.0
Expand All @@ -53,7 +87,8 @@ def _add_or_append_chunk(
self, prompt: list[dict[str, Any]], chunk: str | dict[str, Any]
):
if isinstance(chunk, dict) and chunk["type"] == "image_url":
return chunk
prompt.append(chunk)
return

text: str = chunk["text"] if isinstance(chunk, dict) else chunk
assert isinstance(text, str)
Expand Down Expand Up @@ -140,10 +175,12 @@ def _query_judge(self, prompt):
raise e

def get_judgement(self, interaction: Interaction):
questions = [m for m in interaction.request["messages"] if m["role"] == "user"]
questions = [_convert_image_chunks(m.get("content", "")) for m in interaction.request["messages"] if m["role"] == "user"]
ref_answers = [
m for m in interaction.request["messages"] if m["role"] == "assistant"
] + [interaction.reference_answer]
_extract_text_content(m.get("content", ""))
for m in interaction.request["messages"]
if m["role"] == "assistant"
] + [_extract_text_content(interaction.reference_answer)]
assert interaction.model_answer is not None
prompt = self._get_judge_prompt(
questions, ref_answers, interaction.model_answer
Expand Down