Skip to content

Commit 474fe12

Browse files
authored
fix: openai vision with base64 image
1 parent cd91485 commit 474fe12

File tree

1 file changed

+30
-6
lines changed

1 file changed

+30
-6
lines changed

llm_dialog_manager/agent.py

Lines changed: 30 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,10 @@ def load_env_vars():
3838

3939
load_env_vars()
4040

41+
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string.

    The file is opened in binary mode, read in full, base64-encoded, and the
    encoded bytes are decoded to text so the result can be placed directly
    inside a string such as a ``data:`` URL.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64-encoded file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
44+
4145
def format_messages_for_gemini(messages):
4246
"""
4347
将标准化的消息格式转化为 Gemini 格式。
@@ -393,22 +397,42 @@ def add_image(self, image_path: Optional[str] = None, image_url: Optional[str] =
393397
# For Gemini, load as PIL.Image
394398
image_pil = Image.open(image_path)
395399
image_block = image_pil
396-
else:
400+
elif "claude" in self.model_name and "openai" not in self.model_name:
397401
# For Claude, use a base64-encoded "image" content block
398402
with open(image_path, "rb") as img_file:
399403
image_data = base64.standard_b64encode(img_file.read()).decode("utf-8")
400404
image_block = {
401-
"type": "image_base64",
402-
"image_base64": {
405+
"type": "image",
406+
"source": {
407+
"type": "base64",
403408
"media_type": media_type,
404-
"data": image_data
405-
}
409+
"data": image_data,
410+
},
411+
}
412+
else:
413+
# openai format
414+
base64_image = encode_image(image_path)
415+
image_block = {
416+
"type": "image_url",
417+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
406418
}
407419
else:
408420
# If image_url is provided
409421
if "gemini" in self.model_name and "openai" not in self.model_name:
410422
# For Gemini, you can pass image URLs directly
411423
image_block = {"type": "image_url", "image_url": {"url": image_url}}
424+
elif "claude" in self.model_name and "openai" not in self.model_name:
425+
import httpx
426+
media_type = "image/jpeg"
427+
image_data = base64.standard_b64encode(httpx.get(image_url).content).decode("utf-8")
428+
image_block = {
429+
"type": "image",
430+
"source": {
431+
"type": "base64",
432+
"media_type": media_type,
433+
"data": image_data,
434+
},
435+
}
412436
else:
413437
# For OpenAI and other models, use image URLs
414438
image_block = {
@@ -525,7 +549,7 @@ def add_repo(self, repo_url: Optional[str] = None, username: Optional[str] = Non
525549
agent = Agent("gemini-1.5-flash", "you are Jack101", memory_enabled=True)
526550

527551
# Add an image
528-
agent.add_image(image_path="/Users/junfan/Projects/Personal/oneapi/dialog_manager/example.png")
552+
agent.add_image(image_path="example.png")
529553

530554
# Add a user message
531555
agent.add_message("user", "Who are you? What's in this image?")

0 commit comments

Comments
 (0)