Skip to content

Commit 386bf4e

Browse files
committed
Enabled multimodal chat completely
1 parent 0f5cc00 commit 386bf4e

1 file changed

Lines changed: 69 additions & 56 deletions

File tree

app/api/v1/multimodal.py

Lines changed: 69 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import uuid
2+
from typing import Optional
23

34
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
45
from sqlalchemy.ext.asyncio import AsyncSession
@@ -22,12 +23,12 @@
2223
router = APIRouter(prefix="/multimodal", tags=["Multimodal"])
2324

2425

25-
@router.post("/media")
26-
async def upload_meida(
27-
file: UploadFile = File(...),
28-
session_id: uuid.UUID | None = Form(None),
29-
prompt: str | None = Form(None),
30-
audio_output: bool = Form(False),
26+
@router.post("/chat")
27+
async def multimodal_chat(
28+
file: Optional[UploadFile] = File(None, description="Optional image/audio file."),
29+
session_id: Optional[uuid.UUID] = Form(None),
30+
prompt: Optional[str] = Form(None, description="User text input or question."),
31+
audio_output: bool = Form(False, description="Return response as audio if True."),
3132
voice_style: VoiceStyle = Form(
3233
VoiceStyle.alloy,
3334
description="""
@@ -44,8 +45,10 @@ async def upload_meida(
4445
current_user=Depends(get_current_user),
4546
):
4647
"""
47-
Accepts image or audio, processes it via OCR or Transcription,
48-
and optionally returns the LLM response as audio.
48+
Handles multimodal chat:
49+
- Accepts optional text (`prompt`) or media file (image/audio)
50+
- Supports text or audio output response
51+
- Returns assistant response and optional audio file URL
4952
"""
5053
SUPPORTED_TYPES = {
5154
"image": ["image/jpeg", "image/png", "image/webp"],
@@ -58,10 +61,12 @@ async def upload_meida(
5861
"audio/ogg",
5962
],
6063
}
64+
6165
all_types = SUPPORTED_TYPES["image"] + SUPPORTED_TYPES["audio"]
6266

63-
if file.content_type not in all_types:
64-
raise HTTPException(status_code=400, detail="Unsupported file type")
67+
# Check if both file and prompt are empty
68+
if not file and not prompt:
69+
raise HTTPException(status_code=400, detail="Either file or prompt is required")
6570

6671
# Session handling
6772
if not session_id:
@@ -71,73 +76,81 @@ async def upload_meida(
7176
if not session or session.user_id != current_user.id:
7277
raise HTTPException(status_code=403, detail="Invalid session")
7378

74-
# Upload to S3
75-
file_bytes = await file.read()
76-
s3_obj = UploadToS3()
77-
file_url = s3_obj.upload_file_to_s3(file_bytes, file.filename, file.content_type)
79+
# Step 1: Determine content
80+
content_summary = ""
81+
file_url = None
82+
media_type = None
7883

79-
# Handle type
80-
if file.content_type in SUPPORTED_TYPES["image"]:
81-
media_type = MediaType.image
82-
text_content = extract_text_from_s3_image(file_url)
83-
content_summary = f"User uploaded an image. Extracted text: {text_content}. Prompt: {prompt or ''}"
84+
if file:
85+
if file.content_type not in all_types:
86+
raise HTTPException(status_code=400, detail="Unsupported file type")
8487

85-
elif file.content_type in SUPPORTED_TYPES["audio"]:
86-
media_type = MediaType.audio
87-
job_name = f"audio_transcribe_{uuid.uuid4().hex[:6]}"
88-
text_content = transcribe_file(job_name=job_name, s3_uri=file_url)
89-
content_summary = f"User uploaded audio. Transcription: {text_content}. Prompt: {prompt or ''}"
88+
file_bytes = await file.read()
89+
s3_obj = UploadToS3()
90+
file_url = s3_obj.upload_file_to_s3(
91+
file_bytes, file.filename, file.content_type
92+
)
9093

94+
# Image Processing
95+
if file.content_type in SUPPORTED_TYPES["image"]:
96+
media_type = MediaType.image
97+
ocr_text = extract_text_from_s3_image(file_url)
98+
content_summary = (
99+
f"User uploaded an image. Extracted text: {ocr_text}. Prompt: {prompt}"
100+
)
101+
102+
# Audio Processing
103+
elif file.content_type in SUPPORTED_TYPES["audio"]:
104+
media_type = MediaType.audio
105+
text_content = transcribe_file(job_name="audio_transcribe", s3_uri=file_url)
106+
content_summary = (
107+
f"User uploaded an audio file. Transcription: {text_content}. "
108+
f"Convert it to English unless explicitly asked otherwise. Prompt: {prompt}"
109+
)
110+
111+
# Save attachment
112+
user_msg = await create_message(
113+
db, session.id, RoleEnum.user, prompt or f"Uploaded a {media_type.value}"
114+
)
115+
await create_attachment(
116+
db,
117+
session.id,
118+
user_msg.id,
119+
file_url,
120+
media_type,
121+
{"filename": file.filename},
122+
)
91123
else:
92-
raise HTTPException(status_code=400, detail="Unsupported media type")
124+
# Text-only chat
125+
content_summary = f"User says: {prompt}"
93126

94-
# Save user message
95-
user_msg = await create_message(
96-
db, session.id, RoleEnum.user, prompt or f"Uploaded {media_type.value}"
97-
)
98-
await create_attachment(
99-
db,
100-
session.id,
101-
user_msg.id,
102-
file_url,
103-
media_type,
104-
{"filename": file.filename},
105-
)
106-
107-
# Generate assistant response
127+
# Step 2: Generate Assistant Response
108128
history = [{"role": "user", "content": content_summary}]
109129
assistant_content = await generate_response(history)
110130

111-
# Save assistant message
112131
assistant_msg = await create_message(
113132
db, session.id, RoleEnum.assistant, assistant_content
114133
)
115134

116-
response_data = {
135+
response_payload = {
117136
"assistant_message": assistant_msg.content,
118-
"file_url": file_url,
119137
"session_id": str(session.id),
120138
"message_id": str(assistant_msg.id),
121139
}
122140

123-
# --- Optional Audio Output ---
141+
# Step 3: Audio Output
124142
if audio_output:
125143
# Convert text to audio and upload it to S3
126144
audio_output_service = AudioOutput()
127-
audio_url = await audio_output_service.convert_text_into_audio(
128-
voice_style=voice_style, assistant_content=assistant_msg.content
145+
audio_s3_url = await audio_output_service.convert_text_into_audio(
146+
assistant_content=assistant_content,
147+
voice_style=voice_style.value,
129148
)
130149

131-
# Save assistant audio attachment
132-
await create_attachment(
133-
db,
134-
session.id,
135-
assistant_msg.id,
136-
audio_url,
137-
MediaType.audio,
138-
{"source": "generated_speech"},
139-
)
150+
response_payload["audio_output_url"] = audio_s3_url
140151

141-
response_data["assistant_audio_url"] = audio_url
152+
# Add media file link if uploaded
153+
if file_url:
154+
response_payload["uploaded_file_url"] = file_url
142155

143-
return response_data
156+
return response_payload

0 commit comments

Comments
 (0)