11import uuid
2+ from typing import Optional
23
34from fastapi import APIRouter , Depends , File , Form , HTTPException , UploadFile
45from sqlalchemy .ext .asyncio import AsyncSession
2223router = APIRouter (prefix = "/multimodal" , tags = ["Multimodal" ])
2324
2425
25- @router .post ("/media " )
26- async def upload_meida (
27- file : UploadFile = File (... ),
28- session_id : uuid .UUID | None = Form (None ),
29- prompt : str | None = Form (None ),
30- audio_output : bool = Form (False ),
26+ @router .post ("/chat " )
27+ async def multimodal_chat (
28+ file : Optional [ UploadFile ] = File (None , description = "Optional image/audio file." ),
29+ session_id : Optional [ uuid .UUID ] = Form (None ),
30+ prompt : Optional [ str ] = Form (None , description = "User text input or question." ),
31+ audio_output : bool = Form (False , description = "Return response as audio if True." ),
3132 voice_style : VoiceStyle = Form (
3233 VoiceStyle .alloy ,
3334 description = """
@@ -44,8 +45,10 @@ async def upload_meida(
4445 current_user = Depends (get_current_user ),
4546):
4647 """
47- Accepts image or audio, processes it via OCR or Transcription,
48- and optionally returns the LLM response as audio.
48+ Handles multimodal chat:
49+ - Accepts text (`prompt`), a media file (image/audio), or both; at least one is required
50+ - Supports text or audio output response
51+ - Returns assistant response and optional audio file URL
4952 """
5053 SUPPORTED_TYPES = {
5154 "image" : ["image/jpeg" , "image/png" , "image/webp" ],
@@ -58,10 +61,12 @@ async def upload_meida(
5861 "audio/ogg" ,
5962 ],
6063 }
64+
6165 all_types = SUPPORTED_TYPES ["image" ] + SUPPORTED_TYPES ["audio" ]
6266
63- if file .content_type not in all_types :
64- raise HTTPException (status_code = 400 , detail = "Unsupported file type" )
67+ # Check if both file and prompt are empty
68+ if not file and not prompt :
69+ raise HTTPException (status_code = 400 , detail = "Either file or prompt is required" )
6570
6671 # Session handling
6772 if not session_id :
@@ -71,73 +76,81 @@ async def upload_meida(
7176 if not session or session .user_id != current_user .id :
7277 raise HTTPException (status_code = 403 , detail = "Invalid session" )
7378
74- # Upload to S3
75- file_bytes = await file . read ()
76- s3_obj = UploadToS3 ()
77- file_url = s3_obj . upload_file_to_s3 ( file_bytes , file . filename , file . content_type )
79+ # Step 1: Determine content
80+ content_summary = ""
81+ file_url = None
82+ media_type = None
7883
79- # Handle type
80- if file .content_type in SUPPORTED_TYPES ["image" ]:
81- media_type = MediaType .image
82- text_content = extract_text_from_s3_image (file_url )
83- content_summary = f"User uploaded an image. Extracted text: { text_content } . Prompt: { prompt or '' } "
84+ if file :
85+ if file .content_type not in all_types :
86+ raise HTTPException (status_code = 400 , detail = "Unsupported file type" )
8487
85- elif file . content_type in SUPPORTED_TYPES [ "audio" ]:
86- media_type = MediaType . audio
87- job_name = f"audio_transcribe_ { uuid . uuid4 (). hex [: 6 ] } "
88- text_content = transcribe_file ( job_name = job_name , s3_uri = file_url )
89- content_summary = f"User uploaded audio. Transcription: { text_content } . Prompt: { prompt or '' } "
88+ file_bytes = await file . read ()
89+ s3_obj = UploadToS3 ()
90+ file_url = s3_obj . upload_file_to_s3 (
91+ file_bytes , file . filename , file . content_type
92+ )
9093
94+ # Image Processing
95+ if file .content_type in SUPPORTED_TYPES ["image" ]:
96+ media_type = MediaType .image
97+ ocr_text = extract_text_from_s3_image (file_url )
98+ content_summary = (
99+ f"User uploaded an image. Extracted text: { ocr_text } . Prompt: { prompt } "
100+ )
101+
102+ # Audio Processing
103+ elif file .content_type in SUPPORTED_TYPES ["audio" ]:
104+ media_type = MediaType .audio
105+ text_content = transcribe_file (job_name = "audio_transcribe" , s3_uri = file_url )
106+ content_summary = (
107+ f"User uploaded an audio file. Transcription: { text_content } . "
108+ f"Convert it to English unless explicitly asked otherwise. Prompt: { prompt } "
109+ )
110+
111+ # Save attachment
112+ user_msg = await create_message (
113+ db , session .id , RoleEnum .user , prompt or f"Uploaded a { media_type .value } "
114+ )
115+ await create_attachment (
116+ db ,
117+ session .id ,
118+ user_msg .id ,
119+ file_url ,
120+ media_type ,
121+ {"filename" : file .filename },
122+ )
91123 else :
92- raise HTTPException (status_code = 400 , detail = "Unsupported media type" )
124+ # Text-only chat
125+ content_summary = f"User says: { prompt } "
93126
94- # Save user message
95- user_msg = await create_message (
96- db , session .id , RoleEnum .user , prompt or f"Uploaded { media_type .value } "
97- )
98- await create_attachment (
99- db ,
100- session .id ,
101- user_msg .id ,
102- file_url ,
103- media_type ,
104- {"filename" : file .filename },
105- )
106-
107- # Generate assistant response
127+ # Step 2: Generate Assistant Response
108128 history = [{"role" : "user" , "content" : content_summary }]
109129 assistant_content = await generate_response (history )
110130
111- # Save assistant message
112131 assistant_msg = await create_message (
113132 db , session .id , RoleEnum .assistant , assistant_content
114133 )
115134
116- response_data = {
135+ response_payload = {
117136 "assistant_message" : assistant_msg .content ,
118- "file_url" : file_url ,
119137 "session_id" : str (session .id ),
120138 "message_id" : str (assistant_msg .id ),
121139 }
122140
123- # --- Optional Audio Output ---
141+ # Step 3: Audio Output
124142 if audio_output :
125143 # Convert text to audio and upload it to S3
126144 audio_output_service = AudioOutput ()
127- audio_url = await audio_output_service .convert_text_into_audio (
128- voice_style = voice_style , assistant_content = assistant_msg .content
145+ audio_s3_url = await audio_output_service .convert_text_into_audio (
146+ assistant_content = assistant_content ,
147+ voice_style = voice_style .value ,
129148 )
130149
131- # Save assistant audio attachment
132- await create_attachment (
133- db ,
134- session .id ,
135- assistant_msg .id ,
136- audio_url ,
137- MediaType .audio ,
138- {"source" : "generated_speech" },
139- )
150+ response_payload ["audio_output_url" ] = audio_s3_url
140151
141- response_data ["assistant_audio_url" ] = audio_url
152+ # Add media file link if uploaded
153+ if file_url :
154+ response_payload ["uploaded_file_url" ] = file_url
142155
143- return response_data
156+ return response_payload
0 commit comments