11import uuid
2+ from typing import Optional
23
34from fastapi import APIRouter , Depends , File , Form , HTTPException , UploadFile
45from sqlalchemy .ext .asyncio import AsyncSession
89from app .crud .message import create_message
910from app .crud .session import create_chat_session , get_chat_session
1011from app .db .session import get_async_session
12+ from app .models import VoiceStyle
1113from app .models .attachment import MediaType
1214from app .models .message import RoleEnum
1315from app .services import (
16+ AudioOutput ,
1417 UploadToS3 ,
1518 extract_text_from_s3_image ,
1619 generate_response ,
2023router = APIRouter (prefix = "/multimodal" , tags = ["Multimodal" ])
2124
2225
# NOTE(review): this span is a rendered diff, not plain Python. A leading
# "N- " marks a line removed from the old `upload_media` handler, "N+ " marks
# a line added for the new `multimodal_chat` handler, and two fused numbers
# mark unchanged context. The comments added below describe the added/context
# lines only.
#
# Purpose (new handler): POST /multimodal/chat accepts an optional upload
# and/or a text prompt, builds one user message from them, asks
# `generate_response` for a reply, stores the assistant message, and
# optionally adds a synthesized-audio URL to the returned payload.
#
# Raises HTTPException 400 when neither file nor prompt is given or the file's
# content type is not in the allow-list, and 403 when `session_id` does not
# resolve to a session owned by the caller.
#
# NOTE(review): `get_current_user`, `create_attachment` and `transcribe_file`
# are used below but do not appear in the visible imports; the import list is
# partly elided in this rendering, so presumably they are imported there -
# confirm.
23- @router .post ("/media" )
24- async def upload_media (
25- file : UploadFile = File (...),
26- session_id : uuid .UUID | None = Form (None ),
27- prompt : str | None = Form (None ),
26+ @router .post ("/chat" )
27+ async def multimodal_chat (
28+ file : Optional [UploadFile ] = File (None , description = "Optional image/audio file." ),
29+ session_id : Optional [uuid .UUID ] = Form (None ),
30+ prompt : Optional [str ] = Form (None , description = "User text input or question." ),
31+ audio_output : bool = Form (False , description = "Return response as audio if True." ),
32+ voice_style : VoiceStyle = Form (
33+ VoiceStyle .alloy ,
34+ description = """
35+ Choose the output voice style:\n
36+ - alloy: Versatile and neutral-sounding voice.\n
37+ - echo: Warm and resonant voice.\n
38+ - fable: Clear and articulate voice.\n
39+ - onyx: Deep and commanding voice.\n
40+ - nova: Bright and energetic voice.\n
41+ - shimmer: Smooth and calming voice.\n
42+ """ ,
43+ ),
2844 db : AsyncSession = Depends (get_async_session ),
2945 current_user = Depends (get_current_user ),
3046):
3147 """
32- Accepts image and audio
48+ Handles multimodal chat:
49+ - Accepts optional text (`prompt`) or media file (image/audio)
50+ - Supports text or audio output response
51+ - Returns assistant response and optional audio file URL
3352 """
34- # Accept images and audio formats
# Allow-list of upload MIME types, keyed by how the file is processed below
# (image -> text extraction, audio -> transcription).
3553 SUPPORTED_TYPES = {
3654 "image" : ["image/jpeg" , "image/png" , "image/webp" ],
3755 "audio" : [
@@ -43,71 +61,107 @@ async def upload_media(
4361 "audio/ogg" ,
4462 ],
4563 }
64+
# NOTE(review): the hunk boundary above elides most of the audio MIME list;
# only "audio/ogg" is visible in this rendering.
# Flat image + audio list used for the content-type membership check.
4665 all_types = SUPPORTED_TYPES ["image" ] + SUPPORTED_TYPES ["audio" ]
4766
48- if file .content_type not in all_types :
49- raise HTTPException (status_code = 400 , detail = "Unsupported file type" )
# The check is truthiness-based, so an empty-string prompt counts as missing.
67+ # Check if both file and prompt are empty
68+ if not file and not prompt :
69+ raise HTTPException (status_code = 400 , detail = "Either file or prompt is required" )
5070
51- # If no session provided, create one
# No session_id: a new session titled "Media Session" is created for the
# caller. Otherwise the session must exist and belong to the caller; both a
# missing session and a foreign one produce the same 403.
71+ # Session handling
5272 if not session_id :
5373 session = await create_chat_session (db , current_user .id , title = "Media Session" )
5474 else :
5575 session = await get_chat_session (db , session_id )
5676 if not session or session .user_id != current_user .id :
5777 raise HTTPException (status_code = 403 , detail = "Invalid session" )
5878
59- # Read file
60- file_bytes = await file .read ()
61-
62- # Upload to S3
63- s3_obj = UploadToS3 ()
64- file_url = s3_obj .upload_file_to_s3 (file_bytes , file .filename , file .content_type )
65-
66- # Determine type and handle processing
67- if file .content_type in SUPPORTED_TYPES ["image" ]:
68- media_type = MediaType .image
69- text_content = extract_text_from_s3_image (file_url ) # OCR for images
70- content_summary = f"User uploaded an image: The content of image is: \n { text_content } . Prompt:\n { prompt } "
71-
72- elif file .content_type in SUPPORTED_TYPES ["audio" ]:
73- media_type = MediaType .audio
74- # Transcribe the audio file using AWS Transcribe
75- job_name = "audio_transcribe"
76- text_content = transcribe_file (job_name = job_name , s3_uri = file_url )
77-
78- content_summary = f"User uploaded an audio file: The transcribe of audio is: \n { text_content } . In whatever language the transcribe is convert it into english and then reply only in English, Unless user explicitly asks for the specific language. \n Prompt:\n { prompt } "
# file_url and media_type stay None for a text-only request; later code
# (the audio-output attachment and the final payload) reads them.
79+ # Step 1: Determine content
80+ content_summary = ""
81+ file_url = None
82+ media_type = None
83+
84+ if file :
85+ if file .content_type not in all_types :
86+ raise HTTPException (status_code = 400 , detail = "Unsupported file type" )
87+
# The whole upload is read into memory before being handed to the uploader.
# NOTE(review): upload_file_to_s3, extract_text_from_s3_image and
# transcribe_file are called without `await` inside this async handler; if
# they do blocking network I/O they will stall the event loop - confirm.
88+ file_bytes = await file .read ()
89+ s3_obj = UploadToS3 ()
90+ file_url = s3_obj .upload_file_to_s3 (
91+ file_bytes , file .filename , file .content_type
92+ )
93+
94+ # Image Processing
95+ if file .content_type in SUPPORTED_TYPES ["image" ]:
96+ media_type = MediaType .image
97+ ocr_text = extract_text_from_s3_image (file_url )
# `prompt` may be None here (file-only request); the f-string then embeds
# the literal text "None" after "Prompt:".
98+ content_summary = (
99+ f"User uploaded an image. Extracted text: { ocr_text } . Prompt: { prompt } "
100+ )
101+
102+ # Audio Processing
103+ elif file .content_type in SUPPORTED_TYPES ["audio" ]:
104+ media_type = MediaType .audio
# NOTE(review): the job name is the same constant for every request. If
# transcribe_file forwards it to a service that requires unique job names
# (the removed comment above mentions AWS Transcribe), repeated or concurrent
# uploads could collide - confirm.
105+ text_content = transcribe_file (job_name = "audio_transcribe" , s3_uri = file_url )
# Same None-prompt caveat as the image branch applies to this f-string.
106+ content_summary = (
107+ f"User uploaded an audio file. Transcription: { text_content } . "
108+ f"Convert it to English unless explicitly asked otherwise. Prompt: { prompt } "
109+ )
110+
# media_type is always set at this point: content_type passed the all_types
# check, and all_types is exactly the image list plus the audio list.
# The stored user message falls back to "Uploaded a <type>" without a prompt.
111+ # Save attachment
112+ user_msg = await create_message (
113+ db , session .id , RoleEnum .user , prompt or f"Uploaded a { media_type .value } "
114+ )
115+ await create_attachment (
116+ db ,
117+ session .id ,
118+ user_msg .id ,
119+ file_url ,
120+ media_type ,
121+ {"filename" : file .filename },
122+ )
79123 else :
80- raise HTTPException (status_code = 400 , detail = "Unsupported media type" )
81-
82- # Save user message with attachment
83- user_msg = await create_message (
84- db , session .id , RoleEnum .user , prompt or f"Uploaded a { media_type .value } "
85- )
86- await create_attachment (
87- db ,
88- session .id ,
89- user_msg .id ,
90- file_url ,
91- media_type ,
92- {"filename" : file .filename },
93- )
94-
95- history = [
96- {
97- "role" : "user" ,
98- "content" : content_summary ,
99- }
100- ]
# NOTE(review): the only user-role create_message call sits in the file
# branch above, so a text-only prompt appears never to be persisted - only
# the assistant reply is. Indentation is not visible in this rendering;
# confirm against the real file.
124+ # Text-only chat
125+ content_summary = f"User says: { prompt } "
101126
# The history sent to the model holds only this single turn; earlier messages
# of an existing session are not included.
127+ # Step 2: Generate Assistant Response
128+ history = [{"role" : "user" , "content" : content_summary }]
102129 assistant_content = await generate_response (history )
103130
104131 assistant_msg = await create_message (
105132 db , session .id , RoleEnum .assistant , assistant_content
106133 )
107134
108- return {
# Payload keys changed from the old handler: "file_url" is no longer always
# present; "uploaded_file_url" and "audio_output_url" are added conditionally
# further down.
135+ response_payload = {
109136 "assistant_message" : assistant_msg .content ,
110- "file_url" : file_url ,
111137 "session_id" : str (session .id ),
112138 "message_id" : str (assistant_msg .id ),
113139 }
140+
141+ # Step 3: Audio Output
142+ if audio_output :
143+ # Convert text to audio and upload to S3
144+ audio_output_service = AudioOutput ()
145+ audio_s3_url = await audio_output_service .convert_text_into_audio (
146+ assistant_content = assistant_content ,
147+ voice_style = voice_style .value ,
148+ )
149+
150+ response_payload ["audio_output_url" ] = audio_s3_url
151+
# NOTE(review): `url` and `media_type` below are the USER upload's values,
# and both are None for a text-only request, yet this attachment belongs to
# the assistant's audio reply. Presumably the audio URL and an audio media
# type were intended - confirm against create_attachment's signature and
# the attachment table's nullability.
152+ # Save assistant audio as an attachment in DB
153+ await create_attachment (
154+ db = db ,
155+ session_id = session .id ,
156+ message_id = assistant_msg .id ,
157+ url = file_url ,
158+ media_type = media_type ,
159+ metadata_ = {"voice_style" : voice_style .value },
160+ audio_url = audio_s3_url ,
161+ )
162+
163+ # Add media file link if uploaded
164+ if file_url :
165+ response_payload ["uploaded_file_url" ] = file_url
166+
167+ return response_payload
0 commit comments