Implemented multimodal chat endpoint with image, text, and audio as input and audio or text as output

abhishekbuild · abhishekbuild · commit 368a7e92a9c7 · 2025-10-05T18:08:47.000+05:30
diff --git a/app/api/v1/multimodal.py b/app/api/v1/multimodal.py
@@ -1,4 +1,5 @@
 import uuid
+from typing import Optional
 
 from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -8,9 +9,11 @@
 from app.crud.message import create_message
 from app.crud.session import create_chat_session, get_chat_session
 from app.db.session import get_async_session
+from app.models import VoiceStyle
 from app.models.attachment import MediaType
 from app.models.message import RoleEnum
 from app.services import (
+    AudioOutput,
     UploadToS3,
     extract_text_from_s3_image,
     generate_response,
@@ -20,18 +23,33 @@
 router = APIRouter(prefix="/multimodal", tags=["Multimodal"])
 
 
-@router.post("/media")
-async def upload_media(
-    file: UploadFile = File(...),
-    session_id: uuid.UUID | None = Form(None),
-    prompt: str | None = Form(None),
+@router.post("/chat")
+async def multimodal_chat(
+    file: Optional[UploadFile] = File(None, description="Optional image/audio file."),
+    session_id: Optional[uuid.UUID] = Form(None),
+    prompt: Optional[str] = Form(None, description="User text input or question."),
+    audio_output: bool = Form(False, description="Return response as audio if True."),
+    voice_style: VoiceStyle = Form(
+        VoiceStyle.alloy,
+        description="""
+        Choose the output voice style:\n
+        - alloy: Versatile and neutral-sounding voice.\n
+        - echo: Warm and resonant voice.\n
+        - fable: Clear and articulate voice.\n
+        - onyx: Deep and commanding voice.\n
+        - nova: Bright and energetic voice.\n
+        - shimmer: Smooth and calming voice.\n
+        """,
+    ),
     db: AsyncSession = Depends(get_async_session),
     current_user=Depends(get_current_user),
 ):
     """
-    Accepts image and audio
+    Handles multimodal chat:
+    - Accepts optional text (`prompt`) or media file (image/audio)
+    - Supports text or audio output response
+    - Returns assistant response and optional audio file URL
     """
-    # Accept images and audio formats
     SUPPORTED_TYPES = {
         "image": ["image/jpeg", "image/png", "image/webp"],
         "audio": [
@@ -43,71 +61,107 @@ async def upload_media(
             "audio/ogg",
         ],
     }
+
     all_types = SUPPORTED_TYPES["image"] + SUPPORTED_TYPES["audio"]
 
-    if file.content_type not in all_types:
-        raise HTTPException(status_code=400, detail="Unsupported file type")
+    # Check if both file and prompt are empty
+    if not file and not prompt:
+        raise HTTPException(status_code=400, detail="Either file or prompt is required")
 
-    # If no session provided, create one
+    # Session handling
     if not session_id:
         session = await create_chat_session(db, current_user.id, title="Media Session")
     else:
         session = await get_chat_session(db, session_id)
         if not session or session.user_id != current_user.id:
             raise HTTPException(status_code=403, detail="Invalid session")
 
-    # Read file
-    file_bytes = await file.read()
-
-    # Upload to S3
-    s3_obj = UploadToS3()
-    file_url = s3_obj.upload_file_to_s3(file_bytes, file.filename, file.content_type)
-
-    # Determine type and handle processing
-    if file.content_type in SUPPORTED_TYPES["image"]:
-        media_type = MediaType.image
-        text_content = extract_text_from_s3_image(file_url)  # OCR for images
-        content_summary = f"User uploaded an image: The content of image is: \n{text_content}. Prompt:\n {prompt}"
-
-    elif file.content_type in SUPPORTED_TYPES["audio"]:
-        media_type = MediaType.audio
-        # Transcribe the audio file using AWS Transcribe
-        job_name = "audio_transcribe"
-        text_content = transcribe_file(job_name=job_name, s3_uri=file_url)
-
-        content_summary = f"User uploaded an audio file: The transcribe of audio is: \n{text_content}. In whatever language the transcribe is convert it into english and then reply only in English, Unless user explicitly asks for the specific language. \nPrompt:\n {prompt}"
+    # Step 1: Determine content
+    content_summary = ""
+    file_url = None
+    media_type = None
+
+    if file:
+        if file.content_type not in all_types:
+            raise HTTPException(status_code=400, detail="Unsupported file type")
+
+        file_bytes = await file.read()
+        s3_obj = UploadToS3()
+        file_url = s3_obj.upload_file_to_s3(
+            file_bytes, file.filename, file.content_type
+        )
+
+        # Image Processing
+        if file.content_type in SUPPORTED_TYPES["image"]:
+            media_type = MediaType.image
+            ocr_text = extract_text_from_s3_image(file_url)
+            content_summary = (
+                f"User uploaded an image. Extracted text: {ocr_text}. Prompt: {prompt}"
+            )
+
+        # Audio Processing
+        elif file.content_type in SUPPORTED_TYPES["audio"]:
+            media_type = MediaType.audio
+            text_content = transcribe_file(job_name="audio_transcribe", s3_uri=file_url)
+            content_summary = (
+                f"User uploaded an audio file. Transcription: {text_content}. "
+                f"Convert it to English unless explicitly asked otherwise. Prompt: {prompt}"
+            )
+
+        # Save attachment
+        user_msg = await create_message(
+            db, session.id, RoleEnum.user, prompt or f"Uploaded a {media_type.value}"
+        )
+        await create_attachment(
+            db,
+            session.id,
+            user_msg.id,
+            file_url,
+            media_type,
+            {"filename": file.filename},
+        )
     else:
-        raise HTTPException(status_code=400, detail="Unsupported media type")
-
-    # Save user message with attachment
-    user_msg = await create_message(
-        db, session.id, RoleEnum.user, prompt or f"Uploaded a {media_type.value}"
-    )
-    await create_attachment(
-        db,
-        session.id,
-        user_msg.id,
-        file_url,
-        media_type,
-        {"filename": file.filename},
-    )
-
-    history = [
-        {
-            "role": "user",
-            "content": content_summary,
-        }
-    ]
+        # Text-only chat
+        content_summary = f"User says: {prompt}"
 
+    # Step 2: Generate Assistant Response
+    history = [{"role": "user", "content": content_summary}]
     assistant_content = await generate_response(history)
 
     assistant_msg = await create_message(
         db, session.id, RoleEnum.assistant, assistant_content
     )
 
-    return {
+    response_payload = {
         "assistant_message": assistant_msg.content,
-        "file_url": file_url,
         "session_id": str(session.id),
         "message_id": str(assistant_msg.id),
     }
+
+    # Step 3: Audio Output
+    if audio_output:
+        # Convert text to audio and upload to S3
+        audio_output_service = AudioOutput()
+        audio_s3_url = await audio_output_service.convert_text_into_audio(
+            assistant_content=assistant_content,
+            voice_style=voice_style.value,
+        )
+
+        response_payload["audio_output_url"] = audio_s3_url
+
+        # Save assistant audio as an attachment in DB
+        await create_attachment(
+            db=db,
+            session_id=session.id,
+            message_id=assistant_msg.id,
+            url=file_url,
+            media_type=media_type,
+            metadata_={"voice_style": voice_style.value},
+            audio_url=audio_s3_url,
+        )
+
+    # Add media file link if uploaded
+    if file_url:
+        response_payload["uploaded_file_url"] = file_url
+
+    return response_payload
diff --git a/app/crud/attachments.py b/app/crud/attachments.py
@@ -1,9 +1,25 @@
+from app.models.attachment import Attachment
 from sqlalchemy.ext.asyncio import AsyncSession
-from app.models.attachment import Attachment, MediaType
 
-async def create_attachment(db: AsyncSession, session_id, message_id, url: str, media_type: MediaType, metadata: dict = None) -> Attachment:
-    attach = Attachment(session_id=session_id, message_id=message_id, url=url, media_type=media_type, metadata=metadata)
-    db.add(attach)
+
+async def create_attachment(
+    db: AsyncSession,
+    session_id,
+    message_id,
+    url,
+    media_type,
+    metadata_=None,
+    audio_url=None,
+):
+    attachment = Attachment(
+        session_id=session_id,
+        message_id=message_id,
+        url=url,
+        media_type=media_type,
+        metadata_=metadata_,
+        audio_url=audio_url,
+    )
+    db.add(attachment)
     await db.commit()
-    await db.refresh(attach)
-    return attach
+    await db.refresh(attachment)
+    return attachment
diff --git a/app/models/__init__.py b/app/models/__init__.py
@@ -1,4 +1,5 @@
 from .chat_session import ChatSession
 from .message import Message, RoleEnum
 from .user import User
-from .attachment import Attachment
+from .attachment import Attachment
+from .voice_styles import VoiceStyle
diff --git a/app/models/attachment.py b/app/models/attachment.py
@@ -23,15 +23,19 @@ class Attachment(Base):
         nullable=True,
     )
     message_id = Column(
-        UUID(as_uuid=True), ForeignKey("messages.id", ondelete="CASCADE"), nullable=True
+        UUID(as_uuid=True),
+        ForeignKey("messages.id", ondelete="CASCADE"),
+        nullable=True,
     )
 
-    url = Column(String, nullable=False)
-    media_type = Column(Enum(MediaType), nullable=False)
+    url = Column(String, nullable=True)
+    media_type = Column(Enum(MediaType), nullable=True)
     metadata_ = Column(JSONB, nullable=True)
 
+    # To store generated assistant audio responses
+    audio_url = Column(String, nullable=True)
+
     created_at = Column(DateTime, default=datetime.datetime.utcnow)
 
     session = relationship("ChatSession", back_populates="attachments")
     message = relationship("Message", back_populates="attachments")
-
diff --git a/app/models/voice_styles.py b/app/models/voice_styles.py
@@ -0,0 +1,10 @@
+from enum import Enum
+
+
+class VoiceStyle(str, Enum):
+    alloy = "alloy"
+    verse = "verse"
+    fable = "fable"
+    onyx = "onyx"
+    nova = "nova"
+    shimmer = "shimmer"
diff --git a/app/services/__init__.py b/app/services/__init__.py
@@ -1,4 +1,5 @@
 from .llm_client import generate_response
 from .textract import extract_text_from_s3_image
 from .s3_storage import UploadToS3
-from .transcribe import transcribe_file
+from .transcribe import transcribe_file
+from .audio_output import AudioOutput
diff --git a/app/services/audio_output.py b/app/services/audio_output.py
@@ -0,0 +1,31 @@
+from openai import AsyncOpenAI
+import tempfile
+from app.core.config import settings
+import uuid
+from app.services import UploadToS3
+
+
+class AudioOutput:
+    def __init__(self):
+        self.client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY)
+        self.s3_obj = UploadToS3()
+
+    async def convert_text_into_audio(self, voice_style: str, assistant_content: str):
+        """
+        Converts text into audio using OpenAI's TTS model and uploads it to S3.
+        """
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
+            audio_resp = await self.client.audio.speech.create(
+                model="gpt-4o-mini-tts",
+                voice=voice_style,
+                input=assistant_content,
+            )
+        audio_resp.stream_to_file(temp_audio.name)
+
+        # Upload generated audio to S3
+        with open(temp_audio.name, "rb") as audio_file:
+            audio_url = self.s3_obj.upload_file_to_s3(
+                audio_file.read(), f"{uuid.uuid4()}.mp3", "audio/mpeg"
+            )
+
+        return audio_url
diff --git a/migrations/versions/0adeb6494274_make_media_type_nullable.py b/migrations/versions/0adeb6494274_make_media_type_nullable.py
@@ -0,0 +1,36 @@
+"""Make media_type nullable
+
+Revision ID: 0adeb6494274
+Revises: f8e5c60d6d7c
+Create Date: 2025-10-05 17:50:45.936151
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision: str = '0adeb6494274'
+down_revision: Union[str, Sequence[str], None] = 'f8e5c60d6d7c'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.alter_column('attachments', 'media_type',
+               existing_type=postgresql.ENUM('image', 'audio', name='mediatype'),
+               nullable=True)
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.alter_column('attachments', 'media_type',
+               existing_type=postgresql.ENUM('image', 'audio', name='mediatype'),
+               nullable=False)
+    # ### end Alembic commands ###
diff --git a/migrations/versions/f8e5c60d6d7c_add_audio_url_column_to_attachments.py b/migrations/versions/f8e5c60d6d7c_add_audio_url_column_to_attachments.py
diff --git a/migrations/versions/fa1dbf5dd801_make_media_type_nullable.py b/migrations/versions/fa1dbf5dd801_make_media_type_nullable.py