Provide in-line citations for file search (#485)

ekassos · web-flow · commit d37c1458d31e · 2024-09-12T13:33:53.000-04:00
Closes #31 by adding an in-line citation that mentions the name of the referenced file when File Search is used. See the discussion in #31 for why this is the best we can do right now. Because of the limited amount of information available, it would make no sense to direct users to footnotes at the end of the message.
diff --git a/pingpong/ai.py b/pingpong/ai.py
@@ -10,7 +10,6 @@
 from openai.types.beta.threads import ImageFile, MessageContentPartParam
 from openai.types.beta.threads.runs import ToolCallsStepDetails, CodeInterpreterToolCall
 from pingpong.schemas import CodeInterpreterMessage
-
 import pingpong.models as models
 from .config import config
 
@@ -118,9 +117,10 @@ async def get_ci_messages_from_step(
 
 
 class BufferedStreamHandler(openai.AsyncAssistantEventHandler):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, file_names: dict[str, str], *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.__buffer = io.BytesIO()
+        self.file_names = file_names
 
     def enqueue(self, data: Dict) -> None:
         self.__buffer.write(orjson.dumps(data))
@@ -150,10 +150,18 @@ async def on_message_created(self, message) -> None:
         )
 
     async def on_message_delta(self, delta, snapshot) -> None:
+        message_delta = delta.model_dump()
+        for content in message_delta["content"]:
+            if content["text"]["annotations"]:
+                for annotation in content["text"]["annotations"]:
+                    if annotation["type"] == "file_citation":
+                        annotation["file_citation"]["file_name"] = self.file_names.get(
+                            annotation["file_citation"]["file_id"], ""
+                        )
         self.enqueue(
             {
                 "type": "message_delta",
-                "delta": delta.model_dump(),
+                "delta": message_delta,
             }
         )
 
@@ -199,6 +207,7 @@ async def run_thread(
     thread_id: str,
     assistant_id: int,
     message: list[MessageContentPartParam],
+    file_names: dict[str, str] = {},
     metadata: Dict[str, str | int] | None = None,
 ):
     try:
@@ -209,8 +218,7 @@ async def run_thread(
                 content=message,
                 metadata=metadata,
             )
-
-        handler = BufferedStreamHandler()
+        handler = BufferedStreamHandler(file_names=file_names)
         async with cli.beta.threads.runs.stream(
             thread_id=thread_id,
             assistant_id=assistant_id,
diff --git a/pingpong/models.py b/pingpong/models.py
@@ -1,6 +1,7 @@
+import asyncio
 import json
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional
+from typing import AsyncGenerator, List, Optional, Union
 
 from sqlalchemy import Boolean, Column, DateTime, UniqueConstraint
 from sqlalchemy import Enum as SQLEnum
@@ -687,6 +688,16 @@ async def get_file_ids_by_id(
         for file in vector_store.files:
             yield file.file_id, file.id
 
+    @classmethod
+    async def get_file_names_ids_by_id(
+        cls, session: AsyncSession, id_: int
+    ) -> dict[str, str]:
+        stmt = select(VectorStore).where(VectorStore.id == int(id_))
+        vector_store = await session.scalar(stmt)
+        if not vector_store:
+            return {}
+        return {file.file_id: file.name for file in vector_store.files}
+
     @classmethod
     async def add_files(
         cls, session: AsyncSession, vector_store_id: int, file_ids: list[str]
@@ -1698,3 +1709,52 @@ async def add_image_files(
             )
         )
         await session.execute(stmt)
+
+    @classmethod
+    async def get_file_search_files(
+        cls, session: AsyncSession, thread_id: int
+    ) -> dict[str, str]:
+        stmt = (
+            select(Thread)
+            .outerjoin(Thread.assistant)
+            .options(
+                contains_eager(Thread.assistant).load_only(Assistant.vector_store_id)
+            )
+            .where(Thread.id == thread_id)
+        )
+        thread = await session.scalar(stmt)
+        if not thread:
+            return {}
+        return await cls.get_file_search_files_by_thread(session, thread)
+
+    @classmethod
+    async def get_file_search_files_assistant(
+        cls, session: AsyncSession, thread_id: int
+    ) -> tuple[Union["Assistant", None], dict[str, str]]:
+        stmt = (
+            select(Thread)
+            .options(joinedload(Thread.assistant))
+            .where(Thread.id == thread_id)
+        )
+        thread = await session.scalar(stmt)
+        if not thread:
+            return None, {}
+        return thread.assistant, await cls.get_file_search_files_by_thread(
+            session, thread
+        )
+
+    @classmethod
+    async def get_file_search_files_by_thread(
+        cls, session: AsyncSession, thread: "Thread"
+    ) -> dict[str, str]:
+        vector_store_ids: list[int] = list(
+            filter(None, [thread.assistant.vector_store_id, thread.vector_store_id])
+        )
+        if not vector_store_ids:
+            return {}
+        tasks = [
+            VectorStore.get_file_names_ids_by_id(session, vector_store_id)
+            for vector_store_id in vector_store_ids
+        ]
+        results = await asyncio.gather(*tasks)
+        return {k: v for result in results for k, v in result.items()}
diff --git a/pingpong/server.py b/pingpong/server.py
@@ -1419,13 +1419,18 @@ async def get_thread(
     class_id: str, thread_id: str, request: Request, openai_client: OpenAIClient
 ):
     thread = await models.Thread.get_by_id(request.state.db, int(thread_id))
-    messages, assistant, runs_result = await asyncio.gather(
+    messages, [assistant, file_names], runs_result = await asyncio.gather(
         openai_client.beta.threads.messages.list(
             thread.thread_id, limit=20, order="desc"
         ),
-        models.Assistant.get_by_id(request.state.db, thread.assistant_id),
+        models.Thread.get_file_search_files_assistant(request.state.db, thread.id),
         openai_client.beta.threads.runs.list(thread.thread_id, limit=1, order="desc"),
     )
+    if not assistant:
+        raise HTTPException(
+            status_code=404,
+            detail="Assistant not found",
+        )
     last_run = [r async for r in runs_result]
     current_user_ids = [
         request.state.session.user.id
@@ -1436,6 +1441,13 @@ async def get_thread(
         users = {str(u.id): u for u in thread.users}
 
     for message in messages.data:
+        for content in message.content:
+            if content.type and content.type == "text" and content.text.annotations:
+                for annotation in content.text.annotations:
+                    if annotation.type == "file_citation":
+                        annotation.file_citation.file_name = file_names.get(
+                            annotation.file_citation.file_id, ""
+                        )
         user_id = message.metadata.pop("user_id", None)
         if not user_id:
             continue
@@ -1527,11 +1539,19 @@ async def list_thread_messages(
     messages = await openai_client.beta.threads.messages.list(
         thread.thread_id, limit=limit, order="asc", before=before
     )
+    file_names = await models.Thread.get_file_search_files(request.state.db, thread.id)
 
     if messages.data:
         users = {u.id: u.created for u in thread.users}
 
     for message in messages.data:
+        for content in message.content:
+            if content.type == "text" and content.text.annotations:
+                for annotation in content.text.annotations:
+                    if annotation.type == "file_citation":
+                        annotation.file_citation.file_name = file_names.get(
+                            annotation.file_citation.file_id, ""
+                        )
         user_id = message.metadata.pop("user_id", None)
         if not user_id:
             continue
@@ -1831,12 +1851,13 @@ async def create_run(
 ):
     thread = await models.Thread.get_by_id(request.state.db, int(thread_id))
     asst = await models.Assistant.get_by_id(request.state.db, thread.assistant_id)
-
+    file_names = await models.Thread.get_file_search_files(request.state.db, thread.id)
     stream = run_thread(
         openai_client,
         thread_id=thread.thread_id,
         assistant_id=asst.assistant_id,
         message=[],
+        file_names=file_names,
     )
 
     return StreamingResponse(stream, media_type="text/event-stream")
@@ -1926,13 +1947,15 @@ async def send_message(
         thread=thread.thread_id,
     )
 
+    file_names = await models.Thread.get_file_search_files(request.state.db, thread.id)
     # Create a generator that will stream chunks to the client.
     stream = run_thread(
         openai_client,
         thread_id=thread.thread_id,
         assistant_id=asst.assistant_id,
         message=messageContent,
         metadata={"user_id": str(request.state.session.user.id)},
+        file_names=file_names,
     )
     return StreamingResponse(stream, media_type="text/event-stream")
 
diff --git a/web/pingpong/src/lib/api.ts b/web/pingpong/src/lib/api.ts
@@ -1307,6 +1307,7 @@ export type TextAnnotationFilePath = {
 
 export type TextAnnotationFileCitationFileCitation = {
   file_id: string;
+  file_name: string;
   quote: string;
 };
 
diff --git a/web/pingpong/src/lib/content.ts b/web/pingpong/src/lib/content.ts
@@ -18,6 +18,10 @@ export const parseTextContent = (text: Text, baseUrl: string = '') => {
         const { start_index, end_index, file_path } = annotation;
         const url = join(baseUrl, `/file/${file_path.file_id}`);
         replacements.push({ start: start_index, end: end_index, newValue: url });
+      } else if (annotation.type === 'file_citation') {
+        const { start_index, end_index, file_citation } = annotation;
+        const fileName = ` (${file_citation.file_name})`;
+        replacements.push({ start: start_index, end: end_index, newValue: fileName });
       }
     }
   }

Original file line number	Diff line number	Diff line change
`@@ -18,6 +18,10 @@ export const parseTextContent = (text: Text, baseUrl: string = '') => {`
`18`	`18`	`const { start_index, end_index, file_path } = annotation;`
`19`	`19`	``const url = join(baseUrl, `/file/${file_path.file_id}`);``
`20`	`20`	`replacements.push({ start: start_index, end: end_index, newValue: url });`
	`21`	`+ } else if (annotation.type === 'file_citation') {`
	`22`	`+ const { start_index, end_index, file_citation } = annotation;`
	`23`	``+ const fileName = ` (${file_citation.file_name})`;``
	`24`	`+ replacements.push({ start: start_index, end: end_index, newValue: fileName });`
`21`	`25`	`}`
`22`	`26`	`}`
`23`	`27`	`}`