1717router = APIRouter (prefix = "/chat" , tags = ["Chat" ])
1818
1919
20+ from app .agent .core import Agent
21+
2022@router .post ("" , response_model = ChatResponse )
2123@router .post ("/" , response_model = ChatResponse )
2224@limiter .limit (get_chat_limit )
2325async def chat (request : Request , chat_request : ChatRequest ) -> ChatResponse :
2426 """
25- Send a message and get an AI response.
26- Uses RAG to retrieve relevant context from the knowledge base.
27-
28- Args:
29- request: FastAPI Request object (for rate limiting)
30- chat_request: ChatRequest with message and optional history
31-
32- Returns:
33- ChatResponse with AI response and sources
27+ Send a message and get an AI response using Agentic RAG.
3428 """
3529 logger .info (f"Chat request: { chat_request .message [:100 ]} ..." )
3630 settings = get_settings ()
37- obs = get_observability_service ()
38-
31+
3932 try :
40- rag = get_rag_service ()
41-
4233 # Convert history to dict format
4334 history = None
4435 if chat_request .history :
4536 history = [
4637 {"role" : msg .role , "content" : msg .content }
4738 for msg in chat_request .history
4839 ]
49-
50- # Trace the LLM call with Langfuse
51- with obs .trace_llm_call (
52- name = "chat" ,
53- input_text = chat_request .message ,
54- model = settings .ollama_model ,
55- metadata = {"endpoint" : "/chat" , "has_history" : bool (history )}
56- ) as ctx :
57- result = await rag .query (
58- question = chat_request .message ,
59- history = history
60- )
61- ctx .set_output (result ["response" ])
62-
63- logger .info (f"Chat response generated, sources: { result .get ('sources' , [])} " )
64-
40+
41+ agent = Agent ()
42+ response = await agent .run (chat_request .message , history = history )
43+
44+ # Parse response to extract sources if possible, or Agent should return them.
45+ # Currently Agent returns string.
46+ # rag.txt says "cite sources using metadata".
47+ # So the sources should be IN the text.
48+ # But ChatResponse model expects `sources` list.
49+ # We can extract them or just leave empty for now, as the text contains citations.
50+
6551 return ChatResponse (
66- response = result [ " response" ] ,
67- sources = result . get ( "sources" , [])
52+ response = response ,
53+ sources = [] # Sources are embedded in the text citation
6854 )
6955
7056 except Exception as e :
7157 logger .error (f"Chat error: { e } " )
72- obs .log_error (trace_id = None , error = e , context = {"endpoint" : "/chat" })
7358 raise HTTPException (
7459 status_code = 500 ,
7560 detail = "Failed to generate response"
@@ -80,25 +65,14 @@ async def chat(request: Request, chat_request: ChatRequest) -> ChatResponse:
8065@limiter .limit (get_chat_stream_limit )
8166async def chat_stream (request : Request , chat_request : ChatRequest ):
8267 """
83- Send a message and get a streaming AI response.
84- Uses Server-Sent Events (SSE) for real-time streaming.
85-
86- Args:
87- request: FastAPI Request object (for rate limiting)
88- chat_request: ChatRequest with message and optional history
89-
90- Returns:
91- StreamingResponse with SSE events
68+ Send a message and get a streaming AI response using Agentic RAG.
69+ Note: Currently sends the full response as a single chunk after processing.
9270 """
9371 logger .info (f"Streaming chat request: { chat_request .message [:100 ]} ..." )
9472 settings = get_settings ()
95- obs = get_observability_service ()
9673
9774 async def generate ():
98- full_response = []
9975 try :
100- rag = get_rag_service ()
101-
10276 # Convert history to dict format
10377 history = None
10478 if chat_request .history :
@@ -107,34 +81,24 @@ async def generate():
10781 for msg in chat_request .history
10882 ]
10983
110- # Start trace for streaming
111- with obs .trace_llm_call (
112- name = "chat_stream" ,
113- input_text = chat_request .message ,
114- model = settings .ollama_model ,
115- metadata = {"endpoint" : "/chat/stream" , "has_history" : bool (history )}
116- ) as ctx :
117- async for chunk_data in rag .query_stream (
118- question = chat_request .message ,
119- history = history
120- ):
121- # Collect response for tracing
122- if chunk_data .get ("chunk" ):
123- full_response .append (chunk_data ["chunk" ])
124-
125- # Format as SSE event
126- event_data = json .dumps (chunk_data )
127- yield f"data: { event_data } \n \n "
128-
129- # Set full response for trace
130- ctx .set_output ("" .join (full_response ))
84+ agent = Agent ()
85+ # Agent processing (plan -> execute -> merge -> generate)
86+ response = await agent .run (chat_request .message , history = history )
87+
88+ # Yield the full response as a single chunk
89+ # In the future, we could stream tokens from Ollama in the final step of Agent
90+ chunk_data = {
91+ "chunk" : response ,
92+ "done" : False ,
93+ "sources" : [] # Sources embedded in text
94+ }
95+ yield f"data: { json .dumps (chunk_data )} \n \n "
13196
13297 # Send done event
13398 yield "data: [DONE]\n \n "
13499
135100 except Exception as e :
136101 logger .error (f"Streaming error: { e } " )
137- obs .log_error (trace_id = None , error = e , context = {"endpoint" : "/chat/stream" })
138102 error_data = json .dumps ({"error" : str (e )})
139103 yield f"data: { error_data } \n \n "
140104
@@ -144,7 +108,7 @@ async def generate():
144108 headers = {
145109 "Cache-Control" : "no-cache" ,
146110 "Connection" : "keep-alive" ,
147- "X-Accel-Buffering" : "no" , # Disable nginx buffering
111+ "X-Accel-Buffering" : "no" ,
148112 }
149113 )
150114
0 commit comments