5454from .engine_worker import EngineWorker
5555
5656log = logging .getLogger (__name__ )
57+ io_log = logging .getLogger ("kvboost.server.io" )
58+
59+
60+ def _truncate (text : str , limit : int = 500 ) -> str :
61+ if text is None :
62+ return ""
63+ if len (text ) <= limit :
64+ return text
65+ return f"{ text [:limit ]} …<+{ len (text ) - limit } chars>"
5766
5867
5968def build_app (worker : EngineWorker , model_name : Optional [str ] = None ) -> FastAPI :
@@ -80,6 +89,31 @@ def build_app(worker: EngineWorker, model_name: Optional[str] = None) -> FastAPI
8089 allow_headers = ["*" ],
8190 )
8291
92+ # ── Request/response access log ───────────────────────────────────────────
93+
94+ @app .middleware ("http" )
95+ async def _log_io (request : Request , call_next ):
96+ req_id = uuid .uuid4 ().hex [:8 ]
97+ start = time .perf_counter ()
98+ client = f"{ request .client .host } :{ request .client .port } " if request .client else "-"
99+ io_log .info (
100+ "REQ id=%s %s %s client=%s" ,
101+ req_id , request .method , request .url .path , client ,
102+ )
103+ try :
104+ response = await call_next (request )
105+ except Exception :
106+ elapsed_ms = (time .perf_counter () - start ) * 1000
107+ io_log .exception ("ERR id=%s %s %s elapsed=%.1fms" ,
108+ req_id , request .method , request .url .path , elapsed_ms )
109+ raise
110+ elapsed_ms = (time .perf_counter () - start ) * 1000
111+ io_log .info (
112+ "RES id=%s %s %s status=%d elapsed=%.1fms" ,
113+ req_id , request .method , request .url .path , response .status_code , elapsed_ms ,
114+ )
115+ return response
116+
83117 # ── Startup / shutdown ────────────────────────────────────────────────────
84118
85119 @app .on_event ("startup" )
@@ -117,7 +151,9 @@ async def warm(request: Request):
117151 text = body .get ("text" , "" )
118152 if not text :
119153 raise HTTPException (status_code = 400 , detail = "Field 'text' is required." )
154+ io_log .info ("WARM in: chars=%d text=%r" , len (text ), _truncate (text ))
120155 await worker .warm (text )
156+ io_log .info ("WARM out: chars=%d" , len (text ))
121157 return {"status" : "warmed" , "chars" : len (text )}
122158
123159 # ── /v1/completions ───────────────────────────────────────────────────────
@@ -126,6 +162,13 @@ async def warm(request: Request):
126162 async def completions (req : CompletionRequest ):
127163 _validate_model (req .model , _model_name )
128164
165+ io_log .info (
166+ "COMPLETIONS in: model=%s n_prompts=%d max_tokens=%d temp=%s stream=%s" ,
167+ req .model , len (req .prompts ), req .max_tokens , req .temperature , req .stream ,
168+ )
169+ for i , p in enumerate (req .prompts ):
170+ io_log .info (" prompt[%d]=%r" , i , _truncate (p ))
171+
129172 if req .stream :
130173 return StreamingResponse (
131174 _stream_completions (req , worker , _model_name ),
@@ -137,6 +180,13 @@ async def completions(req: CompletionRequest):
137180 prompt_tokens = sum (len (worker ._tokenize (p )) for p in req .prompts )
138181 completion_tokens = sum (len (worker ._tokenize (r .output_text )) for r in results )
139182
183+ for i , r in enumerate (results ):
184+ io_log .info ("COMPLETIONS out[%d]=%r" , i , _truncate (r .output_text ))
185+ io_log .info (
186+ "COMPLETIONS done: prompt_tokens=%d completion_tokens=%d total=%d" ,
187+ prompt_tokens , completion_tokens , prompt_tokens + completion_tokens ,
188+ )
189+
140190 choices = [
141191 CompletionChoice (text = r .output_text , index = i )
142192 for i , r in enumerate (results )
@@ -159,6 +209,14 @@ async def chat_completions(req: ChatCompletionRequest):
159209
160210 prompt = req .to_prompt (worker .engine .tokenizer )
161211
212+ io_log .info (
213+ "CHAT in: model=%s n_messages=%d max_tokens=%d temp=%s stream=%s" ,
214+ req .model , len (req .messages ), req .max_tokens , req .temperature , req .stream ,
215+ )
216+ for i , m in enumerate (req .messages ):
217+ io_log .info (" msg[%d] role=%s content=%r" , i , m .role , _truncate (m .content ))
218+ io_log .debug ("CHAT prompt=%r" , _truncate (prompt , 1000 ))
219+
162220 if req .stream :
163221 return StreamingResponse (
164222 _stream_chat (req , prompt , worker , _model_name ),
@@ -172,6 +230,13 @@ async def chat_completions(req: ChatCompletionRequest):
172230 prompt_tokens = len (worker ._tokenize (prompt ))
173231 completion_tokens = len (worker ._tokenize (result .output_text ))
174232
233+ io_log .info ("CHAT out=%r" , _truncate (result .output_text ))
234+ io_log .info (
235+ "CHAT done: prompt_tokens=%d completion_tokens=%d total=%d generated=%d" ,
236+ prompt_tokens , completion_tokens , prompt_tokens + completion_tokens ,
237+ result .generated_tokens ,
238+ )
239+
175240 return ChatCompletionResponse (
176241 model = _model_name ,
177242 choices = [
@@ -289,6 +354,8 @@ async def _stream_completions(
289354 "logprobs" : None ,
290355 }],
291356 )
357+ io_log .info ("COMPLETIONS stream out[%d] id=%s text=%r" ,
358+ i , request_id , _truncate (result .output_text ))
292359 yield f"data: { chunk .model_dump_json ()} \n \n "
293360
294361 yield "data: [DONE]\n \n "
@@ -386,5 +453,10 @@ async def _stream_chat(
386453 "finish_reason" : finish_reason ,
387454 }],
388455 )
456+ generated = final_result .generated_tokens if final_result is not None else len (all_tokens )
457+ io_log .info (
458+ "CHAT stream out id=%s finish=%s generated_tokens=%d text=%r" ,
459+ request_id , finish_reason , generated , _truncate (prev_text ),
460+ )
389461 yield f"data: { stop_chunk .model_dump_json ()} \n \n "
390462 yield "data: [DONE]\n \n "
0 commit comments