Skip to content

Commit 97d3321

Browse files
committed
completions
1 parent 5044e1c commit 97d3321

1 file changed

Lines changed: 72 additions & 0 deletions

File tree

src/kvboost/server/app.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,15 @@
5454
from .engine_worker import EngineWorker
5555

5656
log = logging.getLogger(__name__)
57+
io_log = logging.getLogger("kvboost.server.io")
58+
59+
60+
def _truncate(text: str, limit: int = 500) -> str:
61+
if text is None:
62+
return ""
63+
if len(text) <= limit:
64+
return text
65+
return f"{text[:limit]}…<+{len(text) - limit} chars>"
5766

5867

5968
def build_app(worker: EngineWorker, model_name: Optional[str] = None) -> FastAPI:
@@ -80,6 +89,31 @@ def build_app(worker: EngineWorker, model_name: Optional[str] = None) -> FastAPI
8089
allow_headers=["*"],
8190
)
8291

92+
# ── Request/response access log ───────────────────────────────────────────
93+
94+
@app.middleware("http")
95+
async def _log_io(request: Request, call_next):
96+
req_id = uuid.uuid4().hex[:8]
97+
start = time.perf_counter()
98+
client = f"{request.client.host}:{request.client.port}" if request.client else "-"
99+
io_log.info(
100+
"REQ id=%s %s %s client=%s",
101+
req_id, request.method, request.url.path, client,
102+
)
103+
try:
104+
response = await call_next(request)
105+
except Exception:
106+
elapsed_ms = (time.perf_counter() - start) * 1000
107+
io_log.exception("ERR id=%s %s %s elapsed=%.1fms",
108+
req_id, request.method, request.url.path, elapsed_ms)
109+
raise
110+
elapsed_ms = (time.perf_counter() - start) * 1000
111+
io_log.info(
112+
"RES id=%s %s %s status=%d elapsed=%.1fms",
113+
req_id, request.method, request.url.path, response.status_code, elapsed_ms,
114+
)
115+
return response
116+
83117
# ── Startup / shutdown ────────────────────────────────────────────────────
84118

85119
@app.on_event("startup")
@@ -117,7 +151,9 @@ async def warm(request: Request):
117151
text = body.get("text", "")
118152
if not text:
119153
raise HTTPException(status_code=400, detail="Field 'text' is required.")
154+
io_log.info("WARM in: chars=%d text=%r", len(text), _truncate(text))
120155
await worker.warm(text)
156+
io_log.info("WARM out: chars=%d", len(text))
121157
return {"status": "warmed", "chars": len(text)}
122158

123159
# ── /v1/completions ───────────────────────────────────────────────────────
@@ -126,6 +162,13 @@ async def warm(request: Request):
126162
async def completions(req: CompletionRequest):
127163
_validate_model(req.model, _model_name)
128164

165+
io_log.info(
166+
"COMPLETIONS in: model=%s n_prompts=%d max_tokens=%d temp=%s stream=%s",
167+
req.model, len(req.prompts), req.max_tokens, req.temperature, req.stream,
168+
)
169+
for i, p in enumerate(req.prompts):
170+
io_log.info(" prompt[%d]=%r", i, _truncate(p))
171+
129172
if req.stream:
130173
return StreamingResponse(
131174
_stream_completions(req, worker, _model_name),
@@ -137,6 +180,13 @@ async def completions(req: CompletionRequest):
137180
prompt_tokens = sum(len(worker._tokenize(p)) for p in req.prompts)
138181
completion_tokens = sum(len(worker._tokenize(r.output_text)) for r in results)
139182

183+
for i, r in enumerate(results):
184+
io_log.info("COMPLETIONS out[%d]=%r", i, _truncate(r.output_text))
185+
io_log.info(
186+
"COMPLETIONS done: prompt_tokens=%d completion_tokens=%d total=%d",
187+
prompt_tokens, completion_tokens, prompt_tokens + completion_tokens,
188+
)
189+
140190
choices = [
141191
CompletionChoice(text=r.output_text, index=i)
142192
for i, r in enumerate(results)
@@ -159,6 +209,14 @@ async def chat_completions(req: ChatCompletionRequest):
159209

160210
prompt = req.to_prompt(worker.engine.tokenizer)
161211

212+
io_log.info(
213+
"CHAT in: model=%s n_messages=%d max_tokens=%d temp=%s stream=%s",
214+
req.model, len(req.messages), req.max_tokens, req.temperature, req.stream,
215+
)
216+
for i, m in enumerate(req.messages):
217+
io_log.info(" msg[%d] role=%s content=%r", i, m.role, _truncate(m.content))
218+
io_log.debug("CHAT prompt=%r", _truncate(prompt, 1000))
219+
162220
if req.stream:
163221
return StreamingResponse(
164222
_stream_chat(req, prompt, worker, _model_name),
@@ -172,6 +230,13 @@ async def chat_completions(req: ChatCompletionRequest):
172230
prompt_tokens = len(worker._tokenize(prompt))
173231
completion_tokens = len(worker._tokenize(result.output_text))
174232

233+
io_log.info("CHAT out=%r", _truncate(result.output_text))
234+
io_log.info(
235+
"CHAT done: prompt_tokens=%d completion_tokens=%d total=%d generated=%d",
236+
prompt_tokens, completion_tokens, prompt_tokens + completion_tokens,
237+
result.generated_tokens,
238+
)
239+
175240
return ChatCompletionResponse(
176241
model=_model_name,
177242
choices=[
@@ -289,6 +354,8 @@ async def _stream_completions(
289354
"logprobs": None,
290355
}],
291356
)
357+
io_log.info("COMPLETIONS stream out[%d] id=%s text=%r",
358+
i, request_id, _truncate(result.output_text))
292359
yield f"data: {chunk.model_dump_json()}\n\n"
293360

294361
yield "data: [DONE]\n\n"
@@ -386,5 +453,10 @@ async def _stream_chat(
386453
"finish_reason": finish_reason,
387454
}],
388455
)
456+
generated = final_result.generated_tokens if final_result is not None else len(all_tokens)
457+
io_log.info(
458+
"CHAT stream out id=%s finish=%s generated_tokens=%d text=%r",
459+
request_id, finish_reason, generated, _truncate(prev_text),
460+
)
389461
yield f"data: {stop_chunk.model_dump_json()}\n\n"
390462
yield "data: [DONE]\n\n"

0 commit comments

Comments
 (0)