-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathllm_client.py
More file actions
61 lines (53 loc) · 1.92 KB
/
llm_client.py
File metadata and controls
61 lines (53 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import json
import os
from typing import Any, AsyncGenerator, Dict, List, Optional
import httpx
# Timeout in seconds for upstream HTTP calls; read from the REQUEST_TIMEOUT
# environment variable, falling back to 20 when it is not set.
REQUEST_TIMEOUT = float(os.environ.get("REQUEST_TIMEOUT", "20"))
async def stream_upstream_sse(
    upstream_type: str,
    upstream_url: str,
    model: str,
    messages: List[Dict[str, str]],
    temperature: Optional[float] = 0.7,
    top_p: Optional[float] = 0.95,
    top_k: Optional[int] = 60,
    presence_penalty: Optional[float] = 0.0,
    frequency_penalty: Optional[float] = 0.0,
) -> AsyncGenerator[str, None]:
    """Relay a streaming chat completion from an upstream server as SSE lines.

    The request is sent to llama.cpp or LM Studio, which are expected to
    expose an OpenAI-compatible ``/v1/chat/completions`` endpoint with
    ``stream=true``. Their SSE ``data:`` lines are passed through unchanged.

    Args:
        upstream_type: Label of the upstream backend. It is accepted for the
            caller's benefit but is not read anywhere in this function.
        upstream_url: Base URL of the upstream server. A single trailing
            slash is removed before the endpoint path is appended.
        model: Value sent as the ``model`` field of the request body.
        messages: Chat messages sent as the ``messages`` field.
        temperature: Sent as ``temperature``.
        top_p: Sent as ``top_p``.
        top_k: Sent as ``top_k``.
        presence_penalty: Sent as ``presence_penalty``.
        frequency_penalty: Sent as ``frequency_penalty``.

    Yields:
        Each non-empty upstream line that starts with ``data:`` (a JSON chunk
        or ``[DONE]``), followed by a blank line (``"\\n\\n"``). All other
        lines are dropped.

    Raises:
        httpx.HTTPStatusError: Raised by ``raise_for_status()`` when the
            upstream answers with an error status, before anything is yielded.
    """
    # Strip at most one trailing slash so the joined path has no "//".
    root = upstream_url[:-1] if upstream_url.endswith("/") else upstream_url
    endpoint = f"{root}/v1/chat/completions"

    # Sampling values are forwarded exactly as given, including None.
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "presence_penalty": presence_penalty,
        "frequency_penalty": frequency_penalty,
        # NOTE(review): -1 presumably means "no token limit" on these
        # backends -- confirm against the upstream server's documentation.
        "max_tokens": -1,
        "stream": True,
    }

    sse_headers = {
        "Accept": "text/event-stream",
        "Cache-Control": "no-store",
        "Connection": "keep-alive",
        # (optional) "Authorization": f"Bearer {token}"
    }

    # REQUEST_TIMEOUT is used both as the default timeout and, explicitly,
    # as the connect timeout.
    limits = httpx.Timeout(REQUEST_TIMEOUT, connect=REQUEST_TIMEOUT)

    async with httpx.AsyncClient(
        timeout=limits, headers=sse_headers
    ) as session, session.stream("POST", endpoint, json=payload) as response:
        response.raise_for_status()
        async for raw_line in response.aiter_lines():
            # Empty lines and non-"data:" lines are skipped. An empty string
            # never starts with "data:", so one test covers both cases.
            if raw_line.startswith("data:"):
                # Forward the exact upstream line, including JSON or [DONE].
                yield f"{raw_line}\n\n"