forked from Ashfaqbs/TinyLLM-usecases
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathserver.py
More file actions
157 lines (118 loc) · 4.43 KB
/
Copy pathserver.py
File metadata and controls
157 lines (118 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
FastAPI server for Qwen3 4B tool calling with telemetry.
Tools imported from tools.py. Every response includes LLM telemetry.
"""
import time
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from langchain_ollama import ChatOllama
from tools import ALL_TOOLS, TOOL_MAP
# Ollama model tag shared by the chat client and reported in responses.
MODEL = "qwen3:4b"

# Base chat client; temperature is pinned to 0
# (presumably to keep tool selection repeatable - confirm).
llm = ChatOllama(
    temperature=0,
    model=MODEL,
)

# Same client with every tool exported by tools.py bound to it.
llm_with_tools = llm.bind_tools(ALL_TOOLS)
# --- Request/Response Models ---
# Request body accepted by the question endpoints: a single required
# free-text question.
class QueryRequest(BaseModel):
    question: str = Field(
        ...,  # no default: the field is required
        description="Natural language question for Qwen3",
    )
# Record of one tool invocation requested by the model.
class ToolCallResult(BaseModel):
    # Name of the tool the model asked for.
    tool_name: str
    # Arguments the model supplied for that tool.
    tool_args: dict
    # Tool output, stored as text.
    result: str
# Per-request LLM metrics attached to every query response.
# Fields ending in `_ms` hold millisecond values.
class Telemetry(BaseModel):
    # --- identification ---
    model: str
    done_reason: str

    # --- token counts ---
    prompt_tokens: int
    eval_tokens: int
    total_tokens: int  # prompt_tokens + eval_tokens

    # --- timings reported with the response ---
    total_duration_ms: float
    load_duration_ms: float
    prompt_eval_ms: float
    eval_duration_ms: float

    # --- derived / locally measured ---
    tokens_per_sec: float
    wall_time_ms: float  # elapsed time measured by the caller
# Response body returned by the question endpoints.
class QueryResponse(BaseModel):
    # The question exactly as it was received.
    question: str
    # One entry per tool the model invoked (empty when none were called).
    tool_calls: list[ToolCallResult]
    # Text content returned by the model.
    llm_response: str
    # Metrics for the model call.
    telemetry: Telemetry
# --- FastAPI App ---
# Application metadata, kept together so it is easy to scan and edit.
_APP_SETTINGS = {
    "title": "Qwen3 4B Tool Calling API",
    "description": "Tool calling + conversation powered by Qwen3 4B via Ollama, with telemetry",
    "version": "1.0.0",
}

# The ASGI application object that all routes below are registered on.
app = FastAPI(**_APP_SETTINGS)
def extract_telemetry(response, wall_time_ms: float) -> Telemetry:
    """Extract telemetry from LangChain response metadata.

    Args:
        response: Object returned by the LLM call. Its ``response_metadata``
            attribute is read when present; a missing or empty attribute
            produces a record filled with defaults.
        wall_time_ms: Caller-measured elapsed time, passed through unchanged.

    Returns:
        A ``Telemetry`` instance. Raw duration values are divided by
        1_000_000 to fill the ``*_ms`` fields (i.e. they are treated as
        nanoseconds). ``tokens_per_sec`` is 0 when no positive eval duration
        was reported.
    """
    meta = getattr(response, "response_metadata", None) or {}

    def _field(key, default=0):
        # dict.get() only applies its default when the key is absent. A key
        # that is present with a None value would otherwise flow into the
        # comparisons/arithmetic below (TypeError) or into the str fields of
        # Telemetry (validation error), so None is treated like "missing".
        value = meta.get(key)
        return default if value is None else value

    prompt_tokens = _field("prompt_eval_count")
    eval_tokens = _field("eval_count")

    total_ns = _field("total_duration")
    load_ns = _field("load_duration")
    prompt_eval_ns = _field("prompt_eval_duration")
    eval_ns = _field("eval_duration")

    # Guard against division by zero when no eval duration is available.
    tokens_per_sec = (eval_tokens / (eval_ns / 1_000_000_000)) if eval_ns > 0 else 0

    return Telemetry(
        model=_field("model", MODEL),
        done_reason=_field("done_reason", "unknown"),
        prompt_tokens=prompt_tokens,
        eval_tokens=eval_tokens,
        total_tokens=prompt_tokens + eval_tokens,
        total_duration_ms=round(total_ns / 1_000_000, 1),
        load_duration_ms=round(load_ns / 1_000_000, 1),
        prompt_eval_ms=round(prompt_eval_ns / 1_000_000, 1),
        eval_duration_ms=round(eval_ns / 1_000_000, 1),
        tokens_per_sec=round(tokens_per_sec, 1),
        wall_time_ms=wall_time_ms,
    )
def run_tool_call(question: str) -> QueryResponse:
    """Ask Qwen3 the question, run any tools it requests, and package the outcome.

    Args:
        question: Text forwarded to the tool-bound model as-is.

    Returns:
        A ``QueryResponse`` holding the question, one ``ToolCallResult`` per
        executed tool call, the model's text (with anything up to the last
        ``</think>`` marker removed), and telemetry for the model call.

    Raises:
        HTTPException: With status 500 when the model requests a tool name
            that is not present in ``TOOL_MAP``.
    """
    # Only the model round-trip is timed; tool execution below is not included.
    started = time.perf_counter()
    response = llm_with_tools.invoke(question)
    wall_ms = round((time.perf_counter() - started) * 1000, 1)

    executed = []
    for call in response.tool_calls or []:
        name = call["name"]
        tool = TOOL_MAP.get(name)
        if tool is None:
            raise HTTPException(status_code=500, detail=f"Unknown tool: {name}")

        args = call["args"]
        output = tool.invoke(args)
        executed.append(
            ToolCallResult(tool_name=name, tool_args=args, result=str(output))
        )

    # Keep only the text after the final closing think tag, if there is one.
    answer = response.content.strip() if response.content else ""
    _, marker, tail = answer.rpartition("</think>")
    if marker:
        answer = tail.strip()

    telemetry = extract_telemetry(response, wall_ms)
    return QueryResponse(
        question=question,
        tool_calls=executed,
        llm_response=answer,
        telemetry=telemetry,
    )
@app.post("/ask", response_model=QueryResponse)
def ask(request: QueryRequest):
    """Generic endpoint - Qwen3 auto-routes to the right tool."""
    # Hand the raw question to the shared pipeline and return its result.
    outcome = run_tool_call(request.question)
    return outcome
@app.post("/weather", response_model=QueryResponse)
def weather(request: QueryRequest):
    """Ask a weather-related question."""
    # The question goes through the shared run_tool_call pipeline unchanged;
    # nothing in this handler restricts which tool the model may pick.
    outcome = run_tool_call(request.question)
    return outcome
@app.post("/calculate", response_model=QueryResponse)
def calculate(request: QueryRequest):
    """Ask a math/calculation question."""
    # The question goes through the shared run_tool_call pipeline unchanged;
    # nothing in this handler restricts which tool the model may pick.
    outcome = run_tool_call(request.question)
    return outcome
@app.post("/contacts", response_model=QueryResponse)
def contacts(request: QueryRequest):
    """Ask a contact lookup question."""
    # The question goes through the shared run_tool_call pipeline unchanged;
    # nothing in this handler restricts which tool the model may pick.
    outcome = run_tool_call(request.question)
    return outcome
@app.get("/health")
def health():
    # Liveness probe: returns a fixed status plus the configured model tag.
    # It does not call the model.
    payload = dict(status="ok", model=MODEL)
    return payload
if __name__ == "__main__":
    # Imported here so uvicorn is only required when this file is run directly.
    import uvicorn

    # Serve on the loopback interface only.
    uvicorn.run(
        app,
        host="127.0.0.1",
        port=8001,
    )