TinyLLM-usecases/qwen3/server.py at main · asbrwl/TinyLLM-usecases · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
FastAPI server for Qwen3 4B tool calling with telemetry.

Tools imported from tools.py. Every response includes LLM telemetry.
"""

import time

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from langchain_ollama import ChatOllama

from tools import ALL_TOOLS, TOOL_MAP

# Ollama model tag; used to build the client below, as the fallback model
# name in telemetry, and in the /health payload.
MODEL = "qwen3:4b"

# Both objects are created once at import time and shared by every request.
llm = ChatOllama(model=MODEL, temperature=0)
# Same model with every tool from tools.py (ALL_TOOLS) bound to it.
llm_with_tools = llm.bind_tools(ALL_TOOLS)


# --- Request/Response Models ---

class QueryRequest(BaseModel):
    # Request body accepted by every POST endpoint in this file.
    question: str = Field(description="Natural language question for Qwen3")


class ToolCallResult(BaseModel):
    # Record of one tool call executed by run_tool_call.
    tool_name: str  # tool name requested by the model (tc["name"])
    tool_args: dict  # arguments the tool was invoked with (tc["args"])
    result: str  # str() of the value the tool returned


class Telemetry(BaseModel):
    # Per-request LLM statistics, populated by extract_telemetry from the
    # response's response_metadata mapping.
    model: str  # metadata "model", falling back to MODEL
    done_reason: str  # metadata "done_reason", falling back to "unknown"
    prompt_tokens: int  # metadata "prompt_eval_count"
    eval_tokens: int  # metadata "eval_count"
    total_tokens: int  # prompt_tokens + eval_tokens
    # The four *_ms fields below are the matching metadata durations divided
    # by 1_000_000 and rounded to one decimal; the source values are
    # presumably nanoseconds (extract_telemetry names them *_ns) - confirm
    # against the Ollama API docs.
    total_duration_ms: float  # from "total_duration"
    load_duration_ms: float  # from "load_duration"
    prompt_eval_ms: float  # from "prompt_eval_duration"
    eval_duration_ms: float  # from "eval_duration"
    tokens_per_sec: float  # eval_tokens per second of eval duration, 0 if unknown
    wall_time_ms: float  # elapsed time measured by the caller around the LLM call


class QueryResponse(BaseModel):
    # Response body returned by every POST endpoint in this file.
    question: str  # the question exactly as received
    tool_calls: list[ToolCallResult]  # one entry per executed tool call; may be empty
    llm_response: str  # model text with anything up to the last "</think>" removed
    telemetry: Telemetry  # statistics for the single LLM invocation


# --- FastAPI App ---

# Application object that the route decorators below register against and
# that uvicorn serves when this file is run as a script.
app = FastAPI(
    title="Qwen3 4B Tool Calling API",
    description="Tool calling + conversation powered by Qwen3 4B via Ollama, with telemetry",
    version="1.0.0",
)


def extract_telemetry(response, wall_time_ms: float) -> Telemetry:
    """Extract telemetry from LangChain response metadata.

    Args:
        response: Object returned by the LLM call. Its ``response_metadata``
            mapping is read when present; a missing or empty attribute is
            treated as "no metadata".
        wall_time_ms: Elapsed time measured by the caller, passed through
            unchanged.

    Returns:
        A ``Telemetry`` instance. Counters and durations that are absent
        *or explicitly ``None``* in the metadata are reported as 0, a
        missing model name falls back to ``MODEL`` and a missing done
        reason to ``"unknown"``.
    """
    meta = getattr(response, "response_metadata", None) or {}

    def metric(key: str):
        # `meta.get(key, 0)` only covers an absent key. A key that is present
        # with value None would leak None into the arithmetic below (and into
        # Telemetry validation), so coerce falsy values to 0 here.
        return meta.get(key) or 0

    prompt_tokens = metric("prompt_eval_count")
    eval_tokens = metric("eval_count")
    total_ns = metric("total_duration")
    load_ns = metric("load_duration")
    prompt_eval_ns = metric("prompt_eval_duration")
    eval_ns = metric("eval_duration")

    # Guard the division: with no recorded eval duration there is no rate.
    tokens_per_sec = (eval_tokens / (eval_ns / 1_000_000_000)) if eval_ns > 0 else 0

    return Telemetry(
        model=meta.get("model") or MODEL,
        done_reason=meta.get("done_reason") or "unknown",
        prompt_tokens=prompt_tokens,
        eval_tokens=eval_tokens,
        total_tokens=prompt_tokens + eval_tokens,
        # Durations are divided by 1_000_000 to express them in milliseconds.
        total_duration_ms=round(total_ns / 1_000_000, 1),
        load_duration_ms=round(load_ns / 1_000_000, 1),
        prompt_eval_ms=round(prompt_eval_ns / 1_000_000, 1),
        eval_duration_ms=round(eval_ns / 1_000_000, 1),
        tokens_per_sec=round(tokens_per_sec, 1),
        wall_time_ms=wall_time_ms,
    )


def run_tool_call(question: str) -> QueryResponse:
    """Ask Qwen3 one question, run the tools it requests, and package the outcome.

    The model is invoked exactly once; tool outputs are returned to the
    caller and are not fed back to the model.

    Args:
        question: Natural language question forwarded verbatim to the model.

    Returns:
        A ``QueryResponse`` with the executed tool calls, the model's text
        (with any leading thinking section removed) and telemetry.

    Raises:
        HTTPException: 500 when the model requests a tool that is not in
            ``TOOL_MAP``.
    """
    # Only the LLM round-trip is timed; tool execution is excluded.
    started = time.perf_counter()
    response = llm_with_tools.invoke(question)
    elapsed_ms = round((time.perf_counter() - started) * 1000, 1)

    executed = []
    for call in response.tool_calls or []:
        name = call["name"]
        tool = TOOL_MAP.get(name)
        if tool is None:
            raise HTTPException(status_code=500, detail=f"Unknown tool: {name}")
        arguments = call["args"]
        output = tool.invoke(arguments)
        executed.append(
            ToolCallResult(tool_name=name, tool_args=arguments, result=str(output))
        )

    # Keep only the text after the last closing think tag; rpartition returns
    # the whole string in its final slot when the tag is absent.
    raw_text = response.content
    answer = raw_text.strip().rpartition("</think>")[2].strip() if raw_text else ""

    telemetry = extract_telemetry(response, elapsed_ms)
    return QueryResponse(
        question=question,
        tool_calls=executed,
        llm_response=answer,
        telemetry=telemetry,
    )


@app.post("/ask", response_model=QueryResponse)
def ask(request: QueryRequest) -> QueryResponse:
    """Generic endpoint - Qwen3 auto-routes to the right tool."""
    # Delegates to run_tool_call, which invokes the model with every tool bound.
    return run_tool_call(request.question)


@app.post("/weather", response_model=QueryResponse)
def weather(request: QueryRequest) -> QueryResponse:
    """Ask a weather-related question."""
    # NOTE(review): same call as /ask - nothing here restricts the model to
    # weather tools; the route differs only in path and documentation.
    return run_tool_call(request.question)


@app.post("/calculate", response_model=QueryResponse)
def calculate(request: QueryRequest) -> QueryResponse:
    """Ask a math/calculation question."""
    # NOTE(review): same call as /ask - nothing here restricts the model to
    # calculation tools; the route differs only in path and documentation.
    return run_tool_call(request.question)


@app.post("/contacts", response_model=QueryResponse)
def contacts(request: QueryRequest) -> QueryResponse:
    """Ask a contact lookup question."""
    # NOTE(review): same call as /ask - nothing here restricts the model to
    # contact tools; the route differs only in path and documentation.
    return run_tool_call(request.question)


@app.get("/health")
def health():
    # Returns a fixed payload with the configured model tag; it does not
    # call the model, so it does not prove the backend is reachable.
    return {"status": "ok", "model": MODEL}


if __name__ == "__main__":
    # uvicorn is imported only when the file is executed as a script.
    import uvicorn
    # Serve the app on the loopback address, port 8001.
    uvicorn.run(app, host="127.0.0.1", port=8001)