# main.py (forked from Ashfaqbs/TinyLLM-usecases)
"""
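Qwen3 4B tool-calling demo with full telemetry.

Unlike FunctionGemma (a pure router), Qwen3 can:
- Think before calling tools (thinking mode)
- Summarize tool results in natural language
- Hold multi-turn conversations

This script sends each test query as a single turn: it prints the tool calls
the model requests, runs the matching tools locally, and prints their results
without sending them back to the model.

Telemetry printed: model name, done reason, prompt tokens, eval tokens,
total tokens, total response time, model load time, prompt-eval time,
generation time, tokens/sec, and wall-clock time.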
"""
import time
from langchain_ollama import ChatOllama
from tools import ALL_TOOLS, TOOL_MAP
# Ollama model tag used for every request in this demo.
MODEL = "qwen3:4b"

# Base chat model, then the same model with the project's tools bound to it.
llm = ChatOllama(
    model=MODEL,
    temperature=0,
)
llm_with_tools = llm.bind_tools(ALL_TOOLS)

# Prompts exercised by main(): three single-tool requests and one that asks
# for two tools in the same sentence.
TEST_QUERIES = [
    "What is the weather in Tokyo?",
    "Add 25 and 17",
    "Find contact info for Alice",
    "What is the weather in London and also find contact info for Bob",
]
def extract_telemetry(response) -> dict:
    """Extract telemetry from a response's ``response_metadata`` mapping.

    Durations are read from the ``*_duration`` keys as nanoseconds and
    reported in milliseconds, rounded to one decimal place. Numeric fields
    that are missing from the metadata, or present but ``None``, are treated
    as 0 so the arithmetic below cannot raise on a partial payload.

    Args:
        response: Object that may expose a ``response_metadata`` mapping.
            A missing, ``None`` or empty attribute yields an all-zero report.

    Returns:
        A dict with the keys ``model``, ``done_reason``, ``prompt_tokens``,
        ``eval_tokens``, ``total_tokens``, ``total_duration_ms``,
        ``load_duration_ms``, ``prompt_eval_ms``, ``eval_duration_ms`` and
        ``tokens_per_sec``.
    """
    meta = getattr(response, "response_metadata", None) or {}

    def metric(key: str):
        # ``dict.get(key, 0)`` only covers absent keys; ``or 0`` also covers
        # keys that are present with a None value.
        return meta.get(key) or 0

    prompt_tokens = metric("prompt_eval_count")
    eval_tokens = metric("eval_count")
    total_duration_ns = metric("total_duration")
    load_duration_ns = metric("load_duration")
    prompt_eval_ns = metric("prompt_eval_duration")
    eval_duration_ns = metric("eval_duration")

    # Fall back only when the metadata carries no usable value.
    model = meta.get("model") or MODEL
    done_reason = meta.get("done_reason") or "unknown"

    ns_per_ms = 1_000_000
    # Guard the division: no generation time means no meaningful throughput.
    if eval_duration_ns > 0:
        tokens_per_sec = eval_tokens / (eval_duration_ns / 1_000_000_000)
    else:
        tokens_per_sec = 0

    return {
        "model": model,
        "done_reason": done_reason,
        "prompt_tokens": prompt_tokens,
        "eval_tokens": eval_tokens,
        "total_tokens": prompt_tokens + eval_tokens,
        "total_duration_ms": round(total_duration_ns / ns_per_ms, 1),
        "load_duration_ms": round(load_duration_ns / ns_per_ms, 1),
        "prompt_eval_ms": round(prompt_eval_ns / ns_per_ms, 1),
        "eval_duration_ms": round(eval_duration_ns / ns_per_ms, 1),
        "tokens_per_sec": round(tokens_per_sec, 1),
    }
def print_telemetry(telemetry: dict) -> None:
    """Print the telemetry report to stdout, one labelled line per metric.

    Args:
        telemetry: Mapping that must contain every key listed in ``rows``
            below; a missing key raises ``KeyError`` when its line is
            reached.
    """
    # (label, telemetry key, unit suffix) in the order the lines are printed.
    rows = (
        ("Model", "model", ""),
        ("Done reason", "done_reason", ""),
        ("Prompt tokens", "prompt_tokens", ""),
        ("Eval tokens", "eval_tokens", ""),
        ("Total tokens", "total_tokens", ""),
        ("Total time", "total_duration_ms", " ms"),
        ("Model load time", "load_duration_ms", " ms"),
        ("Prompt eval time", "prompt_eval_ms", " ms"),
        ("Generation time", "eval_duration_ms", " ms"),
        ("Tokens/sec", "tokens_per_sec", ""),
    )
    print(" --- Telemetry ---")
    for label, key, unit in rows:
        print(f" {label}: {telemetry[key]}{unit}")
def main() -> None:
    """Run every query in ``TEST_QUERIES`` through the tool-bound model.

    For each query this prints, in order: the query banner, each tool call
    the model requested together with the result of invoking the matching
    entry in ``TOOL_MAP``, any text content (with everything up to the last
    ``</think>`` tag removed), the telemetry report, and the wall-clock time
    of the model call. Tool results are only printed; they are not sent back
    to the model.
    """
    rule = "=" * 60
    for query in TEST_QUERIES:
        print(f"\n{rule}")
        print(f"QUERY: {query}")
        print(rule)

        # Time only the model call itself, not tool execution or printing.
        wall_start = time.perf_counter()
        response = llm_with_tools.invoke(query)
        wall_elapsed_ms = round((time.perf_counter() - wall_start) * 1000, 1)

        # Tool calls
        if response.tool_calls:
            for tc in response.tool_calls:
                print(f" Tool: {tc['name']}")
                print(f" Args: {tc['args']}")
                matched_tool = TOOL_MAP.get(tc["name"])
                if not matched_tool:
                    # The requested name has no entry in TOOL_MAP; report it
                    # instead of silently printing no result.
                    print(f" Result: <unknown tool '{tc['name']}', not executed>")
                    continue
                result = matched_tool.invoke(tc["args"])
                print(f" Result: {result}")
        else:
            print(" No tool call detected.")

        # Qwen3 can also produce text content alongside tool calls
        if response.content:
            # Keep only the text after the last closing think tag;
            # rpartition returns the whole string when the tag is absent.
            content = response.content.strip().rpartition("</think>")[2].strip()
            if content:
                print(f" Response: {content}")

        # Telemetry
        telemetry = extract_telemetry(response)
        telemetry["wall_time_ms"] = wall_elapsed_ms
        print_telemetry(telemetry)
        print(f" Wall clock time: {wall_elapsed_ms} ms")


if __name__ == "__main__":
    main()