TinyLLM-usecases/qwen3/main.py at main · asbrwl/TinyLLM-usecases · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Qwen3 4B tool calling demo with full telemetry.

Unlike FunctionGemma (pure router), Qwen3 can:
- Think before calling tools (thinking mode)
- Summarize tool results in natural language
- Hold multi-turn conversations

Telemetry printed: prompt tokens, eval tokens, response time,
tokens/sec, load time, and model info.
"""

import time

from langchain_ollama import ChatOllama

from tools import ALL_TOOLS, TOOL_MAP

# Model tag handed to ChatOllama; extract_telemetry also uses it as the
# fallback model name when the response metadata does not report one.
MODEL = "qwen3:4b"

# Chat client created once at import time and shared by every query.
# temperature=0 -- presumably chosen to keep demo output as repeatable as
# possible (NOTE(review): confirm intent).
llm = ChatOllama(model=MODEL, temperature=0)
# Same client with every tool from tools.ALL_TOOLS bound to it.
llm_with_tools = llm.bind_tools(ALL_TOOLS)

# Prompts that main() sends one at a time, each as an independent request.
# The last prompt asks for two different things in a single query.
TEST_QUERIES = [
    "What is the weather in Tokyo?",
    "Add 25 and 17",
    "Find contact info for Alice",
    "What is the weather in London and also find contact info for Bob",
]


def extract_telemetry(response) -> dict:
    """Extract telemetry metadata from the LangChain AIMessage response_metadata.

    Counters and durations that are missing *or* explicitly ``None`` in the
    metadata are treated as zero, so a partially populated response never
    breaks the arithmetic below. Durations are read as nanoseconds and
    reported in milliseconds, rounded to one decimal place.

    Args:
        response: Object exposing a ``response_metadata`` mapping. A missing
            or empty attribute yields an all-zero report.

    Returns:
        A dict with the keys ``model``, ``done_reason``, ``prompt_tokens``,
        ``eval_tokens``, ``total_tokens``, ``total_duration_ms``,
        ``load_duration_ms``, ``prompt_eval_ms``, ``eval_duration_ms`` and
        ``tokens_per_sec``.
    """
    meta = getattr(response, "response_metadata", None) or {}

    def metric(key: str):
        # ``meta.get(key, 0)`` only covers an absent key; a key that is present
        # with a ``None`` value would leak through and raise TypeError later.
        return meta.get(key) or 0

    def as_ms(key: str) -> float:
        # Nanoseconds -> milliseconds, one decimal place.
        return round(metric(key) / 1_000_000, 1)

    prompt_tokens = metric("prompt_eval_count")
    eval_tokens = metric("eval_count")
    eval_duration_ns = metric("eval_duration")

    # Generation throughput; guarded so a zero/unknown duration cannot divide by zero.
    if eval_duration_ns > 0:
        tokens_per_sec = eval_tokens / (eval_duration_ns / 1_000_000_000)
    else:
        tokens_per_sec = 0

    return {
        # Fall back to the configured model tag only when none was reported.
        "model": meta.get("model") or MODEL,
        "done_reason": meta.get("done_reason") or "unknown",
        "prompt_tokens": prompt_tokens,
        "eval_tokens": eval_tokens,
        "total_tokens": prompt_tokens + eval_tokens,
        "total_duration_ms": as_ms("total_duration"),
        "load_duration_ms": as_ms("load_duration"),
        "prompt_eval_ms": as_ms("prompt_eval_duration"),
        "eval_duration_ms": as_ms("eval_duration"),
        "tokens_per_sec": round(tokens_per_sec, 1),
    }


def print_telemetry(telemetry: dict) -> None:
    """Write the telemetry report to stdout, one labelled field per line.

    Args:
        telemetry: Mapping that must contain every key listed in ``layout``
            below; a missing key raises ``KeyError`` at the line that needs it.
    """
    # (line prefix, telemetry key, unit suffix) in display order.
    layout = (
        ("  Model:            ", "model", ""),
        ("  Done reason:      ", "done_reason", ""),
        ("  Prompt tokens:    ", "prompt_tokens", ""),
        ("  Eval tokens:      ", "eval_tokens", ""),
        ("  Total tokens:     ", "total_tokens", ""),
        ("  Total time:       ", "total_duration_ms", " ms"),
        ("  Model load time:  ", "load_duration_ms", " ms"),
        ("  Prompt eval time: ", "prompt_eval_ms", " ms"),
        ("  Generation time:  ", "eval_duration_ms", " ms"),
        ("  Tokens/sec:       ", "tokens_per_sec", ""),
    )

    print("  --- Telemetry ---")
    for prefix, key, suffix in layout:
        print(f"{prefix}{telemetry[key]}{suffix}")


def _visible_text(content) -> str:
    """Return the printable part of a message's content.

    Accepts either a plain string or a list of content blocks (strings, or
    dicts carrying a ``"text"`` entry). Everything up to and including the
    last ``</think>`` tag is dropped so only the final answer remains.
    Returns an empty string when there is nothing to show.
    """
    if not content:
        return ""
    if not isinstance(content, str):
        # Message content may be a list of blocks rather than one string;
        # keep only the textual parts so the string handling below is safe.
        content = "".join(
            part if isinstance(part, str) else str(part.get("text", ""))
            for part in content
            if isinstance(part, (str, dict))
        )
    # rpartition returns the whole string as the tail when the tag is absent.
    return content.rpartition("</think>")[2].strip()


def main() -> None:
    """Send each entry of TEST_QUERIES to the tool-bound model and print a report.

    For every query this prints the requested tool calls, runs each call whose
    name is registered in TOOL_MAP and prints its result, prints any text the
    model produced (without its thinking section), and finishes with the
    telemetry block plus the locally measured wall-clock time. Tool results
    are only printed; they are not sent back to the model.
    """
    banner = "=" * 60

    for query in TEST_QUERIES:
        print(f"\n{banner}")
        print(f"QUERY: {query}")
        print(banner)

        # Wall-clock time around the whole request, measured on the client side.
        wall_start = time.perf_counter()
        response = llm_with_tools.invoke(query)
        wall_elapsed_ms = round((time.perf_counter() - wall_start) * 1000, 1)

        # Tool calls
        if response.tool_calls:
            for tc in response.tool_calls:
                print(f"  Tool:   {tc['name']}")
                print(f"  Args:   {tc['args']}")

                matched_tool = TOOL_MAP.get(tc["name"])
                if matched_tool:
                    result = matched_tool.invoke(tc["args"])
                    print(f"  Result: {result}")
                else:
                    # Report a tool name that is not registered instead of
                    # dropping the call without a trace.
                    print(f"  Result: <skipped: unknown tool {tc['name']!r}>")
        else:
            print("  No tool call detected.")

        # Qwen3 can also produce text content alongside tool calls
        text = _visible_text(response.content)
        if text:
            print(f"  Response: {text}")

        # Telemetry
        telemetry = extract_telemetry(response)
        telemetry["wall_time_ms"] = wall_elapsed_ms
        print_telemetry(telemetry)
        print(f"  Wall clock time:  {wall_elapsed_ms} ms")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()