TinyLLM-usecases/qwen3/client.py at main · asbrwl/TinyLLM-usecases · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Client script for Qwen3 API. Sends questions, prints results
with telemetry, and stores everything to responses.json.
"""

import json
import logging
from datetime import datetime, timezone
from pathlib import Path

import httpx

# Log bare messages (no level or logger-name prefix) at INFO and above.
logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Loopback address that every request URL is built from.
BASE_URL = "http://127.0.0.1:8001"

# The results file sits in the same directory as this script.
OUTPUT_FILE = Path(__file__).with_name("responses.json")

# (endpoint, question) pairs, in the order they are sent. The first three
# target dedicated endpoints; the last three go through "/ask".
_QUESTION_TABLE = (
    ("/weather", "What is the weather in London?"),
    ("/calculate", "Add 100 and 250"),
    ("/contacts", "Find contact info for Bob"),
    ("/ask", "What is the weather in Mumbai?"),
    ("/ask", "Add 42 and 58"),
    ("/ask", "Search for Charlie in contacts"),
)

QUESTIONS = [
    {"endpoint": route, "question": text} for route, text in _QUESTION_TABLE
]


def send_question(
    client: httpx.Client,
    endpoint: str,
    question: str,
    *,
    timeout: float = 60.0,
) -> dict:
    """Post a question to one server endpoint and return the decoded JSON body.

    Args:
        client: Open httpx client used to send the request.
        endpoint: Path appended to BASE_URL, e.g. "/ask".
        question: Text sent as the "question" field of the JSON payload.
        timeout: Seconds to wait for the request before giving up. Defaults
            to the 60 seconds that used to be hard-coded here.

    Returns:
        The response body parsed from JSON.

    Raises:
        httpx.HTTPError: On a transport failure or timeout, or (as
            httpx.HTTPStatusError) when the server answers with an error
            status.
        ValueError: If the response body is not valid JSON.
    """
    response = client.post(
        f"{BASE_URL}{endpoint}",
        json={"question": question},
        timeout=timeout,
    )
    # Surface error statuses as exceptions instead of parsing an error body.
    response.raise_for_status()
    return response.json()


def print_telemetry(telemetry: dict) -> None:
    """Log a header followed by one aligned line per telemetry field.

    Every key listed below must be present in ``telemetry``; a missing key
    raises KeyError when its line is reached.
    """
    # (label already padded to the value column, telemetry key, unit suffix)
    rows = (
        ("  Model:            ", "model", ""),
        ("  Prompt tokens:    ", "prompt_tokens", ""),
        ("  Eval tokens:      ", "eval_tokens", ""),
        ("  Total tokens:     ", "total_tokens", ""),
        ("  Total time:       ", "total_duration_ms", " ms"),
        ("  Model load:       ", "load_duration_ms", " ms"),
        ("  Prompt eval:      ", "prompt_eval_ms", " ms"),
        ("  Generation:       ", "eval_duration_ms", " ms"),
        ("  Tokens/sec:       ", "tokens_per_sec", ""),
        ("  Wall clock:       ", "wall_time_ms", " ms"),
    )

    logger.info("  --- Telemetry ---")
    for prefix, key, unit in rows:
        logger.info(f"{prefix}{telemetry[key]}{unit}")


def main() -> None:
    """Send every entry in QUESTIONS to the server and save the results.

    Logs the payload of the server's /health endpoint, then posts each
    question in turn, logging its tool calls, the first 200 characters of the
    LLM reply (if any) and the telemetry. The collected results are written
    to OUTPUT_FILE as an indented JSON array.

    A question whose request fails (transport error, error status, or a body
    that is not valid JSON) is logged and skipped, so one bad request does
    not discard the responses already collected.
    """
    stored_responses = []
    failed = 0

    with httpx.Client() as client:
        health = client.get(f"{BASE_URL}/health", timeout=5.0)
        logger.info(f"Server health: {health.json()}")
        logger.info("")

        for item in QUESTIONS:
            endpoint = item["endpoint"]
            question = item["question"]

            logger.info("=" * 60)
            logger.info(f"  Endpoint: {endpoint}")
            logger.info(f"  Question: {question}")

            try:
                result = send_question(client, endpoint, question)
            except (httpx.HTTPError, ValueError) as exc:
                # ValueError covers a response body that is not valid JSON.
                failed += 1
                logger.error(f"  Request failed: {exc}")
                continue

            # Treat a missing or null "tool_calls" as "no tool calls", the
            # same way "llm_response" is already read with .get().
            tool_calls = result.get("tool_calls") or []
            llm_response = result.get("llm_response")

            for tc in tool_calls:
                logger.info(f"  Tool:     {tc['tool_name']}")
                logger.info(f"  Args:     {tc['tool_args']}")
                logger.info(f"  Result:   {tc['result']}")

            if llm_response:
                # Keep the console readable: show only the first 200 chars.
                logger.info(f"  LLM says: {llm_response[:200]}")

            if not tool_calls and not llm_response:
                logger.info("  No tool call or response.")

            print_telemetry(result["telemetry"])

            stored_responses.append({
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "endpoint": endpoint,
                "question": question,
                "tool_calls": tool_calls,
                "llm_response": result.get("llm_response", ""),
                "telemetry": result["telemetry"],
            })

    OUTPUT_FILE.write_text(json.dumps(stored_responses, indent=2), encoding="utf-8")
    logger.info("")
    logger.info("=" * 60)
    logger.info(f"Saved {len(stored_responses)} responses to {OUTPUT_FILE}")
    if failed:
        logger.warning(f"{failed} of {len(QUESTIONS)} requests failed and were skipped")


# Run the client only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()