Commit 59a73f2: new structure

Parent: 92cc200

File tree: 13 files changed (+159, -1210 lines)

CLAUDE.md

Lines changed: 10 additions & 6 deletions
@@ -15,7 +15,7 @@ This project uses `uv` for Python dependency management. Key commands:
 uv sync
 
 # Run the CLI directly
-tacho gpt-4.1-mini gemini-2.5-flash
+tacho gpt-4.1-mini gemini-2.0-flash
 
 # Build the package
 uv build
@@ -25,14 +25,18 @@ uv build
 
 ## Architecture
 
-The project is intentionally simple with all logic in a single file (`tacho.py`):
+The project has been refactored into a modular structure:
 
-- **Entry point**: `tacho:main` - wrapper function that uses `os._exit()` to suppress warnings
-- **Main CLI app**: `app` - Typer CLI application
+- **Entry point**: `tacho.cli:main` - wrapper function that uses `os._exit()` to suppress warnings
+- **Core modules**:
+  - `cli.py`: Main CLI app using Typer, handles command parsing
+  - `ai.py`: Core benchmarking logic with async model benchmarking
+  - `display.py`: Results presentation and metrics calculation
+  - `config.py`: Configuration and constants
 - **Main functions**:
   - `validate_models()`: Pre-flight validation of model availability
-  - `benchmark_model()`: Core benchmarking logic with optional progress tracking
-  - `calculate_metrics()`: Extracts performance metrics from raw benchmark data
+  - `benchmark_model()`: Core benchmarking logic in ai.py
+  - `calculate_metrics()`: Extracts performance metrics from raw benchmark data (in display.py)
  - `run_benchmarks()`: Orchestrates parallel benchmarking of multiple models
 
 ## Key Design Decisions

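The body of the `tacho.cli:main` wrapper is not part of this diff. A minimal sketch of the pattern CLAUDE.md describes (run the Typer app, then hard-exit via `os._exit()` so shutdown warnings are suppressed) could look like the following; the actual implementation in the repository may differ:

# Illustrative only: this commit does not show main(), so the body below is an
# assumption about the os._exit() wrapper pattern described in CLAUDE.md.
import os

from tacho.cli import app  # the Typer app defined in cli.py (see diff below)


def main():
    exit_code = 0
    try:
        app()  # Typer/Click signals completion by raising SystemExit
    except SystemExit as e:
        exit_code = e.code if isinstance(e.code, int) else 0
    os._exit(exit_code)  # hard exit: pending shutdown warnings are never printed
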
tacho/ai.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+import time
+
+import litellm
+from rich.console import Console
+
+console = Console()
+
+BENCHMARK_PROMPT = """Generate a ~2000 word summary of the history of the USA."""
+VALIDATION_PROMPT = "Do you have time to help? (yes/no)"
+
+
+async def llm(model: str, prompt: str, tokens: int | None = None):
+    messages = [{"role": "user", "content": prompt}]
+    return await litellm.acompletion(model, messages, max_tokens=tokens)
+
+
+async def ping_model(model: str) -> bool:
+    try:
+        await llm(model, VALIDATION_PROMPT, 1)
+        console.print(f"[green]✓[/green] {model}")
+        return True
+    except Exception as e:
+        console.print(f"[red]✗[/red] {model} - {str(e)}")
+        return False
+
+
+async def bench_model(model: str, max_tokens: int) -> tuple[float, int]:
+    """Measure inference time for a single run and return time and tokens"""
+    start_time = time.time()
+    response = await llm(model, BENCHMARK_PROMPT, max_tokens)
+    duration = time.time() - start_time
+    tokens = response.usage.completion_tokens if response.usage else 0
+    return duration, tokens

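A minimal sketch of how the new `ai` module can be exercised on its own, using only the functions shown above; the model id and token limit are placeholders, and provider credentials are assumed to be configured in the environment:

import asyncio

from tacho.ai import bench_model, ping_model


async def main():
    model = "gpt-4.1-mini"  # placeholder; any litellm-compatible model id
    if await ping_model(model):  # prints a green check mark or a red cross
        duration, tokens = await bench_model(model, max_tokens=500)
        print(f"{tokens} tokens in {duration:.1f}s ({tokens / duration:.1f} tok/s)")


asyncio.run(main())
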
tacho/cli.py

Lines changed: 7 additions & 60 deletions
@@ -3,16 +3,13 @@
 
 import typer
 from rich.console import Console
-from rich.table import Table
 
 from .config import load_env
-from .core import ping_models, bench_models
+from .display import run_pings, run_benchmarks, display_results
 
 load_env()
 
-app = typer.Typer(
-    help="CLI tool for measuring LLM inference speeds",
-)
+app = typer.Typer(help="CLI tool for measuring LLM inference speeds")
 console = Console()
 
 
@@ -51,44 +48,12 @@ def bench(
     ),
 ):
     """Benchmark inference speed of different LLM models"""
-    res = asyncio.run(ping_models(models))
+    res = asyncio.run(run_pings(models))
     valid_models = [models[i] for i in range(len(models)) if res[i]]
-
     if not valid_models:
         raise typer.Exit(1)
-
-    results = asyncio.run(bench_models(valid_models, runs, lim))
-
-    if not results:
-        raise typer.Exit(1)
-
-    table = Table(show_header=True, header_style="bold magenta")
-    table.add_column("Model", style="cyan", no_wrap=True)
-    table.add_column("Avg tok/s", justify="right", style="bold green")
-    # table.add_column("Median tok/s", justify="right")
-    table.add_column("Min tok/s", justify="right")
-    table.add_column("Max tok/s", justify="right")
-    table.add_column("Avg Time", justify="right")
-    # table.add_column("Avg Tokens", justify="right")
-
-    # Sort by mean tokens per second (descending)
-    sorted_models = sorted(
-        results.keys(), key=lambda x: results[x]["mean_tps"], reverse=True
-    )
-
-    for model in sorted_models:
-        data = results[model]
-        table.add_row(
-            model,
-            f"{data['mean_tps']:.1f}",
-            # f"{data['median_tps']:.1f}",
-            f"{data['min_tps']:.1f}",
-            f"{data['max_tps']:.1f}",
-            f"{data['avg_time']:.1f}s",
-            # f"{data['avg_tokens']:.0f}",
-        )
-
-    console.print(table)
+    res = asyncio.run(run_benchmarks(valid_models, runs, lim))
+    display_results(valid_models, runs, res)
 
 
 @app.command()
@@ -99,26 +64,8 @@ def ping(
     ),
 ):
     """Check which LLM models are accessible without running benchmarks"""
-    res = asyncio.run(ping_models(models))
-
-    # Count successful models
-    successful = sum(res)
-
-    # Print summary
-    console.print()
-    if successful == len(models):
-        console.print(
-            f"[bold green]All {len(models)} models are accessible![/bold green]"
-        )
-    elif successful > 0:
-        console.print(
-            f"[bold yellow]{successful}/{len(models)} models are accessible[/bold yellow]"
-        )
-    else:
-        console.print("[bold red]No models are accessible[/bold red]")
-
-    # Exit with appropriate code
-    if successful == 0:
+    res = asyncio.run(run_pings(models))
+    if not sum(res):
         raise typer.Exit(1)
 
 

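With this change the `bench` command is a thin pipeline over the new modules: ping the models, drop the unreachable ones, benchmark the rest, and hand the raw results to `display_results`. Stripped of Typer, the equivalent flow is roughly as follows (run count and token limit are placeholder values):

import asyncio

from tacho.display import display_results, run_benchmarks, run_pings

models = ["gpt-4.1-mini", "gemini-2.0-flash"]  # placeholder model ids
runs, lim = 3, 500  # placeholder run count and max-token limit

ping_ok = asyncio.run(run_pings(models))
valid_models = [m for m, ok in zip(models, ping_ok) if ok]
if valid_models:
    results = asyncio.run(run_benchmarks(valid_models, runs, lim))
    display_results(valid_models, runs, results)
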
tacho/core.py

Lines changed: 0 additions & 120 deletions
This file was deleted.

tacho/display.py

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+import asyncio
+from statistics import mean, median
+
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich.table import Table
+
+from collections import defaultdict
+
+from .ai import ping_model, bench_model
+
+console = Console()
+
+
+async def run_pings(models: list[str]):
+    """Run ping checks with progress indicator"""
+    spinner = SpinnerColumn()
+    text = TextColumn("[progress.description]{task.description}")
+    with Progress(spinner, text, transient=True) as prog:
+        prog.add_task("[bold cyan]Checking Model Access...[bold cyan]", total=None)
+        return await asyncio.gather(*[ping_model(m) for m in models])
+
+
+async def run_benchmarks(models: list[str], runs: int, tokens: int):
+    """Run benchmarks with progress indicator"""
+    spinner = SpinnerColumn()
+    text = TextColumn("[progress.description]{task.description}")
+    with Progress(spinner, text, transient=True) as prog:
+        prog.add_task("[bold cyan]Running Benchmark...[/bold cyan]", total=None)
+        tasks = []
+        for m in models:
+            for _ in range(runs):
+                tasks.append(bench_model(m, tokens))
+        return await asyncio.gather(*tasks)
+
+
+def calculate_metrics(times: list[float], tokens: list[int]) -> dict:
+    """Calculate performance metrics from benchmark results"""
+    if not times or not tokens:
+        return {}
+
+    tokens_per_second = [t / time for t, time in zip(tokens, times) if time > 0]
+
+    return {
+        "mean_tps": mean(tokens_per_second) if tokens_per_second else 0,
+        "median_tps": median(tokens_per_second) if tokens_per_second else 0,
+        "min_tps": min(tokens_per_second) if tokens_per_second else 0,
+        "max_tps": max(tokens_per_second) if tokens_per_second else 0,
+        "avg_time": mean(times),
+        "avg_tokens": mean(tokens),
+    }
+
+
+def display_results(models: list[str], runs: int, results: list):
+    """Process and display benchmark results in a formatted table"""
+    # Process raw results into metrics by model
+    model_results = defaultdict(list)
+
+    # Map results back to models based on known ordering
+    idx = 0
+    for model in models:
+        for _ in range(runs):
+            if (
+                idx < len(results)
+                and isinstance(results[idx], tuple)
+                and len(results[idx]) == 2
+            ):
+                model_results[model].append(results[idx])
+            idx += 1
+
+    # Calculate metrics for each model
+    processed_results = {
+        model: calculate_metrics(
+            [time for time, _ in data], [tokens for _, tokens in data]
+        )
+        if data
+        else {}
+        for model, data in model_results.items()
+    }
+
+    if not processed_results:
+        return
+
+    # Display the table
+    table = Table(show_header=True, header_style="bold magenta")
+    table.add_column("Model", style="cyan", no_wrap=True)
+    table.add_column("Avg tok/s", justify="right", style="bold green")
+    table.add_column("Min tok/s", justify="right")
+    table.add_column("Max tok/s", justify="right")
+    table.add_column("Avg Time", justify="right")
+
+    # Sort by mean tokens per second (descending)
+    sorted_models = sorted(
+        processed_results.keys(),
+        key=lambda x: processed_results[x]["mean_tps"],
+        reverse=True,
+    )
+
+    for model in sorted_models:
+        data = processed_results[model]
+        table.add_row(
+            model,
+            f"{data['mean_tps']:.1f}",
+            f"{data['min_tps']:.1f}",
+            f"{data['max_tps']:.1f}",
+            f"{data['avg_time']:.1f}s",
+        )
+
+    console.print(table)

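For a quick sense of what `calculate_metrics` returns, here it is applied to made-up numbers rather than real benchmark output:

from tacho.display import calculate_metrics

times = [4.2, 3.9, 4.5]    # seconds per run (illustrative values)
tokens = [480, 500, 495]   # completion tokens per run (illustrative values)

metrics = calculate_metrics(times, tokens)
print(f"{metrics['mean_tps']:.1f} tok/s average over runs of {metrics['avg_time']:.1f}s")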