Commit 59a73f2: new structure

Parent: 92cc200

File tree: 13 files changed (+159, -1210 lines)

CLAUDE.md

Lines changed: 10 additions & 6 deletions
@@ -15,7 +15,7 @@ This project uses `uv` for Python dependency management. Key commands:
 uv sync
 
 # Run the CLI directly
-tacho gpt-4.1-mini gemini-2.5-flash
+tacho gpt-4.1-mini gemini-2.0-flash
 
 # Build the package
 uv build
@@ -25,14 +25,18 @@ uv build
 
 ## Architecture
 
-The project is intentionally simple with all logic in a single file (`tacho.py`):
+The project has been refactored into a modular structure:
 
-- **Entry point**: `tacho:main` - wrapper function that uses `os._exit()` to suppress warnings
-- **Main CLI app**: `app` - Typer CLI application
+- **Entry point**: `tacho.cli:main` - wrapper function that uses `os._exit()` to suppress warnings
+- **Core modules**:
+  - `cli.py`: Main CLI app using Typer, handles command parsing
+  - `ai.py`: Core benchmarking logic with async model benchmarking
+  - `display.py`: Results presentation and metrics calculation
+  - `config.py`: Configuration and constants
 - **Main functions**:
   - `validate_models()`: Pre-flight validation of model availability
-  - `benchmark_model()`: Core benchmarking logic with optional progress tracking
-  - `calculate_metrics()`: Extracts performance metrics from raw benchmark data
+  - `benchmark_model()`: Core benchmarking logic in ai.py
+  - `calculate_metrics()`: Extracts performance metrics from raw benchmark data (in display.py)
  - `run_benchmarks()`: Orchestrates parallel benchmarking of multiple models
 
 ## Key Design Decisions

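The body of the `tacho.cli:main` wrapper is not part of this diff. A minimal sketch of the pattern CLAUDE.md describes (run the Typer app, then hard-exit via `os._exit()` so shutdown warnings are suppressed) could look like the following; the actual implementation in the repository may differ:

# Illustrative only: this commit does not show main(), so the body below is an
# assumption about the os._exit() wrapper pattern described in CLAUDE.md.
import os

from tacho.cli import app  # the Typer app defined in cli.py (see diff below)


def main():
    exit_code = 0
    try:
        app()  # Typer/Click signals completion by raising SystemExit
    except SystemExit as e:
        exit_code = e.code if isinstance(e.code, int) else 0
    os._exit(exit_code)  # hard exit: pending shutdown warnings are never printed
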
tacho/ai.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+import time
+
+import litellm
+from rich.console import Console
+
+console = Console()
+
+BENCHMARK_PROMPT = """Generate a ~2000 word summary of the history of the USA."""
+VALIDATION_PROMPT = "Do you have time to help? (yes/no)"
+
+
+async def llm(model: str, prompt: str, tokens: int | None = None):
+    messages = [{"role": "user", "content": prompt}]
+    return await litellm.acompletion(model, messages, max_tokens=tokens)
+
+
+async def ping_model(model: str) -> bool:
+    try:
+        await llm(model, VALIDATION_PROMPT, 1)
+        console.print(f"[green]✓[/green] {model}")
+        return True
+    except Exception as e:
+        console.print(f"[red]✗[/red] {model} - {str(e)}")
+        return False
+
+
+async def bench_model(model: str, max_tokens: int) -> tuple[float, int]:
+    """Measure inference time for a single run and return time and tokens"""
+    start_time = time.time()
+    response = await llm(model, BENCHMARK_PROMPT, max_tokens)
+    duration = time.time() - start_time
+    tokens = response.usage.completion_tokens if response.usage else 0
+    return duration, tokens

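A minimal sketch of how the new `ai` module can be exercised on its own, using only the functions shown above; the model id and token limit are placeholders, and provider credentials are assumed to be configured in the environment:

import asyncio

from tacho.ai import bench_model, ping_model


async def main():
    model = "gpt-4.1-mini"  # placeholder; any litellm-compatible model id
    if await ping_model(model):  # prints a green check mark or a red cross
        duration, tokens = await bench_model(model, max_tokens=500)
        print(f"{tokens} tokens in {duration:.1f}s ({tokens / duration:.1f} tok/s)")


asyncio.run(main())
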
tacho/cli.py

Lines changed: 7 additions & 60 deletions
@@ -3,16 +3,13 @@
 
 import typer
 from rich.console import Console
-from rich.table import Table
 
 from .config import load_env
-from .core import ping_models, bench_models
+from .display import run_pings, run_benchmarks, display_results
 
 load_env()
 
-app = typer.Typer(
-    help="CLI tool for measuring LLM inference speeds",
-)
+app = typer.Typer(help="CLI tool for measuring LLM inference speeds")
 console = Console()
 
 
@@ -51,44 +48,12 @@ def bench(
     ),
 ):
     """Benchmark inference speed of different LLM models"""
-    res = asyncio.run(ping_models(models))
+    res = asyncio.run(run_pings(models))
     valid_models = [models[i] for i in range(len(models)) if res[i]]
-
     if not valid_models:
         raise typer.Exit(1)
-
-    results = asyncio.run(bench_models(valid_models, runs, lim))
-
-    if not results:
-        raise typer.Exit(1)
-
-    table = Table(show_header=True, header_style="bold magenta")
-    table.add_column("Model", style="cyan", no_wrap=True)
-    table.add_column("Avg tok/s", justify="right", style="bold green")
-    # table.add_column("Median tok/s", justify="right")
-    table.add_column("Min tok/s", justify="right")
-    table.add_column("Max tok/s", justify="right")
-    table.add_column("Avg Time", justify="right")
-    # table.add_column("Avg Tokens", justify="right")
-
-    # Sort by mean tokens per second (descending)
-    sorted_models = sorted(
-        results.keys(), key=lambda x: results[x]["mean_tps"], reverse=True
-    )
-
-    for model in sorted_models:
-        data = results[model]
-        table.add_row(
-            model,
-            f"{data['mean_tps']:.1f}",
-            # f"{data['median_tps']:.1f}",
-            f"{data['min_tps']:.1f}",
-            f"{data['max_tps']:.1f}",
-            f"{data['avg_time']:.1f}s",
-            # f"{data['avg_tokens']:.0f}",
-        )
-
-    console.print(table)
+    res = asyncio.run(run_benchmarks(valid_models, runs, lim))
+    display_results(valid_models, runs, res)
 
 
 @app.command()
@@ -99,26 +64,8 @@ def ping(
     ),
 ):
     """Check which LLM models are accessible without running benchmarks"""
-    res = asyncio.run(ping_models(models))
-
-    # Count successful models
-    successful = sum(res)
-
-    # Print summary
-    console.print()
-    if successful == len(models):
-        console.print(
-            f"[bold green]All {len(models)} models are accessible![/bold green]"
-        )
-    elif successful > 0:
-        console.print(
-            f"[bold yellow]{successful}/{len(models)} models are accessible[/bold yellow]"
-        )
-    else:
-        console.print("[bold red]No models are accessible[/bold red]")
-
-    # Exit with appropriate code
-    if successful == 0:
+    res = asyncio.run(run_pings(models))
+    if not sum(res):
         raise typer.Exit(1)
 
 

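With this change the `bench` command is a thin pipeline over the new modules: ping the models, drop the unreachable ones, benchmark the rest, and hand the raw results to `display_results`. Stripped of Typer, the equivalent flow is roughly as follows (run count and token limit are placeholder values):

import asyncio

from tacho.display import display_results, run_benchmarks, run_pings

models = ["gpt-4.1-mini", "gemini-2.0-flash"]  # placeholder model ids
runs, lim = 3, 500  # placeholder run count and max-token limit

ping_ok = asyncio.run(run_pings(models))
valid_models = [m for m, ok in zip(models, ping_ok) if ok]
if valid_models:
    results = asyncio.run(run_benchmarks(valid_models, runs, lim))
    display_results(valid_models, runs, results)
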
tacho/core.py

Lines changed: 0 additions & 120 deletions
This file was deleted.

tacho/display.py

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+import asyncio
+from statistics import mean, median
+
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich.table import Table
+
+from collections import defaultdict
+
+from .ai import ping_model, bench_model
+
+console = Console()
+
+
+async def run_pings(models: list[str]):
+    """Run ping checks with progress indicator"""
+    spinner = SpinnerColumn()
+    text = TextColumn("[progress.description]{task.description}")
+    with Progress(spinner, text, transient=True) as prog:
+        prog.add_task("[bold cyan]Checking Model Access...[bold cyan]", total=None)
+        return await asyncio.gather(*[ping_model(m) for m in models])
+
+
+async def run_benchmarks(models: list[str], runs: int, tokens: int):
+    """Run benchmarks with progress indicator"""
+    spinner = SpinnerColumn()
+    text = TextColumn("[progress.description]{task.description}")
+    with Progress(spinner, text, transient=True) as prog:
+        prog.add_task("[bold cyan]Running Benchmark...[/bold cyan]", total=None)
+        tasks = []
+        for m in models:
+            for _ in range(runs):
+                tasks.append(bench_model(m, tokens))
+        return await asyncio.gather(*tasks)
+
+
+def calculate_metrics(times: list[float], tokens: list[int]) -> dict:
+    """Calculate performance metrics from benchmark results"""
+    if not times or not tokens:
+        return {}
+
+    tokens_per_second = [t / time for t, time in zip(tokens, times) if time > 0]
+
+    return {
+        "mean_tps": mean(tokens_per_second) if tokens_per_second else 0,
+        "median_tps": median(tokens_per_second) if tokens_per_second else 0,
+        "min_tps": min(tokens_per_second) if tokens_per_second else 0,
+        "max_tps": max(tokens_per_second) if tokens_per_second else 0,
+        "avg_time": mean(times),
+        "avg_tokens": mean(tokens),
+    }
+
+
+def display_results(models: list[str], runs: int, results: list):
+    """Process and display benchmark results in a formatted table"""
+    # Process raw results into metrics by model
+    model_results = defaultdict(list)
+
+    # Map results back to models based on known ordering
+    idx = 0
+    for model in models:
+        for _ in range(runs):
+            if (
+                idx < len(results)
+                and isinstance(results[idx], tuple)
+                and len(results[idx]) == 2
+            ):
+                model_results[model].append(results[idx])
+            idx += 1
+
+    # Calculate metrics for each model
+    processed_results = {
+        model: calculate_metrics(
+            [time for time, _ in data], [tokens for _, tokens in data]
+        )
+        if data
+        else {}
+        for model, data in model_results.items()
+    }
+
+    if not processed_results:
+        return
+
+    # Display the table
+    table = Table(show_header=True, header_style="bold magenta")
+    table.add_column("Model", style="cyan", no_wrap=True)
+    table.add_column("Avg tok/s", justify="right", style="bold green")
+    table.add_column("Min tok/s", justify="right")
+    table.add_column("Max tok/s", justify="right")
+    table.add_column("Avg Time", justify="right")
+
+    # Sort by mean tokens per second (descending)
+    sorted_models = sorted(
+        processed_results.keys(),
+        key=lambda x: processed_results[x]["mean_tps"],
+        reverse=True,
+    )
+
+    for model in sorted_models:
+        data = processed_results[model]
+        table.add_row(
+            model,
+            f"{data['mean_tps']:.1f}",
+            f"{data['min_tps']:.1f}",
+            f"{data['max_tps']:.1f}",
+            f"{data['avg_time']:.1f}s",
+        )
+
+    console.print(table)

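For a quick sense of what `calculate_metrics` returns, here it is applied to made-up numbers rather than real benchmark output:

from tacho.display import calculate_metrics

times = [4.2, 3.9, 4.5]    # seconds per run (illustrative values)
tokens = [480, 500, 495]   # completion tokens per run (illustrative values)

metrics = calculate_metrics(times, tokens)
print(f"{metrics['mean_tps']:.1f} tok/s average over runs of {metrics['avg_time']:.1f}s")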