
Commit 0b03d69

reasoning support, unit tests, bug fixes

1 parent 8b3f01e commit 0b03d69

12 files changed: +916 -55 lines changed

CHANGELOG.md

Lines changed: 1 addition & 13 deletions

@@ -9,21 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 - Comprehensive unit test suite with 31 tests covering all major components
-- Test configuration in `pytest.ini` with markers for unit and integration tests
-- Shared test fixtures in `conftest.py` for mocking external dependencies
-- Tests for configuration module (`test_config.py`)
-- Tests for AI/benchmarking module (`test_ai.py`)
-- Tests for display/metrics module (`test_display.py`)
-- Tests for CLI commands (`test_cli.py`)
-- Development dependencies for testing (pytest, pytest-asyncio, pytest-mock)
 
 ### Changed
-- Project structure now includes a dedicated `tests/` directory
-- All external API calls are properly mocked in tests for reliability
+- Renamed `--lim` to `--tokens`
 
-### Developer Experience
-- Run tests with `uv run pytest` or `uv run pytest -v` for verbose output
-- Tests achieve high coverage of core functionality without requiring API keys
 
 ## [0.5.0] - 2025-01-27
 

@@ -85,7 +74,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - `--lim` parameter to control maximum tokens per response (default: 2000)
 - Full async/parallel execution for all benchmarks
-- Dynamic model testing with `test-models` command
 
 ### Changed
 - Tokens/second is now the primary metric instead of raw time

README.md

Lines changed: 2 additions & 2 deletions

@@ -63,13 +63,13 @@ pip install tacho
 tacho gpt-4.1-nano gemini/gemini-2.0-flash
 
 # Custom settings
-tacho gpt-4.1-nano gemini/gemini-2.0-flash --runs 3 --lim 1000
+tacho gpt-4.1-nano gemini/gemini-2.0-flash --runs 3 --tokens 1000
 ```
 
 ### Command options
 
 - `--runs, -r`: Number of inference runs per model (default: 5)
-- `--lim, -l`: Maximum tokens to generate per response (default: 500)
+- `--tokens, -t`: Maximum tokens to generate per response (default: 500)
 - `--prompt, -p`: Custom prompt for benchmarking
 
 ## Output

ROADMAP.md

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+## Features
+
+- [ ] Nicely display different types of LiteLLM errors
+  - [ ] Not authorized
+  - [ ] Model not found
+  - [ ] Provider not found (?)
+  - [ ] Rate Limit
+  - [ ] Others
+
+
+## Bugs

docs/index.html

Lines changed: 14 additions & 13 deletions

@@ -357,16 +357,17 @@ <h1>tacho - LLM Speed Test</h1>
 <span class="terminal-dot"></span>
 <span class="terminal-dot"></span>
 </div>
-<div class="terminal-content"><span class="dim">$</span> tacho gpt-4.1-nano gemini/gemini-2.0-flash
-
-<span class="success"></span> gemini/gemini-2.0-flash
-<span class="success"></span> gpt-4.1-nano
-┌─────────────────────────┬───────────┬───────────┬───────────┬──────────┐
-│ Model                   │ Avg tok/s │ Min tok/s │ Max tok/s │ Avg Time │
-├─────────────────────────┼───────────┼───────────┼───────────┼──────────┤
-│ gemini/gemini-2.0-flash │ 124.0     │ 110.5     │ 136.6     │ 4.0s     │
-│ gpt-4.1-nano            │ 116.9     │ 105.4     │ 129.5     │ 4.3s     │
-└─────────────────────────┴───────────┴───────────┴───────────┴──────────┘</div>
+<div class="terminal-content"><span class="dim">$</span> tacho gpt-4.1 gemini/gemini-2.5-pro vertex_ai/claude-sonnet-4@20250514
+<span class="success"></span> gpt-4.1
+<span class="success"></span> vertex_ai/claude-sonnet-4@20250514
+<span class="success"></span> gemini/gemini-2.5-pro
+┌────────────────────────────────────┬───────────┬───────────┬───────────┬──────────┐
+│ Model                              │ Avg tok/s │ Min tok/s │ Max tok/s │ Avg Time │
+├────────────────────────────────────┼───────────┼───────────┼───────────┼──────────┤
+│ gemini/gemini-2.5-pro              │ 84.6      │ 50.3      │ 133.8     │ 13.44s   │
+│ gpt-4.1                            │ 49.7      │ 35.1      │ 66.6      │ 10.75s   │
+│ vertex_ai/claude-sonnet-4@20250514 │ 48.7      │ 47.3      │ 50.9      │ 10.27s   │
+└────────────────────────────────────┴───────────┴───────────┴───────────┴──────────┘</div>
 </div>
 </section>
 

@@ -409,8 +410,8 @@ <h3>🔒 100% Private</h3>
 <p>No telemetry or data sent to our servers</p>
 </div>
 <div class="feature">
-<h3>🏓 Ping Models</h3>
-<p>Quickly verify API keys and model access</p>
+<h3>🧠 Reasoning Support</h3>
+<p>Accurately takes into account thinking tokens</p>
 </div>
 </div>
 </section>

@@ -425,7 +426,7 @@ <h2>Usage</h2>
 
 <p>Run benchmarks with custom settings:</p>
 <div class="code-block">
-<code>tacho gpt-4.1-nano claude-3.5-haiku --runs 3 --lim 1000</code>
+<code>tacho gpt-4.1-nano claude-3.5-haiku --runs 3 --tokens 1000</code>
 </div>
 </section>
 </main>

pyproject.toml

Lines changed: 5 additions & 1 deletion

@@ -21,10 +21,14 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
 dependencies = [
-    "litellm[bedrock,vertex_ai]>=1.73.1",
+    "boto3>=1.38.46",
+    "google-auth>=2.40.3",
+    "google>=3.0.0",
+    "litellm>=1.73.1",
     "python-dotenv>=1.1.1",
     "rich>=14.0.0",
     "typer>=0.16.0",
+    "google-cloud-aiplatform>=1.100.0",
 ]
 
 [project.urls]

tacho/ai.py

Lines changed: 7 additions & 1 deletion

@@ -26,5 +26,11 @@ async def bench_model(model: str, max_tokens: int) -> tuple[float, int]:
     start_time = time.time()
     response = await llm(model, BENCHMARK_PROMPT, max_tokens)
     duration = time.time() - start_time
-    tokens = response.usage.completion_tokens if response.usage else 0
+
+
+    tokens = response.usage.completion_tokens
+    if hasattr(response.usage, 'completion_tokens_details') and response.usage.completion_tokens_details:
+        if hasattr(response.usage.completion_tokens_details, 'reasoning_tokens'):
+            tokens += response.usage.completion_tokens_details.reasoning_tokens
+
     return duration, tokens
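The tuple returned here drives the tokens-per-second figures in the results table, so counting reasoning tokens means thinking models are no longer under-reported. A minimal usage sketch, assuming an API key for the chosen model is configured; `o1-mini` is only an illustrative model name:

    import asyncio

    from tacho.ai import bench_model

    async def main() -> None:
        # Single benchmark run; `tokens` now includes reasoning tokens when the
        # provider reports completion_tokens_details, so tok/s covers all output.
        duration, tokens = await bench_model("o1-mini", 500)
        print(f"{tokens} tokens in {duration:.2f}s -> {tokens / duration:.1f} tok/s")

    asyncio.run(main())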

tacho/cli.py

Lines changed: 5 additions & 5 deletions

@@ -26,14 +26,14 @@ def cli_main(
     ctx: typer.Context,
     models: list[str] | None = typer.Argument(None),
     runs: int = typer.Option(5, "--runs", "-r"),
-    lim: int = typer.Option(500, "--lim", "-l"),
+    tokens: int = typer.Option(500, "--tokens", "-t"),
     version: bool | None = typer.Option(
         None, "--version", callback=version_callback, is_eager=True
     ),
 ):
     """Default command when models are provided directly"""
     if ctx.invoked_subcommand is None and models:
-        bench(models, runs, lim)
+        bench(models, runs, tokens)
 
 
 @app.command()

@@ -43,16 +43,16 @@ def bench(
         help="List of models to benchmark using LiteLLM names",
     ),
     runs: int = typer.Option(5, "--runs", "-r", help="Number of runs per model"),
-    lim: int = typer.Option(
-        500, "--lim", "-l", help="Maximum tokens to generate per response"
+    tokens: int = typer.Option(
+        500, "--tokens", "-t", help="Maximum tokens to generate per response"
     ),
 ):
     """Benchmark inference speed of different LLM models"""
     res = asyncio.run(run_pings(models))
     valid_models = [models[i] for i in range(len(models)) if res[i]]
     if not valid_models:
         raise typer.Exit(1)
-    res = asyncio.run(run_benchmarks(valid_models, runs, lim))
+    res = asyncio.run(run_benchmarks(valid_models, runs, tokens))
     display_results(valid_models, runs, res)
 
 
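Because the old flag is removed outright rather than kept as an alias, invocations that pass `--lim` will now fail. A quick smoke check of the rename at the CLI boundary, sketched with Typer's test runner under the assumption that `app` is the Typer instance defined in `tacho/cli.py` (implied by the `@app.command()` decorator above) and that `--help` triggers no model calls:

    from typer.testing import CliRunner

    from tacho.cli import app

    runner = CliRunner()
    result = runner.invoke(app, ["--help"])

    # The renamed option should be advertised; the removed one should not appear.
    assert "--tokens" in result.output
    assert "--lim" not in result.output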

tacho/config.py

Lines changed: 1 addition & 12 deletions

@@ -24,30 +24,19 @@ def ensure_env_file():
     if not env_path.exists():
         template = """# Tacho Configuration File
 # Add your API keys here to avoid exporting them each time
-#
-# === BASIC PROVIDERS ===
-# OpenAI
 # OPENAI_API_KEY=sk-...
 #
-# Anthropic
 # ANTHROPIC_API_KEY=sk-ant-...
 #
-# Google Gemini
 # GEMINI_API_KEY=...
 #
-# Groq
-# GROQ_API_KEY=gsk_...
-#
-# === CLOUD PROVIDERS ===
-# AWS Bedrock (requires all three)
 # AWS_ACCESS_KEY_ID=...
 # AWS_SECRET_ACCESS_KEY=...
 # AWS_REGION_NAME=us-east-1
 #
-# Google Vertex AI (requires both)
 # GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
 # VERTEXAI_PROJECT=your-gcp-project-id
-# VERTEXAI_LOCATION=us-central1  # optional, defaults to us-central1
+# VERTEXAI_LOCATION=us-east5  # optional, defaults to us-east5
 #
 # For more providers, see: https://docs.litellm.ai/docs/providers

tests/test_ai.py

Lines changed: 58 additions & 3 deletions

@@ -78,9 +78,13 @@ async def test_bench_model_success(self, mock_litellm, mocker):
         mock_time = mocker.patch('tacho.ai.time.time')
         mock_time.side_effect = [100.0, 102.5]  # 2.5 second duration
 
-        # Configure mock response with usage data
+        # Configure mock response with usage data (no reasoning tokens)
         mock_response = MagicMock()
-        mock_response.usage.completion_tokens = 150
+        mock_usage = MagicMock()
+        mock_usage.completion_tokens = 150
+        # Explicitly configure to not have completion_tokens_details
+        mock_usage.completion_tokens_details = None
+        mock_response.usage = mock_usage
         mock_litellm.return_value = mock_response
 
         duration, tokens = await bench_model("gpt-4", 500)

@@ -120,4 +124,55 @@ async def test_bench_model_exception_handling(self, mock_litellm):
         mock_litellm.side_effect = Exception("Network error")
 
         with pytest.raises(Exception, match="Network error"):
-            await bench_model("gpt-4", 500)
+            await bench_model("gpt-4", 500)
+
+    @pytest.mark.asyncio
+    async def test_bench_model_with_reasoning_tokens(self, mock_litellm, mocker):
+        """Test benchmark with reasoning models that have completion_tokens_details"""
+        # Mock time
+        mock_time = mocker.patch('tacho.ai.time.time')
+        mock_time.side_effect = [100.0, 103.0]  # 3 second duration
+
+        # Configure mock response with reasoning tokens
+        mock_response = MagicMock()
+        mock_response.usage.completion_tokens = 50  # Regular completion tokens
+
+        # Mock completion_tokens_details with reasoning_tokens
+        mock_details = MagicMock()
+        mock_details.reasoning_tokens = 200  # Reasoning tokens
+        mock_response.usage.completion_tokens_details = mock_details
+
+        mock_litellm.return_value = mock_response
+
+        duration, tokens = await bench_model("o1-mini", 500)
+
+        # Verify results - should include both completion and reasoning tokens
+        assert duration == 3.0
+        assert tokens == 250  # 50 completion + 200 reasoning
+
+        # Verify LLM was called correctly
+        mock_litellm.assert_called_once_with(
+            "o1-mini",
+            [{"role": "user", "content": BENCHMARK_PROMPT}],
+            max_tokens=500
+        )
+
+    @pytest.mark.asyncio
+    async def test_bench_model_with_empty_completion_details(self, mock_litellm, mocker):
+        """Test benchmark when completion_tokens_details exists but has no reasoning_tokens"""
+        # Mock time
+        mock_time = mocker.patch('tacho.ai.time.time')
+        mock_time.side_effect = [100.0, 102.0]
+
+        # Configure mock response with completion_tokens_details but no reasoning_tokens
+        mock_response = MagicMock()
+        mock_response.usage.completion_tokens = 100
+        mock_response.usage.completion_tokens_details = MagicMock(spec=[])  # No reasoning_tokens attribute
+
+        mock_litellm.return_value = mock_response
+
+        duration, tokens = await bench_model("gpt-4", 500)
+
+        # Should only count regular completion tokens
+        assert duration == 2.0
+        assert tokens == 100

tests/test_cli.py

Lines changed: 3 additions & 3 deletions

@@ -45,7 +45,7 @@ def test_bench_command_success(self, runner, mocker):
         mock_display = mocker.patch('tacho.cli.display_results')
 
         # Call bench directly
-        bench(["gpt-4", "claude-3"], runs=5, lim=500)
+        bench(["gpt-4", "claude-3"], runs=5, tokens=500)
 
         # Verify display was called with correct arguments
         mock_display.assert_called_once()

@@ -76,7 +76,7 @@ def test_bench_command_with_options(self, runner, mocker):
         mock_display = mocker.patch('tacho.cli.display_results')
 
         # Call bench directly with options
-        bench(["gpt-4"], runs=10, lim=1000)
+        bench(["gpt-4"], runs=10, tokens=1000)
 
         # Verify display was called
         mock_display.assert_called_once()

@@ -142,7 +142,7 @@ def test_bench_function_partial_valid_models(self, mocker):
         mock_display = mocker.patch('tacho.cli.display_results')
 
         # Call bench with mixed valid/invalid models
-        bench(["gpt-4", "invalid", "claude-3"], runs=3, lim=250)
+        bench(["gpt-4", "invalid", "claude-3"], runs=3, tokens=250)
 
         # Verify only valid models were benchmarked
         mock_display.assert_called_once()
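Together with the new reasoning-token cases in tests/test_ai.py, these CLI tests run without API keys because all LiteLLM calls are mocked; they can be executed with `uv run pytest -v`, or narrowed to a single file, e.g. `uv run pytest tests/test_ai.py -v`.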
