Skip to content

Commit 621a7e0

Browse files
committed
feat: add benchmarking harness with TTFT, throughput, and latency metrics
- Add `tiles bench` command for running benchmarks
- Track TTFT, tokens/sec, total tokens, and latency
- Display metrics after each REPL response
- Save benchmark results to ~/.config/tiles/benchmark_log.jsonl
- Add GenerationMetrics dataclass in Python server
1 parent fe14a70 commit 621a7e0

10 files changed

Lines changed: 331 additions & 20 deletions

File tree

Cargo.lock

Lines changed: 97 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

server/backend/mlx.py

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
from .mlx_runner import MLXRunner
22
from ..cache_utils import get_model_path
33
from fastapi import HTTPException
4-
from ..schemas import ChatMessage, ChatCompletionRequest, downloadRequest
4+
from ..schemas import ChatMessage, ChatCompletionRequest, downloadRequest, GenerationMetrics
55
from ..hf_downloader import pull_model
66

77
import logging
@@ -113,6 +113,7 @@ async def generate_chat_stream(
113113
yield f"data: {json.dumps(initial_response)}\n\n"
114114

115115
# Stream tokens
116+
metrics = None
116117
try:
117118
for token in runner.generate_streaming(
118119
prompt=prompt,
@@ -125,6 +126,11 @@ async def generate_chat_stream(
125126
use_chat_template=False, # Already applied in _format_conversation
126127
use_chat_stop_tokens=False, # Server mode shouldn't stop on chat markers
127128
):
129+
# Check if this is metrics object (last item yielded)
130+
if isinstance(token, GenerationMetrics):
131+
metrics = token
132+
continue
133+
128134
chunk_response = {
129135
"id": completion_id,
130136
"object": "chat.completion.chunk",
@@ -156,7 +162,7 @@ async def generate_chat_stream(
156162
}
157163
yield f"data: {json.dumps(error_response)}\n\n"
158164

159-
# Final response
165+
# Final response with metrics
160166
final_response = {
161167
"id": completion_id,
162168
"object": "chat.completion.chunk",
@@ -165,6 +171,15 @@ async def generate_chat_stream(
165171
"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
166172
}
167173

174+
# Include benchmarking metrics if available
175+
if metrics:
176+
final_response["metrics"] = {
177+
"ttft_ms": metrics.ttft_ms,
178+
"total_tokens": metrics.total_tokens,
179+
"tokens_per_second": metrics.tokens_per_second,
180+
"total_latency_s": metrics.total_latency_s,
181+
}
182+
168183
yield f"data: {json.dumps(final_response)}\n\n"
169184
yield "data: [DONE]\n\n"
170185

server/backend/mlx_runner.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
from mlx_lm.sample_utils import make_repetition_penalty, make_sampler
2121

2222
from ..reasoning_utils import ReasoningExtractor, StreamingReasoningParser
23+
from ..schemas import GenerationMetrics
2324

2425

2526
def get_model_context_length(model_path: str) -> int:
@@ -475,6 +476,7 @@ def generate_streaming(
475476
# Track generation metrics
476477
start_time = time.time()
477478
tokens_generated = 0
479+
ttft = None # Time to first token
478480

479481
# Create sampler with our parameters
480482
sampler = make_sampler(temp=temperature, top_p=top_p)
@@ -567,6 +569,19 @@ def generate_streaming(
567569
yield formatted_token
568570
else:
569571
yield new_part_before_stop
572+
573+
# Yield metrics before returning
574+
if reasoning_parser:
575+
yield from reasoning_parser.finalize()
576+
total_latency = time.time() - start_time
577+
tokens_per_second = tokens_generated / total_latency if total_latency > 0 else 0
578+
ttft_ms = (ttft * 1000) if ttft is not None else 0
579+
yield GenerationMetrics(
580+
ttft_ms=ttft_ms,
581+
total_tokens=tokens_generated,
582+
tokens_per_second=tokens_per_second,
583+
total_latency_s=total_latency
584+
)
570585
return # Stop generation without yielding stop token
571586

572587
# Only check chat stop tokens if no native stop token found (fallback)
@@ -597,9 +612,26 @@ def generate_streaming(
597612
yield formatted_token
598613
else:
599614
yield new_part_before_stop
615+
616+
# Yield metrics before returning
617+
if reasoning_parser:
618+
yield from reasoning_parser.finalize()
619+
total_latency = time.time() - start_time
620+
tokens_per_second = tokens_generated / total_latency if total_latency > 0 else 0
621+
ttft_ms = (ttft * 1000) if ttft is not None else 0
622+
yield GenerationMetrics(
623+
ttft_ms=ttft_ms,
624+
total_tokens=tokens_generated,
625+
tokens_per_second=tokens_per_second,
626+
total_latency_s=total_latency
627+
)
600628
return # Stop generation without yielding stop token
601629

602630
# No stop token found, process the new text
631+
# Capture time to first token
632+
if ttft is None:
633+
ttft = time.time() - start_time
634+
603635
if reasoning_parser:
604636
# Process through reasoning parser for formatting
605637
for formatted_token in reasoning_parser.process_token(new_text):
@@ -617,6 +649,18 @@ def generate_streaming(
617649
if reasoning_parser:
618650
yield from reasoning_parser.finalize()
619651

652+
# Yield metrics at the end
653+
total_latency = time.time() - start_time
654+
tokens_per_second = tokens_generated / total_latency if total_latency > 0 else 0
655+
ttft_ms = (ttft * 1000) if ttft is not None else 0
656+
metrics = GenerationMetrics(
657+
ttft_ms=ttft_ms,
658+
total_tokens=tokens_generated,
659+
tokens_per_second=tokens_per_second,
660+
total_latency_s=total_latency
661+
)
662+
yield metrics
663+
620664
# Print generation statistics if verbose
621665
if self.verbose:
622666
generation_time = time.time() - start_time

server/schemas.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from pydantic import BaseModel, Field
22
from typing import Any, Dict, List, Optional, Union
3+
from dataclasses import dataclass
34

45
class CompletionRequest(BaseModel):
56
model: str
@@ -63,3 +64,12 @@ class StartRequest(BaseModel):
6364

6465
class downloadRequest(BaseModel):
6566
model: str
67+
68+
69+
@dataclass
70+
class GenerationMetrics:
71+
"""Benchmarking metrics for token generation."""
72+
ttft_ms: float # Time to first token in milliseconds
73+
total_tokens: int # Total tokens generated
74+
tokens_per_second: float # Throughput
75+
total_latency_s: float # End-to-end latency in seconds

tiles/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,3 +14,4 @@ tokio = { version = "1" , features = ["macros", "rt-multi-thread"]}
1414
owo-colors = "4"
1515
futures-util = "0.3"
1616
hf-hub = {version = "0.4", features = ["tokio"]}
17+
chrono = "0.4"

tiles/src/commands/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,3 +18,7 @@ pub async fn start_server(runtime: &Runtime) {
1818
pub async fn stop_server(runtime: &Runtime) {
1919
let _ = runtime.stop_server_daemon().await;
2020
}
21+
22+
pub async fn bench(runtime: &Runtime, run_args: RunArgs) {
23+
runtime.bench(run_args).await;
24+
}

tiles/src/main.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ enum Commands {
2222
flags: RunFlags,
2323
},
2424

25+
/// Runs a benchmark and saves results to log file
26+
Bench {
27+
/// Path to the Modelfile (uses default model if not provided)
28+
modelfile_path: Option<String>,
29+
},
30+
2531
/// Checks the status of dependencies
2632
Health,
2733

@@ -70,6 +76,13 @@ pub async fn main() -> Result<(), Box<dyn Error>> {
7076
};
7177
commands::run(&runtime, run_args).await;
7278
}
79+
Commands::Bench { modelfile_path } => {
80+
let run_args = RunArgs {
81+
modelfile_path,
82+
relay_count: 0, // unused by bench
83+
};
84+
commands::bench(&runtime, run_args).await;
85+
}
7386
Commands::Health => {
7487
commands::check_health();
7588
}

tiles/src/runtime/cpu.rs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,4 +23,8 @@ impl CPURuntime {
2323
pub async fn stop_server_daemon(&self) -> Result<()> {
2424
unimplemented!()
2525
}
26+
27+
pub async fn bench(&self, _run_args: super::RunArgs) {
28+
unimplemented!()
29+
}
2630
}

0 commit comments

Comments (0)