|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Real-world benchmark for DynamicContentDetector. |
| 4 | +
|
| 5 | +Tests the detector against realistic system prompts from AI coding agents, |
| 6 | +chatbots, and enterprise applications. |
| 7 | +""" |
| 8 | + |
| 9 | +import time |
| 10 | +import statistics |
| 11 | +from dataclasses import dataclass |
| 12 | +from typing import Any |
| 13 | + |
| 14 | +from headroom.cache.dynamic_detector import ( |
| 15 | + DetectorConfig, |
| 16 | + DynamicContentDetector, |
| 17 | + DynamicCategory, |
| 18 | +) |
| 19 | + |
| 20 | + |
@dataclass
class BenchmarkResult:
    """Result of a single benchmark run.

    One instance is recorded per detector invocation; run_benchmark collects
    `iterations` of these per prompt so latency statistics (mean/stdev,
    percentiles) can be computed over the runs.
    """
    # Prompt key (e.g. an entry name from REAL_WORLD_PROMPTS) this run measured.
    name: str
    # Length of the input prompt, in characters.
    content_length: int
    # Number of dynamic spans the detector reported for this content.
    spans_found: int
    # Unique span category values observed (order not guaranteed).
    categories: list[str]
    # Character count of the content the detector classified as static.
    static_length: int
    # Character count of the content the detector classified as dynamic.
    dynamic_length: int
    # Wall-clock duration of the detect() call, in milliseconds.
    latency_ms: float
    # Detection tiers the detector actually ran for this invocation.
    tiers_used: list[str]
    # Warnings the detector emitted during this run, if any.
    warnings: list[str]
| 33 | + |
| 34 | + |
# Real-world system prompts, keyed by scenario name.
#
# Each prompt deliberately mixes stable instructions with volatile values
# (dates, times, session/request IDs, metrics, dollar amounts) so the
# benchmark exercises realistic dynamic-content patterns. The final two
# entries are the extremes: almost no dynamic content vs. almost all dynamic.
REAL_WORLD_PROMPTS = {
    # Coding-assistant style: static behavior rules plus current date/time,
    # session/request IDs, username, and a workspace path.
    "claude_code_style": """You are Claude, an AI assistant created by Anthropic to be helpful, harmless, and honest.

Today is Tuesday, January 7, 2026.
Current time: 10:30:45 AM PST.

You are operating in a software development environment with access to:
- File system operations
- Terminal commands
- Web search

Session ID: sess_abc123def456ghi789jkl012
Request ID: req_xyz789abc123def456ghi789
User: tchopra
Workspace: /Users/tchopra/claude-projects/headroom

Be concise, accurate, and helpful. Follow the user's instructions carefully.""",

    # Enterprise assistant: user profile, live system status, and budget
    # figures embedded alongside static policy text.
    "enterprise_assistant": """You are an enterprise AI assistant for Acme Corporation.

Current Date: 2026-01-07T10:30:00Z
Last Updated: 2026-01-07T09:00:00Z

User Profile:
- Name: John Smith
- Employee ID: EMP-2024-00542
- Department: Engineering
- Manager: Sarah Johnson
- Location: San Francisco, CA
- Hire Date: March 15, 2023

System Status:
- API Version: v2.3.1-beta
- Server Load: 45%
- Active Users: 1,247
- Queue Length: 23

Budget Information:
- Monthly Allowance: $5,000.00
- Used This Month: $2,341.67
- Remaining: $2,658.33

Help the user with their work tasks while following company policies.""",

    # Autonomous coding agent: environment facts, task metadata (UUID,
    # timestamps), and placeholder API keys / connection strings.
    "coding_agent": """You are an autonomous coding agent with access to tools.

Environment:
- OS: macOS Darwin 25.1.0
- Working Directory: /Users/developer/projects/myapp
- Git Branch: feature/JIRA-1234-add-auth
- Last Commit: a1b2c3d4e5f6 (2 hours ago)
- Node Version: v20.10.0
- Python Version: 3.11.7

Current Task Context:
- Task ID: 550e8400-e29b-41d4-a716-446655440000
- Created: 2026-01-07T08:15:30Z
- Priority: High
- Estimated Time: 2 hours

API Keys Available:
- OPENAI_API_KEY: sk-proj-xxxxxxxxxxxxxxxxxxxxxxxxxxxx
- ANTHROPIC_API_KEY: sk-ant-xxxxxxxxxxxxxxxxxxxxxxxxxxxx
- DATABASE_URL: postgresql://user:pass@localhost:5432/mydb

Execute tasks step by step, verify each action, and report progress.""",

    # Customer support: ticket/case numbers, customer contact details,
    # and an order history full of IDs and prices.
    "customer_support": """You are a customer support agent for TechStore Inc.

Current Time: January 7, 2026, 3:45 PM EST
Support Ticket: #TKT-2026-0107-4521

Customer Information:
- Name: Alice Chen
- Email: alice.chen@email.com
- Phone: (555) 123-4567
- Customer Since: August 2021
- Loyalty Tier: Gold
- Total Purchases: $12,456.78

Recent Orders:
- Order #ORD-2026-0105-7823 - iPhone 15 Pro - $1,199.00 - Delivered
- Order #ORD-2025-1220-3456 - AirPods Pro - $249.00 - Delivered
- Order #ORD-2025-1115-9012 - MacBook Air - $1,299.00 - Returned

Active Issues:
- Case #CS-2026-0107-001 - Battery drain issue - Open since today

Provide helpful, empathetic support while following company guidelines.""",

    # Data analysis: report metadata plus a dense block of numeric metrics
    # and percentages.
    "data_analysis": """You are a data analysis assistant.

Report Generated: 2026-01-07 10:30:00 UTC
Report ID: RPT-550e8400-e29b-41d4-a716-446655440000
Data Range: 2025-12-01 to 2025-12-31

Summary Statistics:
- Total Revenue: $1,234,567.89
- Total Orders: 45,678
- Average Order Value: $27.03
- Top Product: Widget Pro ($234,567.00)
- Top Region: California (23.4%)

Key Metrics:
- DAU: 125,000
- MAU: 890,000
- Churn Rate: 2.3%
- NPS Score: 67

Anomalies Detected:
- Spike on Dec 15: 3.2x normal traffic
- Drop on Dec 25: 0.4x normal (expected - holiday)

Help analyze the data and provide insights.""",

    # Baseline control: purely static instructions with no volatile values;
    # a detector should find little or nothing here.
    "minimal_static": """You are a helpful AI assistant.

Your role is to:
1. Answer questions accurately
2. Be concise and clear
3. Follow instructions carefully
4. Admit when you don't know something

Always be helpful, harmless, and honest.""",

    # Stress case: almost everything is dynamic — trace/request IDs, tokens,
    # timestamps, geo coordinates, balances, and live stock quotes.
    "heavy_dynamic": """Session started at 2026-01-07T10:30:45.123Z
Request ID: req_abc123def456ghi789jkl012mno345pqr678
Trace ID: 550e8400-e29b-41d4-a716-446655440000
Parent Span: span_xyz789abc123
User Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)
IP Address: 192.168.1.100
Geo: San Francisco, CA, USA (37.7749, -122.4194)

Auth Token: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...
Token Expires: 2026-01-07T11:30:45Z
Refresh Token: rt_abc123def456

Last Login: 2026-01-06T18:45:30Z
Login Count: 1,247
Account Balance: $5,432.10
Credit Limit: $10,000.00

Real-time Stock Prices (as of 10:30 AM):
- AAPL: $185.42 (+1.2%)
- GOOGL: $142.89 (-0.5%)
- MSFT: $378.23 (+0.8%)
- AMZN: $156.78 (+2.1%)

Process this request.""",
}
| 186 | + |
| 187 | + |
def run_benchmark(
    prompts: dict[str, str],
    tiers: list[str],
    iterations: int = 10,
) -> dict[str, Any]:
    """Run the detector over each prompt and record per-iteration results.

    Args:
        prompts: Mapping of prompt name -> prompt text.
        tiers: Detection tiers to enable (e.g. ["regex"]).
        iterations: Repetitions per prompt, so latency statistics can be
            computed over multiple runs.

    Returns:
        Mapping of prompt name -> list of BenchmarkResult, one per iteration.
    """
    config = DetectorConfig(tiers=tiers)  # type: ignore
    detector = DynamicContentDetector(config)

    results: dict[str, list[BenchmarkResult]] = {}

    for name, content in prompts.items():
        runs: list[BenchmarkResult] = []

        for _ in range(iterations):
            start = time.perf_counter()
            result = detector.detect(content)
            elapsed_ms = (time.perf_counter() - start) * 1000

            # sorted() instead of list(set(...)): set iteration order for
            # strings varies across processes (hash randomization), which
            # previously made the reported category order nondeterministic.
            categories = sorted({s.category.value for s in result.spans})

            runs.append(BenchmarkResult(
                name=name,
                content_length=len(content),
                spans_found=len(result.spans),
                categories=categories,
                static_length=len(result.static_content),
                dynamic_length=len(result.dynamic_content),
                latency_ms=elapsed_ms,
                tiers_used=result.tiers_used,
                warnings=result.warnings,
            ))

        results[name] = runs

    return results
| 223 | + |
| 224 | + |
def print_results(
    results: dict[str, list[BenchmarkResult]],
    tier_name: str,
):
    """Print per-prompt benchmark results for one tier configuration."""
    separator = "=" * 80
    print(f"\n{separator}")
    print(f"BENCHMARK RESULTS: {tier_name}")
    print(separator)

    for prompt_name, runs in results.items():
        timings = [run.latency_ms for run in runs]
        mean_ms = statistics.mean(timings)
        stdev_ms = statistics.stdev(timings) if len(timings) > 1 else 0

        # Span/size data is identical across iterations, so inspect run #0.
        sample = runs[0]

        if sample.content_length > 0:
            removed_pct = (1 - sample.static_length / sample.content_length) * 100
        else:
            removed_pct = 0

        category_text = ", ".join(sample.categories) if sample.categories else "none"

        print(f"\n📄 {prompt_name}")
        print(f" Content: {sample.content_length:,} chars")
        print(f" Spans found: {sample.spans_found}")
        print(f" Categories: {category_text}")
        print(f" Static: {sample.static_length:,} chars | Dynamic: {sample.dynamic_length:,} chars")
        print(f" Compression: {removed_pct:.1f}% removed")
        print(f" Latency: {mean_ms:.2f}ms ± {stdev_ms:.2f}ms")
        print(f" Tiers used: {', '.join(sample.tiers_used)}")
        if sample.warnings:
            print(f" ⚠️ Warnings: {len(sample.warnings)}")
| 255 | + |
| 256 | + |
def print_comparison(all_results: dict[str, dict[str, list[BenchmarkResult]]]):
    """Print a side-by-side tier comparison table plus per-tier summaries.

    Args:
        all_results: Mapping of tier config name -> (prompt name -> runs).
    """
    print(f"\n{'='*80}")
    print("TIER COMPARISON")
    print(f"{'='*80}")

    tiers = list(all_results.keys())

    # Derive the prompt list from the results themselves instead of the
    # module-level REAL_WORLD_PROMPTS global, so this function works with
    # any prompt set passed to run_benchmark. First-seen order is kept.
    prompts: list[str] = []
    for tier_results in all_results.values():
        for prompt in tier_results:
            if prompt not in prompts:
                prompts.append(prompt)

    # Header: one "spans | latency" column pair per tier.
    header = f"{'Prompt':<25}"
    for tier in tiers:
        header += f" | {tier:>12} spans | {'latency':>8}"
    print(header)
    print("-" * len(header))

    for prompt in prompts:
        row = f"{prompt:<25}"
        for tier in tiers:
            if prompt in all_results[tier]:
                runs = all_results[tier][prompt]
                spans = runs[0].spans_found
                latency = statistics.mean([r.latency_ms for r in runs])
                row += f" | {spans:>12} | {latency:>7.2f}ms"
            else:
                row += f" | {'N/A':>12} | {'N/A':>8}"
        print(row)

    # Summary
    print(f"\n{'='*80}")
    print("SUMMARY")
    print(f"{'='*80}")

    for tier in tiers:
        all_latencies: list[float] = []
        total_spans = 0
        for runs in all_results[tier].values():
            all_latencies.extend(r.latency_ms for r in runs)
            total_spans += runs[0].spans_found

        if not all_latencies:
            # Guard: statistics.mean raises on an empty sequence, which
            # previously crashed the summary for a tier with no results.
            print(f"\n{tier}: no results")
            continue

        avg = statistics.mean(all_latencies)
        p50 = statistics.median(all_latencies)
        # Nearest-rank P99; fall back to the mean for a single sample.
        p99 = sorted(all_latencies)[int(len(all_latencies) * 0.99)] if len(all_latencies) > 1 else avg

        print(f"\n{tier}:")
        print(f" Total spans detected: {total_spans}")
        print(f" Avg latency: {avg:.2f}ms")
        print(f" P50 latency: {p50:.2f}ms")
        print(f" P99 latency: {p99:.2f}ms")
| 307 | + |
| 308 | + |
| 309 | +def show_detection_details(prompt_name: str, content: str): |
| 310 | + """Show detailed detection for a specific prompt.""" |
| 311 | + |
| 312 | + print(f"\n{'='*80}") |
| 313 | + print(f"DETECTION DETAILS: {prompt_name}") |
| 314 | + print(f"{'='*80}") |
| 315 | + |
| 316 | + config = DetectorConfig(tiers=["regex"]) |
| 317 | + detector = DynamicContentDetector(config) |
| 318 | + result = detector.detect(content) |
| 319 | + |
| 320 | + print(f"\nOriginal content ({len(content)} chars):") |
| 321 | + print("-" * 40) |
| 322 | + print(content[:500] + "..." if len(content) > 500 else content) |
| 323 | + |
| 324 | + print(f"\n\nDetected spans ({len(result.spans)}):") |
| 325 | + print("-" * 40) |
| 326 | + for span in result.spans: |
| 327 | + print(f" [{span.category.value:12}] '{span.text[:50]}{'...' if len(span.text) > 50 else ''}'") |
| 328 | + |
| 329 | + print(f"\n\nStatic content ({len(result.static_content)} chars):") |
| 330 | + print("-" * 40) |
| 331 | + print(result.static_content[:500] + "..." if len(result.static_content) > 500 else result.static_content) |
| 332 | + |
| 333 | + print(f"\n\nDynamic content ({len(result.dynamic_content)} chars):") |
| 334 | + print("-" * 40) |
| 335 | + print(result.dynamic_content if result.dynamic_content else "(none)") |
| 336 | + |
| 337 | + |
def main():
    """Drive the benchmark: per-tier runs, optional comparison, detail dumps."""
    banner = "=" * 80

    print("🚀 Dynamic Content Detector - Real World Benchmark")
    print(banner)

    iterations = 20

    # Tier configurations to exercise; optional tiers remain commented out
    # until their extra dependencies are installed.
    tier_configs = {
        "regex_only": ["regex"],
        # "regex+ner": ["regex", "ner"],  # Uncomment if spacy installed
        # "all_tiers": ["regex", "ner", "semantic"],  # Uncomment if all deps installed
    }

    all_results: dict[str, dict[str, list[BenchmarkResult]]] = {}

    for tier_name, tiers in tier_configs.items():
        print(f"\n⏱️ Running {tier_name} ({iterations} iterations per prompt)...")
        tier_results = run_benchmark(REAL_WORLD_PROMPTS, tiers, iterations)
        all_results[tier_name] = tier_results
        print_results(tier_results, tier_name)

    # A cross-tier table only makes sense when more than one config ran.
    if len(all_results) > 1:
        print_comparison(all_results)

    print("\n" + banner)
    print("DETAILED DETECTION EXAMPLES")
    print(banner)

    for example in ("claude_code_style", "enterprise_assistant", "heavy_dynamic"):
        show_detection_details(example, REAL_WORLD_PROMPTS[example])


if __name__ == "__main__":
    main()
0 commit comments