"""
Data access layer for model benchmark data using PostgreSQL.
NOTE ON UNUSED METHODS:
This repository contains several query methods that are currently unused in production
but preserved for future use:
- get_benchmark() - Single benchmark lookup (unused - debugging/future API use)
- get_benchmarks_for_traffic_profile() - Filter by traffic profile (unused)
- get_benchmarks_for_model() - All benchmarks for a model (unused)
- get_benchmarks_for_hardware() - All benchmarks for GPU type (unused)
- get_available_models() - List distinct models (unused - may use for UI)
- get_available_hardware_types() - List distinct GPUs (unused - may use for UI)
- get_traffic_profiles() - List distinct traffic profiles (unused)
- get_all_benchmarks() - Full table dump (unused - debugging only)
PRODUCTION METHOD:
- find_configurations_meeting_slo() - Primary method used by ConfigFinder
These methods are kept for potential Phase 2 API endpoints, debugging, interactive
testing, or future UI features that may need to display available options.
"""

import logging
import os

import psycopg2
from psycopg2.extras import RealDictCursor

logger = logging.getLogger(__name__)


class BenchmarkData:
    """Model performance benchmark entry."""

    def __init__(self, data: dict):
        """Initialize from database row dict."""
        self.model_hf_repo = data["model_hf_repo"]
        self.hardware = data["hardware"]
        self.hardware_count = data["hardware_count"]
        self.framework = data.get("framework", "vllm")
        self.framework_version = data.get("framework_version", "0.6.2")

        # Traffic profile
        self.prompt_tokens = data["prompt_tokens"]
        self.output_tokens = data["output_tokens"]
        self.mean_input_tokens = data["mean_input_tokens"]
        self.mean_output_tokens = data["mean_output_tokens"]

        # TTFT (time to first token) metrics - all percentiles
        self.ttft_mean = data["ttft_mean"]
        self.ttft_p90 = data["ttft_p90"]
        self.ttft_p95 = data["ttft_p95"]
        self.ttft_p99 = data["ttft_p99"]

        # ITL (inter-token latency) metrics - all percentiles
        self.itl_mean = data.get("itl_mean")
        self.itl_p90 = data.get("itl_p90")
        self.itl_p95 = data.get("itl_p95")
        self.itl_p99 = data.get("itl_p99")

        # E2E (end-to-end latency) metrics - all percentiles
        self.e2e_mean = data["e2e_mean"]
        self.e2e_p90 = data["e2e_p90"]
        self.e2e_p95 = data["e2e_p95"]
        self.e2e_p99 = data["e2e_p99"]

        # TPS (tokens per second) metrics - all percentiles
        self.tps_mean = data.get("tps_mean")
        self.tps_p90 = data.get("tps_p90")
        self.tps_p95 = data.get("tps_p95")
        self.tps_p99 = data.get("tps_p99")

        # Throughput (legacy fields kept for backwards compatibility)
        self.tokens_per_second = data["tokens_per_second"]
        self.requests_per_second = data["requests_per_second"]

        # Estimated flag (True for interpolated benchmarks)
        self.estimated = data.get("estimated", False)

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            "model_hf_repo": self.model_hf_repo,
            "hardware": self.hardware,
            "hardware_count": self.hardware_count,
            "framework": self.framework,
            "framework_version": self.framework_version,
            "prompt_tokens": self.prompt_tokens,
            "output_tokens": self.output_tokens,
            "mean_input_tokens": self.mean_input_tokens,
            "mean_output_tokens": self.mean_output_tokens,
            "ttft_mean": self.ttft_mean,
            "ttft_p90": self.ttft_p90,
            "ttft_p95": self.ttft_p95,
            "ttft_p99": self.ttft_p99,
            "itl_mean": self.itl_mean,
            "itl_p90": self.itl_p90,
            "itl_p95": self.itl_p95,
            "itl_p99": self.itl_p99,
            "e2e_mean": self.e2e_mean,
            "e2e_p90": self.e2e_p90,
            "e2e_p95": self.e2e_p95,
            "e2e_p99": self.e2e_p99,
            "tps_mean": self.tps_mean,
            "tps_p90": self.tps_p90,
            "tps_p95": self.tps_p95,
            "tps_p99": self.tps_p99,
            "tokens_per_second": self.tokens_per_second,
            "requests_per_second": self.requests_per_second,
            "estimated": self.estimated,
        }
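

# A minimal construction sketch (all values hypothetical; real rows come from
# psycopg2's RealDictCursor, so optional keys like itl_* and tps_* may be absent
# and fall back to None / defaults):
#
#     row = {
#         "model_hf_repo": "org/model", "hardware": "NVIDIA-L4", "hardware_count": 1,
#         "prompt_tokens": 512, "output_tokens": 256,
#         "mean_input_tokens": 500.0, "mean_output_tokens": 250.0,
#         "ttft_mean": 120.0, "ttft_p90": 180.0, "ttft_p95": 210.0, "ttft_p99": 300.0,
#         "e2e_mean": 9000.0, "e2e_p90": 11000.0, "e2e_p95": 12000.0, "e2e_p99": 15000.0,
#         "tokens_per_second": 850.0, "requests_per_second": 3.4,
#     }
#     assert BenchmarkData(row).to_dict()["estimated"] is False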


class BenchmarkRepository:
    """Repository for querying model benchmark data from PostgreSQL."""

    def __init__(self, database_url: str | None = None, *, validate_connection: bool = False):
        """
        Initialize benchmark repository.

        Args:
            database_url: PostgreSQL connection string (defaults to DATABASE_URL env var)
            validate_connection: If True, test DB connectivity on init (default: False)
        """
        self.database_url = database_url or os.getenv(
            "DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/neuralnav"
        )
        if validate_connection:
            self._test_connection()

    def _test_connection(self):
        """Test database connection on initialization."""
        try:
            conn = self._get_connection()
            conn.close()
            logger.info("Successfully connected to PostgreSQL benchmark database")
        except Exception as e:
            logger.error(f"Failed to connect to PostgreSQL: {e}")
            raise

    def _get_connection(self):
        """Get a database connection."""
        return psycopg2.connect(self.database_url, cursor_factory=RealDictCursor)
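
    # Design note: RealDictCursor yields rows as dict-like objects keyed by column
    # name, which is what lets the query methods below build results via
    # BenchmarkData(dict(row)) without tracking column order.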

    def get_benchmark(
        self,
        model_hf_repo: str,
        hardware: str,
        hardware_count: int,
        prompt_tokens: int,
        output_tokens: int,
    ) -> BenchmarkData | None:
        """
        Get benchmark for specific configuration and traffic profile.

        NOTE: This method is currently unused in production code. It's preserved
        for future API endpoints, debugging, or interactive testing.
        Production code uses find_configurations_meeting_slo(), which queries
        multiple benchmarks with SLO filtering.

        Args:
            model_hf_repo: Model HuggingFace repository
            hardware: GPU type (e.g., NVIDIA-L4, NVIDIA-A100-80GB)
            hardware_count: Number of GPUs (tensor parallel size)
            prompt_tokens: Target prompt length (GuideLLM config)
            output_tokens: Target output length (GuideLLM config)

        Returns:
            BenchmarkData if found, None otherwise
        """
        query = """
            SELECT * FROM exported_summaries
            WHERE model_hf_repo = %s
              AND hardware = %s
              AND hardware_count = %s
              AND prompt_tokens = %s
              AND output_tokens = %s
            LIMIT 1
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                query, (model_hf_repo, hardware, hardware_count, prompt_tokens, output_tokens)
            )
            row = cursor.fetchone()
            cursor.close()
            if row:
                return BenchmarkData(dict(row))
            return None
        finally:
            conn.close()

    def get_benchmarks_for_traffic_profile(
        self,
        model_hf_repo: str,
        hardware: str,
        hardware_count: int,
        prompt_tokens: int,
        output_tokens: int,
    ) -> list[BenchmarkData]:
        """
        Get all benchmarks matching model, hardware, and traffic profile.

        This is similar to get_benchmark but returns a list in case there are
        multiple benchmark runs for the same configuration.

        Args:
            model_hf_repo: Model HuggingFace repository
            hardware: GPU type
            hardware_count: Number of GPUs
            prompt_tokens: Target prompt length
            output_tokens: Target output length

        Returns:
            List of matching benchmarks
        """
        query = """
            SELECT * FROM exported_summaries
            WHERE model_hf_repo = %s
              AND hardware = %s
              AND hardware_count = %s
              AND prompt_tokens = %s
              AND output_tokens = %s
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                query, (model_hf_repo, hardware, hardware_count, prompt_tokens, output_tokens)
            )
            rows = cursor.fetchall()
            cursor.close()
            return [BenchmarkData(dict(row)) for row in rows]
        finally:
            conn.close()

    def get_benchmarks_for_model(self, model_hf_repo: str) -> list[BenchmarkData]:
        """
        Get all benchmarks for a specific model.

        Args:
            model_hf_repo: Model HuggingFace repository

        Returns:
            List of benchmarks for this model
        """
        query = """
            SELECT * FROM exported_summaries
            WHERE model_hf_repo = %s
            ORDER BY hardware, hardware_count, prompt_tokens, output_tokens
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(query, (model_hf_repo,))
            rows = cursor.fetchall()
            cursor.close()
            return [BenchmarkData(dict(row)) for row in rows]
        finally:
            conn.close()

    def get_benchmarks_for_hardware(self, hardware: str) -> list[BenchmarkData]:
        """
        Get all benchmarks for a specific GPU type.

        Args:
            hardware: GPU type (e.g., NVIDIA-A100-80GB)

        Returns:
            List of benchmarks for this hardware
        """
        query = """
            SELECT * FROM exported_summaries
            WHERE hardware = %s
            ORDER BY model_hf_repo, hardware_count, prompt_tokens, output_tokens
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(query, (hardware,))
            rows = cursor.fetchall()
            cursor.close()
            return [BenchmarkData(dict(row)) for row in rows]
        finally:
            conn.close()

    def find_configurations_meeting_slo(
        self,
        prompt_tokens: int,
        output_tokens: int,
        ttft_p95_max_ms: int,
        itl_p95_max_ms: int,
        e2e_p95_max_ms: int,
        min_qps: float = 0,
        percentile: str = "p95",
        gpu_types: list[str] | None = None,
    ) -> list[BenchmarkData]:
        """
        Find all configurations that meet SLO requirements for a traffic profile.

        For each unique system configuration (model_hf_repo, hardware, hardware_count,
        prompt_tokens, output_tokens), selects only the benchmark with the highest
        requests_per_second that still meets SLO requirements. This is critical because
        benchmarks are collected at multiple QPS rates, and we want the maximum throughput
        that doesn't violate SLO targets.

        Args:
            prompt_tokens: Target prompt length
            output_tokens: Target output length
            ttft_p95_max_ms: Maximum acceptable TTFT (ms) - parameter name kept for backwards compat
            itl_p95_max_ms: Maximum acceptable ITL (ms/token) - parameter name kept for backwards compat
            e2e_p95_max_ms: Maximum acceptable E2E (ms) - parameter name kept for backwards compat
            min_qps: Minimum required QPS
            percentile: Which percentile column to use (mean, p90, p95, p99)
            gpu_types: Optional list of GPU types to filter by (normalized canonical names)

        Returns:
            List of benchmarks meeting all criteria (one per system configuration)
        """
        # Map percentile to column suffix
        valid_percentiles = {"mean", "p90", "p95", "p99"}
        if percentile not in valid_percentiles:
            logger.warning(f"Invalid percentile '{percentile}', defaulting to p95")
            percentile = "p95"

        # Build column names based on percentile
        ttft_col = f"ttft_{percentile}"
        itl_col = f"itl_{percentile}"
        e2e_col = f"e2e_{percentile}"

        # Build optional GPU filter clause
        gpu_filter = ""
        if gpu_types:
            gpu_filter = "AND hardware = ANY(%s)"
            logger.info(f"Filtering by GPU types: {gpu_types}")

        logger.info(
            f"Querying benchmarks with percentile={percentile} "
            f"(columns: {ttft_col}, {itl_col}, {e2e_col})"
        )

        # Use window function to rank benchmarks by requests_per_second within each
        # system configuration, then select only the highest QPS that meets SLO.
        # When multiple benchmarks exist at the same QPS, prefer the one with lowest
        # E2E latency. The WHERE clause pins prompt_tokens and output_tokens, so
        # partitioning by the remaining three key columns is sufficient.
        # NOTE: Using string formatting for column names is safe here since we
        # validate percentile above.
        query = f"""
            WITH ranked_configs AS (
                SELECT *,
                       ROW_NUMBER() OVER (
                           PARTITION BY model_hf_repo, hardware, hardware_count
                           ORDER BY requests_per_second DESC, {e2e_col} ASC
                       ) AS rn
                FROM exported_summaries
                WHERE prompt_tokens = %s
                  AND output_tokens = %s
                  AND {ttft_col} <= %s
                  AND {itl_col} <= %s
                  AND {e2e_col} <= %s
                  AND requests_per_second >= %s
                  {gpu_filter}
            )
            SELECT
                id, config_id, model_hf_repo, provider, type,
                ttft_mean, ttft_p90, ttft_p95, ttft_p99,
                e2e_mean, e2e_p90, e2e_p95, e2e_p99,
                itl_mean, itl_p90, itl_p95, itl_p99,
                tps_mean, tps_p90, tps_p95, tps_p99,
                hardware, hardware_count, framework, requests_per_second, tokens_per_second,
                mean_input_tokens, mean_output_tokens,
                prompt_tokens, output_tokens
            FROM ranked_configs
            WHERE rn = 1
            ORDER BY model_hf_repo, hardware, hardware_count
        """
        # Build query parameters
        params: list = [
            prompt_tokens,
            output_tokens,
            ttft_p95_max_ms,
            itl_p95_max_ms,
            e2e_p95_max_ms,
            min_qps,
        ]
        if gpu_types:
            params.append(gpu_types)

        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(query, params)
            rows = cursor.fetchall()
            cursor.close()
            results = [BenchmarkData(dict(row)) for row in rows]
        finally:
            conn.close()

        logger.info(f"Found {len(results)} benchmarks meeting SLO criteria")
        return results
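
    # A hedged usage sketch (parameter values hypothetical; assumes the database
    # holds benchmarks for the requested traffic profile):
    #
    #     repo = BenchmarkRepository()
    #     viable = repo.find_configurations_meeting_slo(
    #         prompt_tokens=512,
    #         output_tokens=256,
    #         ttft_p95_max_ms=2000,
    #         itl_p95_max_ms=50,
    #         e2e_p95_max_ms=30000,
    #         min_qps=1.0,
    #         gpu_types=["NVIDIA-L4", "NVIDIA-A100-80GB"],
    #     )
    #     for b in viable:
    #         print(b.model_hf_repo, b.hardware, b.hardware_count, b.requests_per_second)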

    def get_available_models(self) -> list[str]:
        """Get list of all available models in the database."""
        query = "SELECT DISTINCT model_hf_repo FROM exported_summaries ORDER BY model_hf_repo"
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(query)
            rows = cursor.fetchall()
            cursor.close()
            return [row["model_hf_repo"] for row in rows]
        finally:
            conn.close()

    def get_available_hardware_types(self) -> list[str]:
        """Get list of all available hardware types in the database."""
        query = "SELECT DISTINCT hardware FROM exported_summaries ORDER BY hardware"
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(query)
            rows = cursor.fetchall()
            cursor.close()
            return [row["hardware"] for row in rows]
        finally:
            conn.close()

    def get_traffic_profiles(self) -> list[tuple[int, int]]:
        """Get list of all available traffic profiles (prompt_tokens, output_tokens)."""
        query = """
            SELECT DISTINCT prompt_tokens, output_tokens
            FROM exported_summaries
            ORDER BY prompt_tokens, output_tokens
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(query)
            rows = cursor.fetchall()
            cursor.close()
            return [(row["prompt_tokens"], row["output_tokens"]) for row in rows]
        finally:
            conn.close()

    def get_all_benchmarks(self) -> list[BenchmarkData]:
        """
        Get all benchmarks.

        Warning: This may return a large dataset. Use with caution.
        """
        query = """
            SELECT * FROM exported_summaries
            ORDER BY model_hf_repo, hardware, hardware_count, prompt_tokens, output_tokens
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(query)
            rows = cursor.fetchall()
            cursor.close()
            return [BenchmarkData(dict(row)) for row in rows]
        finally:
            conn.close()
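

# A minimal smoke-test sketch (an assumption-laden convenience, not part of the
# production path: it presumes DATABASE_URL points at a populated database; the
# default connection string above is only a local-dev fallback).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    repo = BenchmarkRepository(validate_connection=True)
    print("Models:", repo.get_available_models())
    print("Hardware:", repo.get_available_hardware_types())
    print("Traffic profiles:", repo.get_traffic_profiles())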