22# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33# SPDX-License-Identifier: Apache-2.0
44
5- """Generate benchmark-rollup.json from sa-bench results."""
5+ """Generate benchmark-rollup.json and benchmark-rollup.csv from sa-bench results."""
66
7+ from __future__ import annotations
8+
9+ import csv
710import json
11+ from collections import Counter
12+ import math
13+ import re
814import sys
915from datetime import datetime , timezone
1016from pathlib import Path
17+ from typing import Any , NamedTuple
18+
19+ import yaml
20+
21+
# Column order for benchmark-rollup.csv; rows produced elsewhere in this
# module must use exactly these keys.
OUTPUT_FIELDS = [
    "Config",
    "Total GPU Count",
    "Decode GPU Count",
    "Concurrency",
    "Total Token Throughput",
    "Output Token Throughput",
    "Median TTFT",
    "Median TPOT",
    "Median ITL",
    "P90 Decode Running Requests",
    "Output Token Throughput per User",
    "Total Token Throughput per GPU",
]

# Matches decode-log status lines such as "#running-req: 12";
# capture group 1 is the running-request count.
RUNNING_REQ_PATTERN = re.compile(r"#running-req:\s*(\d+)")
38+
39+
class RollupContext(NamedTuple):
    """Resolved runtime context shared by CSV enrichment helpers."""

    # Benchmark config name from the runtime config's "name", falling back to
    # the submit metadata's "job_name"; None when neither is available.
    config_name: str | None
    # Resource topology settings (node/worker/GPU counts); None when no
    # runtime config or metadata could be read.
    resources: dict[str, Any] | None
    # Inference backend identifier (e.g. "sglang"); None when undetermined.
    backend_type: str | None
1146
1247
1348def _get_percentile (percentiles : list , target : float ) -> float | None :
@@ -20,24 +55,273 @@ def _get_percentile(percentiles: list, target: float) -> float | None:
2055 return None
2156
2257
def _read_yaml_dict(path: Path) -> dict[str, Any] | None:
    """Read a YAML file into a dictionary.

    Returns an empty dict for an empty/falsy document, None when the file
    cannot be parsed or the top-level value is not a mapping.
    """
    try:
        # Falsy documents (empty file, null) normalize to an empty mapping.
        loaded = yaml.safe_load(path.read_text()) or {}
    except Exception as exc:
        print(f"Failed to parse {path}: {exc}", file=sys.stderr)
        return None
    if isinstance(loaded, dict):
        return loaded
    return None
67+
68+
69+ def _read_json_dict (path : Path ) -> dict [str , Any ] | None :
70+ """Read a JSON file into a dictionary."""
71+ try :
72+ data = json .loads (path .read_text ())
73+ except Exception as exc :
74+ print (f"Failed to parse { path } : { exc } " , file = sys .stderr )
75+ return None
76+
77+ return data if isinstance (data , dict ) else None
78+
79+
def _read_runtime_config(log_dir: Path) -> dict[str, Any] | None:
    """Read resolved runtime config, preferring override-expanded configs."""
    parent = log_dir.parent

    # Override-expanded configs (config_*.yaml next to the log dir) win over
    # the raw config.yaml inside the log dir.
    for candidate in sorted(parent.glob("config_*.yaml")):
        loaded = _read_yaml_dict(candidate)
        if loaded:
            return loaded

    fallback = log_dir / "config.yaml"
    if not fallback.exists():
        return None
    return _read_yaml_dict(fallback)
95+
96+
def _read_job_metadata(log_dir: Path) -> dict[str, Any] | None:
    """Read submit metadata JSON from the output directory when available."""
    # First parseable, non-empty JSON object in the output dir wins.
    for candidate in sorted(log_dir.parent.glob("*.json")):
        parsed = _read_json_dict(candidate)
        if parsed:
            return parsed
    return None
105+
106+
def _load_rollup_context(log_dir: Path) -> RollupContext:
    """Load config name, resources, and backend type once for downstream helpers."""
    runtime_config = _read_runtime_config(log_dir)
    metadata = _read_job_metadata(log_dir)

    def _nonempty_str(source: dict[str, Any] | None, key: str) -> str | None:
        # Return source[key] only when source is truthy and the value is a
        # non-empty string.
        if not source:
            return None
        value = source.get(key)
        if isinstance(value, str) and value:
            return value
        return None

    # Runtime config "name" wins; submit metadata "job_name" is the fallback.
    config_name = _nonempty_str(runtime_config, "name") or _nonempty_str(metadata, "job_name")

    # Metadata resources win over runtime-config resources.
    resources: dict[str, Any] | None = None
    for source in (metadata, runtime_config):
        candidate = source.get("resources") if source else None
        if isinstance(candidate, dict):
            resources = candidate
            break

    backend_type = _nonempty_str(metadata, "backend_type")
    if backend_type is None and runtime_config:
        backend = runtime_config.get("backend")
        if isinstance(backend, dict):
            backend_type = _nonempty_str(backend, "type")
            if backend_type is None and "sglang_config" in backend:
                # Presence of an sglang_config section implies the backend.
                backend_type = "sglang"

    return RollupContext(
        config_name=config_name,
        resources=resources,
        backend_type=backend_type,
    )
151+
152+
153+ def _compute_total_gpu_count (resources : dict [str , Any ]) -> int | None :
154+ """Compute total GPU count from resources using the same topology semantics as the config."""
155+ gpus_per_node_raw = resources .get ("gpus_per_node" )
156+ if gpus_per_node_raw in (None , 0 ):
157+ return None
158+ gpus_per_node = int (gpus_per_node_raw )
159+
160+ prefill_nodes = int (resources .get ("prefill_nodes" , 0 ) or 0 )
161+ decode_nodes = int (resources .get ("decode_nodes" , 0 ) or 0 )
162+ if prefill_nodes or decode_nodes :
163+ return (prefill_nodes + decode_nodes ) * gpus_per_node
164+
165+ agg_nodes = int (resources .get ("agg_nodes" , 0 ) or 0 )
166+ if agg_nodes :
167+ return agg_nodes * gpus_per_node
168+
169+ return gpus_per_node
170+
171+
def _compute_gpu_counts(resources: dict[str, Any]) -> tuple[int | None, int | None]:
    """Compute total and decode GPU counts from resource settings."""
    total = _compute_total_gpu_count(resources)

    decode_workers = int(resources.get("decode_workers", 0) or 0)
    raw_decode_nodes = resources.get("decode_nodes")
    decode_nodes = None if raw_decode_nodes in (None, "") else int(raw_decode_nodes)

    # An explicit per-decode GPU count wins over any derived value.
    explicit_decode = resources.get("gpus_per_decode")
    if explicit_decode not in (None, 0):
        per_decode = int(explicit_decode)
        return total, (decode_workers * per_decode if decode_workers else per_decode)

    raw_per_node = resources.get("gpus_per_node")
    gpus_per_node = None if raw_per_node in (None, 0) else int(raw_per_node)

    if gpus_per_node is None and total not in (None, 0):
        # Back-derive the per-node count when only the total is known and it
        # divides evenly across the nodes.
        prefill_nodes = int(resources.get("prefill_nodes", 0) or 0)
        node_count = prefill_nodes + (decode_nodes or 0)
        if node_count > 0 and total % node_count == 0:
            gpus_per_node = total // node_count

    if decode_nodes not in (None, 0) and gpus_per_node not in (None, 0):
        decode_total = decode_nodes * gpus_per_node
        if decode_workers:
            # Round down to whole GPUs per worker, then scale back up.
            return total, decode_workers * (decode_total // decode_workers)
        return total, decode_total

    if decode_nodes == 0 and decode_workers:
        # Decode workers without dedicated decode nodes: size them like the
        # prefill workers.
        explicit_prefill = resources.get("gpus_per_prefill")
        if explicit_prefill not in (None, 0):
            per_prefill: int | None = int(explicit_prefill)
        else:
            prefill_nodes_raw = resources.get("prefill_nodes")
            prefill_workers_raw = resources.get("prefill_workers")
            if (
                prefill_nodes_raw not in (None, 0)
                and prefill_workers_raw not in (None, 0)
                and gpus_per_node not in (None, 0)
            ):
                per_prefill = (int(prefill_nodes_raw) * int(gpus_per_node)) // int(prefill_workers_raw)
            else:
                per_prefill = gpus_per_node
        if per_prefill not in (None, 0):
            return total, decode_workers * per_prefill

    return total, None
216+
217+
def _extract_p90_decode_running_requests(log_dir: Path, context: RollupContext) -> int | None:
    """Stream decode logs and compute the nearest-rank P90 of #running-req values."""
    resources = context.resources
    # Only meaningful for disaggregated sglang runs: prefill and decode nodes
    # must both be present, with no aggregated workers.
    if context.backend_type != "sglang" or not isinstance(resources, dict):
        return None
    if int(resources.get("prefill_nodes", 0) or 0) <= 0:
        return None
    if int(resources.get("decode_nodes", 0) or 0) <= 0:
        return None
    if int(resources.get("agg_workers", 0) or 0) != 0:
        return None

    # Histogram of running-request counts; keeps memory flat while streaming
    # arbitrarily large logs.
    histogram: Counter[int] = Counter()
    observed = 0

    for log_path in sorted(log_dir.glob("*decode*.out")):
        try:
            with log_path.open("r", errors="replace") as handle:
                for raw_line in handle:
                    found = RUNNING_REQ_PATTERN.search(raw_line)
                    if found:
                        histogram[int(found.group(1))] += 1
                        observed += 1
        except OSError as exc:
            print(f"Failed to read {log_path}: {exc}", file=sys.stderr)

    if not observed:
        return None

    # Nearest-rank percentile: walk the histogram in ascending value order
    # until the cumulative count reaches the 90th rank.
    target_rank = math.ceil(observed * 0.9)
    seen = 0
    for value in sorted(histogram):
        seen += histogram[value]
        if seen >= target_rank:
            return value

    return None
257+
258+
259+ def _safe_ratio (numerator : float | int | None , denominator : float | int | None ) -> float | None :
260+ """Return numerator / denominator when both values are valid and denominator != 0."""
261+ if numerator is None or denominator in (None , 0 ):
262+ return None
263+ return float (numerator ) / float (denominator )
264+
265+
266+ def _format_csv_value (value : object ) -> str :
267+ """Format CSV values with at most three decimal places for numeric fields."""
268+ if value is None :
269+ return ""
270+ if isinstance (value , int ):
271+ return str (value )
272+ if isinstance (value , float ):
273+ return f"{ value :.3f} " .rstrip ("0" ).rstrip ("." )
274+ return str (value )
275+
276+
def _build_csv_row(
    data: dict[str, object],
    config_name: str,
    gpu_num: int | None,
    decode_gpu_count: int | None,
    p90_decode_running_requests: int | None,
) -> dict[str, object]:
    """Build one CSV row from a parsed sa-bench result."""
    throughput_total = data.get("total_token_throughput")
    tpot_median = data.get("median_tpot_ms")
    raw_values = {
        "Config": config_name,
        "Total GPU Count": gpu_num,
        "Decode GPU Count": decode_gpu_count,
        "Concurrency": data.get("max_concurrency"),
        "Total Token Throughput": throughput_total,
        "Output Token Throughput": data.get("output_throughput"),
        "Median TTFT": data.get("median_ttft_ms"),
        "Median TPOT": tpot_median,
        "Median ITL": data.get("median_itl_ms"),
        "P90 Decode Running Requests": p90_decode_running_requests,
        # 1000 ms/s divided by ms-per-output-token = tokens/s for one user.
        "Output Token Throughput per User": _safe_ratio(1000.0, tpot_median),
        "Total Token Throughput per GPU": _safe_ratio(throughput_total, gpu_num),
    }
    return {column: _format_csv_value(raw) for column, raw in raw_values.items()}
302+
303+
23304def main (log_dir : Path ) -> None :
24- """Generate benchmark-rollup.json from sa-bench result files."""
305+ """Generate benchmark-rollup.json and benchmark-rollup.csv from sa-bench result files."""
25306 result_files = sorted (log_dir .glob ("sa-bench_*/results_*.json" ))
26307 if not result_files :
27308 print ("No sa-bench results found" , file = sys .stderr )
28309 return
29310
30311 runs = []
312+ csv_rows = []
31313 config = {}
314+ context = _load_rollup_context (log_dir )
315+ total_gpu_count , decode_gpu_count = _compute_gpu_counts (context .resources ) if context .resources else (None , None )
316+ p90_decode_running_requests = _extract_p90_decode_running_requests (log_dir , context )
32317
33- for f in result_files :
318+ for result_file in result_files :
34319 try :
35- data = json .loads (f .read_text ())
36- except json .JSONDecodeError as e :
37- print (f"Failed to parse { f } : { e } " , file = sys .stderr )
320+ data = json .loads (result_file .read_text ())
321+ except json .JSONDecodeError as exc :
322+ print (f"Failed to parse { result_file } : { exc } " , file = sys .stderr )
38323 continue
39324
40- # Extract config from first file
41325 if not config :
42326 config = {
43327 "model" : data .get ("model_id" ),
@@ -61,16 +345,38 @@ def main(log_dir: Path) -> None:
61345 "total_output_tokens" : data .get ("total_output" ),
62346 })
63347
348+ csv_rows .append (
349+ _build_csv_row (
350+ data = data ,
351+ config_name = context .config_name or str (data .get ("model_id" ) or "unknown" ),
352+ gpu_num = total_gpu_count ,
353+ decode_gpu_count = decode_gpu_count ,
354+ p90_decode_running_requests = p90_decode_running_requests ,
355+ )
356+ )
357+
358+ if not runs :
359+ print ("No valid sa-bench results found" , file = sys .stderr )
360+ return
361+
64362 rollup = {
65363 "benchmark_type" : "sa-bench" ,
66364 "timestamp" : datetime .now (timezone .utc ).isoformat ().replace ("+00:00" , "Z" ),
67365 "config" : config ,
68366 "runs" : runs ,
69367 }
70368
71- output_path = log_dir / "benchmark-rollup.json"
72- output_path .write_text (json .dumps (rollup , indent = 2 ))
73- print (f"Wrote { output_path } " )
369+ json_path = log_dir / "benchmark-rollup.json"
370+ json_path .write_text (json .dumps (rollup , indent = 2 ))
371+ print (f"Wrote { json_path } " )
372+
373+ csv_rows .sort (key = lambda row : int (row ["Concurrency" ]) if row ["Concurrency" ] else - 1 )
374+ csv_path = log_dir / "benchmark-rollup.csv"
375+ with csv_path .open ("w" , newline = "" ) as csv_file :
376+ writer = csv .DictWriter (csv_file , fieldnames = OUTPUT_FIELDS )
377+ writer .writeheader ()
378+ writer .writerows (csv_rows )
379+ print (f"Wrote { csv_path } " )
74380
75381
76382if __name__ == "__main__" :
0 commit comments