
Commit d6deb75 · API latency calculation

1 parent e06438f

20 files changed: 980 additions & 606 deletions

config/system.yaml
Lines changed: 1 addition & 0 deletions

@@ -274,6 +274,7 @@ storage:
       - "response"
       - "api_input_tokens"
       - "api_output_tokens"
+      - "api_latency"
       # Streaming performance metrics (only populated when using streaming endpoint)
       - "time_to_first_token" # Time to first token in seconds
       - "streaming_duration" # Total streaming duration in seconds

src/lightspeed_evaluation/core/constants.py
Lines changed: 1 addition & 0 deletions

@@ -106,6 +106,7 @@
     # Streaming performance metrics
     "time_to_first_token",
     "streaming_duration",
+    "api_latency",
     "tokens_per_second",
     "tool_calls",
     "contexts",

src/lightspeed_evaluation/core/models/data.py
Lines changed: 10 additions & 0 deletions

@@ -84,6 +84,11 @@ class TurnData(StreamingMetricsMixin):
         default=0, ge=0, description="Output tokens used by API call"
     )

+    # API execution time tracking (per turn)
+    api_latency: float = Field(
+        default=0, ge=0, description="API call latency for this turn in seconds"
+    )
+
     # Per-turn metrics support
     turn_metrics: Optional[list[str]] = Field(
         default=None,

@@ -507,6 +512,11 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
     execution_time: float = Field(
         default=0, ge=0, description="Execution time in seconds"
     )
+    api_latency: float = Field(
+        default=0,
+        ge=0,
+        description="API latency in seconds (per turn or average for conversation)",
+    )
     api_input_tokens: int = Field(default=0, ge=0, description="API input tokens used")
     api_output_tokens: int = Field(
         default=0, ge=0, description="API output tokens used"
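Both new api_latency fields carry Pydantic's ge=0 constraint, so a negative latency is rejected at model construction time instead of leaking into reports. A minimal sketch of that behavior, assuming Pydantic v2; TurnDataSketch is a stripped-down stand-in for the real TurnData:

    from pydantic import BaseModel, Field, ValidationError

    class TurnDataSketch(BaseModel):
        # Same constraint as the field added above
        api_latency: float = Field(
            default=0, ge=0, description="API call latency for this turn in seconds"
        )

    print(TurnDataSketch(api_latency=0.42).api_latency)  # 0.42
    print(TurnDataSketch().api_latency)  # 0 (default when the API is not called)

    try:
        TurnDataSketch(api_latency=-1.0)
    except ValidationError as err:
        print(err.errors()[0]["type"])  # greater_than_equal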

src/lightspeed_evaluation/core/models/quality.py
Lines changed: 17 additions & 8 deletions

@@ -5,11 +5,18 @@
 """

 import logging
-from typing import Optional
+from typing import TYPE_CHECKING, Optional

 from pydantic import BaseModel, Field

-from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics
+from lightspeed_evaluation.core.models.summary import (
+    MetricStats,
+    NumericStats,
+    ScoreStatistics,
+)
+
+if TYPE_CHECKING:
+    from lightspeed_evaluation.core.models.data import EvaluationResult

 logger = logging.getLogger(__name__)

@@ -44,8 +51,8 @@ class QualityReport(BaseModel):
         default_factory=list,
         description="Warnings about quality metrics configuration or usage",
     )
-    api_latency: float = Field(
-        default=0.0, description="[Placeholder] Average API response time in seconds"
+    api_latency: Optional[NumericStats] = Field(
+        default=None, description="API latency statistics"
     )
     api_tokens: int = Field(
         default=0,

@@ -55,7 +62,9 @@ class QualityReport(BaseModel):
     @staticmethod
     def create_report(
         by_metric: dict[str, MetricStats],
+        api_latency_summary: Optional[NumericStats],
         quality_score_metrics: list[str],
+        results: Optional[list["EvaluationResult"]] = None,
     ) -> Optional["QualityReport"]:
         """Creates a quality report with aggregated quality score from selected metrics.

@@ -66,6 +75,7 @@ def create_report(
             by_metric: Dictionary mapping metric identifiers to their computed statistics.
             quality_score_metrics: Metric identifiers to include in quality score calculation.
                 All specified metrics must exist in by_metric.
+            results: Optional list of EvaluationResult for computing API latency statistics.

         Returns:
             QualityReport with aggregated quality score and separated quality/extra metrics,

@@ -75,6 +85,7 @@ def create_report(
             ValueError: If any quality_score_metrics are not found in by_metric.
         """
         warnings: list[str] = []
+        _ = results

         # Validate all quality score metrics exist in computed metrics (by_metric)
         missing_metrics = [m for m in quality_score_metrics if m not in by_metric]

@@ -148,14 +159,12 @@ def create_report(
         if stats is not None:
             extra_metrics[metric_id] = stats

-        # Calculate aggregated quality score
-        aggregated_score = QualityReport._calculate_quality_score(quality_metrics)
-
         return QualityReport(
-            quality_score=aggregated_score,
+            quality_score=QualityReport._calculate_quality_score(quality_metrics),
            quality_metrics=quality_metrics,
            extra_metrics=extra_metrics,
            warnings=warnings,
+            api_latency=api_latency_summary,
         )

     @staticmethod
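Two details worth noting in this hunk: the new results parameter is accepted but currently discarded (the `_ = results` line), since latency statistics arrive pre-computed via api_latency_summary; and the `if TYPE_CHECKING:` guard makes EvaluationResult visible to static type checkers without importing the data module at runtime, a common way to sidestep import-time cost or circular imports. A self-contained sketch of that pattern (runnable as-is, because the guarded import never executes at runtime):

    from typing import TYPE_CHECKING, Optional

    if TYPE_CHECKING:
        # Resolved by the type checker only; skipped at runtime
        from lightspeed_evaluation.core.models.data import EvaluationResult

    def count_results(results: Optional[list["EvaluationResult"]] = None) -> int:
        # The quoted forward reference defers resolution until type checking
        return 0 if results is None else len(results)

    print(count_results())  # 0 - runs even where the data module is unavailable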

src/lightspeed_evaluation/core/models/summary.py
Lines changed: 31 additions & 2 deletions

@@ -14,20 +14,23 @@
 from lightspeed_evaluation.core.models.data import EvaluationData, EvaluationResult
 from lightspeed_evaluation.core.output.statistics import (
     bootstrap_intervals,
+    calculate_api_latency_stats,
     calculate_api_token_usage,
     calculate_streaming_stats,
 )


 class NumericStats(BaseModel):
-    """Numeric statistics for a set of values (e.g., TTFT, duration)."""
+    """Numeric statistics for a set of values (e.g., TTFT, duration, latency)."""

     count: int = Field(default=0, description="Number of values")
     mean: Optional[float] = Field(default=None, description="Mean value")
     median: Optional[float] = Field(default=None, description="Median value")
     std: Optional[float] = Field(default=None, description="Standard deviation")
     min_value: Optional[float] = Field(default=None, description="Minimum value")
     max_value: Optional[float] = Field(default=None, description="Maximum value")
+    p95: Optional[float] = Field(default=None, description="95th percentile")
+    p99: Optional[float] = Field(default=None, description="99th percentile")


 class ScoreStatistics(BaseModel):

@@ -137,6 +140,9 @@ class EvaluationSummary(BaseModel):
     api_tokens: Optional[ApiTokenUsage] = Field(
         default=None, description="API token usage (when evaluation data provided)"
     )
+    api_latency: Optional[NumericStats] = Field(
+        default=None, description="API latency statistics (when API enabled)"
+    )
     streaming: Optional[StreamingStats] = Field(
         default=None, description="Streaming performance stats (when available)"
     )

@@ -178,6 +184,9 @@ def from_results(
         api_tokens = _compute_api_token_usage(evaluation_data)
         streaming = _compute_streaming_stats(evaluation_data)

+        # Compute API latency statistics from results
+        api_latency = _compute_api_latency_stats(results)
+
         return cls(
             timestamp=timestamp,
             results=results,

@@ -186,6 +195,7 @@ def from_results(
             by_conversation=by_conversation,
             by_tag=by_tag,
             api_tokens=api_tokens,
+            api_latency=api_latency,
             streaming=streaming,
         )

@@ -417,7 +427,7 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
     """Convert a raw numeric stats dictionary to a NumericStats model.

     Args:
-        raw: Dictionary with count, mean, median, std, min, max keys.
+        raw: Dictionary with count, mean, median, std, min, max, p95, p99 keys.

     Returns:
         NumericStats instance, or None if count is 0.

@@ -432,6 +442,8 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
         std=raw.get("std"),
         min_value=raw.get("min"),
         max_value=raw.get("max"),
+        p95=raw.get("p95"),
+        p99=raw.get("p99"),
     )


@@ -454,6 +466,23 @@ def _compute_api_token_usage(
     )


+def _compute_api_latency_stats(
+    results: Optional[list[EvaluationResult]],
+) -> Optional[NumericStats]:
+    """Compute API latency statistics from evaluation results.
+
+    Args:
+        results: List of evaluation results containing API latency data.
+
+    Returns:
+        NumericStats instance, or None if no API latency data available.
+    """
+    if not results:
+        return None
+    raw = calculate_api_latency_stats(results)
+    return _numeric_stats_from_dict(raw)
+
+
 def _compute_streaming_stats(
     evaluation_data: list[EvaluationData],
 ) -> Optional[StreamingStats]:
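_compute_api_latency_stats is a thin bridge: it returns None when the API was never called, and otherwise converts the raw dict from calculate_api_latency_stats into a typed NumericStats. Note the key mapping inside _numeric_stats_from_dict: the raw dict uses "min"/"max" while the model uses min_value/max_value. A sketch of that conversion, assuming Pydantic v2 and trimming NumericStats to its fields:

    from typing import Any, Optional

    from pydantic import BaseModel

    class NumericStats(BaseModel):
        count: int = 0
        mean: Optional[float] = None
        median: Optional[float] = None
        std: Optional[float] = None
        min_value: Optional[float] = None
        max_value: Optional[float] = None
        p95: Optional[float] = None
        p99: Optional[float] = None

    def numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
        # Mirrors _numeric_stats_from_dict: "min"/"max" keys map onto
        # min_value/max_value fields; an empty sample yields None.
        if raw.get("count", 0) == 0:
            return None
        return NumericStats(
            count=raw["count"],
            mean=raw.get("mean"),
            median=raw.get("median"),
            std=raw.get("std"),
            min_value=raw.get("min"),
            max_value=raw.get("max"),
            p95=raw.get("p95"),
            p99=raw.get("p99"),
        )

    print(numeric_stats_from_dict({"count": 0}))  # None
    print(numeric_stats_from_dict({"count": 2, "mean": 0.5, "min": 0.4, "max": 0.6}))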

src/lightspeed_evaluation/core/output/generator.py
Lines changed: 33 additions & 1 deletion

@@ -21,6 +21,7 @@
     ConversationStats,
     EvaluationSummary,
     MetricStats,
+    NumericStats,
     OverallStats,
     StreamingStats,
     TagStats,

@@ -91,7 +92,9 @@ def generate_reports(
         if quality_score_metrics:
             quality_report = QualityReport.create_report(
                 summary.by_metric,
+                summary.api_latency,
                 quality_score_metrics,
+                results=results,
             )

         # Prepare timestamped base filename

@@ -350,7 +353,11 @@ def _generate_quality_score_report(
                 }
                 for metric_id, stats in quality_report.extra_metrics.items()
             },
-            "api_latency": quality_report.api_latency,
+            "api_latency": (
+                quality_report.api_latency.model_dump()
+                if quality_report.api_latency
+                else None
+            ),
             "api_tokens": quality_report.api_tokens,
             "warnings": quality_report.warnings,
         }

@@ -408,6 +415,9 @@ def _generate_text_summary_from_model(
         # Token usage statistics
         self._write_token_stats(f, basic_stats, api_tokens)

+        # API latency statistics
+        self._write_api_latency_stats(f, summary.api_latency)
+
         # Streaming performance statistics
         self._write_streaming_stats(f, streaming_stats)

@@ -456,6 +466,24 @@ def _write_token_stats(
         f.write(f"Output Tokens: {api_tokens.get('total_api_output_tokens', 0):,}\n")
         f.write(f"Total Tokens: {api_tokens.get('total_api_tokens', 0):,}\n\n")

+    def _write_api_latency_stats(
+        self, f: Any, api_latency: Optional["NumericStats"]
+    ) -> None:
+        """Write API latency statistics section."""
+        if api_latency is None or api_latency.count == 0:
+            return  # No API latency data available
+
+        f.write("API Latency (seconds):\n")
+        f.write("-" * 20 + "\n")
+        f.write(f"Count: {api_latency.count}\n")
+        f.write(f"Mean: {api_latency.mean:.3f}s\n")
+        f.write(f"Median: {api_latency.median:.3f}s\n")
+        f.write(f"Std Dev: {api_latency.std:.3f}s\n")
+        f.write(f"Min: {api_latency.min_value:.3f}s\n")
+        f.write(f"Max: {api_latency.max_value:.3f}s\n")
+        f.write(f"P95: {api_latency.p95:.3f}s\n")
+        f.write(f"P99: {api_latency.p99:.3f}s\n\n")
+
     def _write_streaming_stats(self, f: Any, streaming_stats: dict[str, Any]) -> None:
         """Write streaming performance statistics section."""
         # Check if there are any streaming metrics

@@ -704,6 +732,9 @@ def _build_json_summary_stats(summary: EvaluationSummary) -> dict[str, Any]:
         "by_tag": _tag_stats_to_dict(summary.by_tag),
     }

+    if summary.api_latency is not None:
+        result["api_latency"] = summary.api_latency.model_dump()
+
     if summary.streaming is not None:
         result["streaming_performance"] = _streaming_stats_to_dict(summary.streaming)

@@ -735,6 +766,7 @@ def _result_to_json_dict(r: EvaluationResult) -> dict[str, Any]:
         ),
         "time_to_first_token": r.time_to_first_token,
         "streaming_duration": r.streaming_duration,
+        "api_latency": r.api_latency,
         "tokens_per_second": r.tokens_per_second,
     }
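Given the f.write calls above, the new text-summary section renders as sketched below. The values are illustrative only, produced with an in-memory buffer and a simple stand-in object rather than the real report file and NumericStats:

    import io
    from types import SimpleNamespace

    # Hypothetical latency stats; stands in for a populated NumericStats
    stats = SimpleNamespace(count=5, mean=0.652, median=0.550, std=0.312,
                            min_value=0.420, max_value=1.200, p95=1.082, p99=1.176)

    f = io.StringIO()
    f.write("API Latency (seconds):\n")
    f.write("-" * 20 + "\n")
    f.write(f"Count: {stats.count}\n")
    f.write(f"Mean: {stats.mean:.3f}s\n")
    f.write(f"Median: {stats.median:.3f}s\n")
    f.write(f"Std Dev: {stats.std:.3f}s\n")
    f.write(f"Min: {stats.min_value:.3f}s\n")
    f.write(f"Max: {stats.max_value:.3f}s\n")
    f.write(f"P95: {stats.p95:.3f}s\n")
    f.write(f"P99: {stats.p99:.3f}s\n\n")
    print(f.getvalue())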

src/lightspeed_evaluation/core/output/statistics.py
Lines changed: 21 additions & 1 deletion

@@ -181,7 +181,7 @@ def _calculate_numeric_stats(values: list[float]) -> dict[str, Any]:
         values: List of numeric values.

     Returns:
-        Dictionary containing count, mean, median, std, min, max.
+        Dictionary containing count, mean, median, std, min, max, p95, p99.
     """
     if not values:
         return {"count": 0}

@@ -193,6 +193,8 @@ def _calculate_numeric_stats(values: list[float]) -> dict[str, Any]:
         "std": statistics.stdev(values) if len(values) > 1 else 0.0,
         "min": min(values),
         "max": max(values),
+        "p95": float(np.percentile(values, 95)),
+        "p99": float(np.percentile(values, 99)),
     }


@@ -304,3 +306,21 @@ def calculate_streaming_stats(
         "streaming_duration": _calculate_numeric_stats(duration_values),
         "tokens_per_second": _calculate_numeric_stats(throughput_values),
     }
+
+
+def calculate_api_latency_stats(results: list[EvaluationResult]) -> dict[str, Any]:
+    """Calculate API latency statistics from evaluation results.
+
+    Args:
+        results: List of evaluation results containing API latency data.
+
+    Returns:
+        Dictionary containing API latency statistics with percentiles (p50, p95, p99).
+    """
+    api_latency_values: list[float] = []
+
+    for result in results:
+        if result.api_latency > 0:
+            api_latency_values.append(result.api_latency)
+
+    return _calculate_numeric_stats(api_latency_values)
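calculate_api_latency_stats deliberately drops zero values, so turns where the API was never invoked (api_latency left at its default of 0) do not drag the mean and percentiles down. The percentile math comes from NumPy. A standalone sketch that mirrors _calculate_numeric_stats rather than importing the real module:

    import statistics
    from typing import Any

    import numpy as np

    def numeric_stats(values: list[float]) -> dict[str, Any]:
        # Mirrors _calculate_numeric_stats, including the single-value
        # std-dev guard and the NumPy-based percentiles.
        if not values:
            return {"count": 0}
        return {
            "count": len(values),
            "mean": statistics.mean(values),
            "median": statistics.median(values),
            "std": statistics.stdev(values) if len(values) > 1 else 0.0,
            "min": min(values),
            "max": max(values),
            "p95": float(np.percentile(values, 95)),
            "p99": float(np.percentile(values, 99)),
        }

    latencies = [0.42, 0.55, 0.61, 0.48, 1.20]  # seconds, illustrative
    print(numeric_stats(latencies))
    print(numeric_stats([]))  # {'count': 0}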

src/lightspeed_evaluation/core/storage/sql_storage.py
Lines changed: 2 additions & 0 deletions

@@ -66,6 +66,7 @@ class EvaluationResultDB(Base):  # pylint: disable=too-few-public-methods
     judge_scores = Column(Text, nullable=True)
     time_to_first_token = Column(Float, nullable=True)
     streaming_duration = Column(Float, nullable=True)
+    api_latency = Column(Float, nullable=True)
     tokens_per_second = Column(Float, nullable=True)
     tool_calls = Column(Text, nullable=True)
     contexts = Column(Text, nullable=True)

@@ -326,6 +327,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB:
         judge_scores=self._serialize_judge_scores(result.judge_scores),
         time_to_first_token=result.time_to_first_token,
         streaming_duration=result.streaming_duration,
+        api_latency=result.api_latency,
         tokens_per_second=result.tokens_per_second,
         tool_calls=result.tool_calls,
         contexts=result.contexts,
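The new column is nullable, so rows written without latency data simply store NULL; note that SQLAlchemy's create_all does not add columns to an existing table, so a pre-existing database would need a migration (an assumption about deployment, not something this commit addresses). A minimal sketch against an in-memory SQLite database, assuming SQLAlchemy 1.4+ and trimming EvaluationResultDB to the relevant column:

    from sqlalchemy import Column, Float, Integer, create_engine
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()

    class EvaluationResultDB(Base):
        __tablename__ = "evaluation_results"
        id = Column(Integer, primary_key=True)
        api_latency = Column(Float, nullable=True)  # NULL when no latency recorded

    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(EvaluationResultDB(api_latency=0.42))
        session.add(EvaluationResultDB())  # api_latency stays NULL
        session.commit()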

src/lightspeed_evaluation/core/system/loader.py
Lines changed: 1 addition & 1 deletion

@@ -149,7 +149,7 @@ def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig:
         judge_panel_data = config_data.get("judge_panel")
         judge_panel = JudgePanelConfig(**judge_panel_data) if judge_panel_data else None

-        # Parse storage backends with backward compatibility
+        # Parse storage backends with backward compatibility for legacy 'output' section
         storage_data = self._get_storage_config_with_backward_compat(config_data)
         storage_backends = self._parse_storage_config(storage_data)

src/lightspeed_evaluation/pipeline/evaluation/amender.py
Lines changed: 9 additions & 1 deletion

@@ -1,6 +1,7 @@
 """API Data Amendment module - handles API data enrichment."""

 import logging
+import time
 from typing import Any, Optional

 from lightspeed_evaluation.core.api import APIClient

@@ -37,12 +38,15 @@ def amend_single_turn(
         logger.debug("Amending turn %s with API data", turn_data.turn_id)

         try:
+            # Track API call execution time
+            api_start_time = time.perf_counter()
             api_response = self.api_client.query(
                 query=turn_data.query,
                 conversation_id=conversation_id,
                 attachments=turn_data.attachments,
                 extra_request_params=turn_data.extra_request_params,
             )
+            api_latency = time.perf_counter() - api_start_time

             # AMEND EVALUATION DATA: This modifies the loaded TurnData object in-place
             # Update response from API

@@ -63,11 +67,15 @@ def amend_single_turn(
             # Update token usage from API output (with fallback to 0 if not present)
             turn_data.api_input_tokens = getattr(api_response, "input_tokens", 0)
             turn_data.api_output_tokens = getattr(api_response, "output_tokens", 0)
+
+            # Update API latency
+            turn_data.api_latency = api_latency
             logger.debug(
-                "Token usage for turn %s: input=%d, output=%d",
+                "Token usage for turn %s: input=%d, output=%d, API latency=%.3fs",
                 turn_data.turn_id,
                 turn_data.api_input_tokens,
                 turn_data.api_output_tokens,
+                api_latency,
             )

             # Update streaming performance metrics (only available for streaming endpoint)
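time.perf_counter() is a monotonic, high-resolution clock, so the measurement is immune to system clock adjustments; because it wraps the blocking query call, the recorded latency covers the full client-side round trip (request serialization, network, and server time). A minimal sketch of the pattern; fake_query is a stand-in for self.api_client.query, not a real function in this codebase:

    import time

    def fake_query(query: str) -> str:
        time.sleep(0.1)  # simulate network + server work
        return f"response to {query!r}"

    api_start_time = time.perf_counter()  # monotonic: safe for intervals
    response = fake_query("what is the API latency?")
    api_latency = time.perf_counter() - api_start_time

    print(f"API latency={api_latency:.3f}s")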
