1 change: 1 addition & 0 deletions config/system.yaml
@@ -274,6 +274,7 @@ storage:
- "response"
- "api_input_tokens"
- "api_output_tokens"
- "api_latency"
# Streaming performance metrics (only populated when using streaming endpoint)
- "time_to_first_token" # Time to first token in seconds
- "streaming_duration" # Total streaming duration in seconds
1 change: 1 addition & 0 deletions src/lightspeed_evaluation/core/constants.py
@@ -106,6 +106,7 @@
# Streaming performance metrics
"time_to_first_token",
"streaming_duration",
"api_latency",
"tokens_per_second",
"tool_calls",
"contexts",
10 changes: 10 additions & 0 deletions src/lightspeed_evaluation/core/models/data.py
@@ -84,6 +84,11 @@ class TurnData(StreamingMetricsMixin):
default=0, ge=0, description="Output tokens used by API call"
)

# API execution time tracking (per turn)
api_latency: float = Field(
default=0, ge=0, description="API call latency for this turn in seconds"
)

# Per-turn metrics support
turn_metrics: Optional[list[str]] = Field(
default=None,
@@ -507,6 +512,11 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
execution_time: float = Field(
default=0, ge=0, description="Execution time in seconds"
)
api_latency: float = Field(
default=0,
ge=0,
description="API latency in seconds (per turn or average for conversation)",
)
api_input_tokens: int = Field(default=0, ge=0, description="API input tokens used")
api_output_tokens: int = Field(
default=0, ge=0, description="API output tokens used"
25 changes: 17 additions & 8 deletions src/lightspeed_evaluation/core/models/quality.py
@@ -5,11 +5,18 @@
"""

import logging
from typing import Optional
from typing import TYPE_CHECKING, Optional

from pydantic import BaseModel, Field

from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics
from lightspeed_evaluation.core.models.summary import (
MetricStats,
NumericStats,
ScoreStatistics,
)

if TYPE_CHECKING:
from lightspeed_evaluation.core.models.data import EvaluationResult

logger = logging.getLogger(__name__)

@@ -44,8 +51,8 @@ class QualityReport(BaseModel):
default_factory=list,
description="Warnings about quality metrics configuration or usage",
)
api_latency: float = Field(
default=0.0, description="[Placeholder] Average API response time in seconds"
api_latency: Optional[NumericStats] = Field(
default=None, description="API latency statistics"
)
api_tokens: int = Field(
default=0,
@@ -55,7 +62,9 @@
@staticmethod
def create_report(
by_metric: dict[str, MetricStats],
api_latency_summary: Optional[NumericStats],
quality_score_metrics: list[str],
results: Optional[list["EvaluationResult"]] = None,
) -> Optional["QualityReport"]:
"""Creates a quality report with aggregated quality score from selected metrics.

@@ -66,6 +75,7 @@ def create_report(
by_metric: Dictionary mapping metric identifiers to their computed statistics.
quality_score_metrics: Metric identifiers to include in quality score calculation.
All specified metrics must exist in by_metric.
results: Optional list of EvaluationResult for computing API latency statistics.

Returns:
QualityReport with aggregated quality score and separated quality/extra metrics,
@@ -75,6 +85,7 @@
ValueError: If any quality_score_metrics are not found in by_metric.
"""
warnings: list[str] = []
_ = results

# Validate all quality score metrics exist in computed metrics (by_metric)
missing_metrics = [m for m in quality_score_metrics if m not in by_metric]
@@ -148,14 +159,12 @@ def create_report(
if stats is not None:
extra_metrics[metric_id] = stats

# Calculate aggregated quality score
aggregated_score = QualityReport._calculate_quality_score(quality_metrics)

return QualityReport(
quality_score=aggregated_score,
quality_score=QualityReport._calculate_quality_score(quality_metrics),
quality_metrics=quality_metrics,
extra_metrics=extra_metrics,
warnings=warnings,
api_latency=api_latency_summary,
)

@staticmethod
33 changes: 31 additions & 2 deletions src/lightspeed_evaluation/core/models/summary.py
@@ -14,20 +14,23 @@
from lightspeed_evaluation.core.models.data import EvaluationData, EvaluationResult
from lightspeed_evaluation.core.output.statistics import (
bootstrap_intervals,
calculate_api_latency_stats,
calculate_api_token_usage,
calculate_streaming_stats,
)


class NumericStats(BaseModel):
"""Numeric statistics for a set of values (e.g., TTFT, duration)."""
"""Numeric statistics for a set of values (e.g., TTFT, duration, latency)."""

count: int = Field(default=0, description="Number of values")
mean: Optional[float] = Field(default=None, description="Mean value")
median: Optional[float] = Field(default=None, description="Median value")
std: Optional[float] = Field(default=None, description="Standard deviation")
min_value: Optional[float] = Field(default=None, description="Minimum value")
max_value: Optional[float] = Field(default=None, description="Maximum value")
p95: Optional[float] = Field(default=None, description="95th percentile")
p99: Optional[float] = Field(default=None, description="99th percentile")


class ScoreStatistics(BaseModel):
@@ -137,6 +140,9 @@ class EvaluationSummary(BaseModel):
api_tokens: Optional[ApiTokenUsage] = Field(
default=None, description="API token usage (when evaluation data provided)"
)
api_latency: Optional[NumericStats] = Field(
default=None, description="API latency statistics (when API enabled)"
)
streaming: Optional[StreamingStats] = Field(
default=None, description="Streaming performance stats (when available)"
)
@@ -178,6 +184,9 @@ def from_results(
api_tokens = _compute_api_token_usage(evaluation_data)
streaming = _compute_streaming_stats(evaluation_data)

# Compute API latency statistics from results
api_latency = _compute_api_latency_stats(results)

return cls(
timestamp=timestamp,
results=results,
@@ -186,6 +195,7 @@
by_conversation=by_conversation,
by_tag=by_tag,
api_tokens=api_tokens,
api_latency=api_latency,
streaming=streaming,
)

@@ -417,7 +427,7 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
"""Convert a raw numeric stats dictionary to a NumericStats model.

Args:
raw: Dictionary with count, mean, median, std, min, max keys.
raw: Dictionary with count, mean, median, std, min, max, p95, p99 keys.

Returns:
NumericStats instance, or None if count is 0.
@@ -432,6 +442,8 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
std=raw.get("std"),
min_value=raw.get("min"),
max_value=raw.get("max"),
p95=raw.get("p95"),
p99=raw.get("p99"),
)


@@ -454,6 +466,23 @@ def _compute_api_token_usage(
)


def _compute_api_latency_stats(
results: Optional[list[EvaluationResult]],
) -> Optional[NumericStats]:
"""Compute API latency statistics from evaluation results.

Args:
results: List of evaluation results containing API latency data.

Returns:
NumericStats instance, or None if no API latency data available.
"""
if not results:
return None
raw = calculate_api_latency_stats(results)
return _numeric_stats_from_dict(raw)


def _compute_streaming_stats(
evaluation_data: list[EvaluationData],
) -> Optional[StreamingStats]:
34 changes: 33 additions & 1 deletion src/lightspeed_evaluation/core/output/generator.py
@@ -21,6 +21,7 @@
ConversationStats,
EvaluationSummary,
MetricStats,
NumericStats,
OverallStats,
StreamingStats,
TagStats,
@@ -91,7 +92,9 @@ def generate_reports(
if quality_score_metrics:
quality_report = QualityReport.create_report(
summary.by_metric,
summary.api_latency,
quality_score_metrics,
results=results,
)

# Prepare timestamped base filename
Expand Down Expand Up @@ -350,7 +353,11 @@ def _generate_quality_score_report(
}
for metric_id, stats in quality_report.extra_metrics.items()
},
"api_latency": quality_report.api_latency,
"api_latency": (
quality_report.api_latency.model_dump()
if quality_report.api_latency
else None
),
"api_tokens": quality_report.api_tokens,
"warnings": quality_report.warnings,
}
@@ -408,6 +415,9 @@ def _generate_text_summary_from_model(
# Token usage statistics
self._write_token_stats(f, basic_stats, api_tokens)

# API latency statistics
self._write_api_latency_stats(f, summary.api_latency)

# Streaming performance statistics
self._write_streaming_stats(f, streaming_stats)

@@ -456,6 +466,24 @@ def _write_token_stats(
f.write(f"Output Tokens: {api_tokens.get('total_api_output_tokens', 0):,}\n")
f.write(f"Total Tokens: {api_tokens.get('total_api_tokens', 0):,}\n\n")

def _write_api_latency_stats(
self, f: Any, api_latency: Optional["NumericStats"]
) -> None:
"""Write API latency statistics section."""
if api_latency is None or api_latency.count == 0:
return # No API latency data available

f.write("API Latency (seconds):\n")
f.write("-" * 20 + "\n")
f.write(f"Count: {api_latency.count}\n")
f.write(f"Mean: {api_latency.mean:.3f}s\n")
f.write(f"Median: {api_latency.median:.3f}s\n")
f.write(f"Std Dev: {api_latency.std:.3f}s\n")
f.write(f"Min: {api_latency.min_value:.3f}s\n")
f.write(f"Max: {api_latency.max_value:.3f}s\n")
f.write(f"P95: {api_latency.p95:.3f}s\n")
f.write(f"P99: {api_latency.p99:.3f}s\n\n")

def _write_streaming_stats(self, f: Any, streaming_stats: dict[str, Any]) -> None:
"""Write streaming performance statistics section."""
# Check if there are any streaming metrics
@@ -704,6 +732,9 @@ def _build_json_summary_stats(summary: EvaluationSummary) -> dict[str, Any]:
"by_tag": _tag_stats_to_dict(summary.by_tag),
}

if summary.api_latency is not None:
result["api_latency"] = summary.api_latency.model_dump()

if summary.streaming is not None:
result["streaming_performance"] = _streaming_stats_to_dict(summary.streaming)

@@ -735,6 +766,7 @@ def _result_to_json_dict(r: EvaluationResult) -> dict[str, Any]:
),
"time_to_first_token": r.time_to_first_token,
"streaming_duration": r.streaming_duration,
"api_latency": r.api_latency,
"tokens_per_second": r.tokens_per_second,
}

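Once the JSON summary is written, the latency block added by _build_json_summary_stats can be read back like any other section. A minimal consumer sketch, assuming a file named evaluation_summary.json and that api_latency sits at the top level of the summary dict (both assumptions, not verified against this PR):

import json

# Hypothetical output path; adjust to wherever the generator writes its JSON summary.
with open("evaluation_summary.json", encoding="utf-8") as fh:
    summary = json.load(fh)

latency = summary.get("api_latency")  # key is only present when latency data was collected
if latency:
    print(
        f"API latency: mean={latency['mean']:.3f}s "
        f"p95={latency['p95']:.3f}s p99={latency['p99']:.3f}s"
    )

If a run never exercised the API, summary.api_latency stays None and the block is simply omitted, so consumers should treat it as optional.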
22 changes: 21 additions & 1 deletion src/lightspeed_evaluation/core/output/statistics.py
@@ -181,7 +181,7 @@ def _calculate_numeric_stats(values: list[float]) -> dict[str, Any]:
values: List of numeric values.

Returns:
Dictionary containing count, mean, median, std, min, max.
Dictionary containing count, mean, median, std, min, max, p95, p99.
"""
if not values:
return {"count": 0}
@@ -193,6 +193,8 @@ def _calculate_numeric_stats(values: list[float]) -> dict[str, Any]:
"std": statistics.stdev(values) if len(values) > 1 else 0.0,
"min": min(values),
"max": max(values),
"p95": float(np.percentile(values, 95)),
"p99": float(np.percentile(values, 99)),
}


Expand Down Expand Up @@ -304,3 +306,21 @@ def calculate_streaming_stats(
"streaming_duration": _calculate_numeric_stats(duration_values),
"tokens_per_second": _calculate_numeric_stats(throughput_values),
}


def calculate_api_latency_stats(results: list[EvaluationResult]) -> dict[str, Any]:
"""Calculate API latency statistics from evaluation results.

Args:
results: List of evaluation results containing API latency data.

Returns:
Dictionary containing API latency statistics with percentiles (p50, p95, p99).
"""
api_latency_values: list[float] = []

for result in results:
if result.api_latency > 0:
api_latency_values.append(result.api_latency)

return _calculate_numeric_stats(api_latency_values)
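The new percentile fields rely on numpy's default linear interpolation, so even a handful of turns yields defined p95/p99 values; with small samples they are interpolated from the largest observations rather than being true tail percentiles. A standalone sketch of the same computation on made-up latencies (illustration only, not code from this PR):

import statistics

import numpy as np

latencies = [0.42, 0.51, 0.38, 1.20, 0.47]  # hypothetical per-turn latencies in seconds
stats = {
    "count": len(latencies),
    "mean": statistics.mean(latencies),
    "median": statistics.median(latencies),
    "std": statistics.stdev(latencies) if len(latencies) > 1 else 0.0,
    "min": min(latencies),
    "max": max(latencies),
    "p95": float(np.percentile(latencies, 95)),  # interpolated between the two largest values
    "p99": float(np.percentile(latencies, 99)),
}
print(stats)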
2 changes: 2 additions & 0 deletions src/lightspeed_evaluation/core/storage/sql_storage.py
@@ -66,6 +66,7 @@ class EvaluationResultDB(Base): # pylint: disable=too-few-public-methods
judge_scores = Column(Text, nullable=True)
time_to_first_token = Column(Float, nullable=True)
streaming_duration = Column(Float, nullable=True)
api_latency = Column(Float, nullable=True)
tokens_per_second = Column(Float, nullable=True)
tool_calls = Column(Text, nullable=True)
contexts = Column(Text, nullable=True)
@@ -326,6 +327,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB:
judge_scores=self._serialize_judge_scores(result.judge_scores),
time_to_first_token=result.time_to_first_token,
streaming_duration=result.streaming_duration,
api_latency=result.api_latency,
tokens_per_second=result.tokens_per_second,
tool_calls=result.tool_calls,
contexts=result.contexts,
10 changes: 9 additions & 1 deletion src/lightspeed_evaluation/pipeline/evaluation/amender.py
@@ -1,6 +1,7 @@
"""API Data Amendment module - handles API data enrichment."""

import logging
import time
from typing import Any, Optional

from lightspeed_evaluation.core.api import APIClient
@@ -37,12 +38,15 @@ def amend_single_turn(
logger.debug("Amending turn %s with API data", turn_data.turn_id)

try:
# Track API call execution time
api_start_time = time.perf_counter()
api_response = self.api_client.query(
query=turn_data.query,
conversation_id=conversation_id,
attachments=turn_data.attachments,
extra_request_params=turn_data.extra_request_params,
)
api_latency = time.perf_counter() - api_start_time

# AMEND EVALUATION DATA: This modifies the loaded TurnData object in-place
# Update response from API
@@ -63,11 +67,15 @@
# Update token usage from API output (with fallback to 0 if not present)
turn_data.api_input_tokens = getattr(api_response, "input_tokens", 0)
turn_data.api_output_tokens = getattr(api_response, "output_tokens", 0)

# Update API latency
turn_data.api_latency = api_latency
logger.debug(
"Token usage for turn %s: input=%d, output=%d",
"Token usage for turn %s: input=%d, output=%d, API latency=%.3fs",
turn_data.turn_id,
turn_data.api_input_tokens,
turn_data.api_output_tokens,
api_latency,
)

# Update streaming performance metrics (only available for streaming endpoint)
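The measurement here wraps the blocking api_client.query call with time.perf_counter(), which is monotonic and unaffected by system clock adjustments, making it the appropriate clock for short wall-clock intervals. The same pattern in isolation (a sketch; the client object and query are stand-ins for the APIClient used above):

import time

def timed_query(client, query: str):
    """Return (response, latency_seconds) for a single blocking API call."""
    start = time.perf_counter()
    response = client.query(query=query)  # any blocking call can be timed this way
    latency = time.perf_counter() - start
    return response, latency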