10 changes: 10 additions & 0 deletions config/system.yaml
@@ -104,6 +104,16 @@ api:
# Legacy authentication (fallback when mcp_headers is not configured or disabled)
# Authentication via API_KEY environment variable only for MCP server (without Server name)

# Quality Score Configuration
# Aggregated score from selected metrics for overall system quality assessment
quality_score:
metrics:
- "ragas:faithfulness"
- "ragas:context_precision_with_reference"
- "custom:tool_eval"
- "custom:answer_correctness"
  default: true  # If true, auto-enable every metric in this list (each one's metadata gets default: true)

# Default metrics metadata
metrics_metadata:
# Turn-level metrics metadata
31 changes: 31 additions & 0 deletions docs/configuration.md
@@ -335,6 +335,37 @@ metrics_metadata:
description: "How completely the conversation addresses user intentions"
```

## Quality Score

The `quality_score` section configures an aggregated quality score, computed from selected metrics as a weighted average.

| Setting (quality_score.) | Default | Description |
|--------------------------|---------|-------------|
| metrics | required | List of metric identifiers (must be defined in metrics_metadata) |
| default | `false` | If `true`, auto-enable all listed metrics globally |

**Validation**: Metrics must exist in `default_turn_metrics_metadata` or `default_conversation_metrics_metadata`. Invalid metrics fail at config load time.

**Calculation**: Weighted average of the metrics' mean scores, where each metric's weight is its sample count divided by the total number of samples across all quality score metrics.
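
For example (hypothetical numbers): if `ragas:faithfulness` has a mean score of 0.80 over 30 samples and `ragas:answer_relevancy` has a mean score of 0.90 over 10 samples, the total is 40 samples and the quality score is 0.80 × (30/40) + 0.90 × (10/40) = 0.825.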

### Example
```yaml
# Define metrics
metrics_metadata:
turn_level:
"ragas:faithfulness":
threshold: 0.7
"ragas:answer_relevancy":
threshold: 0.8

# Configure quality score
quality_score:
metrics:
- "ragas:faithfulness"
- "ragas:answer_relevancy"
default: true # Auto-enable these metrics
```

## Storage
Lightspeed Evaluation can persist results to files and/or databases. The `storage` section configures one or more storage backends.

181 changes: 181 additions & 0 deletions src/lightspeed_evaluation/core/models/quality.py
@@ -0,0 +1,181 @@
"""Quality score models for aggregated quality assessment.

Provides Pydantic models for computing and reporting an aggregated quality score
from selected metrics using weighted averaging based on sample sizes.
"""

import logging
from typing import Optional

from pydantic import BaseModel, Field

from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics

logger = logging.getLogger(__name__)


class QualityMetricResult(BaseModel):
"""Quality metric result using composition to add weight to score statistics."""

statistics: ScoreStatistics = Field(
description="Score statistics for this quality metric"
)
weight: float = Field(
default=0.0,
description="Weight proportion (sample_size / total_samples) used in weighted average",
)


class QualityReport(BaseModel):
"""Aggregated quality score from selected metrics."""

    aggregated_quality_score: float = Field(
        default=0.0, description="Weighted average of quality score metrics"
    )
Reviewer suggestion: rename `aggregated_quality_score` to `quality_score`.
quality_metrics: dict[str, QualityMetricResult] = Field(
default_factory=dict,
description="Individual metrics used in quality score calculation",
)
extra_metrics: dict[str, ScoreStatistics] = Field(
default_factory=dict,
description="Other evaluated metrics calculated, not used for quality score calculation",
)
warnings: list[str] = Field(
default_factory=list,
description="Warnings about quality metrics configuration or usage",
)
    api_latency: float = Field(
        default=0.0, description="[Placeholder] Average API response time in seconds"
    )
Reviewer suggestion: rename `api_latency` to `agent_latency`.
    api_tokens: int = Field(
        default=0,
        description="[Placeholder] Total number of tokens consumed across all API calls",
    )
Reviewer suggestion: rename `api_tokens` to `agent_token_usage`.

@staticmethod
def create_report(
by_metric: dict[str, MetricStats],
quality_score_metrics: list[str],
) -> Optional["QualityReport"]:
"""Creates a quality report with aggregated quality score from selected metrics.

Separates metrics into quality metrics (used for quality score calculation) and
extra metrics (evaluated but not included in quality score).

Args:
by_metric: Dictionary mapping metric identifiers to their computed statistics.
quality_score_metrics: Metric identifiers to include in quality score calculation.
All specified metrics must exist in by_metric.

Returns:
QualityReport with aggregated quality score and separated quality/extra metrics,
or None if all quality score metrics have zero samples.

Raises:
ValueError: If any quality_score_metrics are not found in by_metric.
"""
warnings: list[str] = []

# Validate all quality score metrics exist in computed metrics (by_metric)
missing_metrics = [m for m in quality_score_metrics if m not in by_metric]
if missing_metrics:
warning_msg = (
"WARNING: "
f"Quality score metrics {missing_metrics} were excluded from "
"quality score computation. "
f"Reason: Not found in evaluation results."
)
warnings.append(warning_msg)
logger.warning(warning_msg)

quality_score_metrics = list(
set(quality_score_metrics) - set(missing_metrics)
)

# Calculate total samples from quality score metrics only
total_samples = 0
for metric_id in quality_score_metrics:
score_stats = by_metric[metric_id].score_statistics
if score_stats is not None:
total_samples += score_stats.count
if total_samples == 0:
logger.warning(
"CRITICAL: Quality score computation failed. "
"All configured quality metrics have zero evaluation results."
)
return None

quality_metrics: dict[str, QualityMetricResult] = {}
extra_metrics: dict[str, ScoreStatistics] = {}

# Separate quality metrics from extra metrics
for metric_id in by_metric:
if metric_id in quality_score_metrics:
score_stats = by_metric[metric_id].score_statistics

# Skip if score_statistics is None
if score_stats is None:
warning_msg = (
f"WARNING: Quality score metric '{metric_id}' "
"was excluded from quality score computation. "
"Reason: Missing score statistics data."
)
warnings.append(warning_msg)
logger.warning(warning_msg)
continue

sample_size = score_stats.count

# Skip metrics with zero samples
if sample_size == 0:
warning_msg = (
f"WARNING: Quality score metric '{metric_id}' "
"was excluded from quality score computation. "
"Reason: Zero evaluation results for this metric."
)
warnings.append(warning_msg)
logger.warning(warning_msg)
continue

weight = sample_size / total_samples

quality_metrics[metric_id] = QualityMetricResult(
statistics=score_stats,
weight=weight,
)
else:
stats = by_metric[metric_id].score_statistics
if stats is not None:
extra_metrics[metric_id] = stats

# Calculate aggregated quality score
aggregated_score = QualityReport._calculate_quality_score(quality_metrics)

return QualityReport(
aggregated_quality_score=aggregated_score,
quality_metrics=quality_metrics,
extra_metrics=extra_metrics,
warnings=warnings,
)

@staticmethod
def _calculate_quality_score(
quality_metrics: dict[str, QualityMetricResult],
) -> float:
"""Calculate weighted average quality score from quality metrics.

Computes a weighted average where each metric's weight is proportional to its
sample size relative to the total samples across all quality metrics.

Args:
quality_metrics: Dictionary of quality metric results with statistics and
weights. Each metric contains statistics with a mean score and a weight
(sample_size / total_samples).

Returns:
Weighted average quality score computed as sum of (mean * weight) for all metrics.
"""
weighted_sum = 0.0
for metric in quality_metrics.values():
weighted_sum += metric.statistics.mean * metric.weight
return weighted_sum
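
A minimal usage sketch of the new report API. The `MetricStats` and `ScoreStatistics` constructor fields are assumed from how they are read in this diff (`score_statistics`, `count`, `mean`); the real models in `core/models/summary.py` may require additional fields.

```python
from lightspeed_evaluation.core.models.quality import QualityReport
from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics

# Hypothetical statistics; field names are inferred from create_report above.
by_metric = {
    "ragas:faithfulness": MetricStats(
        score_statistics=ScoreStatistics(count=30, mean=0.80)
    ),
    "ragas:answer_relevancy": MetricStats(
        score_statistics=ScoreStatistics(count=10, mean=0.90)
    ),
}

report = QualityReport.create_report(
    by_metric=by_metric,
    quality_score_metrics=["ragas:faithfulness", "ragas:answer_relevancy"],
)
if report is not None:
    # Weighted average: 0.80 * (30/40) + 0.90 * (10/40) = 0.825
    print(report.aggregated_quality_score)
```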
66 changes: 66 additions & 0 deletions src/lightspeed_evaluation/core/models/system.py
@@ -1,3 +1,4 @@
# pylint: disable=too-many-lines
"""System configuration models."""

import os
@@ -804,6 +805,39 @@ def from_metadata(cls, raw: dict[str, Any]) -> "GEvalConfig":
return cls.model_validate(data)


class QualityScoreConfig(BaseModel):
"""Quality score configuration."""

model_config = ConfigDict(extra="forbid")

metrics: list[str] = Field(
default_factory=list,
description="List of metric identifiers to use for quality score computation",
)
default: bool = Field(
default=False,
description="If true, set default: true for all metrics in the list",
)

@field_validator("metrics")
@classmethod
def validate_metrics(cls, v: list[str]) -> list[str]:
"""Ensure metrics list is not empty and contains no duplicates."""
if len(v) == 0:
raise ValueError(
"Quality score metrics list cannot be empty. "
"Either specify at least one metric or "
"remove the quality_score section from configuration."
)
if len(v) != len(set(v)):
duplicates = [m for m in v if v.count(m) > 1]
raise ValueError(
f"Quality score metrics contains duplicates: {set(duplicates)}. "
"Each metric should appear only once."
)
return v


class SystemConfig(BaseModel):
"""System configuration using individual config models."""

@@ -848,6 +882,11 @@ class SystemConfig(BaseModel):
default_factory=VisualizationConfig, description="Visualization configuration"
)

# Quality score configuration
quality_score: Optional[QualityScoreConfig] = Field(
default=None, description="Quality score configuration"
)

# Default metrics metadata from system config
default_turn_metrics_metadata: dict[str, dict[str, Any]] = Field(
default_factory=dict, description="Default turn metrics metadata"
@@ -889,6 +928,33 @@ def validate_default_metrics_metadata_geval(
) from e
return v

@model_validator(mode="after")
def validate_quality_score_metrics(self) -> "SystemConfig":
"""Validate quality_score metrics exist in metrics_metadata.

Raises:
ConfigurationError: When quality_score contains metrics not defined
in turn or conversation level metrics_metadata.
"""
if not self.quality_score:
return self

# Combine all available metrics from both turn and conversation level metadata
all_metrics = set(self.default_turn_metrics_metadata.keys()) | set(
self.default_conversation_metrics_metadata.keys()
)

# Check for invalid metrics
invalid = [m for m in self.quality_score.metrics if m not in all_metrics]
if invalid:
raise ConfigurationError(
f"Invalid quality_score metrics: {invalid}. "
"Must be defined in default_turn_metrics_metadata or "
"default_conversation_metrics_metadata."
)

return self

@property
def turn_level_metric_names(self) -> set[str]:
"""Return turn-level metric names derived from metadata keys."""