diff --git a/config/system.yaml b/config/system.yaml
index 6110e503..7187a018 100644
--- a/config/system.yaml
+++ b/config/system.yaml
@@ -104,6 +104,16 @@ api:
 # Legacy authentication (fallback when mcp_headers is not configured or disabled)
 # Authentication via API_KEY environment variable only for MCP server (without Server name)
 
+# Quality Score Configuration
+# Aggregated score from selected metrics for overall system quality assessment
+quality_score:
+  metrics:
+    - "ragas:faithfulness"
+    - "ragas:context_precision_with_reference"
+    - "custom:tool_eval"
+    - "custom:answer_correctness"
+  default: true  # If true, auto-enable every metric listed above (sets its default: true)
+
 # Default metrics metadata
 metrics_metadata:
   # Turn-level metrics metadata
diff --git a/docs/configuration.md b/docs/configuration.md
index 7ddd93f7..05f86ce6 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -335,6 +335,37 @@ metrics_metadata:
       description: "How completely the conversation addresses user intentions"
 ```
+
+## Quality Score
+
+Compute an aggregated quality score from selected metrics using weighted averaging.
+
+| Setting (quality_score.) | Default | Description |
+|--------------------------|---------|-------------|
+| metrics | required | List of metric identifiers (must be defined in metrics_metadata) |
+| default | `false` | If `true`, auto-enable all listed metrics globally |
+
+**Validation**: Metrics must exist in `default_turn_metrics_metadata` or `default_conversation_metrics_metadata`. Invalid metrics fail at config load time.
+
+**Calculation**: Weighted average where each metric's weight = its sample_count / total_samples. For example, two metrics with means 0.85 and 0.90, each evaluated on 10 samples, get weight 10/20 = 0.5 each, giving 0.85 * 0.5 + 0.90 * 0.5 = 0.875.
+
+### Example
+```yaml
+# Define metrics
+metrics_metadata:
+  turn_level:
+    "ragas:faithfulness":
+      threshold: 0.7
+    "ragas:answer_relevancy":
+      threshold: 0.8
+
+# Configure quality score
+quality_score:
+  metrics:
+    - "ragas:faithfulness"
+    - "ragas:answer_relevancy"
+  default: true  # Auto-enable these metrics
+```
+
 ## Storage
 
 Lightspeed Evaluation can persist results to files and/or databases. The `storage` section configures one or more storage backends.
diff --git a/src/lightspeed_evaluation/core/models/quality.py b/src/lightspeed_evaluation/core/models/quality.py
new file mode 100644
index 00000000..a9cd4e0b
--- /dev/null
+++ b/src/lightspeed_evaluation/core/models/quality.py
@@ -0,0 +1,181 @@
+"""Quality score models for aggregated quality assessment.
+
+Provides Pydantic models for computing and reporting an aggregated quality score
+from selected metrics using weighted averaging based on sample sizes.
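+
+The aggregated score is sum(mean * weight) over the selected metrics, where each
+metric's weight is its sample count divided by the total sample count across all
+selected metrics.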
+""" + +import logging +from typing import Optional + +from pydantic import BaseModel, Field + +from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics + +logger = logging.getLogger(__name__) + + +class QualityMetricResult(BaseModel): + """Quality metric result using composition to add weight to score statistics.""" + + statistics: ScoreStatistics = Field( + description="Score statistics for this quality metric" + ) + weight: float = Field( + default=0.0, + description="Weight proportion (sample_size / total_samples) used in weighted average", + ) + + +class QualityReport(BaseModel): + """Aggregated quality score from selected metrics.""" + + aggregated_quality_score: float = Field( + default=0.0, description="Weighted average of quality score metrics" + ) + quality_metrics: dict[str, QualityMetricResult] = Field( + default_factory=dict, + description="Individual metrics used in quality score calculation", + ) + extra_metrics: dict[str, ScoreStatistics] = Field( + default_factory=dict, + description="Other evaluated metrics calculated, not used for quality score calculation", + ) + warnings: list[str] = Field( + default_factory=list, + description="Warnings about quality metrics configuration or usage", + ) + api_latency: float = Field( + default=0.0, description="[Placeholder] Average API response time in seconds" + ) + api_tokens: int = Field( + default=0, + description="[Placeholder] Total number of tokens consumed across all API calls", + ) + + @staticmethod + def create_report( + by_metric: dict[str, MetricStats], + quality_score_metrics: list[str], + ) -> Optional["QualityReport"]: + """Creates a quality report with aggregated quality score from selected metrics. + + Separates metrics into quality metrics (used for quality score calculation) and + extra metrics (evaluated but not included in quality score). + + Args: + by_metric: Dictionary mapping metric identifiers to their computed statistics. + quality_score_metrics: Metric identifiers to include in quality score calculation. + All specified metrics must exist in by_metric. + + Returns: + QualityReport with aggregated quality score and separated quality/extra metrics, + or None if all quality score metrics have zero samples. + + Raises: + ValueError: If any quality_score_metrics are not found in by_metric. + """ + warnings: list[str] = [] + + # Validate all quality score metrics exist in computed metrics (by_metric) + missing_metrics = [m for m in quality_score_metrics if m not in by_metric] + if missing_metrics: + warning_msg = ( + "WARNING: " + f"Quality score metrics {missing_metrics} were excluded from " + "quality score computation. " + f"Reason: Not found in evaluation results." + ) + warnings.append(warning_msg) + logger.warning(warning_msg) + + quality_score_metrics = list( + set(quality_score_metrics) - set(missing_metrics) + ) + + # Calculate total samples from quality score metrics only + total_samples = 0 + for metric_id in quality_score_metrics: + score_stats = by_metric[metric_id].score_statistics + if score_stats is not None: + total_samples += score_stats.count + if total_samples == 0: + logger.warning( + "CRITICAL: Quality score computation failed. " + "All configured quality metrics have zero evaluation results." 
+ ) + return None + + quality_metrics: dict[str, QualityMetricResult] = {} + extra_metrics: dict[str, ScoreStatistics] = {} + + # Separate quality metrics from extra metrics + for metric_id in by_metric: + if metric_id in quality_score_metrics: + score_stats = by_metric[metric_id].score_statistics + + # Skip if score_statistics is None + if score_stats is None: + warning_msg = ( + f"WARNING: Quality score metric '{metric_id}' " + "was excluded from quality score computation. " + "Reason: Missing score statistics data." + ) + warnings.append(warning_msg) + logger.warning(warning_msg) + continue + + sample_size = score_stats.count + + # Skip metrics with zero samples + if sample_size == 0: + warning_msg = ( + f"WARNING: Quality score metric '{metric_id}' " + "was excluded from quality score computation. " + "Reason: Zero evaluation results for this metric." + ) + warnings.append(warning_msg) + logger.warning(warning_msg) + continue + + weight = sample_size / total_samples + + quality_metrics[metric_id] = QualityMetricResult( + statistics=score_stats, + weight=weight, + ) + else: + stats = by_metric[metric_id].score_statistics + if stats is not None: + extra_metrics[metric_id] = stats + + # Calculate aggregated quality score + aggregated_score = QualityReport._calculate_quality_score(quality_metrics) + + return QualityReport( + aggregated_quality_score=aggregated_score, + quality_metrics=quality_metrics, + extra_metrics=extra_metrics, + warnings=warnings, + ) + + @staticmethod + def _calculate_quality_score( + quality_metrics: dict[str, QualityMetricResult], + ) -> float: + """Calculate weighted average quality score from quality metrics. + + Computes a weighted average where each metric's weight is proportional to its + sample size relative to the total samples across all quality metrics. + + Args: + quality_metrics: Dictionary of quality metric results with statistics and + weights. Each metric contains statistics with a mean score and a weight + (sample_size / total_samples). + + Returns: + Weighted average quality score computed as sum of (mean * weight) for all metrics. + """ + weighted_sum = 0.0 + for metric in quality_metrics.values(): + weighted_sum += metric.statistics.mean * metric.weight + return weighted_sum diff --git a/src/lightspeed_evaluation/core/models/system.py b/src/lightspeed_evaluation/core/models/system.py index 5067d3b2..ae3eaff8 100644 --- a/src/lightspeed_evaluation/core/models/system.py +++ b/src/lightspeed_evaluation/core/models/system.py @@ -1,3 +1,4 @@ +# pylint: disable=too-many-lines """System configuration models.""" import os @@ -804,6 +805,39 @@ def from_metadata(cls, raw: dict[str, Any]) -> "GEvalConfig": return cls.model_validate(data) +class QualityScoreConfig(BaseModel): + """Quality score configuration.""" + + model_config = ConfigDict(extra="forbid") + + metrics: list[str] = Field( + default_factory=list, + description="List of metric identifiers to use for quality score computation", + ) + default: bool = Field( + default=False, + description="If true, set default: true for all metrics in the list", + ) + + @field_validator("metrics") + @classmethod + def validate_metrics(cls, v: list[str]) -> list[str]: + """Ensure metrics list is not empty and contains no duplicates.""" + if len(v) == 0: + raise ValueError( + "Quality score metrics list cannot be empty. " + "Either specify at least one metric or " + "remove the quality_score section from configuration." 
+ ) + if len(v) != len(set(v)): + duplicates = [m for m in v if v.count(m) > 1] + raise ValueError( + f"Quality score metrics contains duplicates: {set(duplicates)}. " + "Each metric should appear only once." + ) + return v + + class SystemConfig(BaseModel): """System configuration using individual config models.""" @@ -848,6 +882,11 @@ class SystemConfig(BaseModel): default_factory=VisualizationConfig, description="Visualization configuration" ) + # Quality score configuration + quality_score: Optional[QualityScoreConfig] = Field( + default=None, description="Quality score configuration" + ) + # Default metrics metadata from system config default_turn_metrics_metadata: dict[str, dict[str, Any]] = Field( default_factory=dict, description="Default turn metrics metadata" @@ -889,6 +928,33 @@ def validate_default_metrics_metadata_geval( ) from e return v + @model_validator(mode="after") + def validate_quality_score_metrics(self) -> "SystemConfig": + """Validate quality_score metrics exist in metrics_metadata. + + Raises: + ConfigurationError: When quality_score contains metrics not defined + in turn or conversation level metrics_metadata. + """ + if not self.quality_score: + return self + + # Combine all available metrics from both turn and conversation level metadata + all_metrics = set(self.default_turn_metrics_metadata.keys()) | set( + self.default_conversation_metrics_metadata.keys() + ) + + # Check for invalid metrics + invalid = [m for m in self.quality_score.metrics if m not in all_metrics] + if invalid: + raise ConfigurationError( + f"Invalid quality_score metrics: {invalid}. " + "Must be defined in default_turn_metrics_metadata or " + "default_conversation_metrics_metadata." + ) + + return self + @property def turn_level_metric_names(self) -> set[str]: """Return turn-level metric names derived from metadata keys.""" diff --git a/src/lightspeed_evaluation/core/output/generator.py b/src/lightspeed_evaluation/core/output/generator.py index 3553ab81..8239e595 100644 --- a/src/lightspeed_evaluation/core/output/generator.py +++ b/src/lightspeed_evaluation/core/output/generator.py @@ -25,6 +25,7 @@ StreamingStats, TagStats, ) +from lightspeed_evaluation.core.models.quality import QualityReport from lightspeed_evaluation.core.storage import FileBackendConfig, get_file_config from lightspeed_evaluation.core.output.visualization import GraphGenerator @@ -69,6 +70,14 @@ def generate_reports( results: List of evaluation results. evaluation_data: Optional evaluation data for API token calculation. """ + # Get quality_score_metrics from system config if available + quality_score_metrics = None + if ( + self.system_config is not None + and self.system_config.quality_score is not None + ): + quality_score_metrics = self.system_config.quality_score.metrics + # Build EvaluationSummary once, use it everywhere. # CLI path computes confidence intervals by default (when sample size > 1). 
        summary = EvaluationSummary.from_results(
            results,
            evaluation_data=evaluation_data,
            compute_confidence_intervals=True,
        )

+        # Generate QualityReport separately if quality score metrics are configured
+        quality_report = None
+        if quality_score_metrics:
+            quality_report = QualityReport.create_report(
+                summary.by_metric,
+                quality_score_metrics,
+            )
+
        # Prepare timestamped base filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_filename = f"{self.base_filename}_{timestamp}"
@@ -92,7 +109,7 @@
        # Generate individual reports based on configuration
        self._generate_individual_reports(
-            results, base_filename, enabled_outputs, summary
+            results, base_filename, enabled_outputs, summary, quality_report
        )

        # Generate graphs if enabled
@@ -155,12 +172,13 @@
        return generated_files

-    def _generate_individual_reports(
+    def _generate_individual_reports(  # pylint: disable=too-many-arguments, too-many-positional-arguments
        self,
        results: list[EvaluationResult],
        base_filename: str,
        enabled_outputs: list[str],
        summary: EvaluationSummary,
+        quality_report: Optional[QualityReport] = None,
    ) -> None:
        """Generate reports based on enabled outputs."""
        if "csv" in enabled_outputs:
@@ -170,6 +188,12 @@
        if "json" in enabled_outputs:
            json_file = self._generate_json_summary_from_model(summary, base_filename)
            logger.info("JSON: %s", json_file)
+            # Generate quality_report.json if quality score is configured
+            if quality_report is not None:
+                quality_report_file = self._generate_quality_score_report(
+                    quality_report, base_filename
+                )
+                logger.info("JSON: %s", quality_report_file)

        if "txt" in enabled_outputs:
            txt_file = self._generate_text_summary_from_model(summary, base_filename)
@@ -289,6 +313,53 @@
        return json_file

+    def _generate_quality_score_report(
+        self,
+        quality_report: QualityReport,
+        base_filename: str,
+        target_dir: Optional[Path] = None,
+    ) -> Path:
+        """Generate quality score JSON report.
+
+        Args:
+            quality_report: The QualityReport model instance.
+            base_filename: Base filename for the output file.
+            target_dir: Optional directory override for output file location.
+
+        Returns:
+            Path to the generated quality report JSON file
+            ({base_filename}_quality_report.json).
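+
+            The JSON payload mirrors the QualityReport model: timestamp,
+            aggregated_quality_score, quality_metrics (mean, count, and weight per
+            metric), extra_metrics (mean and count per metric), api_latency,
+            api_tokens, and warnings.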
+ """ + out = target_dir if target_dir is not None else self.output_dir + quality_score_file = out / f"{base_filename}_quality_report.json" + + output = { + "timestamp": datetime.now().isoformat(), + "aggregated_quality_score": quality_report.aggregated_quality_score, + "quality_metrics": { + metric_id: { + "mean": metric.statistics.mean, + "count": metric.statistics.count, + "weight": metric.weight, + } + for metric_id, metric in quality_report.quality_metrics.items() + }, + "extra_metrics": { + metric_id: { + "mean": stats.mean, + "count": stats.count, + } + for metric_id, stats in quality_report.extra_metrics.items() + }, + "api_latency": quality_report.api_latency, + "api_tokens": quality_report.api_tokens, + "warnings": quality_report.warnings, + } + + with open(quality_score_file, "w", encoding="utf-8") as f: + json.dump(output, f, indent=2) + + return quality_score_file + def _generate_text_summary_from_model( self, summary: EvaluationSummary, diff --git a/src/lightspeed_evaluation/core/system/loader.py b/src/lightspeed_evaluation/core/system/loader.py index 9b3e055d..6818350a 100644 --- a/src/lightspeed_evaluation/core/system/loader.py +++ b/src/lightspeed_evaluation/core/system/loader.py @@ -18,6 +18,7 @@ from lightspeed_evaluation.core.models.system import ( JudgePanelConfig, LLMPoolConfig, + QualityScoreConfig, ) from lightspeed_evaluation.core.storage.config import ( DatabaseBackendConfig, @@ -121,6 +122,25 @@ def load_system_config(self, config_path: str) -> SystemConfig: def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig: """Create SystemConfig object from validated configuration data.""" metrics_metadata = config_data.get("metrics_metadata", {}) + quality_score_data = config_data.get("quality_score") + + # Process quality_score defaults before creating SystemConfig + turn_level_metadata = metrics_metadata.get("turn_level", {}) + conversation_level_metadata = metrics_metadata.get("conversation_level", {}) + + if quality_score_data is not None: + self._process_quality_score_defaults( + quality_score_data, + turn_level_metadata, + conversation_level_metadata, + ) + + # Parse quality_score config if present + quality_score_config = ( + QualityScoreConfig(**quality_score_data) + if quality_score_data is not None + else None + ) # Parse llm_pool and judge_panel if present (Optional sections) llm_pool_data = config_data.get("llm_pool") @@ -129,7 +149,7 @@ def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig: judge_panel_data = config_data.get("judge_panel") judge_panel = JudgePanelConfig(**judge_panel_data) if judge_panel_data else None - # Parse storage backends with backward compatibility for legacy 'output' section + # Parse storage backends with backward compatibility storage_data = self._get_storage_config_with_backward_compat(config_data) storage_backends = self._parse_storage_config(storage_data) @@ -143,12 +163,57 @@ def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig: visualization=VisualizationConfig(**config_data.get("visualization", {})), llm_pool=llm_pool, judge_panel=judge_panel, - default_turn_metrics_metadata=metrics_metadata.get("turn_level", {}), - default_conversation_metrics_metadata=metrics_metadata.get( - "conversation_level", {} - ), + quality_score=quality_score_config, + default_turn_metrics_metadata=turn_level_metadata, + default_conversation_metrics_metadata=conversation_level_metadata, ) + def _process_quality_score_defaults( + self, + quality_score_config: dict[str, Any], + 
turn_level_metadata: dict[str, dict[str, Any]], + conversation_level_metadata: dict[str, dict[str, Any]], + ) -> None: + """Apply quality_score.default flag to auto-enable quality metrics. + + When quality_score.default is true, sets default: true for all metrics + in quality_score.metrics, enabling them globally for evaluation. + + Args: + quality_score_config: Quality score configuration from system.yaml. + turn_level_metadata: Turn-level metrics metadata (modified in-place). + conversation_level_metadata: Conversation-level metrics metadata (modified in-place). + + Raises: + ConfigurationError: If a metric in quality_score.metrics is not defined + in turn_level or conversation_level metadata. + """ + # Check if default flag is set to true + default_flag = quality_score_config.get("default", False) + # Get the list of metrics for quality score + quality_score_metrics = quality_score_config.get("metrics", []) + + # Process each metric + for metric_id in quality_score_metrics: + # Check if metric exists in turn_level or conversation_level + if metric_id in turn_level_metadata: + turn_level_metadata[metric_id]["default"] = ( + default_flag or turn_level_metadata[metric_id].get("default", False) + ) + elif metric_id in conversation_level_metadata: + conversation_level_metadata[metric_id]["default"] = ( + default_flag + or conversation_level_metadata[metric_id].get("default", False) + ) + else: + # Metric not found - raise error + raise ConfigurationError( + f"Metric '{metric_id}' is listed in quality_score.metrics but not defined " + "in metrics_metadata.turn_level or metrics_metadata.conversation_level. " + "Please add metadata configuration for this metric before using it " + "in quality_score." + ) + def _get_storage_config_with_backward_compat( self, config_data: dict[str, Any] ) -> list[dict[str, Any]]: diff --git a/tests/unit/core/models/conftest.py b/tests/unit/core/models/conftest.py new file mode 100644 index 00000000..41196ce2 --- /dev/null +++ b/tests/unit/core/models/conftest.py @@ -0,0 +1,77 @@ +"""Fixtures for models tests.""" + +import pytest + +from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics + + +@pytest.fixture +def quality_by_metric() -> dict[str, MetricStats]: + """Sample MetricStats with non-zero counts for quality report tests.""" + return { + "ragas:faithfulness": MetricStats( + score_statistics=ScoreStatistics( + count=10, + mean=0.85, + ), + ), + "ragas:answer_relevancy": MetricStats( + score_statistics=ScoreStatistics( + count=10, + mean=0.90, + ), + ), + "custom:context_recall": MetricStats( + score_statistics=ScoreStatistics( + count=10, + mean=0.75, + ), + ), + } + + +@pytest.fixture +def quality_by_metric_zero() -> dict[str, MetricStats]: + """Sample MetricStats with some zero counts for quality report tests.""" + return { + "ragas:faithfulness": MetricStats( + score_statistics=ScoreStatistics( + count=0, + mean=0.0, + ), + ), + "ragas:answer_relevancy": MetricStats( + score_statistics=ScoreStatistics( + count=0, + mean=0.0, + ), + ), + "custom:context_recall": MetricStats( + score_statistics=ScoreStatistics( + count=10, + mean=0.75, + ), + ), + } + + +@pytest.fixture +def quality_by_metric_with_none() -> dict[str, MetricStats]: + """Sample MetricStats with None score_statistics for quality report tests.""" + return { + "ragas:faithfulness": MetricStats( + score_statistics=None, + ), + "ragas:answer_relevancy": MetricStats( + score_statistics=ScoreStatistics( + count=10, + mean=0.90, + ), + ), + "custom:context_recall": 
MetricStats( + score_statistics=ScoreStatistics( + count=10, + mean=0.75, + ), + ), + } diff --git a/tests/unit/core/models/test_quality.py b/tests/unit/core/models/test_quality.py new file mode 100644 index 00000000..778df105 --- /dev/null +++ b/tests/unit/core/models/test_quality.py @@ -0,0 +1,165 @@ +"""Unit tests for quality report model.""" + +from pytest import LogCaptureFixture + +from lightspeed_evaluation.core.models.quality import QualityReport +from lightspeed_evaluation.core.models.summary import MetricStats + + +class TestQualityReport: + """Tests for QualityReport model and create_report() method.""" + + def test_quality_report_creation_happy_path( + self, + quality_by_metric: dict[str, MetricStats], + ) -> None: + """Test QualityReport creation with valid metrics.""" + + # Define quality score metrics (subset of all metrics) + quality_score_metrics = ["ragas:faithfulness", "ragas:answer_relevancy"] + + # Create the QualityReport + report = QualityReport.create_report(quality_by_metric, quality_score_metrics) + + # Assertions + assert report is not None + + # Check that quality metrics are correctly separated + assert len(report.quality_metrics) == 2 + assert "ragas:faithfulness" in report.quality_metrics + assert "ragas:answer_relevancy" in report.quality_metrics + + # Check that extra metrics contain the non-quality metrics + assert len(report.extra_metrics) == 1 + assert "custom:context_recall" in report.extra_metrics + + # Each metric has 10 samples, so weights should be 0.5 each + assert report.quality_metrics["ragas:faithfulness"].weight == 0.5 + assert report.quality_metrics["ragas:answer_relevancy"].weight == 0.5 + # Verify weights are calculated correctly (should sum to 1.0) + total_weight = sum(metric.weight for metric in report.quality_metrics.values()) + assert total_weight == 1.0 + + # Verify aggregated quality score is weighted average + # Expected: (0.85 * 0.5) + (0.90 * 0.5) = 0.875 + expected_score = (0.85 * 0.5) + (0.90 * 0.5) + assert report.aggregated_quality_score == expected_score + + # Verify quality metrics contain correct mean scores + assert report.quality_metrics["ragas:faithfulness"].statistics.mean == 0.85 + assert report.quality_metrics["ragas:answer_relevancy"].statistics.mean == 0.90 + + # Verify extra metrics contain correct mean scores + assert report.extra_metrics["custom:context_recall"].mean == 0.75 + + # Verify no warnings for valid configuration + assert len(report.warnings) == 0 + + def test_quality_report_creation_missing_metric( + self, quality_by_metric: dict[str, MetricStats] + ) -> None: + """Test QualityReport excludes missing metrics and generates warning.""" + quality_score_metrics = ["ragas:faithfulness", "ragas:answer_correctness"] + + # Create the QualityReport + report = QualityReport.create_report(quality_by_metric, quality_score_metrics) + + # Assertions + assert report is not None + + # Check that quality metrics are correctly separated + assert len(report.quality_metrics) == 1 + assert "ragas:faithfulness" in report.quality_metrics + assert report.quality_metrics["ragas:faithfulness"].weight == 1.0 + assert ( + report.quality_metrics["ragas:faithfulness"].statistics.mean + == report.aggregated_quality_score + ) + + # Verify warning about missing metric + assert len(report.warnings) == 1 + assert any( + "ragas:answer_correctness" in warning and "excluded" in warning + for warning in report.warnings + ) + + # Check that extra metrics contain the non-quality metrics + assert len(report.extra_metrics) == 2 + assert 
"custom:context_recall" in report.extra_metrics + assert "ragas:answer_relevancy" in report.extra_metrics + + def test_quality_report_total_samples_zero( + self, quality_by_metric_zero: dict[str, MetricStats], caplog: LogCaptureFixture + ) -> None: + """Test QualityReport returns None when all quality metrics have zero samples.""" + # Define quality score metrics (subset of all metrics) + quality_score_metrics = ["ragas:faithfulness", "ragas:answer_relevancy"] + + # Create the QualityReport + report = QualityReport.create_report( + quality_by_metric_zero, quality_score_metrics + ) + + # Assertions + assert report is None + assert "Quality score computation failed" in caplog.text + + def test_quality_report_sample_size_zero( + self, quality_by_metric_zero: dict[str, MetricStats] + ) -> None: + """Test QualityReport excludes metrics with zero samples and generates warning.""" + # Define quality score metrics (subset of all metrics) + quality_score_metrics = ["ragas:faithfulness", "custom:context_recall"] + + # Create the QualityReport + report = QualityReport.create_report( + quality_by_metric_zero, quality_score_metrics + ) + + # Assertions + assert report is not None + assert any( + "ragas:faithfulness" in warning and "excluded" in warning + for warning in report.warnings + ) + + def test_quality_report_none_score_statistics( + self, + quality_by_metric_with_none: dict[str, MetricStats], + caplog: LogCaptureFixture, + ) -> None: + """Test QualityReport excludes metrics with None score_statistics and logs warning.""" + # Define quality score metrics (subset of all metrics) + quality_score_metrics = ["ragas:faithfulness", "ragas:answer_relevancy"] + + # Create the QualityReport + report = QualityReport.create_report( + quality_by_metric_with_none, quality_score_metrics + ) + + # Assertions + assert report is not None + + # Check that the metric with None score_statistics was excluded + assert len(report.quality_metrics) == 1 + assert "ragas:answer_relevancy" in report.quality_metrics + assert "ragas:faithfulness" not in report.quality_metrics + + # Verify weight is 1.0 since only one metric remains + assert report.quality_metrics["ragas:answer_relevancy"].weight == 1.0 + + # Verify aggregated quality score equals the single metric's mean + assert report.aggregated_quality_score == 0.90 + + # Verify warning was generated and logged + assert len(report.warnings) == 1 + assert any( + "ragas:faithfulness" in warning + and "excluded" in warning + and "Missing score statistics data" in warning + for warning in report.warnings + ) + + # Verify warning was logged + assert "ragas:faithfulness" in caplog.text + assert "Missing score statistics data" in caplog.text diff --git a/tests/unit/core/models/test_system.py b/tests/unit/core/models/test_system.py index 82ae19d9..d9a33698 100644 --- a/tests/unit/core/models/test_system.py +++ b/tests/unit/core/models/test_system.py @@ -23,6 +23,7 @@ LLMParametersConfig, LLMProviderConfig, LoggingConfig, + QualityScoreConfig, ) from lightspeed_evaluation.core.system.exceptions import ConfigurationError @@ -693,3 +694,108 @@ def test_metric_names_are_derived_not_stored(self) -> None: assert names1 == names2 # Each call returns a fresh set (not the same object) assert names1 is not names2 + + +class TestSystemConfigQualityScoreValidation: + """Tests for quality_score metrics validation in SystemConfig.""" + + def test_quality_score_none_skips_validation(self) -> None: + """Test that validation is skipped when quality_score is None.""" + config = SystemConfig( + 
default_turn_metrics_metadata={ + "ragas:faithfulness": {"threshold": 0.7}, + }, + quality_score=None, + ) + assert config.quality_score is None + + def test_valid_quality_score_metrics_mixed_levels(self) -> None: + """Test quality_score with metrics from both turn and conversation levels.""" + config = SystemConfig( + default_turn_metrics_metadata={ + "ragas:faithfulness": {"threshold": 0.7}, + }, + default_conversation_metrics_metadata={ + "deepeval:conversation_relevancy": {"threshold": 0.6}, + }, + quality_score=QualityScoreConfig( + metrics=["ragas:faithfulness", "deepeval:conversation_relevancy"] + ), + ) + assert config.quality_score is not None + assert len(config.quality_score.metrics) == 2 + + def test_partial_invalid_quality_score_metrics_fails(self) -> None: + """Test quality_score with mix of valid and invalid metrics raises ConfigurationError.""" + with pytest.raises( + ConfigurationError, + match=r"Invalid quality_score metrics:.*ragas:invalid", + ): + SystemConfig( + default_turn_metrics_metadata={ + "ragas:faithfulness": {"threshold": 0.7}, + }, + quality_score=QualityScoreConfig( + metrics=["ragas:faithfulness", "ragas:invalid"] + ), + ) + + def test_quality_score_validation_error_message_includes_metadata_hint( + self, + ) -> None: + """Test that error message guides users to define metrics in metadata.""" + with pytest.raises( + ConfigurationError, + match=r"Must be defined in default_turn_metrics_metadata or " + r"default_conversation_metrics_metadata", + ): + SystemConfig( + default_turn_metrics_metadata={}, + quality_score=QualityScoreConfig(metrics=["ragas:faithfulness"]), + ) + + +class TestQualityScoreConfig: + """Tests for QualityScoreConfig model.""" + + def test_valid_configurations(self) -> None: + """Test valid quality score configurations.""" + # Single metric + config = QualityScoreConfig(metrics=["ragas:faithfulness"]) + assert len(config.metrics) == 1 + assert config.default is False + + # Multiple metrics + config = QualityScoreConfig( + metrics=[ + "ragas:faithfulness", + "deepeval:answer_relevancy", + "custom:correctness", + ], + default=True, + ) + assert len(config.metrics) == 3 + assert config.default is True + + def test_empty_metrics_list_fails(self) -> None: + """Test that empty metrics list raises ValidationError.""" + with pytest.raises( + ValidationError, + match="Quality score metrics list cannot be empty", + ): + QualityScoreConfig(metrics=[]) + + def test_multiple_duplicates_detected(self) -> None: + """Test that multiple different duplicates are all detected.""" + with pytest.raises( + ValidationError, + match="Quality score metrics contains duplicates", + ): + QualityScoreConfig( + metrics=[ + "ragas:faithfulness", + "ragas:faithfulness", + "custom:correctness", + "custom:correctness", + ] + ) diff --git a/tests/unit/core/output/conftest.py b/tests/unit/core/output/conftest.py index be2c8739..46cba616 100644 --- a/tests/unit/core/output/conftest.py +++ b/tests/unit/core/output/conftest.py @@ -96,4 +96,6 @@ def mock_system_config(mocker: MockerFixture) -> MockerFixture: config.visualization.enabled_graphs = [] # Mock model_fields to support iteration in _write_config_params and _build_config_dict config.model_fields.keys.return_value = [] + # Mock quality_score to avoid iteration errors when creating QualityReport + config.quality_score = None return config diff --git a/tests/unit/core/output/test_generator.py b/tests/unit/core/output/test_generator.py index 09fd5155..671d8d38 100644 --- a/tests/unit/core/output/test_generator.py +++ 
b/tests/unit/core/output/test_generator.py @@ -10,6 +10,7 @@ from lightspeed_evaluation.core.models import EvaluationResult from lightspeed_evaluation.core.models.summary import EvaluationSummary +from lightspeed_evaluation.core.models.quality import QualityReport from lightspeed_evaluation.core.output.generator import OutputHandler from lightspeed_evaluation.core.storage import FileBackendConfig @@ -158,6 +159,40 @@ def test_generate_individual_reports_json_only( assert (tmp_path / "test_summary.json").exists() + def test_generate_individual_reports_with_quality_report( + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: + """Test _generate_individual_reports creates quality_report when configured.""" + # Mock system config with quality score metrics + file_config = FileBackendConfig(enabled_outputs=["json"]) + config = mocker.Mock() + config.storage = [file_config] + config.visualization.enabled_graphs = [] + config.model_fields.keys.return_value = [] + # Quality Score configuration + config.quality_score.metrics = ["ragas:faithfulness", "ragas:answer_relevancy"] + + handler = OutputHandler(output_dir=str(tmp_path), system_config=config) + summary = EvaluationSummary.from_results(sample_results) + + # Create a quality report + quality_report = QualityReport.create_report( + summary.by_metric, + ["ragas:faithfulness", "ragas:answer_relevancy"], + ) + + # Generate reports + handler._generate_individual_reports( + sample_results, "test", ["json"], summary, quality_report + ) + + # Check that quality_report.json was created + quality_report_file = tmp_path / "test_quality_report.json" + assert quality_report_file.exists() + def test_generate_individual_reports_txt_only( self, tmp_path: Path, @@ -390,6 +425,117 @@ def test_filename_timestamp_format( assert csv_file.suffix == ".csv" +class TestQualityReportGeneration: + """Tests for quality report generation.""" + + def test_generate_quality_score_report_all_fields( + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + ) -> None: + """Test that _generate_quality_score_report includes all important fields.""" + handler = OutputHandler(output_dir=str(tmp_path)) + summary = EvaluationSummary.from_results(sample_results) + + # Create a quality report + quality_report = QualityReport.create_report( + summary.by_metric, + ["ragas:faithfulness", "ragas:answer_relevancy"], + ) + + assert quality_report is not None + + # Generate the quality score report + quality_file = handler._generate_quality_score_report( + quality_report, "test_quality" + ) + + assert quality_file.exists() + + # Load and verify all important fields + with open(quality_file, encoding="utf-8") as f: + data = json.load(f) + + # Check top-level fields + assert "timestamp" in data + assert "aggregated_quality_score" in data + assert "quality_metrics" in data + assert "extra_metrics" in data + assert "api_latency" in data + assert "api_tokens" in data + assert "warnings" in data + + # Check aggregated_quality_score is a number + assert isinstance(data["aggregated_quality_score"], (int, float)) + + # Check quality_metrics structure + assert isinstance(data["quality_metrics"], dict) + for _, stats in data["quality_metrics"].items(): + assert "mean" in stats + assert "count" in stats + assert "weight" in stats + assert isinstance(stats["mean"], (int, float)) + assert isinstance(stats["count"], int) + assert isinstance(stats["weight"], (int, float)) + # Weight should be between 0 and 1 + assert 0 <= stats["weight"] <= 1 
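+
+        # Weights are sample-proportional (count / total_samples), so across all
+        # quality metrics they should sum to 1.
+        total_weight = sum(m["weight"] for m in data["quality_metrics"].values())
+        assert abs(total_weight - 1.0) < 1e-9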
+ + # Check extra_metrics structure + assert isinstance(data["extra_metrics"], dict) + + # Check API fields are numeric + assert isinstance(data["api_latency"], (int, float)) + assert isinstance(data["api_tokens"], int) + + # Check warnings is a list + assert isinstance(data["warnings"], list) + + def test_quality_report_with_partial_metrics( + self, + tmp_path: Path, + ) -> None: + """Test quality report generation when only some metrics are available.""" + # Create results with only one of the configured quality metrics + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="ragas:faithfulness", + score=0.85, + result="PASS", + threshold=0.7, + reason="Good", + ), + ] + + handler = OutputHandler(output_dir=str(tmp_path)) + summary = EvaluationSummary.from_results(results) + + # Try to create quality report with metrics that don't exist + quality_report = QualityReport.create_report( + summary.by_metric, + ["ragas:faithfulness", "ragas:answer_relevancy", "nonexistent:metric"], + ) + + assert quality_report is not None + + # Generate the quality score report + quality_file = handler._generate_quality_score_report( + quality_report, "test_partial" + ) + + with open(quality_file, encoding="utf-8") as f: + data = json.load(f) + + # Should have warnings about missing metrics + assert len(data["warnings"]) > 0 + assert any("nonexistent:metric" in w for w in data["warnings"]) + assert any("ragas:answer_relevancy" in w for w in data["warnings"]) + + # Should still have the available metric + assert "ragas:faithfulness" in data["quality_metrics"] + + class TestOutputHandlerSave: """Tests for OutputHandler.save() method.""" diff --git a/tests/unit/core/system/test_loader.py b/tests/unit/core/system/test_loader.py index 4feaaf71..719e63e6 100644 --- a/tests/unit/core/system/test_loader.py +++ b/tests/unit/core/system/test_loader.py @@ -459,3 +459,146 @@ def test_from_config_returns_config_loader_instance(self) -> None: loader = ConfigLoader.from_config(config) assert isinstance(loader, ConfigLoader) + + +class TestConfigLoaderQualityScore: + """Unit tests for ConfigLoader quality_score feature.""" + + def test_quality_score_absent_results_in_none(self) -> None: + """Test missing quality_score section results in None.""" + yaml_content = """ +llm: + provider: openai + model: gpt-4o-mini +""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(yaml_content) + temp_path = f.name + + try: + config = ConfigLoader().load_system_config(temp_path) + assert config.quality_score is None + finally: + Path(temp_path).unlink() + + def test_quality_score_config_loaded(self) -> None: + """Test quality_score section is parsed and attached to SystemConfig.""" + yaml_content = """ +llm: + provider: openai + model: gpt-4o-mini + +metrics_metadata: + turn_level: + ragas:faithfulness: + threshold: 0.7 + default: true + conversation_level: {} + +quality_score: + metrics: + - ragas:faithfulness + default: false +""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(yaml_content) + temp_path = f.name + + try: + config = ConfigLoader().load_system_config(temp_path) + assert config.quality_score is not None + assert config.quality_score.metrics == ["ragas:faithfulness"] + assert config.quality_score.default is False + # Metric default is untouched + assert ( + config.default_turn_metrics_metadata["ragas:faithfulness"]["default"] + is True + ) + finally: + Path(temp_path).unlink() + + def 
test_quality_score_default_true_sets_default_on_metrics(self) -> None: + """Test quality_score.default: true sets default: true on turn-level metrics.""" + yaml_content = """ +llm: + provider: openai + +metrics_metadata: + turn_level: + ragas:faithfulness: + threshold: 0.7 + default: true + custom:correctness: + threshold: 0.8 + default: false + other:timeliness: + threshold: 0.75 + default: false + conversation_level: + deepeval:completeness: + threshold: 0.6 + default: false + +quality_score: + metrics: + - ragas:faithfulness + - custom:correctness + default: true +""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(yaml_content) + temp_path = f.name + + try: + config = ConfigLoader().load_system_config(temp_path) + assert ( + config.default_turn_metrics_metadata["ragas:faithfulness"]["default"] + is True + ) + assert ( + config.default_turn_metrics_metadata["custom:correctness"]["default"] + is True + ) + assert ( + config.default_turn_metrics_metadata["other:timeliness"]["default"] + is False + ) + assert ( + config.default_conversation_metrics_metadata["deepeval:completeness"][ + "default" + ] + is False + ) + finally: + Path(temp_path).unlink() + + def test_quality_score_default_true_with_undefined_metric_fails(self) -> None: + """Test quality_score.default: true with undefined metric raises ConfigurationError.""" + yaml_content = """ +llm: + provider: openai + +metrics_metadata: + turn_level: {} + conversation_level: {} + +quality_score: + metrics: + - nonexistent:metric + default: true +""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(yaml_content) + temp_path = f.name + + try: + with pytest.raises( + ConfigurationError, + match=( + "Metric 'nonexistent:metric' is listed in " + "quality_score.metrics but not defined" + ), + ): + ConfigLoader().load_system_config(temp_path) + finally: + Path(temp_path).unlink()