quality.py
"""Quality score models for aggregated quality assessment.
Provides Pydantic models for computing and reporting an aggregated quality score
from selected metrics using weighted averaging based on sample sizes.
"""
import logging
from typing import Optional

from pydantic import BaseModel, Field

from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics

logger = logging.getLogger(__name__)


class QualityMetricResult(BaseModel):
    """Quality metric result using composition to add weight to score statistics."""

    statistics: ScoreStatistics = Field(
        description="Score statistics for this quality metric"
    )
    weight: float = Field(
        default=0.0,
        description="Weight proportion (sample_size / total_samples) used in weighted average",
    )


class QualityReport(BaseModel):
    """Aggregated quality score from selected metrics."""

    aggregated_quality_score: float = Field(
        default=0.0, description="Weighted average of quality score metrics"
    )
    quality_metrics: dict[str, QualityMetricResult] = Field(
        default_factory=dict,
        description="Individual metrics used in quality score calculation",
    )
    extra_metrics: dict[str, ScoreStatistics] = Field(
        default_factory=dict,
        description="Other evaluated metrics that are not used in the quality score calculation",
    )
    warnings: list[str] = Field(
        default_factory=list,
        description="Warnings about quality metrics configuration or usage",
    )
    api_latency: float = Field(
        default=0.0, description="[Placeholder] Average API response time in seconds"
    )
    api_tokens: int = Field(
        default=0,
        description="[Placeholder] Total number of tokens consumed across all API calls",
    )

    @staticmethod
    def create_report(
        by_metric: dict[str, MetricStats],
        quality_score_metrics: list[str],
    ) -> Optional["QualityReport"]:
        """Create a quality report with an aggregated quality score from selected metrics.

        Separates metrics into quality metrics (used for quality score calculation) and
        extra metrics (evaluated but not included in the quality score).

        Args:
            by_metric: Dictionary mapping metric identifiers to their computed statistics.
            quality_score_metrics: Metric identifiers to include in the quality score
                calculation. Metrics missing from by_metric are excluded with a warning.

        Returns:
            QualityReport with aggregated quality score and separated quality/extra metrics,
            or None if all quality score metrics have zero samples.
        """
        warnings: list[str] = []

        # Validate all quality score metrics exist in computed metrics (by_metric)
        missing_metrics = [m for m in quality_score_metrics if m not in by_metric]
        if missing_metrics:
            warning_msg = (
                "WARNING: "
                f"Quality score metrics {missing_metrics} were excluded from "
                "quality score computation. "
                "Reason: Not found in evaluation results."
            )
            warnings.append(warning_msg)
            logger.warning(warning_msg)
            quality_score_metrics = list(
                set(quality_score_metrics) - set(missing_metrics)
            )

        # Calculate total samples from quality score metrics only
        total_samples = 0
        for metric_id in quality_score_metrics:
            score_stats = by_metric[metric_id].score_statistics
            if score_stats is not None:
                total_samples += score_stats.count

        if total_samples == 0:
            logger.warning(
                "CRITICAL: Quality score computation failed. "
                "All configured quality metrics have zero evaluation results."
            )
            return None

        quality_metrics: dict[str, QualityMetricResult] = {}
        extra_metrics: dict[str, ScoreStatistics] = {}

        # Separate quality metrics from extra metrics
        for metric_id in by_metric:
            if metric_id in quality_score_metrics:
                score_stats = by_metric[metric_id].score_statistics

                # Skip if score_statistics is None
                if score_stats is None:
                    warning_msg = (
                        f"WARNING: Quality score metric '{metric_id}' "
                        "was excluded from quality score computation. "
                        "Reason: Missing score statistics data."
                    )
                    warnings.append(warning_msg)
                    logger.warning(warning_msg)
                    continue

                sample_size = score_stats.count

                # Skip metrics with zero samples
                if sample_size == 0:
                    warning_msg = (
                        f"WARNING: Quality score metric '{metric_id}' "
                        "was excluded from quality score computation. "
                        "Reason: Zero evaluation results for this metric."
                    )
                    warnings.append(warning_msg)
                    logger.warning(warning_msg)
                    continue

                weight = sample_size / total_samples
                quality_metrics[metric_id] = QualityMetricResult(
                    statistics=score_stats,
                    weight=weight,
                )
            else:
                stats = by_metric[metric_id].score_statistics
                if stats is not None:
                    extra_metrics[metric_id] = stats

        # Calculate aggregated quality score
        aggregated_score = QualityReport._calculate_quality_score(quality_metrics)

        return QualityReport(
            aggregated_quality_score=aggregated_score,
            quality_metrics=quality_metrics,
            extra_metrics=extra_metrics,
            warnings=warnings,
        )

    @staticmethod
    def _calculate_quality_score(
        quality_metrics: dict[str, QualityMetricResult],
    ) -> float:
        """Calculate weighted average quality score from quality metrics.

        Computes a weighted average where each metric's weight is proportional to its
        sample size relative to the total samples across all quality metrics.

        Args:
            quality_metrics: Dictionary of quality metric results with statistics and
                weights. Each metric contains statistics with a mean score and a weight
                (sample_size / total_samples).

        Returns:
            Weighted average quality score computed as sum of (mean * weight) for all metrics.
        """
        weighted_sum = 0.0
        for metric in quality_metrics.values():
            weighted_sum += metric.statistics.mean * metric.weight
        return weighted_sum
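

# Illustrative sketch of the weighting (metric names and numbers below are
# hypothetical, not taken from any real evaluation run):
#
#     metric      mean    count   weight = count / total
#     metric_a    0.80    30      30 / 40 = 0.75
#     metric_b    0.60    10      10 / 40 = 0.25
#
#     aggregated_quality_score = 0.80 * 0.75 + 0.60 * 0.25 = 0.75
#
# A metric listed in quality_score_metrics but missing from by_metric, lacking
# score statistics, or having zero samples is skipped with a warning; if every
# quality metric ends up with zero samples, create_report returns None.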