"""Shared utilities for output and evaluation."""
import statistics
from typing import Any
import numpy as np
import pandas as pd
from lightspeed_evaluation.core.models import EvaluationData, EvaluationResult


def bootstrap_intervals(
    s: pd.Series, confidence: float = 95, bootstrap_steps: int = 10000
) -> tuple[np.floating, np.floating, np.floating]:
    """Compute a confidence interval using bootstrapping; return (low, mean, high)."""
    if not 0 <= confidence <= 100:
        raise ValueError("Invalid confidence, must be between 0 and 100")

    sample_n = len(s)
    sample_mean = np.mean(s)
    confidence_rev = 100 - confidence

    rates = np.array(
        [np.mean(s.sample(n=sample_n, replace=True)) for _ in range(bootstrap_steps)]
    )
    # Median (not mean) of the bootstrap distribution is intentional here
    mean_boot_strap = np.median(rates)
    low = np.percentile(rates - sample_mean, (confidence_rev / 2.0))
    high = np.percentile(rates - sample_mean, 100 - (confidence_rev / 2.0))
    # `high` represents the lower bound and `low` the upper bound,
    # hence the swapped subtraction below
    return sample_mean - high, mean_boot_strap, sample_mean - low
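
# A usage sketch for the basic (reverse-percentile) bootstrap above, kept as a
# comment so nothing executes at import time; the scores are made-up values.
#
#     scores = pd.Series([0.72, 0.85, 0.91, 0.64, 0.88])
#     low, center, high = bootstrap_intervals(scores, confidence=95)
#     # `center` is the median of the resampled means; (`low`, `high`) is an
#     # approximate 95% confidence interval for the mean. Raising
#     # bootstrap_steps smooths the estimate at the cost of runtime.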


def calculate_basic_stats(results: list[EvaluationResult]) -> dict[str, Any]:
    """Calculate basic pass/fail/error/skipped statistics from results."""
    if not results:
        return {
            "TOTAL": 0,
            "PASS": 0,
            "FAIL": 0,
            "ERROR": 0,
            "SKIPPED": 0,
            "pass_rate": 0.0,
            "fail_rate": 0.0,
            "error_rate": 0.0,
            "skipped_rate": 0.0,
            "total_judge_llm_input_tokens": 0,
            "total_judge_llm_output_tokens": 0,
            "total_judge_llm_tokens": 0,
            "total_embedding_tokens": 0,
        }

    total = len(results)
    pass_count = sum(1 for r in results if r.result == "PASS")
    fail_count = sum(1 for r in results if r.result == "FAIL")
    error_count = sum(1 for r in results if r.result == "ERROR")
    skipped_count = sum(1 for r in results if r.result == "SKIPPED")

    # Calculate token totals
    total_judge_input = sum(r.judge_llm_input_tokens for r in results)
    total_judge_output = sum(r.judge_llm_output_tokens for r in results)
    total_embedding = sum(r.embedding_tokens for r in results)

    return {
        "TOTAL": total,
        "PASS": pass_count,
        "FAIL": fail_count,
        "ERROR": error_count,
        "SKIPPED": skipped_count,
        "pass_rate": (pass_count / total) * 100 if total > 0 else 0,
        "fail_rate": (fail_count / total) * 100 if total > 0 else 0,
        "error_rate": (error_count / total) * 100 if total > 0 else 0,
        "skipped_rate": (skipped_count / total) * 100 if total > 0 else 0,
        "total_judge_llm_input_tokens": total_judge_input,
        "total_judge_llm_output_tokens": total_judge_output,
        "total_judge_llm_tokens": total_judge_input + total_judge_output,
        "total_embedding_tokens": total_embedding,
    }


def calculate_detailed_stats(results: list[EvaluationResult]) -> dict[str, Any]:
    """Calculate detailed statistics broken down by different categories."""
    if not results:
        return {"by_metric": {}, "by_conversation": {}, "by_tag": {}}

    by_metric: dict[str, dict[str, Any]] = {}
    by_conversation: dict[str, dict[str, Any]] = {}
    by_tag: dict[str, dict[str, Any]] = {}

    # Collect data using generic update function
    for result in results:
        _update_stats(by_metric, result.metric_identifier, result, include_scores=True)
        _update_stats(by_conversation, result.conversation_group_id, result)
        _update_stats(by_tag, result.tag, result, include_scores=True)

    # Finalize statistics for each group
    for stats in by_metric.values():
        _finalize_group_stats(stats, include_scores=True)

    # Note: Conversations don't include score_statistics with confidence intervals.
    # To calculate CI for conversations, we would need to reconstruct the original
    # results for each conversation to create binary series for each outcome type.
    # This could be enhanced by passing the original results to this function.
    for stats in by_conversation.values():
        _finalize_group_stats(stats)

    for stats in by_tag.values():
        _finalize_group_stats(stats, include_scores=True)

    return {
        "by_metric": by_metric,
        "by_conversation": by_conversation,
        "by_tag": by_tag,
    }
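
# Shape of the returned structure (values illustrative; "response_quality" is a
# hypothetical metric identifier):
#
#     {"by_metric": {"response_quality": {"pass": 9, "fail": 1, "error": 0,
#                                         "skipped": 0, "scores": [...],
#                                         "pass_rate": 90.0, ...,
#                                         "score_statistics": {...}}},
#      "by_conversation": {...},
#      "by_tag": {...}}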


def _create_empty_stats(*, include_scores: bool = False) -> dict[str, Any]:
    """Create empty statistics dictionary.

    Args:
        include_scores: Whether to include a scores list for score tracking.
    """
    stats: dict[str, Any] = {
        "pass": 0,
        "fail": 0,
        "error": 0,
        "skipped": 0,
    }
    if include_scores:
        stats["scores"] = []
    return stats


def _update_stats(
    stats_dict: dict[str, dict[str, Any]],
    key: str,
    result: EvaluationResult,
    *,
    include_scores: bool = False,
) -> None:
    """Update statistics dictionary with a result.

    Args:
        stats_dict: Dictionary mapping keys to their statistics.
        key: The key to update (e.g., metric_identifier, conversation_group_id).
        result: The evaluation result to add.
        include_scores: Whether to track individual scores.
    """
    if key not in stats_dict:
        stats_dict[key] = _create_empty_stats(include_scores=include_scores)

    stats = stats_dict[key]
    stats[result.result.lower()] += 1
    if include_scores and result.score is not None:
        stats["scores"].append(result.score)


def _calculate_rates(stats: dict[str, Any]) -> None:
    """Calculate pass/fail/error/skipped rates for a stats dictionary."""
    total = stats["pass"] + stats["fail"] + stats["error"] + stats["skipped"]
    if total > 0:
        stats["pass_rate"] = stats["pass"] / total * 100
        stats["fail_rate"] = stats["fail"] / total * 100
        stats["error_rate"] = stats["error"] / total * 100
        stats["skipped_rate"] = stats["skipped"] / total * 100
    else:
        stats["pass_rate"] = 0.0
        stats["fail_rate"] = 0.0
        stats["error_rate"] = 0.0
        stats["skipped_rate"] = 0.0


def _calculate_numeric_stats(values: list[float]) -> dict[str, Any]:
    """Calculate basic numeric statistics for a list of values.

    Args:
        values: List of numeric values.

    Returns:
        Dictionary containing count, mean, median, std, min, max, p95, p99.
    """
    if not values:
        return {"count": 0}

    return {
        "count": len(values),
        "mean": statistics.mean(values),
        "median": statistics.median(values),
        "std": statistics.stdev(values) if len(values) > 1 else 0.0,
        "min": min(values),
        "max": max(values),
        "p95": float(np.percentile(values, 95)),
        "p99": float(np.percentile(values, 99)),
    }
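
# Worked example with values chosen for easy checking:
# _calculate_numeric_stats([1.0, 2.0, 3.0, 4.0]) returns count=4, mean=2.5,
# median=2.5, std=statistics.stdev([1, 2, 3, 4]) ≈ 1.291, min=1.0, max=4.0,
# and, with numpy's default linear interpolation, p95=3.85 and p99=3.97.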


def _calculate_score_statistics(scores: list[float]) -> dict[str, Any]:
    """Calculate score statistics with confidence intervals.

    Args:
        scores: List of score values.

    Returns:
        Dictionary containing mean, median, std, min, max, count, and confidence_interval.
    """
    if not scores:
        return {
            "mean": 0.0,
            "median": 0.0,
            "std": 0.0,
            "min": 0.0,
            "max": 0.0,
            "count": 0,
            "confidence_interval": None,
        }

    score_stats = _calculate_numeric_stats(scores)
    score_stats["confidence_interval"] = None

    # Calculate confidence intervals using bootstrap
    if len(scores) > 1:  # Need at least 2 samples for meaningful bootstrap
        try:
            scores_series = pd.Series(scores)
            ci_low, ci_mean, ci_high = bootstrap_intervals(scores_series)
            score_stats["confidence_interval"] = {
                "low": float(ci_low),
                "mean": float(ci_mean),
                "high": float(ci_high),
                "confidence_level": 95,
            }
        except (ValueError, RuntimeError):
            pass  # confidence_interval already set to None

    return score_stats
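
# For reference, a populated result looks like the following (numbers are
# illustrative; the interval varies between runs because bootstrap resampling
# is random):
#
#     {"count": 12, "mean": 0.81, "median": 0.84, "std": 0.07, "min": 0.65,
#      "max": 0.92, "p95": 0.91, "p99": 0.92,
#      "confidence_interval": {"low": 0.77, "mean": 0.81, "high": 0.85,
#                              "confidence_level": 95}}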


def _finalize_group_stats(
    stats: dict[str, Any], *, include_scores: bool = False
) -> None:
    """Finalize statistics for a group (calculate rates and optionally score stats).

    Args:
        stats: Statistics dictionary to finalize.
        include_scores: Whether to calculate score statistics.
    """
    _calculate_rates(stats)
    if include_scores:
        stats["score_statistics"] = _calculate_score_statistics(stats.get("scores", []))


def calculate_api_token_usage(evaluation_data: list[EvaluationData]) -> dict[str, Any]:
    """Calculate total API token usage from evaluation data.

    Args:
        evaluation_data: List of evaluation data containing turn-level API token counts.

    Returns:
        Dictionary containing total_api_input_tokens, total_api_output_tokens,
        and total_api_tokens.
    """
    total_input_tokens = 0
    total_output_tokens = 0

    for conv_data in evaluation_data:
        for turn in conv_data.turns:
            total_input_tokens += turn.api_input_tokens
            total_output_tokens += turn.api_output_tokens

    return {
        "total_api_input_tokens": total_input_tokens,
        "total_api_output_tokens": total_output_tokens,
        "total_api_tokens": total_input_tokens + total_output_tokens,
    }


def calculate_streaming_stats(
    evaluation_data: list[EvaluationData],
) -> dict[str, Any]:
    """Calculate streaming performance statistics from evaluation data.

    Args:
        evaluation_data: List of evaluation data containing turn-level streaming metrics.

    Returns:
        Dictionary containing streaming performance statistics (TTFT, duration, throughput).
    """
    ttft_values: list[float] = []
    duration_values: list[float] = []
    throughput_values: list[float] = []

    for conv_data in evaluation_data:
        for turn in conv_data.turns:
            if turn.time_to_first_token is not None:
                ttft_values.append(turn.time_to_first_token)
            if turn.streaming_duration is not None:
                duration_values.append(turn.streaming_duration)
            if turn.tokens_per_second is not None:
                throughput_values.append(turn.tokens_per_second)

    return {
        "time_to_first_token": _calculate_numeric_stats(ttft_values),
        "streaming_duration": _calculate_numeric_stats(duration_values),
        "tokens_per_second": _calculate_numeric_stats(throughput_values),
    }


def calculate_api_latency_stats(results: list[EvaluationResult]) -> dict[str, Any]:
    """Calculate API latency statistics from evaluation results.

    Args:
        results: List of evaluation results containing API latency data.

    Returns:
        Dictionary containing API latency statistics with percentiles (p50, p95, p99).
    """
    api_latency_values: list[float] = []
    for result in results:
        if result.api_latency > 0:
            api_latency_values.append(result.api_latency)

    return _calculate_numeric_stats(api_latency_values)