Skip to content

Commit fbfc4a7

Browse files
authored
Merge pull request #216 from x86girl/prgutier/fix-silent-score-mismatch
fix: Add retry configuration and warning for GEval score=None bug
2 parents 924c598 + e28e2b8 commit fbfc4a7

7 files changed

Lines changed: 475 additions & 21 deletions

File tree

src/lightspeed_evaluation/core/llm/deepeval.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
import litellm
1212
from deepeval.models import LiteLLMModel
13+
from tenacity import stop_after_attempt
1314

15+
from lightspeed_evaluation.core.constants import DEFAULT_LLM_RETRIES
1416
from lightspeed_evaluation.core.llm.litellm_patch import setup_litellm_ssl
1517

1618
logger = logging.getLogger(__name__)
@@ -46,15 +48,46 @@ def __init__(self, model_name: str, llm_params: dict[str, Any]):
4648
# LiteLLMModel stores **kwargs in self.kwargs and merges them into
4749
# every litellm.completion() call
4850
# Note: Forbidden keys are rejected at LLMParametersConfig load time
51+
52+
# Override DeepEval's hardcoded retry logic with user configuration
53+
# DeepEval uses @retry decorators that capture MAX_RETRIES at import time
54+
# We must patch the retry decorators after import but before instantiation
55+
num_retries = self.llm_params.get("num_retries", DEFAULT_LLM_RETRIES)
56+
57+
self._patch_deepeval_retries(num_retries)
58+
4959
self.llm_model = LiteLLMModel(
5060
model=self.model_name,
5161
timeout=self.llm_params.get("timeout"),
52-
num_retries=self.llm_params.get("num_retries"),
5362
**self.llm_params.get("parameters", {}),
5463
)
5564

5665
print(f"✅ DeepEval LLM Manager: {self.model_name}")
5766

67+
def _patch_deepeval_retries(self, max_retries: int) -> None:
68+
"""Monkey-patch DeepEval's retry decorators to use configured max_retries.
69+
70+
DeepEval's @retry decorators capture MAX_RETRIES at import time.
71+
We patch the 'stop' attribute on each retry decorator to use our value.
72+
"""
73+
# Patch the stop condition on all retry-decorated methods
74+
for method_name in [
75+
"generate",
76+
"a_generate",
77+
"generate_raw_response",
78+
"a_generate_raw_response",
79+
"generate_samples",
80+
]:
81+
method = getattr(LiteLLMModel, method_name)
82+
method.retry.stop = stop_after_attempt( # pylint: disable=no-member
83+
max_retries
84+
)
85+
86+
logger.info(
87+
"Patched DeepEval retry logic: max_retries=%d",
88+
max_retries,
89+
)
90+
5891
def setup_ssl_verify(self) -> None:
5992
"""Setup SSL verification based on LLM parameters.
6093

src/lightspeed_evaluation/core/metrics/deepeval.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,17 +87,34 @@ def _build_conversational_test_case(self, conv_data: Any) -> ConversationalTestC
8787

8888
return ConversationalTestCase(turns=turns)
8989

90-
def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]:
90+
def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float | None, str]:
9191
"""Evaluate and get result."""
9292
metric.measure(test_case)
9393
self.llm_manager.flush_deepevals_pending_tasks()
9494

95+
score = metric.score
9596
reason = (
9697
metric.reason
9798
if hasattr(metric, "reason") and metric.reason
98-
else f"Score: {metric.score:.2f}"
99+
else f"Score: {score:.2f}" if score is not None else "No score returned"
99100
)
100-
return metric.score, reason
101+
102+
# CRITICAL: Warn if score is None (indicates evaluation failure)
103+
# None scores indicate evaluation failures that need investigation:
104+
# - Rate limiting (429 errors)
105+
# - LLM judge returning malformed JSON that fails parsing
106+
# - Timeout errors from LLM provider
107+
# - API quota/credits exhausted
108+
if score is None:
109+
logger.warning(
110+
"%s metric returned None score. "
111+
"This typically indicates LLM judge failure (rate limiting, timeout, "
112+
"invalid JSON response, or quota exhausted). Reason: %s",
113+
metric.__class__.__name__,
114+
reason,
115+
)
116+
117+
return score, reason
101118

102119
def evaluate(
103120
self,
@@ -117,7 +134,8 @@ def evaluate(
117134
scope: EvaluationScope containing turn info and conversation flag
118135
119136
Returns:
120-
Tuple of (score, reason)
137+
tuple[float | None, str]: Tuple of (score, reason).
138+
Score is in [0, 1] or None if evaluation failed.
121139
"""
122140
# Route to standard DeepEval metrics
123141
if metric_name in self.supported_metrics:

src/lightspeed_evaluation/core/metrics/geval.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -292,17 +292,34 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
292292
# Create test case for a single turn
293293
test_case = LLMTestCase(**test_case_kwargs)
294294

295-
# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
295+
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
296296
try:
297297
metric.measure(test_case)
298298
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
299299

300-
score = metric.score if metric.score is not None else 0.0
300+
# Extract score and reason
301+
score = metric.score
301302
reason = (
302303
str(metric.reason)
303304
if hasattr(metric, "reason") and metric.reason
304305
else "No reason provided"
305306
)
307+
308+
# CRITICAL: Warn if score is None (indicates evaluation failure)
309+
# None scores indicate evaluation failures that need investigation:
310+
# - Rate limiting (429 errors after all retries exhausted)
311+
# - LLM judge returning malformed JSON that fails parsing
312+
# - Timeout errors from LLM provider
313+
# - API quota/credits exhausted
314+
# Warning helps identify these failures for debugging.
315+
if score is None:
316+
logger.warning(
317+
"GEval turn-level metric returned None score. "
318+
"This typically indicates LLM judge failure (rate limiting, timeout, "
319+
"invalid JSON response, or quota exhausted). Reason: %s",
320+
reason,
321+
)
322+
306323
return score, reason
307324
except Exception as e: # pylint: disable=W0718
308325
logger.error(
@@ -405,17 +422,29 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
405422
actual_output="\n".join(conversation_output),
406423
)
407424

408-
# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
425+
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
409426
try:
410427
metric.measure(test_case)
411428
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
412429

413-
score = metric.score if metric.score is not None else 0.0
430+
# Extract score and reason
431+
score = metric.score
414432
reason = (
415433
str(metric.reason)
416434
if hasattr(metric, "reason") and metric.reason
417435
else "No reason provided"
418436
)
437+
438+
# CRITICAL: Warn if score is None (indicates evaluation failure)
439+
# See turn-level evaluation for detailed explanation of why this matters
440+
if score is None:
441+
logger.warning(
442+
"GEval conversation-level metric returned None score. "
443+
"This typically indicates LLM judge failure (rate limiting, timeout, "
444+
"invalid JSON response, or quota exhausted). Reason: %s",
445+
reason,
446+
)
447+
419448
return score, reason
420449
except Exception as e: # pylint: disable=W0718
421450
logger.error(

tests/unit/core/llm/test_deepeval_manager.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Unit tests for DeepEval LLM Manager."""
22

3+
import logging
4+
35
import pytest
46
from pytest_mock import MockerFixture
57

@@ -95,3 +97,62 @@ def test_drop_params_always_enabled(self, mocker: MockerFixture) -> None:
9597
DeepEvalLLMManager("gpt-4", {})
9698

9799
assert mock_litellm.drop_params is True
100+
101+
def test_patch_deepeval_retries_called_with_configured_value(
102+
self, mocker: MockerFixture
103+
) -> None:
104+
"""Test that _patch_deepeval_retries patches LiteLLMModel retry logic."""
105+
# Mock LiteLLMModel with methods that have retry decorators
106+
mock_generate = mocker.Mock()
107+
mock_generate.retry = mocker.Mock()
108+
mock_a_generate = mocker.Mock()
109+
mock_a_generate.retry = mocker.Mock()
110+
mock_generate_raw = mocker.Mock()
111+
mock_generate_raw.retry = mocker.Mock()
112+
mock_a_generate_raw = mocker.Mock()
113+
mock_a_generate_raw.retry = mocker.Mock()
114+
mock_generate_samples = mocker.Mock()
115+
mock_generate_samples.retry = mocker.Mock()
116+
117+
mock_litellm_class = mocker.patch(
118+
"lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
119+
)
120+
mock_litellm_class.generate = mock_generate
121+
mock_litellm_class.a_generate = mock_a_generate
122+
mock_litellm_class.generate_raw_response = mock_generate_raw
123+
mock_litellm_class.a_generate_raw_response = mock_a_generate_raw
124+
mock_litellm_class.generate_samples = mock_generate_samples
125+
126+
# Mock stop_after_attempt to return a mock stop condition
127+
mock_stop_condition = mocker.Mock()
128+
mock_stop_after_attempt = mocker.patch(
129+
"lightspeed_evaluation.core.llm.deepeval.stop_after_attempt",
130+
return_value=mock_stop_condition,
131+
)
132+
133+
params = {"num_retries": 5}
134+
DeepEvalLLMManager("gpt-4", params)
135+
136+
# Verify stop_after_attempt was called 5 times (once per method) with the configured retries
137+
assert mock_stop_after_attempt.call_count == 5
138+
mock_stop_after_attempt.assert_called_with(5)
139+
140+
# Verify each method's retry.stop was set to the new stop condition
141+
assert mock_generate.retry.stop == mock_stop_condition
142+
assert mock_a_generate.retry.stop == mock_stop_condition
143+
assert mock_generate_raw.retry.stop == mock_stop_condition
144+
assert mock_a_generate_raw.retry.stop == mock_stop_condition
145+
assert mock_generate_samples.retry.stop == mock_stop_condition
146+
147+
def test_patch_deepeval_retries_logs_operation(
148+
self, mocker: MockerFixture, caplog: pytest.LogCaptureFixture
149+
) -> None:
150+
"""Test that retry patching logs the max_retries value."""
151+
mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel")
152+
153+
params = {"num_retries": 7}
154+
155+
with caplog.at_level(logging.INFO):
156+
DeepEvalLLMManager("gpt-4", params)
157+
158+
assert "Patched DeepEval retry logic: max_retries=7" in caplog.text

tests/unit/core/metrics/conftest.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
"""Pytest configuration and fixtures for metrics tests."""
44

55
import sys
6+
from typing import Any
67

78
import pytest
89
from pytest_mock import MockerFixture
910

11+
from lightspeed_evaluation.core.metrics.deepeval import DeepEvalMetrics
1012
from lightspeed_evaluation.core.metrics.nlp import NLPMetrics
1113
from lightspeed_evaluation.core.models import EvaluationScope, SystemConfig, TurnData
1214

@@ -154,3 +156,67 @@ def mock_similarity_scorer(mocker: MockerFixture) -> MockerFixture:
154156
return_value=mock_scorer_instance,
155157
)
156158
return mock_scorer_instance
159+
160+
161+
@pytest.fixture
162+
def mock_llm_manager(mocker: MockerFixture) -> Any:
163+
"""Create a mock LLMManager for DeepEval tests."""
164+
mock_manager = mocker.MagicMock()
165+
mock_config = mocker.MagicMock()
166+
mock_config.cache_enabled = False
167+
mock_manager.get_config.return_value = mock_config
168+
mock_manager.get_model_name.return_value = "gpt-4"
169+
mock_manager.get_llm_params.return_value = {"num_retries": 3}
170+
return mock_manager
171+
172+
173+
@pytest.fixture
174+
def mock_metric_manager(mocker: MockerFixture) -> Any:
175+
"""Create a mock MetricManager."""
176+
return mocker.MagicMock()
177+
178+
179+
@pytest.fixture
180+
def mock_deepeval_llm_manager(mocker: MockerFixture) -> Any:
181+
"""Create a mock DeepEvalLLMManager."""
182+
mock_manager = mocker.MagicMock()
183+
mock_llm = mocker.MagicMock()
184+
mock_manager.get_llm.return_value = mock_llm
185+
mock_manager.flush_deepevals_pending_tasks.return_value = None
186+
return mock_manager
187+
188+
189+
@pytest.fixture
190+
def mock_conv_data(mocker: MockerFixture) -> Any:
191+
"""Create mock conversation data with turns."""
192+
turn1 = mocker.MagicMock()
193+
turn1.query = "What is AI?"
194+
turn1.response = "AI stands for Artificial Intelligence."
195+
196+
turn2 = mocker.MagicMock()
197+
turn2.query = "Can you explain more?"
198+
turn2.response = "AI is the simulation of human intelligence by machines."
199+
200+
conv_data = mocker.MagicMock()
201+
conv_data.turns = [turn1, turn2]
202+
return conv_data
203+
204+
205+
@pytest.fixture
206+
def deepeval_metrics(
207+
mock_llm_manager: Any,
208+
mock_metric_manager: Any,
209+
mock_deepeval_llm_manager: Any,
210+
mocker: MockerFixture,
211+
) -> DeepEvalMetrics:
212+
"""Create DeepEvalMetrics instance with mocked dependencies."""
213+
mocker.patch(
214+
"lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager",
215+
return_value=mock_deepeval_llm_manager,
216+
)
217+
mocker.patch("lightspeed_evaluation.core.metrics.deepeval.GEvalHandler")
218+
219+
return DeepEvalMetrics(
220+
llm_manager=mock_llm_manager,
221+
metric_manager=mock_metric_manager,
222+
)

0 commit comments

Comments
 (0)