Commit f951dac

fix: Simplify judge structured output to improve reliability of judge scores for some LLMs (#105)
fix: Improve metric token collection for Judge evaluations when using LangChain
fix: Improve raw response when performing Judge evaluations using LangChain
1 parent a14761d commit f951dac

5 files changed: 147 additions, 199 deletions

packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py

Lines changed: 60 additions & 41 deletions
@@ -83,44 +83,39 @@ async def invoke_structured_model(
         :param response_structure: Dictionary defining the output structure
         :return: StructuredResponse containing the structured data
         """
+        structured_response = StructuredResponse(
+            data={},
+            raw_response='',
+            metrics=LDAIMetrics(success=False, usage=None),
+        )
         try:
             langchain_messages = LangChainProvider.convert_messages_to_langchain(messages)
-            structured_llm = self._llm.with_structured_output(response_structure)
+            structured_llm = self._llm.with_structured_output(response_structure, include_raw=True)
             response = await structured_llm.ainvoke(langchain_messages)

             if not isinstance(response, dict):
                 log.warning(
                     f'Structured output did not return a dict. '
                     f'Got: {type(response)}'
                 )
-                return StructuredResponse(
-                    data={},
-                    raw_response='',
-                    metrics=LDAIMetrics(
-                        success=False,
-                        usage=TokenUsage(total=0, input=0, output=0),
-                    ),
-                )
+                return structured_response

-            return StructuredResponse(
-                data=response,
-                raw_response=str(response),
-                metrics=LDAIMetrics(
-                    success=True,
-                    usage=TokenUsage(total=0, input=0, output=0),
-                ),
-            )
+            raw_response = response.get('raw')
+            if raw_response is not None:
+                if hasattr(raw_response, 'content'):
+                    structured_response.raw_response = raw_response.content
+                structured_response.metrics.usage = LangChainProvider.get_ai_usage_from_response(raw_response)
+
+            if response.get('parsing_error'):
+                log.warning(f'LangChain structured model invocation had a parsing error')
+                return structured_response
+
+            structured_response.metrics.success = True
+            structured_response.data = response.get('parsed') or {}
+            return structured_response
         except Exception as error:
             log.warning(f'LangChain structured model invocation failed: {error}')
-
-            return StructuredResponse(
-                data={},
-                raw_response='',
-                metrics=LDAIMetrics(
-                    success=False,
-                    usage=TokenUsage(total=0, input=0, output=0),
-                ),
-            )
+            return structured_response

     def get_chat_model(self) -> BaseChatModel:
         """
@@ -135,20 +130,47 @@ def map_provider(ld_provider_name: str) -> str:
         """
         Map LaunchDarkly provider names to LangChain provider names.

-        This method enables seamless integration between LaunchDarkly's standardized
-        provider naming and LangChain's naming conventions.
-
         :param ld_provider_name: LaunchDarkly provider name
         :return: LangChain-compatible provider name
         """
         lowercased_name = ld_provider_name.lower()
+        # Bedrock is the only provider that uses "provider:model_family" (e.g. Bedrock:Anthropic).
+        if lowercased_name.startswith('bedrock:'):
+            return 'bedrock_converse'

         mapping: Dict[str, str] = {
             'gemini': 'google-genai',
+            'bedrock': 'bedrock_converse',
         }
-
         return mapping.get(lowercased_name, lowercased_name)

+    @staticmethod
+    def get_ai_usage_from_response(response: BaseMessage) -> TokenUsage:
+        """
+        Get token usage from a LangChain provider response.
+
+        :param response: The response from the LangChain model
+        :return: TokenUsage with success status and token usage
+        """
+        # Extract token usage if available
+        usage: Optional[TokenUsage] = None
+        if hasattr(response, 'usage_metadata') and response.usage_metadata:
+            usage = TokenUsage(
+                total=response.usage_metadata.get('total_tokens', 0),
+                input=response.usage_metadata.get('input_tokens', 0),
+                output=response.usage_metadata.get('output_tokens', 0),
+            )
+        if not usage and hasattr(response, 'response_metadata') and response.response_metadata:
+            token_usage = response.response_metadata.get('tokenUsage') or response.response_metadata.get('token_usage')
+            if token_usage:
+                usage = TokenUsage(
+                    total=token_usage.get('totalTokens', 0) or token_usage.get('total_tokens', 0),
+                    input=token_usage.get('promptTokens', 0) or token_usage.get('prompt_tokens', 0),
+                    output=token_usage.get('completionTokens', 0) or token_usage.get('completion_tokens', 0),
+                )
+
+        return usage
+
     @staticmethod
     def get_ai_metrics_from_response(response: BaseMessage) -> LDAIMetrics:
         """
@@ -168,15 +190,7 @@ def get_ai_metrics_from_response(response: BaseMessage) -> LDAIMetrics:
         )
         """
         # Extract token usage if available
-        usage: Optional[TokenUsage] = None
-        if hasattr(response, 'response_metadata') and response.response_metadata:
-            token_usage = response.response_metadata.get('tokenUsage') or response.response_metadata.get('token_usage')
-            if token_usage:
-                usage = TokenUsage(
-                    total=token_usage.get('totalTokens', 0) or token_usage.get('total_tokens', 0),
-                    input=token_usage.get('promptTokens', 0) or token_usage.get('prompt_tokens', 0),
-                    output=token_usage.get('completionTokens', 0) or token_usage.get('completion_tokens', 0),
-                )
+        usage = LangChainProvider.get_ai_usage_from_response(response)

         return LDAIMetrics(success=True, usage=usage)

@@ -227,10 +241,15 @@ def create_langchain_model(ai_config: AIConfigKind) -> BaseChatModel:

         model_name = model_dict.get('name', '')
         provider = provider_dict.get('name', '')
-        parameters = model_dict.get('parameters') or {}
+        parameters = dict(model_dict.get('parameters') or {})
+        mapped_provider = LangChainProvider.map_provider(provider)

+        # Bedrock requires the foundation provider (e.g. Bedrock:Anthropic) passed in
+        # parameters separately from model_provider, which is used for LangChain routing.
+        if mapped_provider == 'bedrock_converse' and 'provider' not in parameters:
+            parameters['provider'] = provider.removeprefix('bedrock:')
         return init_chat_model(
             model_name,
-            model_provider=LangChainProvider.map_provider(provider),
+            model_provider=mapped_provider,
             **parameters,
         )
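
Not part of the diff: roughly the call the provider now builds for a Bedrock AI Config, assuming langchain-aws is installed, AWS credentials are configured, and the model id is only an example. The provider keyword mirrors what the hunk above adds to parameters:

from langchain.chat_models import init_chat_model

# An AI Config with provider "Bedrock:Anthropic" ends up routed roughly like this:
llm = init_chat_model(
    'anthropic.claude-3-5-sonnet-20240620-v1:0',  # model name from the AI Config (example)
    model_provider='bedrock_converse',            # map_provider('Bedrock:Anthropic')
    provider='anthropic',                         # foundation-model family forwarded via parameters
)
print(type(llm).__name__)  # ChatBedrockConverse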

packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py

Lines changed: 12 additions & 4 deletions
@@ -130,6 +130,14 @@ def test_maps_gemini_to_google_genai(self):
         assert LangChainProvider.map_provider('Gemini') == 'google-genai'
         assert LangChainProvider.map_provider('GEMINI') == 'google-genai'

+    def test_maps_bedrock_and_model_families_to_bedrock_converse(self):
+        """Should map bedrock and bedrock:model_family to bedrock_converse."""
+        assert LangChainProvider.map_provider('bedrock') == 'bedrock_converse'
+        assert LangChainProvider.map_provider('Bedrock:Anthropic') == 'bedrock_converse'
+        assert LangChainProvider.map_provider('bedrock:anthropic') == 'bedrock_converse'
+        assert LangChainProvider.map_provider('bedrock:amazon') == 'bedrock_converse'
+        assert LangChainProvider.map_provider('bedrock:cohere') == 'bedrock_converse'
+
     def test_returns_provider_name_unchanged_for_unmapped_providers(self):
         """Should return provider name unchanged for unmapped providers."""
         assert LangChainProvider.map_provider('openai') == 'openai'

@@ -197,7 +205,8 @@ def mock_llm(self):
     @pytest.mark.asyncio
     async def test_returns_success_true_for_successful_invocation(self, mock_llm):
         """Should return success=True for successful invocation."""
-        mock_response = {'result': 'structured data'}
+        parsed_data = {'result': 'structured data'}
+        mock_response = {'parsed': parsed_data, 'raw': None}
         mock_structured_llm = MagicMock()
         mock_structured_llm.ainvoke = AsyncMock(return_value=mock_response)
         mock_llm.with_structured_output = MagicMock(return_value=mock_structured_llm)

@@ -208,7 +217,7 @@ async def test_returns_success_true_for_successful_invocation(self, mock_llm):
         result = await provider.invoke_structured_model(messages, response_structure)

         assert result.metrics.success is True
-        assert result.data == mock_response
+        assert result.data == parsed_data

     @pytest.mark.asyncio
     async def test_returns_success_false_when_structured_model_invocation_throws_error(self, mock_llm):

@@ -226,8 +235,7 @@ async def test_returns_success_false_when_structured_model_invocation_throws_err
         assert result.metrics.success is False
         assert result.data == {}
         assert result.raw_response == ''
-        assert result.metrics.usage is not None
-        assert result.metrics.usage.total == 0
+        assert result.metrics.usage is None


 class TestGetChatModel:

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 9 additions & 31 deletions
@@ -37,7 +37,7 @@ def __init__(
         self._ai_config = ai_config
         self._ai_config_tracker = ai_config_tracker
         self._ai_provider = ai_provider
-        self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key)
+        self._evaluation_response_structure = EvaluationSchemaBuilder.build()

     async def evaluate(
         self,

@@ -77,10 +77,9 @@ async def evaluate(
         )

         success = response.metrics.success
-
         evals = self._parse_evaluation_response(response.data)

-        if self._ai_config.evaluation_metric_key not in evals:
+        if not evals:
             log.warn('Judge evaluation did not return the expected evaluation')
             success = False


@@ -175,47 +174,26 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:

     def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
         """
-        Parses the structured evaluation response from the AI provider.
-
-        :param data: The structured response data
-        :return: Dictionary of evaluation scores keyed by metric key
+        Parses the structured evaluation response. Expects {"score": n, "reasoning": "..."}.
         """
         results: Dict[str, EvalScore] = {}
-
-        if not data.get('evaluations') or not isinstance(data['evaluations'], dict):
-            log.warn('Invalid response: missing or invalid evaluations object')
-            return results
-
-        evaluations = data['evaluations']
-
         metric_key = self._ai_config.evaluation_metric_key
         if not metric_key:
             log.warn('Evaluation metric key is missing')
             return results

-        evaluation = evaluations.get(metric_key)
-
-        if not evaluation or not isinstance(evaluation, dict):
-            log.warn(f'Missing evaluation for metric key: {metric_key}')
+        if not isinstance(data, dict):
+            log.warn('Invalid response: missing or invalid evaluation')
             return results

-        score = evaluation.get('score')
-        reasoning = evaluation.get('reasoning')
-
+        score = data.get('score')
+        reasoning = data.get('reasoning')
         if not isinstance(score, (int, float)) or score < 0 or score > 1:
-            log.warn(
-                f'Invalid score evaluated for {metric_key}: {score}. '
-                'Score must be a number between 0 and 1 inclusive'
-            )
+            log.warn(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive')
             return results
-
         if not isinstance(reasoning, str):
-            log.warn(
-                f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
-                'Reasoning must be a string'
-            )
+            log.warn('Invalid reasoning: must be a string')
             return results

         results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
-
         return results
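
Not part of the diff: the structured-output shapes this parser handled before and after the change. Here 'relevance' stands in for a hypothetical evaluation_metric_key; the judge itself now keys the EvalScore by that config value:

# Old shape: nested under 'evaluations' and keyed by the metric key inside the response.
old_response = {
    'evaluations': {
        'relevance': {'score': 0.85, 'reasoning': 'The response is accurate.'},
    },
}

# New shape: a flat score/reasoning object. _parse_evaluation_response turns it into
# {'relevance': EvalScore(score=0.85, reasoning='The response is accurate.')}.
new_response = {'score': 0.85, 'reasoning': 'The response is accurate.'}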

packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py

Lines changed: 17 additions & 51 deletions
@@ -1,77 +1,43 @@
-"""Internal class for building dynamic evaluation response schemas."""
+"""Internal class for building evaluation response schemas."""

-from typing import Any, Dict, Optional
+from typing import Any, Dict


 class EvaluationSchemaBuilder:
     """
-    Internal class for building dynamic evaluation response schemas.
+    Internal class for building evaluation response schemas.
     Not exported - only used internally by Judge.
+    Schema is a fixed shape: top-level score and reasoning.
+    The judge config's evaluation_metric_key is only used when keying the result,
+    not in the schema.
     """

     @staticmethod
-    def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]:
+    def build() -> Dict[str, Any]:
         """
-        Build an evaluation response schema from evaluation metric key.
+        Build the evaluation response schema. No parameters; the schema is
+        always the same. The judge keys the parsed result by its config's
+        evaluation_metric_key.

-        :param evaluation_metric_key: Evaluation metric key, or None if not available
-        :return: Schema dictionary for structured output, or None if evaluation_metric_key is None
-        """
-        if not evaluation_metric_key:
-            return None
+        In practice the model returns JSON like:
+        {"score": 0.85, "reasoning": "The response is accurate."}

-        return {
-            'title': 'EvaluationResponse',
-            'description': f"Response containing evaluation results for {evaluation_metric_key} metric",
-            'type': 'object',
-            'properties': {
-                'evaluations': {
-                    'type': 'object',
-                    'description': (
-                        f"Object containing evaluation results for "
-                        f"{evaluation_metric_key} metric"
-                    ),
-                    'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_key),
-                    'required': [evaluation_metric_key],
-                    'additionalProperties': False,
-                },
-            },
-            'required': ['evaluations'],
-            'additionalProperties': False,
-        }
-
-    @staticmethod
-    def _build_key_properties(evaluation_metric_key: str) -> Dict[str, Any]:
-        """
-        Build properties for a single evaluation metric key.
-
-        :param evaluation_metric_key: Evaluation metric key
-        :return: Dictionary of properties for the key
-        """
-        return {
-            evaluation_metric_key: EvaluationSchemaBuilder._build_key_schema(evaluation_metric_key)
-        }
-
-    @staticmethod
-    def _build_key_schema(key: str) -> Dict[str, Any]:
-        """
-        Build schema for a single evaluation metric key.
-
-        :param key: Evaluation metric key
-        :return: Schema dictionary for the key
+        :return: Schema dictionary for structured output
         """
         return {
+            'title': 'EvaluationResponse',
+            'description': 'Response containing an evaluation (score and reasoning).',
             'type': 'object',
             'properties': {
                 'score': {
                     'type': 'number',
                     'minimum': 0,
                     'maximum': 1,
-                    'description': f'Score between 0.0 and 1.0 for {key}',
+                    'description': 'Score between 0.0 and 1.0.',
                 },
                 'reasoning': {
                     'type': 'string',
-                    'description': f'Reasoning behind the score for {key}',
+                    'description': 'Reasoning behind the score.',
                 },
             },
             'required': ['score', 'reasoning'],