Skip to content

Commit 38b6cae

Browse files
committed
fix: add deepseek-v4 models, fix calculate_cost, and improve error handling
- Register deepseek-v4-flash and deepseek-v4-pro in DEEPSEEK_MODELS_DATA
- Fix calculate_cost() across all 8 providers to return EvaluationCost(0.0, ...) instead of None when pricing is unknown (Anthropic, OpenAI, Azure, Gemini, Kimi, Grok, Amazon Bedrock, DeepSeek)
- Fix ContextGenerator to raise DeepEvalError when ALL documents fail instead of silently returning empty contexts
- Fix Synthesizer to raise DeepEvalError when contexts are empty

deepseek-chat and deepseek-reasoner will be deprecated on 2026-07-24 in favor of v4-flash and v4-pro, so this is needed for forward compatibility.
1 parent dfe4122 commit 38b6cae

18 files changed

Lines changed: 180 additions & 74 deletions

deepeval/models/llms/amazon_bedrock_model.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -350,14 +350,20 @@ def get_converse_request_body(self, prompt: str) -> dict:
350350

351351
def calculate_cost(
352352
self, input_tokens: int, output_tokens: int
353-
) -> Optional[float]:
354-
if self.model_data.input_price and self.model_data.output_price:
355-
input_cost = input_tokens * self.model_data.input_price
356-
output_cost = output_tokens * self.model_data.output_price
357-
return EvaluationCost(
358-
input_cost + output_cost, input_tokens, output_tokens
359-
)
360-
return None
353+
) -> float:
354+
input_cost = (
355+
input_tokens * self.model_data.input_price
356+
if self.model_data.input_price
357+
else 0.0
358+
)
359+
output_cost = (
360+
output_tokens * self.model_data.output_price
361+
if self.model_data.output_price
362+
else 0.0
363+
)
364+
return EvaluationCost(
365+
input_cost + output_cost, input_tokens, output_tokens
366+
)
361367

362368
def load_model(self):
363369
pass

deepeval/models/llms/anthropic_model.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -236,12 +236,19 @@ def generate_content(self, multimodal_input: List[Union[str, MLLMImage]]):
236236
###############################################
237237

238238
def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
239-
if self.model_data.input_price and self.model_data.output_price:
240-
input_cost = input_tokens * self.model_data.input_price
241-
output_cost = output_tokens * self.model_data.output_price
242-
return EvaluationCost(
243-
input_cost + output_cost, input_tokens, output_tokens
244-
)
239+
input_cost = (
240+
input_tokens * self.model_data.input_price
241+
if self.model_data.input_price
242+
else 0.0
243+
)
244+
output_cost = (
245+
output_tokens * self.model_data.output_price
246+
if self.model_data.output_price
247+
else 0.0
248+
)
249+
return EvaluationCost(
250+
input_cost + output_cost, input_tokens, output_tokens
251+
)
245252

246253
#########################
247254
# Capabilities #

deepeval/models/llms/azure_model.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -435,12 +435,19 @@ def generate_content(
435435
###############################################
436436

437437
def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
438-
if self.model_data.input_price and self.model_data.output_price:
439-
input_cost = input_tokens * self.model_data.input_price
440-
output_cost = output_tokens * self.model_data.output_price
441-
return EvaluationCost(
442-
input_cost + output_cost, input_tokens, output_tokens
443-
)
438+
input_cost = (
439+
input_tokens * self.model_data.input_price
440+
if self.model_data.input_price
441+
else 0.0
442+
)
443+
output_cost = (
444+
output_tokens * self.model_data.output_price
445+
if self.model_data.output_price
446+
else 0.0
447+
)
448+
return EvaluationCost(
449+
input_cost + output_cost, input_tokens, output_tokens
450+
)
444451

445452
###############################################
446453
# Capabilities

deepeval/models/llms/constants.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,6 +1059,22 @@ def make_model_data(**kwargs: Any) -> ModelDataFactory:
10591059
input_price=1.00 / 1e6,
10601060
output_price=2.00 / 1e6,
10611061
),
1062+
"deepseek-v4-flash": make_model_data(
1063+
supports_log_probs=False,
1064+
supports_multimodal=False,
1065+
supports_structured_outputs=True,
1066+
supports_json=True,
1067+
input_price=None,
1068+
output_price=None,
1069+
),
1070+
"deepseek-v4-pro": make_model_data(
1071+
supports_log_probs=False,
1072+
supports_multimodal=False,
1073+
supports_structured_outputs=True,
1074+
supports_json=True,
1075+
input_price=None,
1076+
output_price=None,
1077+
),
10621078
}
10631079
)
10641080

deepeval/models/llms/deepseek_model.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -177,12 +177,19 @@ async def a_generate(
177177
###############################################
178178

179179
def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
180-
if self.model_data.input_price and self.model_data.output_price:
181-
input_cost = input_tokens * self.model_data.input_price
182-
output_cost = output_tokens * self.model_data.output_price
183-
return EvaluationCost(
184-
input_cost + output_cost, input_tokens, output_tokens
185-
)
180+
input_cost = (
181+
input_tokens * self.model_data.input_price
182+
if self.model_data.input_price
183+
else 0.0
184+
)
185+
output_cost = (
186+
output_tokens * self.model_data.output_price
187+
if self.model_data.output_price
188+
else 0.0
189+
)
190+
return EvaluationCost(
191+
input_cost + output_cost, input_tokens, output_tokens
192+
)
186193

187194
###############################################
188195
# Capabilities

deepeval/models/llms/gemini_model.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -316,13 +316,20 @@ async def a_generate(
316316

317317
def calculate_cost(
318318
self, input_tokens: int, output_tokens: int
319-
) -> Optional[EvaluationCost]:
320-
if self.model_data.input_price and self.model_data.output_price:
321-
input_cost = input_tokens * self.model_data.input_price
322-
output_cost = output_tokens * self.model_data.output_price
323-
return EvaluationCost(
324-
input_cost + output_cost, input_tokens, output_tokens
325-
)
319+
) -> float:
320+
input_cost = (
321+
input_tokens * self.model_data.input_price
322+
if self.model_data.input_price
323+
else 0.0
324+
)
325+
output_cost = (
326+
output_tokens * self.model_data.output_price
327+
if self.model_data.output_price
328+
else 0.0
329+
)
330+
return EvaluationCost(
331+
input_cost + output_cost, input_tokens, output_tokens
332+
)
326333

327334
def _token_cost(self, response) -> EvaluationCost:
328335
usage = getattr(response, "usage_metadata", None)

deepeval/models/llms/grok_model.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -226,12 +226,19 @@ def generate_content(
226226
###############################################
227227

228228
def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
229-
if self.model_data.input_price and self.model_data.output_price:
230-
input_cost = input_tokens * self.model_data.input_price
231-
output_cost = output_tokens * self.model_data.output_price
232-
return EvaluationCost(
233-
input_cost + output_cost, input_tokens, output_tokens
234-
)
229+
input_cost = (
230+
input_tokens * self.model_data.input_price
231+
if self.model_data.input_price
232+
else 0.0
233+
)
234+
output_cost = (
235+
output_tokens * self.model_data.output_price
236+
if self.model_data.output_price
237+
else 0.0
238+
)
239+
return EvaluationCost(
240+
input_cost + output_cost, input_tokens, output_tokens
241+
)
235242

236243
###############################################
237244
# Capabilities

deepeval/models/llms/kimi_model.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -225,12 +225,19 @@ def generate_content(
225225
###############################################
226226

227227
def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
228-
if self.model_data.input_price and self.model_data.output_price:
229-
input_cost = input_tokens * self.model_data.input_price
230-
output_cost = output_tokens * self.model_data.output_price
231-
return EvaluationCost(
232-
input_cost + output_cost, input_tokens, output_tokens
233-
)
228+
input_cost = (
229+
input_tokens * self.model_data.input_price
230+
if self.model_data.input_price
231+
else 0.0
232+
)
233+
output_cost = (
234+
output_tokens * self.model_data.output_price
235+
if self.model_data.output_price
236+
else 0.0
237+
)
238+
return EvaluationCost(
239+
input_cost + output_cost, input_tokens, output_tokens
240+
)
234241

235242
###############################################
236243
# Capabilities

deepeval/models/llms/openai_model.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -396,16 +396,20 @@ def generate_samples(
396396

397397
def calculate_cost(
398398
self, input_tokens: int, output_tokens: int
399-
) -> Optional[float]:
400-
if self.model_data.input_price and self.model_data.output_price:
401-
input_cost = input_tokens * self.model_data.input_price
402-
output_cost = output_tokens * self.model_data.output_price
403-
# Carry token counts alongside the cost so metric runs can surface
404-
# input/output token usage (EvaluationCost subclasses float, so every
405-
# existing `output, cost = generate(...)` caller is unaffected).
406-
return EvaluationCost(
407-
input_cost + output_cost, input_tokens, output_tokens
408-
)
399+
) -> float:
400+
input_cost = (
401+
input_tokens * self.model_data.input_price
402+
if self.model_data.input_price
403+
else 0.0
404+
)
405+
output_cost = (
406+
output_tokens * self.model_data.output_price
407+
if self.model_data.output_price
408+
else 0.0
409+
)
410+
return EvaluationCost(
411+
input_cost + output_cost, input_tokens, output_tokens
412+
)
409413

410414
#########################
411415
# Capabilities #

deepeval/synthesizer/chunking/context_generator.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
DeepEvalBaseEmbeddingModel,
2828
DeepEvalBaseLLM,
2929
)
30+
from deepeval.errors import DeepEvalError
3031
from deepeval.utils import update_pbar, add_pbar, remove_pbars
3132
from deepeval.config.settings import get_settings
3233

@@ -209,6 +210,7 @@ def generate_contexts(
209210
update_pbar(progress, pbar_id, remove=False)
210211

211212
# process each doc end-to-end (sync), with per-doc error logging
213+
docs_with_errors: List[str] = []
212214
for path, chunker in source_file_to_chunker_map.items():
213215
collection = None
214216
try:
@@ -267,6 +269,7 @@ def generate_contexts(
267269
source_files.extend([path] * len(ctxs_for_doc))
268270

269271
except Exception as exc:
272+
docs_with_errors.append(path)
270273
# record and continue with other docs
271274
show_trace = bool(get_settings().DEEPEVAL_LOG_STACK_TRACES)
272275
exc_info = (
@@ -306,6 +309,12 @@ def generate_contexts(
306309
"Not enough chunks in smallest document",
307310
)
308311

312+
if docs_with_errors and not contexts:
313+
raise DeepEvalError(
314+
f"Context generation failed for all {len(docs_with_errors)} "
315+
f"document(s). Check the logs above for per-document errors."
316+
)
317+
309318
return contexts, source_files, scores
310319

311320
finally:
@@ -432,8 +441,10 @@ async def pipeline(path: str, chunker: DocumentChunker):
432441
results = await asyncio.gather(*tasks, return_exceptions=True)
433442

434443
# Collect results, surface any errors after cleanup
444+
docs_with_errors: List[str] = []
435445
for path, res in zip(paths, results):
436446
if isinstance(res, Exception):
447+
docs_with_errors.append(path)
437448
logger.error(
438449
"Document pipeline failed for %s",
439450
path,
@@ -463,6 +474,12 @@ async def pipeline(path: str, chunker: DocumentChunker):
463474
"Not enough chunks in smallest document",
464475
)
465476

477+
if docs_with_errors and not contexts:
478+
raise DeepEvalError(
479+
f"Context generation failed for all {len(docs_with_errors)} "
480+
f"document(s). Check the logs above for per-document errors."
481+
)
482+
466483
return contexts, source_files, scores
467484

468485
finally:

0 commit comments

Comments
 (0)