Skip to content

Commit 6181933

Browse files
authored
Merge pull request #2648 from confident-ai/hotfix/integrations
Hotfix/integrations
2 parents 171b077 + 44f026c commit 6181933

71 files changed

Lines changed: 7655 additions & 1796 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

deepeval/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ def _expose_public_api() -> None:
1717
# Do not do this at module level or ruff will complain with E402
1818
global __version__, evaluate, assert_test, compare
1919
global on_test_run_end, log_hyperparameters, login, telemetry
20+
global instrument
2021

2122
from ._version import __version__ as _version
2223
from deepeval.evaluate import (
@@ -40,6 +41,25 @@ def _expose_public_api() -> None:
4041
login = _login
4142
telemetry = _telemetry
4243

44+
def instrument(*args, **kwargs):
45+
"""Set up Confident AI's OTel backend.
46+
47+
Configures a TracerProvider, attaches deepeval's OpenInference span
48+
interceptor, and ships spans to the Confident OTel endpoint. Pair with
49+
any OpenInference instrumentor (e.g. ``GoogleADKInstrumentor``,
50+
``OpenAIInstrumentor``) to capture framework-specific telemetry.
51+
52+
Accepts the same arguments as
53+
``deepeval.integrations.openinference.instrument_openinference``.
54+
"""
55+
from deepeval.integrations.openinference import (
56+
instrument_openinference,
57+
)
58+
59+
return instrument_openinference(*args, **kwargs)
60+
61+
globals()["instrument"] = instrument
62+
4363

4464
_expose_public_api()
4565

@@ -60,6 +80,7 @@ def _expose_public_api() -> None:
6080
"assert_test",
6181
"on_test_run_end",
6282
"compare",
83+
"instrument",
6384
]
6485

6586

deepeval/cli/generate/command.py

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,10 @@
1+
import sys
12
from pathlib import Path
2-
from typing import List, Optional
3+
from typing import Any, List, Optional
34

45
import typer
56
from rich import print
67

7-
from deepeval.synthesizer import Synthesizer
8-
from deepeval.synthesizer.config import ContextConstructionConfig
98
from deepeval.cli.generate.utils import (
109
FileType,
1110
GenerationMethod,
@@ -20,6 +19,25 @@
2019
)
2120

2221

22+
# Lazy module-level attrs: ``Synthesizer`` and ``ContextConstructionConfig``
23+
# materialize on first access (PEP 562) so unrelated CLI commands like
24+
# ``deepeval test run`` don't pay for the synthesizer chain at startup.
25+
# Tests still see them as module attributes so ``monkeypatch.setattr(
26+
# generate_cli, "Synthesizer", _Fake)`` works.
27+
def __getattr__(name: str) -> Any:
28+
if name == "Synthesizer":
29+
from deepeval.synthesizer import Synthesizer
30+
31+
globals()["Synthesizer"] = Synthesizer
32+
return Synthesizer
33+
if name == "ContextConstructionConfig":
34+
from deepeval.synthesizer.config import ContextConstructionConfig
35+
36+
globals()["ContextConstructionConfig"] = ContextConstructionConfig
37+
return ContextConstructionConfig
38+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
39+
40+
2341
def generate_command(
2442
method: GenerationMethod = typer.Option(
2543
...,
@@ -186,6 +204,13 @@ def generate_command(
186204
),
187205
):
188206
"""Generate synthetic goldens with the golden synthesizer."""
207+
# Go through the module so test monkeypatches stick. Direct
208+
# ``from deepeval.synthesizer import Synthesizer`` would always
209+
# fetch the real class and ignore patched module attrs.
210+
_self = sys.modules[__name__]
211+
Synthesizer = _self.Synthesizer
212+
ContextConstructionConfig = _self.ContextConstructionConfig
213+
189214
document_paths = None
190215
contexts = None
191216
goldens = None

deepeval/evaluate/execute/agentic.py

Lines changed: 32 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -185,18 +185,39 @@ async def dfs(trace: Trace, span: BaseSpan):
185185
await asyncio.gather(*child_tasks, return_exceptions=True)
186186
raise
187187

188-
if not _skip_metrics_for_error(trace=current_trace):
189-
if current_trace and current_trace.root_spans:
190-
await dfs(current_trace, current_trace.root_spans[0])
191-
else:
192-
if (
193-
logger.isEnabledFor(logging.DEBUG)
194-
and get_settings().DEEPEVAL_VERBOSE_MODE
195-
):
196-
logger.debug(
197-
"Skipping DFS: empty trace or no root spans (trace=%s)",
198-
current_trace.uuid if current_trace else None,
188+
# Always walk spans, even on errored traces — the walker hydrates
189+
# ``trace_api.*_spans`` and the user needs that data on the
190+
# dashboard to diagnose. Per-span metric skip already lives
191+
# inside ``_a_execute_span_test_case`` (appends api_span first,
192+
# then short-circuits on error). Walk EVERY root, not just
193+
# ``root_spans[0]``: OTel integrations can land multiple logical
194+
# roots when a child ends before its parent.
195+
if current_trace and current_trace.root_spans:
196+
root_tasks = [
197+
asyncio.create_task(dfs(current_trace, root))
198+
for root in current_trace.root_spans
199+
]
200+
if root_tasks:
201+
try:
202+
await asyncio.wait_for(
203+
asyncio.gather(*root_tasks),
204+
timeout=get_gather_timeout(),
199205
)
206+
except (asyncio.TimeoutError, TimeoutError):
207+
for t in root_tasks:
208+
if not t.done():
209+
t.cancel()
210+
await asyncio.gather(*root_tasks, return_exceptions=True)
211+
raise
212+
else:
213+
if (
214+
logger.isEnabledFor(logging.DEBUG)
215+
and get_settings().DEEPEVAL_VERBOSE_MODE
216+
):
217+
logger.debug(
218+
"Skipping DFS: empty trace or no root spans (trace=%s)",
219+
current_trace.uuid if current_trace else None,
220+
)
200221
except asyncio.CancelledError:
201222
# mark any unfinished metrics as cancelled
202223
if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:

deepeval/evaluate/execute/loop.py

Lines changed: 66 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -360,7 +360,10 @@ def dfs(
360360

361361
start_time = time.perf_counter()
362362

363-
# Handle trace-level metrics
363+
# On errored traces, skip trace-level metrics (no test case
364+
# to judge) but DO run the span-level DFS walker below —
365+
# it's what hydrates ``trace_api.*_spans`` for the dashboard,
366+
# and per-span metric skip is handled inside ``dfs``.
364367
skip_metrics_for_this_golden = False
365368
if _skip_metrics_for_error(trace=current_trace):
366369
trace_api.status = TraceSpanApiStatus.ERRORED
@@ -372,73 +375,74 @@ def dfs(
372375
current_trace
373376
),
374377
)
375-
else:
376-
if current_trace.metrics:
377-
requires_trace = any(
378-
metric.requires_trace
379-
for metric in current_trace.metrics
380-
)
378+
elif current_trace.metrics:
379+
requires_trace = any(
380+
metric.requires_trace
381+
for metric in current_trace.metrics
382+
)
381383

382-
# Build the trace-level LLMTestCase from the golden
383-
# directly, the same way the async iterator does
384-
# (see ``_a_evaluate_trace``). This makes top-level
385-
# ``metrics=[...]`` work out of the box even when the
386-
# user never calls ``update_current_trace(input=...)``.
387-
llm_test_case = LLMTestCase(
388-
input=golden.input,
389-
actual_output=(
390-
str(current_trace.output)
391-
if current_trace.output is not None
392-
else golden.actual_output
393-
),
394-
expected_output=current_trace.expected_output,
395-
context=current_trace.context,
396-
retrieval_context=current_trace.retrieval_context,
397-
tools_called=current_trace.tools_called,
398-
expected_tools=current_trace.expected_tools,
399-
)
384+
# Build the trace-level LLMTestCase from the golden
385+
# directly, the same way the async iterator does
386+
# (see ``_a_evaluate_trace``). This makes top-level
387+
# ``metrics=[...]`` work out of the box even when the
388+
# user never calls ``update_current_trace(input=...)``.
389+
llm_test_case = LLMTestCase(
390+
input=golden.input,
391+
actual_output=(
392+
str(current_trace.output)
393+
if current_trace.output is not None
394+
else golden.actual_output
395+
),
396+
expected_output=current_trace.expected_output,
397+
context=current_trace.context,
398+
retrieval_context=current_trace.retrieval_context,
399+
tools_called=current_trace.tools_called,
400+
expected_tools=current_trace.expected_tools,
401+
)
400402

401-
if requires_trace:
402-
llm_test_case._trace_dict = (
403-
trace_manager.create_nested_spans_dict(
404-
current_trace.root_spans[0]
405-
)
403+
if requires_trace:
404+
llm_test_case._trace_dict = (
405+
trace_manager.create_nested_spans_dict(
406+
current_trace.root_spans[0]
406407
)
408+
)
407409

408-
if not skip_metrics_for_this_golden:
409-
for metric in current_trace.metrics:
410-
metric.skipped = False
411-
metric.error = None
412-
if display_config.verbose_mode is not None:
413-
metric.verbose_mode = (
414-
display_config.verbose_mode
415-
)
416-
417-
trace_api.metrics_data = []
418-
for metric in current_trace.metrics:
419-
res = _execute_metric(
420-
metric=metric,
421-
test_case=llm_test_case,
422-
show_metric_indicator=show_metric_indicator,
423-
in_component=True,
424-
error_config=error_config,
410+
if not skip_metrics_for_this_golden:
411+
for metric in current_trace.metrics:
412+
metric.skipped = False
413+
metric.error = None
414+
if display_config.verbose_mode is not None:
415+
metric.verbose_mode = (
416+
display_config.verbose_mode
425417
)
426-
if res == "skip":
427-
continue
428-
429-
if not metric.skipped:
430-
metric_data = create_metric_data(metric)
431-
trace_api.metrics_data.append(metric_data)
432-
api_test_case.update_metric_data(
433-
metric_data
434-
)
435-
api_test_case.update_status(
436-
metric_data.success
437-
)
438-
update_pbar(progress, pbar_eval_id)
439418

440-
# Then handle span-level metrics
441-
dfs(current_trace.root_spans[0], progress, pbar_eval_id)
419+
trace_api.metrics_data = []
420+
for metric in current_trace.metrics:
421+
res = _execute_metric(
422+
metric=metric,
423+
test_case=llm_test_case,
424+
show_metric_indicator=show_metric_indicator,
425+
in_component=True,
426+
error_config=error_config,
427+
)
428+
if res == "skip":
429+
continue
430+
431+
if not metric.skipped:
432+
metric_data = create_metric_data(metric)
433+
trace_api.metrics_data.append(metric_data)
434+
api_test_case.update_metric_data(metric_data)
435+
api_test_case.update_status(metric_data.success)
436+
update_pbar(progress, pbar_eval_id)
437+
438+
# Always walk spans, even on errored traces — the walker
439+
# hydrates ``trace_api.*_spans`` and the user needs that
440+
# data on the dashboard to diagnose. Walk EVERY root, not
441+
# just ``root_spans[0]``: OTel integrations can land
442+
# multiple logical roots when a child ends before its
443+
# parent. Mirrors the async path in ``agentic.py``.
444+
for root in current_trace.root_spans:
445+
dfs(root, progress, pbar_eval_id)
442446

443447
end_time = time.perf_counter()
444448
run_duration = end_time - start_time

deepeval/evaluate/utils.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -163,6 +163,27 @@ def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
163163
# problem. The truthiness check cleanly covers the "absent" cases
164164
# (`None`, `{}`, `""`) that would otherwise show as garbage in the
165165
# trace-level Metrics Summary and break `filter_duplicate_results`.
166+
#
167+
# Span lists start empty and are populated by the eval-iterator's
168+
# DFS walker (``_a_execute_span_test_case`` / its sync twin), which
169+
# categorizes each visited span by isinstance and appends to the
170+
# matching ``trace_api.*_spans`` list. We DON'T pre-populate from
171+
# ``trace.root_spans`` here because the walker is also responsible
172+
# for attaching per-span metric data, error flags, and trace dicts —
173+
# doing it twice (here + walker) would either double-emit or require
174+
# the walker to dedupe.
175+
#
176+
# Trace-level fields (``name``, ``tags``, ``thread_id``, ``user_id``,
177+
# ``metadata``, ``environment``) ARE forwarded from the trace so that
178+
# OTel-based integrations whose users configured them via instrumentation
179+
# settings or ``update_current_trace(...)`` see them on the dashboard.
180+
# The non-eval REST path (``trace_manager.create_trace_api``) already
181+
# forwards these; mirror its shape here so the eval-iterator path
182+
# doesn't silently drop them. ``metadata`` is intentionally sourced
183+
# from the golden (not the trace) — that's an evaluation-specific
184+
# convention the eval pipeline relies on for per-row context. Trace-
185+
# configured metadata flows through the per-trace upload path, not
186+
# through this golden-shaped TraceApi.
166187
return TraceApi(
167188
uuid=trace.uuid,
168189
baseSpans=[],
@@ -188,6 +209,11 @@ def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
188209
tools_called=trace.tools_called,
189210
expected_tools=trace.expected_tools,
190211
metadata=golden.additional_metadata,
212+
name=trace.name,
213+
tags=trace.tags,
214+
threadId=trace.thread_id,
215+
userId=trace.user_id,
216+
environment=trace.environment,
191217
status=(
192218
TraceSpanApiStatus.SUCCESS
193219
if trace.status == TraceSpanStatus.SUCCESS

0 commit comments

Comments
 (0)