Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions deepeval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def _expose_public_api() -> None:
# Do not do this at module level or ruff will complain with E402
global __version__, evaluate, assert_test, compare
global on_test_run_end, log_hyperparameters, login, telemetry
global instrument

from ._version import __version__ as _version
from deepeval.evaluate import (
Expand All @@ -40,6 +41,25 @@ def _expose_public_api() -> None:
login = _login
telemetry = _telemetry

def instrument(*args, **kwargs):
    """Wire up Confident AI's OpenTelemetry backend.

    Installs a TracerProvider with deepeval's OpenInference span
    interceptor and exports spans to the Confident OTel endpoint.
    Combine with any OpenInference instrumentor (for example
    ``GoogleADKInstrumentor`` or ``OpenAIInstrumentor``) to collect
    framework-specific telemetry.

    All positional and keyword arguments are forwarded unchanged to
    ``deepeval.integrations.openinference.instrument_openinference``.
    """
    # Import lazily so that merely importing deepeval does not pull in
    # the OpenTelemetry / OpenInference dependency chain.
    from deepeval.integrations.openinference import instrument_openinference

    return instrument_openinference(*args, **kwargs)

globals()["instrument"] = instrument


_expose_public_api()

Expand All @@ -60,6 +80,7 @@ def _expose_public_api() -> None:
"assert_test",
"on_test_run_end",
"compare",
"instrument",
]


Expand Down
31 changes: 28 additions & 3 deletions deepeval/cli/generate/command.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import sys
from pathlib import Path
from typing import List, Optional
from typing import Any, List, Optional

import typer
from rich import print

from deepeval.synthesizer import Synthesizer
from deepeval.synthesizer.config import ContextConstructionConfig
from deepeval.cli.generate.utils import (
FileType,
GenerationMethod,
Expand All @@ -20,6 +19,25 @@
)


# Lazy module-level attrs: ``Synthesizer`` and ``ContextConstructionConfig``
# materialize on first access (PEP 562) so unrelated CLI commands like
# ``deepeval test run`` don't pay for the synthesizer chain at startup.
# Tests still see them as module attributes so ``monkeypatch.setattr(
# generate_cli, "Synthesizer", _Fake)`` works.
def __getattr__(name: str) -> Any:
if name == "Synthesizer":
from deepeval.synthesizer import Synthesizer

globals()["Synthesizer"] = Synthesizer
return Synthesizer
if name == "ContextConstructionConfig":
from deepeval.synthesizer.config import ContextConstructionConfig

globals()["ContextConstructionConfig"] = ContextConstructionConfig
return ContextConstructionConfig
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def generate_command(
method: GenerationMethod = typer.Option(
...,
Expand Down Expand Up @@ -186,6 +204,13 @@ def generate_command(
),
):
"""Generate synthetic goldens with the golden synthesizer."""
# Go through the module so test monkeypatches stick. Direct
# ``from deepeval.synthesizer import Synthesizer`` would always
# fetch the real class and ignore patched module attrs.
_self = sys.modules[__name__]
Synthesizer = _self.Synthesizer
ContextConstructionConfig = _self.ContextConstructionConfig

document_paths = None
contexts = None
goldens = None
Expand Down
43 changes: 32 additions & 11 deletions deepeval/evaluate/execute/agentic.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,18 +185,39 @@ async def dfs(trace: Trace, span: BaseSpan):
await asyncio.gather(*child_tasks, return_exceptions=True)
raise

if not _skip_metrics_for_error(trace=current_trace):
if current_trace and current_trace.root_spans:
await dfs(current_trace, current_trace.root_spans[0])
else:
if (
logger.isEnabledFor(logging.DEBUG)
and get_settings().DEEPEVAL_VERBOSE_MODE
):
logger.debug(
"Skipping DFS: empty trace or no root spans (trace=%s)",
current_trace.uuid if current_trace else None,
# Always walk spans, even on errored traces — the walker hydrates
# ``trace_api.*_spans`` and the user needs that data on the
# dashboard to diagnose. Per-span metric skip already lives
# inside ``_a_execute_span_test_case`` (appends api_span first,
# then short-circuits on error). Walk EVERY root, not just
# ``root_spans[0]``: OTel integrations can land multiple logical
# roots when a child ends before its parent.
if current_trace and current_trace.root_spans:
root_tasks = [
asyncio.create_task(dfs(current_trace, root))
for root in current_trace.root_spans
]
if root_tasks:
try:
await asyncio.wait_for(
asyncio.gather(*root_tasks),
timeout=get_gather_timeout(),
)
except (asyncio.TimeoutError, TimeoutError):
for t in root_tasks:
if not t.done():
t.cancel()
await asyncio.gather(*root_tasks, return_exceptions=True)
raise
else:
if (
logger.isEnabledFor(logging.DEBUG)
and get_settings().DEEPEVAL_VERBOSE_MODE
):
logger.debug(
"Skipping DFS: empty trace or no root spans (trace=%s)",
current_trace.uuid if current_trace else None,
)
except asyncio.CancelledError:
# mark any unfinished metrics as cancelled
if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
Expand Down
128 changes: 66 additions & 62 deletions deepeval/evaluate/execute/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,10 @@ def dfs(

start_time = time.perf_counter()

# Handle trace-level metrics
# On errored traces, skip trace-level metrics (no test case
# to judge) but DO run the span-level DFS walker below —
# it's what hydrates ``trace_api.*_spans`` for the dashboard,
# and per-span metric skip is handled inside ``dfs``.
skip_metrics_for_this_golden = False
if _skip_metrics_for_error(trace=current_trace):
trace_api.status = TraceSpanApiStatus.ERRORED
Expand All @@ -372,73 +375,74 @@ def dfs(
current_trace
),
)
else:
if current_trace.metrics:
requires_trace = any(
metric.requires_trace
for metric in current_trace.metrics
)
elif current_trace.metrics:
requires_trace = any(
metric.requires_trace
for metric in current_trace.metrics
)

# Build the trace-level LLMTestCase from the golden
# directly, the same way the async iterator does
# (see ``_a_evaluate_trace``). This makes top-level
# ``metrics=[...]`` work out of the box even when the
# user never calls ``update_current_trace(input=...)``.
llm_test_case = LLMTestCase(
input=golden.input,
actual_output=(
str(current_trace.output)
if current_trace.output is not None
else golden.actual_output
),
expected_output=current_trace.expected_output,
context=current_trace.context,
retrieval_context=current_trace.retrieval_context,
tools_called=current_trace.tools_called,
expected_tools=current_trace.expected_tools,
)
# Build the trace-level LLMTestCase from the golden
# directly, the same way the async iterator does
# (see ``_a_evaluate_trace``). This makes top-level
# ``metrics=[...]`` work out of the box even when the
# user never calls ``update_current_trace(input=...)``.
llm_test_case = LLMTestCase(
input=golden.input,
actual_output=(
str(current_trace.output)
if current_trace.output is not None
else golden.actual_output
),
expected_output=current_trace.expected_output,
context=current_trace.context,
retrieval_context=current_trace.retrieval_context,
tools_called=current_trace.tools_called,
expected_tools=current_trace.expected_tools,
)

if requires_trace:
llm_test_case._trace_dict = (
trace_manager.create_nested_spans_dict(
current_trace.root_spans[0]
)
if requires_trace:
llm_test_case._trace_dict = (
trace_manager.create_nested_spans_dict(
current_trace.root_spans[0]
)
)

if not skip_metrics_for_this_golden:
for metric in current_trace.metrics:
metric.skipped = False
metric.error = None
if display_config.verbose_mode is not None:
metric.verbose_mode = (
display_config.verbose_mode
)

trace_api.metrics_data = []
for metric in current_trace.metrics:
res = _execute_metric(
metric=metric,
test_case=llm_test_case,
show_metric_indicator=show_metric_indicator,
in_component=True,
error_config=error_config,
if not skip_metrics_for_this_golden:
for metric in current_trace.metrics:
metric.skipped = False
metric.error = None
if display_config.verbose_mode is not None:
metric.verbose_mode = (
display_config.verbose_mode
)
if res == "skip":
continue

if not metric.skipped:
metric_data = create_metric_data(metric)
trace_api.metrics_data.append(metric_data)
api_test_case.update_metric_data(
metric_data
)
api_test_case.update_status(
metric_data.success
)
update_pbar(progress, pbar_eval_id)

# Then handle span-level metrics
dfs(current_trace.root_spans[0], progress, pbar_eval_id)
trace_api.metrics_data = []
for metric in current_trace.metrics:
res = _execute_metric(
metric=metric,
test_case=llm_test_case,
show_metric_indicator=show_metric_indicator,
in_component=True,
error_config=error_config,
)
if res == "skip":
continue

if not metric.skipped:
metric_data = create_metric_data(metric)
trace_api.metrics_data.append(metric_data)
api_test_case.update_metric_data(metric_data)
api_test_case.update_status(metric_data.success)
update_pbar(progress, pbar_eval_id)

# Always walk spans, even on errored traces — the walker
# hydrates ``trace_api.*_spans`` and the user needs that
# data on the dashboard to diagnose. Walk EVERY root, not
# just ``root_spans[0]``: OTel integrations can land
# multiple logical roots when a child ends before its
# parent. Mirrors the async path in ``agentic.py``.
for root in current_trace.root_spans:
dfs(root, progress, pbar_eval_id)

end_time = time.perf_counter()
run_duration = end_time - start_time
Expand Down
26 changes: 26 additions & 0 deletions deepeval/evaluate/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,27 @@ def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
# problem. The truthiness check cleanly covers the "absent" cases
# (`None`, `{}`, `""`) that would otherwise show as garbage in the
# trace-level Metrics Summary and break `filter_duplicate_results`.
#
# Span lists start empty and are populated by the eval-iterator's
# DFS walker (``_a_execute_span_test_case`` / its sync twin), which
# categorizes each visited span by isinstance and appends to the
# matching ``trace_api.*_spans`` list. We DON'T pre-populate from
# ``trace.root_spans`` here because the walker is also responsible
# for attaching per-span metric data, error flags, and trace dicts —
# doing it twice (here + walker) would either double-emit or require
# the walker to dedupe.
#
# Trace-level fields (``name``, ``tags``, ``thread_id``, ``user_id``,
# ``metadata``, ``environment``) ARE forwarded from the trace so that
# OTel-based integrations whose users configured them via instrumentation
# settings or ``update_current_trace(...)`` see them on the dashboard.
# The non-eval REST path (``trace_manager.create_trace_api``) already
# forwards these; mirror its shape here so the eval-iterator path
# doesn't silently drop them. ``metadata`` is intentionally sourced
# from the golden (not the trace) — that's an evaluation-specific
# convention the eval pipeline relies on for per-row context. Trace-
# configured metadata flows through the per-trace upload path, not
# through this golden-shaped TraceApi.
return TraceApi(
uuid=trace.uuid,
baseSpans=[],
Expand All @@ -188,6 +209,11 @@ def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
tools_called=trace.tools_called,
expected_tools=trace.expected_tools,
metadata=golden.additional_metadata,
name=trace.name,
tags=trace.tags,
threadId=trace.thread_id,
userId=trace.user_id,
environment=trace.environment,
status=(
TraceSpanApiStatus.SUCCESS
if trace.status == TraceSpanStatus.SUCCESS
Expand Down
Loading
Loading