Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions deepeval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def _expose_public_api() -> None:
# Do not do this at module level or ruff will complain with E402
global __version__, evaluate, assert_test, compare
global on_test_run_end, log_hyperparameters, login, telemetry
global instrument

from ._version import __version__ as _version
from deepeval.evaluate import (
Expand All @@ -40,6 +41,25 @@ def _expose_public_api() -> None:
login = _login
telemetry = _telemetry

def instrument(*args, **kwargs):
    """Wire up Confident AI's OpenTelemetry backend.

    Installs a TracerProvider with deepeval's OpenInference span
    interceptor and exports spans to the Confident OTel endpoint.
    Combine with any OpenInference instrumentor (for example
    ``GoogleADKInstrumentor`` or ``OpenAIInstrumentor``) to collect
    framework-specific telemetry.

    All positional and keyword arguments are forwarded unchanged to
    ``deepeval.integrations.openinference.instrument_openinference``.
    """
    # Import lazily so that merely importing deepeval does not pull in
    # the OpenTelemetry / OpenInference dependency chain.
    from deepeval.integrations.openinference import instrument_openinference

    return instrument_openinference(*args, **kwargs)

globals()["instrument"] = instrument


_expose_public_api()

Expand All @@ -60,6 +80,7 @@ def _expose_public_api() -> None:
"assert_test",
"on_test_run_end",
"compare",
"instrument",
]


Expand Down
31 changes: 28 additions & 3 deletions deepeval/cli/generate/command.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import sys
from pathlib import Path
from typing import List, Optional
from typing import Any, List, Optional

import typer
from rich import print

from deepeval.synthesizer import Synthesizer
from deepeval.synthesizer.config import ContextConstructionConfig
from deepeval.cli.generate.utils import (
FileType,
GenerationMethod,
Expand All @@ -20,6 +19,25 @@
)


# Lazy module-level attrs: ``Synthesizer`` and ``ContextConstructionConfig``
# materialize on first access (PEP 562) so unrelated CLI commands like
# ``deepeval test run`` don't pay for the synthesizer chain at startup.
# Tests still see them as module attributes so ``monkeypatch.setattr(
# generate_cli, "Synthesizer", _Fake)`` works.
def __getattr__(name: str) -> Any:
if name == "Synthesizer":
from deepeval.synthesizer import Synthesizer

globals()["Synthesizer"] = Synthesizer
return Synthesizer
if name == "ContextConstructionConfig":
from deepeval.synthesizer.config import ContextConstructionConfig

globals()["ContextConstructionConfig"] = ContextConstructionConfig
return ContextConstructionConfig
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def generate_command(
method: GenerationMethod = typer.Option(
...,
Expand Down Expand Up @@ -186,6 +204,13 @@ def generate_command(
),
):
"""Generate synthetic goldens with the golden synthesizer."""
# Go through the module so test monkeypatches stick. Direct
# ``from deepeval.synthesizer import Synthesizer`` would always
# fetch the real class and ignore patched module attrs.
_self = sys.modules[__name__]
Synthesizer = _self.Synthesizer
ContextConstructionConfig = _self.ContextConstructionConfig

document_paths = None
contexts = None
goldens = None
Expand Down
43 changes: 32 additions & 11 deletions deepeval/evaluate/execute/agentic.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,18 +185,39 @@ async def dfs(trace: Trace, span: BaseSpan):
await asyncio.gather(*child_tasks, return_exceptions=True)
raise

if not _skip_metrics_for_error(trace=current_trace):
if current_trace and current_trace.root_spans:
await dfs(current_trace, current_trace.root_spans[0])
else:
if (
logger.isEnabledFor(logging.DEBUG)
and get_settings().DEEPEVAL_VERBOSE_MODE
):
logger.debug(
"Skipping DFS: empty trace or no root spans (trace=%s)",
current_trace.uuid if current_trace else None,
# Always walk spans, even on errored traces — the walker hydrates
# ``trace_api.*_spans`` and the user needs that data on the
# dashboard to diagnose. Per-span metric skip already lives
# inside ``_a_execute_span_test_case`` (appends api_span first,
# then short-circuits on error). Walk EVERY root, not just
# ``root_spans[0]``: OTel integrations can land multiple logical
# roots when a child ends before its parent.
if current_trace and current_trace.root_spans:
root_tasks = [
asyncio.create_task(dfs(current_trace, root))
for root in current_trace.root_spans
]
if root_tasks:
try:
await asyncio.wait_for(
asyncio.gather(*root_tasks),
timeout=get_gather_timeout(),
)
except (asyncio.TimeoutError, TimeoutError):
for t in root_tasks:
if not t.done():
t.cancel()
await asyncio.gather(*root_tasks, return_exceptions=True)
raise
else:
if (
logger.isEnabledFor(logging.DEBUG)
and get_settings().DEEPEVAL_VERBOSE_MODE
):
logger.debug(
"Skipping DFS: empty trace or no root spans (trace=%s)",
current_trace.uuid if current_trace else None,
)
except asyncio.CancelledError:
# mark any unfinished metrics as cancelled
if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
Expand Down
128 changes: 66 additions & 62 deletions deepeval/evaluate/execute/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,10 @@ def dfs(

start_time = time.perf_counter()

# Handle trace-level metrics
# On errored traces, skip trace-level metrics (no test case
# to judge) but DO run the span-level DFS walker below —
# it's what hydrates ``trace_api.*_spans`` for the dashboard,
# and per-span metric skip is handled inside ``dfs``.
skip_metrics_for_this_golden = False
if _skip_metrics_for_error(trace=current_trace):
trace_api.status = TraceSpanApiStatus.ERRORED
Expand All @@ -372,73 +375,74 @@ def dfs(
current_trace
),
)
else:
if current_trace.metrics:
requires_trace = any(
metric.requires_trace
for metric in current_trace.metrics
)
elif current_trace.metrics:
requires_trace = any(
metric.requires_trace
for metric in current_trace.metrics
)

# Build the trace-level LLMTestCase from the golden
# directly, the same way the async iterator does
# (see ``_a_evaluate_trace``). This makes top-level
# ``metrics=[...]`` work out of the box even when the
# user never calls ``update_current_trace(input=...)``.
llm_test_case = LLMTestCase(
input=golden.input,
actual_output=(
str(current_trace.output)
if current_trace.output is not None
else golden.actual_output
),
expected_output=current_trace.expected_output,
context=current_trace.context,
retrieval_context=current_trace.retrieval_context,
tools_called=current_trace.tools_called,
expected_tools=current_trace.expected_tools,
)
# Build the trace-level LLMTestCase from the golden
# directly, the same way the async iterator does
# (see ``_a_evaluate_trace``). This makes top-level
# ``metrics=[...]`` work out of the box even when the
# user never calls ``update_current_trace(input=...)``.
llm_test_case = LLMTestCase(
input=golden.input,
actual_output=(
str(current_trace.output)
if current_trace.output is not None
else golden.actual_output
),
expected_output=current_trace.expected_output,
context=current_trace.context,
retrieval_context=current_trace.retrieval_context,
tools_called=current_trace.tools_called,
expected_tools=current_trace.expected_tools,
)

if requires_trace:
llm_test_case._trace_dict = (
trace_manager.create_nested_spans_dict(
current_trace.root_spans[0]
)
if requires_trace:
llm_test_case._trace_dict = (
trace_manager.create_nested_spans_dict(
current_trace.root_spans[0]
)
)

if not skip_metrics_for_this_golden:
for metric in current_trace.metrics:
metric.skipped = False
metric.error = None
if display_config.verbose_mode is not None:
metric.verbose_mode = (
display_config.verbose_mode
)

trace_api.metrics_data = []
for metric in current_trace.metrics:
res = _execute_metric(
metric=metric,
test_case=llm_test_case,
show_metric_indicator=show_metric_indicator,
in_component=True,
error_config=error_config,
if not skip_metrics_for_this_golden:
for metric in current_trace.metrics:
metric.skipped = False
metric.error = None
if display_config.verbose_mode is not None:
metric.verbose_mode = (
display_config.verbose_mode
)
if res == "skip":
continue

if not metric.skipped:
metric_data = create_metric_data(metric)
trace_api.metrics_data.append(metric_data)
api_test_case.update_metric_data(
metric_data
)
api_test_case.update_status(
metric_data.success
)
update_pbar(progress, pbar_eval_id)

# Then handle span-level metrics
dfs(current_trace.root_spans[0], progress, pbar_eval_id)
trace_api.metrics_data = []
for metric in current_trace.metrics:
res = _execute_metric(
metric=metric,
test_case=llm_test_case,
show_metric_indicator=show_metric_indicator,
in_component=True,
error_config=error_config,
)
if res == "skip":
continue

if not metric.skipped:
metric_data = create_metric_data(metric)
trace_api.metrics_data.append(metric_data)
api_test_case.update_metric_data(metric_data)
api_test_case.update_status(metric_data.success)
update_pbar(progress, pbar_eval_id)

# Always walk spans, even on errored traces — the walker
# hydrates ``trace_api.*_spans`` and the user needs that
# data on the dashboard to diagnose. Walk EVERY root, not
# just ``root_spans[0]``: OTel integrations can land
# multiple logical roots when a child ends before its
# parent. Mirrors the async path in ``agentic.py``.
for root in current_trace.root_spans:
dfs(root, progress, pbar_eval_id)

end_time = time.perf_counter()
run_duration = end_time - start_time
Expand Down
26 changes: 26 additions & 0 deletions deepeval/evaluate/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,27 @@ def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
# problem. The truthiness check cleanly covers the "absent" cases
# (`None`, `{}`, `""`) that would otherwise show as garbage in the
# trace-level Metrics Summary and break `filter_duplicate_results`.
#
# Span lists start empty and are populated by the eval-iterator's
# DFS walker (``_a_execute_span_test_case`` / its sync twin), which
# categorizes each visited span by isinstance and appends to the
# matching ``trace_api.*_spans`` list. We DON'T pre-populate from
# ``trace.root_spans`` here because the walker is also responsible
# for attaching per-span metric data, error flags, and trace dicts —
# doing it twice (here + walker) would either double-emit or require
# the walker to dedupe.
#
# Trace-level fields (``name``, ``tags``, ``thread_id``, ``user_id``,
# ``metadata``, ``environment``) ARE forwarded from the trace so that
# OTel-based integrations whose users configured them via instrumentation
# settings or ``update_current_trace(...)`` see them on the dashboard.
# The non-eval REST path (``trace_manager.create_trace_api``) already
# forwards these; mirror its shape here so the eval-iterator path
# doesn't silently drop them. ``metadata`` is intentionally sourced
# from the golden (not the trace) — that's an evaluation-specific
# convention the eval pipeline relies on for per-row context. Trace-
# configured metadata flows through the per-trace upload path, not
# through this golden-shaped TraceApi.
return TraceApi(
uuid=trace.uuid,
baseSpans=[],
Expand All @@ -188,6 +209,11 @@ def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
tools_called=trace.tools_called,
expected_tools=trace.expected_tools,
metadata=golden.additional_metadata,
name=trace.name,
tags=trace.tags,
threadId=trace.thread_id,
userId=trace.user_id,
environment=trace.environment,
status=(
TraceSpanApiStatus.SUCCESS
if trace.status == TraceSpanStatus.SUCCESS
Expand Down
Loading
Loading