Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions deepeval/confident/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ class Endpoints(Enum):
DATASET_ALIAS_VERSIONS_ENDPOINT = "/v1/datasets/:alias/versions"

TEST_RUN_ENDPOINT = "/v1/test-run"
TEST_RUNS_ENDPOINT = "/v1/test-runs"
EXPERIMENT_ENDPOINT = "/v1/experiment"
TRACES_ENDPOINT = "/v1/traces"
ANNOTATIONS_ENDPOINT = "/v1/annotations"
Expand Down
6 changes: 6 additions & 0 deletions deepeval/tracing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from .context import (
update_current_span,
update_current_trace,
trace_test_run,
trace_test_run_id,
trace_test_run_metric_collection,
current_trace_context,
current_span_context,
update_agent_span,
Expand All @@ -23,6 +26,9 @@
__all__ = [
"update_current_span",
"update_current_trace",
"trace_test_run",
"trace_test_run_id",
"trace_test_run_metric_collection",
"current_trace_context",
"current_span_context",
"update_agent_span",
Expand Down
1 change: 1 addition & 0 deletions deepeval/tracing/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ class TraceApi(BaseModel):
output: Optional[Any] = Field(None)
status: Optional[TraceSpanApiStatus] = Field(TraceSpanApiStatus.SUCCESS)
test_case_id: Optional[str] = Field(None, alias="testCaseId")
test_run_id: Optional[str] = Field(None, alias="testRunId")
turn_id: Optional[str] = Field(None, alias="turnId")

# additional test case parameters
Expand Down
45 changes: 45 additions & 0 deletions deepeval/tracing/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,48 @@ def drop(self):

current_span_context = SpanContext()
current_trace_context = TraceContext()
# Id of the test run opened by an enclosing `trace_test_run(...)` block.
# Defaults to None; `trace_test_run` sets it on entry and resets it on exit.
trace_test_run_id: ContextVar[Optional[str]] = ContextVar(
"trace_test_run_id", default=None
)
# Metric collection passed to the enclosing `trace_test_run(...)` block.
# Set and reset together with `trace_test_run_id`.
trace_test_run_metric_collection: ContextVar[Optional[str]] = ContextVar(
"trace_test_run_metric_collection", default=None
)


@contextmanager
def trace_test_run(
    metric_collection: str, identifier: Optional[str] = None
) -> Iterator[Optional[str]]:
    """Open a test run on Confident AI and expose its id for the ``with`` block.

    A POST request is sent to the test-runs endpoint, then the returned id
    and ``metric_collection`` are published through the
    ``trace_test_run_id`` / ``trace_test_run_metric_collection`` context
    variables for the duration of the block. Both variables are restored to
    their previous values on exit, including when the block raises.

    Args:
        metric_collection: Name of the metric collection sent with the
            request and published to the context variable.
        identifier: Optional label for the test run; only included in the
            request body when it is not ``None``.

    Yields:
        The id of the newly created test run.

    Raises:
        ValueError: If no Confident AI API key is configured.
        RuntimeError: If the API response does not contain a test run id.
    """
    # Imported lazily, inside the function, as in the original implementation.
    from deepeval.confident.api import (
        Api,
        Endpoints,
        HttpMethods,
        is_confident,
    )

    if not is_confident():
        raise ValueError(
            "No Confident AI API key found. Set one to use trace_test_run(...)."
        )

    body: Dict[str, Any] = {"metricCollection": metric_collection}
    if identifier is not None:
        body["identifier"] = identifier

    data, _ = Api().send_request(
        method=HttpMethods.POST,
        endpoint=Endpoints.TEST_RUNS_ENDPOINT,
        body=body,
    )
    # The payload is either a dict carrying the id or the bare id itself.
    test_run_id = data.get("id") if isinstance(data, dict) else data
    if not test_run_id:
        # Without an id no trace would be attached to the run; fail loudly
        # instead of silently running the block with nothing to stamp.
        raise RuntimeError("Failed to create test run: no id returned.")

    id_token = trace_test_run_id.set(test_run_id)
    mc_token = trace_test_run_metric_collection.set(metric_collection)
    try:
        yield test_run_id
    finally:
        # Restore whatever was active before, so nested or sequential
        # blocks do not leak their test run into later traces.
        trace_test_run_id.reset(id_token)
        trace_test_run_metric_collection.reset(mc_token)


def update_current_span(
Expand Down Expand Up @@ -133,6 +175,7 @@ def update_current_trace(
test_case: Optional[LLMTestCase] = None,
confident_api_key: Optional[str] = None,
test_case_id: Optional[str] = None,
test_run_id: Optional[str] = None,
turn_id: Optional[str] = None,
metric_collection: Optional[str] = None,
metrics: Optional[List[BaseMetric]] = None,
Expand Down Expand Up @@ -176,6 +219,8 @@ def update_current_trace(
current_trace.confident_api_key = confident_api_key
if test_case_id:
current_trace.test_case_id = test_case_id
if test_run_id:
current_trace.test_run_id = test_run_id
if turn_id:
current_trace.turn_id = turn_id
if metric_collection:
Expand Down
7 changes: 7 additions & 0 deletions deepeval/tracing/otel/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ class BaseSpanWrapper:
trace_tools_called: Optional[List[ToolCall]] = None
trace_expected_tools: Optional[List[ToolCall]] = None
trace_test_case_id: Optional[str] = None
trace_test_run_id: Optional[str] = None
trace_turn_id: Optional[str] = None
trace_metric_collection: Optional[str] = None
trace_environment: Optional[str] = None
Expand Down Expand Up @@ -325,6 +326,10 @@ def _set_current_trace_attributes_from_base_span_wrapper(
base_span_wrapper.trace_test_case_id, str
):
current_trace.test_case_id = base_span_wrapper.trace_test_case_id
if base_span_wrapper.trace_test_run_id and isinstance(
base_span_wrapper.trace_test_run_id, str
):
current_trace.test_run_id = base_span_wrapper.trace_test_run_id
if base_span_wrapper.trace_turn_id and isinstance(
base_span_wrapper.trace_turn_id, str
):
Expand Down Expand Up @@ -427,6 +432,7 @@ def __set_trace_attributes(
raw_trace_expected_tools = list(raw_trace_expected_tools)

trace_test_case_id = span.attributes.get("confident.trace.test_case_id")
trace_test_run_id = span.attributes.get("confident.trace.test_run_id")
trace_turn_id = span.attributes.get("confident.trace.turn_id")

raw_trace_metric_collection = span.attributes.get(
Expand Down Expand Up @@ -460,6 +466,7 @@ def __set_trace_attributes(
base_span_wrapper.trace_tools_called = trace_tools_called
base_span_wrapper.trace_expected_tools = trace_expected_tools
base_span_wrapper.trace_test_case_id = trace_test_case_id
base_span_wrapper.trace_test_run_id = trace_test_run_id
base_span_wrapper.trace_turn_id = trace_turn_id
base_span_wrapper.trace_metric_collection = trace_metric_collection
base_span_wrapper.trace_environment = trace_environment
Expand Down
15 changes: 15 additions & 0 deletions deepeval/tracing/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@
apply_pending_to_span,
current_span_context,
current_trace_context,
trace_test_run_id,
trace_test_run_metric_collection,
pop_pending_for,
)
from deepeval.tracing.types import TestCaseMetricPair
Expand Down Expand Up @@ -328,6 +330,18 @@ def end_trace(self, trace_uuid: str):
if trace.status == TraceSpanStatus.IN_PROGRESS:
trace.status = TraceSpanStatus.SUCCESS

# Inside a `trace_test_run(...)` block, stamp the trace so it becomes
# one test case in that run.
if not trace.test_run_id:
active_test_run_id = trace_test_run_id.get()
if active_test_run_id:
trace.test_run_id = active_test_run_id
active_metric_collection = (
trace_test_run_metric_collection.get()
)
if active_metric_collection:
trace.metric_collection = active_metric_collection

if trace_testing_manager.test_name:
# Trace testing mode is enabled
# Instead posting the trace to the queue, it will be stored in this global variable
Expand Down Expand Up @@ -877,6 +891,7 @@ def create_trace_api(self, trace: Trace) -> TraceApi:
toolsCalled=trace.tools_called,
expectedTools=trace.expected_tools,
testCaseId=trace.test_case_id,
testRunId=trace.test_run_id,
turnId=trace.turn_id,
confident_api_key=trace.confident_api_key,
environment=(
Expand Down
1 change: 1 addition & 0 deletions deepeval/tracing/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ class Trace(BaseModel):
metrics: Optional[List[BaseMetric]] = None
metric_collection: Optional[str] = None
test_case_id: Optional[str] = Field(None, serialization_alias="testCaseId")
test_run_id: Optional[str] = Field(None, serialization_alias="testRunId")
turn_id: Optional[str] = Field(None, serialization_alias="turnId")

# Don't serialize these
Expand Down
47 changes: 47 additions & 0 deletions docs/content/docs/(concepts)/evaluation-llm-tracing.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,53 @@ Both `update_current_trace` and `update_current_span` accept the same set of `LL
`tags` and `metadata` aren't just for filtering and visualization — they're real test case fields that custom metrics like [`GEval`](/docs/metrics-llm-evals) can read. If your eval criteria depend on, say, the user tier or the retrieval source, set those on the trace/span via `tags` / `metadata` and reference them in your `GEval` criteria.
:::

## Group traces into a test run

By default each trace is logged on its own. Wrap your code in `trace_test_run(...)` to collect every trace produced inside the block into a single **test run**, where **each trace becomes one test case** evaluated against the run's metric collection:

```python title="main.py" showLineNumbers {1,4}
from deepeval.tracing import observe, trace_test_run, trace_test_run_id

with trace_test_run(metric_collection="My Collection", identifier="nightly-run"):
for golden in goldens:
llm_app(golden.input) # each trace here becomes one test case in the run
```

`trace_test_run` creates an in-progress test run on Confident AI and stamps every trace ended inside the block with that test run's id (and metric collection). It yields the test run id if you need it:

```python title="main.py" showLineNumbers {1}
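with trace_test_run(metric_collection="My Collection") as test_run_id:
    llm_app(query)
    # `trace_test_run_id` is a ContextVar: call `.get()` to read the active id.
    # Inside the block it equals `test_run_id` and can be read from any other file.
    print(trace_test_run_id.get())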
```

| Parameter | Type | Description |
| ------------------- | --------------- | ------------------------------------------------------------------------------------ |
| `metric_collection` | `str` | The metric collection each trace's test case is evaluated against. **Required.** |
| `identifier` | `Optional[str]` | A human-readable label for the test run, shown on the Confident AI platform. |

:::note
The metric collection passed to `trace_test_run` is authoritative for the test run — it's what every trace's test case is evaluated against, even if a trace inside the block also sets its own `metric_collection`. Span-level metric collections still run as independent [component-level evals](/docs/evaluation-component-level-llm-evals).
:::

The test run stays in progress while you stream traces into it and is finalized automatically shortly after. To attach a trace to a run **without** the context manager — for example when you already have a test run id — pass it to `update_current_trace`:

```python title="main.py" showLineNumbers {4,5}
from deepeval.tracing import observe, update_current_trace

@observe()
def llm_app(query: str) -> str:
update_current_trace(test_run_id=test_run_id, metric_collection="My Collection")
...
```

If you instrument with OpenTelemetry instead of `@observe`, set the test run id as a span attribute and `deepeval`'s exporter will pick it up:

```python
span.set_attribute("confident.trace.test_run_id", test_run_id)
span.set_attribute("confident.trace.metric_collection", "My Collection")
```

## Prettifying traces for coding agents

Traces aren't only read by humans. When you run evals locally and a metric fails, the failing trace is also what coding agents like **Claude Code, Codex, and Cursor** load into context to figure out which prompt, retriever, or tool actually caused the regression.
Expand Down
1 change: 1 addition & 0 deletions docs/content/guides/guides-tracing-rag.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ The `update_current_trace()` function allows you to set attributes on the trace
| `test_case` | `Optional[LLMTestCase]` | Bulk assignment of multiple fields from a test case |
| `confident_api_key` | `Optional[str]` | API key for Confident AI integration |
| `test_case_id` | `Optional[str]` | Identifier for the associated test case |
| `test_run_id` | `Optional[str]` | Identifier of the test run to add this trace to as a test case (see `trace_test_run`) |
| `turn_id` | `Optional[str]` | Identifier for the specific interaction turn |
| `metric_collection` | `Optional[str]` | Attach a predefined Confident AI metric collection |

Expand Down
1 change: 1 addition & 0 deletions typescript/src/confident/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ export enum Endpoints {
DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue",
DATASET_ALIAS_VERSIONS_ENDPOINT = "/v1/datasets/:alias/versions",
TEST_RUN_ENDPOINT = "/v1/test-run",
TEST_RUNS_ENDPOINT = "/v1/test-runs",
TRACING_ENDPOINT = "/v1/tracing",
TRACES_ENDPOINT = "/v1/traces",
EVENT_ENDPOINT = "/v1/event",
Expand Down
1 change: 1 addition & 0 deletions typescript/src/tracing/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ export interface TraceApi {
threadId?: string;
userId?: string;
testCaseId?: string;
testRunId?: string;
turnId?: string;
input?: any;
output?: any;
Expand Down
2 changes: 2 additions & 0 deletions typescript/src/tracing/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ export {
traceManager,
getCurrentSpan,
getCurrentTrace,
traceTestRun,
getTraceTestRunId,
} from "./tracing";

export { setTracingContext } from "./trace-context";
Expand Down
53 changes: 53 additions & 0 deletions typescript/src/tracing/tracing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ export interface Trace {
threadId?: string;
userId?: string;
testCaseId?: string;
testRunId?: string;
turnId?: string;
input?: any;
output?: any;
Expand Down Expand Up @@ -319,6 +320,43 @@ export function setCurrentTrace(trace: Trace | null): void {
});
}

/**
 * Async-local store holding the test run that is active for the current
 * async call chain. `traceTestRun` populates it for the duration of its
 * callback; outside such a callback `getStore()` returns `undefined`.
 */
const traceTestRunContext = new AsyncLocalStorage<{
testRunId: string;
metricCollection: string;
}>();

/**
 * Returns the id of the test run active in the current async context,
 * or `undefined` when no `traceTestRun` callback is running.
 */
export function getTraceTestRunId(): string | undefined {
  const activeRun = traceTestRunContext.getStore();
  return activeRun?.testRunId;
}

/**
 * Creates a test run via the test-runs endpoint and invokes `fn` with the
 * new id, keeping the run (id and metric collection) in async-local storage
 * while `fn` executes.
 *
 * @param options.metricCollection Metric collection sent with the request
 *   and stored alongside the test run id.
 * @param options.identifier Optional label; only sent when defined.
 * @param fn Callback executed inside the test run context; receives the id.
 * @returns Whatever `fn` returns (awaited if it is a promise).
 * @throws Error when the API response contains no test run id.
 */
export async function traceTestRun<T>(
  options: { metricCollection: string; identifier?: string },
  fn: (testRunId: string) => T | Promise<T>,
): Promise<T> {
  const payload: Record<string, any> =
    options.identifier !== undefined
      ? {
          metricCollection: options.metricCollection,
          identifier: options.identifier,
        }
      : { metricCollection: options.metricCollection };

  const response = await new Api().sendRequest(
    HttpMethods.POST,
    Endpoints.TEST_RUNS_ENDPOINT,
    payload,
  );

  // The id may be nested under `data` or sit at the top level of the response.
  const testRunId: string | undefined = response?.data?.id ?? response?.id;
  if (!testRunId) {
    throw new Error("Failed to create test run: no id returned.");
  }

  const activeRun = {
    testRunId,
    metricCollection: options.metricCollection,
  };
  return traceTestRunContext.run(activeRun, () => fn(testRunId));
}

export function withTracingContext<T>(
span: BaseSpan | undefined,
trace: Trace | undefined,
Expand Down Expand Up @@ -466,6 +504,15 @@ export class TraceManager {
if (trace.status === TraceSpanStatus.IN_PROGRESS) {
trace.status = TraceSpanStatus.SUCCESS;
}
if (!trace.testRunId) {
const testRunStore = traceTestRunContext.getStore();
if (testRunStore?.testRunId) {
trace.testRunId = testRunStore.testRunId;
if (testRunStore.metricCollection) {
trace.metricCollection = testRunStore.metricCollection;
}
}
}
if (this.traceCaptureSink) {
this.traceCaptureSink(trace);
} else if (!this.evaluating) {
Expand Down Expand Up @@ -797,6 +844,7 @@ export class TraceManager {
threadId: trace.threadId,
userId: trace.userId,
testCaseId: trace.testCaseId,
testRunId: trace.testRunId,
turnId: trace.turnId,
input: trace.input,
output: trace.output,
Expand Down Expand Up @@ -1447,6 +1495,7 @@ export interface UpdateCurrentTraceParams {
threadId?: string;
userId?: string;
testCaseId?: string;
testRunId?: string;
turnId?: string;
input?: any;
output?: any;
Expand All @@ -1469,6 +1518,7 @@ export const updateCurrentTrace = ({
threadId,
userId,
testCaseId,
testRunId,
turnId,
input,
output,
Expand Down Expand Up @@ -1516,6 +1566,9 @@ export const updateCurrentTrace = ({
if (testCaseId !== undefined) {
currentTrace.testCaseId = testCaseId;
}
if (testRunId !== undefined) {
currentTrace.testRunId = testRunId;
}
if (turnId !== undefined) {
currentTrace.turnId = turnId;
}
Expand Down
Loading