confident-ai · A-Vamshi · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 27, 2026
diff --git a/deepeval/confident/api.py b/deepeval/confident/api.py
@@ -128,6 +128,7 @@ class Endpoints(Enum):
     DATASET_ALIAS_VERSIONS_ENDPOINT = "/v1/datasets/:alias/versions"
 
     TEST_RUN_ENDPOINT = "/v1/test-run"
+    TEST_RUNS_ENDPOINT = "/v1/test-runs"
     EXPERIMENT_ENDPOINT = "/v1/experiment"
     TRACES_ENDPOINT = "/v1/traces"
     ANNOTATIONS_ENDPOINT = "/v1/annotations"

diff --git a/deepeval/tracing/__init__.py b/deepeval/tracing/__init__.py
@@ -1,6 +1,9 @@
 from .context import (
     update_current_span,
     update_current_trace,
+    trace_test_run,
+    trace_test_run_id,
+    trace_test_run_metric_collection,
     current_trace_context,
     current_span_context,
     update_agent_span,
@@ -23,6 +26,9 @@
 __all__ = [
     "update_current_span",
     "update_current_trace",
+    "trace_test_run",
+    "trace_test_run_id",
+    "trace_test_run_metric_collection",
     "current_trace_context",
     "current_span_context",
     "update_agent_span",

diff --git a/deepeval/tracing/api.py b/deepeval/tracing/api.py
@@ -148,6 +148,7 @@ class TraceApi(BaseModel):
     output: Optional[Any] = Field(None)
     status: Optional[TraceSpanApiStatus] = Field(TraceSpanApiStatus.SUCCESS)
     test_case_id: Optional[str] = Field(None, alias="testCaseId")
+    test_run_id: Optional[str] = Field(None, alias="testRunId")
     turn_id: Optional[str] = Field(None, alias="turnId")
 
     # additional test case parameters

diff --git a/deepeval/tracing/context.py b/deepeval/tracing/context.py
@@ -59,6 +59,48 @@ def drop(self):
 
 current_span_context = SpanContext()
 current_trace_context = TraceContext()
+trace_test_run_id: ContextVar[Optional[str]] = ContextVar(
+    "trace_test_run_id", default=None
+)
+trace_test_run_metric_collection: ContextVar[Optional[str]] = ContextVar(
+    "trace_test_run_metric_collection", default=None
+)
+
+
+@contextmanager
+def trace_test_run(
+    metric_collection: str, identifier: Optional[str] = None
+) -> Iterator[Optional[str]]:
+    from deepeval.confident.api import (
+        Api,
+        Endpoints,
+        HttpMethods,
+        is_confident,
+    )
+
+    if not is_confident():
+        raise ValueError(
+            "No Confident AI API key found. Set one to use trace_test_run(...)."
+        )
+
+    body: Dict[str, Any] = {"metricCollection": metric_collection}
+    if identifier is not None:
+        body["identifier"] = identifier
+
+    data, _ = Api().send_request(
+        method=HttpMethods.POST,
+        endpoint=Endpoints.TEST_RUNS_ENDPOINT,
+        body=body,
+    )
+    test_run_id = data.get("id") if isinstance(data, dict) else data
+
+    id_token = trace_test_run_id.set(test_run_id)
+    mc_token = trace_test_run_metric_collection.set(metric_collection)
+    try:
+        yield test_run_id
+    finally:
+        trace_test_run_id.reset(id_token)
+        trace_test_run_metric_collection.reset(mc_token)
 
 
 def update_current_span(
@@ -133,6 +175,7 @@ def update_current_trace(
     test_case: Optional[LLMTestCase] = None,
     confident_api_key: Optional[str] = None,
     test_case_id: Optional[str] = None,
+    test_run_id: Optional[str] = None,
     turn_id: Optional[str] = None,
     metric_collection: Optional[str] = None,
     metrics: Optional[List[BaseMetric]] = None,
@@ -176,6 +219,8 @@ def update_current_trace(
         current_trace.confident_api_key = confident_api_key
     if test_case_id:
         current_trace.test_case_id = test_case_id
+    if test_run_id:
+        current_trace.test_run_id = test_run_id
     if turn_id:
         current_trace.turn_id = turn_id
     if metric_collection:

diff --git a/deepeval/tracing/otel/exporter.py b/deepeval/tracing/otel/exporter.py
@@ -89,6 +89,7 @@ class BaseSpanWrapper:
     trace_tools_called: Optional[List[ToolCall]] = None
     trace_expected_tools: Optional[List[ToolCall]] = None
     trace_test_case_id: Optional[str] = None
+    trace_test_run_id: Optional[str] = None
     trace_turn_id: Optional[str] = None
     trace_metric_collection: Optional[str] = None
     trace_environment: Optional[str] = None
@@ -325,6 +326,10 @@ def _set_current_trace_attributes_from_base_span_wrapper(
             base_span_wrapper.trace_test_case_id, str
         ):
             current_trace.test_case_id = base_span_wrapper.trace_test_case_id
+        if base_span_wrapper.trace_test_run_id and isinstance(
+            base_span_wrapper.trace_test_run_id, str
+        ):
+            current_trace.test_run_id = base_span_wrapper.trace_test_run_id
         if base_span_wrapper.trace_turn_id and isinstance(
             base_span_wrapper.trace_turn_id, str
         ):
@@ -427,6 +432,7 @@ def __set_trace_attributes(
             raw_trace_expected_tools = list(raw_trace_expected_tools)
 
         trace_test_case_id = span.attributes.get("confident.trace.test_case_id")
+        trace_test_run_id = span.attributes.get("confident.trace.test_run_id")
         trace_turn_id = span.attributes.get("confident.trace.turn_id")
 
         raw_trace_metric_collection = span.attributes.get(
@@ -460,6 +466,7 @@ def __set_trace_attributes(
         base_span_wrapper.trace_tools_called = trace_tools_called
         base_span_wrapper.trace_expected_tools = trace_expected_tools
         base_span_wrapper.trace_test_case_id = trace_test_case_id
+        base_span_wrapper.trace_test_run_id = trace_test_run_id
         base_span_wrapper.trace_turn_id = trace_turn_id
         base_span_wrapper.trace_metric_collection = trace_metric_collection
         base_span_wrapper.trace_environment = trace_environment

diff --git a/deepeval/tracing/tracing.py b/deepeval/tracing/tracing.py
@@ -76,6 +76,8 @@
     apply_pending_to_span,
     current_span_context,
     current_trace_context,
+    trace_test_run_id,
+    trace_test_run_metric_collection,
     pop_pending_for,
 )
 from deepeval.tracing.types import TestCaseMetricPair
@@ -328,6 +330,18 @@ def end_trace(self, trace_uuid: str):
             if trace.status == TraceSpanStatus.IN_PROGRESS:
                 trace.status = TraceSpanStatus.SUCCESS
 
+            # Inside a `trace_test_run(...)` block, stamp the trace so it becomes
+            # one test case in that run (the run's metric collection wins).
+            if not trace.test_run_id:
+                active_test_run_id = trace_test_run_id.get()
+                if active_test_run_id:
+                    trace.test_run_id = active_test_run_id
+                    active_metric_collection = (
+                        trace_test_run_metric_collection.get()
+                    )
+                    if active_metric_collection:
+                        trace.metric_collection = active_metric_collection
+
             if trace_testing_manager.test_name:
                 # Trace testing mode is enabled
                 # Instead posting the trace to the queue, it will be stored in this global variable
@@ -877,6 +891,7 @@ def create_trace_api(self, trace: Trace) -> TraceApi:
             toolsCalled=trace.tools_called,
             expectedTools=trace.expected_tools,
             testCaseId=trace.test_case_id,
+            testRunId=trace.test_run_id,
             turnId=trace.turn_id,
             confident_api_key=trace.confident_api_key,
             environment=(

diff --git a/deepeval/tracing/types.py b/deepeval/tracing/types.py
@@ -197,6 +197,7 @@ class Trace(BaseModel):
     metrics: Optional[List[BaseMetric]] = None
     metric_collection: Optional[str] = None
     test_case_id: Optional[str] = Field(None, serialization_alias="testCaseId")
+    test_run_id: Optional[str] = Field(None, serialization_alias="testRunId")
     turn_id: Optional[str] = Field(None, serialization_alias="turnId")
 
     # Don't serialize these

diff --git a/docs/content/docs/(concepts)/evaluation-llm-tracing.mdx b/docs/content/docs/(concepts)/evaluation-llm-tracing.mdx
@@ -454,6 +454,53 @@ Both `update_current_trace` and `update_current_span` accept the same set of `LL
 `tags` and `metadata` aren't just for filtering and visualization — they're real test case fields that custom metrics like [`GEval`](/docs/metrics-llm-evals) can read. If your eval criteria depend on, say, the user tier or the retrieval source, set those on the trace/span via `tags` / `metadata` and reference them in your `GEval` criteria.
 :::
 
+## Group traces into a test run
+
+By default each trace is logged on its own. Wrap your code in `trace_test_run(...)` to collect every trace produced inside the block into a single **test run**, where **each trace becomes one test case** evaluated against the run's metric collection:
+
+```python title="main.py" showLineNumbers {1,3}
+from deepeval.tracing import observe, trace_test_run, trace_test_run_id
+
+with trace_test_run(metric_collection="My Collection", identifier="nightly-run"):
+    for golden in goldens:
+        llm_app(golden.input)   # each trace here becomes one test case in the run
+```
+
+`trace_test_run` creates an in-progress test run on Confident AI and stamps every trace ended inside the block with that test run's id (and metric collection). It yields the test run id if you need it:
+
+```python title="main.py" showLineNumbers {1}
+with trace_test_run(metric_collection="My Collection") as test_run_id:
+    llm_app(query)
+    print(trace_test_run_id.get())  # same value as test_run_id; readable from any module while inside the block
+```
+
+| Parameter           | Type            | Description                                                                          |
+| ------------------- | --------------- | ------------------------------------------------------------------------------------ |
+| `metric_collection` | `str`           | The metric collection each trace's test case is evaluated against. **Required.**     |
+| `identifier`        | `Optional[str]` | A human-readable label for the test run, shown on the Confident AI platform.         |
+
+:::note
+The metric collection passed to `trace_test_run` is authoritative for the test run — it's what every trace's test case is evaluated against, even if a trace inside the block also sets its own `metric_collection`. Span-level metric collections still run as independent [component-level evals](/docs/evaluation-component-level-llm-evals).
+:::
+
+The test run stays in progress while you stream traces into it and is finalized automatically shortly after. To attach a trace to a run **without** the context manager — for example when you already have a test run id — pass it to `update_current_trace`:
+
+```python title="main.py" showLineNumbers {4,5}
+from deepeval.tracing import observe, update_current_trace
+
+@observe()
+def llm_app(query: str) -> str:
+    update_current_trace(test_run_id=test_run_id, metric_collection="My Collection")
+    ...
+```
+
+If you instrument with OpenTelemetry instead of `@observe`, set the test run id as a span attribute and `deepeval`'s exporter will pick it up:
+
+```python
+span.set_attribute("confident.trace.test_run_id", test_run_id)
+span.set_attribute("confident.trace.metric_collection", "My Collection")
+```
+
 ## Prettifying traces for coding agents
 
 Traces aren't only read by humans. When you run evals locally and a metric fails, the failing trace is also what coding agents like **Claude Code, Codex, and Cursor** load into context to figure out which prompt, retriever, or tool actually caused the regression.

diff --git a/docs/content/guides/guides-tracing-rag.mdx b/docs/content/guides/guides-tracing-rag.mdx
@@ -248,6 +248,7 @@ The `update_current_trace()` function allows you to set attributes on the trace
 | `test_case`         | `Optional[LLMTestCase]`    | Bulk assignment of multiple fields from a test case                  |
 | `confident_api_key` | `Optional[str]`            | API key for Confident AI integration                                 |
 | `test_case_id`      | `Optional[str]`            | Identifier for the associated test case                              |
+| `test_run_id`       | `Optional[str]`            | Identifier of the test run to add this trace to as a test case (see `trace_test_run`) |
 | `turn_id`           | `Optional[str]`            | Identifier for the specific interaction turn                         |
 | `metric_collection` | `Optional[str]`            | Attach a predefined Confident AI metric collection                   |
 

diff --git a/typescript/src/confident/api.ts b/typescript/src/confident/api.ts
@@ -62,6 +62,7 @@ export enum Endpoints {
   DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue",
   DATASET_ALIAS_VERSIONS_ENDPOINT = "/v1/datasets/:alias/versions",
   TEST_RUN_ENDPOINT = "/v1/test-run",
+  TEST_RUNS_ENDPOINT = "/v1/test-runs",
   TRACING_ENDPOINT = "/v1/tracing",
   TRACES_ENDPOINT = "/v1/traces",
   EVENT_ENDPOINT = "/v1/event",

diff --git a/typescript/src/tracing/api.ts b/typescript/src/tracing/api.ts
@@ -101,6 +101,7 @@ export interface TraceApi {
   threadId?: string;
   userId?: string;
   testCaseId?: string;
+  testRunId?: string;
   turnId?: string;
   input?: any;
   output?: any;

diff --git a/typescript/src/tracing/index.ts b/typescript/src/tracing/index.ts
@@ -10,6 +10,8 @@ export {
   traceManager,
   getCurrentSpan,
   getCurrentTrace,
+  traceTestRun,
+  getTraceTestRunId,
 } from "./tracing";
 
 export { setTracingContext } from "./trace-context";

diff --git a/typescript/src/tracing/tracing.ts b/typescript/src/tracing/tracing.ts
@@ -270,6 +270,7 @@ export interface Trace {
   threadId?: string;
   userId?: string;
   testCaseId?: string;
+  testRunId?: string;
   turnId?: string;
   input?: any;
   output?: any;
@@ -319,6 +320,58 @@ export function setCurrentTrace(trace: Trace | null): void {
   });
 }
 
+/** Async-local record of the test run opened by an enclosing `traceTestRun` call. */
+const traceTestRunContext = new AsyncLocalStorage<{
+  testRunId: string;
+  metricCollection: string;
+}>();
+
+/**
+ * Returns the id of the test run the caller is executing inside of, or
+ * `undefined` when no `traceTestRun` callback is on the async call chain.
+ */
+export function getTraceTestRunId(): string | undefined {
+  const activeRun = traceTestRunContext.getStore();
+  return activeRun?.testRunId;
+}
+
+/**
+ * Creates a test run on Confident AI, then invokes `fn` with the run recorded
+ * in async-local storage for the duration of the call.
+ *
+ * @param options.metricCollection Metric collection sent with the create
+ *   request and stored alongside the run id.
+ * @param options.identifier Optional label for the run; left out of the
+ *   request body when `undefined`.
+ * @param fn Callback to execute; receives the new test run id.
+ * @returns Whatever `fn` returns (awaited when it is a promise).
+ * @throws Error when the create response carries no id.
+ */
+export async function traceTestRun<T>(
+  options: { metricCollection: string; identifier?: string },
+  fn: (testRunId: string) => T | Promise<T>,
+): Promise<T> {
+  const { metricCollection, identifier } = options;
+  const payload: Record<string, any> =
+    identifier === undefined
+      ? { metricCollection }
+      : { metricCollection, identifier };
+
+  const response = await new Api().sendRequest(
+    HttpMethods.POST,
+    Endpoints.TEST_RUNS_ENDPOINT,
+    payload,
+  );
+  // The id may be nested under `data` or sit at the top level of the response.
+  const testRunId: string | undefined = response?.data?.id ?? response?.id;
+  if (!testRunId) {
+    throw new Error("Failed to create test run: no id returned.");
+  }
+
+  const store = { testRunId, metricCollection };
+  return traceTestRunContext.run(store, () => fn(testRunId));
+}
+
 export function withTracingContext<T>(
   span: BaseSpan | undefined,
   trace: Trace | undefined,
@@ -466,6 +504,15 @@ export class TraceManager {
       if (trace.status === TraceSpanStatus.IN_PROGRESS) {
         trace.status = TraceSpanStatus.SUCCESS;
       }
+      if (!trace.testRunId) {
+        const testRunStore = traceTestRunContext.getStore();
+        if (testRunStore?.testRunId) {
+          trace.testRunId = testRunStore.testRunId;
+          if (testRunStore.metricCollection) {
+            trace.metricCollection = testRunStore.metricCollection;
+          }
+        }
+      }
       if (this.traceCaptureSink) {
         this.traceCaptureSink(trace);
       } else if (!this.evaluating) {
@@ -797,6 +844,7 @@ export class TraceManager {
       threadId: trace.threadId,
       userId: trace.userId,
       testCaseId: trace.testCaseId,
+      testRunId: trace.testRunId,
       turnId: trace.turnId,
       input: trace.input,
       output: trace.output,
@@ -1447,6 +1495,7 @@ export interface UpdateCurrentTraceParams {
   threadId?: string;
   userId?: string;
   testCaseId?: string;
+  testRunId?: string;
   turnId?: string;
   input?: any;
   output?: any;
@@ -1469,6 +1518,7 @@ export const updateCurrentTrace = ({
   threadId,
   userId,
   testCaseId,
+  testRunId,
   turnId,
   input,
   output,
@@ -1516,6 +1566,9 @@ export const updateCurrentTrace = ({
   if (testCaseId !== undefined) {
     currentTrace.testCaseId = testCaseId;
   }
+  if (testRunId !== undefined) {
+    currentTrace.testRunId = testRunId;
+  }
   if (turnId !== undefined) {
     currentTrace.turnId = turnId;
   }