diff --git a/deepeval/evaluate/execute.py b/deepeval/evaluate/execute.py
index ab850e9048..9d6b59d993 100644
--- a/deepeval/evaluate/execute.py
+++ b/deepeval/evaluate/execute.py
@@ -1189,7 +1189,7 @@ def dfs(
                     if has_task_completion or requires_trace:
                         if llm_test_case is None:
                             llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
+                        llm_test_case.trace_dict = (
                             trace_manager.create_nested_spans_dict(span)
                         )
                     else:
@@ -1291,7 +1291,7 @@ def dfs(
                     if has_task_completion or requires_trace:
                         if llm_test_case is None:
                             llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
+                        llm_test_case.trace_dict = (
                             trace_manager.create_nested_spans_dict(
                                 current_trace.root_spans[0]
                             )
@@ -1305,15 +1305,15 @@
                             TraceSpanApiStatus.ERRORED
                         )
                         if current_trace.root_spans:
-                            current_trace.root_spans[0].status = (
-                                TraceSpanStatus.ERRORED
-                            )
-                            current_trace.root_spans[0].error = (
-                                format_error_text(
-                                    DeepEvalError(
-                                        "Trace has metrics but no LLMTestCase (missing input/output). "
-                                        "Are you sure you called `update_current_trace()`?"
-                                    )
+                            current_trace.root_spans[
+                                0
+                            ].status = TraceSpanStatus.ERRORED
+                            current_trace.root_spans[
+                                0
+                            ].error = format_error_text(
+                                DeepEvalError(
+                                    "Trace has metrics but no LLMTestCase (missing input/output). "
+                                    "Are you sure you called `update_current_trace()`?"
                                 )
                             )
                         if progress and pbar_eval_id is not None:
@@ -1997,7 +1997,7 @@ async def _a_execute_span_test_case(
     if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
-        test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
+        test_case.trace_dict = trace_manager.create_nested_spans_dict(span)
 
     for metric in metrics:
         metric.skipped = False
@@ -2096,7 +2096,7 @@ async def _a_execute_trace_test_case(
     if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
-        test_case._trace_dict = trace_manager.create_nested_spans_dict(
+        test_case.trace_dict = trace_manager.create_nested_spans_dict(
            trace.root_spans[0]
        )
 
@@ -2284,7 +2284,7 @@ def dfs(
                     if requires_trace:
                         if llm_test_case is None:
                             llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
+                        llm_test_case.trace_dict = (
                             trace_manager.create_nested_spans_dict(span)
                         )
                     else:
@@ -2383,7 +2383,7 @@ def dfs(
                     if requires_trace:
                         if llm_test_case is None:
                             llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
+                        llm_test_case.trace_dict = (
                             trace_manager.create_nested_spans_dict(
                                 current_trace.root_spans[0]
                             )
@@ -2393,15 +2393,15 @@
                         current_trace.status = TraceSpanStatus.ERRORED
                         trace_api.status = TraceSpanApiStatus.ERRORED
                         if current_trace.root_spans:
-                            current_trace.root_spans[0].status = (
-                                TraceSpanStatus.ERRORED
-                            )
-                            current_trace.root_spans[0].error = (
-                                format_error_text(
-                                    DeepEvalError(
-                                        "Trace has metrics but no LLMTestCase (missing input/output). "
-                                        "Are you sure you called `update_current_trace()`?"
-                                    )
+                            current_trace.root_spans[
+                                0
+                            ].status = TraceSpanStatus.ERRORED
+                            current_trace.root_spans[
+                                0
+                            ].error = format_error_text(
+                                DeepEvalError(
+                                    "Trace has metrics but no LLMTestCase (missing input/output). "
+                                    "Are you sure you called `update_current_trace()`?"
                                 )
                             )
                         if progress and pbar_eval_id is not None:
diff --git a/deepeval/metrics/plan_adherence/plan_adherence.py b/deepeval/metrics/plan_adherence/plan_adherence.py
index 8e46a28710..0e4b73c23f 100644
--- a/deepeval/metrics/plan_adherence/plan_adherence.py
+++ b/deepeval/metrics/plan_adherence/plan_adherence.py
@@ -179,7 +179,7 @@ async def a_measure(
 
     def _get_plan_adherence_score(self, task, plan, test_case):
         prompt = PlanAdherenceTemplate.evaluate_adherence(
-            task, "\n".join(plan), test_case._trace_dict
+            task, "\n".join(plan), test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -191,7 +191,7 @@ def _get_plan_adherence_score(self, task, plan, test_case):
 
     async def _a_get_plan_adherence_score(self, task, plan, test_case):
         prompt = PlanAdherenceTemplate.evaluate_adherence(
-            task, "\n".join(plan), test_case._trace_dict
+            task, "\n".join(plan), test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -203,7 +203,7 @@ async def _a_get_plan_adherence_score(self, task, plan, test_case):
 
     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -217,7 +217,7 @@ async def _a_extract_plan_from_trace(
         self, test_case: LLMTestCase
     ) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -229,7 +229,7 @@ async def _a_extract_plan_from_trace(
 
     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -241,7 +241,7 @@ def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
 
     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
diff --git a/deepeval/metrics/plan_quality/plan_quality.py b/deepeval/metrics/plan_quality/plan_quality.py
index 014a7a5efc..442f3a04ef 100644
--- a/deepeval/metrics/plan_quality/plan_quality.py
+++ b/deepeval/metrics/plan_quality/plan_quality.py
@@ -205,7 +205,7 @@ async def _a_get_plan_quality_score(self, task, plan):
 
     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -219,7 +219,7 @@ async def _a_extract_plan_from_trace(
         self, test_case: LLMTestCase
     ) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -231,7 +231,7 @@ async def _a_extract_plan_from_trace(
 
     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -243,7 +243,7 @@ def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
 
     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
diff --git a/deepeval/metrics/step_efficiency/step_efficiency.py b/deepeval/metrics/step_efficiency/step_efficiency.py
index e56f89b3fa..58dc104ce9 100644
--- a/deepeval/metrics/step_efficiency/step_efficiency.py
+++ b/deepeval/metrics/step_efficiency/step_efficiency.py
@@ -156,9 +156,9 @@ async def a_measure(
     def _get_score(
         self, task: str, test_case: LLMTestCase
     ) -> EfficiencyVerdict:
-        if test_case._trace_dict is not None:
+        if test_case.trace_dict is not None:
             prompt = StepEfficiencyTemplate.get_execution_efficiency(
-                task, test_case._trace_dict
+                task, test_case.trace_dict
             )
 
         return generate_with_schema_and_extract(
@@ -172,9 +172,9 @@ def _get_score(
     async def _a_get_score(
         self, task: str, test_case: LLMTestCase
     ) -> EfficiencyVerdict:
-        if test_case._trace_dict is not None:
+        if test_case.trace_dict is not None:
             prompt = StepEfficiencyTemplate.get_execution_efficiency(
-                task, test_case._trace_dict
+                task, test_case.trace_dict
             )
 
         return await a_generate_with_schema_and_extract(
@@ -187,7 +187,7 @@ async def _a_get_score(
 
     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -199,7 +199,7 @@ def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
 
     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
            metric=self,
diff --git a/deepeval/metrics/task_completion/task_completion.py b/deepeval/metrics/task_completion/task_completion.py
index 3ea982256c..3c3c565b49 100644
--- a/deepeval/metrics/task_completion/task_completion.py
+++ b/deepeval/metrics/task_completion/task_completion.py
@@ -187,10 +187,10 @@ async def _a_extract_task_and_outcome(
         self,
         test_case: LLMTestCase,
     ) -> Tuple:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        has_trace: bool = isinstance(test_case.trace_dict, Dict)
         if has_trace:
             prompt = TaskCompletionTemplate.extract_task_and_outcome_from_trace(
-                trace=test_case._trace_dict
+                trace=test_case.trace_dict
             )
         else:
             # TODO: Deprecate this soon
@@ -211,10 +211,10 @@ def _extract_task_and_outcome(
         self,
         test_case: LLMTestCase,
     ) -> Tuple:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        has_trace: bool = isinstance(test_case.trace_dict, Dict)
         if has_trace:
             prompt = TaskCompletionTemplate.extract_task_and_outcome_from_trace(
-                trace=test_case._trace_dict
+                trace=test_case.trace_dict
             )
         else:
             # TODO: Deprecate this soon
diff --git a/deepeval/test_case/llm_test_case.py b/deepeval/test_case/llm_test_case.py
index 44ffc72a93..84e386c590 100644
--- a/deepeval/test_case/llm_test_case.py
+++ b/deepeval/test_case/llm_test_case.py
@@ -26,9 +26,9 @@
     validate_mcp_servers,
 )
 
-_MLLM_IMAGE_REGISTRY: weakref.WeakValueDictionary[str, "MLLMImage"] = (
-    weakref.WeakValueDictionary()
-)
+_MLLM_IMAGE_REGISTRY: weakref.WeakValueDictionary[
+    str, "MLLMImage"
+] = weakref.WeakValueDictionary()
 
 
 @dataclass
@@ -374,7 +374,16 @@ class LLMTestCase(BaseModel):
             "customColumnKeyValues", "custom_column_key_values"
         ),
     )
-    _trace_dict: Optional[Dict] = PrivateAttr(default=None)
+    trace_dict: Optional[Dict] = Field(
+        default=None,
+        description=(
+            "Nested span tree from tracing (same shape as TraceManager."
+            "create_nested_spans_dict). Pass a pre-recorded trace for "
+            "post-hoc evaluation with metrics that require_trace=True."
+        ),
+        serialization_alias="traceDict",
+        validation_alias=AliasChoices("traceDict", "trace_dict"),
+    )
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
diff --git a/examples/tracing/test_posthoc_evaluation.py b/examples/tracing/test_posthoc_evaluation.py
new file mode 100644
index 0000000000..7268ad4cf6
--- /dev/null
+++ b/examples/tracing/test_posthoc_evaluation.py
@@ -0,0 +1,116 @@
+"""
+Post-hoc evaluation using LLMTestCase.trace_dict
+-------------------------------------------------
+DeepEval's agentic trace metrics (TaskCompletionMetric, StepEfficiencyMetric,
+PlanQualityMetric, PlanAdherenceMetric) work with two approaches:
+
+APPROACH 1 — @observe at runtime (standard DeepEval way)
+    Instrument functions with @observe before they run. DeepEval captures the
+    trace automatically and writes it into LLMTestCase.trace_dict after the call.
+
+APPROACH 2 — post-hoc via trace_dict (this file)
+    Your agent has *already* run and you have a saved trace (from a log file,
+    database, observability system, or a previous @observe run that was
+    serialised). Pass it directly as trace_dict= when constructing LLMTestCase.
+    No @observe, no live agent execution required.
+
+This is useful for:
+    - Agents you don't own or can't re-run (3rd-party pipelines)
+    - Offline / batch evaluation from logs
+    - CI workflows that replay saved traces
+    - Post-mortem analysis of production runs
+
+The trace dict shape must match what TraceManager.create_nested_spans_dict()
+produces:
+
+    {
+        "name": str,
+        "type": "agent" | "tool" | "llm" | "retriever" | "custom",
+        "input": {...},
+        "output": {...},
+        "children": [ ]
+    }
+
+Run:
+    python examples/tracing/test_posthoc_evaluation.py
+"""
+
+import json
+
+from deepeval import evaluate
+from deepeval.metrics import TaskCompletionMetric
+from deepeval.test_case import LLMTestCase
+
+
+# ---------------------------------------------------------------------------
+# 1. A saved trace (in practice you'd load this from a JSON file / DB / etc.)
+# ---------------------------------------------------------------------------
+def load_saved_trace() -> dict:
+    """
+    Simulates loading a pre-recorded agent trace.
+    In real usage: json.load(open("traces/run_2024-01-15.json"))
+    """
+    return {
+        "name": "trip_planner_agent",
+        "type": "agent",
+        "input": {"input": "Plan a 2-day trip to Paris with restaurants."},
+        "output": {
+            "output": [
+                "Eiffel Tower",
+                "Louvre Museum",
+                "Le Jules Verne",
+                "Angelina Paris",
+                "Septime",
+            ]
+        },
+        "children": [
+            {
+                "name": "itinerary_generator",
+                "type": "tool",
+                "input": {
+                    "inputParameters": {"destination": "Paris", "days": 2}
+                },
+                "output": {"output": ["Eiffel Tower", "Louvre Museum"]},
+                "children": [],
+            },
+            {
+                "name": "restaurant_finder",
+                "type": "tool",
+                "input": {"inputParameters": {"city": "Paris"}},
+                "output": {
+                    "output": ["Le Jules Verne", "Angelina Paris", "Septime"]
+                },
+                "children": [],
+            },
+        ],
+    }
+
+
+# ---------------------------------------------------------------------------
+# 2. Build LLMTestCase with the saved trace — no @observe needed
+# ---------------------------------------------------------------------------
+saved_trace = load_saved_trace()
+
+test_case = LLMTestCase(
+    input="Plan a 2-day trip to Paris with restaurants.",
+    actual_output=(
+        "Eiffel Tower, Louvre Museum, Le Jules Verne, Angelina Paris, Septime"
+    ),
+    trace_dict=saved_trace,  # post-hoc trace injection
+)
+
+# trace_dict is also accepted via its camelCase alias for JSON round-trips:
+#   LLMTestCase.model_validate({"input": "...", "traceDict": saved_trace})
+assert test_case.trace_dict == saved_trace
+
+# ---------------------------------------------------------------------------
+# 3. Evaluate — TaskCompletionMetric detects trace_dict and uses the
+#    trace-aware prompt path (same as it would with @observe at runtime)
+# ---------------------------------------------------------------------------
+task_completion = TaskCompletionMetric(threshold=0.5)
+
+print("Running post-hoc evaluation with saved trace …")
+evaluate([test_case], [task_completion])
+
+print(f"\nScore : {task_completion.score}")
+print(f"Reason: {task_completion.reason}")
diff --git a/tests/test_core/test_test_case/test_single_turn.py b/tests/test_core/test_test_case/test_single_turn.py
index b61878b119..aca8bb6684 100644
--- a/tests/test_core/test_test_case/test_single_turn.py
+++ b/tests/test_core/test_test_case/test_single_turn.py
@@ -13,7 +13,6 @@
 
 
 class TestLLMTestCaseInitialization:
-
     def test_minimal_initialization(self):
         test_case = LLMTestCase(input="What is the capital of France?")
 
@@ -35,13 +34,36 @@ def test_minimal_initialization(self):
         assert test_case.mcp_resources_called is None
         assert test_case.mcp_prompts_called is None
 
-        # Test private attributes have defaults
-        assert test_case._trace_dict is None
+        # trace_dict is public (optional nested span tree for post-hoc eval)
+        assert test_case.trace_dict is None
+        # Other private attributes have defaults
         assert test_case._dataset_rank is None
         assert test_case._dataset_alias is None
         assert test_case._dataset_id is None
         assert isinstance(test_case._identifier, str)
 
+    def test_trace_dict_constructor_and_alias(self):
+        nested = {
+            "name": "root_agent",
+            "type": "agent",
+            "input": {"input": "Book a flight to NYC"},
+            "output": {"summary": "done"},
+            "children": [],
+        }
+        tc = LLMTestCase(
+            input="Book a flight to NYC",
+            actual_output="Flight options listed.",
+            trace_dict=nested,
+        )
+        assert tc.trace_dict == nested
+        tc2 = LLMTestCase.model_validate(
+            {
+                "input": "hi",
+                "traceDict": {"name": "a", "type": "agent", "children": []},
+            }
+        )
+        assert tc2.trace_dict["name"] == "a"
+
     def test_full_initialization(self):
         tool_call = ToolCall(
             name="search_tool",
@@ -96,7 +118,6 @@ def test_full_initialization(self):
 
 
 class TestLLMTestCaseCamelCaseInitialization:
-
     def test_camelcase_field_initialization(self):
         input_text = "What is artificial intelligence?"
         actual_output_text = "AI is a branch of computer science..."
@@ -235,7 +256,6 @@ def test_tool_call_camelcase_initialization(self):
 
 
 class TestLLMTestCaseTypeValidation:
-
     def test_input_must_be_string(self):
         with pytest.raises(TypeError, match="'input' must be a string"):
             LLMTestCase(input=123)
@@ -477,7 +497,6 @@ def test_tool_call_repr_minimal(self):
 
 
 class TestEdgeCases:
-
     def test_empty_strings(self):
         test_case = LLMTestCase(
             input="", actual_output="", expected_output="", comments=""
@@ -580,7 +599,6 @@ def test_deeply_nested_structures(self):
 
 
 class TestSerialization:
-
     def test_serialization_aliases(self):
         test_case = LLMTestCase(
             input="test",
@@ -655,7 +673,8 @@ def test_private_attributes_not_in_model_dump(self):
 
         model_dict = test_case.model_dump()
 
-        assert "_trace_dict" not in model_dict
+        assert "trace_dict" in model_dict
+        assert model_dict["trace_dict"] is None
         assert "_dataset_rank" not in model_dict
         assert "_dataset_alias" not in model_dict
         assert "_dataset_id" not in model_dict
@@ -664,18 +683,18 @@
     def test_private_attributes_accessible(self):
         test_case = LLMTestCase(input="test")
 
-        assert test_case._trace_dict is None
+        assert test_case.trace_dict is None
         assert test_case._dataset_rank is None
         assert test_case._dataset_alias is None
         assert test_case._dataset_id is None
         assert isinstance(test_case._identifier, str)
 
-        test_case._trace_dict = {"key": "value"}
+        test_case.trace_dict = {"key": "value"}
        test_case._dataset_rank = 1
        test_case._dataset_alias = "test_alias"
        test_case._dataset_id = "test_id"
 
-        assert test_case._trace_dict == {"key": "value"}
+        assert test_case.trace_dict == {"key": "value"}
        assert test_case._dataset_rank == 1
        assert test_case._dataset_alias == "test_alias"
        assert test_case._dataset_id == "test_id"