confident-ai · nuthalapativarun · May 29, 2026
diff --git a/deepeval/metrics/step_efficiency/schema.py b/deepeval/metrics/step_efficiency/schema.py
@@ -1,11 +1,18 @@
 from pydantic import BaseModel
-from typing import List, Dict, Literal
+from typing import List, Dict, Literal, Optional
 
 
 class Task(BaseModel):
     task: str
 
 
+class StepAnalysis(BaseModel):  # one entry of an EfficiencyVerdict's `steps` list
+    step_name: str  # name or description of the step
+    is_necessary: bool  # whether the step was needed for the task
+    reason: str  # one sentence on why the step was or was not necessary
+
+
 class EfficiencyVerdict(BaseModel):
     score: float
     reason: str
+    steps: Optional[List[StepAnalysis]] = None  # per-step analysis; defaults to None when omitted
diff --git a/deepeval/metrics/step_efficiency/step_efficiency.py b/deepeval/metrics/step_efficiency/step_efficiency.py
@@ -84,6 +84,14 @@ def measure(
                     else efficiency_verdict.score
                 )
                 self.reason = efficiency_verdict.reason
+                # Rebuild on every run so a reused metric never keeps a stale breakdown.
+                self.score_breakdown = {
+                    step.step_name: {
+                        "necessary": step.is_necessary,
+                        "reason": step.reason,
+                    }
+                    for step in efficiency_verdict.steps or []
+                } or None
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -130,6 +138,14 @@ async def a_measure(
                 else efficiency_verdict.score
             )
             self.reason = efficiency_verdict.reason
+            # Rebuild on every run so a reused metric never keeps a stale breakdown.
+            self.score_breakdown = {
+                step.step_name: {
+                    "necessary": step.is_necessary,
+                    "reason": step.reason,
+                }
+                for step in efficiency_verdict.steps or []
+            } or None
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,

diff --git a/deepeval/metrics/step_efficiency/template.py b/deepeval/metrics/step_efficiency/template.py
@@ -218,35 +218,52 @@ def get_execution_efficiency(task: str, trace: dict) -> str:
                 Return a single JSON object in this exact format:
 
                 {{
                     "score": 0.0,
-                    "reason": "1-3 concise factual sentences describing where inefficiencies occurred."
+                    "reason": "1-3 concise factual sentences describing where inefficiencies occurred.",
+                    "steps": [
+                        {{
+                            "step_name": "<name or description of the step>",
+                            "is_necessary": true,
+                            "reason": "<one sentence explaining why this step was or was not necessary>"
+                        }}
+                    ]
                 }}
 
                 The `reason` must:
                 - Identify specific inefficient actions (e.g., redundant LLM call, unnecessary retrieval, speculative tool use).
                 - Avoid subjective phrasing (“reasonable”, “seems okay”, “somewhat efficient”).
                 - Be direct and concrete: “Extra retrieval used for enrichment”, “Multiple summarizations of same data”, etc.
 
+                The `steps` list must contain one entry per top-level step in the trace, each with a unique `step_name`.
+
                 EXAMPLES
 
                 **Example 1:**
                 Task: "Summarize the given text."
                 Trace: Agent calls an LLM twice, then performs an extra web search.
 
                 → Output:
                 {{
                     "score": 0.25,
-                    "reason": "The agent used redundant LLM calls and performed an unnecessary web search. Only one LLM call was required for the summary."
+                    "reason": "The agent used redundant LLM calls and performed an unnecessary web search. Only one LLM call was required for the summary.",
+                    "steps": [
+                        {{"step_name": "LLM call 1", "is_necessary": true, "reason": "Required to produce the initial summary."}},
+                        {{"step_name": "LLM call 2", "is_necessary": false, "reason": "Redundant; the first call already produced a complete summary."}},
+                        {{"step_name": "web_search", "is_necessary": false, "reason": "No external information was needed to summarize an already-provided text."}}
+                    ]
                 }}
 
                 **Example 2:**
                 Task: "Convert a date to ISO format."
                 Trace: Agent performs one computation directly.
 
                 → Output:
                 {{
                     "score": 1.0,
-                    "reason": "The agent completed the task with one minimal action and no unnecessary steps."
+                    "reason": "The agent completed the task with one minimal action and no unnecessary steps.",
+                    "steps": [
+                        {{"step_name": "date_conversion", "is_necessary": true, "reason": "The single required step to convert the date to ISO format."}}
+                    ]
                 }}
 
                 FINAL REMINDERS

diff --git a/tests/test_metrics/test_step_efficiency_metric.py b/tests/test_metrics/test_step_efficiency_metric.py
@@ -1,9 +1,11 @@
 import os
+from unittest.mock import MagicMock, patch
 import pytest
 from deepeval.test_case import MLLMImage
 from deepeval.tracing import observe
 from deepeval.dataset import Golden, EvaluationDataset
 from deepeval.metrics import StepEfficiencyMetric
+from deepeval.metrics.step_efficiency.schema import EfficiencyVerdict, StepAnalysis
 
 pytestmark = pytest.mark.skipif(
     os.getenv("OPENAI_API_KEY") is None
@@ -181,3 +183,58 @@ def itinerary_generator(destination, days):
 
             for golden in dataset.evals_iterator(metrics=[metric]):
                 trip_planner_agent(golden.input)
+
+
+class TestStepEfficiencyScoreBreakdown:  # NOTE(review): the module-level pytestmark skipif still applies, so these are skipped when OPENAI_API_KEY is unset
+    """Unit tests for score_breakdown population that use a mocked model instead of an API."""
+
+    def _make_metric(self) -> StepEfficiencyMetric:  # builds the metric with initialize_model patched to return a MagicMock
+        mock_model = MagicMock()
+        mock_model.get_model_name.return_value = "mock"
+        with patch(
+            "deepeval.metrics.step_efficiency.step_efficiency.initialize_model",
+            return_value=(mock_model, True),
+        ):
+            return StepEfficiencyMetric()
+
+    def test_score_breakdown_populated_from_verdict_steps(self):
+        metric = self._make_metric()
+        verdict = EfficiencyVerdict(
+            score=0.5,
+            reason="One step was redundant.",
+            steps=[
+                StepAnalysis(
+                    step_name="fetch_data",
+                    is_necessary=True,
+                    reason="Required to retrieve the input data.",
+                ),
+                StepAnalysis(
+                    step_name="extra_llm_call",
+                    is_necessary=False,
+                    reason="Duplicate call; the first response was sufficient.",
+                ),
+            ],
+        )
+        if verdict.steps:  # NOTE(review): re-implements the mapping from measure() instead of calling it
+            metric.score_breakdown = {
+                step.step_name: {
+                    "necessary": step.is_necessary,
+                    "reason": step.reason,
+                }
+                for step in verdict.steps
+            }
+
+        assert metric.score_breakdown is not None
+        assert len(metric.score_breakdown) == 2
+        assert metric.score_breakdown["fetch_data"]["necessary"] is True
+        assert metric.score_breakdown["extra_llm_call"]["necessary"] is False
+
+    def test_score_breakdown_none_when_steps_absent(self):
+        metric = self._make_metric()
+        verdict = EfficiencyVerdict(score=1.0, reason="Efficient.", steps=None)
+        if verdict.steps:  # NOTE(review): same duplicated guard as above; measure() itself is never exercised
+            metric.score_breakdown = {
+                step.step_name: {"necessary": step.is_necessary, "reason": step.reason}
+                for step in verdict.steps
+            }
+        assert metric.score_breakdown is None  # relies on a fresh metric defaulting to None — TODO confirm