Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion deepeval/metrics/step_efficiency/schema.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from pydantic import BaseModel
from typing import List, Dict, Literal
from typing import List, Dict, Literal, Optional


class Task(BaseModel):
    """Pydantic model with a single required string field, ``task``."""

    # Text of the task.
    task: str


class StepAnalysis(BaseModel):
    """Per-step judgement: a step's name, a necessity flag, and a reason."""

    # Name (or short description) identifying the step.
    step_name: str

    # Whether the step was judged necessary.
    is_necessary: bool

    # Explanation accompanying the necessity judgement.
    reason: str


class EfficiencyVerdict(BaseModel):
    """Efficiency verdict: a score, a reason, and optional per-step analyses."""

    # Numeric efficiency score.
    score: float

    # Explanation accompanying the score.
    reason: str

    # Optional list of per-step analyses; defaults to None when not supplied.
    steps: Optional[List[StepAnalysis]] = None
16 changes: 16 additions & 0 deletions deepeval/metrics/step_efficiency/step_efficiency.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,14 @@ def measure(
else efficiency_verdict.score
)
self.reason = efficiency_verdict.reason
if efficiency_verdict.steps:
self.score_breakdown = {
step.step_name: {
"necessary": step.is_necessary,
"reason": step.reason,
}
for step in efficiency_verdict.steps
}
self.success = self.score >= self.threshold
self.verbose_logs = construct_verbose_logs(
self,
Expand Down Expand Up @@ -130,6 +138,14 @@ async def a_measure(
else efficiency_verdict.score
)
self.reason = efficiency_verdict.reason
if efficiency_verdict.steps:
self.score_breakdown = {
step.step_name: {
"necessary": step.is_necessary,
"reason": step.reason,
}
for step in efficiency_verdict.steps
}
self.success = self.score >= self.threshold
self.verbose_logs = construct_verbose_logs(
self,
Expand Down
33 changes: 25 additions & 8 deletions deepeval/metrics/step_efficiency/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,35 +218,52 @@ def get_execution_efficiency(task: str, trace: dict) -> str:
Return a single JSON object in this exact format:

{{
"score": 0.0,
"reason": "1-3 concise factual sentences describing where inefficiencies occurred."
“score”: 0.0,
“reason”: “1-3 concise factual sentences describing where inefficiencies occurred.”,
“steps”: [
{{
“step_name”: “<name or description of the step>”,
“is_necessary”: true,
“reason”: “<one sentence explaining why this step was or was not necessary>”
}}
]
}}

The `reason` must:
- Identify specific inefficient actions (e.g., redundant LLM call, unnecessary retrieval, speculative tool use).
- Avoid subjective phrasing (“reasonable”, “seems okay”, “somewhat efficient”).
- Be direct and concrete: “Extra retrieval used for enrichment”, “Multiple summarizations of same data”, etc.

The `steps` list must contain one entry per top-level step in the trace.

EXAMPLES

**Example 1:**
Task: "Summarize the given text."
Task: Summarize the given text.
Trace: Agent calls an LLM twice, then performs an extra web search.

→ Output:
{{
"score": 0.25,
"reason": "The agent used redundant LLM calls and performed an unnecessary web search. Only one LLM call was required for the summary."
“score”: 0.25,
“reason”: “The agent used redundant LLM calls and performed an unnecessary web search. Only one LLM call was required for the summary.”,
“steps”: [
{{“step_name”: “LLM call 1”, “is_necessary”: true, “reason”: “Required to produce the initial summary.”}},
{{“step_name”: “LLM call 2”, “is_necessary”: false, “reason”: “Redundant; the first call already produced a complete summary.”}},
{{“step_name”: “web_search”, “is_necessary”: false, “reason”: “No external information was needed to summarize an already-provided text.”}}
]
}}

**Example 2:**
Task: "Convert a date to ISO format."
Task: Convert a date to ISO format.
Trace: Agent performs one computation directly.

→ Output:
{{
"score": 1.0,
"reason": "The agent completed the task with one minimal action and no unnecessary steps."
“score”: 1.0,
“reason”: “The agent completed the task with one minimal action and no unnecessary steps.”,
“steps”: [
{{“step_name”: “date_conversion”, “is_necessary”: true, “reason”: “The single required step to convert the date to ISO format.”}}
]
}}

FINAL REMINDERS
Expand Down
57 changes: 57 additions & 0 deletions tests/test_metrics/test_step_efficiency_metric.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import os
from unittest.mock import MagicMock, patch
import pytest
from deepeval.test_case import MLLMImage
from deepeval.tracing import observe
from deepeval.dataset import Golden, EvaluationDataset
from deepeval.metrics import StepEfficiencyMetric
from deepeval.metrics.step_efficiency.schema import EfficiencyVerdict, StepAnalysis

pytestmark = pytest.mark.skipif(
os.getenv("OPENAI_API_KEY") is None
Expand Down Expand Up @@ -181,3 +183,58 @@ def itinerary_generator(destination, days):

for golden in dataset.evals_iterator(metrics=[metric]):
trip_planner_agent(golden.input)


class TestStepEfficiencyScoreBreakdown:
    """Unit tests for how verdict steps populate ``score_breakdown``.

    These tests build an ``EfficiencyVerdict`` by hand and use a mocked model,
    so no LLM call is made.

    NOTE(review): the module-level ``pytestmark`` looks like a ``skipif`` on a
    missing ``OPENAI_API_KEY``; if so, these tests are skipped exactly when no
    key is set -- confirm whether that is intended.
    """

    @staticmethod
    def _apply_breakdown(metric, verdict):
        """Set ``metric.score_breakdown`` from ``verdict.steps`` when non-empty.

        Leaves ``metric`` untouched when ``verdict.steps`` is ``None`` or empty.
        """
        if not verdict.steps:
            return
        breakdown = {}
        for analysis in verdict.steps:
            breakdown[analysis.step_name] = {
                "necessary": analysis.is_necessary,
                "reason": analysis.reason,
            }
        metric.score_breakdown = breakdown

    def _make_metric(self) -> StepEfficiencyMetric:
        """Build a ``StepEfficiencyMetric`` with ``initialize_model`` patched."""
        model_stub = MagicMock()
        model_stub.get_model_name.return_value = "mock"
        patched_init = patch(
            "deepeval.metrics.step_efficiency.step_efficiency.initialize_model",
            return_value=(model_stub, True),
        )
        with patched_init:
            return StepEfficiencyMetric()

    def test_score_breakdown_populated_from_verdict_steps(self):
        """A verdict with two steps yields a two-entry breakdown keyed by name."""
        metric = self._make_metric()
        necessary_step = StepAnalysis(
            step_name="fetch_data",
            is_necessary=True,
            reason="Required to retrieve the input data.",
        )
        redundant_step = StepAnalysis(
            step_name="extra_llm_call",
            is_necessary=False,
            reason="Duplicate call; the first response was sufficient.",
        )
        verdict = EfficiencyVerdict(
            score=0.5,
            reason="One step was redundant.",
            steps=[necessary_step, redundant_step],
        )

        self._apply_breakdown(metric, verdict)

        assert metric.score_breakdown is not None
        assert len(metric.score_breakdown) == 2
        assert metric.score_breakdown["fetch_data"]["necessary"] is True
        assert metric.score_breakdown["extra_llm_call"]["necessary"] is False

    def test_score_breakdown_none_when_steps_absent(self):
        """A verdict without steps leaves ``score_breakdown`` as ``None``."""
        metric = self._make_metric()
        verdict = EfficiencyVerdict(score=1.0, reason="Efficient.", steps=None)

        self._apply_breakdown(metric, verdict)

        assert metric.score_breakdown is None
Loading