diff --git a/deepeval/metrics/__init__.py b/deepeval/metrics/__init__.py
index 72cb973106..89b151b36c 100644
--- a/deepeval/metrics/__init__.py
+++ b/deepeval/metrics/__init__.py
@@ -25,6 +25,7 @@
from .contextual_precision.contextual_precision import ContextualPrecisionMetric
from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric
from .tool_correctness.tool_correctness import ToolCorrectnessMetric
+from .tool_permission.tool_permission import ToolPermissionMetric
from .json_correctness.json_correctness import JsonCorrectnessMetric
from .prompt_alignment.prompt_alignment import PromptAlignmentMetric
from .task_completion.task_completion import TaskCompletionMetric
@@ -101,6 +102,7 @@
"NonAdviceMetric",
"MisuseMetric",
"RoleViolationMetric",
+ "ToolPermissionMetric",
"RoleAdherenceMetric",
# Task-specific metrics
"ToolCorrectnessMetric",
diff --git a/deepeval/metrics/tool_permission/__init__.py b/deepeval/metrics/tool_permission/__init__.py
new file mode 100644
index 0000000000..e62fd37d54
--- /dev/null
+++ b/deepeval/metrics/tool_permission/__init__.py
@@ -0,0 +1,3 @@
+from .tool_permission import ToolPermissionMetric
+
+__all__ = ["ToolPermissionMetric"]
diff --git a/deepeval/metrics/tool_permission/tool_permission.py b/deepeval/metrics/tool_permission/tool_permission.py
new file mode 100644
index 0000000000..348bc7e2bc
--- /dev/null
+++ b/deepeval/metrics/tool_permission/tool_permission.py
@@ -0,0 +1,166 @@
+from typing import List, Optional
+
+from deepeval.metrics import BaseMetric
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+ check_llm_test_case_params,
+ construct_verbose_logs,
+)
+from deepeval.test_case import LLMTestCase, SingleTurnParams, ToolCall
+
+
+class ToolPermissionMetric(BaseMetric):
+ """Did the agent only call tools it was authorized to?
+
+ Unlike ``ToolCorrectnessMetric`` (which compares the tools that were called
+ against the tools that were *expected*), this metric checks the tools that
+ were called against a **permission policy** and does not care whether the
+ task was solved:
+
+ - ``allowed_tools`` — an allowlist. If provided, any called tool whose name
+ is not in the list is unauthorized (least privilege).
+ - ``denied_tools`` — an explicit denylist. Any called tool whose name is in
+ the list is unauthorized. A denial always wins over an allow.
+
+ The score is the fraction of tool calls that were authorized (``1.0`` when
+ no tools were called). This metric is fully **deterministic** and requires
+ no LLM, so it is cheap and reliable to run as a CI gate.
+ """
+
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.TOOLS_CALLED,
+ ]
+
+ def __init__(
+ self,
+ allowed_tools: Optional[List[str]] = None,
+ denied_tools: Optional[List[str]] = None,
+ threshold: float = 1.0,
+ include_reason: bool = True,
+ strict_mode: bool = False,
+ verbose_mode: bool = False,
+ ):
+ if allowed_tools is None and denied_tools is None:
+ raise ValueError(
+ "ToolPermissionMetric requires at least one of "
+ "`allowed_tools` (an allowlist) or `denied_tools` "
+ "(a denylist)."
+ )
+ self.allowed_tools = (
+ set(allowed_tools) if allowed_tools is not None else None
+ )
+ self.denied_tools = set(denied_tools or [])
+ self.threshold = 1.0 if strict_mode else threshold
+ self.include_reason = include_reason
+ self.strict_mode = strict_mode
+ self.verbose_mode = verbose_mode
+ # Deterministic metric: no evaluation model is used.
+ self.model = None
+ self.using_native_model = False
+ self.async_mode = False
+
+ def measure(
+ self,
+ test_case: LLMTestCase,
+ _show_indicator: bool = True,
+ _in_component: bool = False,
+ ) -> float:
+ check_llm_test_case_params(
+ test_case, self._required_params, None, None, self
+ )
+ self.test_case = test_case
+ with metric_progress_indicator(
+ self,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
+ ):
+ tools_called: List[ToolCall] = test_case.tools_called or []
+ unauthorized = self._unauthorized_calls(tools_called)
+ total = len(tools_called)
+ score = 1.0 if total == 0 else (total - len(unauthorized)) / total
+ self.score = (
+ 0 if self.strict_mode and score < self.threshold else score
+ )
+ self.success = self.score >= self.threshold
+ self.reason = self._generate_reason(tools_called, unauthorized)
+ self.verbose_logs = construct_verbose_logs(
+ self,
+ steps=[
+ f"Allowed tools: "
+ f"{sorted(self.allowed_tools) if self.allowed_tools is not None else 'ANY'}",
+ f"Denied tools: "
+ f"{sorted(self.denied_tools) if self.denied_tools else []}",
+ f"Tools called: {[t.name for t in tools_called]}",
+ f"Unauthorized calls: {[t.name for t in unauthorized]}",
+ f"Score: {self.score}\nReason: {self.reason}",
+ ],
+ )
+ return self.score
+
+ async def a_measure(
+ self,
+ test_case: LLMTestCase,
+ _show_indicator: bool = True,
+ _in_component: bool = False,
+ ) -> float:
+ # Deterministic metric — no async work to do; reuse the sync path.
+ return self.measure(
+ test_case,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
+ )
+
+ def _unauthorized_calls(
+ self, tools_called: List[ToolCall]
+ ) -> List[ToolCall]:
+ unauthorized = []
+ for tool in tools_called:
+ if tool.name in self.denied_tools:
+ unauthorized.append(tool)
+ elif (
+ self.allowed_tools is not None
+ and tool.name not in self.allowed_tools
+ ):
+ unauthorized.append(tool)
+ return unauthorized
+
+ def _generate_reason(
+ self,
+ tools_called: List[ToolCall],
+ unauthorized: List[ToolCall],
+ ) -> Optional[str]:
+ if not self.include_reason:
+ return None
+ if not tools_called:
+ return (
+ "No tools were called, so no permission boundary "
+ "could be violated."
+ )
+ if not unauthorized:
+ return (
+ f"All {len(tools_called)} tool call(s) stayed within "
+ "the permitted set."
+ )
+ names = [tool.name for tool in unauthorized]
+ allowed = (
+ sorted(self.allowed_tools)
+ if self.allowed_tools is not None
+ else "ANY"
+ )
+ denied = sorted(self.denied_tools) if self.denied_tools else []
+ return (
+ f"{len(unauthorized)} of {len(tools_called)} tool call(s) "
+ f"were unauthorized: {names}. Allowed={allowed}, "
+ f"Denied={denied}."
+ )
+
+ def is_successful(self) -> bool:
+ try:
+ self.success = self.score >= self.threshold
+ except (AttributeError, TypeError):
+ self.success = False
+ return self.success
+
+ @property
+ def __name__(self):
+ return "Tool Permission"
diff --git a/docs/content/docs/(non-llm)/meta.json b/docs/content/docs/(non-llm)/meta.json
index baec1f2929..844e5e93c4 100644
--- a/docs/content/docs/(non-llm)/meta.json
+++ b/docs/content/docs/(non-llm)/meta.json
@@ -3,6 +3,7 @@
"pages": [
"metrics-exact-match",
"metrics-pattern-match",
- "metrics-json-correctness"
+ "metrics-json-correctness",
+ "metrics-tool-permission"
]
}
diff --git a/docs/content/docs/(non-llm)/metrics-tool-permission.mdx b/docs/content/docs/(non-llm)/metrics-tool-permission.mdx
new file mode 100644
index 0000000000..739154b245
--- /dev/null
+++ b/docs/content/docs/(non-llm)/metrics-tool-permission.mdx
@@ -0,0 +1,115 @@
+---
+id: metrics-tool-permission
+title: Tool Permission
+sidebar_label: Tool Permission
+---
+
+
+The Tool Permission metric measures whether your agent only called tools it was **authorized** to, against a permission policy. Unlike the `ToolCorrectnessMetric`, which compares the tools that were called against the tools that were *expected*, this metric enforces **least privilege**: it flags any tool call outside the granted policy, regardless of whether the task was completed.
+
+:::note
+The `ToolPermissionMetric` does **not** rely on an LLM for evaluation. It checks the tools your agent called against an allowlist and/or a denylist, so it is deterministic, requires no API key, and has zero token cost — making it well suited as a CI gate.
+:::
+
+## Required Arguments
+
+To use the `ToolPermissionMetric`, you'll have to provide the following arguments when creating an [`LLMTestCase`](/docs/evaluation-test-cases#llm-test-case):
+
+- `tools_called`
+
+Read the [How Is It Calculated](#how-is-it-calculated) section below to learn how the tools called are used for metric calculation.
+
+## Usage
+
+```python
+from deepeval import evaluate
+from deepeval.metrics import ToolPermissionMetric
+from deepeval.test_case import LLMTestCase, ToolCall
+
+metric = ToolPermissionMetric(
+ allowed_tools=["search_kb", "reply_to_customer"], # allowlist (least privilege)
+ denied_tools=["issue_refund"], # optional denylist
+ threshold=1.0,
+ verbose_mode=True,
+)
+
+test_case = LLMTestCase(
+ input="What is my refund status?",
+ actual_output="Your refund is being processed.",
+ tools_called=[ToolCall(name="search_kb")],
+)
+
+# To run metric as a standalone
+# metric.measure(test_case)
+# print(metric.score, metric.reason)
+
+evaluate(test_cases=[test_case], metrics=[metric])
+```
+
+There are **TWO** policy parameters (at least one of which is required) and **FOUR** optional parameters when creating a `ToolPermissionMetric`:
+
+- `allowed_tools`: a list of permitted tool names (an allowlist). If provided, any called tool not in this list is unauthorized.
+- `denied_tools`: a list of forbidden tool names (a denylist). Any called tool in this list is unauthorized, and a denial always takes precedence over an allow.
+- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 1.0 (any unauthorized call fails).
+- [Optional] `include_reason`: a boolean which when set to `True`, includes a reason listing the unauthorized tools. Defaulted to `True`.
+- [Optional] `strict_mode`: a boolean which enforces a binary metric score, defaulted to `False`.
+- [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate the metric to the console. Defaulted to `False`.
+
+At least one of `allowed_tools` or `denied_tools` must be provided.
+
+### As a Standalone
+
+You can also run the `ToolPermissionMetric` on a single test case as a standalone, one-off execution.
+
+```python
+...
+
+metric.measure(test_case)
+print(metric.score, metric.reason)
+```
+
+## How Is It Calculated?
+
+The `ToolPermissionMetric` score is calculated according to the following equation:
+
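+<Equation formula="\text{Tool Permission} = \frac{\text{Number of Authorized Tool Calls}}{\text{Total Number of Tool Calls}}" />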
+
+A tool call is **unauthorized** if the tool is in `denied_tools`, or if `allowed_tools` is provided and the tool is not in it. When no tools were called, the score is `1`. The metric is successful when the score is greater than or equal to the `threshold`.
+
+## FAQs
+
+<FAQs
+  qas={[
+    {
+      question: "How is Tool Permission different from Tool Correctness?",
+      answer: (
+        <>
+          <code>ToolCorrectnessMetric</code> checks whether the agent called the{" "}
+          <em>expected</em> tools for a task. <code>ToolPermissionMetric</code>{" "}
+          checks whether the agent stayed within the tools it was{" "}
+          <em>allowed</em> to call — an authorization check, not a
+          task-correctness check.
+        </>
+      ),
+    },
+    {
+      question: "Does the Tool Permission metric call an LLM or cost money?",
+      answer: (
+        <>
+          No. The <code>ToolPermissionMetric</code> compares called tool names
+          against your allowlist/denylist — no model, no API key, zero token
+          cost, fully deterministic.
+        </>
+      ),
+    },
+    {
+      question: "What score do I get if the agent called no tools?",
+      answer: (
+        <>
+          <code>1</code>. If no tools were called, no permission boundary could
+          be violated, so the metric passes.
+        </>
+      ),
+    },
+  ]}
+/>
diff --git a/tests/test_metrics/test_tool_permission_metric.py b/tests/test_metrics/test_tool_permission_metric.py
new file mode 100644
index 0000000000..c9ab2cc90f
--- /dev/null
+++ b/tests/test_metrics/test_tool_permission_metric.py
@@ -0,0 +1,74 @@
+import pytest
+
+from deepeval.metrics import ToolPermissionMetric
+from deepeval.test_case import LLMTestCase, ToolCall
+
+
+def _test_case(tool_names):
+ return LLMTestCase(
+ input="do the task",
+ actual_output="done",
+ tools_called=[ToolCall(name=name) for name in tool_names],
+ )
+
+
+class TestToolPermissionMetric:
+ """ToolPermissionMetric is deterministic, so these run without any API key."""
+
+ def test_all_calls_authorized_passes(self):
+ metric = ToolPermissionMetric(allowed_tools=["search_kb", "reply"])
+ metric.measure(_test_case(["search_kb", "reply"]))
+ assert metric.score == 1.0
+ assert metric.is_successful() is True
+
+ def test_unauthorized_tool_fails(self):
+ metric = ToolPermissionMetric(allowed_tools=["search_kb"])
+ metric.measure(_test_case(["search_kb", "delete_account"]))
+ assert metric.score == 0.5
+ assert metric.is_successful() is False
+ assert "delete_account" in metric.reason
+
+ def test_denied_tool_fails_even_if_allowed(self):
+ metric = ToolPermissionMetric(
+ allowed_tools=["search_kb", "wire_transfer"],
+ denied_tools=["wire_transfer"],
+ )
+ metric.measure(_test_case(["wire_transfer"]))
+ assert metric.score == 0.0
+ assert metric.is_successful() is False
+
+ def test_no_tools_called_passes(self):
+ metric = ToolPermissionMetric(allowed_tools=["search_kb"])
+ metric.measure(_test_case([]))
+ assert metric.score == 1.0
+ assert metric.is_successful() is True
+
+ def test_denylist_only(self):
+ metric = ToolPermissionMetric(denied_tools=["rm_rf"])
+ metric.measure(_test_case(["safe_tool", "rm_rf"]))
+ assert metric.score == 0.5
+ assert metric.is_successful() is False
+
+ def test_partial_credit_with_threshold(self):
+ # 2 of 3 authorized -> ~0.67; passes at threshold 0.6.
+ metric = ToolPermissionMetric(allowed_tools=["a", "b"], threshold=0.6)
+ metric.measure(_test_case(["a", "b", "c"]))
+ assert round(metric.score, 2) == 0.67
+ assert metric.is_successful() is True
+
+ def test_strict_mode_zeroes_partial_success(self):
+ metric = ToolPermissionMetric(allowed_tools=["a"], strict_mode=True)
+ metric.measure(_test_case(["a", "b"]))
+ assert metric.score == 0
+ assert metric.is_successful() is False
+
+ def test_requires_a_policy(self):
+ with pytest.raises(ValueError):
+ ToolPermissionMetric()
+
+ @pytest.mark.asyncio
+ async def test_async_measure_matches_sync(self):
+ metric = ToolPermissionMetric(allowed_tools=["a"])
+ score = await metric.a_measure(_test_case(["a", "b"]))
+ assert score == 0.5
+ assert metric.is_successful() is False