diff --git a/deepeval/metrics/__init__.py b/deepeval/metrics/__init__.py
index 72cb973106..da3a21ba7b 100644
--- a/deepeval/metrics/__init__.py
+++ b/deepeval/metrics/__init__.py
@@ -35,6 +35,7 @@
from .tool_use.tool_use import ToolUseMetric
from .goal_accuracy.goal_accuracy import GoalAccuracyMetric
from .argument_correctness.argument_correctness import ArgumentCorrectnessMetric
+from .agent_loop_detection.agent_loop_detection import AgentLoopDetectionMetric
from .mcp.mcp_task_completion import MCPTaskCompletionMetric
from .mcp.multi_turn_mcp_use_metric import MultiTurnMCPUseMetric
from .mcp_use_metric.mcp_use_metric import MCPUseMetric
@@ -116,6 +117,7 @@
"PlanQualityMetric",
"ToolUseMetric",
"GoalAccuracyMetric",
+ "AgentLoopDetectionMetric",
# Conversational metrics
"TurnRelevancyMetric",
"ConversationCompletenessMetric",
diff --git a/deepeval/metrics/agent_loop_detection/__init__.py b/deepeval/metrics/agent_loop_detection/__init__.py
new file mode 100644
index 0000000000..42b424e1ea
--- /dev/null
+++ b/deepeval/metrics/agent_loop_detection/__init__.py
@@ -0,0 +1 @@
+from .agent_loop_detection import AgentLoopDetectionMetric
diff --git a/deepeval/metrics/agent_loop_detection/agent_loop_detection.py b/deepeval/metrics/agent_loop_detection/agent_loop_detection.py
new file mode 100644
index 0000000000..c0921de691
--- /dev/null
+++ b/deepeval/metrics/agent_loop_detection/agent_loop_detection.py
@@ -0,0 +1,564 @@
+import json
+from difflib import SequenceMatcher
+from typing import Optional, List, Tuple, Dict
+
+from deepeval.test_case import LLMTestCase, SingleTurnParams
+from deepeval.metrics import BaseMetric
+from deepeval.metrics.utils import (
+ construct_verbose_logs,
+ check_llm_test_case_params,
+)
+from deepeval.utils import get_or_create_event_loop
+from deepeval.metrics.indicator import metric_progress_indicator
+
+# Tokens removed from LLM outputs before the reasoning-stagnation check
+# compares them: common English stop words plus agent boilerplate ("next",
+# "step", "based", "provided", ...). Left in, they would inflate the
+# similarity of unrelated outputs without signalling true stagnation.
+# Entries are lowercase because outputs are lowercased before the lookup.
+# The stagnation check also drops every token of two characters or fewer,
+# so the short entries here ("a", "is", "to", ...) are belt-and-braces.
+_STOP_WORDS = {
+    "the",
+    "a",
+    "an",
+    "is",
+    "are",
+    "was",
+    "were",
+    "i",
+    "will",
+    "now",
+    "based",
+    "on",
+    "information",
+    "provided",
+    "to",
+    "of",
+    "in",
+    "and",
+    "that",
+    "this",
+    "with",
+    "for",
+    "it",
+    "my",
+    "next",
+    "step",
+    "going",
+    "so",
+    "do",
+    "be",
+    "have",
+    "has",
+    "not",
+    "but",
+    "as",
+    "or",
+    "from",
+    "at",
+    "by",
+    "about",
+    "above",
+    "below",
+    "up",
+    "its",
+    "let",
+}
+
+
+class AgentLoopDetectionMetric(BaseMetric):
+ """Detects infinite loops and cyclical patterns in agent execution traces.
+
+ Analyzes three independent sub-signals and returns a weighted score from
+ 0.0 (severe looping) to 1.0 (clean execution):
+
+ 1. **Tool Call Repetition** — Counts identical ``(name, args)`` tool
+ invocations. Score degrades at ``repetition_threshold`` (0.5) and
+ at ``2 × repetition_threshold`` (0.0).
+
+ 2. **Reasoning Stagnation** — Compares consecutive LLM-span outputs
+ using *both* bigram Jaccard similarity *and* ``SequenceMatcher``
+ ratio (stdlib ``difflib``). Taking the maximum of the two catches
+ stagnation whether the wording is literally repeated or merely
+ reordered ("I will now search" vs "Let me search now"). Common
+ stop words are stripped before comparison to prevent boilerplate
+ from inflating scores.
+
+ 3. **Call Graph Cycles** — DFS on the nested ``children`` tree. A
+ cycle is flagged when a span with the same ``type:name:input_hash``
+ label appears twice on the same root-to-leaf ancestry path.
+ Including a truncated input hash in the label reduces false
+ positives when two structurally different spans happen to share a
+ name (see *Limitations* in the metric documentation).
+
+ Design decisions
+ ~~~~~~~~~~~~~~~~
+ * **Fully deterministic** — no LLM / API key required. This is
+ intentional: the metric is designed to run in production pipelines
+ at zero cost and zero latency.
+ * **No ``model`` parameter** — because every sub-signal is computed
+ with deterministic algorithms, accepting a ``model`` argument would
+ be misleading. A future LLM-as-judge stagnation mode could be added
+ behind a feature flag if semantic comparison proves necessary.
+
+ Limitations
+ ~~~~~~~~~~~
+ * **Cycle detection** relies on ``type:name:input_hash`` identity.
+ Two genuinely different spans that share the same type, name, *and*
+ a truncated input hash could still be false-positived. The trace
+ dict (``_trace_dict``) does not expose span UUIDs, so this is the
+ best available heuristic.
+ * **Stagnation detection** uses structural text similarity (bigram
+ Jaccard + ``SequenceMatcher``). It will miss semantically identical
+ outputs that are worded very differently. An LLM-as-judge mode
+ would solve this but would sacrifice the deterministic / zero-cost
+ properties.
+ """
+
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
+ ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        repetition_threshold: int = 3,
+        similarity_threshold: float = 0.85,
+        check_tool_repetition: bool = True,
+        check_reasoning_stagnation: bool = True,
+        check_call_graph_cycles: bool = True,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        """Store the metric configuration; no validation is performed.
+
+        Args:
+            threshold: Minimum passing score. Replaced by ``1`` when
+                ``strict_mode`` is set.
+            repetition_threshold: Identical tool calls needed before the
+                repetition sub-signal degrades.
+            similarity_threshold: Similarity between consecutive LLM outputs
+                at which the stagnation sub-signal degrades.
+            check_tool_repetition: Enable the tool-repetition sub-signal.
+            check_reasoning_stagnation: Enable the stagnation sub-signal.
+            check_call_graph_cycles: Enable the call-graph-cycle sub-signal.
+            include_reason: Stored on the instance as ``include_reason``.
+            async_mode: When true, ``measure`` runs ``a_measure`` on an
+                event loop instead of scoring inline.
+            strict_mode: Forces the threshold to ``1``.
+            verbose_mode: Stored on the instance as ``verbose_mode``.
+        """
+        # Behaviour switches shared with the rest of the metric API.
+        self.strict_mode = strict_mode
+        self.async_mode = async_mode
+        self.verbose_mode = verbose_mode
+        self.include_reason = include_reason
+
+        # Strict mode demands a perfect score, so it overrides ``threshold``.
+        self.threshold = 1 if strict_mode else threshold
+
+        # Sub-signal tuning and toggles.
+        self.repetition_threshold = repetition_threshold
+        self.similarity_threshold = similarity_threshold
+        self.check_tool_repetition = check_tool_repetition
+        self.check_reasoning_stagnation = check_reasoning_stagnation
+        self.check_call_graph_cycles = check_call_graph_cycles
+
+        # No LLM is involved, so the model-related attributes are fixed.
+        self.model = None
+        self.evaluation_model = None
+        self.using_native_model = True
+
+        # Scoring reads the trace dict attached to the test case.
+        self.requires_trace = True
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        """Score ``test_case`` synchronously and return ``self.score``.
+
+        The test case is first handed to ``check_llm_test_case_params``
+        together with ``_required_params``. Scoring then happens either
+        inline (``async_mode`` false) or by driving ``a_measure`` to
+        completion on an event loop (``async_mode`` true).
+
+        ``_log_metric_to_confident`` is only forwarded to ``a_measure``; the
+        inline path does not read it.
+        """
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
+        self.evaluation_cost = 0
+
+        progress = metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        )
+        with progress:
+            if not self.async_mode:
+                self._calculate_metric(test_case)
+                return self.score
+
+            # The indicator is already shown by this call, so the nested
+            # a_measure is told not to show a second one.
+            event_loop = get_or_create_event_loop()
+            event_loop.run_until_complete(
+                self.a_measure(
+                    test_case,
+                    _show_indicator=False,
+                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
+                )
+            )
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        """Async entry point: check ``test_case``, score it, return the score.
+
+        The scoring step (``_calculate_metric``) is an ordinary synchronous
+        call and nothing is awaited here; the coroutine form exists so the
+        metric can be driven through the same async interface as ``measure``
+        uses in ``async_mode``.
+
+        ``_log_metric_to_confident`` is accepted but not read in this method.
+        """
+        # Hand the test case and the required-parameter list to the shared
+        # parameter check before any scoring happens.
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
+
+        # The metric never calls a model, so the cost is always zero.
+        self.evaluation_cost = 0
+
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            self._calculate_metric(test_case)
+            return self.score
+
+    def _calculate_metric(self, test_case: LLMTestCase):
+        """Score ``test_case`` and populate the metric's result attributes.
+
+        Sets ``score``, ``success``, ``reason``, ``score_breakdown`` and
+        ``verbose_logs`` on ``self``; returns nothing.
+
+        * Without a trace (``test_case._trace_dict is None``) the metric
+          fails with score 0.0, an explanatory reason and an empty
+          ``score_breakdown``.
+        * Otherwise every enabled sub-signal is scored (disabled ones count
+          as 1.0), the scores are merged by ``_combine_scores`` and, in
+          strict mode, any score below the threshold collapses to 0.0.
+        * When ``include_reason`` is false, ``reason`` is set to ``None``
+          for a scored trace.
+        """
+        trace = test_case._trace_dict
+        if trace is None:
+            self.score = 0.0
+            self.success = False
+            self.reason = (
+                "No trace data found. This metric requires trace "
+                "data from @observe."
+            )
+            # Always define the breakdown: a metric instance reused across
+            # test cases must not keep reporting the previous sub-scores.
+            self.score_breakdown = {}
+            self.verbose_logs = ""
+            return
+
+        all_spans = self._extract_all_spans(trace)
+        tool_spans = [s for s in all_spans if s.get("type") == "tool"]
+        llm_spans = [s for s in all_spans if s.get("type") == "llm"]
+
+        rep_score, rep_reason = 1.0, "Tool repetition check skipped."
+        if self.check_tool_repetition:
+            rep_score, rep_reason = self._score_tool_repetition(tool_spans)
+
+        stag_score, stag_reason = 1.0, "Reasoning stagnation check skipped."
+        if self.check_reasoning_stagnation:
+            stag_score, stag_reason = self._score_reasoning_stagnation(
+                llm_spans
+            )
+
+        cycle_score, cycle_reason = 1.0, "Call graph cycles check skipped."
+        if self.check_call_graph_cycles:
+            cycle_score, cycle_reason = self._score_call_graph_cycles(trace)
+
+        self.score_breakdown = {
+            "tool_repetition": rep_score,
+            "reasoning_stagnation": stag_score,
+            "call_graph_cycles": cycle_score,
+        }
+        self.score = self._combine_scores(rep_score, stag_score, cycle_score)
+        if self.strict_mode and self.score < self.threshold:
+            self.score = 0.0
+
+        self.success = self.score >= self.threshold
+
+        if self.score == 1.0:
+            reason = "No loop patterns detected."
+        else:
+            # A skipped check keeps its 1.0 placeholder, so filtering on the
+            # score alone leaves only the sub-signals that found something.
+            findings = [
+                text
+                for value, text in (
+                    (rep_score, rep_reason),
+                    (stag_score, stag_reason),
+                    (cycle_score, cycle_reason),
+                )
+                if value < 1.0 and text
+            ]
+            reason = " ".join(findings)
+            if not reason.strip():
+                reason = "Loops detected but no explicit reason provided."
+
+        # ``include_reason`` is a constructor option; honour it instead of
+        # always attaching the explanation.
+        self.reason = reason if getattr(self, "include_reason", True) else None
+
+        self.verbose_logs = construct_verbose_logs(
+            self,
+            steps=[
+                f"Tool Repetition Score: {rep_score} ({rep_reason})",
+                f"Reasoning Stagnation Score: {stag_score} ({stag_reason})",
+                f"Call Graph Cycles Score: {cycle_score} ({cycle_reason})",
+                f"Combined Loop Score: {self.score}",
+                f"Final Reason: {self.reason}",
+            ],
+        )
+
+    def _extract_all_spans(self, trace_dict: Optional[Dict]) -> List[Dict]:
+        """Flatten the nested span tree into a list, parents before children.
+
+        Args:
+            trace_dict: Root span; each span may carry its sub-spans under
+                ``"children"``. ``None`` or an empty dict yields ``[]``.
+
+        Returns:
+            The span dicts themselves (not copies) in depth-first pre-order,
+            siblings in their original order.
+
+        Empty or non-dict nodes are skipped, and a missing or ``None``
+        ``"children"`` value is treated as "no children". The walk uses an
+        explicit stack, so trace depth is not bounded by the interpreter's
+        recursion limit.
+        """
+        if not trace_dict:
+            return []
+
+        spans: List[Dict] = []
+        pending = [trace_dict]
+        while pending:
+            node = pending.pop()
+            if not node or not isinstance(node, dict):
+                continue
+            spans.append(node)
+            children = node.get("children") or []
+            # Push in reverse so the first child is popped (visited) first.
+            pending.extend(reversed(list(children)))
+        return spans
+
+    def _score_tool_repetition(self, tool_spans: list) -> Tuple[float, str]:
+        """Score how often the same tool call (name + arguments) repeats.
+
+        Args:
+            tool_spans: Span dicts; only ``"name"`` and ``"input"`` are read.
+                A string ``input`` is parsed as JSON when possible so that a
+                serialized and an already-parsed argument dict compare equal.
+
+        Returns:
+            ``(score, reason)``: 0.0 when the most repeated call occurs at
+            least ``2 * repetition_threshold`` times, 0.5 when it occurs at
+            least ``repetition_threshold`` times, 1.0 otherwise.
+        """
+        if not tool_spans:
+            return 1.0, "No tool spans found."
+
+        def _canonical(value) -> str:
+            # Containers are serialized with sorted keys so that two nested
+            # argument dicts differing only in key order count as identical;
+            # scalars keep plain str() so 1 and "1" still compare equal.
+            if isinstance(value, (dict, list, tuple)):
+                try:
+                    return json.dumps(value, sort_keys=True, default=str)
+                except (TypeError, ValueError):
+                    return str(value)
+            return str(value)
+
+        call_counts: Dict[tuple, int] = {}
+        for span in tool_spans:
+            args = span.get("input", {})
+            if isinstance(args, str):
+                try:
+                    args = json.loads(args)
+                except (ValueError, RecursionError):
+                    # Not JSON: compare the raw string as-is.
+                    pass
+
+            if isinstance(args, dict):
+                args_key = tuple(
+                    sorted((str(k), _canonical(v)) for k, v in args.items())
+                )
+            else:
+                args_key = (_canonical(args),)
+
+            call = (span.get("name", ""), args_key)
+            call_counts[call] = call_counts.get(call, 0) + 1
+
+        # tool_spans is non-empty here, so there is always a maximum.
+        (tool_name, _), count = max(
+            call_counts.items(), key=lambda item: item[1]
+        )
+
+        if count >= self.repetition_threshold * 2:
+            return (
+                0.0,
+                f"Tool '{tool_name}' called {count} times with identical arguments.",
+            )
+        if count >= self.repetition_threshold:
+            return (
+                0.5,
+                f"Tool '{tool_name}' called {count} times with identical arguments.",
+            )
+
+        return 1.0, "Tool calls are within acceptable repetition limits."
+
+ # ------------------------------------------------------------------
+ # Call graph cycle detection
+ # ------------------------------------------------------------------
+
+    @staticmethod
+    def _input_hash(span: Dict) -> str:
+        """Return a short, stable fingerprint of a span's input.
+
+        The fingerprint is the third component of the ``type:name:input``
+        label used by cycle detection, so that two spans sharing a type and
+        name but receiving different input are not mistaken for recursion.
+
+        * Dict inputs are serialized with sorted keys, so key order does not
+          affect the result.
+        * Serializations of up to 64 characters are returned verbatim.
+        * Longer ones are returned as their first 47 characters, ``~`` and
+          16 hex digits of the SHA-256 of the *whole* text (64 characters in
+          total). A plain 64-character prefix would make every pair of long
+          inputs with a common beginning (for example a shared system
+          prompt) look identical.
+        """
+        import hashlib  # local: only this helper needs it
+
+        raw = span.get("input", "")
+        if isinstance(raw, dict):
+            try:
+                raw = json.dumps(raw, sort_keys=True, default=str)
+            except (TypeError, ValueError):
+                raw = str(raw)
+        text = str(raw)
+        if len(text) <= 64:
+            return text
+
+        digest = hashlib.sha256(
+            text.encode("utf-8", "backslashreplace")
+        ).hexdigest()[:16]
+        return f"{text[:47]}~{digest}"
+
+    def _score_call_graph_cycles(
+        self, trace_dict: Optional[Dict]
+    ) -> Tuple[float, str]:
+        """Detect recursion in the parent -> child span tree.
+
+        The tree under ``trace_dict`` is walked depth-first. Every span is
+        identified by ``(type, name, input fingerprint)``, the fingerprint
+        coming from ``_input_hash``. A cycle is reported when a span's
+        identity is already present among its own ancestors, i.e. twice on
+        the same root-to-leaf path.
+
+        The input is part of the identity so that two spans which merely
+        share a type and name (an outer ``planner`` delegating to an inner
+        ``planner`` with different input) are not flagged. The trace dict
+        carries no unique span id that this method could use instead, so
+        the identity is a heuristic.
+
+        Repetition at sibling positions is deliberately not a cycle; that is
+        handled by ``_score_tool_repetition``.
+
+        Args:
+            trace_dict: Root span, or ``None``/empty for "no trace".
+
+        Returns:
+            ``(0.0, reason)`` naming the ``type:name`` chain of the first
+            cycle found, otherwise ``(1.0, reason)``.
+        """
+        if not trace_dict:
+            return 1.0, "No call graph cycles detected."
+
+        def _identity(span: Dict) -> Tuple[str, str, str]:
+            # A tuple rather than a ":"-joined string: span names may contain
+            # ":" themselves, which made the joined label impossible to split
+            # back into type and name reliably.
+            return (
+                str(span.get("type", "unknown")),
+                str(span.get("name", "unnamed")),
+                self._input_hash(span),
+            )
+
+        def _find_cycle(span: Dict, ancestors: List[tuple]) -> List[tuple]:
+            """Return the offending ancestry chain, or [] if none exists."""
+            ident = _identity(span)
+            if ident in ancestors:
+                # Back-edge: the chain runs from the first occurrence down
+                # to this repeated span.
+                return ancestors[ancestors.index(ident):] + [ident]
+
+            ancestors.append(ident)
+            for child in span.get("children") or []:
+                if not isinstance(child, dict):
+                    continue
+                chain = _find_cycle(child, ancestors)
+                if chain:
+                    return chain
+            ancestors.pop()
+            return []
+
+        cycle = _find_cycle(trace_dict, [])
+        if cycle:
+            # The input fingerprint is left out of the message for readability.
+            cycle_str = " -> ".join(
+                f"{span_type}:{span_name}" for span_type, span_name, _ in cycle
+            )
+            return 0.0, f"Cycle detected in execution path: {cycle_str}."
+
+        return 1.0, "No execution cycles detected."
+
+ # ------------------------------------------------------------------
+ # Reasoning stagnation detection
+ # ------------------------------------------------------------------
+
+    def _score_reasoning_stagnation(self, llm_spans: list) -> Tuple[float, str]:
+        """Score how strongly consecutive LLM outputs repeat each other.
+
+        Each adjacent pair of spans is compared on its lowercased,
+        stop-word-free word list using two signals; the larger one counts:
+
+        1. **Bigram Jaccard** - overlap between the sets of adjacent word
+           pairs. Insensitive to where in the text a phrase occurs.
+        2. **SequenceMatcher ratio** (``difflib``) - character-level ratio of
+           the re-joined word lists. Order-sensitive, but tolerant of small
+           insertions and deletions.
+
+        A pair is skipped when either output is not a string or has fewer
+        than 20 words left after cleaning (too little text to compare).
+
+        Args:
+            llm_spans: Span dicts in the order they should be compared; only
+                ``"output"`` is read.
+
+        Returns:
+            ``(score, reason)``. With ``m`` the highest pair similarity:
+            1.0 when ``m < self.similarity_threshold``; otherwise 0.0 when
+            ``m > 0.95`` and 0.5 when it is not.
+        """
+        if len(llm_spans) < 2:
+            return (
+                1.0,
+                "Not enough LLM spans to check for reasoning stagnation.",
+            )
+
+        min_words = 20
+        identical_cutoff = 0.95
+
+        def _clean_words(text) -> Optional[List[str]]:
+            """Lowercase, strip stop words, drop short tokens (None if not str)."""
+            if not isinstance(text, str):
+                return None
+            return [
+                w
+                for w in text.lower().split()
+                if w not in _STOP_WORDS and len(w) > 2
+            ]
+
+        def _bigram_jaccard(words_a: List[str], words_b: List[str]) -> float:
+            bg_a = set(zip(words_a, words_a[1:]))
+            bg_b = set(zip(words_b, words_b[1:]))
+            union = bg_a | bg_b
+            if not union:
+                return 0.0
+            return len(bg_a & bg_b) / len(union)
+
+        def _sequence_ratio(text_a: str, text_b: str) -> float:
+            # autojunk must be off: by default difflib stops indexing any
+            # element that makes up more than about 1% of a sequence of 200+
+            # items. For character sequences of ordinary text that is almost
+            # every letter, so near-identical long outputs could score 0.
+            # Cost: without the heuristic the comparison is slower on very
+            # long outputs.
+            return SequenceMatcher(
+                None, text_a, text_b, autojunk=False
+            ).ratio()
+
+        # Clean every output once; each interior span takes part in two pairs.
+        cleaned = [_clean_words(span.get("output", "")) for span in llm_spans]
+
+        max_overlap = 0.0
+        stagnating_pair = (-1, -1)
+
+        for i, (words1, words2) in enumerate(zip(cleaned, cleaned[1:])):
+            if words1 is None or words2 is None:
+                continue
+            if len(words1) < min_words or len(words2) < min_words:
+                continue
+
+            similarity = max(
+                _bigram_jaccard(words1, words2),
+                _sequence_ratio(" ".join(words1), " ".join(words2)),
+            )
+            if similarity > max_overlap:
+                max_overlap = similarity
+                stagnating_pair = (i, i + 1)
+
+        if max_overlap >= self.similarity_threshold:
+            if max_overlap > identical_cutoff:
+                return (
+                    0.0,
+                    f"Identical reasoning outputs at steps "
+                    f"{stagnating_pair[0]} and {stagnating_pair[1]}.",
+                )
+            return (
+                0.5,
+                f"High reasoning overlap ({max_overlap:.2f}) at steps "
+                f"{stagnating_pair[0]} and {stagnating_pair[1]}.",
+            )
+
+        return 1.0, "No reasoning stagnation."
+
+    def _combine_scores(
+        self, rep_score: float, stag_score: float, cycle_score: float
+    ) -> float:
+        """Return the weighted mean of the enabled sub-signal scores.
+
+        Weights are 0.40 (tool repetition), 0.35 (reasoning stagnation) and
+        0.25 (call graph cycles). A disabled sub-signal contributes neither
+        to the numerator nor to the denominator; with every sub-signal
+        disabled the result is 1.0.
+        """
+        signals = (
+            (self.check_tool_repetition, 0.40, rep_score),
+            (self.check_reasoning_stagnation, 0.35, stag_score),
+            (self.check_call_graph_cycles, 0.25, cycle_score),
+        )
+
+        weight_sum = 0.0
+        weighted_total = 0.0
+        for enabled, weight, score in signals:
+            if not enabled:
+                continue
+            weight_sum += weight
+            weighted_total += score * weight
+
+        # Nothing enabled means nothing can have been detected.
+        if weight_sum == 0.0:
+            return 1.0
+        return weighted_total / weight_sum
+
+    def is_successful(self) -> bool:
+        """Recompute, store and return whether the metric passed.
+
+        The metric passes only when no error is recorded and ``score`` is at
+        least ``threshold``. A score that cannot be compared with the
+        threshold counts as a failure. The outcome is also written to
+        ``self.success``.
+        """
+        if self.error is None:
+            try:
+                outcome = self.score >= self.threshold
+            except TypeError:
+                # Raised when the two values cannot be ordered, e.g. a score
+                # of None.
+                outcome = False
+        else:
+            outcome = False
+
+        self.success = outcome
+        return self.success
+
+    @property
+    def __name__(self):
+        """Human-readable name of this metric."""
+        return "Agent Loop Detection"
diff --git a/docs/content/docs/(agentic)/meta.json b/docs/content/docs/(agentic)/meta.json
index 1343906321..5802a13011 100644
--- a/docs/content/docs/(agentic)/meta.json
+++ b/docs/content/docs/(agentic)/meta.json
@@ -6,6 +6,7 @@
"metrics-argument-correctness",
"metrics-step-efficiency",
"metrics-plan-adherence",
- "metrics-plan-quality"
+ "metrics-plan-quality",
+ "metrics-agent-loop-detection"
]
}
diff --git a/docs/content/docs/(agentic)/metrics-agent-loop-detection.mdx b/docs/content/docs/(agentic)/metrics-agent-loop-detection.mdx
new file mode 100644
index 0000000000..f6d9ea85b0
--- /dev/null
+++ b/docs/content/docs/(agentic)/metrics-agent-loop-detection.mdx
@@ -0,0 +1,277 @@
+---
+id: metrics-agent-loop-detection
+title: Agent Loop Detection
+sidebar_label: Agent Loop Detection
+---
+
+
+
+The Agent Loop Detection metric is a **fully deterministic** (no LLM required) agentic metric that detects whether an LLM agent is stuck in an infinite loop or cyclical execution pattern. It analyzes the agent's full execution trace across three independent sub-signals and returns a score from **0.0** (severe looping detected) to **1.0** (clean execution).
+
+:::info
+Agent Loop Detection analyzes your **agent's full execution trace**, which requires [setting up tracing](/docs/evaluation-llm-tracing). Because it is fully deterministic, it runs in production without any API key or LLM call.
+:::
+
+## Required Arguments
+
+The `AgentLoopDetectionMetric` is a **trace-only** metric. It reads from the agent trace set via `update_current_trace` and does **not** require `tools_called` or `expected_tools`. It only requires the standard trace-based fields:
+
+- `input`
+- `actual_output`
+
+## Usage
+
+To begin, [set up tracing](/docs/evaluation-llm-tracing) and supply the `AgentLoopDetectionMetric()` to your `evals_iterator`.
+
+```python
+from deepeval.tracing import observe, update_current_trace
+from deepeval.dataset import Golden, EvaluationDataset
+from deepeval.metrics import AgentLoopDetectionMetric
+
+
+@observe()
+def search_web(query: str) -> str:
+ # Your tool implementation
+ return f"Results for: {query}"
+
+
+@observe()
+def my_agent(input: str) -> str:
+ result = search_web(input)
+ update_current_trace(input=input, output=result)
+ return result
+
+
+# Create dataset
+dataset = EvaluationDataset(goldens=[Golden(input="What is the weather in Paris?")])
+
+# Initialize metric — no model or API key needed
+loop_metric = AgentLoopDetectionMetric(threshold=0.5)
+
+# Evaluate
+for golden in dataset.evals_iterator(metrics=[loop_metric]):
+ my_agent(golden.input)
+```
+
+There are **EIGHT** optional parameters when creating an `AgentLoopDetectionMetric`:
+
+- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to `0.5`.
+- [Optional] `repetition_threshold`: an integer representing how many identical tool calls (same name + same arguments) must occur before flagging repetition. Defaulted to `3`.
+- [Optional] `similarity_threshold`: a float representing the minimum similarity score between consecutive LLM outputs to flag as stagnating. The metric uses the maximum of bigram Jaccard similarity and `SequenceMatcher` ratio. Defaulted to `0.85`.
+- [Optional] `check_tool_repetition`: a boolean — when `True`, enables the tool repetition sub-signal. Defaulted to `True`.
+- [Optional] `check_reasoning_stagnation`: a boolean — when `True`, enables the reasoning stagnation sub-signal. Defaulted to `True`.
+- [Optional] `check_call_graph_cycles`: a boolean — when `True`, enables the call graph cycle sub-signal. Defaulted to `True`.
+- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`.
+- [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate said metric to the console, as outlined in the [How Is It Calculated](#how-is-it-calculated) section. Defaulted to `False`.
+
+:::info
+The `AgentLoopDetectionMetric` does **not** accept a `model` parameter. All three sub-signals are computed deterministically (hashing, n-gram set operations, DFS) — no LLM is involved. See [Why no `model` parameter?](#why-no-model-parameter) for the design rationale.
+:::
+
+To learn more about how the `evals_iterator` works, [click here.](/docs/evaluation-end-to-end-llm-evals#e2e-evals-for-tracing)
+
+### Within components
+
+You can also run `AgentLoopDetectionMetric` within nested components for [component-level](/docs/evaluation-component-level-llm-evals) evaluation:
+
+```python
+from deepeval.tracing import observe, update_current_span
+from deepeval.test_case import LLMTestCase
+from deepeval.metrics import AgentLoopDetectionMetric
+
+loop_metric = AgentLoopDetectionMetric(threshold=0.5)
+
+@observe(metrics=[loop_metric])
+def inner_agent_component(input: str) -> str:
+ output = "..." # Your agent logic here
+ test_case = LLMTestCase(input=input, actual_output=output)
+ update_current_span(test_case=test_case)
+ return output
+
+@observe()
+def outer_agent(input: str) -> str:
+ return inner_agent_component(input)
+```
+
+### As a standalone
+
+You can also run `AgentLoopDetectionMetric` on a single test case as a standalone, one-off execution:
+
+```python
+from deepeval.test_case import LLMTestCase
+from deepeval.metrics import AgentLoopDetectionMetric
+
+# test_case._trace_dict must be populated from @observe tracing
+metric = AgentLoopDetectionMetric(threshold=0.5)
+metric.measure(test_case)
+
+print(metric.score)   # e.g. 0.6  (0.0×0.40 + 1.0×0.35 + 1.0×0.25)
+print(metric.reason) # e.g. "Tool 'search_web' called 6 times with identical arguments."
+print(metric.score_breakdown)
+# {
+# "tool_repetition": 0.0,
+# "reasoning_stagnation": 1.0,
+# "call_graph_cycles": 1.0,
+# }
+```
+
+:::caution
+Running as a standalone is useful for debugging or building a custom pipeline, but you will **NOT** get the benefits of testing reports, Confident AI platform integration, or the optimizations (speed, caching, computation) that `evaluate()` or `deepeval test run` offer.
+:::
+
+## How Is It Calculated?
+
+The `AgentLoopDetectionMetric` score is a **weighted combination** of three independent sub-signals:
+
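+$$\text{Agent Loop Detection} = \frac{\sum_{i \in \text{enabled}} w_i \cdot S_i}{W_{\text{enabled}}}, \qquad W_{\text{enabled}} = \sum_{i \in \text{enabled}} w_i$$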
+
+Where each sub-score is **1.0** (no issue), **0.5** (mild issue), or **0.0** (severe issue), and $W_{\text{enabled}}$ is the sum of weights for all enabled checks. Disabling a sub-signal removes its weight from the denominator so it never penalizes the overall score.
+
+| Sub-signal | Weight | What it detects |
+|---|---|---|
+| Tool Call Repetition | 40% | Same tool called with identical arguments ≥ `repetition_threshold` times |
+| Reasoning Stagnation | 35% | Consecutive LLM outputs share ≥ `similarity_threshold` similarity |
+| Call Graph Cycles | 25% | A span appears twice on the same root-to-leaf ancestry path (true recursion) |
+
+### Tool Call Repetition
+
+This sub-signal hashes each tool call as `(tool_name, sorted_args)` and counts how many times each unique call appears in the trace. Only calls with **identical** name and arguments are counted as repeats — calls with different arguments (e.g. a refined search query) are treated as distinct and are never penalized.
+
+| Condition | Score |
+|---|---|
+| Max repeats < `repetition_threshold` | **1.0** — within acceptable limits |
+| Max repeats ≥ `repetition_threshold` | **0.5** — mild repetition loop |
+| Max repeats ≥ `repetition_threshold × 2` | **0.0** — severe repetition loop |
+
+### Reasoning Stagnation
+
+This sub-signal compares **consecutive LLM span outputs** using two complementary similarity measures and takes the **maximum** of both:
+
+1. **Bigram Jaccard similarity** — bag-of-bigrams overlap after stripping common stop words and agent boilerplate phrases. Catches literal repetition. Outputs with fewer than 20 meaningful words are skipped (Jaccard is unreliable at small scale).
+2. **`SequenceMatcher` ratio** (Python `difflib`) — sequence-aware comparison that catches reordered but semantically identical text (e.g. `"I will now search"` ≈ `"Let me search now"`).
+
+Taking the maximum ensures stagnation is flagged whether the agent repeats itself verbatim or merely shuffles its phrasing.
+
+| Condition | Score |
+|---|---|
+| Max similarity < `similarity_threshold` | **1.0** — no stagnation |
+| Max similarity ≥ `similarity_threshold` | **0.5** — high overlap, likely stagnating |
+| Max similarity > 0.95 | **0.0** — outputs are essentially identical |
+
+### Call Graph Cycles
+
+This sub-signal traverses the parent→child span tree from the agent trace and runs a depth-first search (DFS). A **cycle** is detected when a span's `type:name:input_hash` label appears a second time on the **same root-to-leaf ancestry path** — meaning a span with the same type, name, and input is its own ancestor (true recursion). Two spans that share a `type:name` but receive different inputs are **not** treated as a cycle.
+
+:::note
+Sequential repetition (the same tool appearing at sibling positions) is intentionally **not** flagged here — that is the responsibility of the Tool Call Repetition sub-signal above.
+:::
+
+| Condition | Score |
+|---|---|
+| No back-edges in the call tree | **1.0** — clean execution graph |
+| At least one back-edge found | **0.0** — cycle detected; reason includes the full cycle path |
+
+## Score Interpretation
+
+| Score range | Interpretation |
+|---|---|
+| `1.0` | Clean execution — no loop patterns detected |
+| `0.5–1.0` | Mild issues — some repetition or overlap; agent likely recovers |
+| `0.0–0.5` | Severe looping — agent is likely stuck; human review recommended |
+| `0.0` | Critical loop — every enabled sub-signal reported a severe issue (in `strict_mode`, any score below `1.0` is also reported as `0.0`) |
+
+**Example: clean agent**
+```
+Agent Loop Detection
+Score: 1.0
+Reason: No loop patterns detected.
+```
+
+**Example: looping agent**
+```
+Agent Loop Detection
+Score: 0.25
+Reason: Tool 'search_web' called 6 times with identical arguments.
+ Identical reasoning outputs at steps 2 and 3.
+```
+
+## Score Breakdown
+
+The `score_breakdown` attribute exposes each sub-signal score independently after evaluation:
+
+```python
+metric = AgentLoopDetectionMetric(threshold=0.5)
+
+# After measure() runs:
+print(metric.score_breakdown)
+# {
+# "tool_repetition": 0.0,
+# "reasoning_stagnation": 1.0,
+# "call_graph_cycles": 1.0,
+# }
+
+print(metric.score)   # 0.6 (weighted: (0.0×0.40 + 1.0×0.35 + 1.0×0.25) / 1.00 = 0.60)
+print(metric.reason) # "Tool 'search_web' called 6 times with identical arguments."
+```
+
+A value of `1.0` means no issue was detected for that sub-signal. Lower values indicate degradation proportional to the severity of the loop pattern.
+
+## Configuring Sub-signals
+
+Each of the three sub-signals can be toggled independently. When a sub-signal is disabled, its weight is excluded from the denominator so the score is not penalized:
+
+```python
+# Only check for tool repetition — ignore stagnation and graph cycles
+metric = AgentLoopDetectionMetric(
+ check_tool_repetition=True,
+ check_reasoning_stagnation=False,
+ check_call_graph_cycles=False,
+ repetition_threshold=2, # flag after 2 identical calls (stricter)
+)
+
+# Only check for true recursive cycles in the call graph
+metric = AgentLoopDetectionMetric(
+ check_tool_repetition=False,
+ check_reasoning_stagnation=False,
+ check_call_graph_cycles=True,
+)
+```
+
+## Limitations & Design Decisions
+
+### Why no `model` parameter?
+
+This metric is **fully deterministic by design**. Every sub-signal is computed with hashing, set operations, and sequence comparison algorithms — no LLM is needed. This means:
+
+- **Zero cost** — runs in production without API keys or network calls.
+- **Deterministic** — identical traces always produce identical scores (critical for CI/CD gates).
+- **Zero latency** — no LLM round-trip; the metric completes in milliseconds.
+
+A future `model` parameter could enable an LLM-as-judge mode for the reasoning stagnation sub-signal, which would catch semantically identical outputs that differ greatly in wording. This is tracked as a potential future enhancement.
+
+### Cycle detection uses `type:name:input_hash` labels
+
+The call graph cycle detector identifies spans by a `type:name:input_hash` label (where `input_hash` is a 64-character truncated serialization of the span's input). This is a **heuristic** — the trace dict does not expose span UUIDs (they are stripped internally), so exact span identity is unavailable.
+
+**Trade-offs:**
+
+- Including the input hash reduces false positives when two genuinely different spans share the same `type:name` (e.g. an outer `"planner"` agent delegating to an inner `"planner"` with different input).
+- A true recursive loop passes the same or similar input back to itself, so the input hash matches and the cycle is correctly detected.
+- In the rare edge case where two unrelated same-name spans happen to receive identical truncated inputs, a false positive could theoretically occur. In practice this is extremely unlikely.
+
+### Stagnation detection: dual-signal approach
+
+Reasoning stagnation deliberately uses **two** complementary signals and takes the maximum:
+
+| Signal | Catches | Misses |
+|---|---|---|
+| Bigram Jaccard | Literal repetition, shared phrase patterns | Word reordering |
+| `SequenceMatcher` ratio | Reordered-but-similar text, insertions/deletions | Semantically identical text with different vocabulary |
+
+This is strictly better than either signal alone, but it will still miss outputs that are **semantically identical but lexically different** (e.g. `"Search for Paris weather"` vs `"Look up the forecast in Paris"`). Catching that class of stagnation would require an LLM-as-judge, which would sacrifice the deterministic and zero-cost properties of this metric.
+
+---
+
+:::note Community Metric
+`AgentLoopDetectionMetric` was contributed by [Jeel Thummar](https://github.com/Jeel3011), author of [AGeval](https://pypi.org/project/ageval/) — an episodic evaluation framework for LangGraph agents. The loop detection patterns here are drawn directly from production experience with ReAct and LangGraph agents. See [issue #2643](https://github.com/confident-ai/deepeval/issues/2643) for the original proposal and design discussion.
+:::
diff --git a/tests/test_agent_loop_detection.py b/tests/test_agent_loop_detection.py
new file mode 100644
index 0000000000..6973e7f427
--- /dev/null
+++ b/tests/test_agent_loop_detection.py
@@ -0,0 +1,372 @@
+"""Tests for AgentLoopDetectionMetric.
+
+All tests are fully deterministic — no API key, no network, no LLM required.
+"""
+
+import pytest
+
+from deepeval.test_case import LLMTestCase
+from deepeval.metrics.agent_loop_detection import AgentLoopDetectionMetric
+
+
+# ---------------------------------------------------------------------------
+# Trace / test-case builder helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_tool_span(name: str, input_data: dict, children=None) -> dict:
+ """Build a minimal tool span dict matching DeepEval's _trace_dict format."""
+ span = {
+ "type": "tool",
+ "name": name,
+ "input": input_data,
+ "output": f"result of {name}",
+ "children": children or [],
+ }
+ return span
+
+
+def _make_llm_span(output: str, children=None) -> dict:
+ """Build a minimal LLM span dict."""
+ return {
+ "type": "llm",
+ "name": "llm_call",
+ "input": "some prompt",
+ "output": output,
+ "children": children or [],
+ }
+
+
+def _make_agent_span(name: str, children: list) -> dict:
+ """Build a root agent span wrapping child spans."""
+ return {
+ "type": "agent",
+ "name": name,
+ "input": "user query",
+ "output": "agent answer",
+ "children": children,
+ }
+
+
+def _make_test_case(trace_dict: dict) -> LLMTestCase:
+ """Create an LLMTestCase with a pre-built _trace_dict (no API needed)."""
+ tc = LLMTestCase(input="test input", actual_output="test output")
+ tc._trace_dict = trace_dict
+ return tc
+
+
+# ---------------------------------------------------------------------------
+# Test 1: Clean trace → score 1.0
+# ---------------------------------------------------------------------------
+
+
+def test_clean_trace_passes():
+ """A well-behaved agent with distinct tool calls should score 1.0."""
+ trace = _make_agent_span(
+ "planner",
+ [
+ _make_tool_span("search_web", {"query": "Paris weather"}),
+ _make_tool_span("get_forecast", {"city": "Paris"}),
+ _make_tool_span("summarize", {"text": "It is sunny"}),
+ ],
+ )
+ metric = AgentLoopDetectionMetric(threshold=0.5)
+ tc = _make_test_case(trace)
+ metric._calculate_metric(tc)
+
+ assert metric.score == 1.0
+ assert metric.success is True
+
+
+# ---------------------------------------------------------------------------
+# Test 2: Repeated tool calls → score ≤ 0.5
+# ---------------------------------------------------------------------------
+
+
+def test_repeated_tool_calls_detected():
+ """Four identical tool calls with repetition_threshold=3 should fail.
+
+ The metric scores 0.5 for moderate repetition (count >= threshold but
+ < 2x threshold). Using threshold=0.7 ensures 0.5 < threshold → failure.
+ """
+ trace = _make_agent_span(
+ "looping_agent",
+ [
+ _make_tool_span("search_web", {"query": "Paris weather"}),
+ _make_tool_span("search_web", {"query": "Paris weather"}),
+ _make_tool_span("search_web", {"query": "Paris weather"}),
+ _make_tool_span("search_web", {"query": "Paris weather"}),
+ ],
+ )
+ metric = AgentLoopDetectionMetric(
+ threshold=0.7, # > 0.5 so that a moderate repetition score fails
+ repetition_threshold=3,
+ check_reasoning_stagnation=False,
+ check_call_graph_cycles=False,
+ )
+ tc = _make_test_case(trace)
+ metric._calculate_metric(tc)
+
+ assert metric.score_breakdown["tool_repetition"] <= 0.5
+ assert metric.success is False
+
+
+# ---------------------------------------------------------------------------
+# Test 3: No trace data → score 0.0 with "No trace data" in reason
+# ---------------------------------------------------------------------------
+
+
+def test_no_trace_returns_zero():
+ """Missing trace should immediately return 0.0 with a descriptive reason."""
+ metric = AgentLoopDetectionMetric()
+ tc = LLMTestCase(input="x", actual_output="y")
+ tc._trace_dict = None
+ metric._calculate_metric(tc)
+
+ assert metric.score == 0.0
+ assert "No trace data" in metric.reason
+
+
+# ---------------------------------------------------------------------------
+# Test 4: Stagnating LLM outputs → stagnation score < 1.0
+# ---------------------------------------------------------------------------
+
+
+def test_reasoning_stagnation_detected():
+ """Identical long LLM outputs should be flagged as stagnating."""
+ repeated_output = (
+ "I need to search for Paris weather using the search tool. "
+ "The user wants current weather conditions in Paris France. "
+ "I will call the search tool with query Paris weather forecast today. "
+ "After retrieving results I will summarize them clearly for the user. "
+ "Paris weather search results will help answer the question completely."
+ )
+ # Build a trace with 3 identical LLM outputs (all > 20 meaningful words)
+ trace = _make_agent_span(
+ "stagnating_agent",
+ [
+ _make_llm_span(repeated_output),
+ _make_llm_span(repeated_output),
+ _make_llm_span(repeated_output),
+ ],
+ )
+ metric = AgentLoopDetectionMetric(
+ similarity_threshold=0.75,
+ check_tool_repetition=False,
+ check_call_graph_cycles=False,
+ )
+ tc = _make_test_case(trace)
+ metric._calculate_metric(tc)
+
+ assert metric.score_breakdown["reasoning_stagnation"] < 1.0
+
+
+# ---------------------------------------------------------------------------
+# Test 5: Selective sub-signal disabling
+# ---------------------------------------------------------------------------
+
+
+def test_disable_tool_repetition_check():
+ """When check_tool_repetition=False, repeated tools must not penalise score."""
+ trace = _make_agent_span(
+ "agent",
+ [
+ _make_tool_span("search_web", {"query": "q"}),
+ _make_tool_span("search_web", {"query": "q"}),
+ _make_tool_span("search_web", {"query": "q"}),
+ _make_tool_span("search_web", {"query": "q"}),
+ ],
+ )
+ metric = AgentLoopDetectionMetric(
+ check_tool_repetition=False,
+ check_reasoning_stagnation=False,
+ check_call_graph_cycles=False,
+ )
+ tc = _make_test_case(trace)
+ metric._calculate_metric(tc)
+
+ # All checks disabled → should default to 1.0
+ assert metric.score_breakdown["tool_repetition"] == 1.0
+ assert metric.score == 1.0
+
+
+# ---------------------------------------------------------------------------
+# Test 6: Score combining weights normalise correctly
+# ---------------------------------------------------------------------------
+
+
+def test_score_combines_with_correct_weights():
+ """When cycles are disabled, rep+stag weights must sum to 1.0 after norm."""
+ metric = AgentLoopDetectionMetric(
+ check_tool_repetition=True,
+ check_reasoning_stagnation=True,
+ check_call_graph_cycles=False,
+ )
+ # rep=1.0 * 0.40, stag=0.0 * 0.35 → total=0.40 / 0.75
+ combined = metric._combine_scores(1.0, 0.0, 1.0)
+ expected = 0.40 / 0.75
+ assert abs(combined - expected) < 0.001
+
+
+# ---------------------------------------------------------------------------
+# Test 7: Call graph cycle (true recursive span) → cycle score 0.0
+# ---------------------------------------------------------------------------
+
+
+def test_call_graph_cycle_detected():
+ """A true recursive loop passes the same input back to itself.
+
+ The inner agent has the same type, name, AND input as the outer —
+ this is the signature of a genuine call graph cycle where the agent
+ re-invoked itself with the same request.
+ """
+ # agent("user query") → tool_A → agent("user query") ← true cycle
+ inner_agent = {
+ "type": "agent",
+ "name": "planner",
+ "input": "user query", # same input as outer → recursive loop
+ "output": "stuck",
+ "children": [],
+ }
+ outer_agent = {
+ "type": "agent",
+ "name": "planner",
+ "input": "user query", # same input as inner
+ "output": "answer",
+ "children": [
+ _make_tool_span("tool_a", {"x": "1"}, children=[inner_agent]),
+ ],
+ }
+ metric = AgentLoopDetectionMetric(
+ check_tool_repetition=False,
+ check_reasoning_stagnation=False,
+ check_call_graph_cycles=True,
+ )
+ tc = _make_test_case(outer_agent)
+ metric._calculate_metric(tc)
+
+ assert metric.score_breakdown["call_graph_cycles"] == 0.0
+ assert metric.success is False
+ assert "Cycle" in metric.reason
+
+
+# ---------------------------------------------------------------------------
+# Test 8: Sequential same-name calls are NOT flagged as call graph cycles
+# ---------------------------------------------------------------------------
+
+
+def test_sequential_same_name_not_a_cycle():
+ """
+ Calling tool A then tool B then tool A again is NOT a cycle — tool A appears
+ twice at sibling positions, not on the same ancestry path.
+ """
+ trace = _make_agent_span(
+ "agent",
+ [
+ _make_tool_span("search_web", {"q": "foo"}),
+ _make_tool_span("other_tool", {"x": "bar"}),
+ _make_tool_span(
+ "search_web", {"q": "baz"}
+ ), # same name, different args
+ ],
+ )
+ metric = AgentLoopDetectionMetric(
+ check_tool_repetition=False,
+ check_reasoning_stagnation=False,
+ check_call_graph_cycles=True,
+ )
+ tc = _make_test_case(trace)
+ metric._calculate_metric(tc)
+
+ # No cycle — these are siblings, not ancestors of each other
+ assert metric.score_breakdown["call_graph_cycles"] == 1.0
+
+
+# ---------------------------------------------------------------------------
+# Test 9: Same-name agents with DIFFERENT inputs are NOT false-positived
+# ---------------------------------------------------------------------------
+
+
+def test_same_name_different_input_not_a_cycle():
+ """Two agents with the same type:name but different inputs should not
+ be flagged as a cycle — input_hash disambiguates them.
+
+ This is the edge case where the old label-only approach would have
+ false-positived. Including a truncated input hash in the DFS label
+ makes the detection resilient to legitimate same-name delegation.
+ """
+ # outer planner delegates to an inner planner with different input
+ inner_planner = {
+ "type": "agent",
+ "name": "planner",
+ "input": "subtask: book hotel in Paris",
+ "output": "hotel booked",
+ "children": [],
+ }
+ outer_planner = {
+ "type": "agent",
+ "name": "planner", # same name!
+ "input": "plan trip to Paris", # but different input
+ "output": "trip planned",
+ "children": [
+ _make_tool_span(
+ "delegate", {"to": "planner"}, children=[inner_planner]
+ ),
+ ],
+ }
+ metric = AgentLoopDetectionMetric(
+ check_tool_repetition=False,
+ check_reasoning_stagnation=False,
+ check_call_graph_cycles=True,
+ )
+ tc = _make_test_case(outer_planner)
+ metric._calculate_metric(tc)
+
+ # Different inputs → different labels → no cycle
+ assert metric.score_breakdown["call_graph_cycles"] == 1.0
+ assert metric.success is True
+
+
+# ---------------------------------------------------------------------------
+# Test 10: Reordered-but-identical reasoning is caught by SequenceMatcher
+# ---------------------------------------------------------------------------
+
+
+def test_reordered_stagnation_detected():
+ """SequenceMatcher should catch stagnation even when the agent shuffles
+ its phrasing but conveys the same content.
+
+ Bigram Jaccard alone would miss this because the bigram sets differ
+ when words are reordered. SequenceMatcher, being sequence-aware,
+ produces a high ratio and triggers the stagnation flag.
+ """
+ output_a = (
+ "I need to search for the current weather conditions in Paris France "
+ "using the search tool. After retrieving the results I will summarize "
+ "them clearly for the user. The Paris weather search results will help "
+ "me answer the question completely and accurately for the user today."
+ )
+ # Same semantic content, moderate word reordering
+ output_b = (
+ "Let me search for the current weather conditions in Paris France "
+ "using the search tool. After I retrieve the results I will clearly "
+ "summarize them for the user. The search results for Paris weather "
+ "will help me completely and accurately answer the question for today."
+ )
+ trace = _make_agent_span(
+ "agent",
+ [
+ _make_llm_span(output_a),
+ _make_llm_span(output_b),
+ ],
+ )
+ metric = AgentLoopDetectionMetric(
+ similarity_threshold=0.75,
+ check_tool_repetition=False,
+ check_call_graph_cycles=False,
+ )
+ tc = _make_test_case(trace)
+ metric._calculate_metric(tc)
+
+ # SequenceMatcher should push the similarity above 0.75
+ assert metric.score_breakdown["reasoning_stagnation"] < 1.0
diff --git a/tests/test_core/test_tracing/apps/async_app.py b/tests/test_core/test_tracing/apps/async_app.py
index 8992fe45b5..fce1231e2e 100644
--- a/tests/test_core/test_tracing/apps/async_app.py
+++ b/tests/test_core/test_tracing/apps/async_app.py
@@ -1,11 +1,12 @@
-from deepeval.metrics import TaskCompletionMetric, AnswerRelevancyMetric
+import asyncio
+
+from deepeval.metrics import AnswerRelevancyMetric, TaskCompletionMetric
from deepeval.tracing import (
+ observe,
update_current_span,
update_llm_span,
update_retriever_span,
- observe,
)
-import asyncio
@observe(type="llm", model="gpt-4o")
@@ -62,9 +63,22 @@ async def custom_research_agent(query: str):
return analysis
+# NOTE: the metrics below are still instantiated at module level, i.e. when
+# this module is imported, exactly as they were when written inline in the
+# `@observe(...)` decorator arguments. Binding them to names only makes the
+# decorators easier to read; it does not defer construction. If a metric
+# constructor validates an API key, importing this module in a keyless
+# environment (e.g. pytest collection in CI) can therefore still fail.
+#
+# If this causes issues in keyless environments the metrics can be moved
+# fully inside the function body and passed via `update_current_span`.
+_weather_metrics = [AnswerRelevancyMetric()]
+_meta_metrics = [TaskCompletionMetric(task="Get the weather")]
+
+
@observe(
available_tools=["get_weather", "get_location"],
- metrics=[AnswerRelevancyMetric()],
+ metrics=_weather_metrics,
)
async def weather_agent(query: str):
update_current_span(
@@ -84,7 +98,7 @@ async def research_agent(query: str):
@observe(
type="agent",
agent_handoffs=["research_agent", "custom_research_agent"],
- metrics=[TaskCompletionMetric(task="Get the weather")],
+ metrics=_meta_metrics,
metric_collection="Test",
)
async def meta_agent(input: str):