Fix task completion tool prompt rendering

luochen211 · luochen211 · commit 1a18b1dbbd99 · 2026-06-28T21:01:10.000+08:00
diff --git a/deepeval/metrics/task_completion/task_completion.py b/deepeval/metrics/task_completion/task_completion.py
@@ -5,6 +5,7 @@
     construct_verbose_logs,
     check_llm_test_case_params,
     initialize_model,
+    print_tools_called,
     a_generate_with_schema_and_extract,
     generate_with_schema_and_extract,
 )
@@ -194,6 +195,9 @@ async def _a_extract_task_and_outcome(
                 input=test_case.input,
                 actual_output=test_case.actual_output,
                 tools_called=test_case.tools_called,
+                tools_called_formatted=print_tools_called(
+                    test_case.tools_called
+                ),
             )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -220,6 +224,9 @@ def _extract_task_and_outcome(
                 input=test_case.input,
                 actual_output=test_case.actual_output,
                 tools_called=test_case.tools_called,
+                tools_called_formatted=print_tools_called(
+                    test_case.tools_called
+                ),
             )
         return generate_with_schema_and_extract(
             metric=self,
diff --git a/tests/test_metrics/test_task_completion_prompt.py b/tests/test_metrics/test_task_completion_prompt.py
@@ -0,0 +1,87 @@
+import pytest
+
+import deepeval.metrics.task_completion.task_completion as task_completion_module
+from deepeval.metrics import TaskCompletionMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.test_case import LLMTestCase, ToolCall
+
+
+class FakeLLM(DeepEvalBaseLLM):  # stub model: canned replies only
+    def load_model(self, *args, **kwargs):
+        return self  # the stub stands in for its own loaded model
+
+    def generate(self, *args, **kwargs):
+        return "{}"  # fixed string, whatever the arguments
+
+    async def a_generate(self, *args, **kwargs):
+        return "{}"  # async variant returns the same fixed string
+
+    def get_model_name(self, *args, **kwargs):
+        return "fake-llm"
+
+
+def make_test_case():  # LLMTestCase with a single weather_lookup tool call
+    return LLMTestCase(
+        input="Plan a weather-aware picnic",
+        actual_output="Checked the weather and suggested a park picnic.",
+        tools_called=[  # name and city are searched for in the built prompt
+            ToolCall(
+                name="weather_lookup",
+                input_parameters={"city": "San Francisco"},
+                output={"forecast": "sunny"},
+            )
+        ],
+    )
+
+
+def assert_prompt_formats_tools_called(prompt):
+    assert "weather_lookup" in prompt  # tool name appears in the prompt
+    assert "San Francisco" in prompt  # tool input value appears as well
+    assert "{{ tools_called_formatted }}" not in prompt  # no literal marker
+
+
+def test_task_completion_goal_prompt_formats_tools_called_sync(monkeypatch):
+    captured = {}  # receives the prompt passed to the patched helper
+
+    def fake_generate_with_schema_and_extract(**kwargs):
+        captured["prompt"] = kwargs["prompt"]  # keep it for the asserts
+        return "task", "outcome"
+
+    monkeypatch.setattr(  # replace the helper the metric module calls
+        task_completion_module,
+        "generate_with_schema_and_extract",
+        fake_generate_with_schema_and_extract,
+    )
+
+    metric = TaskCompletionMetric(model=FakeLLM(), async_mode=False)
+
+    assert metric._extract_task_and_outcome(make_test_case()) == (
+        "task",
+        "outcome",
+    )
+    assert_prompt_formats_tools_called(captured["prompt"])  # the real check
+
+
+@pytest.mark.asyncio
+async def test_task_completion_goal_prompt_formats_tools_called_async(
+    monkeypatch,
+):
+    captured = {}  # receives the prompt passed to the patched helper
+
+    async def fake_a_generate_with_schema_and_extract(**kwargs):
+        captured["prompt"] = kwargs["prompt"]  # keep it for the asserts
+        return "task", "outcome"
+
+    monkeypatch.setattr(  # replace the async helper on the metric module
+        task_completion_module,
+        "a_generate_with_schema_and_extract",
+        fake_a_generate_with_schema_and_extract,
+    )
+
+    metric = TaskCompletionMetric(model=FakeLLM())  # async_mode not passed
+
+    assert await metric._a_extract_task_and_outcome(make_test_case()) == (
+        "task",
+        "outcome",
+    )
+    assert_prompt_formats_tools_called(captured["prompt"])  # the real check