Skip to content

Commit 1a18b1d

Browse files
committed
Fix task completion tool prompt rendering
1 parent 8ebfa33 commit 1a18b1d

2 files changed

Lines changed: 94 additions & 0 deletions

File tree

deepeval/metrics/task_completion/task_completion.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
construct_verbose_logs,
66
check_llm_test_case_params,
77
initialize_model,
8+
print_tools_called,
89
a_generate_with_schema_and_extract,
910
generate_with_schema_and_extract,
1011
)
@@ -194,6 +195,9 @@ async def _a_extract_task_and_outcome(
194195
input=test_case.input,
195196
actual_output=test_case.actual_output,
196197
tools_called=test_case.tools_called,
198+
tools_called_formatted=print_tools_called(
199+
test_case.tools_called
200+
),
197201
)
198202
return await a_generate_with_schema_and_extract(
199203
metric=self,
@@ -220,6 +224,9 @@ def _extract_task_and_outcome(
220224
input=test_case.input,
221225
actual_output=test_case.actual_output,
222226
tools_called=test_case.tools_called,
227+
tools_called_formatted=print_tools_called(
228+
test_case.tools_called
229+
),
223230
)
224231
return generate_with_schema_and_extract(
225232
metric=self,
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import pytest
2+
3+
import deepeval.metrics.task_completion.task_completion as task_completion_module
4+
from deepeval.metrics import TaskCompletionMetric
5+
from deepeval.models import DeepEvalBaseLLM
6+
from deepeval.test_case import LLMTestCase, ToolCall
7+
8+
9+
class FakeLLM(DeepEvalBaseLLM):
    """Stand-in model for these tests.

    Every generation method returns the literal string "{}" and the model
    reports its name as "fake-llm". The tests in this file replace the
    metric's generation helpers via monkeypatch, so these return values are
    presumably never inspected -- verify against the metric implementation.
    """

    def get_model_name(self, *args, **kwargs):
        """Return the fixed model name used by this fake."""
        return "fake-llm"

    def load_model(self, *args, **kwargs):
        """Return this instance itself as the loaded model."""
        return self

    def generate(self, *args, **kwargs):
        """Return an empty JSON object string, ignoring all arguments."""
        return "{}"

    async def a_generate(self, *args, **kwargs):
        """Async counterpart of generate; returns the same "{}" string."""
        return "{}"
21+
22+
23+
def make_test_case():
    """Build the LLMTestCase shared by the tests in this file.

    The case carries a single ``weather_lookup`` tool call whose input
    mentions "San Francisco"; both strings are what the prompt assertions
    later look for.
    """
    weather_call = ToolCall(
        name="weather_lookup",
        input_parameters={"city": "San Francisco"},
        output={"forecast": "sunny"},
    )
    return LLMTestCase(
        input="Plan a weather-aware picnic",
        actual_output="Checked the weather and suggested a park picnic.",
        tools_called=[weather_call],
    )
35+
36+
37+
def assert_prompt_formats_tools_called(prompt):
    """Assert that ``prompt`` contains the tool call details.

    The prompt must mention the tool name and its city argument, and must
    not contain the literal "{{ tools_called_formatted }}" placeholder.

    Raises:
        AssertionError: if any of the three checks fails.
    """
    for expected_fragment in ("weather_lookup", "San Francisco"):
        assert expected_fragment in prompt
    # A leftover placeholder means the template variable was never filled in.
    assert "{{ tools_called_formatted }}" not in prompt
41+
42+
43+
def test_task_completion_goal_prompt_formats_tools_called_sync(monkeypatch):
    """Sync extraction path: the prompt handed to the generator includes the tool call.

    The module-level ``generate_with_schema_and_extract`` is swapped for a
    recorder that stores the ``prompt`` keyword argument and returns a fixed
    ("task", "outcome") pair.
    """
    seen = {}

    def record_prompt(**kwargs):
        # Keep the prompt so it can be inspected after the call returns.
        seen["prompt"] = kwargs["prompt"]
        return "task", "outcome"

    monkeypatch.setattr(
        task_completion_module,
        "generate_with_schema_and_extract",
        record_prompt,
    )

    metric = TaskCompletionMetric(model=FakeLLM(), async_mode=False)
    extracted = metric._extract_task_and_outcome(make_test_case())

    assert extracted == ("task", "outcome")
    assert_prompt_formats_tools_called(seen["prompt"])
63+
64+
65+
@pytest.mark.asyncio
async def test_task_completion_goal_prompt_formats_tools_called_async(
    monkeypatch,
):
    """Async extraction path: the prompt handed to the generator includes the tool call.

    The module-level ``a_generate_with_schema_and_extract`` is swapped for an
    async recorder that stores the ``prompt`` keyword argument and returns a
    fixed ("task", "outcome") pair.
    """
    seen = {}

    async def record_prompt(**kwargs):
        # Keep the prompt so it can be inspected after the awaited call returns.
        seen["prompt"] = kwargs["prompt"]
        return "task", "outcome"

    monkeypatch.setattr(
        task_completion_module,
        "a_generate_with_schema_and_extract",
        record_prompt,
    )

    metric = TaskCompletionMetric(model=FakeLLM())
    extracted = await metric._a_extract_task_and_outcome(make_test_case())

    assert extracted == ("task", "outcome")
    assert_prompt_formats_tools_called(seen["prompt"])

0 commit comments

Comments
 (0)