Skip to content

Commit 2c67b64

Browse files
committed
feat: add list engagements test case
1 parent 1bcd156 commit 2c67b64

File tree

3 files changed

+880
-869
lines changed

3 files changed

+880
-869
lines changed

benchmarks/compare_tool_calls.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import json
2+
3+
def _is_placeholder(value):
    """Return True when *value* is a template string such as ``"<CONTACT_ID>"``."""
    return isinstance(value, str) and value.startswith("<") and value.endswith(">")


def _normalise_call(call):
    """Return ``(tool_name, parsed_input, status)`` extracted from a tool call dict.

    A ``tool_input`` given as a string is decoded as JSON; any other value is
    passed through unchanged.
    """
    tool_name = call['tool_name']
    raw_input = call['tool_input']
    parsed_input = json.loads(raw_input) if isinstance(raw_input, str) else raw_input
    return tool_name, parsed_input, call['tool_output']['status']


def _normalise_actual(call):
    """Like ``_normalise_call`` but return None for a malformed actual call."""
    try:
        return _normalise_call(call)
    except (KeyError, TypeError, ValueError):
        # Missing keys, non-dict entries or undecodable JSON: such a call can
        # never satisfy an expectation, so it is simply excluded from matching.
        return None


def _inputs_match(expected_input, actual_input):
    """Return True when *actual_input* satisfies every key of *expected_input*."""
    if not isinstance(expected_input, dict):
        return expected_input == actual_input
    if not isinstance(actual_input, dict):
        # An empty expectation constrains nothing; anything else needs a dict.
        return not expected_input

    for key, value in expected_input.items():
        if key not in actual_input:
            return False
        actual_value = actual_input[key]
        if _is_placeholder(value):
            # Placeholders only require that some non-empty value is present.
            if actual_value is None or actual_value == "":
                return False
        elif value != actual_value:
            return False
    return True


def _assign(expected_index, candidates, owner, visited):
    """Try to give *expected_index* an actual call, re-homing earlier matches if needed.

    ``owner[j]`` holds the expected index currently paired with actual call
    ``j`` (or None). Returns True when an assignment was found.
    """
    for actual_index in candidates[expected_index]:
        if actual_index in visited:
            continue
        visited.add(actual_index)
        holder = owner[actual_index]
        if holder is None or _assign(holder, candidates, owner, visited):
            owner[actual_index] = expected_index
            return True
    return False


def compare_tool_calls(expected_calls, actual_calls):
    """
    Compare expected tool calls with actual tool calls, allowing for flexible matching.

    Calls are matched regardless of order. An actual call satisfies an expected
    call when the tool name and output status are equal and every key of the
    expected input is present in the actual input with an equal value. Expected
    values wrapped in angle brackets (e.g. "<CONTACT_ID>") are placeholders and
    match any value other than None or the empty string.

    Each actual call is paired with at most one expected call. The pairing is
    found with augmenting paths, so a placeholder expectation cannot "steal"
    the only actual call that a stricter expectation needs.

    Malformed actual calls (missing keys, invalid JSON input) never match;
    malformed expected calls still raise, since they indicate a broken fixture.

    Args:
        expected_calls (list): List of expected tool call dictionaries
        actual_calls (list): List of actual tool call dictionaries

    Returns:
        tuple: (bool, str) - (success, reason)
    """
    if len(expected_calls) != len(actual_calls):
        mismatch_reason = f"Tool call count mismatch: Expected {len(expected_calls)}, got {len(actual_calls)}"
        print(mismatch_reason)
        return False, mismatch_reason

    # Parse every actual call once instead of once per expected call.
    normalised_actuals = [_normalise_actual(call) for call in actual_calls]

    expected_inputs = []
    candidates = []  # candidates[i] -> indices of actual calls compatible with expected i
    owner = [None] * len(actual_calls)  # owner[j] -> expected index paired with actual j

    for i, expected in enumerate(expected_calls):
        print(f"\nChecking expected tool call {i+1}:")
        print(f"Expected: {json.dumps(expected, indent=2)}")

        expected_tool_name, expected_input, expected_status = _normalise_call(expected)
        expected_inputs.append(expected_input)
        candidates.append([
            j for j, actual in enumerate(normalised_actuals)
            if actual is not None
            and actual[0] == expected_tool_name
            and actual[2] == expected_status
            and _inputs_match(expected_input, actual[1])
        ])

        if not _assign(i, candidates, owner, set()):
            mismatch_reason = f"No matching actual call found for expected call {i+1}"
            print(mismatch_reason)
            return False, mismatch_reason

    # Report the final pairing; earlier tentative pairings may have been re-homed.
    paired_actual = {
        expected_index: actual_index
        for actual_index, expected_index in enumerate(owner)
        if expected_index is not None
    }
    for i, expected_input in enumerate(expected_inputs):
        actual_index = paired_actual[i]
        print(f"\nExpected tool call {i+1} matched with actual call: {json.dumps(actual_calls[actual_index], indent=2)}")
        if not isinstance(expected_input, dict):
            continue
        actual_input = normalised_actuals[actual_index][1]
        placeholder_keys = [key for key, value in expected_input.items() if _is_placeholder(value)]
        for key in placeholder_keys:
            print(f"Placeholder {expected_input[key]} matched with actual value {actual_input[key]}")
        if placeholder_keys:
            print("Placeholder values were successfully matched")

    print("All expected tool calls were matched!")
    return True, "All tool calls match"

benchmarks/hubspot_benchmark.json

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,52 @@
3434
}
3535
}
3636
]
37+
},
38+
{
39+
"intent": "John needs to retrieve past engagements (calls, emails, meetings) related to Clara Reynolds to prepare the CEO for an important meeting.",
40+
"expected_tool_calls": [
41+
{
42+
"tool_name": "hubspot__list-or-search-contacts__ListOrSearchContacts",
43+
"tool_input": "{\"search\":\"Clara Reynolds\"}",
44+
"tool_output": {
45+
"status": "completed"
46+
}
47+
},
48+
{
49+
"tool_name": "hubspot__list-or-search-engagements__ListOrSearchEngagements",
50+
"tool_input": "{\"type\": \"CALL\", \"contactId\": \"<CONTACT_ID>\"}",
51+
"tool_output": {
52+
"status": "completed"
53+
}
54+
},
55+
{
56+
"tool_name": "hubspot__list-or-search-engagements__ListOrSearchEngagements",
57+
"tool_input": "{\"type\": \"EMAIL\", \"contactId\": \"<CONTACT_ID>\"}",
58+
"tool_output": {
59+
"status": "completed"
60+
}
61+
},
62+
{
63+
"tool_name": "hubspot__list-or-search-engagements__ListOrSearchEngagements",
64+
"tool_input": "{\"type\": \"MEETING\", \"contactId\": \"<CONTACT_ID>\"}",
65+
"tool_output": {
66+
"status": "completed"
67+
}
68+
},
69+
{
70+
"tool_name": "hubspot__list-or-search-engagements__ListOrSearchEngagements",
71+
"tool_input": "{\"type\": \"TASK\", \"contactId\": \"<CONTACT_ID>\"}",
72+
"tool_output": {
73+
"status": "completed"
74+
}
75+
},
76+
{
77+
"tool_name": "hubspot__list-or-search-engagements__ListOrSearchEngagements",
78+
"tool_input": "{\"type\": \"NOTE\", \"contactId\": \"<CONTACT_ID>\"}",
79+
"tool_output": {
80+
"status": "completed"
81+
}
82+
}
83+
]
3784
}
3885
]

0 commit comments

Comments
 (0)