-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_check.py
More file actions
271 lines (256 loc) · 10.7 KB
/
run_check.py
File metadata and controls
271 lines (256 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
"""
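Acceptance-test runner for the on-demand memory feature (NM-01 ~ NM-04),
plus the image-understanding (IM-01) and parent-summary skill (SK-01) cases.
Usage: python run_check.py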
"""
from __future__ import annotations
import asyncio
import sys
import time
import traceback
sys.path.insert(0, ".")
from app.services.chat_service import create_chat_service
from app.schemas.chat import ChatRequest
# Table of acceptance cases, executed top to bottom by run_tests().
#
# Each entry is a dict with:
#   "id"      - short case identifier printed in the report
#   "name"    - human-readable description printed next to the id
#   "request" - payload validated into a ChatRequest before being sent
#   "checks"  - list of (label, predicate) tuples; each predicate receives the
#               service response and returns a truthy value when satisfied
#
# Order matters: the "a"/"b" pairs reuse the same profile_id or session_id,
# so the "b" case relies on its "a" case having run first.
TESTS: list[dict] = [
    # ── Memory (NM) ──────────────────────────────────────────────────────────
    {
        "id": "NM-01a",
        "name": "跨会话记忆写入（companion session A）",
        "request": {
            "text": "我叫小明，我最喜欢的颜色是蓝色！",
            "mode": "companion",
            "age_hint": "6",
            "session_id": "nm-sess-A",
            "profile_id": "nm-test-child",
        },
        "checks": [
            (
                "memory.session_updated == True",
                lambda resp: resp.memory.session_updated is True,
            ),
            (
                "memory.written_types 非空",
                lambda resp: len(resp.memory.written_types) > 0,
            ),
            ("message 非空", lambda resp: bool(resp.message)),
        ],
    },
    {
        "id": "NM-01b",
        "name": "跨会话记忆召回（companion session B，同 profile_id）",
        "request": {
            "text": "你知道我最喜欢什么颜色吗？",
            "mode": "companion",
            "age_hint": "6",
            "session_id": "nm-sess-B",
            "profile_id": "nm-test-child",
        },
        "checks": [
            # selected_act holds the decision of the last iteration
            # (respond=direct); the actual tool call happens in the first
            # round, so it is verified through workflow_trace instead.
            (
                "'tools' in workflow_trace（read_memory 被调用）",
                lambda resp: "tools" in resp.metadata.workflow_trace,
            ),
            # "chatbot" appearing >= 2 times means the flow went through:
            # call tool -> observe -> re-reason.
            (
                "chatbot 迭代 ≥ 2 次",
                lambda resp: resp.metadata.workflow_trace.count("chatbot") >= 2,
            ),
            (
                "message 包含'蓝'（最终答案含颜色）",
                lambda resp: "蓝" in resp.message,
            ),
        ],
    },
    {
        "id": "NM-02a",
        "name": "当前会话记忆写入（companion session C）",
        "request": {
            "text": "我的名字叫小花，今天学会了画画！",
            "mode": "companion",
            "age_hint": "6",
            "session_id": "nm-same-01",
            "profile_id": "nm-test-child-2",
        },
        "checks": [
            (
                "memory.session_updated == True",
                lambda resp: resp.memory.session_updated is True,
            ),
            ("message 非空", lambda resp: bool(resp.message)),
        ],
    },
    {
        "id": "NM-02b",
        "name": "当前会话回忆（同 session，应从上下文直接回答）",
        "request": {
            "text": "你还记得我的名字吗？",
            "mode": "companion",
            "age_hint": "6",
            "session_id": "nm-same-01",
            "profile_id": "nm-test-child-2",
        },
        "checks": [
            (
                "react.selected_act == 'direct'（无需工具）",
                lambda resp: resp.react.selected_act == "direct",
            ),
            (
                "'tools' not in workflow_trace",
                lambda resp: "tools" not in resp.metadata.workflow_trace,
            ),
            ("message 包含'小花'", lambda resp: "小花" in resp.message),
        ],
    },
    {
        "id": "NM-03",
        "name": "Education 模式 profile 记忆自动注入（不触发工具）",
        "request": {
            "text": "蜗牛是什么动物？",
            "mode": "education",
            "age_hint": "6",
            "session_id": "nm-edu-new",
            "profile_id": "nm-test-child",
        },
        "checks": [
            (
                "'plan' in workflow_trace",
                lambda resp: "plan" in resp.metadata.workflow_trace,
            ),
            (
                "'chatbot' not in workflow_trace（不走 ReAct）",
                lambda resp: "chatbot" not in resp.metadata.workflow_trace,
            ),
            ("message 非空", lambda resp: bool(resp.message)),
        ],
    },
    {
        "id": "NM-04a",
        "name": "Parent 跨会话记忆写入（session P1）",
        "request": {
            "text": "我的孩子叫小宝，今年4岁，最近在学认字。",
            "mode": "parent",
            "age_hint": "4",
            "session_id": "nm-parent-A",
            "profile_id": "nm-test-parent",
        },
        "checks": [
            (
                "memory.session_updated == True",
                lambda resp: resp.memory.session_updated is True,
            ),
            ("message 非空", lambda resp: bool(resp.message)),
        ],
    },
    {
        "id": "NM-04b",
        "name": "Parent 跨会话记忆召回（session P2，同 profile_id）",
        "request": {
            "text": "你还记得我孩子叫什么名字吗？",
            "mode": "parent",
            "age_hint": "4",
            "session_id": "nm-parent-B",
            "profile_id": "nm-test-parent",
        },
        "checks": [
            (
                "'tools' in workflow_trace（read_memory 被调用）",
                lambda resp: "tools" in resp.metadata.workflow_trace,
            ),
            (
                "chatbot 迭代 ≥ 2 次",
                lambda resp: resp.metadata.workflow_trace.count("chatbot") >= 2,
            ),
            (
                "message 包含'小宝'（最终答案含孩子名）",
                lambda resp: "小宝" in resp.message,
            ),
        ],
    },
    # ── Image understanding (IM) ─────────────────────────────────────────────
    {
        "id": "IM-01a",
        "name": "图片识别（'这是什么？' + image_url）",
        "request": {
            "text": "这是什么？",
            "image_url": "https://ts1.tc.mm.bing.net/th/id/OIP-C.Mykh0w4k5mpqd4xGDKBgPQHaE7?w=193&h=135&c=8&rs=1&qlt=90&o=6&dpr=1.3&pid=3.1&rm=2",
            "mode": "companion",
            "age_hint": "6",
            "session_id": "im-sess-01",
        },
        "checks": [
            ("message 非空", lambda resp: bool(resp.message)),
            ("message 包含'长颈鹿'", lambda resp: "长颈鹿" in resp.message),
        ],
    },
    {
        "id": "IM-01b",
        "name": "图片追问（无图，同 session，从历史恢复图片上下文）",
        "request": {
            "text": "那么图中具体有几只长颈鹿呢？站位情况怎么样？",
            "mode": "companion",
            "age_hint": "6",
            "session_id": "im-sess-01",
        },
        "checks": [
            ("message 非空", lambda resp: bool(resp.message)),
            (
                "message 包含数量描述（数字或'只'）",
                # Satisfied by the measure word or by any Chinese/ASCII digit.
                lambda resp: "只" in resp.message or any(
                    ch in resp.message for ch in "一二三四五六七八九十0123456789"
                ),
            ),
        ],
    },
    # ── Skill: generate_parent_summary (SK) ──────────────────────────────────
    {
        "id": "SK-01a",
        "name": "Skill 前置：写入孩子学习数据（education mode，default_child）",
        "request": {
            "text": "今天学了加法，1+1=2，1+2=3，老师还教我们数数到20！",
            "mode": "education",
            "age_hint": "6",
            "session_id": "sk-edu-01",
        },
        "checks": [
            (
                "memory.session_updated == True",
                lambda resp: resp.memory.session_updated is True,
            ),
            ("message 非空", lambda resp: bool(resp.message)),
        ],
    },
    {
        "id": "SK-01b",
        "name": "Skill 触发：家长查询孩子学习情况（generate_parent_summary）",
        "request": {
            "text": "帮我看看孩子最近的学习情况，有什么进展吗？",
            "mode": "parent",
            "age_hint": "6",
            "session_id": "sk-parent-01",
        },
        "checks": [
            (
                "react.selected_act == 'skill'",
                lambda resp: resp.react.selected_act == "skill",
            ),
            (
                "'tools' in workflow_trace（skill 被调用）",
                lambda resp: "tools" in resp.metadata.workflow_trace,
            ),
            ("message 非空", lambda resp: bool(resp.message)),
        ],
    },
]
def fmt(b: bool) -> str:
    """Return the mark shown next to a check: "✓" when *b* is truthy, else "✗"."""
    if b:
        return "✓"
    return "✗"
def _evaluate_checks(checks, resp) -> list[tuple]:
    """Apply each ``(label, predicate)`` pair to *resp* and collect the outcomes.

    A predicate that raises is recorded as failed and the exception text is
    appended to its label, so one broken check never hides the others.
    """
    outcomes = []
    for label, predicate in checks:
        try:
            ok = predicate(resp)
        except Exception as exc:  # a raising check counts as a failed check
            ok = False
            label = f"{label} [异常：{exc}]"
        outcomes.append((label, ok))
    return outcomes


async def _run_single_test(service, case: dict) -> dict:
    """Send one case's request through *service* and build its result record.

    Returns a dict with the keys ``id``, ``name``, ``status`` ("PASS", "FAIL"
    or "ERROR"), ``elapsed_s``, ``checks``, ``selected_act``, ``used_rag``,
    ``session_updated``, ``written_types``, ``workflow_trace``,
    ``message_snippet`` and ``error``.
    """
    tid, name = case["id"], case["name"]
    # Started before validation so the ERROR path always has a reference point.
    started = time.monotonic()
    try:
        req = ChatRequest.model_validate(case["request"])
        # Restart the clock so a successful run times only the service call.
        started = time.monotonic()
        resp = await service.explain_and_ask(req)
        elapsed = round(time.monotonic() - started, 2)
        check_results = _evaluate_checks(case["checks"], resp)
        status = "PASS" if all(ok for _, ok in check_results) else "FAIL"
        return {
            "id": tid,
            "name": name,
            "status": status,
            "elapsed_s": elapsed,
            "checks": check_results,
            "selected_act": resp.react.selected_act,
            "used_rag": resp.grounding.used_rag,
            "session_updated": resp.memory.session_updated,
            "written_types": resp.memory.written_types,
            "workflow_trace": resp.metadata.workflow_trace,
            "message_snippet": resp.message[:100].replace("\n", " "),
            "error": None,
        }
    except Exception:
        # Top-level boundary for a single case: any failure to validate, call
        # the service, or read the response is reported as ERROR so the
        # remaining cases still run.
        return {
            "id": tid,
            "name": name,
            "status": "ERROR",
            # Report the time actually spent before the failure instead of 0,
            # so slow failures (e.g. a call that raised after a long wait)
            # remain visible in the report.
            "elapsed_s": round(time.monotonic() - started, 2),
            "checks": [(label, False) for label, _ in case["checks"]],
            "selected_act": None,
            "used_rag": None,
            "session_updated": None,
            "written_types": [],
            "workflow_trace": [],
            "message_snippet": "",
            # Keep only the tail of the traceback to bound the report size.
            "error": traceback.format_exc()[-600:],
        }


def _print_result(result: dict) -> None:
    """Print the status line, key response fields and per-check marks of one case."""
    icon = {"PASS": "✅", "FAIL": "❌", "ERROR": "💥"}[result["status"]]
    print(f" {icon} {result['status']} {result['elapsed_s']}s", flush=True)
    print(f" act={result['selected_act']} rag={result['used_rag']} session_updated={result['session_updated']}", flush=True)
    print(f" written={result['written_types']}", flush=True)
    print(f" trace={result['workflow_trace']}", flush=True)
    print(f" msg: {result['message_snippet']}", flush=True)
    for label, ok in result["checks"]:
        print(f" {fmt(ok)} {label}", flush=True)
    if result["error"]:
        print(f" ERROR:\n{result['error']}", flush=True)
    print(flush=True)


async def run_tests() -> list[dict]:
    """Run every case in ``TESTS`` in order and print a report.

    A single chat service is created up front and shared by all cases. Cases
    are awaited one at a time, in table order. Each case is reported as soon
    as it finishes, followed by a summary of PASS / FAIL / ERROR counts.

    Returns:
        One result dict per case, in execution order (see
        ``_run_single_test`` for the keys).
    """
    print("初始化 ChatService...", flush=True)
    service = create_chat_service()
    print("完成，开始测试\n", flush=True)

    results = []
    for case in TESTS:
        print(f"[{case['id']}] {case['name']}", flush=True)
        result = await _run_single_test(service, case)
        results.append(result)
        _print_result(result)

    statuses = [r["status"] for r in results]
    total = len(results)
    pass_count = statuses.count("PASS")
    fail_count = statuses.count("FAIL")
    error_count = statuses.count("ERROR")
    print(f"{'='*50}", flush=True)
    print(f"总计: {total} ✅PASS: {pass_count} ❌FAIL: {fail_count} 💥ERROR: {error_count}", flush=True)
    return results
if __name__ == "__main__":
    # Propagate the outcome through the process exit status: 0 only when
    # every case passed, 1 if any case ended as FAIL or ERROR. Previously the
    # script always exited 0, so shells and CI could not detect a failed run.
    _results = asyncio.run(run_tests())
    sys.exit(0 if all(r["status"] == "PASS" for r in _results) else 1)