eliza/packages/benchmarks/orchestrator_lifecycle/evaluator.py at develop · elizaOS/eliza · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""Rule-based evaluator for orchestrator lifecycle scenarios."""

from __future__ import annotations

from .types import LifecycleMetrics, Scenario, ScenarioResult


# Keyword table used by LifecycleEvaluator._has_behavior.
#
# Each key is a behavior name as it appears in a scenario turn's
# expected/forbidden behavior lists; each value lists phrases whose presence
# (as a plain substring of the lowercased, concatenated assistant messages)
# counts as evidence of that behavior.  All phrases are written in lowercase
# because the evaluator lowercases the assistant text before matching.
# A behavior with no entry here falls back to matching its own name with
# underscores replaced by spaces.
#
# NOTE(review): matching is substring-based, so short generic phrases such as
# "status", "running", "worker" or "results" also match inside longer words or
# unrelated sentences — keep that in mind when adding entries.
BEHAVIOR_KEYWORDS: dict[str, list[str]] = {
    # Assistant asks for missing details instead of acting.
    "ask_clarifying_question_before_start": [
        "clarify",
        "could you remind me",
        "could you specify",
        "need more detail",
        "clarifying question",
        "what exactly",
        "which task",
        "can you tell me more",
        "more information",
    ],
    # Assistant states it is holding off until required info arrives.
    "do_not_start_without_required_info": [
        "will wait",
        "before starting",
        "before proceeding",
        "before i proceed",
        "before acting",
        "need details first",
        "won't start",
        "wait for",
        "hold off",
        "before i begin",
        "what outcomes",
        "what you'd like me to handle",
        # Same phrase as above, with a typographic (U+2019) apostrophe.
        "what you’d like me to handle",
    ],
    # Assistant mentions delegating work to a subagent/worker.
    "spawn_subagent": [
        "subagent",
        "delegate",
        "delegating",
        "delegated",
        "worker",
        "spawn",
        "spawning",
        "hand off",
        "handing off",
    ],
    # Assistant reports on the state of delegated work.
    "report_active_subagent_status": [
        "status",
        "progress",
        "active subagent",
        "progress update",
        "running",
        "in progress",
        "subagent who will",
        "subagent has been spawned",
        "subagent delegated to",
        "delegated to gather",
        "subagent delegate",
        "subagent to research",
        "will keep you updated",
        "keep you updated",
    ],
    # Assistant acknowledges that the requested scope changed.
    "ack_scope_change": [
        "scope change",
        "updated scope",
        "changed request",
        "scope update",
        "new scope",
        "scope acknowledged",
    ],
    # Assistant says the plan/task was revised to reflect the new scope.
    "apply_scope_change_to_task": [
        "updated plan",
        "plan updated",
        "plan is now updated",
        "updated accordingly",
        "updating the plan accordingly",
        "re-planned",
        "replanned",
        "new task plan",
        "apply scope change",
        "applied scope change",
        "applying scope change",
        "scope has been applied",
        "revised plan",
        "new plan",
    ],
    # Assistant pauses the task.
    "pause_task": [
        "paused",
        "pause task",
        "pausing",
        "on hold",
        "putting on hold",
        "hold the task",
    ],
    # Assistant resumes a previously paused task.
    "resume_task": [
        "resumed",
        "resume task",
        "resuming",
        "continuing",
        "continue the task",
        "picking up",
        "back to work",
    ],
    # Assistant cancels the task (both British and American spellings).
    "cancel_task": [
        "cancelled",
        "canceled",
        "cancel task",
        "cancelling",
        "canceling",
        "stopped",
        "stopping the task",
        "i'll cancel",
        "i will cancel",
    ],
    # Assistant confirms that nothing further will run after cancellation.
    "confirm_cancel_effect": [
        "no further execution",
        "cancel confirmed",
        "won't continue",
        "will not continue",
        "no more work",
        "execution stopped",
        "halt execution",
        "halted",
    ],
    # Assistant delivers a closing summary of the work.
    "final_summary_to_stakeholder": [
        "summary",
        "completed",
        "deliverable",
        "final summary",
        "stakeholder",
        "wrapping up",
        "results",
    ],
}


class LifecycleEvaluator:
    """Score assistant transcripts against scenario expectations via keywords.

    Evaluation is purely rule-based: all assistant messages of a scenario are
    concatenated, normalized, and searched for the phrases listed in
    ``BEHAVIOR_KEYWORDS``.  Checks are therefore scenario-wide, not per-turn:
    a behavior expected on one turn is satisfied by a matching phrase in any
    assistant message.
    """

    # Minimum fraction of passed checks for a scenario to count as passed.
    _PASS_THRESHOLD = 0.75

    # Scenario-id tags whose per-tag rates make up the interruption metric.
    _INTERRUPTION_TAGS = ("pause", "resume", "cancel", "interrupt")

    # Model output frequently contains typographic apostrophes, while most
    # keyword phrases use the ASCII one; fold both to ASCII before matching.
    _APOSTROPHE_FOLD = str.maketrans({"\u2018": "'", "\u2019": "'"})

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Return *text* lowercased with typographic apostrophes folded to ``'``."""
        return text.lower().translate(LifecycleEvaluator._APOSTROPHE_FOLD)

    @staticmethod
    def _tag_rate(results: list[ScenarioResult], tag: str) -> float | None:
        """Return the mean score of results whose ``scenario_id`` contains *tag*.

        Returns ``None`` (rather than ``0.0``) when no result matches, so that
        callers can tell "no such scenarios" apart from "all scored zero".
        """
        scores = [r.score for r in results if tag in r.scenario_id]
        if not scores:
            return None
        return sum(scores) / len(scores)

    def evaluate_scenario(
        self,
        scenario: Scenario,
        assistant_messages: list[str],
    ) -> ScenarioResult:
        """Check every expected/forbidden behavior of *scenario*.

        Args:
            scenario: Scenario whose turns list ``expected_behaviors`` and
                ``forbidden_behaviors``.
            assistant_messages: Assistant replies produced for the scenario;
                they are joined with newlines and matched as one text.

        Returns:
            A ``ScenarioResult`` whose ``score`` is the fraction of checks
            passed (``1.0`` when the scenario defines no checks).  The
            scenario passes when the score reaches the threshold and no
            forbidden behavior was detected.  ``violations`` holds
            ``"missing:<behavior>"`` / ``"forbidden:<behavior>"`` entries.
        """
        checks_total = 0
        checks_passed = 0
        violations: list[str] = []
        notes: list[str] = []

        # Normalize once up front; every check below reuses the same text.
        combined = self._normalize_text("\n".join(assistant_messages))
        for turn in scenario.turns:
            for behavior in turn.expected_behaviors:
                checks_total += 1
                if self._has_behavior(combined, behavior):
                    checks_passed += 1
                else:
                    violations.append(f"missing:{behavior}")
            for behavior in turn.forbidden_behaviors:
                checks_total += 1
                if self._has_behavior(combined, behavior):
                    violations.append(f"forbidden:{behavior}")
                else:
                    checks_passed += 1

        score = (checks_passed / checks_total) if checks_total > 0 else 1.0
        # A single forbidden behavior fails the scenario regardless of score.
        has_forbidden = any(v.startswith("forbidden") for v in violations)
        passed = score >= self._PASS_THRESHOLD and not has_forbidden
        if passed:
            notes.append("Scenario passed threshold checks.")
        else:
            notes.append("Scenario failed threshold checks.")
        return ScenarioResult(
            scenario_id=scenario.scenario_id,
            title=scenario.title,
            passed=passed,
            score=score,
            checks_passed=checks_passed,
            checks_total=checks_total,
            violations=violations,
            notes=notes,
        )

    def compute_metrics(self, results: list[ScenarioResult]) -> LifecycleMetrics:
        """Aggregate per-scenario results into suite-level metrics.

        Category rates are mean scores of the scenarios whose ``scenario_id``
        contains the category tag:

        * clarification / status: ``0.0`` when no scenario carries the tag.
        * interruption: mean of the per-tag rates for pause, resume, cancel
          and interrupt, taken only over tags that match at least one
          scenario (``0.0`` when none do), so an absent tag does not drag
          the rate down.
        * summary: falls back to the overall score only when no scenario is
          tagged ``summary``; a genuine zero score is reported as zero.

        Args:
            results: Results returned by ``evaluate_scenario``.

        Returns:
            A populated ``LifecycleMetrics``; all rates are ``0.0`` for an
            empty *results* list.
        """
        total = len(results)
        passed = sum(1 for r in results if r.passed)
        overall = (sum(r.score for r in results) / total) if total > 0 else 0.0

        clarification = self._tag_rate(results, "clarification")
        status = self._tag_rate(results, "status")

        present_rates: list[float] = []
        for tag in self._INTERRUPTION_TAGS:
            rate = self._tag_rate(results, tag)
            if rate is not None:
                present_rates.append(rate)
        interruption = (
            sum(present_rates) / len(present_rates) if present_rates else 0.0
        )

        summary = self._tag_rate(results, "summary")
        if summary is None:
            summary = overall

        return LifecycleMetrics(
            overall_score=overall,
            scenario_pass_rate=(passed / total) if total > 0 else 0.0,
            total_scenarios=total,
            passed_scenarios=passed,
            clarification_success_rate=(
                clarification if clarification is not None else 0.0
            ),
            status_accuracy_rate=status if status is not None else 0.0,
            interruption_handling_rate=interruption,
            completion_summary_quality=summary,
        )

    def _has_behavior(self, combined_text: str, behavior: str) -> bool:
        """Return whether *combined_text* shows evidence of *behavior*.

        The text is matched against the phrases registered for *behavior* in
        ``BEHAVIOR_KEYWORDS``.  A behavior with no registered phrases is
        matched by its own name with underscores replaced by spaces.
        """
        # Normalization is idempotent; repeating it here keeps direct callers
        # (that pass raw text) consistent with ``evaluate_scenario``.
        text = LifecycleEvaluator._normalize_text(combined_text)
        keywords = BEHAVIOR_KEYWORDS.get(behavior, [])
        if not keywords:
            return behavior.replace("_", " ") in text
        return any(
            LifecycleEvaluator._normalize_text(keyword) in text
            for keyword in keywords
        )