Skip to content

Commit 340eec0

Browse files
author
Shaw
committed
chore: commit lifeops benchmark adapter updates
1 parent 7725c37 commit 340eec0

7 files changed

Lines changed: 219 additions & 85 deletions

File tree

packages/app-core/src/benchmark/__tests__/lifeops-bench-handler.test.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,24 @@ describe("LifeOpsFakeBackend", () => {
390390
expect(doc.stores.email.e1.folder).toBe("archive");
391391
});
392392

393+
// Regression test: MESSAGE/manage(archive) is called with the generic
// `target` + `targetKind: "thread"` pair instead of an explicit `threadId`.
// The action must succeed, report thread "t1" with email "e1" archived, and
// the backend document must show e1 in the "archive" folder.
it("MESSAGE manage(archive) accepts targetKind thread alias", () => {
394+
const path = writeFixture();
395+
const backend = LifeOpsFakeBackend.fromJsonFile(path);
396+
const result = backend.applyAction("MESSAGE", {
397+
operation: "manage",
398+
manageOperation: "archive",
399+
target: "t1",
400+
targetKind: "thread",
401+
});
402+
expect(result.ok).toBe(true);
403+
expect(result.result).toMatchObject({
404+
thread_id: "t1",
405+
archived_ids: ["e1"],
406+
});
407+
// Re-read the backend state to confirm the archive was actually applied.
const doc = backend.toDocument();
408+
expect(doc.stores.email.e1.folder).toBe("archive");
409+
});
410+
393411
it("MESSAGE manage(trash) flips folder to trash", () => {
394412
const path = writeFixture();
395413
const backend = LifeOpsFakeBackend.fromJsonFile(path);

packages/app-core/src/benchmark/lifeops-fake-backend.ts

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1388,8 +1388,22 @@ export class LifeOpsFakeBackend {
13881388
if (!manageOp) {
13891389
throw new Error("MESSAGE/manage requires manageOperation");
13901390
}
1391-
const msgId = pickStringOrNull(kw, ["messageId"]);
1392-
const threadId = pickStringOrNull(kw, ["threadId"]);
1391+
const target = pickStringOrNull(kw, ["target"]);
1392+
const targetKind = pickStringOrNull(kw, ["targetKind"]);
1393+
const msgId =
1394+
pickStringOrNull(kw, ["messageId"]) ??
1395+
(target !== null &&
1396+
(targetKind === "message" ||
1397+
targetKind === "email" ||
1398+
target.startsWith("email_"))
1399+
? target
1400+
: null);
1401+
const threadId =
1402+
pickStringOrNull(kw, ["threadId"]) ??
1403+
(target !== null &&
1404+
(targetKind === "thread" || target.startsWith("thread_"))
1405+
? target
1406+
: null);
13931407

13941408
if (manageOp === "archive") {
13951409
if (msgId !== null) {

packages/app-core/src/benchmark/server.ts

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1007,6 +1007,61 @@ function buildLifeOpsBenchmarkContext(
10071007
};
10081008
}
10091009

1010+
/**
 * Builds the two-message chat prompt used for the LifeOps action-calling
 * request.
 *
 * The first message has role "system" and contains fixed instructions
 * (use native tool calls, how to ask for calendar availability, do not
 * serialize tool calls in text) followed by the LifeOps context rendered as
 * JSON. The second message has role "user" and carries the user text as-is.
 *
 * @param params.userText - Text placed verbatim in the user message.
 * @param params.lifeopsContext - Object serialized into the system message.
 * @returns An array of exactly two message objects: system, then user.
 */
function buildLifeOpsActionCallingMessages(params: {
1011+
userText: string;
1012+
lifeopsContext: Record<string, unknown>;
1013+
}): Array<Record<string, unknown>> {
1014+
// Pretty-printed with 2-space indentation before being embedded in the prompt.
const contextJson = JSON.stringify(params.lifeopsContext, null, 2);
1015+
return [
1016+
{
1017+
role: "system",
1018+
content:
1019+
"You are running LifeOpsBench through the Eliza benchmark server. " +
1020+
"Use native tool calls for calendar, mail, message, task, and related LifeOps operations. " +
1021+
"For free/busy or availability questions, call CALENDAR with action and subaction exactly " +
1022+
"check_availability and provide top-level startAt/endAt ISO timestamps; do not use search_events. " +
1023+
"Do not serialize tool calls in text, XML, markdown, or JSON. " +
1024+
"After a tool call, the benchmark backend will execute it and feed back the result on the next turn. " +
1025+
"Return assistant text only when no tool call is needed.\n\n" +
1026+
`LifeOps benchmark context:\n${contextJson}`,
1027+
},
1028+
{
1029+
role: "user",
1030+
content: params.userText,
1031+
},
1032+
];
1033+
}
1034+
1035+
/**
 * Converts native tool calls (whose arguments are a JSON string) into
 * LifeOps tool-call records whose arguments are a plain object.
 *
 * This function never throws on bad argument payloads: arguments that are
 * empty, fail to parse, or parse to anything other than a non-array object
 * are replaced with an empty object.
 *
 * @param toolCalls - Native tool calls with `id` and `function.{name,arguments}`.
 * @returns One `{ id, name, arguments }` record per input call, in input order.
 */
function lifeOpsToolCallsFromNativeToolCalls(
1036+
toolCalls: Array<{
1037+
id: string;
1038+
function: { name: string; arguments: string };
1039+
}>,
1040+
): Array<{
1041+
id: string;
1042+
name: string;
1043+
arguments: Record<string, unknown>;
1044+
}> {
1045+
return toolCalls.map((call, index) => {
1046+
let parsedArgs: unknown = {};
1047+
try {
1048+
// An empty arguments string is treated as an empty JSON object.
parsedArgs = JSON.parse(call.function.arguments || "{}");
1049+
} catch {
1050+
// Malformed JSON is deliberately tolerated; the call proceeds with no arguments.
parsedArgs = {};
1051+
}
1052+
return {
1053+
// A falsy id (e.g. an empty string) falls back to a positional id.
id: call.id || `call_${index}`,
1054+
name: call.function.name,
1055+
// Accept only non-null, non-array objects; null, arrays and primitives become {}.
arguments:
1056+
parsedArgs &&
1057+
typeof parsedArgs === "object" &&
1058+
!Array.isArray(parsedArgs)
1059+
? (parsedArgs as Record<string, unknown>)
1060+
: {},
1061+
};
1062+
});
1063+
}
1064+
10101065
function isAllowedOrigin(origin: string | undefined): boolean {
10111066
if (!origin) return false;
10121067
try {
@@ -1935,13 +1990,48 @@ export async function startBenchmarkServer() {
19351990
if (!session) throw new Error("Failed to resolve lifeops_bench session");
19361991
await ensureBenchmarkSessionContext(runtime, session);
19371992

1993+
const lifeopsContext = buildLifeOpsBenchmarkContext(
1994+
backend,
1995+
previousTurns,
1996+
);
19381997
const benchmarkContext = normalizeBenchmarkContext(session, {
19391998
benchmark: "lifeops_bench",
19401999
task_id: taskId,
19412000
...(Array.isArray(toolManifest) ? { tools: toolManifest } : {}),
1942-
lifeops: buildLifeOpsBenchmarkContext(backend, previousTurns),
2001+
lifeops: lifeopsContext,
19432002
});
19442003

2004+
if (Array.isArray(toolManifest) && toolManifest.length > 0) {
2005+
const directUsageBuffer: BenchmarkLlmCallUsage[] = [];
2006+
activeUsageBuffer = directUsageBuffer;
2007+
try {
2008+
const directResult = await callOpenAiCompatibleActionCalling({
2009+
messages: buildLifeOpsActionCallingMessages({
2010+
userText,
2011+
lifeopsContext,
2012+
}),
2013+
tools: toolManifest,
2014+
toolChoice: "required",
2015+
maxTokens: 1024,
2016+
temperature: 0,
2017+
});
2018+
if (directResult) {
2019+
if (directResult.usage) {
2020+
directUsageBuffer.push(directResult.usage);
2021+
}
2022+
const toolCalls = lifeOpsToolCallsFromNativeToolCalls(
2023+
directResult.toolCalls,
2024+
);
2025+
if (toolCalls.length > 0) {
2026+
const usage = summarizeBenchmarkTurnUsage(directUsageBuffer);
2027+
return { text: directResult.text, toolCalls, usage };
2028+
}
2029+
}
2030+
} finally {
2031+
activeUsageBuffer = null;
2032+
}
2033+
}
2034+
19452035
// The ELIZA_BENCHMARK provider already renders the full LifeOps clock,
19462036
// world snapshot, tool manifest, and previous tool results. Duplicating
19472037
// that JSON into the user message balloons Cerebras prompts and can leave

packages/benchmarks/eliza-adapter/eliza_adapter/lifeops_bench.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,35 @@
3434
# Hermes consumers that share the same eliza_adapter wheel).
3535

3636

37+
# Returns a shallow copy of `arguments`; the caller's dict is never mutated.
# Only calls named "MESSAGE" are rewritten; any other name gets the copy back
# unchanged. For MESSAGE:
#   * `action` is copied to `operation` when `operation` is absent and
#     `action` is a string (`action` itself is kept).
#   * a string `target` becomes `threadId` when targetKind is "thread" or the
#     value starts with "thread_", and `target` is then removed.
#   * a string `target` becomes `messageId` when targetKind is "message" or
#     "email" or the value starts with "email_".
# Explicit `threadId` / `messageId` keys already present are never overwritten.
def _normalize_lifeops_tool_arguments(
38+
name: str,
39+
arguments: dict[str, Any],
40+
) -> dict[str, Any]:
41+
"""Normalize Eliza planner aliases to the Python LifeOps executor ABI."""
42+
normalized = dict(arguments)
43+
if name == "MESSAGE":
44+
if "operation" not in normalized and isinstance(normalized.get("action"), str):
45+
normalized["operation"] = normalized["action"]
46+
# Captured before any pop below, so both alias checks see the original value.
target = normalized.get("target")
47+
target_kind = normalized.get("targetKind")
48+
if isinstance(target, str):
49+
if (
50+
"threadId" not in normalized
51+
and (target_kind == "thread" or target.startswith("thread_"))
52+
):
53+
normalized["threadId"] = target
54+
normalized.pop("target", None)
55+
# NOTE(review): unlike the thread branch above, this branch leaves `target`
# in the dict after setting `messageId` — confirm whether that asymmetry is
# intentional for the executor.
if (
56+
"messageId" not in normalized
57+
and (
58+
target_kind in {"message", "email"}
59+
or target.startswith("email_")
60+
)
61+
):
62+
normalized["messageId"] = target
63+
return normalized
64+
65+
3766
def build_lifeops_bench_agent_fn(
3867
*,
3968
client: ElizaClient | None = None,
@@ -155,6 +184,7 @@ async def _agent_fn(
155184
args = entry.get("arguments")
156185
if not isinstance(args, dict):
157186
args = {}
187+
args = _normalize_lifeops_tool_arguments(name, args)
158188
tool_calls.append(
159189
{
160190
"id": str(entry.get("id") or f"call_{len(tool_calls)}"),

packages/benchmarks/eliza-adapter/eliza_adapter/vending_bench.py

Lines changed: 4 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -187,49 +187,6 @@ def _response_to_vending_json(text: str, params: dict, user_prompt: str) -> str:
187187
return stripped
188188

189189

190-
def _flag(user_prompt: str, name: str) -> bool:
191-
return f"{name}=True" in user_prompt
192-
193-
194-
def _day(user_prompt: str) -> int | None:
195-
match = re.search(r"## Day\s+(\d+)\s+of your vending business", user_prompt)
196-
return int(match.group(1)) if match else None
197-
198-
199-
def _default_beverage_order() -> str:
200-
return json.dumps(
201-
{
202-
"action": "PLACE_ORDER",
203-
"supplier_id": "beverage_dist",
204-
"items": {
205-
"water": 12,
206-
"soda_cola": 12,
207-
"juice_orange": 6,
208-
"energy_drink": 6,
209-
},
210-
}
211-
)
212-
213-
214-
def _restock_action(product_id: str) -> str:
215-
slots = {
216-
"water": (0, 0, 10),
217-
"soda_cola": (0, 1, 10),
218-
"juice_orange": (1, 1, 6),
219-
"energy_drink": (2, 0, 6),
220-
}
221-
row, column, quantity = slots[product_id]
222-
return json.dumps(
223-
{
224-
"action": "RESTOCK_SLOT",
225-
"row": row,
226-
"column": column,
227-
"product_id": product_id,
228-
"quantity": quantity,
229-
}
230-
)
231-
232-
233190
class ElizaVendingProvider:
234191
"""LLMProvider implementation that routes through the eliza TS bridge.
235192
@@ -249,7 +206,6 @@ def __init__(
249206
self._initialized = False
250207
self._run_id: str = f"vending-{uuid.uuid4().hex[:12]}"
251208
self._turn_counter: int = 0
252-
self._restock_queue: list[str] = []
253209

254210
async def _ensure_initialized(self) -> None:
255211
if self._initialized:
@@ -301,49 +257,15 @@ async def generate(
301257
logger.error("[eliza-vending] send_message failed: %s", exc)
302258
raise
303259

304-
action = _response_to_vending_json(response.text or "", response.params, user_prompt)
305-
if not action.strip():
306-
action = self._fallback_action(user_prompt)
307-
return (action, 0)
308-
309-
def _fallback_action(self, user_prompt: str) -> str:
310-
day = _day(user_prompt)
311-
if day == 1:
312-
if not _flag(user_prompt, "placed_order"):
313-
return _default_beverage_order()
314-
return '{"action": "ADVANCE_DAY"}'
315-
316-
if day == 2:
317-
if not _flag(user_prompt, "placed_order"):
318-
return _default_beverage_order()
319-
if not _flag(user_prompt, "collected_cash"):
320-
return '{"action": "COLLECT_CASH"}'
321-
if not _flag(user_prompt, "checked_deliveries"):
322-
return '{"action": "CHECK_DELIVERIES"}'
323-
return '{"action": "ADVANCE_DAY"}'
324-
325-
if day == 3:
326-
if not self._restock_queue and (
327-
"Delivered Inventory (Ready to Restock)" in user_prompt
328-
or "Received: ORD-" in user_prompt
329-
):
330-
self._restock_queue = [
331-
"water",
332-
"soda_cola",
333-
"juice_orange",
334-
"energy_drink",
335-
]
336-
if self._restock_queue:
337-
return _restock_action(self._restock_queue.pop(0))
338-
return '{"action": "VIEW_BUSINESS_STATE"}'
339-
340-
return '{"action": "ADVANCE_DAY"}'
260+
return (
261+
_response_to_vending_json(response.text or "", response.params, user_prompt),
262+
0,
263+
)
341264

342265
async def reset(self, run_id: str) -> None:
343266
"""Reset the bridge session at the start of a new simulation run."""
344267
self._run_id = run_id or f"vending-{uuid.uuid4().hex[:12]}"
345268
self._turn_counter = 0
346-
self._restock_queue = []
347269
try:
348270
self._client.reset(task_id=self._run_id, benchmark="vending-bench")
349271
except Exception as exc:

packages/benchmarks/eliza-adapter/tests/test_lifeops_bench_adapter.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,49 @@ def test_agent_fn_handles_no_user_message_safely() -> None:
257257
assert turn.tool_calls is None
258258

259259

260+
# Checks that the agent function rewrites a MESSAGE tool call that uses the
# `action` / `target` / `targetKind: "thread"` aliases: the emitted arguments
# must gain `operation` and `threadId`, and must no longer contain `target`.
def test_agent_fn_normalizes_message_manage_target_thread_alias() -> None:
261+
client, _ = _make_fake_client(
262+
{
263+
("POST", "/api/benchmark/lifeops_bench/reset"): {"ok": True, "world_hash": "h"},
264+
# Canned bridge reply: one MESSAGE tool call expressed with the alias keys.
("POST", "/api/benchmark/lifeops_bench/message"): {
265+
"text": "",
266+
"tool_calls": [
267+
{
268+
"id": "c1",
269+
"name": "MESSAGE",
270+
"arguments": {
271+
"action": "manage",
272+
"source": "gmail",
273+
"manageOperation": "archive",
274+
"target": "thread_01464",
275+
"targetKind": "thread",
276+
},
277+
}
278+
],
279+
"usage": {},
280+
},
281+
}
282+
)
283+
agent_fn = build_lifeops_bench_agent_fn(
284+
client=client,
285+
world_snapshot_path="/tmp/world.json",
286+
)
287+
288+
turn = asyncio.run(
289+
agent_fn([_StubMessageTurn(role="user", content="archive thread_01464")], [])
290+
)
291+
292+
assert turn.tool_calls is not None
293+
# Expected shape: `operation` mirrors `action`, `threadId` replaces `target`,
# and the untouched keys (`action`, `source`, `manageOperation`,
# `targetKind`) pass through.
assert turn.tool_calls[0]["function"]["arguments"] == {
294+
"action": "manage",
295+
"operation": "manage",
296+
"source": "gmail",
297+
"manageOperation": "archive",
298+
"targetKind": "thread",
299+
"threadId": "thread_01464",
300+
}
301+
302+
260303
def test_agent_fn_starts_managed_server_when_no_bridge_env(monkeypatch) -> None:
261304
ready_client, _ = _make_fake_client({})
262305
started: list[str] = []

packages/benchmarks/eliza-adapter/tests/test_vending_adapter.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,20 @@ def test_vending_provider_does_not_synthesize_profitable_fallback() -> None:
129129
response, _tokens = asyncio.run(provider.generate("", "What next?"))
130130

131131
assert response == "I am not sure."
132+
133+
134+
# Checks that when the client reply has empty text and empty params,
# ElizaVendingProvider.generate() returns the empty string unchanged.
def test_vending_provider_preserves_empty_structured_response() -> None:
135+
# presumably _FakeClient returns this canned MessageResponse for every
# message sent — its definition is not visible here; verify against the helper.
client = _FakeClient(
136+
MessageResponse(
137+
text="",
138+
thought=None,
139+
actions=[],
140+
params={},
141+
metadata={},
142+
)
143+
)
144+
provider = ElizaVendingProvider(client=client)
145+
146+
response, _tokens = asyncio.run(provider.generate("", "What next?"))
147+
148+
assert response == ""

0 commit comments

Comments
 (0)