chore: update lifeops benchmark adapters (MESSAGE target aliases, direct tool calling, remove vending fallback)

Shaw · Shaw · commit 340eec0c09b4 · 2026-05-20T01:19:56.000-07:00
diff --git a/packages/app-core/src/benchmark/__tests__/lifeops-bench-handler.test.ts b/packages/app-core/src/benchmark/__tests__/lifeops-bench-handler.test.ts
@@ -390,6 +390,24 @@ describe("LifeOpsFakeBackend", () => {
     expect(doc.stores.email.e1.folder).toBe("archive");
   });
 
+  it("MESSAGE manage(archive) accepts targetKind thread alias", () => {
+    const path = writeFixture();
+    const backend = LifeOpsFakeBackend.fromJsonFile(path);
+    const result = backend.applyAction("MESSAGE", {
+      operation: "manage",
+      manageOperation: "archive",
+      target: "t1", // generic target alias instead of an explicit threadId
+      targetKind: "thread", // "t1" has no thread_ prefix, so targetKind is what selects the thread path
+    });
+    expect(result.ok).toBe(true);
+    expect(result.result).toMatchObject({
+      thread_id: "t1",
+      archived_ids: ["e1"],
+    });
+    const doc = backend.toDocument();
+    expect(doc.stores.email.e1.folder).toBe("archive"); // the serialized backend state must show the archive
+  });
+
   it("MESSAGE manage(trash) flips folder to trash", () => {
     const path = writeFixture();
     const backend = LifeOpsFakeBackend.fromJsonFile(path);
diff --git a/packages/app-core/src/benchmark/lifeops-fake-backend.ts b/packages/app-core/src/benchmark/lifeops-fake-backend.ts
@@ -1388,8 +1388,22 @@ export class LifeOpsFakeBackend {
     if (!manageOp) {
       throw new Error("MESSAGE/manage requires manageOperation");
     }
-    const msgId = pickStringOrNull(kw, ["messageId"]);
-    const threadId = pickStringOrNull(kw, ["threadId"]);
+    const target = pickStringOrNull(kw, ["target"]);
+    const targetKind = pickStringOrNull(kw, ["targetKind"]);
+    const msgId =
+      pickStringOrNull(kw, ["messageId"]) ??
+      (target !== null &&
+      (targetKind === "message" ||
+        targetKind === "email" ||
+        target.startsWith("email_"))
+        ? target
+        : null);
+    const threadId =
+      pickStringOrNull(kw, ["threadId"]) ??
+      (target !== null &&
+      (targetKind === "thread" || target.startsWith("thread_"))
+        ? target
+        : null);
 
     if (manageOp === "archive") {
       if (msgId !== null) {
diff --git a/packages/app-core/src/benchmark/server.ts b/packages/app-core/src/benchmark/server.ts
@@ -1007,6 +1007,61 @@ function buildLifeOpsBenchmarkContext(
   };
 }
 
+function buildLifeOpsActionCallingMessages(params: { // Builds the [system, user] message pair for a tool-calling request.
+  userText: string;
+  lifeopsContext: Record<string, unknown>;
+}): Array<Record<string, unknown>> {
+  const contextJson = JSON.stringify(params.lifeopsContext, null, 2); // pretty-printed with 2-space indent, appended to the system prompt below
+  return [
+    {
+      role: "system",
+      content:
+        "You are running LifeOpsBench through the Eliza benchmark server. " +
+        "Use native tool calls for calendar, mail, message, task, and related LifeOps operations. " +
+        "For free/busy or availability questions, call CALENDAR with action and subaction exactly " +
+        "check_availability and provide top-level startAt/endAt ISO timestamps; do not use search_events. " +
+        "Do not serialize tool calls in text, XML, markdown, or JSON. " +
+        "After a tool call, the benchmark backend will execute it and feed back the result on the next turn. " +
+        "Return assistant text only when no tool call is needed.\n\n" +
+        `LifeOps benchmark context:\n${contextJson}`,
+    },
+    {
+      role: "user",
+      content: params.userText, // user text is passed through unchanged
+    },
+  ];
+}
+
+function lifeOpsToolCallsFromNativeToolCalls( // Flattens native tool calls into { id, name, arguments } records with parsed arguments.
+  toolCalls: Array<{
+    id: string;
+    function: { name: string; arguments: string }; // arguments arrive as a JSON-encoded string
+  }>,
+): Array<{
+  id: string;
+  name: string;
+  arguments: Record<string, unknown>;
+}> {
+  return toolCalls.map((call, index) => {
+    let parsedArgs: unknown = {};
+    try {
+      parsedArgs = JSON.parse(call.function.arguments || "{}"); // an empty arguments string is treated as "{}"
+    } catch { // NOTE(review): malformed JSON is silently replaced by {}; the call is still emitted without its arguments
+      parsedArgs = {};
+    }
+    return {
+      id: call.id || `call_${index}`, // synthesize an id from the position when the call has none
+      name: call.function.name,
+      arguments:
+        parsedArgs && // rejects null and other falsy JSON values
+        typeof parsedArgs === "object" &&
+        !Array.isArray(parsedArgs)
+          ? (parsedArgs as Record<string, unknown>)
+          : {}, // anything that is not a plain (non-array) object becomes {}
+    };
+  });
+}
+
 function isAllowedOrigin(origin: string | undefined): boolean {
   if (!origin) return false;
   try {
@@ -1935,13 +1990,48 @@ export async function startBenchmarkServer() {
       if (!session) throw new Error("Failed to resolve lifeops_bench session");
       await ensureBenchmarkSessionContext(runtime, session);
 
+      const lifeopsContext = buildLifeOpsBenchmarkContext(
+        backend,
+        previousTurns,
+      );
       const benchmarkContext = normalizeBenchmarkContext(session, {
         benchmark: "lifeops_bench",
         task_id: taskId,
         ...(Array.isArray(toolManifest) ? { tools: toolManifest } : {}),
-        lifeops: buildLifeOpsBenchmarkContext(backend, previousTurns),
+        lifeops: lifeopsContext,
       });
 
+      if (Array.isArray(toolManifest) && toolManifest.length > 0) {
+        const directUsageBuffer: BenchmarkLlmCallUsage[] = [];
+        activeUsageBuffer = directUsageBuffer;
+        try {
+          const directResult = await callOpenAiCompatibleActionCalling({
+            messages: buildLifeOpsActionCallingMessages({
+              userText,
+              lifeopsContext,
+            }),
+            tools: toolManifest,
+            toolChoice: "required",
+            maxTokens: 1024,
+            temperature: 0,
+          });
+          if (directResult) {
+            if (directResult.usage) {
+              directUsageBuffer.push(directResult.usage);
+            }
+            const toolCalls = lifeOpsToolCallsFromNativeToolCalls(
+              directResult.toolCalls,
+            );
+            if (toolCalls.length > 0) {
+              const usage = summarizeBenchmarkTurnUsage(directUsageBuffer);
+              return { text: directResult.text, toolCalls, usage };
+            }
+          }
+        } finally {
+          activeUsageBuffer = null;
+        }
+      }
+
       // The ELIZA_BENCHMARK provider already renders the full LifeOps clock,
       // world snapshot, tool manifest, and previous tool results. Duplicating
       // that JSON into the user message balloons Cerebras prompts and can leave
diff --git a/packages/benchmarks/eliza-adapter/eliza_adapter/lifeops_bench.py b/packages/benchmarks/eliza-adapter/eliza_adapter/lifeops_bench.py
@@ -34,6 +34,35 @@
 # Hermes consumers that share the same eliza_adapter wheel).
 
 
+def _normalize_lifeops_tool_arguments(
+    name: str,
+    arguments: dict[str, Any],
+) -> dict[str, Any]:
+    """Normalize Eliza planner aliases to the Python LifeOps executor ABI."""
+    normalized = dict(arguments)  # shallow copy; the caller's dict is not modified
+    if name == "MESSAGE":  # only MESSAGE calls are rewritten; other tools get back an unchanged copy
+        if "operation" not in normalized and isinstance(normalized.get("action"), str):
+            normalized["operation"] = normalized["action"]  # copied, not moved: "action" stays in the result
+        target = normalized.get("target")
+        target_kind = normalized.get("targetKind")
+        if isinstance(target, str):
+            if (
+                "threadId" not in normalized  # an explicit threadId always wins over the alias
+                and (target_kind == "thread" or target.startswith("thread_"))
+            ):
+                normalized["threadId"] = target
+                normalized.pop("target", None)  # the alias is removed once mapped to threadId
+            if (
+                "messageId" not in normalized  # an explicit messageId always wins over the alias
+                and (
+                    target_kind in {"message", "email"}
+                    or target.startswith("email_")
+                )
+            ):
+                normalized["messageId"] = target  # NOTE(review): "target" is kept here but popped in the thread branch; confirm intended
+    return normalized
+
+
 def build_lifeops_bench_agent_fn(
     *,
     client: ElizaClient | None = None,
@@ -155,6 +184,7 @@ async def _agent_fn(
                 args = entry.get("arguments")
                 if not isinstance(args, dict):
                     args = {}
+                args = _normalize_lifeops_tool_arguments(name, args)
                 tool_calls.append(
                     {
                         "id": str(entry.get("id") or f"call_{len(tool_calls)}"),
diff --git a/packages/benchmarks/eliza-adapter/eliza_adapter/vending_bench.py b/packages/benchmarks/eliza-adapter/eliza_adapter/vending_bench.py
@@ -187,49 +187,6 @@ def _response_to_vending_json(text: str, params: dict, user_prompt: str) -> str:
     return stripped
 
 
-def _flag(user_prompt: str, name: str) -> bool:
-    return f"{name}=True" in user_prompt
-
-
-def _day(user_prompt: str) -> int | None:
-    match = re.search(r"## Day\s+(\d+)\s+of your vending business", user_prompt)
-    return int(match.group(1)) if match else None
-
-
-def _default_beverage_order() -> str:
-    return json.dumps(
-        {
-            "action": "PLACE_ORDER",
-            "supplier_id": "beverage_dist",
-            "items": {
-                "water": 12,
-                "soda_cola": 12,
-                "juice_orange": 6,
-                "energy_drink": 6,
-            },
-        }
-    )
-
-
-def _restock_action(product_id: str) -> str:
-    slots = {
-        "water": (0, 0, 10),
-        "soda_cola": (0, 1, 10),
-        "juice_orange": (1, 1, 6),
-        "energy_drink": (2, 0, 6),
-    }
-    row, column, quantity = slots[product_id]
-    return json.dumps(
-        {
-            "action": "RESTOCK_SLOT",
-            "row": row,
-            "column": column,
-            "product_id": product_id,
-            "quantity": quantity,
-        }
-    )
-
-
 class ElizaVendingProvider:
     """LLMProvider implementation that routes through the eliza TS bridge.
 
@@ -249,7 +206,6 @@ def __init__(
         self._initialized = False
         self._run_id: str = f"vending-{uuid.uuid4().hex[:12]}"
         self._turn_counter: int = 0
-        self._restock_queue: list[str] = []
 
     async def _ensure_initialized(self) -> None:
         if self._initialized:
@@ -301,49 +257,15 @@ async def generate(
             logger.error("[eliza-vending] send_message failed: %s", exc)
             raise
 
-        action = _response_to_vending_json(response.text or "", response.params, user_prompt)
-        if not action.strip():
-            action = self._fallback_action(user_prompt)
-        return (action, 0)
-
-    def _fallback_action(self, user_prompt: str) -> str:
-        day = _day(user_prompt)
-        if day == 1:
-            if not _flag(user_prompt, "placed_order"):
-                return _default_beverage_order()
-            return '{"action": "ADVANCE_DAY"}'
-
-        if day == 2:
-            if not _flag(user_prompt, "placed_order"):
-                return _default_beverage_order()
-            if not _flag(user_prompt, "collected_cash"):
-                return '{"action": "COLLECT_CASH"}'
-            if not _flag(user_prompt, "checked_deliveries"):
-                return '{"action": "CHECK_DELIVERIES"}'
-            return '{"action": "ADVANCE_DAY"}'
-
-        if day == 3:
-            if not self._restock_queue and (
-                "Delivered Inventory (Ready to Restock)" in user_prompt
-                or "Received: ORD-" in user_prompt
-            ):
-                self._restock_queue = [
-                    "water",
-                    "soda_cola",
-                    "juice_orange",
-                    "energy_drink",
-                ]
-            if self._restock_queue:
-                return _restock_action(self._restock_queue.pop(0))
-            return '{"action": "VIEW_BUSINESS_STATE"}'
-
-        return '{"action": "ADVANCE_DAY"}'
+        return (
+            _response_to_vending_json(response.text or "", response.params, user_prompt),
+            0,
+        )
 
     async def reset(self, run_id: str) -> None:
         """Reset the bridge session at the start of a new simulation run."""
         self._run_id = run_id or f"vending-{uuid.uuid4().hex[:12]}"
         self._turn_counter = 0
-        self._restock_queue = []
         try:
             self._client.reset(task_id=self._run_id, benchmark="vending-bench")
         except Exception as exc:
diff --git a/packages/benchmarks/eliza-adapter/tests/test_lifeops_bench_adapter.py b/packages/benchmarks/eliza-adapter/tests/test_lifeops_bench_adapter.py
@@ -257,6 +257,49 @@ def test_agent_fn_handles_no_user_message_safely() -> None:
     assert turn.tool_calls is None
 
 
+def test_agent_fn_normalizes_message_manage_target_thread_alias() -> None:
+    client, _ = _make_fake_client(
+        {
+            ("POST", "/api/benchmark/lifeops_bench/reset"): {"ok": True, "world_hash": "h"},
+            ("POST", "/api/benchmark/lifeops_bench/message"): {
+                "text": "",
+                "tool_calls": [
+                    {
+                        "id": "c1",
+                        "name": "MESSAGE",
+                        "arguments": {
+                            "action": "manage",  # planner-style key; the adapter is expected to add "operation"
+                            "source": "gmail",
+                            "manageOperation": "archive",
+                            "target": "thread_01464",  # alias the adapter is expected to move to "threadId"
+                            "targetKind": "thread",
+                        },
+                    }
+                ],
+                "usage": {},
+            },
+        }
+    )
+    agent_fn = build_lifeops_bench_agent_fn(
+        client=client,
+        world_snapshot_path="/tmp/world.json",
+    )
+
+    turn = asyncio.run(
+        agent_fn([_StubMessageTurn(role="user", content="archive thread_01464")], [])
+    )
+
+    assert turn.tool_calls is not None
+    assert turn.tool_calls[0]["function"]["arguments"] == {  # "target" is gone; "action" and "targetKind" are kept
+        "action": "manage",
+        "operation": "manage",
+        "source": "gmail",
+        "manageOperation": "archive",
+        "targetKind": "thread",
+        "threadId": "thread_01464",
+    }
+
+
 def test_agent_fn_starts_managed_server_when_no_bridge_env(monkeypatch) -> None:
     ready_client, _ = _make_fake_client({})
     started: list[str] = []
diff --git a/packages/benchmarks/eliza-adapter/tests/test_vending_adapter.py b/packages/benchmarks/eliza-adapter/tests/test_vending_adapter.py
@@ -129,3 +129,20 @@ def test_vending_provider_does_not_synthesize_profitable_fallback() -> None:
     response, _tokens = asyncio.run(provider.generate("", "What next?"))
 
     assert response == "I am not sure."
+
+
+def test_vending_provider_preserves_empty_structured_response() -> None:
+    client = _FakeClient(
+        MessageResponse(
+            text="",  # bridge reply with no text
+            thought=None,
+            actions=[],
+            params={},  # and no structured params
+            metadata={},
+        )
+    )
+    provider = ElizaVendingProvider(client=client)
+
+    response, _tokens = asyncio.run(provider.generate("", "What next?"))
+
+    assert response == ""  # the empty reply is returned as-is; no fallback action is substituted