chore: commit post-push validation edits

Shaw · Shaw · commit dd2b63ceaf71 · 2026-05-20T00:41:32.000-07:00
diff --git a/packages/agent/src/__tests__/game-tui-mounted-surfaces.test.tsx b/packages/agent/src/__tests__/game-tui-mounted-surfaces.test.tsx
@@ -1,17 +1,32 @@
 // @vitest-environment jsdom
 
-import { readdirSync } from "node:fs";
+import { existsSync, readdirSync } from "node:fs";
 import { createRequire } from "node:module";
-import { join } from "node:path";
+import { dirname, join } from "node:path";
 import { fireEvent, screen } from "@testing-library/dom";
 import type ReactTypes from "react";
 import { afterEach, describe, expect, it, vi } from "vitest";
 
-const pluginRequire = createRequire(
-  `${process.cwd()}/plugins/plugin-clawville/src/ui/ClawvilleOperatorSurface.tsx`,
+/**
+ * Searches `start` and then each of its parent directories for `relativePath`.
+ *
+ * @param start - Directory the upward search begins from.
+ * @param relativePath - Path joined onto every directory that is tried.
+ * @returns The first joined path for which `existsSync` reports true.
+ * @throws Error when the search runs out of parent directories without a match.
+ */
+function findAncestor(start: string, relativePath: string) {
+  let current = start;
+  while (true) {
+    const candidate = join(current, relativePath);
+    if (existsSync(candidate)) return candidate;
+    const parent = dirname(current);
+    // `dirname` made no progress, so there is no further parent left to try.
+    if (parent === current) {
+      throw new Error(`Unable to locate ${relativePath}`);
+    }
+    current = parent;
+  }
+}
+
+const clawvilleSurfacePath = findAncestor(
+  process.cwd(),
+  "plugins/plugin-clawville/src/ui/ClawvilleOperatorSurface.tsx",
 );
+const pluginRequire = createRequire(clawvilleSurfacePath);
 const React = pluginRequire("react") as typeof ReactTypes;
-const bunModulesDir = join(process.cwd(), "node_modules", ".bun");
+const bunModulesDir = findAncestor(process.cwd(), "node_modules/.bun");
 const reactDomPackageDir = readdirSync(bunModulesDir).find((entry) =>
   entry.startsWith(`react-dom@${React.version}+`),
 );
diff --git a/packages/app-core/src/benchmark/server.ts b/packages/app-core/src/benchmark/server.ts
@@ -237,11 +237,18 @@ function isActionCallingBenchmarkName(benchmark: string): boolean {
   return (
     normalized === "action-calling" ||
     normalized === "action_calling" ||
+    normalized === "vending-bench" ||
+    normalized === "vending_bench" ||
     normalized === "tau_bench" ||
     normalized === "tau-bench"
   );
 }
 
+/**
+ * Reports whether `benchmark` names the vending benchmark.
+ *
+ * The name is trimmed and lower-cased before comparison, and both the
+ * hyphenated (`vending-bench`) and underscored (`vending_bench`) spellings match.
+ *
+ * @param benchmark - Benchmark name to test.
+ * @returns `true` for either accepted spelling, otherwise `false`.
+ */
+function isVendingBenchmarkName(benchmark: string): boolean {
+  const normalized = benchmark.trim().toLowerCase();
+  return normalized === "vending-bench" || normalized === "vending_bench";
+}
+
 function normalizeActionCallingNativeMessages(
   text: string,
   context: Record<string, unknown>,
@@ -2489,9 +2496,13 @@ export async function startBenchmarkServer() {
           ) {
             const nativeMessages = _isTauBenchmarkName(session.benchmark)
               ? _normalizeTauNativeMessages(text, benchmarkContext)
+              : isVendingBenchmarkName(session.benchmark)
+                ? normalizeLocaNativeMessages(benchmarkContext.messages)
               : normalizeActionCallingNativeMessages(text, benchmarkContext);
             const openAiMessages = _isTauBenchmarkName(session.benchmark)
               ? nativeMessages
+              : isVendingBenchmarkName(session.benchmark)
+                ? nativeMessages
               : normalizeActionCallingOpenAiMessages(text, benchmarkContext);
             const maxTokens =
               typeof benchmarkContext.max_tokens === "number"
diff --git a/packages/benchmarks/eliza-adapter/eliza_adapter/vending_bench.py b/packages/benchmarks/eliza-adapter/eliza_adapter/vending_bench.py
@@ -66,6 +66,37 @@
 - If the last result says an order was already placed today, do not place another order; ADVANCE_DAY.
 """
 
+# Function-tool definition named BENCHMARK_ACTION, describing a single
+# Vending-Bench action as a JSON-schema object.
+# - Only "action" is required; every other property is an optional argument.
+# - "additionalProperties": False rejects keys that are not listed here.
+# - The "action" enum is `_VENDING_ACTIONS` minus "VIEW_STATE", passed through
+#   sorted() so the enum is a list in a stable order.
+# NOTE(review): which optional properties belong to which action is not
+# expressed in this schema -- confirm against the action handlers.
+_VENDING_TOOL = {
+    "type": "function",
+    "function": {
+        "name": "BENCHMARK_ACTION",
+        "description": "Return exactly one Vending-Bench action for this turn.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {"type": "string", "enum": sorted(_VENDING_ACTIONS - {"VIEW_STATE"})},
+                "supplier_id": {"type": "string"},
+                "items": {"type": "object", "additionalProperties": {"type": "integer"}},
+                "row": {"type": "integer"},
+                "column": {"type": "integer"},
+                "product_id": {"type": "string"},
+                "quantity": {"type": "integer"},
+                "price": {"type": "number"},
+                "query": {"type": "string"},
+                "to": {"type": "string"},
+                "subject": {"type": "string"},
+                "body": {"type": "string"},
+                "text": {"type": "string"},
+                "task": {"type": "string"},
+                "key": {"type": "string"},
+                "content": {"type": "string"},
+            },
+            "required": ["action"],
+            "additionalProperties": False,
+        },
+    },
+}
+
 
 def _extract_json_candidate(text: str) -> str:
     stripped = (text or "").strip()
@@ -207,19 +238,16 @@ async def generate(
             if system_prompt
             else _VENDING_SHORT_RUN_HINT
         )
-        prompt = (
-            f"{effective_system_prompt}\n\n{user_prompt}"
-            if effective_system_prompt
-            else user_prompt
-        )
-
+        prompt = f"{effective_system_prompt}\n\n{user_prompt}"
         try:
             response = self._client.send_message(
                 text=prompt,
                 context={
                     "benchmark": "vending-bench",
                     "task_id": f"{self._run_id}:turn-{self._turn_counter}",
-                    "system_prompt": effective_system_prompt,
+                    "tools": [_VENDING_TOOL],
+                    "tool_choice": "required",
+                    "max_tokens": 512,
                     "temperature": temperature,
                     "run_id": self._run_id,
                     "turn": self._turn_counter,
diff --git a/packages/benchmarks/eliza-adapter/tests/test_vending_adapter.py b/packages/benchmarks/eliza-adapter/tests/test_vending_adapter.py
@@ -10,6 +10,7 @@ class _FakeClient:
     def __init__(self, response: MessageResponse | None = None) -> None:
         self.reset_task_ids: list[str] = []
         self.contexts: list[dict[str, object]] = []
+        self.messages: list[str] = []
         self.response = response
 
     def wait_until_ready(self, timeout: float = 120.0, poll: float = 1.0) -> None:
@@ -20,6 +21,7 @@ def reset(self, *, task_id: str, benchmark: str) -> dict[str, object]:
         return {"ok": True, "benchmark": benchmark}
 
     def send_message(self, text: str, context: dict[str, object]) -> MessageResponse:
+        self.messages.append(text)
         self.contexts.append(context)
         if self.response is not None:
             return self.response
@@ -42,6 +44,10 @@ def test_vending_provider_sends_to_the_per_turn_reset_session() -> None:
     assert client.reset_task_ids
     assert client.contexts[0]["task_id"] == client.reset_task_ids[-1]
     assert client.contexts[0]["benchmark"] == "vending-bench"
+    assert "system_prompt" not in client.contexts[0]
+    assert "messages" not in client.contexts[0]
+    assert "## Eliza short-run benchmark strategy" in client.messages[0]
+    assert "What next?" in client.messages[0]
 
 
 def test_vending_provider_normalizes_bare_tool_json() -> None:
diff --git a/packages/benchmarks/orchestrator/runner.py b/packages/benchmarks/orchestrator/runner.py
@@ -971,7 +971,9 @@ def _write_latest_result_snapshot(
         payload["publication_warnings"] = publication_warnings
     if is_synthetic:
         payload["synthetic"] = True
-    snapshot_tmp = snapshot_path.with_suffix(snapshot_path.suffix + ".tmp")
+    snapshot_tmp = snapshot_path.with_name(
+        f"{snapshot_path.name}.{os.getpid()}.{uuid4().hex}.tmp"
+    )
     snapshot_tmp.write_text(
         json.dumps(payload, indent=2, sort_keys=True, ensure_ascii=True),
         encoding="utf-8",