Skip to content

Commit dd2b63c

Browse files
author
Shaw
committed
chore: commit post-push validation edits
1 parent 05bd6ef commit dd2b63c

5 files changed

Lines changed: 75 additions & 13 deletions

File tree

packages/agent/src/__tests__/game-tui-mounted-surfaces.test.tsx

Lines changed: 20 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,32 @@
11
// @vitest-environment jsdom
22

3-
import { readdirSync } from "node:fs";
3+
import { existsSync, readdirSync } from "node:fs";
44
import { createRequire } from "node:module";
5-
import { join } from "node:path";
5+
import { dirname, join } from "node:path";
66
import { fireEvent, screen } from "@testing-library/dom";
77
import type ReactTypes from "react";
88
import { afterEach, describe, expect, it, vi } from "vitest";
99

10-
const pluginRequire = createRequire(
11-
`${process.cwd()}/plugins/plugin-clawville/src/ui/ClawvilleOperatorSurface.tsx`,
10+
/**
 * Walks upward from `start` and returns the first existing path obtained by
 * joining a directory on that chain with `relativePath`.
 *
 * @param start - Directory where the upward search begins.
 * @param relativePath - Path to look for, relative to each directory visited.
 * @returns The joined path of the nearest match.
 * @throws Error when the top of the directory chain is reached without a match.
 */
function findAncestor(start: string, relativePath: string): string {
  let current = start;
  while (true) {
    const candidate = join(current, relativePath);
    if (existsSync(candidate)) return candidate;
    const parent = dirname(current);
    // No further parent to climb to: dirname() handed back the same path.
    if (parent === current) {
      throw new Error(`Unable to locate ${relativePath}`);
    }
    current = parent;
  }
}
22+
23+
const clawvilleSurfacePath = findAncestor(
24+
process.cwd(),
25+
"plugins/plugin-clawville/src/ui/ClawvilleOperatorSurface.tsx",
1226
);
27+
const pluginRequire = createRequire(clawvilleSurfacePath);
1328
const React = pluginRequire("react") as typeof ReactTypes;
14-
const bunModulesDir = join(process.cwd(), "node_modules", ".bun");
29+
const bunModulesDir = findAncestor(process.cwd(), "node_modules/.bun");
1530
const reactDomPackageDir = readdirSync(bunModulesDir).find((entry) =>
1631
entry.startsWith(`react-dom@${React.version}+`),
1732
);

packages/app-core/src/benchmark/server.ts

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -237,11 +237,18 @@ function isActionCallingBenchmarkName(benchmark: string): boolean {
237237
return (
238238
normalized === "action-calling" ||
239239
normalized === "action_calling" ||
240+
normalized === "vending-bench" ||
241+
normalized === "vending_bench" ||
240242
normalized === "tau_bench" ||
241243
normalized === "tau-bench"
242244
);
243245
}
244246

247+
/**
 * Reports whether `benchmark` names the Vending-Bench benchmark.
 *
 * The comparison ignores surrounding whitespace and letter case, and accepts
 * both the hyphenated and the underscored spelling.
 *
 * @param benchmark - Benchmark name to test.
 * @returns `true` for `vending-bench` / `vending_bench`, otherwise `false`.
 */
function isVendingBenchmarkName(benchmark: string): boolean {
  const normalized = benchmark.trim().toLowerCase();
  return normalized === "vending-bench" || normalized === "vending_bench";
}
251+
245252
function normalizeActionCallingNativeMessages(
246253
text: string,
247254
context: Record<string, unknown>,
@@ -2489,9 +2496,13 @@ export async function startBenchmarkServer() {
24892496
) {
24902497
const nativeMessages = _isTauBenchmarkName(session.benchmark)
24912498
? _normalizeTauNativeMessages(text, benchmarkContext)
2499+
: isVendingBenchmarkName(session.benchmark)
2500+
? normalizeLocaNativeMessages(benchmarkContext.messages)
24922501
: normalizeActionCallingNativeMessages(text, benchmarkContext);
24932502
const openAiMessages = _isTauBenchmarkName(session.benchmark)
24942503
? nativeMessages
2504+
: isVendingBenchmarkName(session.benchmark)
2505+
? nativeMessages
24952506
: normalizeActionCallingOpenAiMessages(text, benchmarkContext);
24962507
const maxTokens =
24972508
typeof benchmarkContext.max_tokens === "number"

packages/benchmarks/eliza-adapter/eliza_adapter/vending_bench.py

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,37 @@
6666
- If the last result says an order was already placed today, do not place another order; ADVANCE_DAY.
6767
"""
6868

69+
# Function-tool definition describing the single BENCHMARK_ACTION call.
# Only "action" is required; every other property is an optional argument,
# and unknown properties are rejected ("additionalProperties": False).
_VENDING_TOOL = {
    "type": "function",
    "function": {
        "name": "BENCHMARK_ACTION",
        "description": "Return exactly one Vending-Bench action for this turn.",
        "parameters": {
            "type": "object",
            "properties": {
                # Every known action except VIEW_STATE, sorted for a stable enum order.
                "action": {"type": "string", "enum": sorted(_VENDING_ACTIONS - {"VIEW_STATE"})},
                "supplier_id": {"type": "string"},
                # Free-form object whose values must be integers.
                "items": {"type": "object", "additionalProperties": {"type": "integer"}},
                "row": {"type": "integer"},
                "column": {"type": "integer"},
                "product_id": {"type": "string"},
                "quantity": {"type": "integer"},
                "price": {"type": "number"},
                "query": {"type": "string"},
                "to": {"type": "string"},
                "subject": {"type": "string"},
                "body": {"type": "string"},
                "text": {"type": "string"},
                "task": {"type": "string"},
                "key": {"type": "string"},
                "content": {"type": "string"},
            },
            "required": ["action"],
            "additionalProperties": False,
        },
    },
}
99+
69100

70101
def _extract_json_candidate(text: str) -> str:
71102
stripped = (text or "").strip()
@@ -207,19 +238,16 @@ async def generate(
207238
if system_prompt
208239
else _VENDING_SHORT_RUN_HINT
209240
)
210-
prompt = (
211-
f"{effective_system_prompt}\n\n{user_prompt}"
212-
if effective_system_prompt
213-
else user_prompt
214-
)
215-
241+
prompt = f"{effective_system_prompt}\n\n{user_prompt}"
216242
try:
217243
response = self._client.send_message(
218244
text=prompt,
219245
context={
220246
"benchmark": "vending-bench",
221247
"task_id": f"{self._run_id}:turn-{self._turn_counter}",
222-
"system_prompt": effective_system_prompt,
248+
"tools": [_VENDING_TOOL],
249+
"tool_choice": "required",
250+
"max_tokens": 512,
223251
"temperature": temperature,
224252
"run_id": self._run_id,
225253
"turn": self._turn_counter,

packages/benchmarks/eliza-adapter/tests/test_vending_adapter.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ class _FakeClient:
1010
def __init__(self, response: MessageResponse | None = None) -> None:
1111
self.reset_task_ids: list[str] = []
1212
self.contexts: list[dict[str, object]] = []
13+
self.messages: list[str] = []
1314
self.response = response
1415

1516
def wait_until_ready(self, timeout: float = 120.0, poll: float = 1.0) -> None:
@@ -20,6 +21,7 @@ def reset(self, *, task_id: str, benchmark: str) -> dict[str, object]:
2021
return {"ok": True, "benchmark": benchmark}
2122

2223
def send_message(self, text: str, context: dict[str, object]) -> MessageResponse:
24+
self.messages.append(text)
2325
self.contexts.append(context)
2426
if self.response is not None:
2527
return self.response
@@ -42,6 +44,10 @@ def test_vending_provider_sends_to_the_per_turn_reset_session() -> None:
4244
assert client.reset_task_ids
4345
assert client.contexts[0]["task_id"] == client.reset_task_ids[-1]
4446
assert client.contexts[0]["benchmark"] == "vending-bench"
47+
assert "system_prompt" not in client.contexts[0]
48+
assert "messages" not in client.contexts[0]
49+
assert "## Eliza short-run benchmark strategy" in client.messages[0]
50+
assert "What next?" in client.messages[0]
4551

4652

4753
def test_vending_provider_normalizes_bare_tool_json() -> None:

packages/benchmarks/orchestrator/runner.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -971,7 +971,9 @@ def _write_latest_result_snapshot(
971971
payload["publication_warnings"] = publication_warnings
972972
if is_synthetic:
973973
payload["synthetic"] = True
974-
snapshot_tmp = snapshot_path.with_suffix(snapshot_path.suffix + ".tmp")
974+
snapshot_tmp = snapshot_path.with_name(
975+
f"{snapshot_path.name}.{os.getpid()}.{uuid4().hex}.tmp"
976+
)
975977
snapshot_tmp.write_text(
976978
json.dumps(payload, indent=2, sort_keys=True, ensure_ascii=True),
977979
encoding="utf-8",

0 commit comments

Comments
 (0)