Skip to content

Commit f302a7b

Browse files
author
Shaw
committed
chore: commit current validation and training updates
1 parent 5d661b1 commit f302a7b

23 files changed

Lines changed: 510 additions & 45 deletions
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
"""OpenAI-compatible bridge from Hermes-native envs to benchmark harnesses.
2+
3+
The Hermes-native terminal/simulation envs drive their rollout loop through an
4+
OpenAI chat-completions endpoint. For cross-harness runs we keep that real env
5+
and scorer, but point the model endpoint at this local bridge so model turns
6+
are answered by the selected Eliza/OpenClaw adapter instead of bypassing the
7+
harness label.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import json
13+
import os
14+
import socket
15+
import threading
16+
import time
17+
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
18+
from typing import Any, Mapping
19+
20+
21+
class HarnessOpenAIProxy:
22+
"""Small local ``/chat/completions`` server backed by a harness client."""
23+
24+
def __init__(
25+
self,
26+
*,
27+
harness: str,
28+
provider: str,
29+
model: str,
30+
upstream_base_url: str | None = None,
31+
) -> None:
32+
harness = harness.strip().lower()
33+
if harness not in {"eliza", "openclaw"}:
34+
raise ValueError(f"unsupported proxy harness: {harness!r}")
35+
self.harness = harness
36+
self.provider = provider or "cerebras"
37+
self.model = model
38+
self.upstream_base_url = upstream_base_url
39+
self._client: Any | None = None
40+
self._server_handle: Any | None = None
41+
self._httpd: ThreadingHTTPServer | None = None
42+
self._thread: threading.Thread | None = None
43+
self.base_url: str | None = None
44+
45+
def start(self) -> "HarnessOpenAIProxy":
46+
self._client, self._server_handle = _build_client(
47+
harness=self.harness,
48+
provider=self.provider,
49+
model=self.model,
50+
upstream_base_url=self.upstream_base_url,
51+
)
52+
53+
proxy = self
54+
55+
class Handler(BaseHTTPRequestHandler):
56+
protocol_version = "HTTP/1.1"
57+
58+
def do_GET(self) -> None: # noqa: N802
59+
if self.path.rstrip("/") in {"", "/v1", "/health"}:
60+
self._write_json({"status": "ok", "harness": proxy.harness})
61+
return
62+
self.send_error(404)
63+
64+
def do_POST(self) -> None: # noqa: N802
65+
path = self.path.rstrip("/")
66+
if path not in {"/chat/completions", "/v1/chat/completions"}:
67+
self.send_error(404)
68+
return
69+
try:
70+
payload = self._read_json()
71+
response = proxy.complete(payload)
72+
self._write_json(response)
73+
except Exception as exc: # noqa: BLE001
74+
self._write_json(
75+
{
76+
"error": {
77+
"message": f"{exc.__class__.__name__}: {exc}",
78+
"type": "harness_proxy_error",
79+
}
80+
},
81+
status=500,
82+
)
83+
84+
def log_message(self, format: str, *args: Any) -> None:
85+
return
86+
87+
def _read_json(self) -> dict[str, Any]:
88+
length = int(self.headers.get("Content-Length") or "0")
89+
raw = self.rfile.read(length) if length else b"{}"
90+
data = json.loads(raw.decode("utf-8"))
91+
if not isinstance(data, dict):
92+
raise ValueError("request body must be a JSON object")
93+
return data
94+
95+
def _write_json(self, payload: Mapping[str, Any], *, status: int = 200) -> None:
96+
body = json.dumps(payload, ensure_ascii=True).encode("utf-8")
97+
self.send_response(status)
98+
self.send_header("Content-Type", "application/json")
99+
self.send_header("Content-Length", str(len(body)))
100+
self.end_headers()
101+
self.wfile.write(body)
102+
103+
host = "127.0.0.1"
104+
self._httpd = ThreadingHTTPServer((host, _free_port(host)), Handler)
105+
self.base_url = f"http://{host}:{self._httpd.server_port}/v1"
106+
self._thread = threading.Thread(
107+
target=self._httpd.serve_forever,
108+
name=f"{self.harness}-openai-proxy",
109+
daemon=True,
110+
)
111+
self._thread.start()
112+
return self
113+
114+
def stop(self) -> None:
115+
if self._httpd is not None:
116+
self._httpd.shutdown()
117+
self._httpd.server_close()
118+
self._httpd = None
119+
if self._thread is not None:
120+
self._thread.join(timeout=5)
121+
self._thread = None
122+
stop = getattr(self._server_handle, "stop", None)
123+
if callable(stop):
124+
stop()
125+
self._server_handle = None
126+
self._client = None
127+
128+
def complete(self, payload: Mapping[str, Any]) -> dict[str, Any]:
129+
if self._client is None:
130+
raise RuntimeError("proxy client is not started")
131+
messages = _messages(payload.get("messages"))
132+
text = _last_user_text(messages)
133+
context: dict[str, Any] = {
134+
"benchmark": "hermes_native_env",
135+
"source_benchmark": "hermes_native_env",
136+
"harness_proxy": self.harness,
137+
"messages": messages,
138+
"tools": payload.get("tools") if isinstance(payload.get("tools"), list) else [],
139+
"tool_choice": payload.get("tool_choice"),
140+
"temperature": payload.get("temperature"),
141+
"max_tokens": payload.get("max_tokens"),
142+
}
143+
response = self._client.send_message(text, context=context)
144+
content = str(getattr(response, "text", "") or "")
145+
params = getattr(response, "params", {}) or {}
146+
tool_calls = _normalize_tool_calls(params.get("tool_calls"))
147+
message: dict[str, Any] = {"role": "assistant", "content": content}
148+
if tool_calls:
149+
message["tool_calls"] = tool_calls
150+
message["content"] = content or None
151+
now = int(time.time())
152+
return {
153+
"id": f"chatcmpl-{self.harness}-{now}",
154+
"object": "chat.completion",
155+
"created": now,
156+
"model": self.model,
157+
"choices": [{"index": 0, "message": message, "finish_reason": "tool_calls" if tool_calls else "stop"}],
158+
"usage": _usage(params.get("usage")),
159+
}
160+
161+
162+
def _build_client(
163+
*,
164+
harness: str,
165+
provider: str,
166+
model: str,
167+
upstream_base_url: str | None,
168+
) -> tuple[Any, Any | None]:
169+
if harness == "eliza":
170+
from eliza_adapter import ElizaClient, ElizaServerManager # noqa: WPS433
171+
172+
if not os.environ.get("ELIZA_BENCH_URL"):
173+
server = ElizaServerManager()
174+
server.start()
175+
return server.client, server
176+
client = ElizaClient()
177+
client.wait_until_ready(timeout=180)
178+
return client, None
179+
if harness == "openclaw":
180+
from openclaw_adapter.client import OpenClawClient # noqa: WPS433
181+
182+
return (
183+
OpenClawClient(
184+
provider=provider or "cerebras",
185+
model=model,
186+
base_url=upstream_base_url,
187+
direct_openai_compatible=True,
188+
),
189+
None,
190+
)
191+
raise ValueError(f"unsupported proxy harness: {harness!r}")
192+
193+
194+
def _free_port(host: str) -> int:
195+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
196+
sock.bind((host, 0))
197+
return int(sock.getsockname()[1])
198+
199+
200+
def _messages(value: object) -> list[dict[str, Any]]:
201+
if not isinstance(value, list):
202+
return []
203+
rows: list[dict[str, Any]] = []
204+
for item in value:
205+
if isinstance(item, dict):
206+
rows.append(dict(item))
207+
return rows
208+
209+
210+
def _last_user_text(messages: list[dict[str, Any]]) -> str:
211+
for message in reversed(messages):
212+
if message.get("role") == "user":
213+
content = message.get("content")
214+
if isinstance(content, str):
215+
return content
216+
if isinstance(content, list):
217+
parts: list[str] = []
218+
for part in content:
219+
if isinstance(part, dict) and isinstance(part.get("text"), str):
220+
parts.append(part["text"])
221+
return "\n".join(parts)
222+
return json.dumps(messages, ensure_ascii=True)
223+
224+
225+
def _normalize_tool_calls(value: object) -> list[dict[str, Any]]:
226+
if not isinstance(value, list):
227+
return []
228+
calls: list[dict[str, Any]] = []
229+
for index, item in enumerate(value):
230+
if not isinstance(item, dict):
231+
continue
232+
function = item.get("function")
233+
if not isinstance(function, dict):
234+
name = item.get("name")
235+
arguments = item.get("arguments")
236+
if isinstance(name, str):
237+
function = {
238+
"name": name,
239+
"arguments": arguments if isinstance(arguments, str) else json.dumps(arguments or {}),
240+
}
241+
if not isinstance(function, dict) or not isinstance(function.get("name"), str):
242+
continue
243+
arguments = function.get("arguments")
244+
calls.append(
245+
{
246+
"id": str(item.get("id") or f"call_{index}"),
247+
"type": "function",
248+
"function": {
249+
"name": function["name"],
250+
"arguments": arguments if isinstance(arguments, str) else json.dumps(arguments or {}),
251+
},
252+
}
253+
)
254+
return calls
255+
256+
257+
def _usage(value: object) -> dict[str, int]:
258+
if not isinstance(value, dict):
259+
return {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
260+
usage: dict[str, int] = {}
261+
for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
262+
raw = value.get(key)
263+
usage[key] = int(raw) if isinstance(raw, (int, float)) else 0
264+
return usage
265+
266+
267+
__all__ = ["HarnessOpenAIProxy"]

packages/benchmarks/hermes-adapter/run_env_cli.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from pathlib import Path
2121

2222
from hermes_adapter.env_runner import run_hermes_env
23+
from hermes_adapter.harness_openai_proxy import HarnessOpenAIProxy
2324
from hermes_adapter.swe_env_smoke import run_humanevalpack_swe_smoke
2425

2526

@@ -92,18 +93,32 @@ def main(argv: list[str] | None = None) -> int:
9293
timeout_s=args.timeout_seconds,
9394
)
9495
else:
95-
result = run_hermes_env(
96-
env_id,
97-
output_dir=output_dir,
98-
provider=args.provider,
99-
model=args.model,
100-
base_url=args.base_url,
101-
repo_path=repo_path,
102-
max_tasks=args.max_tasks,
103-
task_filter=args.task_filter,
104-
timeout_s=args.timeout_seconds,
105-
force=args.force,
106-
)
96+
proxy = None
97+
try:
98+
base_url = args.base_url
99+
if args.harness in {"eliza", "openclaw"}:
100+
proxy = HarnessOpenAIProxy(
101+
harness=args.harness,
102+
provider=args.provider,
103+
model=args.model,
104+
upstream_base_url=args.base_url,
105+
).start()
106+
base_url = proxy.base_url
107+
result = run_hermes_env(
108+
env_id,
109+
output_dir=output_dir,
110+
provider=args.provider,
111+
model=args.model,
112+
base_url=base_url,
113+
repo_path=repo_path,
114+
max_tasks=args.max_tasks,
115+
task_filter=args.task_filter,
116+
timeout_s=args.timeout_seconds,
117+
force=args.force,
118+
)
119+
finally:
120+
if proxy is not None:
121+
proxy.stop()
107122

108123
ts = time.strftime("%Y%m%dT%H%M%SZ", time.gmtime())
109124
result_path = output_dir / f"hermes_{env_id}_{ts}.json"
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from __future__ import annotations
2+
3+
from types import SimpleNamespace
4+
5+
from hermes_adapter.harness_openai_proxy import HarnessOpenAIProxy
6+
7+
8+
class _FakeClient:
9+
def __init__(self) -> None:
10+
self.calls: list[tuple[str, dict[str, object]]] = []
11+
12+
def send_message(self, text: str, context: dict[str, object]):
13+
self.calls.append((text, context))
14+
return SimpleNamespace(
15+
text="",
16+
params={
17+
"tool_calls": [
18+
{
19+
"function": {
20+
"name": "terminal",
21+
"arguments": {"cmd": "pytest -q"},
22+
}
23+
}
24+
],
25+
"usage": {"prompt_tokens": 3, "completion_tokens": 2, "total_tokens": 5},
26+
},
27+
)
28+
29+
30+
def test_proxy_completion_forwards_messages_tools_and_returns_openai_shape() -> None:
    """``complete`` forwards the request and returns an OpenAI-shaped reply.

    A fake client is attached directly, so no HTTP server or real harness is
    started. The test checks that the last user text, the messages and the
    tools reach the client, and that the canned tool call comes back with
    JSON-encoded arguments and the reported usage.
    """
    conversation = [
        {"role": "system", "content": "use tools"},
        {"role": "user", "content": "fix the repo"},
    ]
    tool_specs = [{"type": "function", "function": {"name": "terminal"}}]
    payload = {
        "messages": conversation,
        "tools": tool_specs,
        "tool_choice": "auto",
        "temperature": 0,
    }

    proxy = HarnessOpenAIProxy(harness="openclaw", provider="cerebras", model="m")
    proxy._client = _FakeClient()

    response = proxy.complete(payload)

    fake = proxy._client
    assert isinstance(fake, _FakeClient)

    # What the harness client received.
    sent_text, context = fake.calls[0]
    assert sent_text == "fix the repo"
    assert context["harness_proxy"] == "openclaw"
    assert context["messages"] == payload["messages"]
    assert context["tools"] == payload["tools"]

    # What the proxy handed back to the caller.
    choice = response["choices"][0]
    assert choice["finish_reason"] == "tool_calls"
    message = choice["message"]
    first_call = message["tool_calls"][0]["function"]
    assert first_call["name"] == "terminal"
    assert first_call["arguments"] == '{"cmd": "pytest -q"}'
    assert response["usage"]["total_tokens"] == 5

packages/benchmarks/orchestrator/adapters.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,14 +141,13 @@ def _agent_compatibility_for(benchmark_id: str) -> tuple[str, ...]:
141141
return ALL_HARNESSES if _has_terminal_bench_docker_backend() else ()
142142
if benchmark_id == "gauntlet":
143143
return ALL_HARNESSES if _has_gauntlet_real_surfpool_backend() else ()
144-
if benchmark_id == "hermes_swe_env":
145-
return ALL_HARNESSES if _has_hermes_sandbox_backend() else ()
146144
if benchmark_id in {
147145
"hermes_tblite",
148146
"hermes_terminalbench_2",
149147
"hermes_yc_bench",
148+
"hermes_swe_env",
150149
}:
151-
return ("hermes",) if _has_hermes_sandbox_backend() else ()
150+
return ALL_HARNESSES if _has_hermes_sandbox_backend() else ()
152151
if benchmark_id == "voicebench":
153152
return ALL_HARNESSES if _has_voicebench_real_audio_assets() else ()
154153
if benchmark_id == "voicebench_quality":

0 commit comments

Comments
 (0)