Skip to content

Commit a30d3dd

Browse files
committed
Fix Windows harness bug
1 parent 6bbce34 commit a30d3dd

1 file changed

Lines changed: 13 additions & 6 deletions

File tree

eval/behavioral/harness.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,16 @@ def check_api_reachable(model: str | None = DEFAULT_MODEL, timeout: int = 60) ->
9797
return False, "'claude' CLI not found on PATH"
9898

9999
model = _enforce_model_policy(model)
100-
cmd = [claude_bin, "-p", "Reply with the single word: ok", "--output-format", "json"]
100+
cmd = [claude_bin, "-p", "--output-format", "json"]
101101
if model:
102102
cmd += ["--model", model]
103103

104+
# Prompt goes over stdin (see `_run_agent` for why). This prompt is only a
105+
# single line, but it is sent the same way to keep every CLI call consistent.
104106
try:
105107
proc = subprocess.run(
106108
cmd, capture_output=True, text=True, encoding="utf-8",
107-
stdin=subprocess.DEVNULL, timeout=timeout, env=_claude_env(),
109+
input="Reply with the single word: ok", timeout=timeout, env=_claude_env(),
108110
)
109111
except subprocess.TimeoutExpired:
110112
return False, f"API preflight timed out after {timeout}s (is the network reachable?)"
@@ -135,7 +137,7 @@ def _run_agent(prompt_text: str, workspace: Path, model: str | None, effort: str
135137
raise RuntimeError("'claude' CLI not found on PATH")
136138

137139
cmd = [
138-
claude_bin, "-p", prompt_text,
140+
claude_bin, "-p",
139141
"--output-format", "stream-json", "--verbose",
140142
"--dangerously-skip-permissions",
141143
"--add-dir", str(workspace),
@@ -145,9 +147,14 @@ def _run_agent(prompt_text: str, workspace: Path, model: str | None, effort: str
145147
if effort:
146148
cmd += ["--effort", effort]
147149

150+
# Pass the prompt over stdin rather than as an argv string. On Windows, when
151+
# `claude` resolves to a .cmd/.ps1 shim, a multi-line command-line argument
152+
# is re-parsed by cmd.exe/PowerShell and truncated at the first newline.
153+
# stdin is a raw byte stream and is immune to that on all platforms, so
154+
# multi-line test prompts stay intact.
148155
proc = subprocess.run(
149156
cmd, cwd=str(workspace), capture_output=True, text=True,
150-
encoding="utf-8", stdin=subprocess.DEVNULL, env=_claude_env(),
157+
encoding="utf-8", input=prompt_text, env=_claude_env(),
151158
)
152159

153160
events: list[dict] = []
@@ -229,7 +236,7 @@ def _grade_with_llm(statement: str, run: "Run", judge_model: str | None) -> tupl
229236
'{"pass": true|false, "reason": "<one short sentence>"}'
230237
)
231238
cmd = [
232-
claude_bin, "-p", prompt_text,
239+
claude_bin, "-p",
233240
"--output-format", "json",
234241
"--dangerously-skip-permissions",
235242
"--add-dir", str(run.workspace),
@@ -240,7 +247,7 @@ def _grade_with_llm(statement: str, run: "Run", judge_model: str | None) -> tupl
240247
try:
241248
proc = subprocess.run(
242249
cmd, capture_output=True, text=True, encoding="utf-8",
243-
stdin=subprocess.DEVNULL, timeout=180, env=_claude_env(),
250+
input=prompt_text, timeout=180, env=_claude_env(),
244251
)
245252
except subprocess.TimeoutExpired:
246253
return False, "llm_judge timed out after 180s"

0 commit comments

Comments
 (0)