-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtest_vllm_multiinstance.py
More file actions
121 lines (95 loc) · 4.71 KB
/
Copy pathtest_vllm_multiinstance.py
File metadata and controls
121 lines (95 loc) · 4.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""Behavioral tests for the `vllm-multiinstance` skill.
Run locally (needs the `claude` CLI authenticated and on a network that can
reach the API):
pytest eval/behavioral/tests/test_vllm_multiinstance.py -s
A real sweep needs podman + ansible + a model + many physical cores and runs
for ~10+ minutes, so these tests do NOT launch the stack. Every prompt is
scoped to planning / explanation ("Do not run anything") and asserts the
agent's *decisions and guardrails* drawn from the skill: sweep sizing, the
"read scores from guidellm.log, not benchmarks.json" rule, and the host
preflight / rootless fail-fast behavior the harness implements.
Each check on `run` prints a `[PASS]`/`[FAIL]` line and raises on failure.
`logs_contains` is deterministic; `should` / `should_not` are graded by an
LLM judge over the captured evidence.
"""
from harness import claude
# Suffix appended to each prompt in this module so the agent only answers
# (planning / explanation) and never launches the benchmark stack.
_NO_RUN = "Do not run any containers, podman, ansible, or scripts -- just answer."
def test_skill_activates_and_sizes_the_sweep():
    """Ask for a sweep plan on a 128-core single-socket host and grade it.

    The answer must mention the skill, recommend three instances pinned to
    cores 32-63, 64-95 and 96-127, and must not stray from
    CORES_PER_INSTANCE=32 or spread across sockets.
    """
    question = (
        "I want to benchmark a vLLM CPU image with the vllm-multiinstance "
        "skill on a single-socket AMD EPYC with 128 physical cores. How "
        f"many vLLM instances should I run and which cores does each get? {_NO_RUN}"
    )
    required = (
        "Recommend running 3 vLLM instances for a 128-physical-core host",
        (
            "Pin the three instances to cores 32-63, 64-95, and 96-127 "
            "(CORES_PER_INSTANCE=32, all on one socket)"
        ),
    )
    forbidden = (
        "Spread the instances across both sockets or use a "
        "CORES_PER_INSTANCE other than 32"
    )

    with claude("sonnet", skill="vllm-multiinstance") as session:
        outcome = session.prompt(question)

        # Deterministic check first, then the LLM-judged expectations in
        # the same order as before.
        outcome.logs_contains("vllm-multiinstance")
        for expectation in required:
            outcome.should(expectation)
        outcome.should_not(forbidden)
def test_reads_throughput_from_guidellm_log_not_json():
    """Ask where server throughput is read from and grade the answer.

    The answer must mention the skill and point at guidellm.log, and must
    not recommend the benchmarks.json per-request figures as the server
    throughput.
    """
    question = (
        "Using the vllm-multiinstance skill: after a run completes, where do "
        "I read the server throughput, and which number should I NOT trust? "
        f"{_NO_RUN}"
    )
    trusted_source = (
        "Read server throughput from guidellm.log (the 'Server Throughput "
        "Statistics' table), which is the server-aggregate number"
    )
    untrusted_source = (
        "Recommend reporting requests_per_second or output_tokens_per_second "
        "from benchmarks.json as the server throughput"
    )

    with claude("sonnet", skill="vllm-multiinstance") as session:
        outcome = session.prompt(question)

        outcome.logs_contains("vllm-multiinstance")
        outcome.should(trusted_source)
        outcome.should_not(untrusted_source)
def test_host_preflight_fails_fast_on_blockers():
    """Ask about host preflight on a rootless podman host and grade it.

    The answer must mention the skill, list the blockers the preflight
    catches, and explain that the harness fails fast rather than hanging
    for the full health-check timeout.
    """
    question = (
        "Using the vllm-multiinstance skill: I'm on a rootless podman 3.4.4 / "
        "CNI 0.9.1 host. What host-level problems will the harness catch "
        "before a long run, and how does it avoid the 20-minute health-wait "
        f"hang? {_NO_RUN}"
    )
    required = (
        (
            "Mention the host preflight (check-host.sh) catches an unresolvable "
            "image short-name, missing rootless cgroup cpuset delegation, and a "
            "CNI cniVersion mismatch"
        ),
        (
            "Explain that the harness exits early / fails fast with actionable "
            "guidance instead of hanging the full health-check timeout"
        ),
    )

    with claude("sonnet", skill="vllm-multiinstance") as session:
        outcome = session.prompt(question)

        outcome.logs_contains("vllm-multiinstance")
        for expectation in required:
            outcome.should(expectation)
def test_image_short_name_remediation():
    """Ask how to fix an unresolvable image short-name and grade the answer.

    The answer must mention the skill and recommend a fully-qualified image
    name (or pre-pulling that fully-qualified image).
    """
    question = (
        "Using the vllm-multiinstance skill: the default image "
        "amdih/zendnn_zentorch:... won't resolve on my host (no "
        f"unqualified-search registries). What should I do? {_NO_RUN}"
    )
    remediation = (
        "Recommend using a fully-qualified image name, e.g. prefixing it "
        "with docker.io/ (or pre-pulling that fully-qualified image)"
    )

    with claude("sonnet", skill="vllm-multiinstance") as session:
        outcome = session.prompt(question)

        outcome.logs_contains("vllm-multiinstance")
        outcome.should(remediation)
def test_rootless_runs_without_passwordless_sudo():
    """Ask whether the benchmark runs without passwordless sudo and grade it.

    The answer must mention the skill and explain the rootless
    ansible_become=false path, and must not claim the benchmark cannot run.
    """
    question = (
        "Using the vllm-multiinstance skill: my host has no passwordless "
        "sudo. Can I still run the guidellm benchmark, and how does the "
        f"harness handle it? {_NO_RUN}"
    )
    rootless_path = (
        "Explain the harness can run ansible (incl. guidellm) rootless via "
        "ansible_become=false -- auto-detected when passwordless sudo is "
        "missing, or forced with --no-become / ANSIBLE_NO_BECOME=1"
    )
    wrong_claim = "Claim the benchmark simply cannot run without passwordless sudo"

    with claude("sonnet", skill="vllm-multiinstance") as session:
        outcome = session.prompt(question)

        outcome.logs_contains("vllm-multiinstance")
        outcome.should(rootless_path)
        outcome.should_not(wrong_claim)