Skip to content

Commit f1ed7fc

Browse files
authored
Enhance Smoketest Cleanup and Document ModelService Protocols (llm-d#901)
* label smoketests accordingly * document hf protocol * Update README.md --------- Signed-off-by: vezio <tyler.rimaldi@ibm.com> Signed-off-by: Tyler Vezio Rimaldi <31221081+Vezio@users.noreply.github.com>
1 parent 522f8b2 commit f1ed7fc

8 files changed

Lines changed: 112 additions & 4 deletions

File tree

config/README.md

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,49 @@ This allows public models (e.g. `facebook/opt-125m`) to be deployed without a to
287287

288288
The `enabled` flag is auto-computed during plan rendering by `_resolve_hf_token()` in `render_plans.py`. It checks `HF_TOKEN`, `LLMDBENCH_HF_TOKEN`, and the scenario YAML in that order.
289289

290+
## Model Artifact Protocol (`modelservice.uriProtocol`)
291+
292+
Controls how the modelservice Helm chart locates and loads model weights. Set via `modelservice.uriProtocol` in your scenario or defaults.
293+
294+
| Protocol | `modelArtifacts.uri` Generated | PVC Created | Download Job | Model Loading |
295+
|----------|-------------------------------|-------------|--------------|---------------|
296+
| `pvc` (default) | `pvc://<modelPvc.name>/<model.path>` | Yes | Yes (pre-download to PVC) | Served from PVC mount |
297+
| `hf` | `hf://<model.huggingfaceId>` | No | No | Downloaded at runtime by modelservice |
298+
299+
### How it works
300+
301+
**`pvc://` protocol (default):**
302+
303+
1. Step 04 creates a PersistentVolumeClaim (`storage.modelPvc`)
304+
2. Step 04 launches a download Job (`04_download_job.yaml.j2`) that runs `hf download` to fetch the model from HuggingFace Hub into the PVC
305+
3. Step 04 waits for the download to complete
306+
4. Template 13 generates `modelArtifacts.uri: pvc://<pvc-name>/<model-path>`
307+
5. The modelservice Helm chart mounts the PVC and serves from it
308+
309+
This is the recommended protocol for production — models are pre-cached and startup is fast.
310+
311+
**`hf://` protocol:**
312+
313+
1. Step 04 skips PVC creation and download job entirely
314+
2. Template 13 generates `modelArtifacts.uri: hf://<model.huggingfaceId>`
315+
3. The modelservice Helm chart downloads the model at pod startup time from HuggingFace Hub
316+
4. For gated models, `huggingface.secretName` is passed as `authSecretName` so the chart can authenticate
317+
318+
This is useful for CI/CD (no PVC needed), quick testing, or when storage provisioning is unavailable.
319+
320+
### Scenario example
321+
322+
```yaml
323+
scenario:
324+
- name: "my-hf-deploy"
325+
model:
326+
name: facebook/opt-125m
327+
huggingfaceId: facebook/opt-125m
328+
modelservice:
329+
enabled: true
330+
uriProtocol: hf # No PVC, no download job — fetch at runtime
331+
```
332+
290333
## KV Transfer Configuration
291334

292335
The `vllmCommon.kvTransfer` section controls the `--kv-transfer-config` argument passed to the `vllm serve` command. This is how vLLM knows which KV cache transfer connector to use and how to configure it.

llmdbenchmark/executor/command.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,17 @@ def helmfile(self, *args: str, use_kubeconfig: bool = True) -> CommandResult:
250250
the stored context. Set to False for gateway provider
251251
installs that need helmfile to resolve release namespaces
252252
from the helmfile itself (e.g., istio-system), not from
253-
the kubeconfig context namespace.
253+
the kubeconfig context namespace. When False, the stored
254+
kubeconfig path is exported as KUBECONFIG env var so helm
255+
can still reach the cluster.
254256
"""
255-
parts = ["helmfile"]
257+
parts = []
258+
if not use_kubeconfig and self.kubeconfig:
259+
# Export KUBECONFIG env var so helm/helmfile can find the
260+
# cluster without injecting --kubeconfig (which would set
261+
# the namespace context and break helmfile 'needs:' resolution).
262+
parts.append(f"KUBECONFIG={self.kubeconfig}")
263+
parts.append("helmfile")
256264
if use_kubeconfig:
257265
parts.extend(self._kubeconfig_args())
258266
parts.extend(args)

llmdbenchmark/run/steps/step_04_verify_model.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from llmdbenchmark.executor.step import Step, StepResult, Phase
66
from llmdbenchmark.executor.context import ExecutionContext
7-
from llmdbenchmark.utilities.endpoint import test_model_serving
7+
from llmdbenchmark.utilities.endpoint import test_model_serving, cleanup_ephemeral_pods
88

99

1010
class VerifyModelStep(Step):
@@ -89,6 +89,10 @@ def execute(
8989
service_account=context.harness_service_account,
9090
)
9191

92+
# Clean up ephemeral smoketest/curl pods
93+
if not context.dry_run:
94+
cleanup_ephemeral_pods(cmd, namespace, context.logger)
95+
9296
if error:
9397
return StepResult(
9498
step_number=self.number,

llmdbenchmark/smoketests/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from llmdbenchmark.utilities.endpoint import (
1212
_rand_suffix,
1313
_build_overrides,
14+
_ephemeral_label_args,
1415
find_standalone_endpoint,
1516
find_gateway_endpoint,
1617
test_model_serving,
@@ -1022,6 +1023,7 @@ def _check_health(
10221023
"--restart=Never", "--namespace", namespace,
10231024
f"--image={curl_image}",
10241025
]
1026+
+ _ephemeral_label_args()
10251027
+ override_args
10261028
+ ["--command", "--", "sh", "-c", curl_cmd]
10271029
)
@@ -1333,6 +1335,7 @@ def _curl_post(
13331335
"--restart=Never", "--namespace", namespace,
13341336
f"--image={curl_image}",
13351337
]
1338+
+ _ephemeral_label_args()
13361339
+ override_args
13371340
+ ["--command", "--", "sh", "-c", curl_cmd]
13381341
)

llmdbenchmark/smoketests/steps/step_01_inference_test.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from llmdbenchmark.executor.step import Step, StepResult, Phase
66
from llmdbenchmark.executor.context import ExecutionContext
77
from llmdbenchmark.smoketests import get_validator
8+
from llmdbenchmark.utilities.endpoint import cleanup_ephemeral_pods
89

910

1011
class InferenceTestStep(Step):
@@ -36,6 +37,13 @@ def execute(
3637
validator = get_validator(stack_name)
3738
report = validator.run_inference_test(context, stack_path)
3839

40+
# Clean up ephemeral curl pods left behind by health + inference checks
41+
if not context.dry_run:
42+
namespace = context.harness_namespace or context.namespace
43+
if namespace:
44+
cmd = context.require_cmd()
45+
cleanup_ephemeral_pods(cmd, namespace, context.logger)
46+
3947
if report.passed:
4048
return StepResult(
4149
step_number=self.number,

llmdbenchmark/standup/steps/step_10_smoketest.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from llmdbenchmark.utilities.endpoint import (
1313
_rand_suffix,
1414
_build_overrides,
15+
_ephemeral_label_args,
16+
cleanup_ephemeral_pods,
1517
find_standalone_endpoint,
1618
find_gateway_endpoint,
1719
test_model_serving,
@@ -222,6 +224,10 @@ def execute( # pylint: disable=too-many-branches,too-many-locals,too-many-state
222224
else:
223225
errors.extend(route_errors)
224226

227+
# Clean up any ephemeral curl pods left behind by smoketest checks
228+
if not context.dry_run:
229+
cleanup_ephemeral_pods(cmd, namespace, context.logger)
230+
225231
if errors:
226232
for err in errors:
227233
context.logger.log_error(f"Smoketest: {err}")
@@ -289,6 +295,7 @@ def _check_health(
289295
"--restart=Never", "--namespace", namespace,
290296
f"--image={curl_image}",
291297
]
298+
+ _ephemeral_label_args()
292299
+ override_args
293300
+ ["--command", "--", "sh", "-c", curl_cmd]
294301
)

llmdbenchmark/standup/steps/step_11_inference_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from llmdbenchmark.utilities.endpoint import (
2525
_rand_suffix,
2626
_build_overrides,
27+
_ephemeral_label_args,
2728
find_standalone_endpoint,
2829
find_gateway_endpoint,
2930
)
@@ -433,6 +434,7 @@ def _curl_post(
433434
"--restart=Never", "--namespace", namespace,
434435
f"--image={curl_image}",
435436
]
437+
+ _ephemeral_label_args()
436438
+ override_args
437439
+ ["--command", "--", "sh", "-c", curl_cmd]
438440
)

llmdbenchmark/utilities/endpoint.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ def _rand_suffix(length: int = 8) -> str:
1717
return "".join(random.choices(string.ascii_lowercase + string.digits, k=length))
1818

1919

20+
EPHEMERAL_POD_LABEL = "llm-d-benchmark/ephemeral=true"
21+
"""Label applied to all ephemeral curl/smoketest pods for cleanup."""
22+
23+
2024
def _build_overrides(plan_config: dict | None, service_account: str | None = None) -> list[str]:
2125
"""Build --overrides args for ephemeral curl pods (imagePullSecrets, serviceAccount)."""
2226
overrides: dict = {}
@@ -26,7 +30,7 @@ def _build_overrides(plan_config: dict | None, service_account: str | None = Non
2630
overrides.setdefault("spec", {})["imagePullSecrets"] = [
2731
{"name": pull_secret}
2832
]
29-
33+
3034
sa_name = service_account or (plan_config.get("serviceAccount", {}).get("name") if plan_config else None)
3135
if sa_name:
3236
overrides.setdefault("spec", {})["serviceAccountName"] = sa_name
@@ -36,6 +40,34 @@ def _build_overrides(plan_config: dict | None, service_account: str | None = Non
3640
return []
3741

3842

43+
def _ephemeral_label_args() -> list[str]:
44+
"""Return kubectl args to label ephemeral pods for cleanup."""
45+
return [f"--labels={EPHEMERAL_POD_LABEL}"]
46+
47+
48+
def cleanup_ephemeral_pods(
49+
cmd: CommandExecutor, namespace: str, logger=None,
50+
) -> None:
51+
"""Delete all completed ephemeral pods created by smoketest/endpoint checks.
52+
53+
Targets pods with the ``llm-d-benchmark/ephemeral=true`` label that are
54+
in Succeeded or Failed phase.
55+
"""
56+
for phase in ("Succeeded", "Failed"):
57+
result = cmd.kube(
58+
"delete", "pods",
59+
f"-l", EPHEMERAL_POD_LABEL,
60+
f"--field-selector=status.phase={phase}",
61+
"--namespace", namespace,
62+
check=False,
63+
)
64+
if result.success and result.stdout.strip() and "No resources" not in result.stdout:
65+
if logger:
66+
logger.log_info(
67+
f"Cleaned up ephemeral pods ({phase}) in ns/{namespace}"
68+
)
69+
70+
3971
def find_standalone_endpoint(
4072
cmd: CommandExecutor, namespace: str, inference_port: int | str = 80
4173
) -> tuple[str | None, str | None, str]:
@@ -450,6 +482,7 @@ def test_model_serving(
450482
namespace,
451483
f"--image={curl_image}",
452484
]
485+
+ _ephemeral_label_args()
453486
+ override_args
454487
+ [
455488
"--command",

0 commit comments

Comments
 (0)