Enhance Smoketest Cleanup and Document ModelService Protocols (llm-d#901)

Vezio · web-flow · commit f1ed7fc2138b · 2026-04-06T14:24:10.000-04:00
* label smoketests accordingly

* document hf protocol

* Update README.md

---------

Signed-off-by: vezio <tyler.rimaldi@ibm.com>
Signed-off-by: Tyler Vezio Rimaldi <31221081+Vezio@users.noreply.github.com>
diff --git a/config/README.md b/config/README.md
@@ -287,6 +287,49 @@ This allows public models (e.g. `facebook/opt-125m`) to be deployed without a to
 
 The `enabled` flag is auto-computed during plan rendering by `_resolve_hf_token()` in `render_plans.py`. It checks `HF_TOKEN`, `LLMDBENCH_HF_TOKEN`, and the scenario YAML in that order.
 
+## Model Artifact Protocol (`modelservice.uriProtocol`)
+
+Controls how the modelservice Helm chart locates and loads model weights. Set via `modelservice.uriProtocol` in your scenario or defaults.
+
+| Protocol | `modelArtifacts.uri` Generated | PVC Created | Download Job | Model Loading |
+|----------|-------------------------------|-------------|--------------|---------------|
+| `pvc` (default) | `pvc://<modelPvc.name>/<model.path>` | Yes | Yes (pre-download to PVC) | Served from PVC mount |
+| `hf` | `hf://<model.huggingfaceId>` | No | No | Downloaded at runtime by modelservice |
+
+### How it works
+
+**`pvc://` protocol (default):**
+
+1. Step 04 creates a PersistentVolumeClaim (`storage.modelPvc`)
+2. Step 04 launches a download Job (`04_download_job.yaml.j2`) that runs `hf download` to fetch the model from HuggingFace Hub into the PVC
+3. Step 04 waits for the download to complete
+4. Template 13 generates `modelArtifacts.uri: pvc://<pvc-name>/<model-path>`
+5. The modelservice Helm chart mounts the PVC and serves from it
+
+This is the recommended protocol for production — models are pre-cached and startup is fast.
+
+**`hf://` protocol:**
+
+1. Step 04 skips PVC creation and download job entirely
+2. Template 13 generates `modelArtifacts.uri: hf://<model.huggingfaceId>`
+3. The modelservice Helm chart downloads the model at pod startup time from HuggingFace Hub
+4. For gated models, `huggingface.secretName` is passed as `authSecretName` so the chart can authenticate
+
+This is useful for CI/CD (no PVC needed), quick testing, or when storage provisioning is unavailable.
+
+### Scenario example
+
+```yaml
+scenario:
+  - name: "my-hf-deploy"
+    model:
+      name: facebook/opt-125m
+      huggingfaceId: facebook/opt-125m
+    modelservice:
+      enabled: true
+      uriProtocol: hf     # No PVC, no download job — fetch at runtime
+```
+
 ## KV Transfer Configuration
 
 The `vllmCommon.kvTransfer` section controls the `--kv-transfer-config` argument passed to the `vllm serve` command. This is how vLLM knows which KV cache transfer connector to use and how to configure it.
diff --git a/llmdbenchmark/executor/command.py b/llmdbenchmark/executor/command.py
@@ -250,9 +250,17 @@ def helmfile(self, *args: str, use_kubeconfig: bool = True) -> CommandResult:
                 the stored context. Set to False for gateway provider
                 installs that need helmfile to resolve release namespaces
                 from the helmfile itself (e.g., istio-system), not from
-                the kubeconfig context namespace.
+                the kubeconfig context namespace. When False, the stored
+                kubeconfig path is exported as KUBECONFIG env var so helm
+                can still reach the cluster.
         """
-        parts = ["helmfile"]
+        parts = []
+        if not use_kubeconfig and self.kubeconfig:
+            # Export KUBECONFIG env var so helm/helmfile can find the
+            # cluster without injecting --kubeconfig (which would set
+            # the namespace context and break helmfile 'needs:' resolution).
+            parts.append(f"KUBECONFIG={self.kubeconfig}")
+        parts.append("helmfile")
         if use_kubeconfig:
             parts.extend(self._kubeconfig_args())
         parts.extend(args)
diff --git a/llmdbenchmark/run/steps/step_04_verify_model.py b/llmdbenchmark/run/steps/step_04_verify_model.py
@@ -4,7 +4,7 @@
 
 from llmdbenchmark.executor.step import Step, StepResult, Phase
 from llmdbenchmark.executor.context import ExecutionContext
-from llmdbenchmark.utilities.endpoint import test_model_serving
+from llmdbenchmark.utilities.endpoint import test_model_serving, cleanup_ephemeral_pods
 
 
 class VerifyModelStep(Step):
@@ -89,6 +89,10 @@ def execute(
             service_account=context.harness_service_account,
         )
 
+        # Clean up ephemeral smoketest/curl pods
+        if not context.dry_run:
+            cleanup_ephemeral_pods(cmd, namespace, context.logger)
+
         if error:
             return StepResult(
                 step_number=self.number,
diff --git a/llmdbenchmark/smoketests/base.py b/llmdbenchmark/smoketests/base.py
@@ -11,6 +11,7 @@
 from llmdbenchmark.utilities.endpoint import (
     _rand_suffix,
     _build_overrides,
+    _ephemeral_label_args,
     find_standalone_endpoint,
     find_gateway_endpoint,
     test_model_serving,
@@ -1022,6 +1023,7 @@ def _check_health(
                     "--restart=Never", "--namespace", namespace,
                     f"--image={curl_image}",
                 ]
+                + _ephemeral_label_args()
                 + override_args
                 + ["--command", "--", "sh", "-c", curl_cmd]
             )
@@ -1333,6 +1335,7 @@ def _curl_post(
                 "--restart=Never", "--namespace", namespace,
                 f"--image={curl_image}",
             ]
+            + _ephemeral_label_args()
             + override_args
             + ["--command", "--", "sh", "-c", curl_cmd]
         )
diff --git a/llmdbenchmark/smoketests/steps/step_01_inference_test.py b/llmdbenchmark/smoketests/steps/step_01_inference_test.py
@@ -5,6 +5,7 @@
 from llmdbenchmark.executor.step import Step, StepResult, Phase
 from llmdbenchmark.executor.context import ExecutionContext
 from llmdbenchmark.smoketests import get_validator
+from llmdbenchmark.utilities.endpoint import cleanup_ephemeral_pods
 
 
 class InferenceTestStep(Step):
@@ -36,6 +37,13 @@ def execute(
         validator = get_validator(stack_name)
         report = validator.run_inference_test(context, stack_path)
 
+        # Clean up ephemeral curl pods left behind by health + inference checks
+        if not context.dry_run:
+            namespace = context.harness_namespace or context.namespace
+            if namespace:
+                cmd = context.require_cmd()
+                cleanup_ephemeral_pods(cmd, namespace, context.logger)
+
         if report.passed:
             return StepResult(
                 step_number=self.number,
diff --git a/llmdbenchmark/standup/steps/step_10_smoketest.py b/llmdbenchmark/standup/steps/step_10_smoketest.py
@@ -12,6 +12,8 @@
 from llmdbenchmark.utilities.endpoint import (
     _rand_suffix,
     _build_overrides,
+    _ephemeral_label_args,
+    cleanup_ephemeral_pods,
     find_standalone_endpoint,
     find_gateway_endpoint,
     test_model_serving,
@@ -222,6 +224,10 @@ def execute(  # pylint: disable=too-many-branches,too-many-locals,too-many-state
                 else:
                     errors.extend(route_errors)
 
+        # Clean up any ephemeral curl pods left behind by smoketest checks
+        if not context.dry_run:
+            cleanup_ephemeral_pods(cmd, namespace, context.logger)
+
         if errors:
             for err in errors:
                 context.logger.log_error(f"Smoketest: {err}")
@@ -289,6 +295,7 @@ def _check_health(
                     "--restart=Never", "--namespace", namespace,
                     f"--image={curl_image}",
                 ]
+                + _ephemeral_label_args()
                 + override_args
                 + ["--command", "--", "sh", "-c", curl_cmd]
             )
diff --git a/llmdbenchmark/standup/steps/step_11_inference_test.py b/llmdbenchmark/standup/steps/step_11_inference_test.py
@@ -24,6 +24,7 @@
 from llmdbenchmark.utilities.endpoint import (
     _rand_suffix,
     _build_overrides,
+    _ephemeral_label_args,
     find_standalone_endpoint,
     find_gateway_endpoint,
 )
@@ -433,6 +434,7 @@ def _curl_post(
                 "--restart=Never", "--namespace", namespace,
                 f"--image={curl_image}",
             ]
+            + _ephemeral_label_args()
             + override_args
             + ["--command", "--", "sh", "-c", curl_cmd]
         )
diff --git a/llmdbenchmark/utilities/endpoint.py b/llmdbenchmark/utilities/endpoint.py
@@ -17,6 +17,10 @@ def _rand_suffix(length: int = 8) -> str:
     return "".join(random.choices(string.ascii_lowercase + string.digits, k=length))
 
 
+EPHEMERAL_POD_LABEL = "llm-d-benchmark/ephemeral=true"
+"""Label applied to all ephemeral curl/smoketest pods for cleanup."""
+
+
 def _build_overrides(plan_config: dict | None, service_account: str | None = None) -> list[str]:
     """Build --overrides args for ephemeral curl pods (imagePullSecrets, serviceAccount)."""
     overrides: dict = {}
@@ -26,7 +30,7 @@ def _build_overrides(plan_config: dict | None, service_account: str | None = Non
             overrides.setdefault("spec", {})["imagePullSecrets"] = [
                 {"name": pull_secret}
             ]
-            
+
     sa_name = service_account or (plan_config.get("serviceAccount", {}).get("name") if plan_config else None)
     if sa_name:
         overrides.setdefault("spec", {})["serviceAccountName"] = sa_name
@@ -36,6 +40,34 @@ def _build_overrides(plan_config: dict | None, service_account: str | None = Non
     return []
 
 
+def _ephemeral_label_args() -> list[str]:
+    """Return kubectl args to label ephemeral pods for cleanup."""
+    return [f"--labels={EPHEMERAL_POD_LABEL}"]
+
+
+def cleanup_ephemeral_pods(
+    cmd: CommandExecutor, namespace: str, logger=None,
+) -> None:
+    """Delete all completed ephemeral pods created by smoketest/endpoint checks.
+
+    Targets pods with the ``llm-d-benchmark/ephemeral=true`` label that are
+    in Succeeded or Failed phase.
+    """
+    for phase in ("Succeeded", "Failed"):
+        result = cmd.kube(
+            "delete", "pods",
+            f"-l", EPHEMERAL_POD_LABEL,
+            f"--field-selector=status.phase={phase}",
+            "--namespace", namespace,
+            check=False,
+        )
+        if result.success and result.stdout.strip() and "No resources" not in result.stdout:
+            if logger:
+                logger.log_info(
+                    f"Cleaned up ephemeral pods ({phase}) in ns/{namespace}"
+                )
+
+
 def find_standalone_endpoint(
     cmd: CommandExecutor, namespace: str, inference_port: int | str = 80
 ) -> tuple[str | None, str | None, str]:
@@ -450,6 +482,7 @@ def test_model_serving(
                 namespace,
                 f"--image={curl_image}",
             ]
+            + _ephemeral_label_args()
             + override_args
             + [
                 "--command",

Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,7 @@`
`11`	`11`	`from llmdbenchmark.utilities.endpoint import (`
`12`	`12`	`_rand_suffix,`
`13`	`13`	`_build_overrides,`
	`14`	`+ _ephemeral_label_args,`
`14`	`15`	`find_standalone_endpoint,`
`15`	`16`	`find_gateway_endpoint,`
`16`	`17`	`test_model_serving,`
`@@ -1022,6 +1023,7 @@ def _check_health(`
`1022`	`1023`	`"--restart=Never", "--namespace", namespace,`
`1023`	`1024`	`f"--image={curl_image}",`
`1024`	`1025`	`]`
	`1026`	`+ + _ephemeral_label_args()`
`1025`	`1027`	`+ override_args`
`1026`	`1028`	`+ ["--command", "--", "sh", "-c", curl_cmd]`
`1027`	`1029`	`)`
`@@ -1333,6 +1335,7 @@ def _curl_post(`
`1333`	`1335`	`"--restart=Never", "--namespace", namespace,`
`1334`	`1336`	`f"--image={curl_image}",`
`1335`	`1337`	`]`
	`1338`	`+ + _ephemeral_label_args()`
`1336`	`1339`	`+ override_args`
`1337`	`1340`	`+ ["--command", "--", "sh", "-c", curl_cmd]`
`1338`	`1341`	`)`
Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,7 @@`
`24`	`24`	`from llmdbenchmark.utilities.endpoint import (`
`25`	`25`	`_rand_suffix,`
`26`	`26`	`_build_overrides,`
	`27`	`+ _ephemeral_label_args,`
`27`	`28`	`find_standalone_endpoint,`
`28`	`29`	`find_gateway_endpoint,`
`29`	`30`	`)`
`@@ -433,6 +434,7 @@ def _curl_post(`
`433`	`434`	`"--restart=Never", "--namespace", namespace,`
`434`	`435`	`f"--image={curl_image}",`
`435`	`436`	`]`
	`437`	`+ + _ephemeral_label_args()`
`436`	`438`	`+ override_args`
`437`	`439`	`+ ["--command", "--", "sh", "-c", curl_cmd]`
`438`	`440`	`)`