Skip to content
2 changes: 1 addition & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def pytest_addoption(parser: Parser) -> None:
# Runtime options
runtime_group.addoption(
"--supported-accelerator-type",
default=os.environ.get("SUPPORTED_ACCLERATOR_TYPE"),
default=os.environ.get("SUPPORTED_ACCELERATOR_TYPE"),
help="Supported accelerator type : Nvidia,AMD,Gaudi",
)
runtime_group.addoption(
Expand Down
18 changes: 17 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def supported_accelerator_type(pytestconfig: pytest.Config) -> str | None:
if accelerator_type.lower() not in AcceleratorType.SUPPORTED_LISTS:
raise ValueError(
"accelerator type is not defined."
"Either pass with `--supported-accelerator-type` or set `SUPPORTED_ACCLERATOR_TYPE` environment variable"
"Either pass with `--supported-accelerator-type` or set `SUPPORTED_ACCELERATOR_TYPE` environment variable"
)
return accelerator_type

Expand Down Expand Up @@ -985,3 +985,19 @@ def oci_registry_route(admin_client: DynamicClient, oci_registry_service: Servic
def oci_registry_host(oci_registry_route: Route) -> str:
"""Get the OCI registry host from the route"""
return oci_registry_route.host


@pytest.fixture(scope="session")
def skip_if_no_supported_accelerator_type(supported_accelerator_type: str | None) -> None:
    """Skip test if no supported GPU accelerator is available."""
    # GPU accelerator types that can run vLLM.
    supported_gpu_accelerators = {AcceleratorType.NVIDIA, AcceleratorType.AMD, AcceleratorType.GAUDI}

    # Normalize first; an unset/empty value normalizes to None and always skips.
    normalized = supported_accelerator_type.lower() if supported_accelerator_type else None
    if normalized not in supported_gpu_accelerators:
        pytest.skip(
            f"Unsupported accelerator '{supported_accelerator_type}'. Expected one of {supported_gpu_accelerators}."
        )
140 changes: 139 additions & 1 deletion tests/model_explainability/lm_eval/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,30 @@
from kubernetes.dynamic import DynamicClient
from ocp_resources.data_science_cluster import DataScienceCluster
from ocp_resources.deployment import Deployment
from ocp_resources.inference_service import InferenceService
from ocp_resources.lm_eval_job import LMEvalJob
from ocp_resources.namespace import Namespace
from ocp_resources.persistent_volume_claim import PersistentVolumeClaim
from ocp_resources.pod import Pod
from ocp_resources.route import Route
from ocp_resources.secret import Secret
from ocp_resources.service import Service
from ocp_resources.serving_runtime import ServingRuntime
from pytest import Config, FixtureRequest

from tests.model_explainability.lm_eval.constants import (
ACCELERATOR_IDENTIFIER,
ARC_EASY_DATASET_IMAGE,
FLAN_T5_IMAGE,
LMEVAL_OCI_REPO,
LMEVAL_OCI_TAG,
)
from tests.model_explainability.lm_eval.utils import get_lmevaljob_pod
from utilities.constants import ApiGroups, Labels, MinIo, Protocols, Timeout
from utilities.constants import ApiGroups, KServeDeploymentType, Labels, MinIo, Protocols, RuntimeTemplates, Timeout
from utilities.exceptions import MissingParameter
from utilities.general import b64_encoded_string
from utilities.inference_utils import create_isvc
from utilities.serving_runtime import ServingRuntimeFromTemplate

VLLM_EMULATOR: str = "vllm-emulator"
VLLM_EMULATOR_PORT: int = 8000
Expand Down Expand Up @@ -542,6 +547,11 @@ def lmevaljob_s3_offline_pod(admin_client: DynamicClient, lmevaljob_s3_offline:
yield get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_s3_offline)


@pytest.fixture(scope="function")
def lmevaljob_gpu_pod(admin_client: DynamicClient, lmevaljob_gpu: LMEvalJob) -> Generator[Pod, Any, Any]:
    """Yield the pod spawned for the GPU LMEvalJob."""
    pod = get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_gpu)
    yield pod


@pytest.fixture(scope="function")
def lmeval_hf_access_token(
admin_client: DynamicClient,
Expand All @@ -564,3 +574,131 @@ def lmeval_hf_access_token(
wait_for_resource=True,
) as secret:
yield secret


# GPU-based vLLM fixtures for SmolLM-1.7B
@pytest.fixture(scope="function")
def lmeval_vllm_serving_runtime(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    vllm_runtime_image: str,
    supported_accelerator_type: str | None,
) -> Generator[ServingRuntime]:
    """vLLM ServingRuntime for GPU-based model deployment in LMEval tests.

    Skips instead of silently defaulting to CUDA when no accelerator type is
    configured, so runs on unconfigured/non-NVIDIA clusters fail fast with a
    clear message rather than a misleading runtime provisioning error.
    """
    # Map accelerator type to runtime template
    accelerator_to_template = {
        "nvidia": RuntimeTemplates.VLLM_CUDA,
        "amd": RuntimeTemplates.VLLM_ROCM,
        "gaudi": RuntimeTemplates.VLLM_GAUDI,
    }

    # The CLI option/env var may be unset (None); never fall back to "nvidia".
    if not supported_accelerator_type:
        pytest.skip("supported_accelerator_type is required for GPU-backed vLLM tests")

    accelerator_type = supported_accelerator_type.lower()
    template_name = accelerator_to_template.get(accelerator_type)

    if not template_name:
        pytest.skip(f"Unsupported accelerator type for vLLM: {accelerator_type}")

    with ServingRuntimeFromTemplate(
        client=admin_client,
        name="lmeval-vllm-runtime",
        namespace=model_namespace.name,
        template_name=template_name,
        deployment_type=KServeDeploymentType.RAW_DEPLOYMENT,
        runtime_image=vllm_runtime_image,
        support_tgis_open_ai_endpoints=True,
    ) as serving_runtime:
        yield serving_runtime


@pytest.fixture(scope="function")
def lmeval_vllm_inference_service(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    lmeval_vllm_serving_runtime: ServingRuntime,
    supported_accelerator_type: str | None,
) -> Generator[InferenceService]:
    """InferenceService for GPU-based model deployment in LMEval tests.

    Skips instead of silently defaulting to the NVIDIA GPU resource when the
    accelerator type is unset or unknown — requesting the wrong extended
    resource produces confusing scheduling failures on non-NVIDIA clusters.
    """
    model_path = "HuggingFaceTB/SmolLM-1.7B"
    model_name = "lmeval-model"

    # Fail fast; mirrors lmeval_vllm_serving_runtime instead of assuming "nvidia".
    if not supported_accelerator_type:
        pytest.skip("supported_accelerator_type is required for GPU-backed vLLM tests")

    # Get the correct GPU identifier based on accelerator type
    accelerator_type = supported_accelerator_type.lower()
    gpu_identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type)
    if not gpu_identifier:
        pytest.skip(f"No GPU resource identifier known for accelerator type: {accelerator_type}")

    resources = {
        "requests": {
            "cpu": "2",
            "memory": "8Gi",
            gpu_identifier: "1",
        },
        "limits": {
            "cpu": "3",
            "memory": "8Gi",
            gpu_identifier: "1",
        },
    }

    runtime_args = [
        f"--model={model_path}",
        "--dtype=float16",
        "--max-model-len=2048",
    ]

    # Allow online Hub access so the model can be pulled at startup.
    env_vars = [
        {"name": "HF_HUB_OFFLINE", "value": "0"},
        {"name": "HF_HUB_ENABLE_HF_TRANSFER", "value": "0"},
    ]

    with create_isvc(
        client=admin_client,
        name=model_name,
        namespace=model_namespace.name,
        runtime=lmeval_vllm_serving_runtime.name,
        model_format=lmeval_vllm_serving_runtime.instance.spec.supportedModelFormats[0].name,
        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
        resources=resources,
        argument=runtime_args,
        model_env_variables=env_vars,
        min_replicas=1,
    ) as inference_service:
        yield inference_service


@pytest.fixture(scope="function")
def lmevaljob_gpu(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    lmeval_vllm_inference_service: InferenceService,
) -> Generator[LMEvalJob]:
    """LMEvalJob for evaluating a GPU-deployed model via vLLM."""
    model_path = "HuggingFaceTB/SmolLM-1.7B"
    model_service = Service(
        name=f"{lmeval_vllm_inference_service.name}-predictor",
        namespace=lmeval_vllm_inference_service.namespace,
    )

    # Point the local-completions adapter at the predictor's in-cluster endpoint.
    completions_url = f"http://{model_service.name}.{model_namespace.name}.svc.cluster.local:80/v1/completions"
    model_args = [
        {"name": "model", "value": lmeval_vllm_inference_service.name},
        {"name": "base_url", "value": completions_url},
        {"name": "num_concurrent", "value": "1"},
        {"name": "max_retries", "value": "3"},
        {"name": "tokenized_requests", "value": "False"},
        {"name": "tokenizer", "value": model_path},
    ]

    with LMEvalJob(
        client=admin_client,
        namespace=model_namespace.name,
        name=LMEVALJOB_NAME,
        model="local-completions",
        task_list={"taskNames": ["arc_easy"]},
        log_samples=True,
        batch_size="1",
        allow_online=True,
        allow_code_execution=False,
        outputs={"pvcManaged": {"size": "5Gi"}},
        limit="0.01",
        model_args=model_args,
    ) as lmevaljob:
        yield lmevaljob
7 changes: 7 additions & 0 deletions tests/model_explainability/lm_eval/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,10 @@

LMEVAL_OCI_REPO = "lmeval/offline-oci"
LMEVAL_OCI_TAG = "v1"

# Accelerator identifier mapping for GPU types
# Maps lower-cased accelerator type names to the Kubernetes extended-resource
# names used in pod resource requests/limits.
ACCELERATOR_IDENTIFIER: dict[str, str] = {
    "nvidia": "nvidia.com/gpu",
    "amd": "amd.com/gpu",
    "gaudi": "habana.ai/gaudi",
}
39 changes: 38 additions & 1 deletion tests/model_explainability/lm_eval/test_lm_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
LMEVAL_OCI_REPO,
LMEVAL_OCI_TAG,
)
from tests.model_explainability.lm_eval.utils import get_lmeval_tasks, validate_lmeval_job_pod_and_logs
from tests.model_explainability.lm_eval.utils import (
get_lmeval_tasks,
validate_lmeval_job_pod_and_logs,
wait_for_vllm_model_ready,
)
from tests.model_explainability.utils import validate_tai_component_images
from utilities.constants import OCIRegistry
from utilities.registry_utils import pull_manifest_from_oci_registry
Expand Down Expand Up @@ -194,3 +198,36 @@ def test_lmeval_local_offline_unitxt_tasks_flan_20newsgroups_oci_artifacts(
LOGGER.info(f"Verifying artifact in OCI registry: {registry_url}/v2/{LMEVAL_OCI_REPO}/manifests/{LMEVAL_OCI_TAG}")
pull_manifest_from_oci_registry(registry_url=registry_url, repo=LMEVAL_OCI_REPO, tag=LMEVAL_OCI_TAG)
LOGGER.info("Manifest found in OCI registry")


@pytest.mark.gpu
@pytest.mark.skip_on_disconnected
@pytest.mark.parametrize(
    "model_namespace",
    [
        pytest.param(
            {"name": "test-lmeval-gpu"},
        )
    ],
    indirect=True,
)
@pytest.mark.usefixtures("patched_dsc_kserve_headed", "skip_if_no_supported_accelerator_type")
def test_lmeval_gpu(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    patched_dsc_lmeval_allow_all,  # fixture: presumably patches DSC to allow online/code eval — confirm in conftest
    lmeval_vllm_inference_service,  # fixture: GPU-backed vLLM InferenceService
    lmevaljob_gpu_pod,  # fixture: pod running the LMEvalJob against the vLLM service
):
    """Test LMEval with GPU-backed model deployment via vLLM.

    Verifies that LMEval can successfully evaluate a model deployed on GPU using vLLM runtime.
    The model is downloaded directly from HuggingFace Hub and evaluated using the arc_easy task.
    """
    # Block until the vLLM server reports ready — the LMEval job needs a live endpoint.
    wait_for_vllm_model_ready(
        client=admin_client,
        namespace=model_namespace.name,
        inference_service_name=lmeval_vllm_inference_service.name,
    )

    validate_lmeval_job_pod_and_logs(lmevaljob_pod=lmevaljob_gpu_pod)
85 changes: 83 additions & 2 deletions tests/model_explainability/lm_eval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,21 @@

import pandas as pd
import structlog
from kubernetes.client.rest import ApiException
from kubernetes.dynamic import DynamicClient
from ocp_resources.lm_eval_job import LMEvalJob
from ocp_resources.pod import Pod
from pyhelper_utils.general import tts
from timeout_sampler import TimeoutExpiredError
from timeout_sampler import TimeoutExpiredError, TimeoutSampler

from utilities.constants import Timeout
from utilities.exceptions import PodLogMissMatchError, UnexpectedFailureError
from utilities.exceptions import (
PodLogMissMatchError,
ResourceNotFoundError,
UnexpectedFailureError,
UnexpectedResourceCountError,
)
from utilities.general import collect_pod_information

LOGGER = structlog.get_logger(name=__name__)

Expand Down Expand Up @@ -106,3 +113,77 @@ def validate_lmeval_job_pod_and_logs(lmevaljob_pod: Pod) -> None:
raise UnexpectedFailureError("LMEval job pod failed from a running state.") from e
if not bool(re.search(pod_success_log_regex, lmevaljob_pod.log())):
raise PodLogMissMatchError("LMEval job pod failed.")


def wait_for_vllm_model_ready(
    client: DynamicClient,
    namespace: str,
    inference_service_name: str,
    max_wait_time: int = 600,
    check_interval: int = 10,
) -> Pod:
    """Wait for vLLM model to download and be ready to serve requests.

    Args:
        client: Kubernetes dynamic client
        namespace: Namespace where the inference service is deployed
        inference_service_name: Name of the inference service
        max_wait_time: Maximum time to wait in seconds
        check_interval: Time between checks in seconds

    Returns:
        The predictor pod once model is ready

    Raises:
        ResourceNotFoundError: If no predictor pod is found
        UnexpectedResourceCountError: If more than one predictor pod is found
        TimeoutExpiredError: If the model is not ready within max_wait_time
    """
    LOGGER.info("Waiting for vLLM model to download and load...")

    predictor_pods = list(
        Pod.get(
            dyn_client=client,
            namespace=namespace,
            label_selector=f"serving.kserve.io/inferenceservice={inference_service_name},component=predictor",
        )
    )

    if not predictor_pods:
        raise ResourceNotFoundError(f"No predictor pod found for inference service '{inference_service_name}'.")

    if len(predictor_pods) != 1:
        raise UnexpectedResourceCountError(
            f"Expected exactly 1 predictor pod for inference service '{inference_service_name}', "
            f"but found {len(predictor_pods)}: {[pod.name for pod in predictor_pods]}"
        )

    predictor_pod = predictor_pods[0]
    LOGGER.info(f"Predictor pod: {predictor_pod.name}")

    def _check_model_ready() -> bool:
        # vLLM emits these lines once the OpenAI-compatible server accepts requests.
        try:
            pod_logs = predictor_pod.log(container="kserve-container")
            if "Uvicorn running on" in pod_logs or "Application startup complete" in pod_logs:
                LOGGER.info("vLLM server is running and ready!")
                return True
            else:
                LOGGER.info("Model still loading..")
                return False
        except (ApiException, OSError) as e:
            # Logs may not be available until the container actually starts.
            LOGGER.info(f"Could not get pod logs yet: {e}")
            return False

    try:
        for sample in TimeoutSampler(
            wait_timeout=max_wait_time,
            sleep=check_interval,
            func=_check_model_ready,
        ):
            if sample:
                break
    except TimeoutExpiredError:
        LOGGER.error(f"vLLM pod failed to start within {max_wait_time} seconds")
        collect_pod_information(pod=predictor_pod)
        # Re-raise the original TimeoutExpiredError (review feedback) so callers
        # can distinguish a timeout from other unexpected failures.
        raise

    return predictor_pod
6 changes: 0 additions & 6 deletions tests/model_serving/model_runtime/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,6 @@
from syrupy.extensions.json import JSONSnapshotExtension


@pytest.fixture(scope="session")
def skip_if_no_supported_accelerator_type(supported_accelerator_type: str) -> None:
if not supported_accelerator_type:
pytest.skip("Accelerator type is not provided,vLLM test cannot be run on CPU")


@pytest.fixture
def response_snapshot(snapshot: Any) -> Any:
    """Snapshot fixture serializing responses as JSON via syrupy's JSONSnapshotExtension."""
    return snapshot.use_extension(extension_class=JSONSnapshotExtension)
4 changes: 4 additions & 0 deletions utilities/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,7 @@ class ExceptionUserLogin(Exception):

class UnexpectedValueError(Exception):
"""Unexpected value found"""


class ResourceNotFoundError(Exception):
    """Resource not found"""

    # NOTE(review): reviewer suggested importing ResourceNotFoundError from
    # kubernetes.dynamic.exceptions instead of defining this project-local
    # duplicate — removing it requires also updating the import in
    # tests/model_explainability/lm_eval/utils.py. Confirm before keeping.