Skip to content
2 changes: 1 addition & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def pytest_addoption(parser: Parser) -> None:
# Runtime options
runtime_group.addoption(
"--supported-accelerator-type",
default=os.environ.get("SUPPORTED_ACCLERATOR_TYPE"),
default=os.environ.get("SUPPORTED_ACCELERATOR_TYPE"),
help="Supported accelerator type : Nvidia,AMD,Gaudi",
)
runtime_group.addoption(
Expand Down
18 changes: 17 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def supported_accelerator_type(pytestconfig: pytest.Config) -> str | None:
if accelerator_type.lower() not in AcceleratorType.SUPPORTED_LISTS:
raise ValueError(
"accelerator type is not defined."
"Either pass with `--supported-accelerator-type` or set `SUPPORTED_ACCLERATOR_TYPE` environment variable"
"Either pass with `--supported-accelerator-type` or set `SUPPORTED_ACCELERATOR_TYPE` environment variable"
)
return accelerator_type

Expand Down Expand Up @@ -985,3 +985,19 @@ def oci_registry_route(admin_client: DynamicClient, oci_registry_service: Servic
def oci_registry_host(oci_registry_route: Route) -> str:
"""Get the OCI registry host from the route"""
return oci_registry_route.host


@pytest.fixture(scope="session")
def skip_if_no_supported_accelerator_type(supported_accelerator_type: str | None) -> None:
    """Skip test if no supported GPU accelerator is available."""
    # GPU accelerator types that can run vLLM.
    supported_gpu_accelerators = {AcceleratorType.NVIDIA, AcceleratorType.AMD, AcceleratorType.GAUDI}

    # Normalize first; an unset/empty value normalizes to None and always skips.
    normalized = supported_accelerator_type.lower() if supported_accelerator_type else None
    if normalized not in supported_gpu_accelerators:
        pytest.skip(
            f"Unsupported accelerator '{supported_accelerator_type}'. Expected one of {supported_gpu_accelerators}."
        )
140 changes: 139 additions & 1 deletion tests/model_explainability/lm_eval/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,30 @@
from kubernetes.dynamic import DynamicClient
from ocp_resources.data_science_cluster import DataScienceCluster
from ocp_resources.deployment import Deployment
from ocp_resources.inference_service import InferenceService
from ocp_resources.lm_eval_job import LMEvalJob
from ocp_resources.namespace import Namespace
from ocp_resources.persistent_volume_claim import PersistentVolumeClaim
from ocp_resources.pod import Pod
from ocp_resources.route import Route
from ocp_resources.secret import Secret
from ocp_resources.service import Service
from ocp_resources.serving_runtime import ServingRuntime
from pytest import Config, FixtureRequest

from tests.model_explainability.lm_eval.constants import (
ACCELERATOR_IDENTIFIER,
ARC_EASY_DATASET_IMAGE,
FLAN_T5_IMAGE,
LMEVAL_OCI_REPO,
LMEVAL_OCI_TAG,
)
from tests.model_explainability.lm_eval.utils import get_lmevaljob_pod
from utilities.constants import ApiGroups, Labels, MinIo, Protocols, Timeout
from utilities.constants import ApiGroups, KServeDeploymentType, Labels, MinIo, Protocols, RuntimeTemplates, Timeout
from utilities.exceptions import MissingParameter
from utilities.general import b64_encoded_string
from utilities.inference_utils import create_isvc
from utilities.serving_runtime import ServingRuntimeFromTemplate

VLLM_EMULATOR: str = "vllm-emulator"
VLLM_EMULATOR_PORT: int = 8000
Expand Down Expand Up @@ -542,6 +547,11 @@ def lmevaljob_s3_offline_pod(admin_client: DynamicClient, lmevaljob_s3_offline:
yield get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_s3_offline)


@pytest.fixture(scope="function")
def lmevaljob_gpu_pod(admin_client: DynamicClient, lmevaljob_gpu: LMEvalJob) -> Generator[Pod, Any, Any]:
    """Yield the pod spawned for the GPU LMEvalJob."""
    pod = get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_gpu)
    yield pod


@pytest.fixture(scope="function")
def lmeval_hf_access_token(
admin_client: DynamicClient,
Expand All @@ -564,3 +574,131 @@ def lmeval_hf_access_token(
wait_for_resource=True,
) as secret:
yield secret


# GPU-based vLLM fixtures for SmolLM-1.7B
@pytest.fixture(scope="function")
def lmeval_vllm_serving_runtime(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    vllm_runtime_image: str,
    supported_accelerator_type: str | None,
) -> Generator[ServingRuntime]:
    """vLLM ServingRuntime for GPU-based model deployment in LMEval tests.

    Skips instead of silently defaulting to CUDA when no accelerator type is
    configured, so runs on unconfigured/non-NVIDIA clusters fail fast with a
    clear message rather than a misleading runtime provisioning error.
    """
    # Map accelerator type to runtime template
    accelerator_to_template = {
        "nvidia": RuntimeTemplates.VLLM_CUDA,
        "amd": RuntimeTemplates.VLLM_ROCM,
        "gaudi": RuntimeTemplates.VLLM_GAUDI,
    }

    # The CLI option/env var may be unset (None); never fall back to "nvidia".
    if not supported_accelerator_type:
        pytest.skip("supported_accelerator_type is required for GPU-backed vLLM tests")

    accelerator_type = supported_accelerator_type.lower()
    template_name = accelerator_to_template.get(accelerator_type)

    if not template_name:
        pytest.skip(f"Unsupported accelerator type for vLLM: {accelerator_type}")

    with ServingRuntimeFromTemplate(
        client=admin_client,
        name="lmeval-vllm-runtime",
        namespace=model_namespace.name,
        template_name=template_name,
        deployment_type=KServeDeploymentType.RAW_DEPLOYMENT,
        runtime_image=vllm_runtime_image,
        support_tgis_open_ai_endpoints=True,
    ) as serving_runtime:
        yield serving_runtime


@pytest.fixture(scope="function")
def lmeval_vllm_inference_service(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    lmeval_vllm_serving_runtime: ServingRuntime,
    supported_accelerator_type: str | None,
) -> Generator[InferenceService]:
    """InferenceService for GPU-based model deployment in LMEval tests.

    Skips instead of silently defaulting to the NVIDIA GPU resource when the
    accelerator type is unset or unknown — requesting the wrong extended
    resource produces confusing scheduling failures on non-NVIDIA clusters.
    """
    model_path = "HuggingFaceTB/SmolLM-1.7B"
    model_name = "lmeval-model"

    # Fail fast; mirrors lmeval_vllm_serving_runtime instead of assuming "nvidia".
    if not supported_accelerator_type:
        pytest.skip("supported_accelerator_type is required for GPU-backed vLLM tests")

    # Get the correct GPU identifier based on accelerator type
    accelerator_type = supported_accelerator_type.lower()
    gpu_identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type)
    if not gpu_identifier:
        pytest.skip(f"No GPU resource identifier known for accelerator type: {accelerator_type}")

    resources = {
        "requests": {
            "cpu": "2",
            "memory": "8Gi",
            gpu_identifier: "1",
        },
        "limits": {
            "cpu": "3",
            "memory": "8Gi",
            gpu_identifier: "1",
        },
    }

    runtime_args = [
        f"--model={model_path}",
        "--dtype=float16",
        "--max-model-len=2048",
    ]

    # Allow online Hub access so the model can be pulled at startup.
    env_vars = [
        {"name": "HF_HUB_OFFLINE", "value": "0"},
        {"name": "HF_HUB_ENABLE_HF_TRANSFER", "value": "0"},
    ]

    with create_isvc(
        client=admin_client,
        name=model_name,
        namespace=model_namespace.name,
        runtime=lmeval_vllm_serving_runtime.name,
        model_format=lmeval_vllm_serving_runtime.instance.spec.supportedModelFormats[0].name,
        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
        resources=resources,
        argument=runtime_args,
        model_env_variables=env_vars,
        min_replicas=1,
    ) as inference_service:
        yield inference_service


@pytest.fixture(scope="function")
def lmevaljob_gpu(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    lmeval_vllm_inference_service: InferenceService,
) -> Generator[LMEvalJob]:
    """LMEvalJob for evaluating a GPU-deployed model via vLLM."""
    model_path = "HuggingFaceTB/SmolLM-1.7B"
    model_service = Service(
        name=f"{lmeval_vllm_inference_service.name}-predictor",
        namespace=lmeval_vllm_inference_service.namespace,
    )

    # Point the local-completions adapter at the predictor's in-cluster endpoint.
    completions_url = f"http://{model_service.name}.{model_namespace.name}.svc.cluster.local:80/v1/completions"
    model_args = [
        {"name": "model", "value": lmeval_vllm_inference_service.name},
        {"name": "base_url", "value": completions_url},
        {"name": "num_concurrent", "value": "1"},
        {"name": "max_retries", "value": "3"},
        {"name": "tokenized_requests", "value": "False"},
        {"name": "tokenizer", "value": model_path},
    ]

    with LMEvalJob(
        client=admin_client,
        namespace=model_namespace.name,
        name=LMEVALJOB_NAME,
        model="local-completions",
        task_list={"taskNames": ["arc_easy"]},
        log_samples=True,
        batch_size="1",
        allow_online=True,
        allow_code_execution=False,
        outputs={"pvcManaged": {"size": "5Gi"}},
        limit="0.01",
        model_args=model_args,
    ) as lmevaljob:
        yield lmevaljob
7 changes: 7 additions & 0 deletions tests/model_explainability/lm_eval/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,10 @@

LMEVAL_OCI_REPO = "lmeval/offline-oci"
LMEVAL_OCI_TAG = "v1"

# Accelerator identifier mapping for GPU types
# Maps lower-cased accelerator type names to the Kubernetes extended-resource
# names used in pod resource requests/limits.
ACCELERATOR_IDENTIFIER: dict[str, str] = {
    "nvidia": "nvidia.com/gpu",
    "amd": "amd.com/gpu",
    "gaudi": "habana.ai/gaudi",
}
39 changes: 38 additions & 1 deletion tests/model_explainability/lm_eval/test_lm_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
LMEVAL_OCI_REPO,
LMEVAL_OCI_TAG,
)
from tests.model_explainability.lm_eval.utils import get_lmeval_tasks, validate_lmeval_job_pod_and_logs
from tests.model_explainability.lm_eval.utils import (
get_lmeval_tasks,
validate_lmeval_job_pod_and_logs,
wait_for_vllm_model_ready,
)
from tests.model_explainability.utils import validate_tai_component_images
from utilities.constants import OCIRegistry
from utilities.registry_utils import pull_manifest_from_oci_registry
Expand Down Expand Up @@ -194,3 +198,36 @@ def test_lmeval_local_offline_unitxt_tasks_flan_20newsgroups_oci_artifacts(
LOGGER.info(f"Verifying artifact in OCI registry: {registry_url}/v2/{LMEVAL_OCI_REPO}/manifests/{LMEVAL_OCI_TAG}")
pull_manifest_from_oci_registry(registry_url=registry_url, repo=LMEVAL_OCI_REPO, tag=LMEVAL_OCI_TAG)
LOGGER.info("Manifest found in OCI registry")


@pytest.mark.gpu
@pytest.mark.skip_on_disconnected
@pytest.mark.parametrize(
    "model_namespace",
    [
        pytest.param(
            {"name": "test-lmeval-gpu"},
        )
    ],
    indirect=True,
)
@pytest.mark.usefixtures("patched_dsc_kserve_headed", "skip_if_no_supported_accelerator_type")
def test_lmeval_gpu(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    patched_dsc_lmeval_allow_all,  # fixture: presumably patches DSC to allow online/code eval — confirm in conftest
    lmeval_vllm_inference_service,  # fixture: GPU-backed vLLM InferenceService
    lmevaljob_gpu_pod,  # fixture: pod running the LMEvalJob against the vLLM service
):
    """Test LMEval with GPU-backed model deployment via vLLM.

    Verifies that LMEval can successfully evaluate a model deployed on GPU using vLLM runtime.
    The model is downloaded directly from HuggingFace Hub and evaluated using the arc_easy task.
    """
    # Block until the vLLM server reports ready — the LMEval job needs a live endpoint.
    wait_for_vllm_model_ready(
        client=admin_client,
        namespace=model_namespace.name,
        inference_service_name=lmeval_vllm_inference_service.name,
    )

    validate_lmeval_job_pod_and_logs(lmevaljob_pod=lmevaljob_gpu_pod)
85 changes: 83 additions & 2 deletions tests/model_explainability/lm_eval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,21 @@

import pandas as pd
import structlog
from kubernetes.client.rest import ApiException
from kubernetes.dynamic import DynamicClient
from ocp_resources.lm_eval_job import LMEvalJob
from ocp_resources.pod import Pod
from pyhelper_utils.general import tts
from timeout_sampler import TimeoutExpiredError
from timeout_sampler import TimeoutExpiredError, TimeoutSampler

from utilities.constants import Timeout
from utilities.exceptions import PodLogMissMatchError, UnexpectedFailureError
from utilities.exceptions import (
PodLogMissMatchError,
ResourceNotFoundError,
UnexpectedFailureError,
UnexpectedResourceCountError,
)
from utilities.general import collect_pod_information

LOGGER = structlog.get_logger(name=__name__)

Expand Down Expand Up @@ -106,3 +113,77 @@ def validate_lmeval_job_pod_and_logs(lmevaljob_pod: Pod) -> None:
raise UnexpectedFailureError("LMEval job pod failed from a running state.") from e
if not bool(re.search(pod_success_log_regex, lmevaljob_pod.log())):
raise PodLogMissMatchError("LMEval job pod failed.")


def wait_for_vllm_model_ready(
    client: DynamicClient,
    namespace: str,
    inference_service_name: str,
    max_wait_time: int = 600,
    check_interval: int = 10,
) -> Pod:
    """Wait for vLLM model to download and be ready to serve requests.

    Args:
        client: Kubernetes dynamic client
        namespace: Namespace where the inference service is deployed
        inference_service_name: Name of the inference service
        max_wait_time: Maximum time to wait in seconds
        check_interval: Time between checks in seconds

    Returns:
        The predictor pod once model is ready

    Raises:
        ResourceNotFoundError: If no predictor pod is found
        UnexpectedResourceCountError: If more than one predictor pod is found
        TimeoutExpiredError: If the model is not ready within max_wait_time
    """
    LOGGER.info("Waiting for vLLM model to download and load...")

    predictor_pods = list(
        Pod.get(
            dyn_client=client,
            namespace=namespace,
            label_selector=f"serving.kserve.io/inferenceservice={inference_service_name},component=predictor",
        )
    )

    if not predictor_pods:
        raise ResourceNotFoundError(f"No predictor pod found for inference service '{inference_service_name}'.")

    if len(predictor_pods) != 1:
        raise UnexpectedResourceCountError(
            f"Expected exactly 1 predictor pod for inference service '{inference_service_name}', "
            f"but found {len(predictor_pods)}: {[pod.name for pod in predictor_pods]}"
        )

    predictor_pod = predictor_pods[0]
    LOGGER.info(f"Predictor pod: {predictor_pod.name}")

    def _check_model_ready() -> bool:
        # vLLM emits these lines once the OpenAI-compatible server accepts requests.
        try:
            pod_logs = predictor_pod.log(container="kserve-container")
            if "Uvicorn running on" in pod_logs or "Application startup complete" in pod_logs:
                LOGGER.info("vLLM server is running and ready!")
                return True
            else:
                LOGGER.info("Model still loading..")
                return False
        except (ApiException, OSError) as e:
            # Logs may not be available until the container actually starts.
            LOGGER.info(f"Could not get pod logs yet: {e}")
            return False

    try:
        for sample in TimeoutSampler(
            wait_timeout=max_wait_time,
            sleep=check_interval,
            func=_check_model_ready,
        ):
            if sample:
                break
    except TimeoutExpiredError:
        LOGGER.error(f"vLLM pod failed to start within {max_wait_time} seconds")
        collect_pod_information(pod=predictor_pod)
        # Re-raise the original TimeoutExpiredError (review feedback) so callers
        # can distinguish a timeout from other unexpected failures.
        raise

    return predictor_pod
6 changes: 0 additions & 6 deletions tests/model_serving/model_runtime/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,6 @@
from syrupy.extensions.json import JSONSnapshotExtension


@pytest.fixture(scope="session")
def skip_if_no_supported_accelerator_type(supported_accelerator_type: str) -> None:
if not supported_accelerator_type:
pytest.skip("Accelerator type is not provided,vLLM test cannot be run on CPU")


@pytest.fixture
def response_snapshot(snapshot: Any) -> Any:
    """Snapshot fixture serializing responses as JSON via syrupy's JSONSnapshotExtension."""
    return snapshot.use_extension(extension_class=JSONSnapshotExtension)
4 changes: 4 additions & 0 deletions utilities/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,7 @@ class ExceptionUserLogin(Exception):

class UnexpectedValueError(Exception):
"""Unexpected value found"""


class ResourceNotFoundError(Exception):
    """Resource not found"""

    # NOTE(review): reviewer suggested importing ResourceNotFoundError from
    # kubernetes.dynamic.exceptions instead of defining this project-local
    # duplicate — removing it requires also updating the import in
    # tests/model_explainability/lm_eval/utils.py. Confirm before keeping.