From 36e09f100bfa121bcd7bf78d5a76cf9f47847619 Mon Sep 17 00:00:00 2001
From: ssaleem-rh <ssaleem@redhat.com>
Date: Mon, 16 Mar 2026 12:46:57 +0000
Subject: [PATCH 1/8] =?UTF-8?q?Fixed=20typo=20in=20error=20message:=20SUPP?=
 =?UTF-8?q?ORTED=5FACCLERATOR=5FTYPE=20=E2=86=92=20SUPPORTED=5FACCELERATOR?=
 =?UTF-8?q?=5FTYPE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED

Signed-off-by: Shehan Saleem <ssaleem@redhat.com>
---
 tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 63ff3cac6..59bf7056c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -299,7 +299,7 @@ def supported_accelerator_type(pytestconfig: pytest.Config) -> str | None:
     if accelerator_type.lower() not in AcceleratorType.SUPPORTED_LISTS:
         raise ValueError(
             "accelerator type is not defined."
-            "Either pass with `--supported-accelerator-type` or set `SUPPORTED_ACCLERATOR_TYPE` environment variable"
+            "Either pass with `--supported-accelerator-type` or set `SUPPORTED_ACCELERATOR_TYPE` environment variable"
         )
     return accelerator_type
 

From 66b3b8b7a2ec7a1fbb9884b2fcbe61d275ddbd0d Mon Sep 17 00:00:00 2001
From: ssaleem-rh <ssaleem@redhat.com>
Date: Mon, 16 Mar 2026 16:37:56 +0000
Subject: [PATCH 2/8] test(lmeval): add GPU testing support with vLLM

Add test_lmeval_gpu to verify LMEval works with GPU-backed
model deployments via vLLM runtime. Includes:
- New test for GPU model evaluation with SmolLM-1.7B
- wait_for_vllm_model_ready utility for model readiness checks
- GPU-specific fixtures: ServingRuntime, InferenceService, LMEvalJob, and pod
- skip_if_no_supported_accelerator_type fixture that skips the test when no
  accelerator type is configured

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED

Signed-off-by: Shehan Saleem <ssaleem@redhat.com>
---
 .../model_explainability/lm_eval/conftest.py  | 134 +++++++++++++++++-
 .../lm_eval/test_lm_eval.py                   |  38 ++++-
 tests/model_explainability/lm_eval/utils.py   |  74 ++++++++++
 3 files changed, 244 insertions(+), 2 deletions(-)

diff --git a/tests/model_explainability/lm_eval/conftest.py b/tests/model_explainability/lm_eval/conftest.py
index 56c5bf5b5..5eb4a8f37 100644
--- a/tests/model_explainability/lm_eval/conftest.py
+++ b/tests/model_explainability/lm_eval/conftest.py
@@ -6,6 +6,7 @@
 from kubernetes.dynamic import DynamicClient
 from ocp_resources.data_science_cluster import DataScienceCluster
 from ocp_resources.deployment import Deployment
+from ocp_resources.inference_service import InferenceService
 from ocp_resources.lm_eval_job import LMEvalJob
 from ocp_resources.namespace import Namespace
 from ocp_resources.persistent_volume_claim import PersistentVolumeClaim
@@ -13,6 +14,7 @@
 from ocp_resources.route import Route
 from ocp_resources.secret import Secret
 from ocp_resources.service import Service
+from ocp_resources.serving_runtime import ServingRuntime
 from pytest import Config, FixtureRequest
 
 from tests.model_explainability.lm_eval.constants import (
@@ -22,9 +24,12 @@
     LMEVAL_OCI_TAG,
 )
 from tests.model_explainability.lm_eval.utils import get_lmevaljob_pod
-from utilities.constants import ApiGroups, Labels, MinIo, Protocols, Timeout
+from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER
+from utilities.constants import ApiGroups, KServeDeploymentType, Labels, MinIo, Protocols, RuntimeTemplates, Timeout
 from utilities.exceptions import MissingParameter
 from utilities.general import b64_encoded_string
+from utilities.inference_utils import create_isvc
+from utilities.serving_runtime import ServingRuntimeFromTemplate
 
 VLLM_EMULATOR: str = "vllm-emulator"
 VLLM_EMULATOR_PORT: int = 8000
@@ -542,6 +547,11 @@ def lmevaljob_s3_offline_pod(admin_client: DynamicClient, lmevaljob_s3_offline:
     yield get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_s3_offline)
 
 
+@pytest.fixture(scope="function")
+def lmevaljob_gpu_pod(admin_client: DynamicClient, lmevaljob_gpu: LMEvalJob) -> Generator[Pod, Any, Any]:
+    yield get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_gpu)
+
+
 @pytest.fixture(scope="function")
 def lmeval_hf_access_token(
     admin_client: DynamicClient,
@@ -564,3 +574,125 @@ def lmeval_hf_access_token(
         wait_for_resource=True,
     ) as secret:
         yield secret
+
+
+# GPU-based vLLM fixtures for SmolLM-1.7B
+
+@pytest.fixture(scope="session")
+def skip_if_no_supported_accelerator_type(supported_accelerator_type: str) -> None:
+    """Skip test if no GPU accelerator is available."""
+    if not supported_accelerator_type:
+        pytest.skip("Accelerator type is not provided, GPU test cannot be run on CPU")
+
+
+@pytest.fixture(scope="function")
+def lmeval_vllm_serving_runtime(
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    vllm_runtime_image: str,
+) -> Generator[ServingRuntime]:
+    """vLLM ServingRuntime for GPU-based model deployment in LMEval tests."""
+    with ServingRuntimeFromTemplate(
+        client=admin_client,
+        name="lmeval-vllm-runtime",
+        namespace=model_namespace.name,
+        template_name=RuntimeTemplates.VLLM_CUDA,
+        deployment_type=KServeDeploymentType.RAW_DEPLOYMENT,
+        runtime_image=vllm_runtime_image,
+        support_tgis_open_ai_endpoints=True,
+    ) as serving_runtime:
+        yield serving_runtime
+
+
+@pytest.fixture(scope="function")
+def lmeval_vllm_inference_service(
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    lmeval_vllm_serving_runtime: ServingRuntime,
+    supported_accelerator_type: str,
+) -> Generator[InferenceService]:
+    """InferenceService for GPU-based model deployment in LMEval tests."""
+    model_path = "HuggingFaceTB/SmolLM-1.7B"
+    model_name = "lmeval-model"
+
+    # Get the correct GPU identifier based on accelerator type
+    accelerator_type = supported_accelerator_type.lower()
+    gpu_identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, Labels.Nvidia.NVIDIA_COM_GPU)
+
+    resources = {
+        "requests": {
+            "cpu": "2",
+            "memory": "8Gi",
+            gpu_identifier: "1",
+        },
+        "limits": {
+            "cpu": "3",
+            "memory": "8Gi",
+            gpu_identifier: "1",
+        },
+    }
+
+    runtime_args = [
+        f"--model={model_path}",
+        "--dtype=float16",
+        "--max-model-len=2048",
+    ]
+
+    env_vars = [
+        {"name": "HF_HUB_OFFLINE", "value": "0"},
+        {"name": "HF_HUB_ENABLE_HF_TRANSFER", "value": "0"},
+    ]
+
+    with create_isvc(
+        client=admin_client,
+        name=model_name,
+        namespace=model_namespace.name,
+        runtime=lmeval_vllm_serving_runtime.name,
+        model_format=lmeval_vllm_serving_runtime.instance.spec.supportedModelFormats[0].name,
+        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
+        resources=resources,
+        argument=runtime_args,
+        model_env_variables=env_vars,
+        min_replicas=1,
+    ) as inference_service:
+        yield inference_service
+
+
+@pytest.fixture(scope="function")
+def lmevaljob_gpu(
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    lmeval_vllm_inference_service: InferenceService,
+) -> Generator[LMEvalJob]:
+    """LMEvalJob for evaluating a GPU-deployed model via vLLM."""
+    model_path = "HuggingFaceTB/SmolLM-1.7B"
+    model_service = Service(
+        name=f"{lmeval_vllm_inference_service.name}-predictor",
+        namespace=lmeval_vllm_inference_service.namespace,
+    )
+
+    with LMEvalJob(
+        client=admin_client,
+        namespace=model_namespace.name,
+        name=LMEVALJOB_NAME,
+        model="local-completions",
+        task_list={"taskNames": ["arc_easy"]},
+        log_samples=True,
+        batch_size="1",
+        allow_online=True,
+        allow_code_execution=False,
+        outputs={"pvcManaged": {"size": "5Gi"}},
+        limit="0.01",
+        model_args=[
+            {"name": "model", "value": lmeval_vllm_inference_service.name},
+            {
+                "name": "base_url",
+                "value": f"http://{model_service.name}.{model_namespace.name}.svc.cluster.local:80/v1/completions",
+            },
+            {"name": "num_concurrent", "value": "1"},
+            {"name": "max_retries", "value": "3"},
+            {"name": "tokenized_requests", "value": "False"},
+            {"name": "tokenizer", "value": model_path},
+        ],
+    ) as lmevaljob:
+        yield lmevaljob
diff --git a/tests/model_explainability/lm_eval/test_lm_eval.py b/tests/model_explainability/lm_eval/test_lm_eval.py
index 4a03ea416..a36a9fe09 100644
--- a/tests/model_explainability/lm_eval/test_lm_eval.py
+++ b/tests/model_explainability/lm_eval/test_lm_eval.py
@@ -11,7 +11,11 @@
     LMEVAL_OCI_REPO,
     LMEVAL_OCI_TAG,
 )
-from tests.model_explainability.lm_eval.utils import get_lmeval_tasks, validate_lmeval_job_pod_and_logs
+from tests.model_explainability.lm_eval.utils import (
+    get_lmeval_tasks,
+    validate_lmeval_job_pod_and_logs,
+    wait_for_vllm_model_ready,
+)
 from tests.model_explainability.utils import validate_tai_component_images
 from utilities.constants import OCIRegistry
 from utilities.registry_utils import pull_manifest_from_oci_registry
@@ -194,3 +198,35 @@ def test_lmeval_local_offline_unitxt_tasks_flan_20newsgroups_oci_artifacts(
     LOGGER.info(f"Verifying artifact in OCI registry: {registry_url}/v2/{LMEVAL_OCI_REPO}/manifests/{LMEVAL_OCI_TAG}")
     pull_manifest_from_oci_registry(registry_url=registry_url, repo=LMEVAL_OCI_REPO, tag=LMEVAL_OCI_TAG)
     LOGGER.info("Manifest found in OCI registry")
+
+
+@pytest.mark.gpu
+@pytest.mark.parametrize(
+    "model_namespace",
+    [
+        pytest.param(
+            {"name": "test-lmeval-gpu"},
+        )
+    ],
+    indirect=True,
+)
+@pytest.mark.usefixtures("patched_dsc_kserve_headed", "skip_if_no_supported_accelerator_type")
+def test_lmeval_gpu(
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    patched_dsc_lmeval_allow_all,
+    lmeval_vllm_inference_service,
+    lmevaljob_gpu_pod,
+):
+    """Test LMEval with GPU-backed model deployment via vLLM.
+
+    Verifies that LMEval can successfully evaluate a model deployed on GPU using vLLM runtime.
+    The model is downloaded directly from HuggingFace Hub and evaluated using the arc_easy task.
+    """
+    wait_for_vllm_model_ready(
+        client=admin_client,
+        namespace=model_namespace.name,
+        inference_service_name=lmeval_vllm_inference_service.name,
+    )
+
+    validate_lmeval_job_pod_and_logs(lmevaljob_pod=lmevaljob_gpu_pod)
diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py
index 33c587734..f9aa1e0fb 100644
--- a/tests/model_explainability/lm_eval/utils.py
+++ b/tests/model_explainability/lm_eval/utils.py
@@ -1,4 +1,5 @@
 import re
+import time
 
 import pandas as pd
 from kubernetes.dynamic import DynamicClient
@@ -106,3 +107,76 @@ def validate_lmeval_job_pod_and_logs(lmevaljob_pod: Pod) -> None:
         raise UnexpectedFailureError("LMEval job pod failed from a running state.") from e
     if not bool(re.search(pod_success_log_regex, lmevaljob_pod.log())):
         raise PodLogMissMatchError("LMEval job pod failed.")
+
+
+def wait_for_vllm_model_ready(
+    client: DynamicClient,
+    namespace: str,
+    inference_service_name: str,
+    max_wait_time: int = 600,
+    check_interval: int = 10,
+    stabilization_wait: int = 10,
+) -> Pod:
+    """Wait for vLLM model to download and be ready to serve requests.
+
+    Args:
+        client: Kubernetes dynamic client
+        namespace: Namespace where the inference service is deployed
+        inference_service_name: Name of the inference service
+        max_wait_time: Maximum time to wait in seconds
+        check_interval: Time between checks in seconds
+        stabilization_wait: Seconds to wait after model is ready for server stabilization
+
+    Returns:
+        The predictor pod once model is ready
+
+    Raises:
+        UnexpectedFailureError: If model fails to load or pod encounters errors
+    """
+    LOGGER.info("Waiting for vLLM model to download and load...")
+
+    predictor_pods = list(
+        Pod.get(
+            dyn_client=client,
+            namespace=namespace,
+            label_selector=f"serving.kserve.io/inferenceservice={inference_service_name}",
+        )
+    )
+
+    if not predictor_pods:
+        raise UnexpectedFailureError("No predictor pod found for inference service")
+
+    predictor_pod = predictor_pods[0]
+    LOGGER.info(f"Predictor pod: {predictor_pod.name}")
+
+    elapsed_time = 0
+    model_loaded = False
+
+    while elapsed_time < max_wait_time:
+        try:
+            pod_logs = predictor_pod.log(container="kserve-container")
+
+            if "Uvicorn running on" in pod_logs or "Application startup complete" in pod_logs:
+                LOGGER.info("vLLM server is running and ready!")
+                model_loaded = True
+                break
+            else:
+                LOGGER.info(f"Model still loading... (waited {elapsed_time}s)")
+        except Exception as e:
+            LOGGER.info(f"Could not get pod logs yet: {e}")
+
+        time.sleep(check_interval)
+        elapsed_time += check_interval
+
+    if not model_loaded:
+        try:
+            full_logs = predictor_pod.log(container="kserve-container")
+            LOGGER.error(f"vLLM pod failed to start within {max_wait_time}s. Full logs:\n{full_logs}")
+        except Exception as e:
+            LOGGER.error(f"Could not retrieve pod logs: {e}")
+        raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds")
+
+    LOGGER.info(f"Model loaded! Waiting {stabilization_wait} more seconds for server stabilization.")
+    time.sleep(stabilization_wait)
+
+    return predictor_pod

From 8f07c6a25e247ef5b66b138fe0a13a4c6bf51b1f Mon Sep 17 00:00:00 2001
From: ssaleem-rh <ssaleem@redhat.com>
Date: Tue, 17 Mar 2026 08:09:06 +0000
Subject: [PATCH 3/8] =?UTF-8?q?Fixed=20typo=20in=20SUPPORTED=5FACCELERATOR?=
 =?UTF-8?q?=5FTYPE=20environment=20variable:=20=20SUPPORTED=5FACCLERATOR?=
 =?UTF-8?q?=5FTYPE=20=E2=86=92=20SUPPORTED=5FACCELERATOR=5FTYPE=20in=20run?=
 =?UTF-8?q?time=20option.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED

Signed-off-by: Shehan Saleem <ssaleem@redhat.com>
---
 conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conftest.py b/conftest.py
index ebd05ff36..6bc851e95 100644
--- a/conftest.py
+++ b/conftest.py
@@ -104,7 +104,7 @@ def pytest_addoption(parser: Parser) -> None:
     # Runtime options
     runtime_group.addoption(
         "--supported-accelerator-type",
-        default=os.environ.get("SUPPORTED_ACCLERATOR_TYPE"),
+        default=os.environ.get("SUPPORTED_ACCELERATOR_TYPE"),
         help="Supported accelerator type : Nvidia,AMD,Gaudi",
     )
     runtime_group.addoption(

From 368ce13044ff3457a3eb4557de46cc1921458ad0 Mon Sep 17 00:00:00 2001
From: Shehan Saleem <ssaleem@redhat.com>
Date: Mon, 23 Mar 2026 11:17:31 +0000
Subject: [PATCH 4/8] refactor: centralize
 skip_if_no_supported_accelerator_type fixture

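- Relocate skip_if_no_supported_accelerator_type fixture to tests/conftest.py for reuse across test modules,
  removing the copies in lm_eval/conftest.py and model_runtime/conftest.py
- Define ACCELERATOR_IDENTIFIER in lm_eval/constants.py and import it from there instead of
  tests.model_serving.model_runtime.vllm.constant
- Relax type annotation of supported_accelerator_type in lmeval_vllm_inference_service to str | None,
  defaulting to "nvidia" when it is unset
- Filter the pods in wait_for_vllm_model_ready to those whose name contains "predictor"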

Signed-off-by: ssaleem-rh <ssaleem@redhat.com>

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED

Signed-off-by: Shehan Saleem <ssaleem@redhat.com>
---
 tests/conftest.py                               |  7 +++++++
 tests/model_explainability/lm_eval/conftest.py  | 12 +++---------
 tests/model_explainability/lm_eval/constants.py |  7 +++++++
 tests/model_explainability/lm_eval/utils.py     |  5 +++++
 tests/model_serving/model_runtime/conftest.py   |  6 ------
 5 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 59bf7056c..8dcd905dc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -985,3 +985,10 @@ def oci_registry_route(admin_client: DynamicClient, oci_registry_service: Servic
 def oci_registry_host(oci_registry_route: Route) -> str:
     """Get the OCI registry host from the route"""
     return oci_registry_route.host
+
+
+@pytest.fixture(scope="session")
+def skip_if_no_supported_accelerator_type(supported_accelerator_type: str | None) -> None:
+    """Skip test if no GPU accelerator is available."""
+    if not supported_accelerator_type:
+        pytest.skip("Accelerator type is not provided, vLLM test cannot be run on CPU")
diff --git a/tests/model_explainability/lm_eval/conftest.py b/tests/model_explainability/lm_eval/conftest.py
index 5eb4a8f37..a2fc9311e 100644
--- a/tests/model_explainability/lm_eval/conftest.py
+++ b/tests/model_explainability/lm_eval/conftest.py
@@ -18,13 +18,13 @@
 from pytest import Config, FixtureRequest
 
 from tests.model_explainability.lm_eval.constants import (
+    ACCELERATOR_IDENTIFIER,
     ARC_EASY_DATASET_IMAGE,
     FLAN_T5_IMAGE,
     LMEVAL_OCI_REPO,
     LMEVAL_OCI_TAG,
 )
 from tests.model_explainability.lm_eval.utils import get_lmevaljob_pod
-from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER
 from utilities.constants import ApiGroups, KServeDeploymentType, Labels, MinIo, Protocols, RuntimeTemplates, Timeout
 from utilities.exceptions import MissingParameter
 from utilities.general import b64_encoded_string
@@ -578,12 +578,6 @@ def lmeval_hf_access_token(
 
 # GPU-based vLLM fixtures for SmolLM-1.7B
 
-@pytest.fixture(scope="session")
-def skip_if_no_supported_accelerator_type(supported_accelerator_type: str) -> None:
-    """Skip test if no GPU accelerator is available."""
-    if not supported_accelerator_type:
-        pytest.skip("Accelerator type is not provided, GPU test cannot be run on CPU")
-
 
 @pytest.fixture(scope="function")
 def lmeval_vllm_serving_runtime(
@@ -609,14 +603,14 @@ def lmeval_vllm_inference_service(
     admin_client: DynamicClient,
     model_namespace: Namespace,
     lmeval_vllm_serving_runtime: ServingRuntime,
-    supported_accelerator_type: str,
+    supported_accelerator_type: str | None,
 ) -> Generator[InferenceService]:
     """InferenceService for GPU-based model deployment in LMEval tests."""
     model_path = "HuggingFaceTB/SmolLM-1.7B"
     model_name = "lmeval-model"
 
     # Get the correct GPU identifier based on accelerator type
-    accelerator_type = supported_accelerator_type.lower()
+    accelerator_type = supported_accelerator_type.lower() if supported_accelerator_type else "nvidia"
     gpu_identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, Labels.Nvidia.NVIDIA_COM_GPU)
 
     resources = {
diff --git a/tests/model_explainability/lm_eval/constants.py b/tests/model_explainability/lm_eval/constants.py
index e4448983b..a04877395 100644
--- a/tests/model_explainability/lm_eval/constants.py
+++ b/tests/model_explainability/lm_eval/constants.py
@@ -119,3 +119,10 @@
 
 LMEVAL_OCI_REPO = "lmeval/offline-oci"
 LMEVAL_OCI_TAG = "v1"
+
+# Accelerator identifier mapping for GPU types
+ACCELERATOR_IDENTIFIER: dict[str, str] = {
+    "nvidia": "nvidia.com/gpu",
+    "amd": "amd.com/gpu",
+    "gaudi": "habana.ai/gaudi",
+}
diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py
index f9aa1e0fb..559c03572 100644
--- a/tests/model_explainability/lm_eval/utils.py
+++ b/tests/model_explainability/lm_eval/utils.py
@@ -143,6 +143,11 @@ def wait_for_vllm_model_ready(
         )
     )
 
+    if not predictor_pods:
+        raise UnexpectedFailureError("No predictor pod found for inference service")
+
+    predictor_pods = [pod for pod in predictor_pods if "predictor" in pod.name]
+
     if not predictor_pods:
         raise UnexpectedFailureError("No predictor pod found for inference service")
 
diff --git a/tests/model_serving/model_runtime/conftest.py b/tests/model_serving/model_runtime/conftest.py
index db4462d7c..0df8d5126 100644
--- a/tests/model_serving/model_runtime/conftest.py
+++ b/tests/model_serving/model_runtime/conftest.py
@@ -4,12 +4,6 @@
 from syrupy.extensions.json import JSONSnapshotExtension
 
 
-@pytest.fixture(scope="session")
-def skip_if_no_supported_accelerator_type(supported_accelerator_type: str) -> None:
-    if not supported_accelerator_type:
-        pytest.skip("Accelerator type is not provided,vLLM test cannot be run on CPU")
-
-
 @pytest.fixture
 def response_snapshot(snapshot: Any) -> Any:
     return snapshot.use_extension(extension_class=JSONSnapshotExtension)

From 5d47944820c2d0fa28f87d294c09da3ebf87b063 Mon Sep 17 00:00:00 2001
From: Shehan Saleem <ssaleem@redhat.com>
Date: Mon, 23 Mar 2026 13:37:40 +0000
Subject: [PATCH 5/8] fix: re-add # noqa: BLE001

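The `# noqa: BLE001` markers on the two `except Exception` handlers in
wait_for_vllm_model_ready were previously removed as unnecessary, but they are
required to suppress the BLE001 lint warning.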

Signed-off-by: Shehan Saleem <ssaleem@redhat.com>

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
---
 tests/model_explainability/lm_eval/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py
index 559c03572..308098e21 100644
--- a/tests/model_explainability/lm_eval/utils.py
+++ b/tests/model_explainability/lm_eval/utils.py
@@ -167,7 +167,7 @@ def wait_for_vllm_model_ready(
                 break
             else:
                 LOGGER.info(f"Model still loading... (waited {elapsed_time}s)")
-        except Exception as e:
+        except Exception as e:  # noqa: BLE001
             LOGGER.info(f"Could not get pod logs yet: {e}")
 
         time.sleep(check_interval)
@@ -177,7 +177,7 @@ def wait_for_vllm_model_ready(
         try:
             full_logs = predictor_pod.log(container="kserve-container")
             LOGGER.error(f"vLLM pod failed to start within {max_wait_time}s. Full logs:\n{full_logs}")
-        except Exception as e:
+        except Exception as e:  # noqa: BLE001
             LOGGER.error(f"Could not retrieve pod logs: {e}")
         raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds")
 

From 6205fa6d566c00f4cd4899db9e3f0ba3d14540a0 Mon Sep 17 00:00:00 2001
From: Shehan Saleem <ssaleem@redhat.com>
Date: Mon, 30 Mar 2026 13:02:13 +0100
Subject: [PATCH 6/8] fix: add multi-accelerator support for vLLM GPU tests

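Add support for NVIDIA, AMD, and Gaudi in LMEval GPU tests by selecting the
vLLM runtime template (CUDA, ROCm, or Gaudi) from the accelerator type.
Update the skip_if_no_supported_accelerator_type fixture to also skip when the
accelerator type is not one of the supported GPU types.
Narrow the exception handling in the vLLM readiness check from a blanket
`except Exception` to (ApiException, OSError), and mark the LMEval GPU test
with skip_on_disconnected.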

Signed-off-by: Shehan Saleem <ssaleem@redhat.com>

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
---
 tests/conftest.py                              | 15 ++++++++++++---
 tests/model_explainability/lm_eval/conftest.py | 18 +++++++++++++++---
 .../lm_eval/test_lm_eval.py                    |  1 +
 tests/model_explainability/lm_eval/utils.py    |  5 +++--
 4 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 458523f3c..5216008fd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -989,6 +989,15 @@ def oci_registry_host(oci_registry_route: Route) -> str:
 
 @pytest.fixture(scope="session")
 def skip_if_no_supported_accelerator_type(supported_accelerator_type: str | None) -> None:
-    """Skip test if no GPU accelerator is available."""
-    if not supported_accelerator_type:
-        pytest.skip("Accelerator type is not provided, vLLM test cannot be run on CPU")
+    """Skip test if no supported GPU accelerator is available."""
+    # Only GPU accelerators that support vLLM
+    supported_gpu_accelerators = {
+        AcceleratorType.NVIDIA,
+        AcceleratorType.AMD,
+        AcceleratorType.GAUDI,
+    }
+
+    if not supported_accelerator_type or supported_accelerator_type.lower() not in supported_gpu_accelerators:
+        pytest.skip(
+            f"Unsupported accelerator '{supported_accelerator_type}'. Expected one of {supported_gpu_accelerators}."
+        )
diff --git a/tests/model_explainability/lm_eval/conftest.py b/tests/model_explainability/lm_eval/conftest.py
index a2fc9311e..600f6c476 100644
--- a/tests/model_explainability/lm_eval/conftest.py
+++ b/tests/model_explainability/lm_eval/conftest.py
@@ -577,20 +577,32 @@ def lmeval_hf_access_token(
 
 
 # GPU-based vLLM fixtures for SmolLM-1.7B
-
-
 @pytest.fixture(scope="function")
 def lmeval_vllm_serving_runtime(
     admin_client: DynamicClient,
     model_namespace: Namespace,
     vllm_runtime_image: str,
+    supported_accelerator_type: str | None,
 ) -> Generator[ServingRuntime]:
     """vLLM ServingRuntime for GPU-based model deployment in LMEval tests."""
+    # Map accelerator type to runtime template
+    accelerator_to_template = {
+        "nvidia": RuntimeTemplates.VLLM_CUDA,
+        "amd": RuntimeTemplates.VLLM_ROCM,
+        "gaudi": RuntimeTemplates.VLLM_GAUDI,
+    }
+
+    accelerator_type = supported_accelerator_type.lower() if supported_accelerator_type else "nvidia"
+    template_name = accelerator_to_template.get(accelerator_type)
+
+    if not template_name:
+        pytest.skip(f"Unsupported accelerator type for vLLM: {supported_accelerator_type}")
+
     with ServingRuntimeFromTemplate(
         client=admin_client,
         name="lmeval-vllm-runtime",
         namespace=model_namespace.name,
-        template_name=RuntimeTemplates.VLLM_CUDA,
+        template_name=template_name,
         deployment_type=KServeDeploymentType.RAW_DEPLOYMENT,
         runtime_image=vllm_runtime_image,
         support_tgis_open_ai_endpoints=True,
diff --git a/tests/model_explainability/lm_eval/test_lm_eval.py b/tests/model_explainability/lm_eval/test_lm_eval.py
index 809ac484a..d295fe710 100644
--- a/tests/model_explainability/lm_eval/test_lm_eval.py
+++ b/tests/model_explainability/lm_eval/test_lm_eval.py
@@ -201,6 +201,7 @@ def test_lmeval_local_offline_unitxt_tasks_flan_20newsgroups_oci_artifacts(
 
 
 @pytest.mark.gpu
+@pytest.mark.skip_on_disconnected
 @pytest.mark.parametrize(
     "model_namespace",
     [
diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py
index 6868a16f9..53b1e175e 100644
--- a/tests/model_explainability/lm_eval/utils.py
+++ b/tests/model_explainability/lm_eval/utils.py
@@ -3,6 +3,7 @@
 
 import pandas as pd
 import structlog
+from kubernetes.client.rest import ApiException
 from kubernetes.dynamic import DynamicClient
 from ocp_resources.lm_eval_job import LMEvalJob
 from ocp_resources.pod import Pod
@@ -167,7 +168,7 @@ def wait_for_vllm_model_ready(
                 break
             else:
                 LOGGER.info(f"Model still loading... (waited {elapsed_time}s)")
-        except Exception as e:  # noqa: BLE001
+        except (ApiException, OSError) as e:
             LOGGER.info(f"Could not get pod logs yet: {e}")
 
         time.sleep(check_interval)
@@ -177,7 +178,7 @@ def wait_for_vllm_model_ready(
         try:
             full_logs = predictor_pod.log(container="kserve-container")
             LOGGER.error(f"vLLM pod failed to start within {max_wait_time}s. Full logs:\n{full_logs}")
-        except Exception as e:  # noqa: BLE001
+        except (ApiException, OSError) as e:
             LOGGER.error(f"Could not retrieve pod logs: {e}")
         raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds")
 

From feda52011b6ab2fc7ce81db1a4ac89adef6b90b9 Mon Sep 17 00:00:00 2001
From: Shehan Saleem <ssaleem@redhat.com>
Date: Tue, 31 Mar 2026 15:48:35 +0100
Subject: [PATCH 7/8] refactor(lmeval): improve vLLM model readiness check

Replace manual timeout and stabilization loop with TimeoutSampler.
Add specific exceptions (ResourceNotFoundError, UnexpectedResourceCountError).
Use component=predictor label selector for pod filtering.
Use collect_pod_information for better logging.

Signed-off-by: Shehan Saleem <ssaleem@redhat.com>

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
---
 tests/model_explainability/lm_eval/utils.py | 65 +++++++++++----------
 utilities/exceptions.py                     |  4 ++
 2 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py
index 53b1e175e..eb74eb49b 100644
--- a/tests/model_explainability/lm_eval/utils.py
+++ b/tests/model_explainability/lm_eval/utils.py
@@ -1,5 +1,4 @@
 import re
-import time
 
 import pandas as pd
 import structlog
@@ -8,10 +7,16 @@
 from ocp_resources.lm_eval_job import LMEvalJob
 from ocp_resources.pod import Pod
 from pyhelper_utils.general import tts
-from timeout_sampler import TimeoutExpiredError
+from timeout_sampler import TimeoutExpiredError, TimeoutSampler
 
 from utilities.constants import Timeout
-from utilities.exceptions import PodLogMissMatchError, UnexpectedFailureError
+from utilities.exceptions import (
+    PodLogMissMatchError,
+    ResourceNotFoundError,
+    UnexpectedFailureError,
+    UnexpectedResourceCountError,
+)
+from utilities.general import collect_pod_information
 
 LOGGER = structlog.get_logger(name=__name__)
 
@@ -116,7 +121,6 @@ def wait_for_vllm_model_ready(
     inference_service_name: str,
     max_wait_time: int = 600,
     check_interval: int = 10,
-    stabilization_wait: int = 10,
 ) -> Pod:
     """Wait for vLLM model to download and be ready to serve requests.
 
@@ -126,12 +130,12 @@ def wait_for_vllm_model_ready(
         inference_service_name: Name of the inference service
         max_wait_time: Maximum time to wait in seconds
         check_interval: Time between checks in seconds
-        stabilization_wait: Seconds to wait after model is ready for server stabilization
 
     Returns:
         The predictor pod once model is ready
 
     Raises:
+        ResourceNotFoundError: If no predictor pod is found
         UnexpectedFailureError: If model fails to load or pod encounters errors
     """
     LOGGER.info("Waiting for vLLM model to download and load...")
@@ -140,49 +144,46 @@ def wait_for_vllm_model_ready(
         Pod.get(
             dyn_client=client,
             namespace=namespace,
-            label_selector=f"serving.kserve.io/inferenceservice={inference_service_name}",
+            label_selector=f"serving.kserve.io/inferenceservice={inference_service_name},component=predictor",
         )
     )
 
     if not predictor_pods:
-        raise UnexpectedFailureError("No predictor pod found for inference service")
+        raise ResourceNotFoundError(f"No predictor pod found for inference service '{inference_service_name}'.")
 
-    predictor_pods = [pod for pod in predictor_pods if "predictor" in pod.name]
-
-    if not predictor_pods:
-        raise UnexpectedFailureError("No predictor pod found for inference service")
+    if len(predictor_pods) != 1:
+        raise UnexpectedResourceCountError(
+            f"Expected exactly 1 predictor pod for inference service '{inference_service_name}', "
+            f"but found {len(predictor_pods)}: {[pod.name for pod in predictor_pods]}"
+        )
 
     predictor_pod = predictor_pods[0]
     LOGGER.info(f"Predictor pod: {predictor_pod.name}")
 
-    elapsed_time = 0
-    model_loaded = False
-
-    while elapsed_time < max_wait_time:
+    def _check_model_ready() -> bool:
         try:
             pod_logs = predictor_pod.log(container="kserve-container")
-
             if "Uvicorn running on" in pod_logs or "Application startup complete" in pod_logs:
                 LOGGER.info("vLLM server is running and ready!")
-                model_loaded = True
-                break
+                return True
             else:
-                LOGGER.info(f"Model still loading... (waited {elapsed_time}s)")
+                LOGGER.info("Model still loading..")
+                return False
         except (ApiException, OSError) as e:
             LOGGER.info(f"Could not get pod logs yet: {e}")
+            return False
 
-        time.sleep(check_interval)
-        elapsed_time += check_interval
-
-    if not model_loaded:
-        try:
-            full_logs = predictor_pod.log(container="kserve-container")
-            LOGGER.error(f"vLLM pod failed to start within {max_wait_time}s. Full logs:\n{full_logs}")
-        except (ApiException, OSError) as e:
-            LOGGER.error(f"Could not retrieve pod logs: {e}")
-        raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds")
-
-    LOGGER.info(f"Model loaded! Waiting {stabilization_wait} more seconds for server stabilization.")
-    time.sleep(stabilization_wait)
+    try:
+        for sample in TimeoutSampler(
+            wait_timeout=max_wait_time,
+            sleep=check_interval,
+            func=_check_model_ready,
+        ):
+            if sample:
+                break
+    except TimeoutExpiredError as e:
+        LOGGER.error(f"vLLM pod failed to start within {max_wait_time} seconds")
+        collect_pod_information(pod=predictor_pod)
+        raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds") from e
 
     return predictor_pod
diff --git a/utilities/exceptions.py b/utilities/exceptions.py
index f44be948b..e68d327d4 100644
--- a/utilities/exceptions.py
+++ b/utilities/exceptions.py
@@ -131,3 +131,7 @@ class ExceptionUserLogin(Exception):
 
 class UnexpectedValueError(Exception):
     """Unexpected value found"""
+
+
+class ResourceNotFoundError(Exception):
+    """Resource not found"""

From 2e74ee907ae5d9852e03f2366b9e4a169a353778 Mon Sep 17 00:00:00 2001
From: Shehan Saleem <ssaleem@redhat.com>
Date: Thu, 2 Apr 2026 12:00:36 +0100
Subject: [PATCH 8/8] fix(lmeval_gpu): require explicit accelerator type for
 vLLM GPU tests

- Remove default NVIDIA assumption in lmeval_vllm_serving_runtime
- Add explicit skip when supported_accelerator_type is not provided
- Fix ResourceNotFoundError import (use kubernetes.dynamic.exceptions)
- Remove incorrectly added ResourceNotFoundError from utilities.exceptions
- Re-raise TimeoutExpiredError from wait_for_vllm_model_ready on timeout
  instead of wrapping it in UnexpectedFailureError

Signed-off-by: Shehan Saleem <ssaleem@redhat.com>

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
---
 tests/model_explainability/lm_eval/conftest.py | 7 +++++--
 tests/model_explainability/lm_eval/utils.py    | 6 +++---
 utilities/exceptions.py                        | 4 ----
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/tests/model_explainability/lm_eval/conftest.py b/tests/model_explainability/lm_eval/conftest.py
index 600f6c476..7b24bb656 100644
--- a/tests/model_explainability/lm_eval/conftest.py
+++ b/tests/model_explainability/lm_eval/conftest.py
@@ -592,11 +592,14 @@ def lmeval_vllm_serving_runtime(
         "gaudi": RuntimeTemplates.VLLM_GAUDI,
     }
 
-    accelerator_type = supported_accelerator_type.lower() if supported_accelerator_type else "nvidia"
+    if not supported_accelerator_type:
+        pytest.skip("supported_accelerator_type is required for GPU-backed vLLM tests")
+
+    accelerator_type = supported_accelerator_type.lower()
     template_name = accelerator_to_template.get(accelerator_type)
 
     if not template_name:
-        pytest.skip(f"Unsupported accelerator type for vLLM: {supported_accelerator_type}")
+        pytest.skip(f"Unsupported accelerator type for vLLM: {accelerator_type}")
 
     with ServingRuntimeFromTemplate(
         client=admin_client,
diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py
index eb74eb49b..1bf6d7d2e 100644
--- a/tests/model_explainability/lm_eval/utils.py
+++ b/tests/model_explainability/lm_eval/utils.py
@@ -4,6 +4,7 @@
 import structlog
 from kubernetes.client.rest import ApiException
 from kubernetes.dynamic import DynamicClient
+from kubernetes.dynamic.exceptions import ResourceNotFoundError
 from ocp_resources.lm_eval_job import LMEvalJob
 from ocp_resources.pod import Pod
 from pyhelper_utils.general import tts
@@ -12,7 +13,6 @@
 from utilities.constants import Timeout
 from utilities.exceptions import (
     PodLogMissMatchError,
-    ResourceNotFoundError,
     UnexpectedFailureError,
     UnexpectedResourceCountError,
 )
@@ -181,9 +181,9 @@ def _check_model_ready() -> bool:
         ):
             if sample:
                 break
-    except TimeoutExpiredError as e:
+    except TimeoutExpiredError:
         LOGGER.error(f"vLLM pod failed to start within {max_wait_time} seconds")
         collect_pod_information(pod=predictor_pod)
-        raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds") from e
+        raise
 
     return predictor_pod
diff --git a/utilities/exceptions.py b/utilities/exceptions.py
index e68d327d4..f44be948b 100644
--- a/utilities/exceptions.py
+++ b/utilities/exceptions.py
@@ -131,7 +131,3 @@ class ExceptionUserLogin(Exception):
 
 class UnexpectedValueError(Exception):
     """Unexpected value found"""
-
-
-class ResourceNotFoundError(Exception):
-    """Resource not found"""