From 36e09f100bfa121bcd7bf78d5a76cf9f47847619 Mon Sep 17 00:00:00 2001 From: ssaleem-rh Date: Mon, 16 Mar 2026 12:46:57 +0000 Subject: [PATCH 1/8] =?UTF-8?q?Fixed=20typo=20in=20error=20message:=20SUPP?= =?UTF-8?q?ORTED=5FACCLERATOR=5FTYPE=20=E2=86=92=20SUPPORTED=5FACCELERATOR?= =?UTF-8?q?=5FTYPE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED Signed-off-by: Shehan Saleem --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 63ff3cac6..59bf7056c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -299,7 +299,7 @@ def supported_accelerator_type(pytestconfig: pytest.Config) -> str | None: if accelerator_type.lower() not in AcceleratorType.SUPPORTED_LISTS: raise ValueError( "accelerator type is not defined." - "Either pass with `--supported-accelerator-type` or set `SUPPORTED_ACCLERATOR_TYPE` environment variable" + "Either pass with `--supported-accelerator-type` or set `SUPPORTED_ACCELERATOR_TYPE` environment variable" ) return accelerator_type From 66b3b8b7a2ec7a1fbb9884b2fcbe61d275ddbd0d Mon Sep 17 00:00:00 2001 From: ssaleem-rh Date: Mon, 16 Mar 2026 16:37:56 +0000 Subject: [PATCH 2/8] test(lmeval): add GPU testing support with vLLM Add test_lmeval_gpu to verify LMEval works with GPU-backed model deployments via vLLM runtime. Includes: - New test for GPU model evaluation with SmolLM-1.7B - wait_for_vllm_model_ready utility for model readiness checks - GPU-specific fixtures: ServingRuntime, InferenceService, LMEvalJob, and pod; skip when no supported accelerator rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED Signed-off-by: Shehan Saleem --- .../model_explainability/lm_eval/conftest.py | 134 +++++++++++++++++- .../lm_eval/test_lm_eval.py | 38 ++++- tests/model_explainability/lm_eval/utils.py | 74 ++++++++++ 3 files changed, 244 insertions(+), 2 deletions(-) diff --git a/tests/model_explainability/lm_eval/conftest.py b/tests/model_explainability/lm_eval/conftest.py index 56c5bf5b5..5eb4a8f37 100644 --- a/tests/model_explainability/lm_eval/conftest.py +++ b/tests/model_explainability/lm_eval/conftest.py @@ -6,6 +6,7 @@ from kubernetes.dynamic import DynamicClient from ocp_resources.data_science_cluster import DataScienceCluster from ocp_resources.deployment import Deployment +from ocp_resources.inference_service import InferenceService from ocp_resources.lm_eval_job import LMEvalJob from ocp_resources.namespace import Namespace from ocp_resources.persistent_volume_claim import PersistentVolumeClaim @@ -13,6 +14,7 @@ from ocp_resources.route import Route from ocp_resources.secret import Secret from ocp_resources.service import Service +from ocp_resources.serving_runtime import ServingRuntime from pytest import Config, FixtureRequest from tests.model_explainability.lm_eval.constants import ( @@ -22,9 +24,12 @@ LMEVAL_OCI_TAG, ) from tests.model_explainability.lm_eval.utils import get_lmevaljob_pod -from utilities.constants import ApiGroups, Labels, MinIo, Protocols, Timeout +from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER +from utilities.constants import ApiGroups, KServeDeploymentType, Labels, MinIo, Protocols, RuntimeTemplates, Timeout from utilities.exceptions import MissingParameter from utilities.general import b64_encoded_string +from utilities.inference_utils import create_isvc +from utilities.serving_runtime import ServingRuntimeFromTemplate VLLM_EMULATOR: str = "vllm-emulator" VLLM_EMULATOR_PORT: int = 8000 @@ -542,6 +547,11 @@ def lmevaljob_s3_offline_pod(admin_client: DynamicClient, lmevaljob_s3_offline: yield get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_s3_offline) +@pytest.fixture(scope="function") +def lmevaljob_gpu_pod(admin_client: DynamicClient, lmevaljob_gpu: LMEvalJob) -> Generator[Pod, Any, Any]: + yield get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_gpu) + + @pytest.fixture(scope="function") def lmeval_hf_access_token( admin_client: DynamicClient, @@ -564,3 +574,125 @@ def lmeval_hf_access_token( wait_for_resource=True, ) as secret: yield secret + + +# GPU-based vLLM fixtures for SmolLM-1.7B + +@pytest.fixture(scope="session") +def skip_if_no_supported_accelerator_type(supported_accelerator_type: str) -> None: + """Skip test if no GPU accelerator is available.""" + if not supported_accelerator_type: + pytest.skip("Accelerator type is not provided, GPU test cannot be run on CPU") + + +@pytest.fixture(scope="function") +def lmeval_vllm_serving_runtime( + admin_client: DynamicClient, + model_namespace: Namespace, + vllm_runtime_image: str, +) -> Generator[ServingRuntime]: + """vLLM ServingRuntime for GPU-based model deployment in LMEval tests.""" + with ServingRuntimeFromTemplate( + client=admin_client, + name="lmeval-vllm-runtime", + namespace=model_namespace.name, + template_name=RuntimeTemplates.VLLM_CUDA, + deployment_type=KServeDeploymentType.RAW_DEPLOYMENT, + runtime_image=vllm_runtime_image, + support_tgis_open_ai_endpoints=True, + ) as serving_runtime: + yield serving_runtime + + +@pytest.fixture(scope="function") +def lmeval_vllm_inference_service( + admin_client: DynamicClient, + model_namespace: Namespace, + lmeval_vllm_serving_runtime: ServingRuntime, + supported_accelerator_type: str, +) -> Generator[InferenceService]: + """InferenceService for GPU-based model deployment in LMEval tests.""" + model_path = "HuggingFaceTB/SmolLM-1.7B" + model_name = "lmeval-model" + + # Get the correct GPU identifier based on accelerator type + accelerator_type = supported_accelerator_type.lower() + gpu_identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, Labels.Nvidia.NVIDIA_COM_GPU) + + resources = { + "requests": { + "cpu": "2", + "memory": "8Gi", + gpu_identifier: "1", + }, + "limits": { + "cpu": "3", + "memory": "8Gi", + gpu_identifier: "1", + }, + } + + runtime_args = [ + f"--model={model_path}", + "--dtype=float16", + "--max-model-len=2048", + ] + + env_vars = [ + {"name": "HF_HUB_OFFLINE", "value": "0"}, + {"name": "HF_HUB_ENABLE_HF_TRANSFER", "value": "0"}, + ] + + with create_isvc( + client=admin_client, + name=model_name, + namespace=model_namespace.name, + runtime=lmeval_vllm_serving_runtime.name, + model_format=lmeval_vllm_serving_runtime.instance.spec.supportedModelFormats[0].name, + deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT, + resources=resources, + argument=runtime_args, + model_env_variables=env_vars, + min_replicas=1, + ) as inference_service: + yield inference_service + + +@pytest.fixture(scope="function") +def lmevaljob_gpu( + admin_client: DynamicClient, + model_namespace: Namespace, + lmeval_vllm_inference_service: InferenceService, +) -> Generator[LMEvalJob]: + """LMEvalJob for evaluating a GPU-deployed model via vLLM.""" + model_path = "HuggingFaceTB/SmolLM-1.7B" + model_service = Service( + name=f"{lmeval_vllm_inference_service.name}-predictor", + namespace=lmeval_vllm_inference_service.namespace, + ) + + with LMEvalJob( + client=admin_client, + namespace=model_namespace.name, + name=LMEVALJOB_NAME, + model="local-completions", + task_list={"taskNames": ["arc_easy"]}, + log_samples=True, + batch_size="1", + allow_online=True, + allow_code_execution=False, + outputs={"pvcManaged": {"size": "5Gi"}}, + limit="0.01", + model_args=[ + {"name": "model", "value": lmeval_vllm_inference_service.name}, + { + "name": "base_url", + "value": f"http://{model_service.name}.{model_namespace.name}.svc.cluster.local:80/v1/completions", + }, + {"name": "num_concurrent", "value": "1"}, + {"name": "max_retries", "value": "3"}, + {"name": "tokenized_requests", "value": "False"}, + {"name": "tokenizer", "value": model_path}, + ], + ) as lmevaljob: + yield lmevaljob diff --git a/tests/model_explainability/lm_eval/test_lm_eval.py b/tests/model_explainability/lm_eval/test_lm_eval.py index 4a03ea416..a36a9fe09 100644 --- a/tests/model_explainability/lm_eval/test_lm_eval.py +++ b/tests/model_explainability/lm_eval/test_lm_eval.py @@ -11,7 +11,11 @@ LMEVAL_OCI_REPO, LMEVAL_OCI_TAG, ) -from tests.model_explainability.lm_eval.utils import get_lmeval_tasks, validate_lmeval_job_pod_and_logs +from tests.model_explainability.lm_eval.utils import ( + get_lmeval_tasks, + validate_lmeval_job_pod_and_logs, + wait_for_vllm_model_ready, +) from tests.model_explainability.utils import validate_tai_component_images from utilities.constants import OCIRegistry from utilities.registry_utils import pull_manifest_from_oci_registry @@ -194,3 +198,35 @@ def test_lmeval_local_offline_unitxt_tasks_flan_20newsgroups_oci_artifacts( LOGGER.info(f"Verifying artifact in OCI registry: {registry_url}/v2/{LMEVAL_OCI_REPO}/manifests/{LMEVAL_OCI_TAG}") pull_manifest_from_oci_registry(registry_url=registry_url, repo=LMEVAL_OCI_REPO, tag=LMEVAL_OCI_TAG) LOGGER.info("Manifest found in OCI registry") + + +@pytest.mark.gpu +@pytest.mark.parametrize( + "model_namespace", + [ + pytest.param( + {"name": "test-lmeval-gpu"}, + ) + ], + indirect=True, +) +@pytest.mark.usefixtures("patched_dsc_kserve_headed", "skip_if_no_supported_accelerator_type") +def test_lmeval_gpu( + admin_client: DynamicClient, + model_namespace: Namespace, + patched_dsc_lmeval_allow_all, + lmeval_vllm_inference_service, + lmevaljob_gpu_pod, +): + """Test LMEval with GPU-backed model deployment via vLLM. + + Verifies that LMEval can successfully evaluate a model deployed on GPU using vLLM runtime. + The model is downloaded directly from HuggingFace Hub and evaluated using the arc_easy task. + """ + wait_for_vllm_model_ready( + client=admin_client, + namespace=model_namespace.name, + inference_service_name=lmeval_vllm_inference_service.name, + ) + + validate_lmeval_job_pod_and_logs(lmevaljob_pod=lmevaljob_gpu_pod) diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py index 33c587734..f9aa1e0fb 100644 --- a/tests/model_explainability/lm_eval/utils.py +++ b/tests/model_explainability/lm_eval/utils.py @@ -1,4 +1,5 @@ import re +import time import pandas as pd from kubernetes.dynamic import DynamicClient @@ -106,3 +107,76 @@ def validate_lmeval_job_pod_and_logs(lmevaljob_pod: Pod) -> None: raise UnexpectedFailureError("LMEval job pod failed from a running state.") from e if not bool(re.search(pod_success_log_regex, lmevaljob_pod.log())): raise PodLogMissMatchError("LMEval job pod failed.") + + +def wait_for_vllm_model_ready( + client: DynamicClient, + namespace: str, + inference_service_name: str, + max_wait_time: int = 600, + check_interval: int = 10, + stabilization_wait: int = 10, +) -> Pod: + """Wait for vLLM model to download and be ready to serve requests. + + Args: + client: Kubernetes dynamic client + namespace: Namespace where the inference service is deployed + inference_service_name: Name of the inference service + max_wait_time: Maximum time to wait in seconds + check_interval: Time between checks in seconds + stabilization_wait: Seconds to wait after model is ready for server stabilization + + Returns: + The predictor pod once model is ready + + Raises: + UnexpectedFailureError: If model fails to load or pod encounters errors + """ + LOGGER.info("Waiting for vLLM model to download and load...") + + predictor_pods = list( + Pod.get( + dyn_client=client, + namespace=namespace, + label_selector=f"serving.kserve.io/inferenceservice={inference_service_name}", + ) + ) + + if not predictor_pods: + raise UnexpectedFailureError("No predictor pod found for inference service") + + predictor_pod = predictor_pods[0] + LOGGER.info(f"Predictor pod: {predictor_pod.name}") + + elapsed_time = 0 + model_loaded = False + + while elapsed_time < max_wait_time: + try: + pod_logs = predictor_pod.log(container="kserve-container") + + if "Uvicorn running on" in pod_logs or "Application startup complete" in pod_logs: + LOGGER.info("vLLM server is running and ready!") + model_loaded = True + break + else: + LOGGER.info(f"Model still loading... (waited {elapsed_time}s)") + except Exception as e: + LOGGER.info(f"Could not get pod logs yet: {e}") + + time.sleep(check_interval) + elapsed_time += check_interval + + if not model_loaded: + try: + full_logs = predictor_pod.log(container="kserve-container") + LOGGER.error(f"vLLM pod failed to start within {max_wait_time}s. Full logs:\n{full_logs}") + except Exception as e: + LOGGER.error(f"Could not retrieve pod logs: {e}") + raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds") + + LOGGER.info(f"Model loaded! Waiting {stabilization_wait} more seconds for server stabilization.") + time.sleep(stabilization_wait) + + return predictor_pod From 8f07c6a25e247ef5b66b138fe0a13a4c6bf51b1f Mon Sep 17 00:00:00 2001 From: ssaleem-rh Date: Tue, 17 Mar 2026 08:09:06 +0000 Subject: [PATCH 3/8] =?UTF-8?q?Fixed=20typo=20in=20SUPPORTED=5FACCELERATOR?= =?UTF-8?q?=5FTYPE=20environment=20variable:=20=20SUPPORTED=5FACCLERATOR?= =?UTF-8?q?=5FTYPE=20=E2=86=92=20SUPPORTED=5FACCELERATOR=5FTYPE=20in=20run?= =?UTF-8?q?time=20option.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED Signed-off-by: Shehan Saleem --- conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index ebd05ff36..6bc851e95 100644 --- a/conftest.py +++ b/conftest.py @@ -104,7 +104,7 @@ def pytest_addoption(parser: Parser) -> None: # Runtime options runtime_group.addoption( "--supported-accelerator-type", - default=os.environ.get("SUPPORTED_ACCLERATOR_TYPE"), + default=os.environ.get("SUPPORTED_ACCELERATOR_TYPE"), help="Supported accelerator type : Nvidia,AMD,Gaudi", ) runtime_group.addoption( From 368ce13044ff3457a3eb4557de46cc1921458ad0 Mon Sep 17 00:00:00 2001 From: Shehan Saleem Date: Mon, 23 Mar 2026 11:17:31 +0000 Subject: [PATCH 4/8] refactor: centralize skip_if_no_supported_accelerator_type fixture - Relocate skip_if_no_supported_accelerator_type fixture to tests/conftest.py for reuse across test modules - Introduce ACCELERATOR_IDENTIFIER constant and update imports - Relax type annotation in lmeval_vllm_inference_service to str | None Signed-off-by: ssaleem-rh Signed-off-by: Shehan Saleem rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED Signed-off-by: Shehan Saleem --- tests/conftest.py | 7 +++++++ tests/model_explainability/lm_eval/conftest.py | 12 +++--------- tests/model_explainability/lm_eval/constants.py | 7 +++++++ tests/model_explainability/lm_eval/utils.py | 5 +++++ tests/model_serving/model_runtime/conftest.py | 6 ------ 5 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 59bf7056c..8dcd905dc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -985,3 +985,10 @@ def oci_registry_route(admin_client: DynamicClient, oci_registry_service: Servic def oci_registry_host(oci_registry_route: Route) -> str: """Get the OCI registry host from the route""" return oci_registry_route.host + + +@pytest.fixture(scope="session") +def skip_if_no_supported_accelerator_type(supported_accelerator_type: str | None) -> None: + """Skip test if no GPU accelerator is available.""" + if not supported_accelerator_type: + pytest.skip("Accelerator type is not provided, vLLM test cannot be run on CPU") diff --git a/tests/model_explainability/lm_eval/conftest.py b/tests/model_explainability/lm_eval/conftest.py index 5eb4a8f37..a2fc9311e 100644 --- a/tests/model_explainability/lm_eval/conftest.py +++ b/tests/model_explainability/lm_eval/conftest.py @@ -18,13 +18,13 @@ from pytest import Config, FixtureRequest from tests.model_explainability.lm_eval.constants import ( + ACCELERATOR_IDENTIFIER, ARC_EASY_DATASET_IMAGE, FLAN_T5_IMAGE, LMEVAL_OCI_REPO, LMEVAL_OCI_TAG, ) from tests.model_explainability.lm_eval.utils import get_lmevaljob_pod -from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER from utilities.constants import ApiGroups, KServeDeploymentType, Labels, MinIo, Protocols, RuntimeTemplates, Timeout from utilities.exceptions import MissingParameter from utilities.general import b64_encoded_string @@ -578,12 +578,6 @@ def lmeval_hf_access_token( # GPU-based vLLM fixtures for SmolLM-1.7B -@pytest.fixture(scope="session") -def skip_if_no_supported_accelerator_type(supported_accelerator_type: str) -> None: - """Skip test if no GPU accelerator is available.""" - if not supported_accelerator_type: - pytest.skip("Accelerator type is not provided, GPU test cannot be run on CPU") - @pytest.fixture(scope="function") def lmeval_vllm_serving_runtime( @@ -609,14 +603,14 @@ def lmeval_vllm_inference_service( admin_client: DynamicClient, model_namespace: Namespace, lmeval_vllm_serving_runtime: ServingRuntime, - supported_accelerator_type: str, + supported_accelerator_type: str | None, ) -> Generator[InferenceService]: """InferenceService for GPU-based model deployment in LMEval tests.""" model_path = "HuggingFaceTB/SmolLM-1.7B" model_name = "lmeval-model" # Get the correct GPU identifier based on accelerator type - accelerator_type = supported_accelerator_type.lower() + accelerator_type = supported_accelerator_type.lower() if supported_accelerator_type else "nvidia" gpu_identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, Labels.Nvidia.NVIDIA_COM_GPU) resources = { diff --git a/tests/model_explainability/lm_eval/constants.py b/tests/model_explainability/lm_eval/constants.py index e4448983b..a04877395 100644 --- a/tests/model_explainability/lm_eval/constants.py +++ b/tests/model_explainability/lm_eval/constants.py @@ -119,3 +119,10 @@ LMEVAL_OCI_REPO = "lmeval/offline-oci" LMEVAL_OCI_TAG = "v1" + +# Accelerator identifier mapping for GPU types +ACCELERATOR_IDENTIFIER: dict[str, str] = { + "nvidia": "nvidia.com/gpu", + "amd": "amd.com/gpu", + "gaudi": "habana.ai/gaudi", +} diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py index f9aa1e0fb..559c03572 100644 --- a/tests/model_explainability/lm_eval/utils.py +++ b/tests/model_explainability/lm_eval/utils.py @@ -143,6 +143,11 @@ def wait_for_vllm_model_ready( ) ) + if not predictor_pods: + raise UnexpectedFailureError("No predictor pod found for inference service") + + predictor_pods = [pod for pod in predictor_pods if "predictor" in pod.name] + if not predictor_pods: raise UnexpectedFailureError("No predictor pod found for inference service") diff --git a/tests/model_serving/model_runtime/conftest.py b/tests/model_serving/model_runtime/conftest.py index db4462d7c..0df8d5126 100644 --- a/tests/model_serving/model_runtime/conftest.py +++ b/tests/model_serving/model_runtime/conftest.py @@ -4,12 +4,6 @@ from syrupy.extensions.json import JSONSnapshotExtension -@pytest.fixture(scope="session") -def skip_if_no_supported_accelerator_type(supported_accelerator_type: str) -> None: - if not supported_accelerator_type: - pytest.skip("Accelerator type is not provided,vLLM test cannot be run on CPU") - - @pytest.fixture def response_snapshot(snapshot: Any) -> Any: return snapshot.use_extension(extension_class=JSONSnapshotExtension) From 5d47944820c2d0fa28f87d294c09da3ebf87b063 Mon Sep 17 00:00:00 2001 From: Shehan Saleem Date: Mon, 23 Mar 2026 13:37:40 +0000 Subject: [PATCH 5/8] fix: re-add # noqa: BLE001 Previously removed as unnecessary, but required to suppress BLE001 warning. Signed-off-by: Shehan Saleem rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED --- tests/model_explainability/lm_eval/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py index 559c03572..308098e21 100644 --- a/tests/model_explainability/lm_eval/utils.py +++ b/tests/model_explainability/lm_eval/utils.py @@ -167,7 +167,7 @@ def wait_for_vllm_model_ready( break else: LOGGER.info(f"Model still loading... (waited {elapsed_time}s)") - except Exception as e: + except Exception as e: # noqa: BLE001 LOGGER.info(f"Could not get pod logs yet: {e}") time.sleep(check_interval) @@ -177,7 +177,7 @@ def wait_for_vllm_model_ready( try: full_logs = predictor_pod.log(container="kserve-container") LOGGER.error(f"vLLM pod failed to start within {max_wait_time}s. Full logs:\n{full_logs}") - except Exception as e: + except Exception as e: # noqa: BLE001 LOGGER.error(f"Could not retrieve pod logs: {e}") raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds") From 6205fa6d566c00f4cd4899db9e3f0ba3d14540a0 Mon Sep 17 00:00:00 2001 From: Shehan Saleem Date: Mon, 30 Mar 2026 13:02:13 +0100 Subject: [PATCH 6/8] fix: add multi-accelerator support for vLLM GPU tests Add support for NVIDIA, AMD, and Gaudi in LMEval GPU tests. Update the skip_if_no_supported_accelerator_type fixture to validate supported GPU types. Improve exception handling in the vLLM readiness check and mark tests with skip_on_disconnected. Signed-off-by: Shehan Saleem rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED --- tests/conftest.py | 15 ++++++++++++--- tests/model_explainability/lm_eval/conftest.py | 18 +++++++++++++++--- .../lm_eval/test_lm_eval.py | 1 + tests/model_explainability/lm_eval/utils.py | 5 +++-- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 458523f3c..5216008fd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -989,6 +989,15 @@ def oci_registry_host(oci_registry_route: Route) -> str: @pytest.fixture(scope="session") def skip_if_no_supported_accelerator_type(supported_accelerator_type: str | None) -> None: - """Skip test if no GPU accelerator is available.""" - if not supported_accelerator_type: - pytest.skip("Accelerator type is not provided, vLLM test cannot be run on CPU") + """Skip test if no supported GPU accelerator is available.""" + # Only GPU accelerators that support vLLM + supported_gpu_accelerators = { + AcceleratorType.NVIDIA, + AcceleratorType.AMD, + AcceleratorType.GAUDI, + } + + if not supported_accelerator_type or supported_accelerator_type.lower() not in supported_gpu_accelerators: + pytest.skip( + f"Unsupported accelerator '{supported_accelerator_type}'. Expected one of {supported_gpu_accelerators}." + ) diff --git a/tests/model_explainability/lm_eval/conftest.py b/tests/model_explainability/lm_eval/conftest.py index a2fc9311e..600f6c476 100644 --- a/tests/model_explainability/lm_eval/conftest.py +++ b/tests/model_explainability/lm_eval/conftest.py @@ -577,20 +577,32 @@ def lmeval_hf_access_token( # GPU-based vLLM fixtures for SmolLM-1.7B - - @pytest.fixture(scope="function") def lmeval_vllm_serving_runtime( admin_client: DynamicClient, model_namespace: Namespace, vllm_runtime_image: str, + supported_accelerator_type: str | None, ) -> Generator[ServingRuntime]: """vLLM ServingRuntime for GPU-based model deployment in LMEval tests.""" + # Map accelerator type to runtime template + accelerator_to_template = { + "nvidia": RuntimeTemplates.VLLM_CUDA, + "amd": RuntimeTemplates.VLLM_ROCM, + "gaudi": RuntimeTemplates.VLLM_GAUDI, + } + + accelerator_type = supported_accelerator_type.lower() if supported_accelerator_type else "nvidia" + template_name = accelerator_to_template.get(accelerator_type) + + if not template_name: + pytest.skip(f"Unsupported accelerator type for vLLM: {supported_accelerator_type}") + with ServingRuntimeFromTemplate( client=admin_client, name="lmeval-vllm-runtime", namespace=model_namespace.name, - template_name=RuntimeTemplates.VLLM_CUDA, + template_name=template_name, deployment_type=KServeDeploymentType.RAW_DEPLOYMENT, runtime_image=vllm_runtime_image, support_tgis_open_ai_endpoints=True, diff --git a/tests/model_explainability/lm_eval/test_lm_eval.py b/tests/model_explainability/lm_eval/test_lm_eval.py index 809ac484a..d295fe710 100644 --- a/tests/model_explainability/lm_eval/test_lm_eval.py +++ b/tests/model_explainability/lm_eval/test_lm_eval.py @@ -201,6 +201,7 @@ def test_lmeval_local_offline_unitxt_tasks_flan_20newsgroups_oci_artifacts( @pytest.mark.gpu +@pytest.mark.skip_on_disconnected @pytest.mark.parametrize( "model_namespace", [ diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py index 6868a16f9..53b1e175e 100644 --- a/tests/model_explainability/lm_eval/utils.py +++ b/tests/model_explainability/lm_eval/utils.py @@ -3,6 +3,7 @@ import pandas as pd import structlog +from kubernetes.client.rest import ApiException from kubernetes.dynamic import DynamicClient from ocp_resources.lm_eval_job import LMEvalJob from ocp_resources.pod import Pod @@ -167,7 +168,7 @@ def wait_for_vllm_model_ready( break else: LOGGER.info(f"Model still loading... (waited {elapsed_time}s)") - except Exception as e: # noqa: BLE001 + except (ApiException, OSError) as e: LOGGER.info(f"Could not get pod logs yet: {e}") time.sleep(check_interval) @@ -177,7 +178,7 @@ def wait_for_vllm_model_ready( try: full_logs = predictor_pod.log(container="kserve-container") LOGGER.error(f"vLLM pod failed to start within {max_wait_time}s. Full logs:\n{full_logs}") - except Exception as e: # noqa: BLE001 + except (ApiException, OSError) as e: LOGGER.error(f"Could not retrieve pod logs: {e}") raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds") From feda52011b6ab2fc7ce81db1a4ac89adef6b90b9 Mon Sep 17 00:00:00 2001 From: Shehan Saleem Date: Tue, 31 Mar 2026 15:48:35 +0100 Subject: [PATCH 7/8] refactor(lmeval): improve vLLM model readiness check Replace manual timeout and stabilization loop with TimeoutSampler. Add specific exceptions (ResourceNotFoundError, UnexpectedResourceCountError). Use component=predictor label selector for pod filtering. Use collect_pod_information for better logging. Signed-off-by: Shehan Saleem rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED --- tests/model_explainability/lm_eval/utils.py | 65 +++++++++++---------- utilities/exceptions.py | 4 ++ 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py index 53b1e175e..eb74eb49b 100644 --- a/tests/model_explainability/lm_eval/utils.py +++ b/tests/model_explainability/lm_eval/utils.py @@ -1,5 +1,4 @@ import re -import time import pandas as pd import structlog @@ -8,10 +7,16 @@ from ocp_resources.lm_eval_job import LMEvalJob from ocp_resources.pod import Pod from pyhelper_utils.general import tts -from timeout_sampler import TimeoutExpiredError +from timeout_sampler import TimeoutExpiredError, TimeoutSampler from utilities.constants import Timeout -from utilities.exceptions import PodLogMissMatchError, UnexpectedFailureError +from utilities.exceptions import ( + PodLogMissMatchError, + ResourceNotFoundError, + UnexpectedFailureError, + UnexpectedResourceCountError, +) +from utilities.general import collect_pod_information LOGGER = structlog.get_logger(name=__name__) @@ -116,7 +121,6 @@ def wait_for_vllm_model_ready( inference_service_name: str, max_wait_time: int = 600, check_interval: int = 10, - stabilization_wait: int = 10, ) -> Pod: """Wait for vLLM model to download and be ready to serve requests. @@ -126,12 +130,12 @@ def wait_for_vllm_model_ready( inference_service_name: Name of the inference service max_wait_time: Maximum time to wait in seconds check_interval: Time between checks in seconds - stabilization_wait: Seconds to wait after model is ready for server stabilization Returns: The predictor pod once model is ready Raises: + ResourceNotFoundError: If no predictor pod is found UnexpectedFailureError: If model fails to load or pod encounters errors """ LOGGER.info("Waiting for vLLM model to download and load...") @@ -140,49 +144,46 @@ def wait_for_vllm_model_ready( Pod.get( dyn_client=client, namespace=namespace, - label_selector=f"serving.kserve.io/inferenceservice={inference_service_name}", + label_selector=f"serving.kserve.io/inferenceservice={inference_service_name},component=predictor", ) ) if not predictor_pods: - raise UnexpectedFailureError("No predictor pod found for inference service") + raise ResourceNotFoundError(f"No predictor pod found for inference service '{inference_service_name}'.") - predictor_pods = [pod for pod in predictor_pods if "predictor" in pod.name] - - if not predictor_pods: - raise UnexpectedFailureError("No predictor pod found for inference service") + if len(predictor_pods) != 1: + raise UnexpectedResourceCountError( + f"Expected exactly 1 predictor pod for inference service '{inference_service_name}', " + f"but found {len(predictor_pods)}: {[pod.name for pod in predictor_pods]}" + ) predictor_pod = predictor_pods[0] LOGGER.info(f"Predictor pod: {predictor_pod.name}") - elapsed_time = 0 - model_loaded = False - - while elapsed_time < max_wait_time: + def _check_model_ready() -> bool: try: pod_logs = predictor_pod.log(container="kserve-container") - if "Uvicorn running on" in pod_logs or "Application startup complete" in pod_logs: LOGGER.info("vLLM server is running and ready!") - model_loaded = True - break + return True else: - LOGGER.info(f"Model still loading... (waited {elapsed_time}s)") + LOGGER.info("Model still loading..") + return False except (ApiException, OSError) as e: LOGGER.info(f"Could not get pod logs yet: {e}") + return False - time.sleep(check_interval) - elapsed_time += check_interval - - if not model_loaded: - try: - full_logs = predictor_pod.log(container="kserve-container") - LOGGER.error(f"vLLM pod failed to start within {max_wait_time}s. Full logs:\n{full_logs}") - except (ApiException, OSError) as e: - LOGGER.error(f"Could not retrieve pod logs: {e}") - raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds") - - LOGGER.info(f"Model loaded! Waiting {stabilization_wait} more seconds for server stabilization.") - time.sleep(stabilization_wait) + try: + for sample in TimeoutSampler( + wait_timeout=max_wait_time, + sleep=check_interval, + func=_check_model_ready, + ): + if sample: + break + except TimeoutExpiredError as e: + LOGGER.error(f"vLLM pod failed to start within {max_wait_time} seconds") + collect_pod_information(pod=predictor_pod) + raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds") from e return predictor_pod diff --git a/utilities/exceptions.py b/utilities/exceptions.py index f44be948b..e68d327d4 100644 --- a/utilities/exceptions.py +++ b/utilities/exceptions.py @@ -131,3 +131,7 @@ class ExceptionUserLogin(Exception): class UnexpectedValueError(Exception): """Unexpected value found""" + + +class ResourceNotFoundError(Exception): + """Resource not found""" From 2e74ee907ae5d9852e03f2366b9e4a169a353778 Mon Sep 17 00:00:00 2001 From: Shehan Saleem Date: Thu, 2 Apr 2026 12:00:36 +0100 Subject: [PATCH 8/8] fix(lmeval_gpu) : require explicit accelerator type for vLLM GPU tests - Remove default NVIDIA assumption in lmeval_vllm_serving_runtime - Add explicit skip when supported_accelerator_type is not provided - Fix ResourceNotFoundError import (use kubernetes.dynamic.exceptions) - Remove incorrectly added ResourceNotFoundError from utilities.exceptions Signed-off-by: Shehan Saleem rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED --- tests/model_explainability/lm_eval/conftest.py | 7 +++++-- tests/model_explainability/lm_eval/utils.py | 6 +++--- utilities/exceptions.py | 4 ---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/model_explainability/lm_eval/conftest.py b/tests/model_explainability/lm_eval/conftest.py index 600f6c476..7b24bb656 100644 --- a/tests/model_explainability/lm_eval/conftest.py +++ b/tests/model_explainability/lm_eval/conftest.py @@ -592,11 +592,14 @@ def lmeval_vllm_serving_runtime( "gaudi": RuntimeTemplates.VLLM_GAUDI, } - accelerator_type = supported_accelerator_type.lower() if supported_accelerator_type else "nvidia" + if not supported_accelerator_type: + pytest.skip("supported_accelerator_type is required for GPU-backed vLLM tests") + + accelerator_type = supported_accelerator_type.lower() template_name = accelerator_to_template.get(accelerator_type) if not template_name: - pytest.skip(f"Unsupported accelerator type for vLLM: {supported_accelerator_type}") + pytest.skip(f"Unsupported accelerator type for vLLM: {accelerator_type}") with ServingRuntimeFromTemplate( client=admin_client, diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py index eb74eb49b..1bf6d7d2e 100644 --- a/tests/model_explainability/lm_eval/utils.py +++ b/tests/model_explainability/lm_eval/utils.py @@ -4,6 +4,7 @@ import structlog from kubernetes.client.rest import ApiException from kubernetes.dynamic import DynamicClient +from kubernetes.dynamic.exceptions import ResourceNotFoundError from ocp_resources.lm_eval_job import LMEvalJob from ocp_resources.pod import Pod from pyhelper_utils.general import tts @@ -12,7 +13,6 @@ from utilities.constants import Timeout from utilities.exceptions import ( PodLogMissMatchError, - ResourceNotFoundError, UnexpectedFailureError, UnexpectedResourceCountError, ) @@ -181,9 +181,9 @@ def _check_model_ready() -> bool: ): if sample: break - except TimeoutExpiredError as e: + except TimeoutExpiredError: LOGGER.error(f"vLLM pod failed to start within {max_wait_time} seconds") collect_pod_information(pod=predictor_pod) - raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds") from e + raise return predictor_pod diff --git a/utilities/exceptions.py b/utilities/exceptions.py index e68d327d4..f44be948b 100644 --- a/utilities/exceptions.py +++ b/utilities/exceptions.py @@ -131,7 +131,3 @@ class ExceptionUserLogin(Exception): class UnexpectedValueError(Exception): """Unexpected value found""" - - -class ResourceNotFoundError(Exception): - """Resource not found"""