opendatahub-io
diff --git a/‎pytest.ini‎
Lines changed: 1 addition & 0 deletions b/‎pytest.ini‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/model_serving/model_server/kueue/conftest.py‎
Lines changed: 207 additions & 0 deletions b/‎tests/model_serving/model_server/kueue/conftest.py‎
Lines changed: 207 additions & 0 deletions
diff --git a/‎tests/model_serving/model_server/kueue/test_kueue_isvc_raw.py‎
Lines changed: 157 additions & 0 deletions b/‎tests/model_serving/model_server/kueue/test_kueue_isvc_raw.py‎
Lines changed: 157 additions & 0 deletions
@@ -26,6 +26,7 @@ markers =
     minio: Mark tests which are using MinIO storage
     tls: Mark tests which are testing TLS
     metrics: Mark tests which are testing metrics
+    kueue: Mark tests which are testing Kueue
 
 addopts =
     -s
 
@@ -0,0 +1,207 @@
+from typing import Generator, Any, Dict
+
+import pytest
+from kubernetes.dynamic import DynamicClient
+from _pytest.fixtures import FixtureRequest
+from utilities.kueue_utils import (
+    create_local_queue,
+    create_cluster_queue,
+    create_resource_flavor,
+    LocalQueue,
+    ClusterQueue,
+    ResourceFlavor,
+)
+from ocp_resources.namespace import Namespace
+from utilities.constants import ModelAndFormat, KServeDeploymentType
+from utilities.inference_utils import create_isvc
+from utilities.serving_runtime import ServingRuntimeFromTemplate
+from ocp_resources.secret import Secret
+from ocp_resources.inference_service import InferenceService
+from ocp_resources.serving_runtime import ServingRuntime
+from utilities.constants import RuntimeTemplates, ModelFormat
+import logging
+
+BASIC_LOGGER = logging.getLogger(name="basic")
+
+
+def kueue_resource_groups(
+    flavor_name: str,
+    cpu_quota: int,
+    memory_quota: str,
+) -> list[Dict[str, Any]]:
+    """Build a single-flavor resource-group list covering CPU and memory.
+
+    Args:
+        flavor_name: Value stored under the flavor's ``name`` key.
+        cpu_quota: Value stored as ``nominalQuota`` for the ``cpu`` resource.
+        memory_quota: Value stored as ``nominalQuota`` for the ``memory`` resource.
+
+    Returns:
+        A one-element list holding a dict with ``coveredResources`` and ``flavors``.
+    """
+    # Insertion order (cpu, memory) drives both the covered-resources list and
+    # the per-resource quota entries.
+    quotas = {"cpu": cpu_quota, "memory": memory_quota}
+    flavor = {
+        "name": flavor_name,
+        "resources": [{"name": resource, "nominalQuota": quota} for resource, quota in quotas.items()],
+    }
+    return [{"coveredResources": list(quotas), "flavors": [flavor]}]
+
+
+@pytest.fixture(scope="class")
+def kueue_cluster_queue_from_template(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+) -> Generator[ClusterQueue, Any, Any]:
+    """Create a cluster queue from indirect parameters and yield it.
+
+    Keys read from ``request.param``: ``name`` (required),
+    ``resource_flavor_name``, ``cpu_quota``, ``memory_quota`` and
+    ``namespace_selector`` (defaults to ``{}``).
+
+    Raises:
+        ValueError: If ``name`` is absent or ``None``.
+    """
+    params = request.param
+    queue_name = params.get("name")
+    if queue_name is None:
+        raise ValueError("name is required")
+
+    resource_groups = kueue_resource_groups(
+        params.get("resource_flavor_name"),
+        params.get("cpu_quota"),
+        params.get("memory_quota"),
+    )
+    with create_cluster_queue(
+        name=queue_name,
+        client=admin_client,
+        resource_groups=resource_groups,
+        namespace_selector=params.get("namespace_selector", {}),
+    ) as cluster_queue:
+        yield cluster_queue
+
+
+@pytest.fixture(scope="class")
+def kueue_resource_flavor_from_template(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+) -> Generator[ResourceFlavor, Any, Any]:
+    """Create a resource flavor named by ``request.param["name"]`` and yield it.
+
+    Raises:
+        ValueError: If ``name`` is absent or ``None``.
+    """
+    flavor_name = request.param.get("name")
+    if flavor_name is None:
+        raise ValueError("name is required")
+
+    with create_resource_flavor(name=flavor_name, client=admin_client) as resource_flavor:
+        yield resource_flavor
+
+
+@pytest.fixture(scope="class")
+def kueue_local_queue_from_template(
+    request: FixtureRequest,
+    unprivileged_model_namespace: Namespace,
+    admin_client: DynamicClient,
+) -> Generator[LocalQueue, Any, Any]:
+    """Create a local queue in the model namespace and yield it.
+
+    Keys read from ``request.param``: ``name`` and ``cluster_queue``
+    (both required).
+
+    Raises:
+        ValueError: If ``name`` or ``cluster_queue`` is absent or ``None``;
+            ``name`` is checked first.
+    """
+    params = request.param
+    queue_name = params.get("name")
+    if queue_name is None:
+        raise ValueError("name is required")
+
+    cluster_queue_name = params.get("cluster_queue")
+    if cluster_queue_name is None:
+        raise ValueError("cluster_queue is required")
+
+    with create_local_queue(
+        name=queue_name,
+        namespace=unprivileged_model_namespace.name,
+        cluster_queue=cluster_queue_name,
+        client=admin_client,
+    ) as local_queue:
+        yield local_queue
+
+
+@pytest.fixture(scope="class")
+def kueue_raw_inference_service(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    unprivileged_model_namespace: Namespace,
+    kueue_kserve_serving_runtime: ServingRuntime,
+    ci_endpoint_s3_secret: Secret,
+) -> Generator[InferenceService, Any, Any]:
+    """Create a raw-deployment InferenceService with an external route and yield it.
+
+    Keys read from ``request.param``:
+        * ``name``, ``model-dir``, ``model-version`` - required (``KeyError`` if missing).
+        * ``labels`` - optional, defaults to ``{}``.
+        * ``resources`` - optional, defaults to 1 CPU / 8Gi requests and
+          2 CPU / 10Gi limits.
+        * ``min-replicas`` / ``max-replicas`` - optional, default 1 / 2.
+    """
+    params = request.param
+    fallback_resources = {
+        "requests": {"cpu": "1", "memory": "8Gi"},
+        "limits": {"cpu": "2", "memory": "10Gi"},
+    }
+    isvc_kwargs = {
+        "client": admin_client,
+        "name": f"{params['name']}-raw",
+        "namespace": unprivileged_model_namespace.name,
+        "external_route": True,
+        "runtime": kueue_kserve_serving_runtime.name,
+        "storage_path": params["model-dir"],
+        "storage_key": ci_endpoint_s3_secret.name,
+        "model_format": ModelAndFormat.OPENVINO_IR,
+        "deployment_mode": KServeDeploymentType.RAW_DEPLOYMENT,
+        "model_version": params["model-version"],
+        "labels": params.get("labels", {}),
+        "resources": params.get("resources", fallback_resources),
+        "min_replicas": params.get("min-replicas", 1),
+        "max_replicas": params.get("max-replicas", 2),
+    }
+    with create_isvc(**isvc_kwargs) as isvc:
+        yield isvc
+
+
+@pytest.fixture(scope="class")
+def kueue_kserve_inference_service(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    unprivileged_model_namespace: Namespace,
+    kueue_kserve_serving_runtime: ServingRuntime,
+    ci_endpoint_s3_secret: Secret,
+) -> Generator[InferenceService, Any, Any]:
+    """Create an InferenceService in the requested deployment mode and yield it.
+
+    Keys read from ``request.param``:
+        * ``deployment-mode``, ``name``, ``model-dir``, ``model-version`` -
+          required (``KeyError`` if missing). The service name is
+          ``<name>-<deployment-mode lower-cased>``.
+        * ``labels`` - optional, defaults to ``{}``.
+        * ``resources`` - optional, defaults to 1 CPU / 8Gi requests and
+          2 CPU / 10Gi limits.
+        * ``min-replicas`` / ``max-replicas`` - optional, default 1 / 2.
+          When ``min-replicas`` is 0, ``wait_for_predictor_pods=False`` is
+          passed to ``create_isvc``.
+        * ``env-vars``, ``scale-metric`` - optional, forwarded only when truthy.
+        * ``scale-target`` - optional, forwarded whenever it is not ``None``.
+    """
+    params = request.param
+    deployment_mode = params["deployment-mode"]
+    min_replicas = params.get("min-replicas", 1)
+    isvc_kwargs = {
+        "client": admin_client,
+        "name": f"{params['name']}-{deployment_mode.lower()}",
+        "namespace": unprivileged_model_namespace.name,
+        "runtime": kueue_kserve_serving_runtime.name,
+        "storage_path": params["model-dir"],
+        "storage_key": ci_endpoint_s3_secret.name,
+        "model_format": ModelAndFormat.OPENVINO_IR,
+        "deployment_mode": deployment_mode,
+        "model_version": params["model-version"],
+        "labels": params.get("labels", {}),
+        "resources": params.get(
+            "resources", {"requests": {"cpu": "1", "memory": "8Gi"}, "limits": {"cpu": "2", "memory": "10Gi"}}
+        ),
+        "min_replicas": min_replicas,
+        "max_replicas": params.get("max-replicas", 2),
+    }
+
+    if env_vars := params.get("env-vars"):
+        isvc_kwargs["model_env_variables"] = env_vars
+
+    # With zero minimum replicas there may be no predictor pod to wait for.
+    if min_replicas == 0:
+        isvc_kwargs["wait_for_predictor_pods"] = False
+
+    if scale_metric := params.get("scale-metric"):
+        isvc_kwargs["scale_metric"] = scale_metric
+
+    # A scale target of 0 is a legitimate value, so only None is skipped.
+    if (scale_target := params.get("scale-target")) is not None:
+        isvc_kwargs["scale_target"] = scale_target
+
+    # Use the module logger instead of a bare print for diagnostic output.
+    BASIC_LOGGER.info("isvc_kwargs before create_isvc: %s", isvc_kwargs)
+    with create_isvc(**isvc_kwargs) as isvc:
+        yield isvc
+
+
+@pytest.fixture(scope="class")
+def kueue_kserve_serving_runtime(
+    request: FixtureRequest,
+    unprivileged_client: DynamicClient,
+    unprivileged_model_namespace: Namespace,
+) -> Generator[ServingRuntime, Any, Any]:
+    """Create a single-model OVMS KServe serving runtime from its template and yield it.
+
+    Keys read from ``request.param``:
+        * ``runtime-name`` - required (``KeyError`` if missing).
+        * ``model-format``, ``supported-model-formats``, ``runtime-image`` -
+          optional; each is forwarded to ``ServingRuntimeFromTemplate`` only
+          when present and truthy.
+    """
+    params = request.param
+    runtime_kwargs = {
+        "client": unprivileged_client,
+        "namespace": unprivileged_model_namespace.name,
+        "name": params["runtime-name"],
+        "template_name": RuntimeTemplates.OVMS_KSERVE,
+        "multi_model": False,
+        "resources": {
+            ModelFormat.OVMS: {
+                "requests": {"cpu": "1", "memory": "8Gi"},
+                "limits": {"cpu": "2", "memory": "10Gi"},
+            }
+        },
+    }
+
+    # Maps each optional parameter key to the keyword argument it feeds.
+    optional_overrides = {
+        "model-format": "model_format_name",
+        "supported-model-formats": "supported_model_formats",
+        "runtime-image": "runtime_image",
+    }
+    for param_key, kwarg_name in optional_overrides.items():
+        if override := params.get(param_key):
+            runtime_kwargs[kwarg_name] = override
+
+    with ServingRuntimeFromTemplate(**runtime_kwargs) as model_runtime:
+        yield model_runtime
@@ -0,0 +1,157 @@
+"""
+Integration test for Kueue and InferenceService admission control.
+This test imports the reusable test logic from utilities.kueue_utils.
+"""
+
+import pytest
+from ocp_resources.deployment import Deployment
+from timeout_sampler import TimeoutExpiredError, TimeoutSampler
+from utilities.constants import RunTimeConfigs, KServeDeploymentType, ModelVersion
+from utilities.general import create_isvc_label_selector_str
+from utilities.kueue_utils import check_gated_pods_and_running_pods
+
+# Markers applied to every test in this module.
+pytestmark = [
+    pytest.mark.rawdeployment,
+    pytest.mark.sanity,
+    pytest.mark.usefixtures("valid_aws_config"),
+    pytest.mark.kueue,
+    pytest.mark.smoke,
+]
+
+# Names of the namespace and Kueue objects created through the parametrized fixtures below.
+NAMESPACE_NAME = "kueue-isvc-raw-test"
+LOCAL_QUEUE_NAME = "local-queue-raw"
+CLUSTER_QUEUE_NAME = "cluster-queue-raw"
+RESOURCE_FLAVOR_NAME = "default-flavor-raw"
+# The same values are used as the cluster queue quota and as the per-pod limits of the ISVC.
+CPU_QUOTA = 2
+MEMORY_QUOTA = "10Gi"
+# NOTE(review): presumably one pod's requests (1 CPU / 8Gi) fit the quota while two
+# (16Gi > 10Gi) do not, which is what the running/gated expectations rely on - confirm
+# against Kueue's quota accounting.
+ISVC_RESOURCES = {"requests": {"cpu": "1", "memory": "8Gi"}, "limits": {"cpu": CPU_QUOTA, "memory": MEMORY_QUOTA}}
+# MIN_REPLICAS must stay at 1: the test asserts the initial spec replica count equals
+# EXPECTED_INITIAL_REPLICAS. Changing it requires updating the test to check the number
+# of available replicas instead.
+MIN_REPLICAS = 1
+MAX_REPLICAS = 2
+# Pod counts expected once the ISVC has been scaled to EXPECTED_UPDATED_REPLICAS.
+EXPECTED_RUNNING_PODS = 1
+EXPECTED_GATED_PODS = 1
+# Number of Deployments expected to match the ISVC label selector.
+EXPECTED_DEPLOYMENTS = 1
+# Deployment replica counts before and after minReplicas is patched on the ISVC.
+EXPECTED_INITIAL_REPLICAS = 1
+EXPECTED_UPDATED_REPLICAS = 2
+
+
+@pytest.mark.rawdeployment
+@pytest.mark.parametrize(
+    "unprivileged_model_namespace, kueue_kserve_serving_runtime, kueue_raw_inference_service, "
+    "kueue_cluster_queue_from_template, kueue_resource_flavor_from_template, kueue_local_queue_from_template",
+    [
+        pytest.param(
+            {"name": NAMESPACE_NAME, "add-kueue-label": True},
+            RunTimeConfigs.ONNX_OPSET13_RUNTIME_CONFIG,
+            {
+                "name": "kueue-isvc-raw",
+                "min-replicas": MIN_REPLICAS,
+                "max-replicas": MAX_REPLICAS,
+                "labels": {"kueue.x-k8s.io/queue-name": LOCAL_QUEUE_NAME},
+                "deployment-mode": KServeDeploymentType.RAW_DEPLOYMENT,
+                "model-dir": "test-dir",
+                "model-version": ModelVersion.OPSET13,
+                "resources": ISVC_RESOURCES,
+            },
+            {
+                "name": CLUSTER_QUEUE_NAME,
+                "resource_flavor_name": RESOURCE_FLAVOR_NAME,
+                "cpu_quota": CPU_QUOTA,
+                "memory_quota": MEMORY_QUOTA,
+                # An empty selector is used here; to restrict the queue to the test namespace,
+                # use {"matchLabels": {"kubernetes.io/metadata.name": NAMESPACE_NAME}} instead.
+                "namespace_selector": {},
+            },
+            {"name": RESOURCE_FLAVOR_NAME},
+            {"name": LOCAL_QUEUE_NAME, "cluster_queue": CLUSTER_QUEUE_NAME},
+        )
+    ],
+    indirect=True,
+)
+class TestKueueInferenceServiceRaw:
+    """Kueue admission-control test for a raw-deployment InferenceService."""
+
+    def _get_deployment_status_replicas(self, deployment: Deployment) -> int:
+        """Return ``status.replicas`` read from the deployment's current instance."""
+        # NOTE(review): `.instance` is read immediately afterwards; confirm whether this
+        # `.get()` call is actually required to obtain fresh status.
+        deployment.get()
+        return deployment.instance.status.replicas
+
+    def test_kueue_inference_service_raw(
+        self,
+        admin_client,
+        kueue_resource_flavor_from_template,
+        kueue_cluster_queue_from_template,
+        kueue_local_queue_from_template,
+        kueue_raw_inference_service,
+        kueue_kserve_serving_runtime,
+    ):
+        """Scale the ISVC past the queue quota and verify the extra pod is gated.
+
+        Steps:
+            1. Find the single Deployment backing the ISVC and check it starts
+               with ``EXPECTED_INITIAL_REPLICAS`` replicas.
+            2. Patch ``spec.predictor.minReplicas`` to ``EXPECTED_UPDATED_REPLICAS``
+               and wait for the Deployment status to reflect it.
+            3. Wait until exactly ``EXPECTED_RUNNING_PODS`` pods run and
+               ``EXPECTED_GATED_PODS`` pods are gated.
+            4. Check the ISVC reports ``EXPECTED_RUNNING_PODS`` total model copies.
+        """
+        deployment_labels = [
+            create_isvc_label_selector_str(
+                isvc=kueue_raw_inference_service,
+                resource_type="deployment",
+                runtime_name=kueue_kserve_serving_runtime.name,
+            )
+        ]
+        pod_labels = [
+            create_isvc_label_selector_str(
+                isvc=kueue_raw_inference_service,
+                resource_type="pod",
+                runtime_name=kueue_kserve_serving_runtime.name,
+            )
+        ]
+        deployments = list(
+            Deployment.get(
+                label_selector=",".join(deployment_labels),
+                namespace=kueue_raw_inference_service.namespace,
+                dyn_client=admin_client,
+            )
+        )
+        assert len(deployments) == EXPECTED_DEPLOYMENTS, (
+            f"Expected {EXPECTED_DEPLOYMENTS} deployment, got {len(deployments)}"
+        )
+
+        deployment = deployments[0]
+        deployment.wait_for_replicas(deployed=True)
+        replicas = deployment.instance.spec.replicas
+        assert replicas == EXPECTED_INITIAL_REPLICAS, (
+            f"Deployment should have {EXPECTED_INITIAL_REPLICAS} replica, got {replicas}"
+        )
+
+        # Update inference service to request 2 replicas
+        isvc_to_update = kueue_raw_inference_service.instance.to_dict()
+        isvc_to_update["spec"]["predictor"]["minReplicas"] = EXPECTED_UPDATED_REPLICAS
+        kueue_raw_inference_service.update(isvc_to_update)
+
+        # Check the deployment until it has 2 replicas, which means it's been updated.
+        # The last sample is pre-initialised so the failure message is always well-defined,
+        # even if the sampler times out before yielding anything.
+        status_replicas = None
+        try:
+            for status_replicas in TimeoutSampler(
+                wait_timeout=30,
+                sleep=2,
+                func=lambda: self._get_deployment_status_replicas(deployment),
+            ):
+                if status_replicas == EXPECTED_UPDATED_REPLICAS:
+                    break
+        except TimeoutExpiredError:
+            pytest.fail(
+                f"Timeout waiting for deployment to report {EXPECTED_UPDATED_REPLICAS} replicas, "
+                f"got {status_replicas}"
+            )
+
+        # Verify only 1 pod is running due to Kueue admission control, 1 pod is pending due to Kueue admission control
+        running_pods = gated_pods = None
+        try:
+            for running_pods, gated_pods in TimeoutSampler(
+                wait_timeout=30,
+                sleep=2,
+                func=lambda: check_gated_pods_and_running_pods(
+                    pod_labels, kueue_raw_inference_service.namespace, admin_client
+                ),
+            ):
+                if running_pods == EXPECTED_RUNNING_PODS and gated_pods == EXPECTED_GATED_PODS:
+                    break
+        except TimeoutExpiredError:
+            # pytest.fail cannot be optimised away, unlike `assert False` under `python -O`.
+            pytest.fail(
+                f"Timeout waiting for {EXPECTED_RUNNING_PODS} running pods and "
+                f"{EXPECTED_GATED_PODS} gated pods, got {running_pods} running pods and {gated_pods} gated pods"
+            )
+
+        # Read the ISVC status after the pods have settled.
+        kueue_raw_inference_service.get()
+        isvc = kueue_raw_inference_service.instance
+        total_copies = isvc.status.modelStatus.copies.totalCopies
+        assert total_copies == EXPECTED_RUNNING_PODS, (
+            f"InferenceService should have {EXPECTED_RUNNING_PODS} total model copy, got {total_copies}"
+        )