Skip to content

Commit 699fc21

Browse files
committed
[RHOAIENG-25148] Addressed PR comments. Added more checks. Fixed teardown failure.
Signed-off-by: Andres Llausas <allausas@redhat.com>
1 parent 407dcb8 commit 699fc21

4 files changed

Lines changed: 153 additions & 54 deletions

File tree

tests/model_serving/model_server/kueue/test_kueue_isvc_raw.py

Lines changed: 76 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
from utilities.exceptions import DeploymentValidationError
1111
from utilities.constants import RunTimeConfigs, KServeDeploymentType, ModelVersion, Timeout
1212
from utilities.general import create_isvc_label_selector_str
13+
from ocp_resources.pod import Pod
1314

1415

1516
pytestmark = [
@@ -18,16 +19,17 @@
1819
pytest.mark.usefixtures("valid_aws_config"),
1920
]
2021

21-
local_queue_name = "local-queue-raw"
22-
cluster_queue_name = "cluster-queue-raw"
23-
resource_flavor_name = "default-flavor-raw"
24-
cpu_quota = 2
25-
memory_quota = "10Gi"
26-
isvc_resources = {"requests": {"cpu": "1", "memory": "8Gi"}, "limits": {"cpu": cpu_quota, "memory": memory_quota}}
27-
min_replicas = (
22+
NAMESPACE_NAME = "kueue-isvc-raw-test"
23+
LOCAL_QUEUE_NAME = "local-queue-raw"
24+
CLUSTER_QUEUE_NAME = "cluster-queue-raw"
25+
RESOURCE_FLAVOR_NAME = "default-flavor-raw"
26+
CPU_QUOTA = 2
27+
MEMORY_QUOTA = "10Gi"
28+
ISVC_RESOURCES = {"requests": {"cpu": "1", "memory": "8Gi"}, "limits": {"cpu": CPU_QUOTA, "memory": MEMORY_QUOTA}}
29+
MIN_REPLICAS = (
2830
1 # min_replicas needs to be 1 or you need to change the test to check for the number of available replicas
2931
)
30-
max_replicas = 2
32+
MAX_REPLICAS = 2
3133

3234

3335
@pytest.mark.rawdeployment
@@ -36,27 +38,28 @@
3638
"kueue_cluster_queue_from_template, kueue_resource_flavor_from_template, kueue_local_queue_from_template",
3739
[
3840
pytest.param(
39-
{"name": "kueue-isvc-raw-test", "add-kueue-label": True},
41+
{"name": NAMESPACE_NAME, "add-kueue-label": True},
4042
RunTimeConfigs.ONNX_OPSET13_RUNTIME_CONFIG,
4143
{
4244
"name": "kueue-isvc-raw",
43-
"min-replicas": min_replicas,
44-
"max-replicas": max_replicas,
45-
"labels": {"kueue.x-k8s.io/queue-name": local_queue_name},
45+
"min-replicas": MIN_REPLICAS,
46+
"max-replicas": MAX_REPLICAS,
47+
"labels": {"kueue.x-k8s.io/queue-name": LOCAL_QUEUE_NAME},
4648
"deployment-mode": KServeDeploymentType.RAW_DEPLOYMENT,
4749
"model-dir": "test-dir",
4850
"model-version": ModelVersion.OPSET13,
49-
"resources": isvc_resources,
51+
"resources": ISVC_RESOURCES,
5052
},
5153
{
52-
"name": cluster_queue_name,
53-
"resource_flavor_name": resource_flavor_name,
54-
"cpu_quota": cpu_quota,
55-
"memory_quota": memory_quota,
54+
"name": CLUSTER_QUEUE_NAME,
55+
"resource_flavor_name": RESOURCE_FLAVOR_NAME,
56+
"cpu_quota": CPU_QUOTA,
57+
"memory_quota": MEMORY_QUOTA,
58+
# "namespace_selector": {"matchLabels": {"kubernetes.io/metadata.name": NAMESPACE_NAME}},
5659
"namespace_selector": {},
5760
},
58-
{"name": resource_flavor_name},
59-
{"name": local_queue_name, "cluster_queue": cluster_queue_name},
61+
{"name": RESOURCE_FLAVOR_NAME},
62+
{"name": LOCAL_QUEUE_NAME, "cluster_queue": CLUSTER_QUEUE_NAME},
6063
)
6164
],
6265
indirect=True,
@@ -67,8 +70,8 @@ class TestKueueInferenceServiceRaw:
6770
def test_kueue_inference_service_raw(
6871
self,
6972
admin_client,
70-
kueue_cluster_queue_from_template,
7173
kueue_resource_flavor_from_template,
74+
kueue_cluster_queue_from_template,
7275
kueue_local_queue_from_template,
7376
kueue_raw_inference_service,
7477
kueue_kserve_serving_runtime,
@@ -89,12 +92,14 @@ def test_kueue_inference_service_raw(
8992
)
9093
)
9194
if len(deployments) != 1:
92-
raise DeploymentValidationError("Too many deployments found")
95+
deployment_names = [deployment.instance.metadata.name for deployment in deployments]
96+
raise DeploymentValidationError(f"Expected 1 deployment, got {len(deployments)}: {deployment_names}")
9397

9498
deployment = deployments[0]
9599
deployment.wait_for_replicas(deployed=True)
96-
if deployment.instance.spec.replicas != 1:
97-
raise DeploymentValidationError("Deployment should have 1 replica")
100+
replicas = deployment.instance.spec.replicas
101+
if replicas != 1:
102+
raise DeploymentValidationError(f"Deployment should have 1 replica, got {replicas}")
98103

99104
# Update inference service to request 2 replicas
100105
isvc_to_update = kueue_raw_inference_service.instance.to_dict()
@@ -113,11 +118,55 @@ def test_kueue_inference_service_raw(
113118
)
114119
)
115120
if len(deployments) != 1:
116-
raise DeploymentValidationError("Too many deployments found")
121+
deployment_names = [deployment.instance.metadata.name for deployment in deployments]
122+
raise DeploymentValidationError(f"Expected 1 deployment, got {len(deployments)}: {deployment_names}")
117123

118124
deployment = deployments[0]
119125
try:
120126
deployment.wait_for_replicas(deployed=True, timeout=Timeout.TIMEOUT_30SEC)
121-
except TimeoutExpiredError:
122-
if deployment.instance.status.availableReplicas != 1:
123-
raise DeploymentValidationError("Deployment should have 1 available replica") from None
127+
except TimeoutExpiredError as e:
128+
available_replicas = deployment.instance.status.availableReplicas
129+
if available_replicas != 1:
130+
raise DeploymentValidationError(
131+
f"Deployment should have 1 available replica, got {available_replicas}"
132+
) from None
133+
# Get pods that match isvc labels and verify their status
134+
pods = list(
135+
Pod.get(
136+
label_selector=",".join(labels),
137+
namespace=kueue_raw_inference_service.namespace,
138+
dyn_client=admin_client,
139+
)
140+
)
141+
142+
if len(pods) != 2:
143+
pod_names = [pod.instance.metadata.name for pod in pods]
144+
raise DeploymentValidationError(f"Expected 2 pods, got {len(pods)}: {pod_names}") from e
145+
146+
running_pods = 0
147+
gated_pods = 0
148+
for pod in pods:
149+
pod_phase = pod.instance.status.phase
150+
if pod_phase == "Running":
151+
running_pods += 1
152+
elif pod_phase == "Pending" and all(
153+
condition.type == "PodScheduled"
154+
and condition.status == "False"
155+
and condition.reason == "SchedulingGated"
156+
for condition in pod.instance.status.conditions
157+
):
158+
gated_pods += 1
159+
160+
if running_pods != 1 or gated_pods != 1:
161+
raise DeploymentValidationError(
162+
f"Expected 1 Running pod and 1 SchedulingGated pod, "
163+
f"got {running_pods} Running and {gated_pods} SchedulingGated"
164+
) from e
165+
# Check InferenceService status for total model copies
166+
# Refresh the isvc instance to get latest status
167+
kueue_raw_inference_service.get()
168+
isvc = kueue_raw_inference_service.instance
169+
if isvc.status.modelStatus.copies.totalCopies != 1:
170+
raise DeploymentValidationError(
171+
f"InferenceService should have 1 total model copy, got {isvc.status.modelStatus.copies.totalCopies}"
172+
) from e

tests/model_serving/model_server/kueue/test_kueue_isvc_serverless.py

Lines changed: 71 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55

66
import pytest
77
from ocp_resources.deployment import Deployment
8+
from ocp_resources.pod import Pod
89
from timeout_sampler import TimeoutExpiredError
910
from utilities.exceptions import DeploymentValidationError
1011
from utilities.constants import RunTimeConfigs, KServeDeploymentType, Timeout
@@ -18,16 +19,17 @@
1819
pytest.mark.usefixtures("valid_aws_config"),
1920
]
2021

21-
local_queue_name = "local-queue-serverless"
22-
cluster_queue_name = "cluster-queue-serverless"
23-
resource_flavor_name = "default-flavor-serverless"
24-
cpu_quota = 2
25-
memory_quota = "10Gi"
26-
isvc_resources = {"requests": {"cpu": "1", "memory": "8Gi"}, "limits": {"cpu": cpu_quota, "memory": memory_quota}}
27-
min_replicas = (
22+
NAMESPACE_NAME = "kueue-isvc-serverless-test"
23+
LOCAL_QUEUE_NAME = "local-queue-serverless"
24+
CLUSTER_QUEUE_NAME = "cluster-queue-serverless"
25+
RESOURCE_FLAVOR_NAME = "default-flavor-serverless"
26+
CPU_QUOTA = 2
27+
MEMORY_QUOTA = "10Gi"
28+
ISVC_RESOURCES = {"requests": {"cpu": "1", "memory": "8Gi"}, "limits": {"cpu": CPU_QUOTA, "memory": MEMORY_QUOTA}}
29+
MIN_REPLICAS = (
2830
1 # min_replicas needs to be 1 or you need to change the test to check for the number of available replicas
2931
)
30-
max_replicas = 2
32+
MAX_REPLICAS = 2
3133

3234

3335
@pytest.mark.serverless
@@ -36,26 +38,26 @@
3638
"kueue_cluster_queue_from_template, kueue_resource_flavor_from_template, kueue_local_queue_from_template",
3739
[
3840
pytest.param(
39-
{"name": "kueue-isvc-serverless-test", "add-kueue-label": True},
41+
{"name": NAMESPACE_NAME, "add-kueue-label": True},
4042
RunTimeConfigs.ONNX_OPSET13_RUNTIME_CONFIG,
4143
{
4244
**ONNX_SERVERLESS_INFERENCE_SERVICE_CONFIG,
4345
"name": "kueue",
44-
"min-replicas": min_replicas,
45-
"max-replicas": max_replicas,
46-
"labels": {"kueue.x-k8s.io/queue-name": local_queue_name},
46+
"min-replicas": MIN_REPLICAS,
47+
"max-replicas": MAX_REPLICAS,
48+
"labels": {"kueue.x-k8s.io/queue-name": LOCAL_QUEUE_NAME},
4749
"deployment-mode": KServeDeploymentType.SERVERLESS,
48-
"resources": isvc_resources,
50+
"resources": ISVC_RESOURCES,
4951
},
5052
{
51-
"name": cluster_queue_name,
52-
"resource_flavor_name": resource_flavor_name,
53-
"cpu_quota": cpu_quota,
54-
"memory_quota": memory_quota,
55-
"namespace_selector": {},
53+
"name": CLUSTER_QUEUE_NAME,
54+
"resource_flavor_name": RESOURCE_FLAVOR_NAME,
55+
"cpu_quota": CPU_QUOTA,
56+
"memory_quota": MEMORY_QUOTA,
57+
"namespace_selector": {"matchLabels": {"kubernetes.io/metadata.name": NAMESPACE_NAME}},
5658
},
57-
{"name": resource_flavor_name},
58-
{"name": local_queue_name, "cluster_queue": cluster_queue_name},
59+
{"name": RESOURCE_FLAVOR_NAME},
60+
{"name": LOCAL_QUEUE_NAME, "cluster_queue": CLUSTER_QUEUE_NAME},
5961
)
6062
],
6163
indirect=True,
@@ -66,8 +68,8 @@ class TestKueueInferenceServiceServerless:
6668
def test_kueue_inference_service_serverless(
6769
self,
6870
admin_client,
69-
kueue_cluster_queue_from_template,
7071
kueue_resource_flavor_from_template,
72+
kueue_cluster_queue_from_template,
7173
kueue_local_queue_from_template,
7274
kueue_kserve_inference_service,
7375
kueue_kserve_serving_runtime,
@@ -89,12 +91,14 @@ def test_kueue_inference_service_serverless(
8991
)
9092
)
9193
if len(deployments) != 1:
92-
raise DeploymentValidationError("Too many deployments found")
94+
deployment_names = [deployment.instance.metadata.name for deployment in deployments]
95+
raise DeploymentValidationError(f"Expected 1 deployment, got {len(deployments)}: {deployment_names}")
9396

9497
deployment = deployments[0]
9598
deployment.wait_for_replicas(deployed=True)
96-
if deployment.instance.spec.replicas != 1:
97-
raise DeploymentValidationError("Deployment should have 1 replica")
99+
replicas = deployment.instance.spec.replicas
100+
if replicas != 1:
101+
raise DeploymentValidationError(f"Deployment should have 1 replica, got {replicas}")
98102

99103
# Update inference service to request 2 replicas
100104
isvc_to_update = kueue_kserve_inference_service.instance.to_dict()
@@ -119,8 +123,49 @@ def test_kueue_inference_service_serverless(
119123
# it means spec.replicas == status.replicas == status.updatedReplicas ==
120124
# status.availableReplicas == status.readyReplicas
121125
deployment.wait_for_replicas(deployed=True, timeout=Timeout.TIMEOUT_30SEC)
122-
total_available_replicas += deployment.instance.spec.replicas
126+
total_available_replicas += deployment.instance.status.availableReplicas
123127
except TimeoutExpiredError:
124128
pass
125129
if total_available_replicas != 1:
126-
raise DeploymentValidationError("Total available replicas across all deployments should be 1")
130+
raise DeploymentValidationError(
131+
f"Total available replicas across all deployments should be 1, got {total_available_replicas}"
132+
)
133+
# Get pods that match isvc labels and verify their status
134+
pods = list(
135+
Pod.get(
136+
label_selector=",".join(labels),
137+
namespace=kueue_kserve_inference_service.namespace,
138+
dyn_client=admin_client,
139+
)
140+
)
141+
142+
if len(pods) != 3:
143+
pod_names = [pod.instance.metadata.name for pod in pods]
144+
raise DeploymentValidationError(f"Expected 3 pods, got {len(pods)}: {pod_names}")
145+
146+
running_pods = 0
147+
gated_pods = 0
148+
for pod in pods:
149+
pod_phase = pod.instance.status.phase
150+
if pod_phase == "Running":
151+
running_pods += 1
152+
elif pod_phase == "Pending" and all(
153+
condition.type == "PodScheduled"
154+
and condition.status == "False"
155+
and condition.reason == "SchedulingGated"
156+
for condition in pod.instance.status.conditions
157+
):
158+
gated_pods += 1
159+
160+
if running_pods != 1 or gated_pods != 2:
161+
raise DeploymentValidationError(
162+
f"Expected 1 Running pod and 2 SchedulingGated pods, "
163+
f"got {running_pods} Running and {gated_pods} SchedulingGated"
164+
)
165+
# Refresh the isvc instance to get latest status
166+
kueue_kserve_inference_service.get()
167+
isvc = kueue_kserve_inference_service.instance
168+
if isvc.status.modelStatus.copies.totalCopies != 1:
169+
raise DeploymentValidationError(
170+
f"InferenceService should have 1 total model copy, got {isvc.status.modelStatus.copies.totalCopies}"
171+
)

utilities/constants.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -187,6 +187,11 @@ class Kserve:
187187
class Nvidia:
188188
NVIDIA_COM_GPU: str = "nvidia.com/gpu"
189189

190+
class Kueue:
191+
# TODO: Change to kueue.openshift.io/managed once it's working
192+
MANAGED: str = "kueue-managed"
193+
# MANAGED: str = "kueue.openshift.io/managed"
194+
190195

191196
class Timeout:
192197
TIMEOUT_30SEC: int = 30

utilities/infra.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -127,7 +127,7 @@ def create_ns(
127127
namespace_kwargs["label"][Labels.OpenDataHub.DASHBOARD] = "true" # type: ignore
128128

129129
if add_kueue_label:
130-
namespace_kwargs["label"]["kueue-managed"] = "true" # type: ignore
130+
namespace_kwargs["label"][Labels.Kueue.MANAGED] = "true" # type: ignore
131131

132132
if unprivileged_client:
133133
with ProjectRequest(name=name, client=unprivileged_client, teardown=teardown):

0 commit comments

Comments
 (0)