Skip to content

Commit 6ca5685

Browse files
authored
tighten e2e fixture ensure correctness (#1032)
* tighten e2e fixture ensure correctness Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * rm nil contract noise Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> --------- Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent 934737c commit 6ca5685

3 files changed

Lines changed: 50 additions & 29 deletions

File tree

test/e2e/fixtures/inference_objective.go

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,25 +21,32 @@ var inferenceObjectiveGVR = schema.GroupVersionResource{
2121
Resource: "inferenceobjectives",
2222
}
2323

24-
// EnsureInferenceObjective creates the e2e-default InferenceObjective for GIE flow-control
24+
const defaultInferenceObjectiveName = "e2e-default"
25+
26+
// EnsureInferenceObjective creates the default InferenceObjective for GIE flow-control
2527
// queuing when the CRD exists. poolName must match the InferencePool metadata.name (typically the
2628
// EPP Service name with an "-epp" suffix removed, e.g. gaie-sim-epp → gaie-sim).
2729
//
2830
// Returns applied=true if the object exists or was created. If the InferenceObjective API is not
2931
// available on the cluster, returns (false, nil).
3032
func EnsureInferenceObjective(ctx context.Context, dc dynamic.Interface, namespace, poolName string) (applied bool, err error) {
33+
return EnsureInferenceObjectiveNamed(ctx, dc, namespace, defaultInferenceObjectiveName, poolName)
34+
}
35+
36+
// EnsureInferenceObjectiveNamed creates or updates a named InferenceObjective when the CRD exists.
37+
func EnsureInferenceObjectiveNamed(ctx context.Context, dc dynamic.Interface, namespace, objectiveName, poolName string) (applied bool, err error) {
3138
ri := dc.Resource(inferenceObjectiveGVR).Namespace(namespace)
3239
poolGroup, gErr := resolveInferencePoolGroup(ctx, dc, namespace, poolName)
3340
if gErr != nil {
3441
return false, gErr
3542
}
36-
obj := buildInferenceObjective(namespace, poolName, poolGroup)
43+
obj := buildInferenceObjective(namespace, objectiveName, poolName, poolGroup)
3744

3845
if _, cErr := ri.Create(ctx, obj, metav1.CreateOptions{}); cErr != nil {
3946
if apierrors.IsAlreadyExists(cErr) {
40-
current, getErr := ri.Get(ctx, "e2e-default", metav1.GetOptions{})
47+
current, getErr := ri.Get(ctx, objectiveName, metav1.GetOptions{})
4148
if getErr != nil {
42-
return false, fmt.Errorf("get existing InferenceObjective e2e-default: %w", getErr)
49+
return false, fmt.Errorf("get existing InferenceObjective %s: %w", objectiveName, getErr)
4350
}
4451

4552
currentSpec, _, specErr := unstructured.NestedMap(current.Object, "spec")
@@ -58,34 +65,39 @@ func EnsureInferenceObjective(ctx context.Context, dc dynamic.Interface, namespa
5865
return false, fmt.Errorf("set desired InferenceObjective spec: %w", setErr)
5966
}
6067
if _, uErr := ri.Update(ctx, current, metav1.UpdateOptions{}); uErr != nil {
61-
return false, fmt.Errorf("update InferenceObjective e2e-default: %w", uErr)
68+
return false, fmt.Errorf("update InferenceObjective %s: %w", objectiveName, uErr)
6269
}
6370
return true, nil
6471
}
6572
if inferenceObjectiveAPIMissing(cErr) {
6673
return false, nil
6774
}
68-
return false, fmt.Errorf("create InferenceObjective e2e-default: %w", cErr)
75+
return false, fmt.Errorf("create InferenceObjective %s: %w", objectiveName, cErr)
6976
}
7077
return true, nil
7178
}
7279

73-
// DeleteInferenceObjective removes e2e-default InferenceObjective if present.
80+
// DeleteInferenceObjective removes the default InferenceObjective if present.
7481
func DeleteInferenceObjective(ctx context.Context, dc dynamic.Interface, namespace string) error {
75-
err := dc.Resource(inferenceObjectiveGVR).Namespace(namespace).Delete(ctx, "e2e-default", metav1.DeleteOptions{})
82+
return DeleteInferenceObjectiveNamed(ctx, dc, namespace, defaultInferenceObjectiveName)
83+
}
84+
85+
// DeleteInferenceObjectiveNamed removes a named InferenceObjective if present.
86+
func DeleteInferenceObjectiveNamed(ctx context.Context, dc dynamic.Interface, namespace, objectiveName string) error {
87+
err := dc.Resource(inferenceObjectiveGVR).Namespace(namespace).Delete(ctx, objectiveName, metav1.DeleteOptions{})
7688
if apierrors.IsNotFound(err) || inferenceObjectiveAPIMissing(err) {
7789
return nil
7890
}
7991
return err
8092
}
8193

82-
func buildInferenceObjective(namespace, poolName, poolGroup string) *unstructured.Unstructured {
94+
func buildInferenceObjective(namespace, objectiveName, poolName, poolGroup string) *unstructured.Unstructured {
8395
return &unstructured.Unstructured{
8496
Object: map[string]interface{}{
8597
"apiVersion": "inference.networking.x-k8s.io/v1alpha2",
8698
"kind": "InferenceObjective",
8799
"metadata": map[string]interface{}{
88-
"name": "e2e-default",
100+
"name": objectiveName,
89101
"namespace": namespace,
90102
},
91103
"spec": map[string]interface{}{
@@ -145,6 +157,5 @@ func resolveInferencePoolGroup(ctx context.Context, dc dynamic.Interface, namesp
145157
return "", fmt.Errorf("detect InferencePool API group for %q: %w", poolName, err)
146158
}
147159

148-
// Default to the primary group used by llm-d charts.
149-
return "inference.networking.k8s.io", nil
160+
return "", fmt.Errorf("detect InferencePool API group for %q: no supported InferencePool resource found", poolName)
150161
}

test/e2e/fixtures/lws_builder.go

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package fixtures
33
import (
44
"context"
55
"fmt"
6+
"reflect"
67
"time"
78

89
corev1 "k8s.io/api/core/v1"
@@ -17,24 +18,23 @@ import (
1718
// EnsureModelServiceLWS creates or replaces a LeaderWorkerSet for model service (idempotent for test setup).
1819
func EnsureModelServiceLWS(ctx context.Context, crClient client.Client, k8sClient *kubernetes.Clientset, namespace, name, poolName, modelID string, useSimulator bool, maxNumSeqs int, groupSize int32) error {
1920
lwsName := name + "-decode"
21+
desiredLWS := buildModelServiceLWS(namespace, name, poolName, modelID, useSimulator, maxNumSeqs, groupSize)
2022

2123
// Check if LWS already exists
2224
existingLWS := &lwsv1.LeaderWorkerSet{}
2325
err := crClient.Get(ctx, client.ObjectKey{Name: lwsName, Namespace: namespace}, existingLWS)
2426
if err == nil {
2527
// LWS exists, check if it's ready
26-
if existingLWS.Status.ReadyReplicas > 0 {
28+
if existingLWS.Status.ReadyReplicas > 0 && modelServiceLWSMatchesDesired(*existingLWS, *desiredLWS) {
2729
return nil
2830
}
2931
// Not ready, delete and recreate
3032
propagationPolicy := metav1.DeletePropagationForeground
3133
deleteErr := crClient.Delete(ctx, existingLWS, &client.DeleteOptions{
3234
PropagationPolicy: &propagationPolicy,
3335
})
34-
if deleteErr != nil && !errors.IsNotFound(deleteErr) {
35-
if !errors.IsConflict(deleteErr) {
36-
return fmt.Errorf("delete existing LWS %s: %w", lwsName, deleteErr)
37-
}
36+
if deleteErr != nil && !errors.IsNotFound(deleteErr) && !errors.IsConflict(deleteErr) {
37+
return fmt.Errorf("delete existing LWS %s: %w", lwsName, deleteErr)
3838
}
3939
// Wait for deletion
4040
waitCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
@@ -53,19 +53,24 @@ func EnsureModelServiceLWS(ctx context.Context, crClient client.Client, k8sClien
5353
return fmt.Errorf("check existing LWS %s: %w", lwsName, err)
5454
}
5555

56-
lws := buildModelServiceLWS(namespace, name, poolName, modelID, useSimulator, maxNumSeqs, groupSize)
57-
err = crClient.Create(ctx, lws)
56+
err = crClient.Create(ctx, desiredLWS)
5857
if err != nil && errors.IsAlreadyExists(err) {
5958
propagationPolicy := metav1.DeletePropagationForeground
60-
_ = crClient.Delete(ctx, lws, &client.DeleteOptions{
59+
_ = crClient.Delete(ctx, desiredLWS, &client.DeleteOptions{
6160
PropagationPolicy: &propagationPolicy,
6261
})
6362
time.Sleep(2 * time.Second)
64-
err = crClient.Create(ctx, lws)
63+
err = crClient.Create(ctx, desiredLWS)
6564
}
6665
return err
6766
}
6867

68+
func modelServiceLWSMatchesDesired(existing, desired lwsv1.LeaderWorkerSet) bool {
69+
return reflect.DeepEqual(existing.Spec, desired.Spec) &&
70+
reflect.DeepEqual(existing.Labels, desired.Labels) &&
71+
reflect.DeepEqual(existing.Annotations, desired.Annotations)
72+
}
73+
6974
// DeleteModelServiceLWS deletes the LeaderWorkerSet for model service. Idempotent; ignores NotFound.
7075
func DeleteModelServiceLWS(ctx context.Context, crClient client.Client, namespace, name string) error {
7176
lwsName := name + "-decode"

test/e2e/fixtures/model_service_builder.go

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package fixtures
33
import (
44
"context"
55
"fmt"
6+
"reflect"
67
"strconv"
78
"time"
89

@@ -39,20 +40,19 @@ func DeleteModelService(ctx context.Context, k8sClient *kubernetes.Clientset, na
3940
func EnsureModelService(ctx context.Context, k8sClient *kubernetes.Clientset, namespace, name, poolName, modelID string, useSimulator bool, maxNumSeqs int) error {
4041
appLabel := name + "-decode"
4142
deploymentName := appLabel
43+
desiredDeployment := buildModelServiceDeployment(namespace, name, poolName, modelID, useSimulator, maxNumSeqs)
4244

4345
existingDeployment, err := k8sClient.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{})
4446
if err == nil {
45-
if existingDeployment.Status.ReadyReplicas > 0 {
47+
if existingDeployment.Status.ReadyReplicas > 0 && modelServiceDeploymentMatchesDesired(*existingDeployment, *desiredDeployment) {
4648
return nil
4749
}
4850
propagationPolicy := metav1.DeletePropagationForeground
4951
deleteErr := k8sClient.AppsV1().Deployments(namespace).Delete(ctx, deploymentName, metav1.DeleteOptions{
5052
PropagationPolicy: &propagationPolicy,
5153
})
52-
if deleteErr != nil && !errors.IsNotFound(deleteErr) {
53-
if !errors.IsConflict(deleteErr) {
54-
return fmt.Errorf("delete existing deployment %s: %w", deploymentName, deleteErr)
55-
}
54+
if deleteErr != nil && !errors.IsNotFound(deleteErr) && !errors.IsConflict(deleteErr) {
55+
return fmt.Errorf("delete existing deployment %s: %w", deploymentName, deleteErr)
5656
}
5757
if err := WaitUntilDeploymentDeleted(ctx, k8sClient, namespace, deploymentName, 2*time.Minute); err != nil {
5858
return fmt.Errorf("timeout waiting for deployment %s to be deleted: %w", deploymentName, err)
@@ -61,8 +61,7 @@ func EnsureModelService(ctx context.Context, k8sClient *kubernetes.Clientset, na
6161
return fmt.Errorf("check existing deployment %s: %w", deploymentName, err)
6262
}
6363

64-
deployment := buildModelServiceDeployment(namespace, name, poolName, modelID, useSimulator, maxNumSeqs)
65-
_, err = k8sClient.AppsV1().Deployments(namespace).Create(ctx, deployment, metav1.CreateOptions{})
64+
_, err = k8sClient.AppsV1().Deployments(namespace).Create(ctx, desiredDeployment, metav1.CreateOptions{})
6665
if err != nil && errors.IsAlreadyExists(err) {
6766
propagationPolicy := metav1.DeletePropagationForeground
6867
_ = k8sClient.AppsV1().Deployments(namespace).Delete(ctx, deploymentName, metav1.DeleteOptions{
@@ -71,11 +70,17 @@ func EnsureModelService(ctx context.Context, k8sClient *kubernetes.Clientset, na
7170
if waitErr := WaitUntilDeploymentDeleted(ctx, k8sClient, namespace, deploymentName, 2*time.Minute); waitErr != nil {
7271
return fmt.Errorf("timeout waiting for deployment %s to be deleted before recreate: %w", deploymentName, waitErr)
7372
}
74-
_, err = k8sClient.AppsV1().Deployments(namespace).Create(ctx, deployment, metav1.CreateOptions{})
73+
_, err = k8sClient.AppsV1().Deployments(namespace).Create(ctx, desiredDeployment, metav1.CreateOptions{})
7574
}
7675
return err
7776
}
7877

78+
func modelServiceDeploymentMatchesDesired(existing, desired appsv1.Deployment) bool {
79+
return reflect.DeepEqual(existing.Spec.Selector, desired.Spec.Selector) &&
80+
reflect.DeepEqual(existing.Spec.Template.Labels, desired.Spec.Template.Labels) &&
81+
reflect.DeepEqual(existing.Spec.Template.Spec, desired.Spec.Template.Spec)
82+
}
83+
7984
func buildModelServiceDeployment(namespace, name, poolName, modelID string, useSimulator bool, maxNumSeqs int) *appsv1.Deployment {
8085
appLabel := name + "-decode"
8186
image := "ghcr.io/llm-d/llm-d-inference-sim:v0.7.1"

0 commit comments

Comments
 (0)