Skip to content

Commit 746d918

Browse files
fix: prevent finalizer from blocking subscription deletion when TRLP is in bad state
Classify errors during deletion as permanent (admission rejection, invalid spec, forbidden) vs transient (timeout, network, 5xx). Only take the force-delete escape hatch on permanent errors where retrying won't help. Transient errors return for requeue, preserving the shared TRLP. When force-deleting or proceeding despite cleanup failure, emit a Kubernetes Event on the subscription so operators have an auditable breadcrumb that an orphaned TRLP may need manual cleanup. Signed-off-by: Wen Liang <liangwen12year@gmail.com>
1 parent dbf6d03 commit 746d918

3 files changed

Lines changed: 253 additions & 9 deletions

File tree

deployment/base/maas-controller/rbac/clusterrole.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ rules:
2626
- get
2727
- list
2828
- watch
29+
- apiGroups:
30+
- ""
31+
resources:
32+
- events
33+
verbs:
34+
- create
35+
- patch
2936
- apiGroups:
3037
- ""
3138
resources:

maas-controller/pkg/controller/maas/maassubscription_controller.go

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
"k8s.io/apimachinery/pkg/runtime"
3636
"k8s.io/apimachinery/pkg/runtime/schema"
3737
"k8s.io/apimachinery/pkg/types"
38+
"k8s.io/client-go/tools/record"
3839
"k8s.io/client-go/util/workqueue"
3940
ctrl "sigs.k8s.io/controller-runtime"
4041
"sigs.k8s.io/controller-runtime/pkg/builder"
@@ -52,7 +53,23 @@ import (
5253
// MaaSSubscriptionReconciler reconciles a MaaSSubscription object
5354
type MaaSSubscriptionReconciler struct {
5455
client.Client
55-
Scheme *runtime.Scheme
56+
Scheme *runtime.Scheme
57+
Recorder record.EventRecorder
58+
}
59+
60+
// isPermanentError returns true for errors that indicate a non-recoverable state
61+
// (admission rejection, invalid spec, forbidden) where retrying won't help.
62+
// Transient errors (timeouts, network, 5xx) return false and should be requeued.
63+
func isPermanentError(err error) bool {
64+
if err == nil {
65+
return false
66+
}
67+
return apierrors.IsInvalid(err) ||
68+
apierrors.IsForbidden(err) ||
69+
apierrors.IsConflict(err) ||
70+
apierrors.IsNotAcceptable(err) ||
71+
strings.Contains(err.Error(), "admission webhook") ||
72+
strings.Contains(err.Error(), "denied the request")
5673
}
5774

5875
//+kubebuilder:rbac:groups=maas.opendatahub.io,resources=maassubscriptions,verbs=get;list;watch;create;update;patch;delete
@@ -61,6 +78,7 @@ type MaaSSubscriptionReconciler struct {
6178
//+kubebuilder:rbac:groups=maas.opendatahub.io,resources=maasmodelrefs,verbs=get;list;watch
6279
//+kubebuilder:rbac:groups=kuadrant.io,resources=tokenratelimitpolicies,verbs=get;list;watch;create;update;patch;delete
6380
//+kubebuilder:rbac:groups=gateway.networking.k8s.io,resources=httproutes,verbs=get;list;watch
81+
//+kubebuilder:rbac:groups="",resources=events,verbs=create;patch
6482
//+kubebuilder:rbac:groups=gateway.networking.k8s.io,resources=httproutes/finalizers,verbs=update
6583

6684
const (
@@ -760,9 +778,6 @@ func (r *MaaSSubscriptionReconciler) deleteModelTRLP(ctx context.Context, log lo
760778

761779
func (r *MaaSSubscriptionReconciler) handleDeletion(ctx context.Context, log logr.Logger, subscription *maasv1alpha1.MaaSSubscription) (ctrl.Result, error) {
762780
if controllerutil.ContainsFinalizer(subscription, maasSubscriptionFinalizer) {
763-
// For each model referenced by this subscription, rebuild the aggregated TokenRateLimitPolicy
764-
// without the deleted subscription's limits. If no other subscriptions reference the model,
765-
// the TRLP will be deleted. This ensures zero-downtime rate limiting during subscription removal.
766781
seen := make(map[string]struct{}, len(subscription.Spec.ModelRefs))
767782
for _, modelRef := range subscription.Spec.ModelRefs {
768783
k := modelRef.Namespace + "/" + modelRef.Name
@@ -772,14 +787,31 @@ func (r *MaaSSubscriptionReconciler) handleDeletion(ctx context.Context, log log
772787
seen[k] = struct{}{}
773788
log.Info("Rebuilding TokenRateLimitPolicy without deleted subscription", "model", modelRef.Namespace+"/"+modelRef.Name, "subscription", subscription.Name)
774789
if err := r.reconcileTRLPForModel(ctx, log, modelRef.Namespace, modelRef.Name); err != nil {
775-
log.Error(err, "failed to reconcile TokenRateLimitPolicy during deletion, will retry", "model", modelRef.Namespace+"/"+modelRef.Name)
776-
return ctrl.Result{}, err
790+
if !isPermanentError(err) {
791+
log.Error(err, "transient TRLP reconciliation failure during deletion, will retry", "model", modelRef.Namespace+"/"+modelRef.Name)
792+
return ctrl.Result{}, err
793+
}
794+
log.Error(err, "permanent TRLP reconciliation failure during deletion, falling back to force-delete", "model", modelRef.Namespace+"/"+modelRef.Name)
795+
if delErr := r.deleteModelTRLP(ctx, log, modelRef.Namespace, modelRef.Name); delErr != nil {
796+
log.Error(delErr, "TRLP force-delete also failed, proceeding with finalizer removal", "model", modelRef.Namespace+"/"+modelRef.Name)
797+
}
798+
if r.Recorder != nil {
799+
r.Recorder.Eventf(subscription, "Warning", "TRLPCleanupFailed",
800+
"TRLP cleanup failed for model %s/%s during deletion (permanent error: %v); finalizer removed, orphaned TRLP may need manual cleanup",
801+
modelRef.Namespace, modelRef.Name, err)
802+
}
777803
}
778804
}
779-
// Also clean up stale TRLPs from modelRefs that were removed
780-
// before the CR was deleted (edge case: edit + delete before reconcile).
781805
if err := r.cleanupStaleTRLPs(ctx, log, subscription); err != nil {
782-
return ctrl.Result{}, err
806+
if !isPermanentError(err) {
807+
log.Error(err, "transient stale TRLP cleanup failure during deletion, will retry")
808+
return ctrl.Result{}, err
809+
}
810+
log.Error(err, "permanent stale TRLP cleanup failure during deletion, proceeding with finalizer removal")
811+
if r.Recorder != nil {
812+
r.Recorder.Eventf(subscription, "Warning", "StaleTRLPCleanupFailed",
813+
"Stale TRLP cleanup failed during deletion (permanent error: %v); finalizer removed, orphaned TRLP may need manual cleanup", err)
814+
}
783815
}
784816
controllerutil.RemoveFinalizer(subscription, maasSubscriptionFinalizer)
785817
if err := r.Update(ctx, subscription); err != nil {
@@ -941,6 +973,8 @@ func conditionsSemanticallyEqual(a, b *metav1.Condition) bool {
941973

942974
// SetupWithManager sets up the controller with the Manager.
943975
func (r *MaaSSubscriptionReconciler) SetupWithManager(mgr ctrl.Manager) error {
976+
r.Recorder = mgr.GetEventRecorderFor("maas-subscription-controller")
977+
944978
// Register field indexer for efficient lookup of MaaSSubscriptions by model reference.
945979
// This avoids cluster-wide scans when finding subscriptions for a specific model.
946980
if err := mgr.GetFieldIndexer().IndexField(

maas-controller/pkg/controller/maas/maassubscription_controller_test.go

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package maas
1818

1919
import (
2020
"context"
21+
"errors"
2122
"fmt"
2223
"strings"
2324
"testing"
@@ -31,6 +32,8 @@ import (
3132
ctrl "sigs.k8s.io/controller-runtime"
3233
"sigs.k8s.io/controller-runtime/pkg/client"
3334
"sigs.k8s.io/controller-runtime/pkg/client/fake"
35+
"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
36+
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3437
gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1"
3538

3639
maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1"
@@ -1346,3 +1349,203 @@ func TestMaaSSubscriptionReconciler_WindowValuesInTRLP(t *testing.T) {
13461349
})
13471350
}
13481351
}
1352+
1353+
// permanentUpdateError returns an apierrors.StatusError that isPermanentError classifies
1354+
// as non-recoverable (admission webhook rejection).
1355+
func permanentUpdateError() error {
1356+
return &apierrors.StatusError{ErrStatus: metav1.Status{
1357+
Status: metav1.StatusFailure,
1358+
Code: 422,
1359+
Reason: metav1.StatusReasonInvalid,
1360+
Message: "admission webhook denied the request: invalid window format",
1361+
}}
1362+
}
1363+
1364+
// TestHandleDeletion_PermanentError_FallsBackToDeleteTRLP verifies that when
1365+
// reconcileTRLPForModel fails with a permanent error (e.g. admission webhook rejects
1366+
// the TRLP update), the handler falls back to force-deleting the TRLP and removes
1367+
// the finalizer so the subscription deletion is not permanently blocked.
1368+
func TestHandleDeletion_PermanentError_FallsBackToDeleteTRLP(t *testing.T) {
1369+
const (
1370+
modelName = "shared-model"
1371+
namespace = "default"
1372+
trlpName = "maas-trlp-" + modelName
1373+
)
1374+
1375+
model := &maasv1alpha1.MaaSModelRef{
1376+
ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: namespace},
1377+
Spec: maasv1alpha1.MaaSModelSpec{ModelRef: maasv1alpha1.ModelReference{Kind: "ExternalModel", Name: modelName}},
1378+
}
1379+
route := &gatewayapiv1.HTTPRoute{
1380+
ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: namespace},
1381+
}
1382+
existingTRLP := newPreexistingTRLP(trlpName, namespace, modelName, map[string]string{})
1383+
subA := newMaaSSubscription("sub-a", namespace, "team-a", modelName, 100)
1384+
subA.Finalizers = []string{maasSubscriptionFinalizer}
1385+
subB := newMaaSSubscription("sub-b", namespace, "team-b", modelName, 200)
1386+
1387+
c := fake.NewClientBuilder().
1388+
WithScheme(scheme).
1389+
WithRESTMapper(testRESTMapper()).
1390+
WithObjects(model, route, subA, subB, existingTRLP).
1391+
WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer).
1392+
WithInterceptorFuncs(interceptor.Funcs{
1393+
Update: func(ctx context.Context, cl client.WithWatch, obj client.Object, opts ...client.UpdateOption) error {
1394+
if u, ok := obj.(*unstructured.Unstructured); ok && u.GetKind() == "TokenRateLimitPolicy" {
1395+
return permanentUpdateError()
1396+
}
1397+
return cl.Update(ctx, obj, opts...)
1398+
},
1399+
}).
1400+
Build()
1401+
1402+
if err := c.Delete(context.Background(), subA); err != nil {
1403+
t.Fatalf("Delete sub-a: %v", err)
1404+
}
1405+
1406+
r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme}
1407+
req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "sub-a", Namespace: namespace}}
1408+
if _, err := r.Reconcile(context.Background(), req); err != nil {
1409+
t.Fatalf("Reconcile: unexpected error: %v", err)
1410+
}
1411+
1412+
// TRLP should be deleted via the fallback deleteModelTRLP path
1413+
got := &unstructured.Unstructured{}
1414+
got.SetGroupVersionKind(schema.GroupVersionKind{Group: "kuadrant.io", Version: "v1alpha1", Kind: "TokenRateLimitPolicy"})
1415+
if err := c.Get(context.Background(), types.NamespacedName{Name: trlpName, Namespace: namespace}, got); !apierrors.IsNotFound(err) {
1416+
t.Errorf("expected TRLP to be deleted via fallback, but got err: %v", err)
1417+
}
1418+
1419+
// Finalizer should be removed
1420+
var sub maasv1alpha1.MaaSSubscription
1421+
if err := c.Get(context.Background(), types.NamespacedName{Name: "sub-a", Namespace: namespace}, &sub); !apierrors.IsNotFound(err) {
1422+
t.Errorf("expected sub-a to be fully deleted (finalizer removed), got err: %v", err)
1423+
}
1424+
}
1425+
1426+
// TestHandleDeletion_TransientError_RequeuesForRetry verifies that transient errors
1427+
// (timeouts, network flakes) during deletion are returned for requeue rather than
1428+
// triggering force-delete, which could collaterally damage shared TRLPs.
1429+
func TestHandleDeletion_TransientError_RequeuesForRetry(t *testing.T) {
1430+
const (
1431+
modelName = "shared-model"
1432+
namespace = "default"
1433+
trlpName = "maas-trlp-" + modelName
1434+
)
1435+
1436+
model := &maasv1alpha1.MaaSModelRef{
1437+
ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: namespace},
1438+
Spec: maasv1alpha1.MaaSModelSpec{ModelRef: maasv1alpha1.ModelReference{Kind: "ExternalModel", Name: modelName}},
1439+
}
1440+
route := &gatewayapiv1.HTTPRoute{
1441+
ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: namespace},
1442+
}
1443+
existingTRLP := newPreexistingTRLP(trlpName, namespace, modelName, map[string]string{})
1444+
subA := newMaaSSubscription("sub-a", namespace, "team-a", modelName, 100)
1445+
subA.Finalizers = []string{maasSubscriptionFinalizer}
1446+
subB := newMaaSSubscription("sub-b", namespace, "team-b", modelName, 200)
1447+
1448+
// Transient error: generic timeout, not an admission rejection
1449+
c := fake.NewClientBuilder().
1450+
WithScheme(scheme).
1451+
WithRESTMapper(testRESTMapper()).
1452+
WithObjects(model, route, subA, subB, existingTRLP).
1453+
WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer).
1454+
WithInterceptorFuncs(interceptor.Funcs{
1455+
Update: func(ctx context.Context, cl client.WithWatch, obj client.Object, opts ...client.UpdateOption) error {
1456+
if u, ok := obj.(*unstructured.Unstructured); ok && u.GetKind() == "TokenRateLimitPolicy" {
1457+
return errors.New("simulated transient API server timeout")
1458+
}
1459+
return cl.Update(ctx, obj, opts...)
1460+
},
1461+
}).
1462+
Build()
1463+
1464+
if err := c.Delete(context.Background(), subA); err != nil {
1465+
t.Fatalf("Delete sub-a: %v", err)
1466+
}
1467+
1468+
r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme}
1469+
req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "sub-a", Namespace: namespace}}
1470+
_, err := r.Reconcile(context.Background(), req)
1471+
if err == nil {
1472+
t.Fatal("Reconcile should return error on transient failure for requeue")
1473+
}
1474+
1475+
// TRLP should NOT be deleted — transient errors don't trigger force-delete
1476+
got := &unstructured.Unstructured{}
1477+
got.SetGroupVersionKind(schema.GroupVersionKind{Group: "kuadrant.io", Version: "v1alpha1", Kind: "TokenRateLimitPolicy"})
1478+
if getErr := c.Get(context.Background(), types.NamespacedName{Name: trlpName, Namespace: namespace}, got); getErr != nil {
1479+
t.Errorf("TRLP should still exist after transient error, but got: %v", getErr)
1480+
}
1481+
1482+
// Finalizer should NOT be removed — requeue will retry
1483+
var sub maasv1alpha1.MaaSSubscription
1484+
if getErr := c.Get(context.Background(), types.NamespacedName{Name: "sub-a", Namespace: namespace}, &sub); getErr != nil {
1485+
t.Fatalf("sub-a should still exist with finalizer, got: %v", getErr)
1486+
}
1487+
if !controllerutil.ContainsFinalizer(&sub, maasSubscriptionFinalizer) {
1488+
t.Error("sub-a finalizer should still be present after transient error")
1489+
}
1490+
}
1491+
1492+
// TestHandleDeletion_PermanentError_BothFail_StillRemovesFinalizer verifies that
1493+
// when both TRLP reconciliation and force-delete fail with permanent errors, the
1494+
// finalizer is still removed so the subscription is not stuck forever.
1495+
func TestHandleDeletion_PermanentError_BothFail_StillRemovesFinalizer(t *testing.T) {
1496+
const (
1497+
modelName = "shared-model"
1498+
namespace = "default"
1499+
trlpName = "maas-trlp-" + modelName
1500+
)
1501+
1502+
model := &maasv1alpha1.MaaSModelRef{
1503+
ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: namespace},
1504+
Spec: maasv1alpha1.MaaSModelSpec{ModelRef: maasv1alpha1.ModelReference{Kind: "ExternalModel", Name: modelName}},
1505+
}
1506+
route := &gatewayapiv1.HTTPRoute{
1507+
ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: namespace},
1508+
}
1509+
existingTRLP := newPreexistingTRLP(trlpName, namespace, modelName, map[string]string{})
1510+
subA := newMaaSSubscription("sub-a", namespace, "team-a", modelName, 100)
1511+
subA.Finalizers = []string{maasSubscriptionFinalizer}
1512+
subB := newMaaSSubscription("sub-b", namespace, "team-b", modelName, 200)
1513+
1514+
// Permanent error on Update + fail List so deleteModelTRLP also fails
1515+
c := fake.NewClientBuilder().
1516+
WithScheme(scheme).
1517+
WithRESTMapper(testRESTMapper()).
1518+
WithObjects(model, route, subA, subB, existingTRLP).
1519+
WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer).
1520+
WithInterceptorFuncs(interceptor.Funcs{
1521+
Update: func(ctx context.Context, cl client.WithWatch, obj client.Object, opts ...client.UpdateOption) error {
1522+
if u, ok := obj.(*unstructured.Unstructured); ok && u.GetKind() == "TokenRateLimitPolicy" {
1523+
return permanentUpdateError()
1524+
}
1525+
return cl.Update(ctx, obj, opts...)
1526+
},
1527+
List: func(ctx context.Context, cl client.WithWatch, list client.ObjectList, opts ...client.ListOption) error {
1528+
if ul, ok := list.(*unstructured.UnstructuredList); ok && ul.GetKind() == "TokenRateLimitPolicyList" {
1529+
return permanentUpdateError()
1530+
}
1531+
return cl.List(ctx, list, opts...)
1532+
},
1533+
}).
1534+
Build()
1535+
1536+
if err := c.Delete(context.Background(), subA); err != nil {
1537+
t.Fatalf("Delete sub-a: %v", err)
1538+
}
1539+
1540+
r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme}
1541+
req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "sub-a", Namespace: namespace}}
1542+
if _, err := r.Reconcile(context.Background(), req); err != nil {
1543+
t.Fatalf("Reconcile should not return error on permanent failure (finalizer must be removed), got: %v", err)
1544+
}
1545+
1546+
// Finalizer should be removed despite all cleanup failures
1547+
var sub maasv1alpha1.MaaSSubscription
1548+
if err := c.Get(context.Background(), types.NamespacedName{Name: "sub-a", Namespace: namespace}, &sub); !apierrors.IsNotFound(err) {
1549+
t.Errorf("expected sub-a to be fully deleted (finalizer removed despite permanent failures), got err: %v", err)
1550+
}
1551+
}

0 commit comments

Comments
 (0)