Skip to content

Commit e86820f

Browse files
authored
Prevent Calico from setting the NetworkUnavailable condition on nodes when overlay networking gets disabled (#1703)
* add mutatingadmissionpolicies for NetworkUnavailable condition for calico when overlay is disabled * address review feedback
1 parent 91779be commit e86820f

8 files changed

Lines changed: 359 additions & 12 deletions

File tree

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
apiVersion: v1
2+
description: Mutating admission policy to control NetworkUnavailable condition set by Calico CNI plugin.
3+
name: calico-mutating-admission-policy
4+
version: 0.1.0
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
{{- if .Values.enabled }}
---
# Binding that activates the block-calico-network-unavailable policy defined below.
apiVersion: admissionregistration.k8s.io/v1alpha1
kind: MutatingAdmissionPolicyBinding
metadata:
  name: block-calico-network-unavailable-binding
spec:
  policyName: block-calico-network-unavailable
---
# MutatingAdmissionPolicy to block Calico from setting the NetworkUnavailable condition on nodes.
# This policy intercepts node/status updates from the calico-node service account and reverts
# any change to the NetworkUnavailable condition: the condition from the old object is kept as-is,
# and if the old object had none, the condition is dropped from the request.
apiVersion: admissionregistration.k8s.io/v1alpha1
kind: MutatingAdmissionPolicy
metadata:
  name: block-calico-network-unavailable
spec:
  failurePolicy: Fail
  reinvocationPolicy: IfNeeded
  matchConstraints:
    resourceRules:
    - apiGroups: [""]
      apiVersions: ["v1"]
      operations: ["UPDATE"]
      resources: ["nodes/status"]
  matchConditions:
  # Only apply to requests from the calico-node service account
  - name: is-calico-node
    expression: >-
      request.userInfo.username == "system:serviceaccount:kube-system:calico-node"
  # Only apply if the request carries a NetworkUnavailable condition.
  # The has() guards keep the expression from erroring when status or conditions are absent
  # (an evaluation error would reject the request because failurePolicy is Fail).
  - name: has-network-unavailable-in-request
    expression: >-
      has(object.status) && has(object.status.conditions) &&
      object.status.conditions.exists(c, c.type == 'NetworkUnavailable')
  variables:
  # Extract the current NetworkUnavailable condition from the old object (may be empty for new nodes)
  - name: oldNetworkUnavailableCondition
    expression: >-
      has(oldObject.status) && has(oldObject.status.conditions) ?
      oldObject.status.conditions.filter(c, c.type == 'NetworkUnavailable') : []
  # Remove NetworkUnavailable from the new conditions and preserve all other conditions
  - name: conditionsWithoutNetworkUnavailable
    expression: >-
      object.status.conditions.filter(c, c.type != 'NetworkUnavailable')
  # Reconstruct the final conditions: all non-NetworkUnavailable conditions + old NetworkUnavailable (if it existed)
  - name: finalConditions
    expression: >-
      variables.oldNetworkUnavailableCondition.size() > 0 ?
      variables.conditionsWithoutNetworkUnavailable + variables.oldNetworkUnavailableCondition :
      variables.conditionsWithoutNetworkUnavailable
  mutations:
  # Replace the entire conditions array with the reconstructed version.
  # "replace" is safe here because the matchConditions guarantee /status/conditions exists.
  - patchType: JSONPatch
    jsonPatch:
      expression: |
        [
          JSONPatch{
            op: "replace",
            path: "/status/conditions",
            value: variables.finalConditions
          }
        ]
{{- end }}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Toggles rendering of the MutatingAdmissionPolicy and its binding; off by default.
enabled: false
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
// SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
package controlplane
6+
7+
import (
8+
"context"
9+
"fmt"
10+
11+
extensionsconfigv1alpha1 "github.com/gardener/gardener/extensions/pkg/apis/config/v1alpha1"
12+
extensionscontroller "github.com/gardener/gardener/extensions/pkg/controller"
13+
"github.com/gardener/gardener/extensions/pkg/controller/controlplane"
14+
"github.com/gardener/gardener/extensions/pkg/util"
15+
extensionsv1alpha1 "github.com/gardener/gardener/pkg/apis/extensions/v1alpha1"
16+
"github.com/go-logr/logr"
17+
corev1 "k8s.io/api/core/v1"
18+
"k8s.io/client-go/util/retry"
19+
"sigs.k8s.io/controller-runtime/pkg/client"
20+
"sigs.k8s.io/controller-runtime/pkg/manager"
21+
22+
networking "github.com/gardener/gardener-extension-provider-aws/pkg/utils/networking"
23+
)
24+
25+
const (
	// NetworkUnavailableConditionType is the type of the node condition that this package
	// inspects and, when written by Calico, removes.
	NetworkUnavailableConditionType = "NetworkUnavailable"
	// CalicoIsUpReason is the reason set by Calico when it sets the NetworkUnavailable condition to indicate Calico is up.
	CalicoIsUpReason = "CalicoIsUp"
	// CalicoIsDownReason is the reason set by Calico when it sets the NetworkUnavailable condition to indicate Calico is down.
	CalicoIsDownReason = "CalicoIsDown"
	// AnnotationCalicoCleanupCompleted is the ControlPlane annotation key that is set to "true"
	// once the Calico condition cleanup has been completed.
	AnnotationCalicoCleanupCompleted = "aws.provider.extensions.gardener.cloud/calico-cleanup-completed"
)
35+
36+
// NewActuator creates a new Actuator that wraps the generic actuator and adds cleanup logic.
37+
func NewActuator(mgr manager.Manager, a controlplane.Actuator) controlplane.Actuator {
38+
return &actuator{
39+
Actuator: a,
40+
client: mgr.GetClient(),
41+
}
42+
}
43+
44+
// actuator is an Actuator that acts upon and updates the status of ControlPlane resources.
// It embeds another controlplane.Actuator, so every method it does not define itself is
// served by the embedded actuator.
type actuator struct {
	// Actuator is the wrapped actuator that Reconcile delegates to first.
	controlplane.Actuator
	// client is obtained from the manager in NewActuator; it is used to patch the
	// ControlPlane resource and to build the shoot client.
	client client.Client
}
49+
50+
func (a *actuator) Reconcile(
51+
ctx context.Context,
52+
log logr.Logger,
53+
cp *extensionsv1alpha1.ControlPlane,
54+
cluster *extensionscontroller.Cluster,
55+
) (bool, error) {
56+
// Call Reconcile on the composed Actuator
57+
ok, err := a.Actuator.Reconcile(ctx, log, cp, cluster)
58+
if err != nil {
59+
return ok, err
60+
}
61+
62+
// Only clean up NetworkUnavailable conditions if overlay is disabled
63+
overlayEnabled, err := networking.IsOverlayEnabled(cluster.Shoot.Spec.Networking)
64+
if err != nil {
65+
log.Error(err, "Failed to determine if overlay is enabled")
66+
return ok, err
67+
}
68+
69+
// Clean up NetworkUnavailable conditions set by Calico only when overlay is disabled
70+
// Only run cleanup if it hasn't been completed yet (annotation not present)
71+
if !overlayEnabled && cp.Annotations[AnnotationCalicoCleanupCompleted] != "true" {
72+
if err := a.cleanupCalicoNetworkUnavailableConditions(ctx, log, cp.Namespace, cluster); err != nil {
73+
log.Error(err, "Failed to cleanup Calico NetworkUnavailable conditions")
74+
return ok, err
75+
} else {
76+
// Mark cleanup as completed
77+
if err := a.markCleanupCompleted(ctx, cp); err != nil {
78+
log.Error(err, "Failed to mark cleanup as completed")
79+
return ok, err
80+
}
81+
}
82+
}
83+
84+
// Remove cleanup annotation when overlay is enabled so cleanup can run again if overlay is disabled later
85+
if overlayEnabled && cp.Annotations[AnnotationCalicoCleanupCompleted] == "true" {
86+
if err := a.removeCleanupAnnotation(ctx, cp); err != nil {
87+
log.Error(err, "Failed to remove cleanup annotation")
88+
return ok, err
89+
}
90+
}
91+
92+
return ok, nil
93+
}
94+
95+
// cleanupCalicoNetworkUnavailableConditions removes NetworkUnavailable conditions from nodes
96+
// that were set by Calico for example "CalicoIsUp" or "CalicoIsDown".
97+
func (a *actuator) cleanupCalicoNetworkUnavailableConditions(
98+
ctx context.Context,
99+
log logr.Logger,
100+
namespace string,
101+
cluster *extensionscontroller.Cluster,
102+
) error {
103+
if extensionscontroller.IsHibernated(cluster) {
104+
return nil
105+
}
106+
107+
_, shootClient, err := util.NewClientForShoot(ctx, a.client, namespace, client.Options{}, extensionsconfigv1alpha1.RESTOptions{})
108+
if err != nil {
109+
return fmt.Errorf("could not create shoot client: %w", err)
110+
}
111+
112+
nodes := &corev1.NodeList{}
113+
if err := shootClient.List(ctx, nodes); err != nil {
114+
return fmt.Errorf("could not list nodes in shoot cluster: %w", err)
115+
}
116+
117+
for _, node := range nodes.Items {
118+
if err := a.cleanupNodeNetworkUnavailableCondition(ctx, log, shootClient, &node); err != nil {
119+
log.Error(err, "Failed to cleanup NetworkUnavailable condition from node", "node", node.Name)
120+
return err
121+
}
122+
}
123+
124+
return nil
125+
}
126+
127+
// cleanupNodeNetworkUnavailableCondition removes the NetworkUnavailable condition from a node
128+
// if it was set by Calico.
129+
func (a *actuator) cleanupNodeNetworkUnavailableCondition(
130+
ctx context.Context,
131+
log logr.Logger,
132+
shootClient client.Client,
133+
node *corev1.Node,
134+
) error {
135+
// Check if the node has a NetworkUnavailable condition set by Calico
136+
hasCondition := false
137+
for _, condition := range node.Status.Conditions {
138+
if condition.Type == NetworkUnavailableConditionType &&
139+
(condition.Reason == CalicoIsUpReason || condition.Reason == CalicoIsDownReason) {
140+
hasCondition = true
141+
break
142+
}
143+
}
144+
145+
if !hasCondition {
146+
return nil
147+
}
148+
149+
// Remove the NetworkUnavailable condition
150+
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
151+
// Get the latest version of the node
152+
currentNode := &corev1.Node{}
153+
if err := shootClient.Get(ctx, client.ObjectKey{Name: node.Name}, currentNode); err != nil {
154+
return err
155+
}
156+
157+
// Filter out the NetworkUnavailable condition set by Calico
158+
var newConditions []corev1.NodeCondition
159+
removed := false
160+
for _, condition := range currentNode.Status.Conditions {
161+
if condition.Type == NetworkUnavailableConditionType &&
162+
(condition.Reason == CalicoIsUpReason || condition.Reason == CalicoIsDownReason) {
163+
removed = true
164+
log.Info("Removing NetworkUnavailable condition set by Calico", "node", currentNode.Name, "reason", condition.Reason)
165+
continue
166+
}
167+
newConditions = append(newConditions, condition)
168+
}
169+
170+
// Only update if we actually removed a condition
171+
if !removed {
172+
return nil
173+
}
174+
175+
currentNode.Status.Conditions = newConditions
176+
return shootClient.Status().Update(ctx, currentNode)
177+
})
178+
}
179+
180+
// markCleanupCompleted marks the cleanup as completed by adding an annotation to the ControlPlane resource.
181+
func (a *actuator) markCleanupCompleted(ctx context.Context, cp *extensionsv1alpha1.ControlPlane) error {
182+
patch := client.MergeFrom(cp.DeepCopy())
183+
if cp.Annotations == nil {
184+
cp.Annotations = make(map[string]string)
185+
}
186+
cp.Annotations[AnnotationCalicoCleanupCompleted] = "true"
187+
return a.client.Patch(ctx, cp, patch)
188+
}
189+
190+
// removeCleanupAnnotation removes the cleanup completion annotation from the ControlPlane resource.
191+
func (a *actuator) removeCleanupAnnotation(ctx context.Context, cp *extensionsv1alpha1.ControlPlane) error {
192+
patch := client.MergeFrom(cp.DeepCopy())
193+
delete(cp.Annotations, AnnotationCalicoCleanupCompleted)
194+
return a.client.Patch(ctx, cp, patch)
195+
}

pkg/controller/controlplane/add.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ type AddOptions struct {
4242
// AddToManagerWithOptions adds a controller with the given Options to the given manager.
4343
// The opts.Reconciler is being set with a newly instantiated actuator.
4444
func AddToManagerWithOptions(ctx context.Context, mgr manager.Manager, opts AddOptions) error {
45-
actuator, err := genericactuator.NewActuator(mgr, aws.Name,
45+
genericActuator, err := genericactuator.NewActuator(mgr, aws.Name,
4646
secretConfigsFunc, shootAccessSecretsFunc,
4747
configChart, controlPlaneChart, controlPlaneShootChart, controlPlaneShootCRDsChart, storageClassChart,
4848
NewValuesProvider(mgr), extensionscontroller.ChartRendererFactoryFunc(util.NewChartRendererForShoot),
@@ -51,8 +51,11 @@ func AddToManagerWithOptions(ctx context.Context, mgr manager.Manager, opts AddO
5151
return err
5252
}
5353

54+
// Wrap the generic actuator with our custom actuator for cleanup logic
55+
wrappedActuator := NewActuator(mgr, genericActuator)
56+
5457
return controlplane.Add(mgr, controlplane.AddArgs{
55-
Actuator: actuator,
58+
Actuator: wrappedActuator,
5659
ControllerOptions: opts.Controller,
5760
Predicates: controlplane.DefaultPredicates(ctx, mgr, opts.IgnoreOperationAnnotation),
5861
Type: aws.Type,

pkg/controller/controlplane/valuesprovider.go

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
versionutils "github.com/gardener/gardener/pkg/utils/version"
2929
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
3030
admissionregistrationv1 "k8s.io/api/admissionregistration/v1"
31+
admissionregistrationv1alpha1 "k8s.io/api/admissionregistration/v1alpha1"
3132
appsv1 "k8s.io/api/apps/v1"
3233
corev1 "k8s.io/api/core/v1"
3334
policyv1 "k8s.io/api/policy/v1"
@@ -46,6 +47,7 @@ import (
4647
"github.com/gardener/gardener-extension-provider-aws/pkg/apis/aws/helper"
4748
"github.com/gardener/gardener-extension-provider-aws/pkg/aws"
4849
"github.com/gardener/gardener-extension-provider-aws/pkg/utils"
50+
networking "github.com/gardener/gardener-extension-provider-aws/pkg/utils/networking"
4951
)
5052

5153
const (
@@ -300,6 +302,13 @@ var (
300302
{Type: &rbacv1.RoleBinding{}, Name: "efs-csi-provisioner-binding"},
301303
},
302304
},
305+
{
306+
Name: "calico-mutating-admission-policy",
307+
Objects: []*chart.Object{
308+
{Type: &admissionregistrationv1alpha1.MutatingAdmissionPolicy{}, Name: "calico-mutating-admission-policy"},
309+
{Type: &admissionregistrationv1alpha1.MutatingAdmissionPolicyBinding{}, Name: "block-calico-network-unavailable-binding"},
310+
},
311+
},
303312
},
304313
}
305314

@@ -881,13 +890,22 @@ func getControlPlaneShootChartValues(
881890
return nil, err
882891
}
883892

893+
overlayEnabled, err := networking.IsOverlayEnabled(cluster.Shoot.Spec.Networking)
894+
if err != nil {
895+
return nil, fmt.Errorf("could not determine if overlay is enabled: %w", err)
896+
}
897+
898+
// Only enable MutatingAdmissionPolicy if overlay is disabled AND all other conditions are met
899+
mutatingAdmissionPolicyEnabled := !overlayEnabled && isUsingCalico(cluster) && isMutatingAdmissionPolicyEnabled(cluster)
900+
884901
return map[string]interface{}{
885-
aws.CloudControllerManagerName: map[string]interface{}{"enabled": true},
886-
aws.AWSCustomRouteControllerName: map[string]interface{}{"enabled": customRouteControllerEnabled},
887-
aws.AWSIPAMControllerImageName: map[string]interface{}{"enabled": ipamControllerEnabled},
888-
aws.AWSLoadBalancerControllerName: albValues,
889-
aws.CSINodeName: csiDriverNodeValues,
890-
aws.CSIEfsNodeName: getControlPlaneShootChartCSIEfsValues(infraConfig, infraStatus),
902+
aws.CloudControllerManagerName: map[string]interface{}{"enabled": true},
903+
aws.AWSCustomRouteControllerName: map[string]interface{}{"enabled": customRouteControllerEnabled},
904+
aws.AWSIPAMControllerImageName: map[string]interface{}{"enabled": ipamControllerEnabled},
905+
aws.AWSLoadBalancerControllerName: albValues,
906+
aws.CSINodeName: csiDriverNodeValues,
907+
aws.CSIEfsNodeName: getControlPlaneShootChartCSIEfsValues(infraConfig, infraStatus),
908+
"calico-mutating-admission-policy": map[string]interface{}{"enabled": mutatingAdmissionPolicyEnabled},
891909
}, nil
892910
}
893911

@@ -919,3 +937,33 @@ func getControlPlaneShootChartCSIEfsValues(
919937

920938
return values
921939
}
940+
941+
func isUsingCalico(cluster *extensionscontroller.Cluster) bool {
942+
return cluster.Shoot.Spec.Networking != nil &&
943+
cluster.Shoot.Spec.Networking.Type != nil &&
944+
*cluster.Shoot.Spec.Networking.Type == "calico"
945+
}
946+
947+
func isMutatingAdmissionPolicyEnabled(cluster *extensionscontroller.Cluster) bool {
948+
if cluster.Shoot.Spec.Kubernetes.KubeAPIServer == nil {
949+
return false
950+
}
951+
952+
if cluster.Shoot.Spec.Kubernetes.KubeAPIServer.FeatureGates == nil {
953+
return false
954+
}
955+
956+
if enabled, ok := cluster.Shoot.Spec.Kubernetes.KubeAPIServer.FeatureGates["MutatingAdmissionPolicy"]; !ok || !enabled {
957+
return false
958+
}
959+
960+
if cluster.Shoot.Spec.Kubernetes.KubeAPIServer.RuntimeConfig == nil {
961+
return false
962+
}
963+
964+
if enabled, ok := cluster.Shoot.Spec.Kubernetes.KubeAPIServer.RuntimeConfig["admissionregistration.k8s.io/v1alpha1"]; !ok || !enabled {
965+
return false
966+
}
967+
968+
return true
969+
}

0 commit comments

Comments
 (0)