
Commit 979a673

fix incorrect calculation of node resources when sidecar containers are injected during sync to host and the virtual scheduler is enabled
1 parent a8c5ab5 commit 979a673

6 files changed (+333 −67 lines changed)

chart/values.schema.json

Lines changed: 22 additions & 0 deletions
@@ -2881,6 +2881,24 @@
       "additionalProperties": false,
       "type": "object"
     },
+    "ReservedResources": {
+      "properties": {
+        "cpu": {
+          "type": "string",
+          "description": "CPU is amount of CPU to reserve."
+        },
+        "memory": {
+          "type": "string",
+          "description": "Memory is amount of Memory to reserve."
+        },
+        "ephemeralStorage": {
+          "type": "string",
+          "description": "EphemeralStorage is amount of EphemeralStorage to reserve."
+        }
+      },
+      "additionalProperties": false,
+      "type": "object"
+    },
     "ResolveDNS": {
       "properties": {
         "hostname": {
@@ -3304,6 +3322,10 @@
         },
         "type": "array",
         "description": "Patches patch the resource according to the provided specification."
+      },
+      "reservedResources": {
+        "$ref": "#/$defs/ReservedResources",
+        "description": "ReservedResources specifies the amount of a particular resource type to be reserved/subtracted from the allocatable resource on the virtual node.\nThis is to account for additional resource requirement in case of sidecar container(s) injected via mutating webhooks when workloads are synced from virtual to host.\nThis will take effect only when the virtual scheduler is enabled."
       }
     },
     "additionalProperties": false,

chart/values.yaml

Lines changed: 10 additions & 0 deletions
@@ -168,6 +168,16 @@ sync:
         # All specifies if all nodes should get synced by vCluster from the host to the virtual cluster or only the ones where pods are assigned to.
         all: false
         labels: {}
+      # ReservedResources specifies the amount of a particular resource type to be reserved/subtracted from the allocatable resource on the virtual node.
+      # This is to account for additional resource requirement in case of sidecar container(s) injected via mutating webhooks when workloads are synced from virtual to host.
+      # This will take effect only when the virtual scheduler is enabled.
+      reservedResources:
+        # CPU is amount of CPU to reserve.
+        cpu: ""
+        # Memory is amount of Memory to reserve.
+        memory: ""
+        # EphemeralStorage is amount of EphemeralStorage to reserve.
+        ephemeralStorage: ""
     # Secrets defines if secrets in the host should get synced to the virtual cluster.
     secrets:
       # Enabled defines if this option should be enabled.

config/config.go

Lines changed: 16 additions & 0 deletions
@@ -800,6 +800,22 @@ type SyncNodes struct {

     // Patches patch the resource according to the provided specification.
     Patches []TranslatePatch `json:"patches,omitempty"`
+
+    // ReservedResources specifies the amount of a particular resource type to be reserved/subtracted from the allocatable resource on the virtual node.
+    // This is to account for additional resource requirement in case of sidecar container(s) injected via mutating webhooks when workloads are synced from virtual to host.
+    // This will take effect only when the virtual scheduler is enabled.
+    ReservedResources ReservedResources `json:"reservedResources,omitempty"`
+}
+
+type ReservedResources struct {
+    // CPU is amount of CPU to reserve.
+    CPU string `json:"cpu,omitempty"`
+
+    // Memory is amount of Memory to reserve.
+    Memory string `json:"memory,omitempty"`
+
+    // EphemeralStorage is amount of EphemeralStorage to reserve.
+    EphemeralStorage string `json:"ephemeralStorage,omitempty"`
 }

 type SyncNodeSelector struct {
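
The new fields are plain strings, so they have to be parsed as Kubernetes resource quantities before any arithmetic can happen. A minimal validation sketch, assuming a hypothetical helper placed next to the ReservedResources type above (the function is not part of this diff):

// Hypothetical sketch, not part of this commit: validate the reserved resource
// strings by parsing each non-empty field with the Kubernetes quantity parser.
// It assumes the ReservedResources type declared in the diff above.
package config

import (
    "fmt"

    "k8s.io/apimachinery/pkg/api/resource"
)

func validateReservedResources(r ReservedResources) error {
    for name, value := range map[string]string{
        "cpu":              r.CPU,
        "memory":           r.Memory,
        "ephemeralStorage": r.EphemeralStorage,
    } {
        // an empty string means "reserve nothing" for that resource type
        if value == "" {
            continue
        }
        if _, err := resource.ParseQuantity(value); err != nil {
            return fmt.Errorf("sync.fromHost.nodes.reservedResources.%s: invalid quantity %q: %w", name, value, err)
        }
    }
    return nil
}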

config/values.yaml

Lines changed: 4 additions & 0 deletions
@@ -80,6 +80,10 @@ sync:
       selector:
         all: false
         labels: {}
+      reservedResources:
+        cpu: ""
+        memory: ""
+        ephemeralStorage: ""
     secrets:
       enabled: false
       mappings:

pkg/controllers/resources/nodes/syncer.go

Lines changed: 90 additions & 34 deletions
@@ -69,6 +69,10 @@ func NewSyncer(ctx *synccontext.RegisterContext, nodeServiceProvider nodeservice
         fakeKubeletIPs:       ctx.Config.Networking.Advanced.ProxyKubelets.ByIP,
         fakeKubeletHostnames: ctx.Config.Networking.Advanced.ProxyKubelets.ByHostname,

+        reservedResourceCPU:              ctx.Config.Sync.FromHost.Nodes.ReservedResources.CPU,
+        reservedResourceMemory:           ctx.Config.Sync.FromHost.Nodes.ReservedResources.Memory,
+        reservedResourceEphemeralStorage: ctx.Config.Sync.FromHost.Nodes.ReservedResources.EphemeralStorage,
+
         physicalClient:      ctx.PhysicalManager.GetClient(),
         virtualClient:       ctx.VirtualManager.GetClient(),
         nodeServiceProvider: nodeServiceProvider,
@@ -79,18 +83,22 @@
 type nodeSyncer struct {
     synccontext.Mapper

-    nodeSelector         labels.Selector
-    physicalClient       client.Client
-    virtualClient        client.Client
-    unmanagedPodCache    client.Reader
-    nodeServiceProvider  nodeservice.Provider
-    enforcedTolerations  []*corev1.Toleration
-    enableScheduler      bool
-    clearImages          bool
-    enforceNodeSelector  bool
-    useFakeKubelets      bool
-    fakeKubeletIPs       bool
-    fakeKubeletHostnames bool
+    nodeSelector                     labels.Selector
+    physicalClient                   client.Client
+    virtualClient                    client.Client
+    unmanagedPodCache                client.Reader
+    managedPodCache                  client.Reader
+    nodeServiceProvider              nodeservice.Provider
+    enforcedTolerations              []*corev1.Toleration
+    enableScheduler                  bool
+    clearImages                      bool
+    enforceNodeSelector              bool
+    useFakeKubelets                  bool
+    fakeKubeletIPs                   bool
+    fakeKubeletHostnames             bool
+    reservedResourceCPU              string
+    reservedResourceMemory           string
+    reservedResourceEphemeralStorage string
 }

 func (s *nodeSyncer) Resource() client.Object {
@@ -116,7 +124,7 @@ func (s *nodeSyncer) ModifyController(ctx *synccontext.RegisterContext, bld *bui
         return bld, fmt.Errorf("constructing label selector for non-vcluster pods: %w", err)
     }
     // create a pod cache containing pods from all namespaces for calculating the correct node resources
-    podCache, err := cache.New(ctx.PhysicalManager.GetConfig(), cache.Options{
+    unmanagedPodCache, err := cache.New(ctx.PhysicalManager.GetConfig(), cache.Options{
         Scheme: ctx.PhysicalManager.GetScheme(),
         Mapper: ctx.PhysicalManager.GetRESTMapper(),
         // omits pods managed by the vcluster
@@ -126,7 +134,7 @@
         return nil, fmt.Errorf("create cache : %w", err)
     }
     // add index for pod by node
-    err = podCache.IndexField(ctx, &corev1.Pod{}, constants.IndexRunningNonVClusterPodsByNode, func(object client.Object) []string {
+    err = unmanagedPodCache.IndexField(ctx, &corev1.Pod{}, constants.IndexRunningNonVClusterPodsByNode, func(object client.Object) []string {
         pPod := object.(*corev1.Pod)
         // we ignore all non-running pods and the ones that are part of the current vcluster
         // to later calculate the status.allocatable part of the nodes correctly
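
Both pod caches index running pods by the node they are assigned to, which lets the syncer list one node's pods cheaply whenever it recomputes .status.allocatable. A simplified sketch of that kind of indexed lookup and request summing, not taken from this commit (it ignores init containers, pod overhead and ephemeral storage):

// Simplified sketch, not from this commit: list the pods assigned to a node via
// a field index on the cache and sum up their container requests. The real
// allocatable calculation also has to fold in the configured reserved resources.
package nodes

import (
    "context"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

func sumPodRequests(ctx context.Context, podCache client.Reader, indexName, nodeName string) (cpu, memory resource.Quantity, err error) {
    podList := &corev1.PodList{}
    // MatchingFields uses the index registered via IndexField, so only pods on
    // the requested node are returned instead of scanning every pod.
    if err = podCache.List(ctx, podList, client.MatchingFields{indexName: nodeName}); err != nil {
        return cpu, memory, err
    }
    for _, pod := range podList.Items {
        for _, container := range pod.Spec.Containers {
            cpu.Add(*container.Resources.Requests.Cpu())
            memory.Add(*container.Resources.Requests.Memory())
        }
    }
    return cpu, memory, nil
}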
@@ -141,41 +149,89 @@ func (s *nodeSyncer) ModifyController(ctx *synccontext.RegisterContext, bld *bui
     if err != nil {
         return nil, fmt.Errorf("index pod by node: %w", err)
     }
+
+    managedSelector, err := labels.NewRequirement(translate.MarkerLabel, selection.Equals, []string{translate.VClusterName})
+    if err != nil {
+        return bld, fmt.Errorf("constructing label selector for vcluster managed pods: %w", err)
+    }
+    // create a pod cache containing pods managed by the current vCluster
+    managedPodCache, err := cache.New(ctx.PhysicalManager.GetConfig(), cache.Options{
+        Scheme: ctx.PhysicalManager.GetScheme(),
+        Mapper: ctx.PhysicalManager.GetRESTMapper(),
+        // only select pods managed by the vcluster
+        DefaultLabelSelector: labels.NewSelector().Add(*managedSelector),
+    })
+    if err != nil {
+        return nil, fmt.Errorf("create cache : %w", err)
+    }
+    // add index for pod by node
+    err = managedPodCache.IndexField(ctx.Context, &corev1.Pod{}, constants.IndexByAssigned, func(object client.Object) []string {
+        pPod := object.(*corev1.Pod)
+        // we ignore all non-running pods and the ones not assigned to a node
+        if pPod.Status.Phase == corev1.PodSucceeded || pPod.Status.Phase == corev1.PodFailed {
+            return []string{}
+        } else if pPod.Spec.NodeName == "" {
+            return []string{}
+        }
+
+        return []string{pPod.Spec.NodeName}
+    })
+    if err != nil {
+        return nil, fmt.Errorf("index pod by node: %w", err)
+    }
+
+    go func() {
+        err := unmanagedPodCache.Start(ctx)
+        if err != nil {
+            klog.Fatalf("error starting unmanaged pod cache: %v", err)
+        }
+    }()
+
     go func() {
-        err := podCache.Start(ctx)
+        err := managedPodCache.Start(ctx.Context)
         if err != nil {
-            klog.Fatalf("error starting pod cache: %v", err)
+            klog.Fatalf("error starting managed pod cache: %v", err)
         }
     }()

-    podCache.WaitForCacheSync(ctx)
-    s.unmanagedPodCache = podCache
+    unmanagedPodCache.WaitForCacheSync(ctx)
+    s.unmanagedPodCache = unmanagedPodCache
+
+    managedPodCache.WaitForCacheSync(ctx)
+    s.managedPodCache = managedPodCache

     // enqueues nodes based on pod phase changes if the scheduler is enabled
     // the syncer is configured to update virtual node's .status.allocatable fields by summing the consumption of these pods
+    handlerFuncs := handler.TypedFuncs[*corev1.Pod, ctrl.Request]{
+        GenericFunc: func(_ context.Context, ev event.TypedGenericEvent[*corev1.Pod], q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
+            enqueuePod(nil, ev.Object, q)
+        },
+        CreateFunc: func(_ context.Context, ev event.TypedCreateEvent[*corev1.Pod], q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
+            enqueuePod(nil, ev.Object, q)
+        },
+        UpdateFunc: func(_ context.Context, ue event.TypedUpdateEvent[*corev1.Pod], q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
+            enqueuePod(ue.ObjectOld, ue.ObjectNew, q)
+        },
+        DeleteFunc: func(_ context.Context, ev event.TypedDeleteEvent[*corev1.Pod], q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
+            enqueuePod(nil, ev.Object, q)
+        },
+    }
+
+    bld.WatchesRawSource(
+        source.Kind(unmanagedPodCache, &corev1.Pod{},
+            handlerFuncs),
+    )
+
     bld.WatchesRawSource(
-        source.Kind(podCache, &corev1.Pod{},
-            handler.TypedFuncs[*corev1.Pod, ctrl.Request]{
-                GenericFunc: func(_ context.Context, ev event.TypedGenericEvent[*corev1.Pod], q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
-                    enqueueNonVClusterPod(nil, ev.Object, q)
-                },
-                CreateFunc: func(_ context.Context, ev event.TypedCreateEvent[*corev1.Pod], q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
-                    enqueueNonVClusterPod(nil, ev.Object, q)
-                },
-                UpdateFunc: func(_ context.Context, ue event.TypedUpdateEvent[*corev1.Pod], q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
-                    enqueueNonVClusterPod(ue.ObjectOld, ue.ObjectNew, q)
-                },
-                DeleteFunc: func(_ context.Context, ev event.TypedDeleteEvent[*corev1.Pod], q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
-                    enqueueNonVClusterPod(nil, ev.Object, q)
-                },
-            }),
+        source.Kind(managedPodCache, &corev1.Pod{},
+            handlerFuncs),
     )
     }
     return modifyController(ctx, s.nodeServiceProvider, bld)
 }

 // only used when scheduler is enabled
-func enqueueNonVClusterPod(old, new client.Object, q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
+func enqueuePod(old, new client.Object, q workqueue.TypedRateLimitingInterface[ctrl.Request]) {
     pod, ok := new.(*corev1.Pod)
     if !ok {
         klog.Errorf("invalid type passed to pod handler: %T", new)
