Skip to content

Commit 0340ef5

Browse files
committed
fix: throttle cache synchronization to prevent API call feedback loop
SynchronizeCachesAsync was running on every heartbeat check interval (1s), causing thousands of HostnameCache API calls per minute. Each sync lists ClusterCache and HostnameCache CRDs, and any save triggers the DNS reconciler which lists them again — creating a feedback loop that never reaches quiescence. Changes: - Decouple cache sync from heartbeat loop with a separate CacheSyncInterval (default 30s), configurable via values.yaml - Skip sync if one is already in progress (non-blocking WaitOne) instead of queuing callers - Force immediate sync only when a cluster is actually removed - Increase DNS probe timeouts from 1s to 5s
1 parent 4704e5d commit 0340ef5

6 files changed

Lines changed: 576 additions & 532 deletions

File tree

charts/multicluster-ingress/templates/configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ data:
2828
HeartbeatCheckInterval: "{{ .Values.config.heartbeatCheckInterval }}"
2929
HeartbeatTimeout: "{{ .Values.config.heartbeatTimeout }}"
3030
HeartbeatSetInterval: "{{ .Values.config.heartbeatSetInterval }}"
31+
CacheSyncInterval: "{{ .Values.config.cacheSyncInterval }}"
3132
Serilog__MinimumLevel__Default: "{{ .Values.config.logLevel }}"
3233

3334
PeriodicRefreshInterval: "{{ .Values.config.periodicRefreshInterval }}"

charts/multicluster-ingress/templates/dns-server-deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,13 @@ spec:
8181
path: /Healthz/Liveness
8282
port: http
8383
initialDelaySeconds: 10
84-
timeoutSeconds: 1
84+
timeoutSeconds: 5
8585
readinessProbe:
8686
httpGet:
8787
path: /Healthz/Ready
8888
port: http
8989
initialDelaySeconds: 10
90-
timeoutSeconds: 1
90+
timeoutSeconds: 5
9191
resources:
9292
{{- toYaml .Values.dnsServer.resources | nindent 10 }}
9393
volumeMounts:

charts/multicluster-ingress/values.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,10 @@ config:
216216
# How frequent to send heartbeat updates in seconds
217217
heartbeatSetInterval: 30
218218

219+
# How frequently the orchestrator synchronizes hostname caches in seconds.
220+
# Decoupled from heartbeatCheckInterval to avoid a feedback loop of excessive API calls.
221+
cacheSyncInterval: 30
222+
219223
################################################################################################
220224
### Configuration for peer cluster communication
221225
################################################################################################

src/Cyclops.MultiCluster/Services/Default/DefaultHostnameSynchronizer.cs

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ public class DefaultHostnameSynchronizer : IHostnameSynchronizer
2424
private readonly CancellationToken _shutdownCancellationToken;
2525
private readonly ManualResetEvent _shutdownEvent;
2626
private readonly ManualResetEventSlim _synchronizingLocalClustersEvent = new ManualResetEventSlim(true);
27+
// Tracks elapsed time since the last cache sync to enforce CacheSyncInterval.
28+
// This decouples cache synchronization from the 1-second heartbeat check loop
29+
// to prevent a feedback loop of excessive HostnameCache API calls.
30+
private readonly System.Diagnostics.Stopwatch _cacheSyncStopwatch = System.Diagnostics.Stopwatch.StartNew();
2731

2832
public DefaultHostnameSynchronizer(
2933
ILogger<DefaultHostnameSynchronizer> logger,
@@ -132,7 +136,7 @@ await Task.WhenAll(
132136
{
133137
await _cache.SetResourceVersionAsync(service.Metadata.Uid, service.Metadata.ResourceVersion);
134138
}
135-
139+
136140
// Group endpoint slices by service name and track counts
137141
_logger.LogDebug("Tracking endpoint slices and counts");
138142
var slicesByService = endpointSlices
@@ -234,7 +238,7 @@ await Task.WhenAll(
234238
var service = gslbServices[0]!;
235239
// Get endpoint slices for this service
236240
var serviceEndpointSlices = endpointSlices
237-
.Where(s => s.Namespace() == service.Namespace() &&
241+
.Where(s => s.Namespace() == service.Namespace() &&
238242
s.GetLabel("kubernetes.io/service-name") == service.Name())
239243
.ToList();
240244

@@ -440,6 +444,7 @@ public async Task WatchClusterHeartbeatsAsync()
440444
var clusterIdentifiers = await _cache.GetClusterIdentifiersAsync();
441445
var timeout = _dateTimeProvider.UtcNow.AddSeconds(-_multiClusterOptions.Value.HeartbeatTimeout);
442446
_logger.LogInformation("Pruning clusters that haven't checked in since {timeout}", timeout);
447+
var clusterRemoved = false;
443448

444449
foreach (var clusterIdentifier in clusterIdentifiers)
445450
{
@@ -454,26 +459,45 @@ public async Task WatchClusterHeartbeatsAsync()
454459
{
455460
_logger.LogWarning("Cluster heartbeat is stale for identifier {clusterIdentifier}", clusterIdentifier);
456461
await _cache.RemoveClusterCacheAsync(clusterIdentifier);
462+
clusterRemoved = true;
457463
}
458464
else
459465
{
460466
_logger.LogTrace("Cluster heartbeat is valid for identifier {clusterIdentifier}", clusterIdentifier);
461467
}
462468
}
469+
470+
if (clusterRemoved)
471+
{
472+
// A cluster was pruned — force an immediate cache sync to remove
473+
// its stale hostname entries without waiting for the next interval.
474+
_cacheSyncStopwatch.Restart();
475+
_logger.LogInformation("Cluster removed, forcing cache synchronization");
476+
await _cache.SynchronizeCachesAsync();
477+
}
463478
}
464479
catch (Exception exception)
465480
{
466481
_logger.LogError(exception, "Error checking cluster heartbeats");
467482
}
468483

469-
try
470-
{
471-
_logger.LogTrace("Making sure stale records are not in the cache");
472-
await _cache.SynchronizeCachesAsync();
473-
}
474-
catch (Exception exception)
484+
// Run periodic cache sync on its own interval, independent of the
485+
// heartbeat check loop, to avoid excessive K8s API calls.
486+
if (_cacheSyncStopwatch.Elapsed.TotalSeconds >= _multiClusterOptions.Value.CacheSyncInterval)
475487
{
476-
_logger.LogError(exception, "Error cleaning stale records in the cache");
488+
try
489+
{
490+
_logger.LogTrace("Making sure stale records are not in the cache");
491+
await _cache.SynchronizeCachesAsync();
492+
}
493+
catch (Exception exception)
494+
{
495+
_logger.LogError(exception, "Error cleaning stale records in the cache");
496+
}
497+
finally
498+
{
499+
_cacheSyncStopwatch.Restart();
500+
}
477501
}
478502
}
479503

0 commit comments

Comments
 (0)