Skip to content

Commit 0340ef5

Browse files
committed
fix: throttle cache synchronization to prevent API call feedback loop
SynchronizeCachesAsync was running on every heartbeat check interval (1s), causing thousands of HostnameCache API calls per minute. Each sync lists ClusterCache and HostnameCache CRDs, and any save triggers the DNS reconciler which lists them again — creating a feedback loop that never reaches quiescence. Changes: - Decouple cache sync from heartbeat loop with a separate CacheSyncInterval (default 30s), configurable via values.yaml - Skip sync if one is already in progress (non-blocking WaitOne) instead of queuing callers - Force immediate sync only when a cluster is actually removed - Increase DNS probe timeouts from 1s to 5s
1 parent 4704e5d commit 0340ef5

6 files changed

Lines changed: 576 additions & 532 deletions

File tree

charts/multicluster-ingress/templates/configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ data:
2828
HeartbeatCheckInterval: "{{ .Values.config.heartbeatCheckInterval }}"
2929
HeartbeatTimeout: "{{ .Values.config.heartbeatTimeout }}"
3030
HeartbeatSetInterval: "{{ .Values.config.heartbeatSetInterval }}"
31+
CacheSyncInterval: "{{ .Values.config.cacheSyncInterval }}"
3132
Serilog__MinimumLevel__Default: "{{ .Values.config.logLevel }}"
3233

3334
PeriodicRefreshInterval: "{{ .Values.config.periodicRefreshInterval }}"

charts/multicluster-ingress/templates/dns-server-deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,13 @@ spec:
8181
path: /Healthz/Liveness
8282
port: http
8383
initialDelaySeconds: 10
84-
timeoutSeconds: 1
84+
timeoutSeconds: 5
8585
readinessProbe:
8686
httpGet:
8787
path: /Healthz/Ready
8888
port: http
8989
initialDelaySeconds: 10
90-
timeoutSeconds: 1
90+
timeoutSeconds: 5
9191
resources:
9292
{{- toYaml .Values.dnsServer.resources | nindent 10 }}
9393
volumeMounts:

charts/multicluster-ingress/values.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,10 @@ config:
216216
# How frequent to send heartbeat updates in seconds
217217
heartbeatSetInterval: 30
218218

219+
# How frequently the orchestrator synchronizes hostname caches in seconds.
220+
# Decoupled from heartbeatCheckInterval to avoid a feedback loop of excessive API calls.
221+
cacheSyncInterval: 30
222+
219223
################################################################################################
220224
### Configuration for peer cluster communication
221225
################################################################################################

src/Cyclops.MultiCluster/Services/Default/DefaultHostnameSynchronizer.cs

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ public class DefaultHostnameSynchronizer : IHostnameSynchronizer
2424
private readonly CancellationToken _shutdownCancellationToken;
2525
private readonly ManualResetEvent _shutdownEvent;
2626
private readonly ManualResetEventSlim _synchronizingLocalClustersEvent = new ManualResetEventSlim(true);
27+
// Tracks elapsed time since the last cache sync to enforce CacheSyncInterval.
28+
// This decouples cache synchronization from the 1-second heartbeat check loop
29+
// to prevent a feedback loop of excessive HostnameCache API calls.
30+
private readonly System.Diagnostics.Stopwatch _cacheSyncStopwatch = System.Diagnostics.Stopwatch.StartNew();
2731

2832
public DefaultHostnameSynchronizer(
2933
ILogger<DefaultHostnameSynchronizer> logger,
@@ -132,7 +136,7 @@ await Task.WhenAll(
132136
{
133137
await _cache.SetResourceVersionAsync(service.Metadata.Uid, service.Metadata.ResourceVersion);
134138
}
135-
139+
136140
// Group endpoint slices by service name and track counts
137141
_logger.LogDebug("Tracking endpoint slices and counts");
138142
var slicesByService = endpointSlices
@@ -234,7 +238,7 @@ await Task.WhenAll(
234238
var service = gslbServices[0]!;
235239
// Get endpoint slices for this service
236240
var serviceEndpointSlices = endpointSlices
237-
.Where(s => s.Namespace() == service.Namespace() &&
241+
.Where(s => s.Namespace() == service.Namespace() &&
238242
s.GetLabel("kubernetes.io/service-name") == service.Name())
239243
.ToList();
240244

@@ -440,6 +444,7 @@ public async Task WatchClusterHeartbeatsAsync()
440444
var clusterIdentifiers = await _cache.GetClusterIdentifiersAsync();
441445
var timeout = _dateTimeProvider.UtcNow.AddSeconds(-_multiClusterOptions.Value.HeartbeatTimeout);
442446
_logger.LogInformation("Pruning clusters that haven't checked in since {timeout}", timeout);
447+
var clusterRemoved = false;
443448

444449
foreach (var clusterIdentifier in clusterIdentifiers)
445450
{
@@ -454,26 +459,45 @@ public async Task WatchClusterHeartbeatsAsync()
454459
{
455460
_logger.LogWarning("Cluster heartbeat is stale for identifier {clusterIdentifier}", clusterIdentifier);
456461
await _cache.RemoveClusterCacheAsync(clusterIdentifier);
462+
clusterRemoved = true;
457463
}
458464
else
459465
{
460466
_logger.LogTrace("Cluster heartbeat is valid for identifier {clusterIdentifier}", clusterIdentifier);
461467
}
462468
}
469+
470+
if (clusterRemoved)
471+
{
472+
// A cluster was pruned — force an immediate cache sync to remove
473+
// its stale hostname entries without waiting for the next interval.
474+
_cacheSyncStopwatch.Restart();
475+
_logger.LogInformation("Cluster removed, forcing cache synchronization");
476+
await _cache.SynchronizeCachesAsync();
477+
}
463478
}
464479
catch (Exception exception)
465480
{
466481
_logger.LogError(exception, "Error checking cluster heartbeats");
467482
}
468483

469-
try
470-
{
471-
_logger.LogTrace("Making sure stale records are not in the cache");
472-
await _cache.SynchronizeCachesAsync();
473-
}
474-
catch (Exception exception)
484+
// Run periodic cache sync on its own interval, independent of the
485+
// heartbeat check loop, to avoid excessive K8s API calls.
486+
if (_cacheSyncStopwatch.Elapsed.TotalSeconds >= _multiClusterOptions.Value.CacheSyncInterval)
475487
{
476-
_logger.LogError(exception, "Error cleaning stale records in the cache");
488+
try
489+
{
490+
_logger.LogTrace("Making sure stale records are not in the cache");
491+
await _cache.SynchronizeCachesAsync();
492+
}
493+
catch (Exception exception)
494+
{
495+
_logger.LogError(exception, "Error cleaning stale records in the cache");
496+
}
497+
finally
498+
{
499+
_cacheSyncStopwatch.Restart();
500+
}
477501
}
478502
}
479503

0 commit comments

Comments
 (0)