Skip to content

Commit cd176e3

Browse files
committed
better retry handling to prevent tight loops
1 parent 9adbdb2 commit cd176e3

File tree

2 files changed

+20
-1
lines changed

2 files changed

+20
-1
lines changed

internal/controller/memgraphcluster_controller.go

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,9 @@ const (
3030
// Requeue intervals
3131
requeueAfterShort = 10 * time.Second
3232
requeueAfterLong = 30 * time.Second
33+
// Error backoff - prevent tight retry loops when operations fail
34+
requeueAfterError = 30 * time.Second
35+
requeueAfterErrorMax = 5 * time.Minute
3336

3437
// Finalizer name
3538
finalizerName = "memgraph.base14.io/finalizer"
@@ -212,14 +215,17 @@ func (r *MemgraphClusterReconciler) reconcileResources(ctx context.Context, clus
212215

213216
// 7. Configure replication if we have a write instance and pods are ready
214217
var registeredReplicas int32
218+
var replicationError error
215219
if writeInstance != "" && len(pods) > 1 {
216220
if err := r.ensureReplicationManager(); err != nil {
217221
log.Error("failed to create replication manager", zap.Error(err))
222+
replicationError = err
218223
} else {
219224
if err := r.replicationManager.ConfigureReplication(ctx, cluster, pods, writeInstance, log); err != nil {
220225
log.Error("failed to configure replication", zap.Error(err))
221226
r.Recorder.Event(cluster, corev1.EventTypeWarning, EventReasonReplicationError,
222227
fmt.Sprintf("Failed to configure replication: %v", err))
228+
replicationError = err
223229
} else {
224230
health, err := r.replicationManager.CheckReplicationHealth(ctx, cluster, writeInstance, log)
225231
if err == nil && health != nil {
@@ -264,6 +270,14 @@ func (r *MemgraphClusterReconciler) reconcileResources(ctx context.Context, clus
264270
return ctrl.Result{RequeueAfter: requeueAfterShort}, nil
265271
}
266272

273+
// If there was a replication error, requeue after requeueAfterError (longer than requeueAfterShort) to avoid overwhelming Memgraph
274+
if replicationError != nil {
275+
log.Info("replication error occurred, using longer backoff",
276+
zap.Duration("requeueAfter", requeueAfterError),
277+
zap.Error(replicationError))
278+
return ctrl.Result{RequeueAfter: requeueAfterError}, nil
279+
}
280+
267281
// Requeue for periodic health checks
268282
return ctrl.Result{RequeueAfter: requeueAfterLong}, nil
269283
}

internal/memgraph/client.go

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -346,8 +346,13 @@ func parseShowReplicasOutput(output string) []ReplicaInfo {
346346
if strings.HasPrefix(line, "|") {
347347
parts := strings.Split(line, "|")
348348
if len(parts) >= 6 {
349+
// Strip surrounding quotes from replica name if present
350+
// Memgraph returns names like "replica_name" but DROP REPLICA expects unquoted names
351+
name := strings.TrimSpace(parts[1])
352+
name = strings.Trim(name, "\"")
353+
349354
replica := ReplicaInfo{
350-
Name: strings.TrimSpace(parts[1]),
355+
Name: name,
351356
Host: strings.TrimSpace(parts[2]),
352357
Mode: strings.TrimSpace(parts[4]),
353358
Status: strings.TrimSpace(parts[5]),

0 commit comments

Comments
 (0)