Skip to content

Commit 4b0bd8d

Browse files
author
Joshua Sierles
committed
Add more verbose logging of reconciliation steps
1 parent 2526fac commit 4b0bd8d

File tree

2 files changed

+190
-23
lines changed

2 files changed

+190
-23
lines changed

internal/controller/postgrescluster/controller.go

Lines changed: 171 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,19 +79,29 @@ func (r *Reconciler) Reconcile(
7979
log := logging.FromContext(ctx)
8080
defer span.End()
8181

82+
log.Info("Starting reconciliation", "cluster", request.Name)
83+
8284
// get the postgrescluster from the cache
8385
cluster := &v1beta1.PostgresCluster{}
8486
if err := r.Client.Get(ctx, request.NamespacedName, cluster); err != nil {
8587
// NotFound cannot be fixed by requeuing so ignore it. During background
8688
// deletion, we receive delete events from cluster's dependents after
8789
// cluster is deleted.
8890
if err = client.IgnoreNotFound(err); err != nil {
89-
log.Error(err, "unable to fetch PostgresCluster")
91+
log.Error(err, "unable to fetch PostgresCluster", "cluster", request.Name)
9092
span.RecordError(err)
93+
} else {
94+
log.Info("PostgresCluster not found, likely deleted", "cluster", request.Name)
9195
}
9296
return runtime.ErrorWithBackoff(err)
9397
}
9498

99+
log.Info("Retrieved PostgresCluster",
100+
"cluster", cluster.Name,
101+
"generation", cluster.GetGeneration(),
102+
"observedGeneration", cluster.Status.ObservedGeneration,
103+
"resourceVersion", cluster.GetResourceVersion())
104+
95105
// Set any defaults that may not have been stored in the API. No DeepCopy
96106
// is necessary because controller-runtime makes a copy before returning
97107
// from its cache.
@@ -114,12 +124,12 @@ func (r *Reconciler) Reconcile(
114124
// deleted or there was an error.
115125
if result, err := r.handleDelete(ctx, cluster); err != nil {
116126
span.RecordError(err)
117-
log.Error(err, "deleting")
127+
log.Error(err, "deleting", "cluster", cluster.Name)
118128
return runtime.ErrorWithBackoff(err)
119129

120130
} else if result != nil {
121131
if log := log.V(1); log.Enabled() {
122-
log.Info("deleting", "result", fmt.Sprintf("%+v", *result))
132+
log.Info("deleting", "cluster", cluster.Name, "result", fmt.Sprintf("%+v", *result))
123133
}
124134
return *result, nil
125135
}
@@ -180,10 +190,10 @@ func (r *Reconciler) Reconcile(
180190
// managed fields on the status subresource: https://issue.k8s.io/88901
181191
if err := r.Client.Status().Patch(
182192
ctx, cluster, client.MergeFrom(before), r.Owner); err != nil {
183-
log.Error(err, "patching cluster status")
193+
log.Error(err, "patching cluster status", "cluster", cluster.Name)
184194
return err
185195
}
186-
log.V(1).Info("patched cluster status")
196+
log.V(1).Info("patched cluster status", "cluster", cluster.Name)
187197
}
188198
return nil
189199
}
@@ -210,7 +220,16 @@ func (r *Reconciler) Reconcile(
210220
}
211221

212222
if err == nil {
223+
log.V(1).Info("Starting backups reconciliation check", "cluster", cluster.Name)
213224
backupsSpecFound, backupsReconciliationAllowed, err = r.BackupsEnabled(ctx, cluster)
225+
if err != nil {
226+
log.Error(err, "Failed to check backups status", "cluster", cluster.Name)
227+
} else {
228+
log.V(1).Info("Backups status",
229+
"cluster", cluster.Name,
230+
"backupsSpecFound", backupsSpecFound,
231+
"backupsReconciliationAllowed", backupsReconciliationAllowed)
232+
}
214233

215234
// If we cannot reconcile because the backup reconciliation is paused, set a condition and exit
216235
if !backupsReconciliationAllowed {
@@ -264,7 +283,11 @@ func (r *Reconciler) Reconcile(
264283
postgres.SetHugePages(cluster, &pgParameters)
265284

266285
if err == nil {
286+
log.V(1).Info("Starting root certificate reconciliation", "cluster", cluster.Name)
267287
rootCA, err = r.reconcileRootCertificate(ctx, cluster)
288+
if err != nil {
289+
log.Error(err, "Failed to reconcile root certificate", "cluster", cluster.Name)
290+
}
268291
}
269292

270293
if err == nil {
@@ -274,45 +297,92 @@ func (r *Reconciler) Reconcile(
274297
// return a bool indicating that the controller should return early while any
275298
// required Jobs are running, after which it will indicate that an early
276299
// return is no longer needed, and reconciliation can proceed normally.
300+
log.V(1).Info("Starting directory move jobs reconciliation", "cluster", cluster.Name)
301+
277302
returnEarly, err := r.reconcileDirMoveJobs(ctx, cluster)
303+
if err != nil {
304+
log.Error(err, "Failed to reconcile directory move jobs", "cluster", cluster.Name)
305+
} else if returnEarly {
306+
log.Info("Directory move jobs in progress, returning early", "cluster", cluster.Name)
307+
}
278308
if err != nil || returnEarly {
279309
return runtime.ErrorWithBackoff(errors.Join(err, patchClusterStatus()))
280310
}
281311
}
282312
if err == nil {
313+
log.V(1).Info("Starting PVC observation", "cluster", cluster.Name)
283314
clusterVolumes, err = r.observePersistentVolumeClaims(ctx, cluster)
315+
if err != nil {
316+
log.Error(err, "Failed to observe PVCs", "cluster", cluster.Name)
317+
}
284318
}
285319
if err == nil {
320+
log.V(1).Info("Starting PVC configuration", "cluster", cluster.Name)
286321
clusterVolumes, err = r.configureExistingPVCs(ctx, cluster, clusterVolumes)
322+
if err != nil {
323+
log.Error(err, "Failed to configure PVCs", "cluster", cluster.Name)
324+
}
287325
}
288326
if err == nil {
327+
log.V(1).Info("Starting instance observation", "cluster", cluster.Name)
289328
instances, err = r.observeInstances(ctx, cluster)
329+
if err != nil {
330+
log.Error(err, "Failed to observe instances", "cluster", cluster.Name)
331+
} else {
332+
// Count writable instances
333+
writableCount := 0
334+
for _, instance := range instances.forCluster {
335+
if writable, known := instance.IsWritable(); writable && known {
336+
writableCount++
337+
}
338+
}
339+
log.V(1).Info("Instance observation complete",
340+
"cluster", cluster.Name,
341+
"instanceCount", len(instances.forCluster),
342+
"writableInstanceCount", writableCount)
343+
}
290344
}
291345

292346
result := reconcile.Result{}
293347

294348
if err == nil {
349+
log.V(1).Info("Starting Patroni status reconciliation", "cluster", cluster.Name)
295350
var requeue time.Duration
296351
if requeue, err = r.reconcilePatroniStatus(ctx, cluster, instances); err == nil && requeue > 0 {
297352
result.RequeueAfter = requeue
353+
log.V(1).Info("Patroni status reconciliation requires requeue",
354+
"cluster", cluster.Name,
355+
"requeueAfter", requeue)
298356
}
299357
}
300358
if err == nil {
359+
log.V(1).Info("Starting Patroni switchover reconciliation", "cluster", cluster.Name)
301360
err = r.reconcilePatroniSwitchover(ctx, cluster, instances)
361+
if err != nil {
362+
log.Error(err, "Failed to reconcile Patroni switchover", "cluster", cluster.Name)
363+
}
302364
}
303365
// reconcile the Pod service before reconciling any data source in case it is necessary
304366
// to start Pods during data source reconciliation that require network connections (e.g.
305367
// if it is necessary to start a dedicated repo host to bootstrap a new cluster using its
306368
// own existing backups).
307369
if err == nil {
370+
log.V(1).Info("Starting cluster pod service reconciliation", "cluster", cluster.Name)
308371
clusterPodService, err = r.reconcileClusterPodService(ctx, cluster)
372+
if err != nil {
373+
log.Error(err, "Failed to reconcile cluster pod service", "cluster", cluster.Name)
374+
}
309375
}
310376
// reconcile the RBAC resources before reconciling any data source in case
311377
// restore/move Job pods require the ServiceAccount to access any data source.
312378
// e.g., we are restoring from an S3 source using an IAM for access
313379
// - https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts-technical-overview.html
314380
if err == nil {
381+
log.V(1).Info("Starting RBAC resources reconciliation", "cluster", cluster.Name)
315382
instanceServiceAccount, err = r.reconcileRBACResources(ctx, cluster)
383+
if err != nil {
384+
log.Error(err, "Failed to reconcile RBAC resources", "cluster", cluster.Name)
385+
}
316386
}
317387
// First handle reconciling any data source configured for the PostgresCluster. This includes
318388
// reconciling the data source defined to bootstrap a new cluster, as well as a reconciling
@@ -330,93 +400,188 @@ func (r *Reconciler) Reconcile(
330400
}
331401
}
332402
if err == nil {
403+
log.V(1).Info("Starting cluster config map reconciliation", "cluster", cluster.Name)
333404
clusterConfigMap, err = r.reconcileClusterConfigMap(ctx, cluster, pgHBAs, pgParameters)
405+
if err != nil {
406+
log.Error(err, "Failed to reconcile cluster config map", "cluster", cluster.Name)
407+
}
334408
}
335409
if err == nil {
410+
log.V(1).Info("Starting replication secret reconciliation", "cluster", cluster.Name)
336411
clusterReplicationSecret, err = r.reconcileReplicationSecret(ctx, cluster, rootCA)
412+
if err != nil {
413+
log.Error(err, "Failed to reconcile replication secret", "cluster", cluster.Name)
414+
}
337415
}
338416
if err == nil {
417+
log.V(1).Info("Starting Patroni leader lease reconciliation", "cluster", cluster.Name)
339418
patroniLeaderService, err = r.reconcilePatroniLeaderLease(ctx, cluster)
419+
if err != nil {
420+
log.Error(err, "Failed to reconcile Patroni leader lease", "cluster", cluster.Name)
421+
}
340422
}
341423
if err == nil {
424+
log.V(1).Info("Starting primary service reconciliation", "cluster", cluster.Name)
342425
primaryService, err = r.reconcileClusterPrimaryService(ctx, cluster, patroniLeaderService)
426+
if err != nil {
427+
log.Error(err, "Failed to reconcile primary service", "cluster", cluster.Name)
428+
}
343429
}
344430
if err == nil {
431+
log.V(1).Info("Starting replica service reconciliation", "cluster", cluster.Name)
345432
replicaService, err = r.reconcileClusterReplicaService(ctx, cluster)
433+
if err != nil {
434+
log.Error(err, "Failed to reconcile replica service", "cluster", cluster.Name)
435+
}
346436
}
347437
if err == nil {
438+
log.V(1).Info("Starting cluster certificate reconciliation", "cluster", cluster.Name)
348439
primaryCertificate, err = r.reconcileClusterCertificate(ctx, rootCA, cluster, primaryService, replicaService)
440+
if err != nil {
441+
log.Error(err, "Failed to reconcile cluster certificate", "cluster", cluster.Name)
442+
}
349443
}
350444
if err == nil {
445+
log.V(1).Info("Starting Patroni distributed configuration reconciliation", "cluster", cluster.Name)
351446
err = r.reconcilePatroniDistributedConfiguration(ctx, cluster)
447+
if err != nil {
448+
log.Error(err, "Failed to reconcile Patroni distributed configuration", "cluster", cluster.Name)
449+
}
352450
}
353451
if err == nil {
452+
log.V(1).Info("Starting Patroni dynamic configuration reconciliation", "cluster", cluster.Name)
354453
err = r.reconcilePatroniDynamicConfiguration(ctx, cluster, instances, pgHBAs, pgParameters)
454+
if err != nil {
455+
log.Error(err, "Failed to reconcile Patroni dynamic configuration", "cluster", cluster.Name)
456+
}
355457
}
356458
if err == nil {
459+
log.V(1).Info("Starting monitoring secret reconciliation", "cluster", cluster.Name)
357460
monitoringSecret, err = r.reconcileMonitoringSecret(ctx, cluster)
461+
if err != nil {
462+
log.Error(err, "Failed to reconcile monitoring secret", "cluster", cluster.Name)
463+
}
358464
}
359465
if err == nil {
466+
log.V(1).Info("Starting exporter queries config reconciliation", "cluster", cluster.Name)
360467
exporterQueriesConfig, err = r.reconcileExporterQueriesConfig(ctx, cluster)
468+
if err != nil {
469+
log.Error(err, "Failed to reconcile exporter queries config", "cluster", cluster.Name)
470+
}
361471
}
362472
if err == nil {
473+
log.V(1).Info("Starting exporter web config reconciliation", "cluster", cluster.Name)
363474
exporterWebConfig, err = r.reconcileExporterWebConfig(ctx, cluster)
475+
if err != nil {
476+
log.Error(err, "Failed to reconcile exporter web config", "cluster", cluster.Name)
477+
}
364478
}
365479
if err == nil {
480+
log.V(1).Info("Starting instance sets reconciliation", "cluster", cluster.Name)
366481
err = r.reconcileInstanceSets(
367482
ctx, cluster, clusterConfigMap, clusterReplicationSecret, rootCA,
368483
clusterPodService, instanceServiceAccount, instances, patroniLeaderService,
369484
primaryCertificate, clusterVolumes, exporterQueriesConfig, exporterWebConfig,
370485
backupsSpecFound,
371486
)
487+
if err != nil {
488+
log.Error(err, "Failed to reconcile instance sets", "cluster", cluster.Name)
489+
}
372490
}
373491

374492
if err == nil {
493+
log.V(1).Info("Starting Postgres databases reconciliation", "cluster", cluster.Name)
375494
err = r.reconcilePostgresDatabases(ctx, cluster, instances)
495+
if err != nil {
496+
log.Error(err, "Failed to reconcile Postgres databases", "cluster", cluster.Name)
497+
}
376498
}
377499
if err == nil {
500+
log.V(1).Info("Starting Postgres users reconciliation", "cluster", cluster.Name)
378501
err = r.reconcilePostgresUsers(ctx, cluster, instances)
502+
if err != nil {
503+
log.Error(err, "Failed to reconcile Postgres users", "cluster", cluster.Name)
504+
}
379505
}
380506

381507
if err == nil {
508+
log.V(1).Info("Starting PGBackRest reconciliation", "cluster", cluster.Name)
382509
var next reconcile.Result
383510
if next, err = r.reconcilePGBackRest(ctx, cluster,
384511
instances, rootCA, backupsSpecFound); err == nil && !next.IsZero() {
385512
result.Requeue = result.Requeue || next.Requeue
386513
if next.RequeueAfter > 0 {
514+
log.V(1).Info("PGBackRest reconciliation requires requeue",
515+
"cluster", cluster.Name,
516+
"requeueAfter", next.RequeueAfter)
387517
result.RequeueAfter = next.RequeueAfter
388518
}
389519
}
520+
if err != nil {
521+
log.Error(err, "Failed to reconcile PGBackRest", "cluster", cluster.Name)
522+
}
390523
}
391524
if err == nil {
525+
log.V(1).Info("Starting dedicated snapshot volume reconciliation", "cluster", cluster.Name)
392526
dedicatedSnapshotPVC, err = r.reconcileDedicatedSnapshotVolume(ctx, cluster, clusterVolumes)
527+
if err != nil {
528+
log.Error(err, "Failed to reconcile dedicated snapshot volume", "cluster", cluster.Name)
529+
}
393530
}
394531
if err == nil {
532+
log.V(1).Info("Starting volume snapshots reconciliation", "cluster", cluster.Name)
395533
err = r.reconcileVolumeSnapshots(ctx, cluster, dedicatedSnapshotPVC)
534+
if err != nil {
535+
log.Error(err, "Failed to reconcile volume snapshots", "cluster", cluster.Name)
536+
}
396537
}
397538
if err == nil {
539+
log.V(1).Info("Starting PGBouncer reconciliation", "cluster", cluster.Name)
398540
err = r.reconcilePGBouncer(ctx, cluster, instances, primaryCertificate, rootCA)
541+
if err != nil {
542+
log.Error(err, "Failed to reconcile PGBouncer", "cluster", cluster.Name)
543+
}
399544
}
400545
if err == nil {
546+
log.V(1).Info("Starting PGMonitor reconciliation", "cluster", cluster.Name)
401547
err = r.reconcilePGMonitor(ctx, cluster, instances, monitoringSecret)
548+
if err != nil {
549+
log.Error(err, "Failed to reconcile PGMonitor", "cluster", cluster.Name)
550+
}
402551
}
403552
if err == nil {
553+
log.V(1).Info("Starting database init SQL reconciliation", "cluster", cluster.Name)
404554
err = r.reconcileDatabaseInitSQL(ctx, cluster, instances)
555+
if err != nil {
556+
log.Error(err, "Failed to reconcile database init SQL", "cluster", cluster.Name)
557+
}
405558
}
406559
if err == nil {
560+
log.V(1).Info("Starting PGAdmin reconciliation", "cluster", cluster.Name)
407561
err = r.reconcilePGAdmin(ctx, cluster)
562+
if err != nil {
563+
log.Error(err, "Failed to reconcile PGAdmin", "cluster", cluster.Name)
564+
}
408565
}
409566
if err == nil {
410567
// This is after [Reconciler.rolloutInstances] to ensure that recreating
411568
// Pods takes precedence.
569+
log.V(1).Info("Starting Patroni restarts reconciliation", "cluster", cluster.Name)
412570
err = r.handlePatroniRestarts(ctx, cluster, instances)
571+
if err != nil {
572+
log.Error(err, "Failed to reconcile Patroni restarts", "cluster", cluster.Name)
573+
}
413574
}
414575

415576
// at this point everything reconciled successfully, and we can update the
416577
// observedGeneration
417578
cluster.Status.ObservedGeneration = cluster.GetGeneration()
418579

419-
log.V(1).Info("reconciled cluster")
580+
log.Info("Completed reconciliation",
581+
"cluster", cluster.Name,
582+
"generation", cluster.GetGeneration(),
583+
"observedGeneration", cluster.Status.ObservedGeneration,
584+
"result", fmt.Sprintf("%+v", result))
420585

421586
return result, errors.Join(err, patchClusterStatus())
422587
}

0 commit comments

Comments
 (0)