Skip to content

Commit d317c9d

Browse files
committed
feat(patroni/http): adds tracing
Simple tracing support. Also fixes a test. Signed-off-by: Juliana Oliveira <[email protected]>
1 parent 873f5de commit d317c9d

File tree

2 files changed

+105
-7
lines changed

2 files changed

+105
-7
lines changed

internal/patroni/api_http.go

Lines changed: 88 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ import (
1515

1616
"github.com/go-logr/logr"
1717
"github.com/percona/percona-postgresql-operator/internal/logging"
18+
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
19+
"go.opentelemetry.io/otel"
20+
"go.opentelemetry.io/otel/attribute"
21+
"go.opentelemetry.io/otel/trace"
1822
corev1 "k8s.io/api/core/v1"
1923
"sigs.k8s.io/controller-runtime/pkg/client"
2024
)
@@ -412,12 +416,12 @@ func newInstanceClient(
412416

413417
httpClient := &http.Client{
414418
Timeout: 30 * time.Second,
415-
Transport: &http.Transport{
419+
Transport: otelhttp.NewTransport(&http.Transport{
416420
TLSClientConfig: &tls.Config{
417421
RootCAs: caCert,
418422
Certificates: []tls.Certificate{clientCert},
419423
},
420-
},
424+
}),
421425
}
422426

423427
return instanceClient{
@@ -431,12 +435,14 @@ type HTTPClient struct {
431435
kubeClient client.Client
432436
client instanceClient
433437
logger logr.Logger
438+
tracer trace.Tracer
434439
}
435440

436441
var _ API = HTTPClient{}
437442

438443
func NewHttpClient(ctx context.Context, kube client.Client, podName string) (HTTPClient, error) {
439444
logger := logging.FromContext(ctx).WithName("patroni.http")
445+
tracer := otel.Tracer("github.com/percona/percona-postgresql-operator/patroni")
440446

441447
// We can extract all the information we need from the podName due to the
442448
// way podName is built: ${namespace}-${instanceSuffix}-${podNumeral}.
@@ -455,13 +461,22 @@ func NewHttpClient(ctx context.Context, kube client.Client, podName string) (HTT
455461
client: patroniHttpClient,
456462
kubeClient: kube,
457463
logger: logger,
464+
tracer: tracer,
458465
}, nil
459466
}
460467

461468
// Called when the operator believes Patroni configuration needs to be updated due to CRD changes.
462469
func (h HTTPClient) ReplaceConfiguration(ctx context.Context, configuration map[string]any) error {
470+
ctx, span := h.tracer.Start(ctx, "patroni.replace-configuration")
471+
defer span.End()
472+
463473
h.logger.Info("Calling ReplaceConfiguration")
464-
return h.client.putConfig(ctx, configuration)
474+
475+
err := h.client.putConfig(ctx, configuration)
476+
if err != nil {
477+
span.RecordError(err)
478+
}
479+
return err
465480
}
466481

467482
// Called when the operator detects pod restarts or changes that require pod restarts, such
@@ -471,14 +486,34 @@ func (h HTTPClient) ChangePrimaryAndWait(
471486
leader, candidate string,
472487
_patroniVer4 bool,
473488
) (bool, error) {
489+
ctx, span := h.tracer.Start(ctx, "patroni.change-primary")
490+
defer span.End()
491+
492+
span.SetAttributes(
493+
attribute.String("patroni.leader", leader),
494+
attribute.String("patroni.candidate", candidate),
495+
attribute.Bool("patroni.ver4", _patroniVer4),
496+
)
497+
474498
h.logger.WithValues("leader", leader).Info("Calling ChangePrimaryAndWait")
475-
return h.client.switchover(ctx, leader, candidate)
499+
500+
success, err := h.client.switchover(ctx, leader, candidate)
501+
if err != nil {
502+
span.RecordError(err)
503+
}
504+
span.SetAttributes(attribute.Bool("patroni.success", success))
505+
return success, err
476506
}
477507

478508
// Very similar to ChangePrimaryAndWait, but implemented by the Percona team for
479509
// the reconcileSwitchover method. The difference here is that SwitchoverAndWait
480510
// does not provide a leader.
481511
func (h HTTPClient) SwitchoverAndWait(ctx context.Context, candidate string) (bool, error) {
512+
ctx, span := h.tracer.Start(ctx, "patroni.switchover")
513+
defer span.End()
514+
515+
span.SetAttributes(attribute.String("patroni.candidate", candidate))
516+
482517
h.logger.WithValues("candidate", candidate).Info("Calling SwitchoverAndWait")
483518
leader, err := h.client.getLeader(ctx)
484519

@@ -487,9 +522,12 @@ func (h HTTPClient) SwitchoverAndWait(ctx context.Context, candidate string) (bo
487522
err,
488523
"Failed to auto-detect current leader for switchover",
489524
)
525+
span.RecordError(err)
490526
return false, fmt.Errorf("failed to detect current leader: %w", err)
491527
}
492528

529+
span.SetAttributes(attribute.String("patroni.leader", leader.Name))
530+
493531
// NOTE:
494532
// Potential race condition where the leader changes between these two calls.
495533
// If this happens, Patroni will error out and the operation will be retried.
@@ -500,22 +538,47 @@ func (h HTTPClient) SwitchoverAndWait(ctx context.Context, candidate string) (bo
500538
"candidate",
501539
candidate,
502540
)
503-
return h.client.switchover(ctx, leader.Name, candidate)
541+
542+
success, err := h.client.switchover(ctx, leader.Name, candidate)
543+
if err != nil {
544+
span.RecordError(err)
545+
}
546+
span.SetAttributes(attribute.Bool("patroni.success", success))
547+
return success, err
504548
}
505549

506550
// FailoverAndWait tries to change the leader when the cluster is NOT healthy. When it's
507551
// healthy, switchover is advised.
508552
// Ref.: https://patroni.readthedocs.io/en/latest/rest_api.html#failover
509553
func (h HTTPClient) FailoverAndWait(ctx context.Context, candidate string) (bool, error) {
554+
ctx, span := h.tracer.Start(ctx, "patroni.failover")
555+
defer span.End()
556+
557+
span.SetAttributes(attribute.String("patroni.candidate", candidate))
558+
510559
h.logger.WithValues("candidate", candidate).Info("Calling FailoverAndWait")
511-
return h.client.failover(ctx, candidate)
560+
561+
success, err := h.client.failover(ctx, candidate)
562+
if err != nil {
563+
span.RecordError(err)
564+
}
565+
span.SetAttributes(attribute.Bool("patroni.success", success))
566+
return success, err
512567
}
513568

514569
// Restarts Patroni members that have a pending restart and match a given role.
515570
// The pending status is given by Patroni when it detects that a configuration was updated that
516571
// requires a restart to take effect. The operator watches for the pending status and first
517572
// asks for the leader to be restarted, followed by its replicas.
518573
func (h HTTPClient) RestartPendingMembers(ctx context.Context, role, _scope string) error {
574+
ctx, span := h.tracer.Start(ctx, "patroni.restart-pending-members")
575+
defer span.End()
576+
577+
span.SetAttributes(
578+
attribute.String("patroni.role", role),
579+
attribute.String("patroni.scope", _scope),
580+
)
581+
519582
h.logger.WithValues("role", role).Info("Calling RestartPendingMembers")
520583

521584
if role == "" {
@@ -525,6 +588,7 @@ func (h HTTPClient) RestartPendingMembers(ctx context.Context, role, _scope stri
525588
// function was called without a role.
526589
err := fmt.Errorf("role is empty")
527590
h.logger.Error(err, "Failed to restart pending members")
591+
span.RecordError(err)
528592
return err
529593
}
530594

@@ -539,9 +603,12 @@ func (h HTTPClient) RestartPendingMembers(ctx context.Context, role, _scope stri
539603
members, err := h.client.getMembersByRole(ctx, roles)
540604
if err != nil {
541605
h.logger.Error(err, "Failed to fetch cluster members")
606+
span.RecordError(err)
542607
return err
543608
}
544609

610+
span.SetAttributes(attribute.Int("patroni.members_found", len(members)))
611+
545612
if len(members) == 0 {
546613
h.logger.Info("Found no members to restart", "role", role)
547614
return nil
@@ -552,18 +619,21 @@ func (h HTTPClient) RestartPendingMembers(ctx context.Context, role, _scope stri
552619

553620
podMetadata, err := extractMetadataFromPodName(member.Name)
554621
if err != nil {
622+
span.RecordError(err)
555623
return err
556624
}
557625

558626
client, err := newInstanceClient(ctx, h.logger, h.kubeClient, podMetadata)
559627
if err != nil {
560628
h.logger.Error(err, "Failed to create client for pod", "pod", member.Name)
629+
span.RecordError(err)
561630
return err
562631
}
563632

564633
err = client.restartPendingWithRole(ctx, role)
565634
if err != nil {
566635
h.logger.Error(err, "Restart failed for pod", "pod", member.Name)
636+
span.RecordError(err)
567637
return err
568638
}
569639
}
@@ -576,20 +646,32 @@ func (h HTTPClient) RestartPendingMembers(ctx context.Context, role, _scope stri
576646
// Used as sanity check by the operator before a switchover/failover operation.
577647
// Returns the timeline of the running leader, or 0 if no running leader found.
578648
func (h HTTPClient) GetTimeline(ctx context.Context) (int64, error) {
649+
ctx, span := h.tracer.Start(ctx, "patroni.get-timeline")
650+
defer span.End()
651+
579652
h.logger.Info("Calling GetTimeline")
580653

581654
leader, err := h.client.getLeader(ctx)
582655
if err != nil {
583656
h.logger.Info("No leader found for timeline", "error", err)
657+
span.SetAttributes(attribute.Bool("patroni.leader_found", false))
584658
return 0, nil // Return 0 when no leader (matches CLI behavior)
585659
}
586660

661+
span.SetAttributes(
662+
attribute.Bool("patroni.leader_found", true),
663+
attribute.String("patroni.leader_name", leader.Name),
664+
attribute.String("patroni.leader_state", leader.State),
665+
)
666+
587667
// Check if leader is running (same logic as CLI implementation)
588668
if leader.State != "running" {
589669
h.logger.Info("Leader not in running state", "state", leader.State)
670+
span.SetAttributes(attribute.Int64("patroni.timeline", 0))
590671
return 0, nil
591672
}
592673

593674
h.logger.Info("Found running leader", "member", leader.Name, "timeline", leader.Timeline)
675+
span.SetAttributes(attribute.Int64("patroni.timeline", leader.Timeline))
594676
return leader.Timeline, nil
595677
}

internal/patroni/api_http_test.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"testing"
1010

1111
"github.com/go-logr/logr"
12+
"go.opentelemetry.io/otel"
1213
"gotest.tools/v3/assert"
1314
corev1 "k8s.io/api/core/v1"
1415
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -152,6 +153,7 @@ func TestHTTPClientChangePrimaryAndWait(t *testing.T) {
152153
logger: logr.Discard(),
153154
},
154155
logger: logr.Discard(),
156+
tracer: otel.Tracer("test"),
155157
}
156158

157159
success, err := httpClient.ChangePrimaryAndWait(context.Background(), "old", "new", true)
@@ -185,6 +187,7 @@ func TestHTTPClientChangePrimaryAndWait(t *testing.T) {
185187
logger: logr.Discard(),
186188
},
187189
logger: logr.Discard(),
190+
tracer: otel.Tracer("test"),
188191
}
189192

190193
success, err := httpClient.ChangePrimaryAndWait(context.Background(), "old", "new", true)
@@ -202,6 +205,7 @@ func TestHTTPClientChangePrimaryAndWait(t *testing.T) {
202205
logger: logr.Discard(),
203206
},
204207
logger: logr.Discard(),
208+
tracer: otel.Tracer("test"),
205209
}
206210

207211
success, err := httpClient.ChangePrimaryAndWait(context.Background(), "", "new", true)
@@ -232,6 +236,7 @@ func TestHTTPClientChangePrimaryAndWait(t *testing.T) {
232236
logger: logr.Discard(),
233237
},
234238
logger: logr.Discard(),
239+
tracer: otel.Tracer("test"),
235240
}
236241

237242
success, err := httpClient.ChangePrimaryAndWait(context.Background(), "old", "new", true)
@@ -282,6 +287,7 @@ func TestHTTPClientSwitchoverAndWait(t *testing.T) {
282287
logger: logr.Discard(),
283288
},
284289
logger: logr.Discard(),
290+
tracer: otel.Tracer("test"),
285291
}
286292

287293
success, err := httpClient.SwitchoverAndWait(context.Background(), "new")
@@ -327,6 +333,7 @@ func TestHTTPClientSwitchoverAndWait(t *testing.T) {
327333
logger: logr.Discard(),
328334
},
329335
logger: logr.Discard(),
336+
tracer: otel.Tracer("test"),
330337
}
331338

332339
success, err := httpClient.SwitchoverAndWait(context.Background(), "new")
@@ -361,6 +368,7 @@ func TestHTTPClientFailoverAndWait(t *testing.T) {
361368
logger: logr.Discard(),
362369
},
363370
logger: logr.Discard(),
371+
tracer: otel.Tracer("test"),
364372
}
365373

366374
success, err := httpClient.FailoverAndWait(context.Background(), "new")
@@ -392,6 +400,7 @@ func TestHTTPClientFailoverAndWait(t *testing.T) {
392400
logger: logr.Discard(),
393401
},
394402
logger: logr.Discard(),
403+
tracer: otel.Tracer("test"),
395404
}
396405

397406
success, err := httpClient.FailoverAndWait(context.Background(), "new")
@@ -409,13 +418,14 @@ func TestHTTPClientFailoverAndWait(t *testing.T) {
409418
logger: logr.Discard(),
410419
},
411420
logger: logr.Discard(),
421+
tracer: otel.Tracer("test"),
412422
}
413423

414424
success, err := httpClient.FailoverAndWait(context.Background(), "")
415425

416426
assert.Assert(t, err != nil)
417427
assert.Assert(t, !success)
418-
assert.Assert(t, strings.Contains(err.Error(), "failover requires a specific candidate"))
428+
assert.Assert(t, strings.Contains(err.Error(), "candidate is required for failover"))
419429
})
420430

421431
// Same as switchover. If we failover but to a different candidate, the
@@ -439,6 +449,7 @@ func TestHTTPClientFailoverAndWait(t *testing.T) {
439449
logger: logr.Discard(),
440450
},
441451
logger: logr.Discard(),
452+
tracer: otel.Tracer("test"),
442453
}
443454

444455
success, err := httpClient.FailoverAndWait(context.Background(), "new")
@@ -474,6 +485,7 @@ func TestHTTPClientReplaceConfiguration(t *testing.T) {
474485
logger: logr.Discard(),
475486
},
476487
logger: logr.Discard(),
488+
tracer: otel.Tracer("test"),
477489
}
478490

479491
config := map[string]any{"some": "values"}
@@ -504,6 +516,7 @@ func TestHTTPClientReplaceConfiguration(t *testing.T) {
504516
logger: logr.Discard(),
505517
},
506518
logger: logr.Discard(),
519+
tracer: otel.Tracer("test"),
507520
}
508521

509522
err := httpClient.ReplaceConfiguration(
@@ -553,6 +566,7 @@ func TestHTTPClientGetTimeline(t *testing.T) {
553566
logger: logr.Discard(),
554567
},
555568
logger: logr.Discard(),
569+
tracer: otel.Tracer("test"),
556570
}
557571

558572
timeline, err := httpClient.GetTimeline(context.Background())
@@ -591,6 +605,7 @@ func TestHTTPClientGetTimeline(t *testing.T) {
591605
logger: logr.Discard(),
592606
},
593607
logger: logr.Discard(),
608+
tracer: otel.Tracer("test"),
594609
}
595610

596611
timeline, err := httpClient.GetTimeline(context.Background())
@@ -629,6 +644,7 @@ func TestHTTPClientGetTimeline(t *testing.T) {
629644
logger: logr.Discard(),
630645
},
631646
logger: logr.Discard(),
647+
tracer: otel.Tracer("test"),
632648
}
633649

634650
timeline, err := httpClient.GetTimeline(context.Background())

0 commit comments

Comments
 (0)