@@ -15,6 +15,10 @@ import (
1515
1616 "github.com/go-logr/logr"
1717 "github.com/percona/percona-postgresql-operator/internal/logging"
18+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
19+ "go.opentelemetry.io/otel"
20+ "go.opentelemetry.io/otel/attribute"
21+ "go.opentelemetry.io/otel/trace"
1822 corev1 "k8s.io/api/core/v1"
1923 "sigs.k8s.io/controller-runtime/pkg/client"
2024)
@@ -412,12 +416,12 @@ func newInstanceClient(
412416
413417 httpClient := & http.Client {
414418 Timeout : 30 * time .Second ,
415- Transport : & http.Transport {
419+ Transport : otelhttp . NewTransport ( & http.Transport {
416420 TLSClientConfig : & tls.Config {
417421 RootCAs : caCert ,
418422 Certificates : []tls.Certificate {clientCert },
419423 },
420- },
424+ }) ,
421425 }
422426
423427 return instanceClient {
@@ -431,12 +435,14 @@ type HTTPClient struct {
431435 kubeClient client.Client
432436 client instanceClient
433437 logger logr.Logger
438+ tracer trace.Tracer
434439}
435440
436441var _ API = HTTPClient {}
437442
438443func NewHttpClient (ctx context.Context , kube client.Client , podName string ) (HTTPClient , error ) {
439444 logger := logging .FromContext (ctx ).WithName ("patroni.http" )
445+ tracer := otel .Tracer ("github.com/percona/percona-postgresql-operator/patroni" )
440446
441447 // We can extract all the information we need from the podName due to the
442448 // way podName is built: ${namespace}-${instanceSuffix}-${podNumeral}.
@@ -455,13 +461,22 @@ func NewHttpClient(ctx context.Context, kube client.Client, podName string) (HTT
455461 client : patroniHttpClient ,
456462 kubeClient : kube ,
457463 logger : logger ,
464+ tracer : tracer ,
458465 }, nil
459466}
460467
461468// ReplaceConfiguration is called when the operator believes the Patroni configuration needs to be updated due to CRD changes.
// In the patched (+) version the putConfig call runs inside a span named
// "patroni.replace-configuration" started from h.tracer; a non-nil error is recorded on that
// span and then returned to the caller unchanged.
// NOTE(review): per the OpenTelemetry Go API, RecordError only adds an error event and does not
// change the span status — confirm whether span.SetStatus(codes.Error, ...) is also wanted.
462469func (h HTTPClient ) ReplaceConfiguration (ctx context.Context , configuration map [string ]any ) error {
470+ ctx , span := h .tracer .Start (ctx , "patroni.replace-configuration" )
471+ defer span .End ()
472+
463473 h .logger .Info ("Calling ReplaceConfiguration" )
464- return h .client .putConfig (ctx , configuration )
474+
475+ err := h .client .putConfig (ctx , configuration )
476+ if err != nil {
477+ span .RecordError (err )
478+ }
479+ return err
465480}
466481
467482// Called when the operator detects pod restarts or changes that require pod restarts, such
@@ -471,14 +486,34 @@ func (h HTTPClient) ChangePrimaryAndWait(
471486 leader , candidate string ,
472487 _patroniVer4 bool ,
473488) (bool , error ) {
489+ ctx , span := h .tracer .Start (ctx , "patroni.change-primary" )
490+ defer span .End ()
491+
492+ span .SetAttributes (
493+ attribute .String ("patroni.leader" , leader ),
494+ attribute .String ("patroni.candidate" , candidate ),
495+ attribute .Bool ("patroni.ver4" , _patroniVer4 ),
496+ )
497+
474498 h .logger .WithValues ("leader" , leader ).Info ("Calling ChangePrimaryAndWait" )
475- return h .client .switchover (ctx , leader , candidate )
499+
500+ success , err := h .client .switchover (ctx , leader , candidate )
501+ if err != nil {
502+ span .RecordError (err )
503+ }
504+ span .SetAttributes (attribute .Bool ("patroni.success" , success ))
505+ return success , err
476506}
477507
478508// SwitchoverAndWait is very similar to ChangePrimaryAndWait, but was implemented by the
479509// Percona team for the reconcileSwitchover method. The difference is that the caller does
480510// not provide a leader; the current leader is auto-detected before the switchover.
481511func (h HTTPClient ) SwitchoverAndWait (ctx context.Context , candidate string ) (bool , error ) {
512+ ctx , span := h .tracer .Start (ctx , "patroni.switchover" )
513+ defer span .End ()
514+
515+ span .SetAttributes (attribute .String ("patroni.candidate" , candidate ))
516+
482517 h .logger .WithValues ("candidate" , candidate ).Info ("Calling SwitchoverAndWait" )
483518 leader , err := h .client .getLeader (ctx )
484519
@@ -487,9 +522,12 @@ func (h HTTPClient) SwitchoverAndWait(ctx context.Context, candidate string) (bo
487522 err ,
488523 "Failed to auto-detect current leader for switchover" ,
489524 )
525+ span .RecordError (err )
490526 return false , fmt .Errorf ("failed to detect current leader: %w" , err )
491527 }
492528
529+ span .SetAttributes (attribute .String ("patroni.leader" , leader .Name ))
530+
493531 // NOTE:
494532 // Potential race condition where the leader changes between these two calls.
495533 // If this happens, Patroni will error out and the operation will be retried.
@@ -500,22 +538,47 @@ func (h HTTPClient) SwitchoverAndWait(ctx context.Context, candidate string) (bo
500538 "candidate" ,
501539 candidate ,
502540 )
503- return h .client .switchover (ctx , leader .Name , candidate )
541+
542+ success , err := h .client .switchover (ctx , leader .Name , candidate )
543+ if err != nil {
544+ span .RecordError (err )
545+ }
546+ span .SetAttributes (attribute .Bool ("patroni.success" , success ))
547+ return success , err
504548}
505549
506550// FailoverAndWait tries to change the leader when the cluster is NOT healthy. When it's
507551// healthy, switchover is advised.
// The patched (+) version wraps the call in a "patroni.failover" span that carries the
// patroni.candidate attribute and, after h.client.failover returns, patroni.success (set on
// both the success and the error path). A non-nil error is recorded on the span and returned
// together with the success flag, unchanged.
508552// Ref.: https://patroni.readthedocs.io/en/latest/rest_api.html#failover
509553func (h HTTPClient ) FailoverAndWait (ctx context.Context , candidate string ) (bool , error ) {
554+ ctx , span := h .tracer .Start (ctx , "patroni.failover" )
555+ defer span .End ()
556+
557+ span .SetAttributes (attribute .String ("patroni.candidate" , candidate ))
558+
510559 h .logger .WithValues ("candidate" , candidate ).Info ("Calling FailoverAndWait" )
511- return h .client .failover (ctx , candidate )
560+
561+ success , err := h .client .failover (ctx , candidate )
562+ if err != nil {
563+ span .RecordError (err )
564+ }
565+ span .SetAttributes (attribute .Bool ("patroni.success" , success ))
566+ return success , err
567}
513568
514569// RestartPendingMembers restarts Patroni members that have a pending restart and match a given role.
515570// The pending status is given by Patroni when it detects that a configuration was updated that
516571// requires a restart to take effect. The operator watches for the pending status and first
517572// asks for the leader to be restarted, followed by its replicas.
518573func (h HTTPClient ) RestartPendingMembers (ctx context.Context , role , _scope string ) error {
574+ ctx , span := h .tracer .Start (ctx , "patroni.restart-pending-members" )
575+ defer span .End ()
576+
577+ span .SetAttributes (
578+ attribute .String ("patroni.role" , role ),
579+ attribute .String ("patroni.scope" , _scope ),
580+ )
581+
519582 h .logger .WithValues ("role" , role ).Info ("Calling RestartPendingMembers" )
520583
521584 if role == "" {
@@ -525,6 +588,7 @@ func (h HTTPClient) RestartPendingMembers(ctx context.Context, role, _scope stri
525588 // function was called without a role.
526589 err := fmt .Errorf ("role is empty" )
527590 h .logger .Error (err , "Failed to restart pending members" )
591+ span .RecordError (err )
528592 return err
529593 }
530594
@@ -539,9 +603,12 @@ func (h HTTPClient) RestartPendingMembers(ctx context.Context, role, _scope stri
539603 members , err := h .client .getMembersByRole (ctx , roles )
540604 if err != nil {
541605 h .logger .Error (err , "Failed to fetch cluster members" )
606+ span .RecordError (err )
542607 return err
543608 }
544609
610+ span .SetAttributes (attribute .Int ("patroni.members_found" , len (members )))
611+
545612 if len (members ) == 0 {
546613 h .logger .Info ("Found no members to restart" , "role" , role )
547614 return nil
@@ -552,18 +619,21 @@ func (h HTTPClient) RestartPendingMembers(ctx context.Context, role, _scope stri
552619
553620 podMetadata , err := extractMetadataFromPodName (member .Name )
554621 if err != nil {
622+ span .RecordError (err )
555623 return err
556624 }
557625
558626 client , err := newInstanceClient (ctx , h .logger , h .kubeClient , podMetadata )
559627 if err != nil {
560628 h .logger .Error (err , "Failed to create client for pod" , "pod" , member .Name )
629+ span .RecordError (err )
561630 return err
562631 }
563632
564633 err = client .restartPendingWithRole (ctx , role )
565634 if err != nil {
566635 h .logger .Error (err , "Restart failed for pod" , "pod" , member .Name )
636+ span .RecordError (err )
567637 return err
568638 }
569639 }
@@ -576,20 +646,32 @@ func (h HTTPClient) RestartPendingMembers(ctx context.Context, role, _scope stri
576646// Used as sanity check by the operator before a switchover/failover operation.
577647// Returns the timeline of the running leader, or 0 if no running leader found.
// A getLeader error is not propagated: it is logged at Info level and reported as (0, nil).
// The patched (+) version runs inside a "patroni.get-timeline" span and sets patroni.leader_found,
// patroni.leader_name, patroni.leader_state and patroni.timeline as they become known.
// NOTE(review): on the getLeader error path only patroni.leader_found=false is set; the error
// itself is not recorded on the span — confirm this is intended.
578648func (h HTTPClient ) GetTimeline (ctx context.Context ) (int64 , error ) {
649+ ctx , span := h .tracer .Start (ctx , "patroni.get-timeline" )
650+ defer span .End ()
651+
579652 h .logger .Info ("Calling GetTimeline" )
580653
581654 leader , err := h .client .getLeader (ctx )
582655 if err != nil {
583656 h .logger .Info ("No leader found for timeline" , "error" , err )
657+ span .SetAttributes (attribute .Bool ("patroni.leader_found" , false ))
584658 return 0 , nil // Return 0 when no leader (matches CLI behavior)
585659 }
586660
661+ span .SetAttributes (
662+ attribute .Bool ("patroni.leader_found" , true ),
663+ attribute .String ("patroni.leader_name" , leader .Name ),
664+ attribute .String ("patroni.leader_state" , leader .State ),
665+ )
666+
587667 // Check if leader is running (same logic as CLI implementation)
588668 if leader .State != "running" {
589669 h .logger .Info ("Leader not in running state" , "state" , leader .State )
670+ span .SetAttributes (attribute .Int64 ("patroni.timeline" , 0 ))
590671 return 0 , nil
591672 }
592673
593674 h .logger .Info ("Found running leader" , "member" , leader .Name , "timeline" , leader .Timeline )
675+ span .SetAttributes (attribute .Int64 ("patroni.timeline" , leader .Timeline ))
594676 return leader .Timeline , nil
595677}
0 commit comments