Skip to content

Commit 53aeaf6

Browse files
committed
feat: add distributed tracing with timing spans and trace propagation
Propagate W3C trace context from build PipelineRuns to Snapshots, integration PipelineRuns, and Release CRs. Emit waitDuration and executeDuration timing spans on completed PipelineRuns across both ComponentGroup and Application model flows. Tracing is opt-in via OTEL_EXPORTER_OTLP_ENDPOINT; without it set, the service uses a noop tracer. The sampler family is selected via OTEL_TRACES_SAMPLER. Assisted-by: Claude Code Signed-off-by: Josiah England <jengland@redhat.com>
1 parent 760e850 commit 53aeaf6

23 files changed

Lines changed: 2129 additions & 8 deletions

cmd/main.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package main
1818

1919
import (
20+
"context"
2021
"crypto/tls"
2122
"flag"
2223
"os"
@@ -25,6 +26,7 @@ import (
2526
controllers "github.com/konflux-ci/integration-service/internal/controller"
2627
iswebhook "github.com/konflux-ci/integration-service/internal/webhook/v1beta2"
2728
imetrics "github.com/konflux-ci/integration-service/pkg/metrics"
29+
"github.com/konflux-ci/integration-service/pkg/tracing"
2830
"sigs.k8s.io/controller-runtime/pkg/metrics"
2931
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
3032
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
@@ -110,6 +112,16 @@ func main() {
110112

111113
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
112114

115+
// Initialize tracing
116+
tracerProvider := tracing.New()
117+
defer func() {
118+
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
119+
defer cancel()
120+
if err := tracerProvider.Shutdown(shutdownCtx); err != nil {
121+
setupLog.Error(err, "failed to shutdown tracer provider")
122+
}
123+
}()
124+
113125
// if the enable-http2 flag is false (the default), http/2 should be disabled
114126
// due to its vulnerabilities. More specifically, disabling http/2 will
115127
// prevent from being vulnerable to the HTTP/2 Stream Cancellation and

docs/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
- [build-pipeline-controller](https://github.com/konflux-ci/integration-service/blob/main/docs/build_pipeline_controller.md)
66
- [integration-pipeline-controller](https://github.com/konflux-ci/integration-service/blob/main/docs/integration_pipeline_controller.md)
77

8+
## Operational docs
9+
- [Distributed tracing](https://github.com/konflux-ci/integration-service/blob/main/docs/tracing.md)
10+
811
## Creating or editing Mermaid diagrams
912

1013
Mermaid is a JS based diagramming tool that renders markdown style syntax to create/modify diagrams. Mermaid has [native support in Github](https://github.com/github/roadmap/issues/372)

docs/tracing.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
<div align="center"><h1>Distributed tracing</h1></div>
2+
3+
The operator emits OpenTelemetry spans for build and integration-test PipelineRuns it reconciles, and propagates the trace context forward onto Snapshots and downstream Release CRs so a single trace can span the build, test, and release lifecycle.
4+
5+
## Configuration
6+
7+
| Env var | Purpose | Default |
8+
|---|---|---|
9+
| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP/gRPC collector URL. Unset disables tracing (noop provider). | *(unset)* |
10+
| `OTEL_TRACES_SAMPLER` | `always_on`, `always_off`, `traceidratio`, `parentbased_always_off`, `parentbased_traceidratio`. | `parentbased_always_on` |
11+
| `OTEL_TRACES_SAMPLER_ARG` | Ratio for ratio-based samplers (e.g. `0.1`). | *(unused unless a ratio sampler is selected)* |
12+
| `TRACING_LABEL_ACTION` | PipelineRun label read to populate `cicd.pipeline.action.name`. Empty string disables the attribute. | `delivery.tekton.dev/action` |
13+
| `TRACING_LABEL_APPLICATION` | PipelineRun label read to populate `delivery.tekton.dev.application`. Empty string disables the attribute. | `delivery.tekton.dev/application` |
14+
| `TRACING_LABEL_COMPONENT` | PipelineRun label read to populate `delivery.tekton.dev.component`. Empty string disables the attribute. | `delivery.tekton.dev/component` |
15+
16+
## Emitted spans
17+
18+
Two spans are emitted per PipelineRun when it completes:
19+
20+
- `waitDuration`: `pr.CreationTimestamp` to `pr.Status.StartTime`
21+
- `executeDuration`: `pr.Status.StartTime` to `pr.Status.CompletionTime`
22+
23+
The build-pipeline and integration-pipeline controllers each emit for their respective PipelineRun types. The `delivery.tekton.dev/timingEmitted` annotation guards against re-emission on subsequent reconciles.
24+
25+
## Trace-context propagation
26+
27+
Parenting follows the W3C Trace Context in the `tekton.dev/pipelinerunSpanContext` annotation. The annotation is propagated across resource boundaries so a single trace covers the full delivery flow:
28+
29+
```
30+
build PipelineRun (annotation set by upstream)
31+
└── Snapshot (annotation copied from the build PipelineRun)
32+
├── integration-test PipelineRun (annotation copied from the Snapshot)
33+
└── Release CR (annotation copied from the Snapshot)
34+
```
35+
36+
When the annotation is absent, spans are still emitted but without a parent.
37+
38+
## Span attributes
39+
40+
| Attribute | Span | Source |
41+
|---|---|---|
42+
| `namespace` | both | `pr.GetNamespace()` |
43+
| `pipelinerun` | both | `pr.GetName()` |
44+
| `delivery.tekton.dev.pipelinerun_uid` | both | `pr.GetUID()` |
45+
| `cicd.pipeline.action.name` | both | PipelineRun label (name configurable via `TRACING_LABEL_ACTION`) |
46+
| `delivery.tekton.dev.application` | both | PipelineRun label (name configurable via `TRACING_LABEL_APPLICATION`) |
47+
| `delivery.tekton.dev.component` | both | PipelineRun label (name configurable via `TRACING_LABEL_COMPONENT`) |
48+
| `cicd.pipeline.result` | execute | `Succeeded` condition mapped to the semconv `cicd.pipeline.result` enum (`success` / `failure` / `timeout` / `cancellation` / `error`) |
49+
| `delivery.tekton.dev.result_message` | execute | PipelineRun's `Succeeded` condition message on failure. Omitted on success; truncated to 1024 bytes (UTF-8 safe). |
50+
51+
## Superseded Snapshot dedup
52+
53+
When the snapshot controller skips auto-release because a newer Snapshot for the same Application has already been released, a single `waitDuration` span is emitted on the superseded Snapshot's trace context, anchored at the Snapshot's `CreationTimestamp` and ending at the `AutoReleased` condition's `LastTransitionTime`. It carries `cicd.pipeline.result=skip` and `delivery.tekton.dev.result_message="Released in newer Snapshot"` so the trace distinguishes the deliberate dedup from a broken chain.

e2e-tests/pkg/clients/gitlab/git.go

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,40 @@ import (
1010
"gitlab.com/gitlab-org/api/client-go/v2"
1111
)
1212

13+
// GitLab pre-receive hooks surface 5xx wrapped in a 4xx; RetryTransport's status-code check misses it.
14+
func isGitlabTransientError(err error) bool {
15+
if err == nil {
16+
return false
17+
}
18+
msg := err.Error()
19+
return strings.Contains(msg, "Internal API error") ||
20+
strings.Contains(msg, "504 Gateway Timeout")
21+
}
22+
23+
func (gc *GitlabClient) createBranchWithRetry(projectID string, opt *gitlab.CreateBranchOptions) (*gitlab.Response, error) {
24+
const maxAttempts = 5
25+
const baseDelay = 2 * time.Second
26+
var resp *gitlab.Response
27+
var err error
28+
for attempt := 1; attempt <= maxAttempts; attempt++ {
29+
_, resp, err = gc.client.Branches.CreateBranch(projectID, opt)
30+
if err == nil {
31+
return resp, nil
32+
}
33+
if resp != nil && resp.StatusCode == http.StatusConflict {
34+
return resp, err
35+
}
36+
if !isGitlabTransientError(err) || attempt == maxAttempts {
37+
return resp, err
38+
}
39+
delay := baseDelay * time.Duration(1<<(attempt-1))
40+
fmt.Printf("[gitlab-retry] CreateBranch attempt %d/%d: %v; retrying in %s\n",
41+
attempt, maxAttempts, err, delay)
42+
time.Sleep(delay)
43+
}
44+
return resp, err
45+
}
46+
1347
// CreateBranch creates a new branch in a GitLab project with the given projectID and newBranchName
1448
func (gc *GitlabClient) CreateBranch(projectID, newBranchName, defaultBranch string) error {
1549
// Prepare the branch creation request
@@ -19,7 +53,7 @@ func (gc *GitlabClient) CreateBranch(projectID, newBranchName, defaultBranch str
1953
}
2054

2155
// Perform the branch creation
22-
_, _, err := gc.client.Branches.CreateBranch(projectID, branchOpts)
56+
_, err := gc.createBranchWithRetry(projectID, branchOpts)
2357
if err != nil {
2458
return fmt.Errorf("failed to create branch %s in project %s: %w", newBranchName, projectID, err)
2559
}
@@ -77,7 +111,7 @@ func (gc *GitlabClient) CreateGitlabNewBranch(projectID, branchName, sha, baseBr
77111
Branch: &branchName,
78112
Ref: &sha,
79113
}
80-
_, resp, err := gc.client.Branches.CreateBranch(projectID, opt)
114+
resp, err := gc.createBranchWithRetry(projectID, opt)
81115
if err != nil {
82116
// Check if the error is due to the branch already existing
83117
if resp != nil && resp.StatusCode == http.StatusConflict {

gitops/snapshot.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,10 @@ const (
251251
// SnapshotAutoReleasedCondition is the condition for marking if Snapshot was auto-released released with AppStudio.
252252
SnapshotAutoReleasedCondition = "AutoReleased"
253253

254+
// SnapshotSupersededMessage is the AutoReleased condition message when
255+
// an older Snapshot is skipped by a newer Snapshot for the same Application.
256+
SnapshotSupersededMessage = "Released in newer Snapshot"
257+
254258
// SnapshotAddedToGlobalCandidateListCondition is the condition for marking if Snapshot's component was added to
255259
// the global candidate list.
256260
SnapshotAddedToGlobalCandidateListCondition = "AddedToGlobalCandidateList"

go.mod

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ require (
4747
github.com/tektoncd/pipeline v1.7.0
4848
github.com/tonglil/buflogr v1.1.1
4949
gitlab.com/gitlab-org/api/client-go/v2 v2.36.0
50+
go.opentelemetry.io/otel v1.40.0
51+
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0
52+
go.opentelemetry.io/otel/sdk v1.40.0
53+
go.opentelemetry.io/otel/trace v1.40.0
5054
go.uber.org/mock v0.6.0
5155
go.uber.org/zap v1.27.1
5256
golang.org/x/oauth2 v0.36.0
@@ -151,12 +155,8 @@ require (
151155
go.opencensus.io v0.24.0 // indirect
152156
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
153157
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.64.0 // indirect
154-
go.opentelemetry.io/otel v1.40.0 // indirect
155158
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 // indirect
156-
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 // indirect
157159
go.opentelemetry.io/otel/metric v1.40.0 // indirect
158-
go.opentelemetry.io/otel/sdk v1.40.0 // indirect
159-
go.opentelemetry.io/otel/trace v1.40.0 // indirect
160160
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
161161
go.uber.org/multierr v1.11.0 // indirect
162162
go.yaml.in/yaml/v2 v2.4.4 // indirect

internal/controller/buildpipeline/buildpipeline_adapter.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
h "github.com/konflux-ci/integration-service/helpers"
3535
"github.com/konflux-ci/integration-service/loader"
3636
intgteststat "github.com/konflux-ci/integration-service/pkg/integrationteststatus"
37+
"github.com/konflux-ci/integration-service/pkg/tracing"
3738
"github.com/konflux-ci/integration-service/snapshot"
3839
"github.com/konflux-ci/integration-service/status"
3940
"github.com/konflux-ci/integration-service/tekton"
@@ -158,6 +159,7 @@ func (a *Adapter) EnsureSnapshotExists() (result controller.OperationResult, err
158159
var canRemoveFinalizer bool
159160

160161
defer func() {
162+
a.emitBuildTimingSpans()
161163
// Don't write a failure annotation for transient Chains-not-signed errors
162164
annotationErr := err
163165
if h.IsChainsNotSignedError(err) {
@@ -261,6 +263,7 @@ func (a *Adapter) EnsureSnapshotExistsApplication() (result controller.Operation
261263
var canRemoveFinalizer bool
262264

263265
defer func() {
266+
a.emitBuildTimingSpans()
264267
// Don't write a failure annotation for transient Chains-not-signed errors
265268
annotationErr := err
266269
if h.IsChainsNotSignedError(err) {
@@ -847,6 +850,11 @@ func (a *Adapter) prepareSnapshotForPipelineRun(pipelineRun *tektonv1.PipelineRu
847850
prefixes := []string{gitops.BuildPipelineRunPrefix, gitops.TestLabelPrefix, gitops.CustomLabelPrefix, gitops.ReleaseLabelPrefix}
848851
gitops.CopySnapshotLabelsAndAnnotations(&application.ObjectMeta, snapshot, a.component.Name, &pipelineRun.ObjectMeta, prefixes, true)
849852

853+
// Propagate span context from build PipelineRun to Snapshot for distributed tracing
854+
if tp, found := pipelineRun.Annotations[tracing.SpanContextAnnotation]; found && tp != "" {
855+
snapshot.Annotations[tracing.SpanContextAnnotation] = tp
856+
}
857+
850858
snapshot.Labels[gitops.BuildPipelineRunNameLabel] = pipelineRun.Name
851859
if pipelineRun.Status.CompletionTime != nil {
852860
snapshot.Labels[gitops.BuildPipelineRunFinishTimeLabel] = strconv.FormatInt(pipelineRun.Status.CompletionTime.Unix(), 10)
@@ -1466,3 +1474,18 @@ func (a *Adapter) IsLatestBuildPipelineRunInComponentWithPRGroupHash(buildPlr *t
14661474
a.logger.Info(fmt.Sprintf("The build pipelineRun %s/%s with pr group %s is not the latest for its component, skipped", buildPlr.Namespace, buildPlr.Name, prGroupName))
14671475
return false, nil
14681476
}
1477+
1478+
// emitBuildTimingSpans emits timing spans for the build PipelineRun if not already emitted
1479+
func (a *Adapter) emitBuildTimingSpans() {
1480+
spanContext := a.pipelineRun.Annotations[tracing.SpanContextAnnotation]
1481+
patched, err := tracing.EmitAndMarkTimingSpans(a.context, a.pipelineRun, spanContext, "", a.client, func() (*tektonv1.PipelineRun, error) {
1482+
return a.loader.GetPipelineRun(a.context, a.client, a.pipelineRun.Name, a.pipelineRun.Namespace)
1483+
})
1484+
if err != nil {
1485+
a.logger.Error(err, "Failed to emit and mark build timing spans")
1486+
return
1487+
}
1488+
if patched != nil {
1489+
a.pipelineRun = patched
1490+
}
1491+
}

internal/controller/buildpipeline/buildpipeline_adapter_test.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,16 @@ import (
3232
"github.com/konflux-ci/integration-service/helpers"
3333
"github.com/konflux-ci/integration-service/loader"
3434
intgteststat "github.com/konflux-ci/integration-service/pkg/integrationteststatus"
35+
"github.com/konflux-ci/integration-service/pkg/tracing"
3536
"github.com/konflux-ci/integration-service/snapshot"
3637
"github.com/konflux-ci/integration-service/status"
3738
"github.com/konflux-ci/integration-service/tekton"
3839
"github.com/konflux-ci/operator-toolkit/metadata"
3940
"knative.dev/pkg/apis"
4041
v1 "knative.dev/pkg/apis/duck/v1"
4142

43+
"go.opentelemetry.io/otel"
44+
sdktrace "go.opentelemetry.io/otel/sdk/trace"
4245
"go.uber.org/mock/gomock"
4346
k8serrors "k8s.io/apimachinery/pkg/api/errors"
4447
"k8s.io/apimachinery/pkg/types"
@@ -3841,6 +3844,95 @@ var _ = Describe("Pipeline Adapter", Ordered, func() {
38413844

38423845
})
38433846

3847+
When("emitBuildTimingSpans is called", func() {
3848+
BeforeEach(func() {
3849+
adapter = createAdapter()
3850+
})
3851+
3852+
It("does nothing when the PipelineRun already has the timingEmitted annotation", func() {
3853+
adapter.pipelineRun.Annotations[tracing.TimingEmittedAnnotation] = "true"
3854+
adapter.emitBuildTimingSpans()
3855+
Expect(adapter.pipelineRun.Annotations[tracing.TimingEmittedAnnotation]).To(Equal("true"))
3856+
})
3857+
3858+
It("does not annotate the PipelineRun when the global tracer provider is the noop", func() {
3859+
adapter.pipelineRun.Status.StartTime = &metav1.Time{Time: time.Now().Add(-time.Minute)}
3860+
adapter.pipelineRun.Status.CompletionTime = &metav1.Time{Time: time.Now()}
3861+
Expect(k8sClient.Status().Update(ctx, adapter.pipelineRun)).To(Succeed())
3862+
3863+
adapter.emitBuildTimingSpans()
3864+
Expect(adapter.pipelineRun.GetAnnotations()).NotTo(HaveKey(tracing.TimingEmittedAnnotation))
3865+
})
3866+
3867+
It("does not annotate the PipelineRun when start or completion time is missing", func() {
3868+
prev := otel.GetTracerProvider()
3869+
tp := sdktrace.NewTracerProvider()
3870+
otel.SetTracerProvider(tp)
3871+
DeferCleanup(func() {
3872+
otel.SetTracerProvider(prev)
3873+
_ = tp.Shutdown(ctx)
3874+
})
3875+
3876+
adapter.emitBuildTimingSpans()
3877+
Expect(adapter.pipelineRun.GetAnnotations()).NotTo(HaveKey(tracing.TimingEmittedAnnotation))
3878+
})
3879+
3880+
It("emits spans and patches the timingEmitted annotation when the run has completed", func() {
3881+
prev := otel.GetTracerProvider()
3882+
tp := sdktrace.NewTracerProvider()
3883+
otel.SetTracerProvider(tp)
3884+
DeferCleanup(func() {
3885+
otel.SetTracerProvider(prev)
3886+
_ = tp.Shutdown(ctx)
3887+
})
3888+
3889+
adapter.pipelineRun.Status.StartTime = &metav1.Time{Time: time.Now().Add(-time.Minute)}
3890+
adapter.pipelineRun.Status.CompletionTime = &metav1.Time{Time: time.Now()}
3891+
Expect(k8sClient.Status().Update(ctx, adapter.pipelineRun)).To(Succeed())
3892+
adapter.context = toolkit.GetMockedContext(ctx, []toolkit.MockData{
3893+
{
3894+
ContextKey: loader.GetPipelineRunContextKey,
3895+
Resource: adapter.pipelineRun,
3896+
},
3897+
})
3898+
3899+
adapter.emitBuildTimingSpans()
3900+
3901+
Eventually(func(g Gomega) {
3902+
updated := &tektonv1.PipelineRun{}
3903+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: adapter.pipelineRun.Name, Namespace: adapter.pipelineRun.Namespace}, updated)).To(Succeed())
3904+
g.Expect(updated.GetAnnotations()).To(HaveKeyWithValue(tracing.TimingEmittedAnnotation, "true"))
3905+
}).Should(Succeed())
3906+
})
3907+
3908+
It("logs and returns without marking the run when the refetch errors, so reconciliation continues and the next pass retries", func() {
3909+
prev := otel.GetTracerProvider()
3910+
tp := sdktrace.NewTracerProvider()
3911+
otel.SetTracerProvider(tp)
3912+
DeferCleanup(func() {
3913+
otel.SetTracerProvider(prev)
3914+
_ = tp.Shutdown(ctx)
3915+
})
3916+
3917+
adapter.pipelineRun.Status.StartTime = &metav1.Time{Time: time.Now().Add(-time.Minute)}
3918+
adapter.pipelineRun.Status.CompletionTime = &metav1.Time{Time: time.Now()}
3919+
Expect(k8sClient.Status().Update(ctx, adapter.pipelineRun)).To(Succeed())
3920+
adapter.context = toolkit.GetMockedContext(ctx, []toolkit.MockData{
3921+
{
3922+
ContextKey: loader.GetPipelineRunContextKey,
3923+
Err: errors.New("refetch boom"),
3924+
},
3925+
})
3926+
3927+
Expect(func() { adapter.emitBuildTimingSpans() }).NotTo(Panic())
3928+
Expect(adapter.pipelineRun.GetAnnotations()).NotTo(HaveKey(tracing.TimingEmittedAnnotation))
3929+
3930+
stored := &tektonv1.PipelineRun{}
3931+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: adapter.pipelineRun.Name, Namespace: adapter.pipelineRun.Namespace}, stored)).To(Succeed())
3932+
Expect(stored.GetAnnotations()).NotTo(HaveKey(tracing.TimingEmittedAnnotation))
3933+
})
3934+
})
3935+
38443936
createAdapter = func() *Adapter {
38453937
adapter = NewAdapter(ctx, buildPipelineRun, hasComp, &[]v1beta2.ComponentGroup{*hasCompGroup}, logger, loader.NewMockLoader(), k8sClient)
38463938
return adapter

0 commit comments

Comments
 (0)