Skip to content

Commit dc66ed5

Browse files
committed
feat: add distributed tracing with timing spans and trace propagation
Propagate trace context from build PipelineRuns to Snapshots, integration PipelineRuns, and Release CRs. Emit waitDuration and executeDuration timing spans for build and integration PipelineRuns. Create a new root span when valid trace context is missing. See docs/tracing.md. Assisted-by: Claude Code Signed-off-by: Josiah England <jengland@redhat.com>
1 parent 01805a3 commit dc66ed5

15 files changed

Lines changed: 1602 additions & 5 deletions

File tree

cmd/main.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package main
1818

1919
import (
20+
"context"
2021
"crypto/tls"
2122
"flag"
2223
"os"
@@ -25,6 +26,7 @@ import (
2526
controllers "github.com/konflux-ci/integration-service/internal/controller"
2627
iswebhook "github.com/konflux-ci/integration-service/internal/webhook/v1beta2"
2728
imetrics "github.com/konflux-ci/integration-service/pkg/metrics"
29+
"github.com/konflux-ci/integration-service/pkg/tracing"
2830
"sigs.k8s.io/controller-runtime/pkg/metrics"
2931
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
3032
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
@@ -110,6 +112,18 @@ func main() {
110112

111113
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
112114

115+
// Initialize tracing
116+
ctx := context.Background()
117+
tracerProvider, err := tracing.New(ctx)
118+
if err != nil {
119+
setupLog.Error(err, "failed to initialize tracing")
120+
}
121+
defer func() {
122+
if err := tracerProvider.Shutdown(ctx); err != nil {
123+
setupLog.Error(err, "failed to shutdown tracer provider")
124+
}
125+
}()
126+
113127
// if the enable-http2 flag is false (the default), http/2 should be disabled
114128
// due to its vulnerabilities. More specifically, disabling http/2 will
115129
// prevent from being vulnerable to the HTTP/2 Stream Cancellation and
@@ -193,7 +207,7 @@ func main() {
193207
os.Exit(1)
194208
}
195209

196-
ctx := ctrl.SetupSignalHandler()
210+
ctx = ctrl.SetupSignalHandler()
197211
integrationMetrics := imetrics.NewIntegrationMetrics([]imetrics.AvailabilityProbe{imetrics.NewGithubAppAvailabilityProbe(mgr.GetClient())})
198212
if err := integrationMetrics.InitMetrics(metrics.Registry); err != nil {
199213
setupLog.Error(err, "unable to initialize metrics")

docs/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
- [build-pipeline-controller](https://github.com/konflux-ci/integration-service/blob/main/docs/build_pipeline_controller.md)
66
- [integration-pipeline-controller](https://github.com/konflux-ci/integration-service/blob/main/docs/integration_pipeline_controller.md)
77

8+
## Operational docs
9+
- [Distributed tracing](https://github.com/konflux-ci/integration-service/blob/main/docs/tracing.md)
10+
811
## Creating or editing Mermaid diagrams
912

1013
Mermaid is a JS based diagramming tool that renders markdown style syntax to create/modify diagrams. Mermaid has [native support in Github](https://github.com/github/roadmap/issues/372)

docs/tracing.md

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
<div align="center"><h1>Distributed tracing</h1></div>
2+
3+
The operator emits OpenTelemetry spans for build and integration-test PipelineRuns it reconciles, and propagates the trace context forward onto Snapshots and downstream Release CRs so a single trace can span the build → test → release lifecycle.
4+
5+
## Configuration
6+
7+
| Env var | Purpose | Default |
8+
|---|---|---|
9+
| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP/gRPC collector URL. Unset disables tracing (noop provider). | *(unset)* |
10+
| `OTEL_TRACES_SAMPLER` | Sampler to use. One of `always_on`, `always_off`, `traceidratio`, `parentbased_always_on`, `parentbased_always_off`, `parentbased_traceidratio`. | `parentbased_always_on` |
11+
| `OTEL_TRACES_SAMPLER_ARG` | Ratio for ratio-based samplers (e.g. `0.1`). | *(unused unless a ratio sampler is selected)* |
12+
| `TRACING_LABEL_ACTION` | PipelineRun label read to populate `cicd.pipeline.action.name`. Empty string disables the attribute. | `delivery.tekton.dev/action` |
13+
| `TRACING_LABEL_APPLICATION` | PipelineRun label read to populate `delivery.tekton.dev.application`. Empty string disables the attribute. | `delivery.tekton.dev/application` |
14+
| `TRACING_LABEL_COMPONENT` | PipelineRun label read to populate `delivery.tekton.dev.component`. Empty string disables the attribute. | `delivery.tekton.dev/component` |
15+
16+
## Emitted spans
17+
18+
Two spans are emitted per PipelineRun when it completes:
19+
20+
- `waitDuration`: from `pr.CreationTimestamp` to `pr.Status.StartTime`
21+
- `executeDuration`: from `pr.Status.StartTime` to `pr.Status.CompletionTime`
22+
23+
The build-pipeline and integration-pipeline controllers each emit for their respective PipelineRun types. The `tekton.dev/timingEmitted` annotation guards against re-emission on subsequent reconciles.
24+
25+
## Trace-context propagation
26+
27+
Parenting follows the W3C Trace Context in the `tekton.dev/pipelinerunSpanContext` annotation. The annotation is propagated across resource boundaries so a single trace covers the full delivery flow:
28+
29+
```
30+
build PipelineRun (annotation set by upstream)
31+
└── Snapshot (annotation copied from the build PipelineRun)
32+
├── integration-test PipelineRun (annotation copied from the Snapshot)
33+
└── Release CR (annotation copied from the Snapshot)
34+
```
35+
36+
When the annotation is absent or does not carry a valid trace context, spans are still emitted, but as new root spans without a parent.
37+
38+
## Span attributes
39+
40+
| Attribute | Span | Source |
41+
|---|---|---|
42+
| `namespace` | both | `pr.GetNamespace()` |
43+
| `pipelinerun` | both | `pr.GetName()` |
44+
| `delivery.tekton.dev.pipelinerun_uid` | both | `pr.GetUID()` |
45+
| `cicd.pipeline.action.name` | both | PipelineRun label (name configurable via `TRACING_LABEL_ACTION`) |
46+
| `delivery.tekton.dev.application` | both | PipelineRun label (name configurable via `TRACING_LABEL_APPLICATION`) |
47+
| `delivery.tekton.dev.component` | both | PipelineRun label (name configurable via `TRACING_LABEL_COMPONENT`) |
48+
| `cicd.pipeline.result` | execute | `Succeeded` condition mapped to the semconv `cicd.pipeline.result` enum (`success` / `failure` / `timeout` / `cancellation` / `error`) |
49+
| `delivery.tekton.dev.result_message` | execute | Earliest failing TaskRun's `Succeeded` condition message, falling back to the PipelineRun's own condition message. Omitted on success; truncated to 1024 bytes (UTF-8 safe). |

go.mod

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ require (
4747
github.com/tektoncd/pipeline v1.7.0
4848
github.com/tonglil/buflogr v1.1.1
4949
gitlab.com/gitlab-org/api/client-go v0.134.0
50+
go.opentelemetry.io/otel v1.40.0
51+
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0
52+
go.opentelemetry.io/otel/sdk v1.40.0
53+
go.opentelemetry.io/otel/trace v1.40.0
5054
go.uber.org/mock v0.6.0
5155
go.uber.org/zap v1.27.1
5256
golang.org/x/exp v0.0.0-20260112195511-716be5621a96
@@ -152,12 +156,8 @@ require (
152156
go.opencensus.io v0.24.0 // indirect
153157
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
154158
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.64.0 // indirect
155-
go.opentelemetry.io/otel v1.40.0 // indirect
156159
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 // indirect
157-
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 // indirect
158160
go.opentelemetry.io/otel/metric v1.40.0 // indirect
159-
go.opentelemetry.io/otel/sdk v1.40.0 // indirect
160-
go.opentelemetry.io/otel/trace v1.40.0 // indirect
161161
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
162162
go.uber.org/multierr v1.11.0 // indirect
163163
go.yaml.in/yaml/v2 v2.4.4 // indirect

internal/controller/buildpipeline/buildpipeline_adapter.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
h "github.com/konflux-ci/integration-service/helpers"
3535
"github.com/konflux-ci/integration-service/loader"
3636
intgteststat "github.com/konflux-ci/integration-service/pkg/integrationteststatus"
37+
"github.com/konflux-ci/integration-service/pkg/tracing"
3738
"github.com/konflux-ci/integration-service/snapshot"
3839
"github.com/konflux-ci/integration-service/status"
3940
"github.com/konflux-ci/integration-service/tekton"
@@ -343,6 +344,9 @@ func (a *Adapter) EnsureSnapshotExistsApplication() (result controller.Operation
343344
return controller.RequeueWithError(err)
344345
}
345346

347+
// Emit timing spans for the build PipelineRun if not already emitted
348+
a.emitBuildTimingSpans()
349+
346350
canRemoveFinalizer = true
347351
return controller.ContinueProcessing()
348352
}
@@ -753,6 +757,11 @@ func (a *Adapter) prepareSnapshotForPipelineRun(pipelineRun *tektonv1.PipelineRu
753757
prefixes := []string{gitops.BuildPipelineRunPrefix, gitops.TestLabelPrefix, gitops.CustomLabelPrefix, gitops.ReleaseLabelPrefix}
754758
gitops.CopySnapshotLabelsAndAnnotations(&application.ObjectMeta, snapshot, a.component.Name, &pipelineRun.ObjectMeta, prefixes, true)
755759

760+
// Propagate span context from build PipelineRun to Snapshot for distributed tracing
761+
if tp, found := pipelineRun.Annotations[tektonconsts.SpanContextAnnotation]; found && tp != "" {
762+
snapshot.Annotations[tektonconsts.SpanContextAnnotation] = tp
763+
}
764+
756765
snapshot.Labels[gitops.BuildPipelineRunNameLabel] = pipelineRun.Name
757766
if pipelineRun.Status.CompletionTime != nil {
758767
snapshot.Labels[gitops.BuildPipelineRunFinishTimeLabel] = strconv.FormatInt(pipelineRun.Status.CompletionTime.Unix(), 10)
@@ -1359,3 +1368,30 @@ func (a *Adapter) IsLatestBuildPipelineRunInComponentWithPRGroupHash(buildPlr *t
13591368
a.logger.Info(fmt.Sprintf("The build pipelineRun %s/%s with pr group %s is not the latest for its component, skipped", buildPlr.Namespace, buildPlr.Name, prGroupName))
13601369
return false, nil
13611370
}
1371+
1372+
// emitBuildTimingSpans emits timing spans for the build PipelineRun if not already emitted
1373+
func (a *Adapter) emitBuildTimingSpans() {
1374+
// Check if timing spans have already been emitted
1375+
if _, found := a.pipelineRun.Annotations[tektonconsts.TimingEmittedAnnotation]; found {
1376+
return // Already emitted
1377+
}
1378+
1379+
// Get span context from the PipelineRun
1380+
tp := a.pipelineRun.Annotations[tektonconsts.SpanContextAnnotation]
1381+
1382+
// Emit timing spans
1383+
if tracing.EmitTimingSpans(a.context, a.client, a.pipelineRun, tracing.LoadLabelNames(), tp) {
1384+
// Mark as emitted to avoid duplicates on subsequent reconciles
1385+
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
1386+
var err error
1387+
a.pipelineRun, err = a.loader.GetPipelineRun(a.context, a.client, a.pipelineRun.Name, a.pipelineRun.Namespace)
1388+
if err != nil {
1389+
return err
1390+
}
1391+
return tekton.AnnotateBuildPipelineRun(a.context, a.pipelineRun, tektonconsts.TimingEmittedAnnotation, "true", a.client)
1392+
})
1393+
if err != nil {
1394+
a.logger.Error(err, "Failed to mark build timing spans as emitted")
1395+
}
1396+
}
1397+
}

internal/controller/integrationpipeline/integrationpipeline_adapter.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
h "github.com/konflux-ci/integration-service/helpers"
2727
"github.com/konflux-ci/integration-service/loader"
2828
intgteststat "github.com/konflux-ci/integration-service/pkg/integrationteststatus"
29+
"github.com/konflux-ci/integration-service/pkg/tracing"
2930
"github.com/konflux-ci/integration-service/status"
3031

3132
tektonconsts "github.com/konflux-ci/integration-service/tekton/consts"
@@ -107,6 +108,11 @@ func (a *Adapter) EnsureStatusReportedInSnapshot() (controller.OperationResult,
107108
return controller.RequeueWithError(fmt.Errorf("failed to update test status in snapshot: %w", err))
108109
}
109110

111+
// Emit timing spans for the integration test PipelineRun if finished
112+
if h.HasPipelineRunFinished(a.pipelineRun) {
113+
a.emitTestTimingSpans()
114+
}
115+
110116
// Remove the finalizer from Integration PLRs if the snapshot is not group or component type and its PLR has finished
111117
if (!gitops.IsGroupSnapshot(a.snapshot) && !gitops.IsComponentSnapshot(a.snapshot)) && (h.HasPipelineRunFinished(a.pipelineRun) ||
112118
pipelinerunStatus == intgteststat.IntegrationTestStatusDeleted) {
@@ -206,3 +212,43 @@ func (a *Adapter) annotateIntegrationPipelineRunLogURL(ctx context.Context, adap
206212
return err
207213
})
208214
}
215+
216+
// emitTestTimingSpans emits timing spans for the integration test PipelineRun if not already emitted.
217+
// Works for any snapshot type:
218+
// - Component/Group snapshots: Parented under delivery trace from build PLR
219+
// - Override snapshots: Emitted without parent trace (still useful for timing metrics)
220+
func (a *Adapter) emitTestTimingSpans() {
221+
// Check if timing spans have already been emitted
222+
if _, found := a.pipelineRun.Annotations[tektonconsts.TimingEmittedAnnotation]; found {
223+
return // Already emitted
224+
}
225+
226+
// Get span context from the snapshot (which got it from the build PLR)
227+
tp := ""
228+
if a.snapshot != nil && a.snapshot.Annotations != nil {
229+
tp = a.snapshot.Annotations[tektonconsts.SpanContextAnnotation]
230+
}
231+
232+
// Emit timing spans
233+
emitted := tracing.EmitTimingSpans(a.context, a.client, a.pipelineRun, tracing.LoadLabelNames(), tp)
234+
235+
if emitted {
236+
// Mark as emitted to avoid duplicates on subsequent reconciles
237+
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
238+
var err error
239+
a.pipelineRun, err = a.loader.GetPipelineRun(a.context, a.client, a.pipelineRun.Name, a.pipelineRun.Namespace)
240+
if err != nil {
241+
return err
242+
}
243+
patch := client.MergeFrom(a.pipelineRun.DeepCopy())
244+
if a.pipelineRun.Annotations == nil {
245+
a.pipelineRun.Annotations = make(map[string]string)
246+
}
247+
a.pipelineRun.Annotations[tektonconsts.TimingEmittedAnnotation] = "true"
248+
return a.client.Patch(a.context, a.pipelineRun, patch)
249+
})
250+
if err != nil {
251+
a.logger.Error(err, "Failed to mark test timing spans as emitted")
252+
}
253+
}
254+
}

internal/controller/snapshot/snapshot_adapter.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1074,6 +1074,11 @@ func (a *Adapter) prepareGroupSnapshot(application *applicationapiv1alpha1.Appli
10741074
return nil, nil, err
10751075
}
10761076

1077+
// Propagate span context from triggering snapshot
1078+
if tp := a.snapshot.Annotations[tektonconsts.SpanContextAnnotation]; tp != "" {
1079+
groupSnapshot.Annotations[tektonconsts.SpanContextAnnotation] = tp
1080+
}
1081+
10771082
return groupSnapshot, componentSnapshotInfos, nil
10781083
}
10791084

0 commit comments

Comments
 (0)