Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package main

import (
"context"
"crypto/tls"
"flag"
"os"
Expand All @@ -25,6 +26,7 @@ import (
controllers "github.com/konflux-ci/integration-service/internal/controller"
iswebhook "github.com/konflux-ci/integration-service/internal/webhook/v1beta2"
imetrics "github.com/konflux-ci/integration-service/pkg/metrics"
"github.com/konflux-ci/integration-service/pkg/tracing"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
Expand Down Expand Up @@ -110,6 +112,16 @@ func main() {

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

// Initialize tracing
tracerProvider := tracing.New()
defer func() {
Comment thread
dirgim marked this conversation as resolved.
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := tracerProvider.Shutdown(shutdownCtx); err != nil {
setupLog.Error(err, "failed to shutdown tracer provider")
}
}()

// if the enable-http2 flag is false (the default), http/2 should be disabled
// due to its vulnerabilities. More specifically, disabling http/2 will
// prevent from being vulnerable to the HTTP/2 Stream Cancellation and
Expand Down
3 changes: 3 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
- [build-pipeline-controller](https://github.com/konflux-ci/integration-service/blob/main/docs/build_pipeline_controller.md)
- [integration-pipeline-controller](https://github.com/konflux-ci/integration-service/blob/main/docs/integration_pipeline_controller.md)

## Operational docs
- [Distributed tracing](https://github.com/konflux-ci/integration-service/blob/main/docs/tracing.md)

## Creating or editing Mermaid diagrams

Mermaid is a JS-based diagramming tool that renders Markdown-style syntax to create/modify diagrams. Mermaid has [native support in GitHub](https://github.com/github/roadmap/issues/372)
Expand Down
53 changes: 53 additions & 0 deletions docs/tracing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<div align="center"><h1>Distributed tracing</h1></div>

The operator emits OpenTelemetry spans for build and integration-test PipelineRuns it reconciles, and propagates the trace context forward onto Snapshots and downstream Release CRs so a single trace can span the build, test, and release lifecycle.

## Configuration

| Env var | Purpose | Default |
|---|---|---|
| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP/gRPC collector URL. Unset disables tracing (noop provider). | *(unset)* |
| `OTEL_TRACES_SAMPLER` | `always_on`, `always_off`, `traceidratio`, `parentbased_always_off`, `parentbased_traceidratio`. | `parentbased_always_on` |
| `OTEL_TRACES_SAMPLER_ARG` | Ratio for ratio-based samplers (e.g. `0.1`). | *(unused unless a ratio sampler is selected)* |
| `TRACING_LABEL_ACTION` | PipelineRun label read to populate `cicd.pipeline.action.name`. Empty string disables the attribute. | `delivery.tekton.dev/action` |
| `TRACING_LABEL_APPLICATION` | PipelineRun label read to populate `delivery.tekton.dev.application`. Empty string disables the attribute. | `delivery.tekton.dev/application` |
| `TRACING_LABEL_COMPONENT` | PipelineRun label read to populate `delivery.tekton.dev.component`. Empty string disables the attribute. | `delivery.tekton.dev/component` |

## Emitted spans

Two spans are emitted per PipelineRun when it completes:

- `waitDuration`: `pr.CreationTimestamp` to `pr.Status.StartTime`
- `executeDuration`: `pr.Status.StartTime` to `pr.Status.CompletionTime`

The build-pipeline and integration-pipeline controllers each emit these spans for their respective PipelineRun types. The `delivery.tekton.dev/timingEmitted` annotation guards against re-emission on subsequent reconciles.

## Trace-context propagation

Span parenting follows the W3C Trace Context value carried in the `tekton.dev/pipelinerunSpanContext` annotation. The annotation is propagated across resource boundaries so a single trace covers the full delivery flow:

```
build PipelineRun (annotation set by upstream)
└── Snapshot (annotation copied from the build PipelineRun)
├── integration-test PipelineRun (annotation copied from the Snapshot)
└── Release CR (annotation copied from the Snapshot)
```

When the annotation is absent, spans are still emitted but without a parent.

## Span attributes

| Attribute | Span | Source |
|---|---|---|
| `namespace` | both | `pr.GetNamespace()` |
| `pipelinerun` | both | `pr.GetName()` |
| `delivery.tekton.dev.pipelinerun_uid` | both | `pr.GetUID()` |
| `cicd.pipeline.action.name` | both | PipelineRun label (name configurable via `TRACING_LABEL_ACTION`) |
| `delivery.tekton.dev.application` | both | PipelineRun label (name configurable via `TRACING_LABEL_APPLICATION`) |
| `delivery.tekton.dev.component` | both | PipelineRun label (name configurable via `TRACING_LABEL_COMPONENT`) |
| `cicd.pipeline.result` | execute | `Succeeded` condition mapped to the semconv `cicd.pipeline.result` enum (`success` / `failure` / `timeout` / `cancellation` / `error`) |
| `delivery.tekton.dev.result_message` | execute | PipelineRun's `Succeeded` condition message on failure. Omitted on success; truncated to 1024 bytes (UTF-8 safe). |

## Superseded Snapshot dedup

When the snapshot controller skips auto-release because a newer Snapshot for the same Application has already been released, a single `waitDuration` span is emitted on the superseded Snapshot's trace context, anchored at the Snapshot's `CreationTimestamp` and ending at the `AutoReleased` condition's `LastTransitionTime`. It carries `cicd.pipeline.result=skip` and `delivery.tekton.dev.result_message="Released in newer Snapshot"` so the trace distinguishes the deliberate dedup from a broken chain.
38 changes: 36 additions & 2 deletions e2e-tests/pkg/clients/gitlab/git.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,40 @@ import (
"gitlab.com/gitlab-org/api/client-go/v2"
)

// isGitlabTransientError reports whether err carries one of the message
// fragments treated as a transient GitLab failure.
//
// GitLab pre-receive hooks surface 5xx wrapped in a 4xx; RetryTransport's
// status-code check misses it, so the error text is inspected instead.
// A nil err is never transient. Matching is case-sensitive.
func isGitlabTransientError(err error) bool {
	if err == nil {
		return false
	}
	text := err.Error()
	for _, marker := range []string{"Internal API error", "504 Gateway Timeout"} {
		if strings.Contains(text, marker) {
			return true
		}
	}
	return false
}

// createBranchWithRetry calls Branches.CreateBranch, retrying when the error
// is classified as transient by isGitlabTransientError.
//
// At most maxAttempts calls are made. The pause before each retry starts at
// baseDelay and doubles after every failed attempt (2s, 4s, 8s, 16s). The
// function returns immediately, without retrying, when the call succeeds,
// when the response status is 409 Conflict, or when the error is not
// transient. Each retry is announced on stdout.
//
// It returns the response and error of the last call made; the error is nil
// on success.
func (gc *GitlabClient) createBranchWithRetry(projectID string, opt *gitlab.CreateBranchOptions) (*gitlab.Response, error) {
	const (
		maxAttempts = 5
		baseDelay   = 2 * time.Second
	)
	var (
		resp *gitlab.Response
		err  error
	)
	// wait holds the pause before the next retry; it doubles after every sleep.
	wait := baseDelay
	for attempt := 1; ; attempt++ {
		_, resp, err = gc.client.Branches.CreateBranch(projectID, opt)
		switch {
		case err == nil:
			return resp, nil
		case resp != nil && resp.StatusCode == http.StatusConflict:
			// Conflict is handed straight back so callers can inspect resp.
			return resp, err
		case attempt == maxAttempts, !isGitlabTransientError(err):
			// Out of attempts, or an error that a retry is not expected to fix.
			return resp, err
		}
		fmt.Printf("[gitlab-retry] CreateBranch attempt %d/%d: %v; retrying in %s\n",
			attempt, maxAttempts, err, wait)
		time.Sleep(wait)
		wait *= 2
	}
}

// CreateBranch creates a new branch in a GitLab project with the given projectID and newBranchName
func (gc *GitlabClient) CreateBranch(projectID, newBranchName, defaultBranch string) error {
// Prepare the branch creation request
Expand All @@ -19,7 +53,7 @@ func (gc *GitlabClient) CreateBranch(projectID, newBranchName, defaultBranch str
}

// Perform the branch creation
_, _, err := gc.client.Branches.CreateBranch(projectID, branchOpts)
_, err := gc.createBranchWithRetry(projectID, branchOpts)
if err != nil {
return fmt.Errorf("failed to create branch %s in project %s: %w", newBranchName, projectID, err)
}
Expand Down Expand Up @@ -77,7 +111,7 @@ func (gc *GitlabClient) CreateGitlabNewBranch(projectID, branchName, sha, baseBr
Branch: &branchName,
Ref: &sha,
}
_, resp, err := gc.client.Branches.CreateBranch(projectID, opt)
resp, err := gc.createBranchWithRetry(projectID, opt)
if err != nil {
// Check if the error is due to the branch already existing
if resp != nil && resp.StatusCode == http.StatusConflict {
Expand Down
18 changes: 8 additions & 10 deletions gitops/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,10 @@ const (
// ComponentGroup that the snapshot belongs to have been created
ParentSnapshotsCreatedCondition = "ParentSnapshotsCreated"

// SnapshotSupersededMessage is the AutoReleased condition message set when
// auto-release of an older Snapshot is skipped because a newer Snapshot for the same Application was released.
SnapshotSupersededMessage = "Released in newer Snapshot"

// SnapshotAddedToGlobalCandidateListCondition is the condition for marking if Snapshot's component was added to
// the global candidate list.
SnapshotAddedToGlobalCandidateListCondition = "AddedToGlobalCandidateList"
Expand Down Expand Up @@ -338,10 +342,8 @@ const (
ChildSnapshotAnnotation = TestLabelPrefix + "/child-snapshot"
)

var (
// SnapshotComponentLabel contains the name of the updated Snapshot component - it should match the pipeline label.
SnapshotComponentLabel = tektonconsts.ComponentNameLabel
)
// SnapshotComponentLabel contains the name of the updated Snapshot component - it should match the pipeline label.
var SnapshotComponentLabel = tektonconsts.ComponentNameLabel

const (
// maxPrefixLength is the maximum length of the prefix for the snapshot name
Expand Down Expand Up @@ -769,7 +771,6 @@ func HaveAppStudioTestsSucceeded(snapshot *applicationapiv1alpha1.Snapshot) bool

// GetTestSucceededCondition checks status of tests on the snapshot
func GetTestSucceededCondition(snapshot *applicationapiv1alpha1.Snapshot) (condition *metav1.Condition, ok bool) {

condition = meta.FindStatusCondition(snapshot.Status.Conditions, AppStudioTestSucceededCondition)
if condition == nil {
condition = meta.FindStatusCondition(snapshot.Status.Conditions, LegacyTestSucceededCondition)
Expand Down Expand Up @@ -1215,7 +1216,6 @@ func ResetSnapshotStatusConditions(ctx context.Context, adapterClient client.Cli
// ObjectMeta: metav1.ObjectMeta{
// TODO: replace 'object' with 'componentGroup' after application is deprecated. Also delete Application bool
func CopySnapshotLabelsAndAnnotations(object *metav1.ObjectMeta, snapshot *applicationapiv1alpha1.Snapshot, componentName string, source *metav1.ObjectMeta, prefixes []string, objectIsApplication bool) {

if snapshot.Labels == nil {
snapshot.Labels = map[string]string{}
}
Expand All @@ -1242,7 +1242,6 @@ func CopySnapshotLabelsAndAnnotations(object *metav1.ObjectMeta, snapshot *appli
_ = metadata.CopyLabelsByPrefix(source, &snapshot.ObjectMeta, prefix)
_ = metadata.CopyAnnotationsByPrefix(source, &snapshot.ObjectMeta, prefix)
}

}

// BuildResultAnnotationKey normalizes a PipelineRun result name into a Snapshot annotation key.
Expand Down Expand Up @@ -1373,7 +1372,7 @@ func IsScenarioApplicableToSnapshotsContext(scenario *v1beta2.IntegrationTestSce
return true
}
for _, scenarioContext := range scenario.Spec.Contexts {
scenarioContext := scenarioContext //G601
scenarioContext := scenarioContext // G601
if IsContextValidForSnapshot(scenarioContext.Name, snapshot) {
return true
}
Expand All @@ -1386,7 +1385,7 @@ func IsScenarioApplicableToSnapshotsContext(scenario *v1beta2.IntegrationTestSce
func FilterIntegrationTestScenariosWithContext(scenarios *[]v1beta2.IntegrationTestScenario, snapshot *applicationapiv1alpha1.Snapshot) *[]v1beta2.IntegrationTestScenario {
var filteredScenarioList []v1beta2.IntegrationTestScenario
for _, scenario := range *scenarios {
scenario := scenario //G601
scenario := scenario // G601
if IsScenarioApplicableToSnapshotsContext(&scenario, snapshot) {
filteredScenarioList = append(filteredScenarioList, scenario)
}
Expand Down Expand Up @@ -1417,7 +1416,6 @@ func FindMatchingSnapshotComponent(snapshot *applicationapiv1alpha1.Snapshot, co
}
}
return applicationapiv1alpha1.SnapshotComponent{}

}

// SortSnapshots sorts the snapshots according to the snapshot annotation BuildPipelineRunStartTime
Expand Down
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ require (
github.com/tektoncd/pipeline v1.7.0
github.com/tonglil/buflogr v1.1.1
gitlab.com/gitlab-org/api/client-go/v2 v2.36.0
go.opentelemetry.io/otel v1.40.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0
go.opentelemetry.io/otel/sdk v1.40.0
go.opentelemetry.io/otel/trace v1.40.0
go.uber.org/mock v0.6.0
go.uber.org/zap v1.27.1
golang.org/x/oauth2 v0.36.0
Expand Down Expand Up @@ -151,12 +155,8 @@ require (
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.64.0 // indirect
go.opentelemetry.io/otel v1.40.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 // indirect
go.opentelemetry.io/otel/metric v1.40.0 // indirect
go.opentelemetry.io/otel/sdk v1.40.0 // indirect
go.opentelemetry.io/otel/trace v1.40.0 // indirect
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.yaml.in/yaml/v2 v2.4.4 // indirect
Expand Down
23 changes: 23 additions & 0 deletions internal/controller/buildpipeline/buildpipeline_adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
h "github.com/konflux-ci/integration-service/helpers"
"github.com/konflux-ci/integration-service/loader"
intgteststat "github.com/konflux-ci/integration-service/pkg/integrationteststatus"
"github.com/konflux-ci/integration-service/pkg/tracing"
"github.com/konflux-ci/integration-service/snapshot"
"github.com/konflux-ci/integration-service/status"
"github.com/konflux-ci/integration-service/tekton"
Expand Down Expand Up @@ -158,6 +159,7 @@ func (a *Adapter) EnsureSnapshotExists() (result controller.OperationResult, err
var canRemoveFinalizer bool

defer func() {
a.emitBuildTimingSpans()
// Don't write a failure annotation for transient Chains-not-signed errors
annotationErr := err
if h.IsChainsNotSignedError(err) {
Expand Down Expand Up @@ -261,6 +263,7 @@ func (a *Adapter) EnsureSnapshotExistsApplication() (result controller.Operation
var canRemoveFinalizer bool

defer func() {
a.emitBuildTimingSpans()
// Don't write a failure annotation for transient Chains-not-signed errors
annotationErr := err
if h.IsChainsNotSignedError(err) {
Expand Down Expand Up @@ -847,6 +850,11 @@ func (a *Adapter) prepareSnapshotForPipelineRun(pipelineRun *tektonv1.PipelineRu
prefixes := []string{gitops.BuildPipelineRunPrefix, gitops.TestLabelPrefix, gitops.CustomLabelPrefix, gitops.ReleaseLabelPrefix}
gitops.CopySnapshotLabelsAndAnnotations(&application.ObjectMeta, snapshot, a.component.Name, &pipelineRun.ObjectMeta, prefixes, true)

// Propagate span context from build PipelineRun to Snapshot for distributed tracing
if tp, found := pipelineRun.Annotations[tracing.SpanContextAnnotation]; found && tp != "" {
snapshot.Annotations[tracing.SpanContextAnnotation] = tp
}

snapshot.Labels[gitops.BuildPipelineRunNameLabel] = pipelineRun.Name
if pipelineRun.Status.CompletionTime != nil {
snapshot.Labels[gitops.BuildPipelineRunFinishTimeLabel] = strconv.FormatInt(pipelineRun.Status.CompletionTime.Unix(), 10)
Expand Down Expand Up @@ -1466,3 +1474,18 @@ func (a *Adapter) IsLatestBuildPipelineRunInComponentWithPRGroupHash(buildPlr *t
a.logger.Info(fmt.Sprintf("The build pipelineRun %s/%s with pr group %s is not the latest for its component, skipped", buildPlr.Namespace, buildPlr.Name, prGroupName))
return false, nil
}

// emitBuildTimingSpans emits timing spans for the build PipelineRun if not already emitted.
//
// The span context is read from the PipelineRun's tracing.SpanContextAnnotation
// annotation and handed to tracing.EmitAndMarkTimingSpans together with a
// callback that re-fetches the PipelineRun through the adapter's loader.
// A failure is logged and otherwise ignored. When a non-nil PipelineRun is
// returned, it replaces a.pipelineRun.
func (a *Adapter) emitBuildTimingSpans() {
	// reload fetches the PipelineRun again by name and namespace.
	reload := func() (*tektonv1.PipelineRun, error) {
		return a.loader.GetPipelineRun(a.context, a.client, a.pipelineRun.Name, a.pipelineRun.Namespace)
	}

	updated, err := tracing.EmitAndMarkTimingSpans(
		a.context,
		a.pipelineRun,
		a.pipelineRun.Annotations[tracing.SpanContextAnnotation],
		"", // NOTE(review): meaning of this empty argument is not visible here — confirm against pkg/tracing
		a.client,
		reload,
	)
	switch {
	case err != nil:
		a.logger.Error(err, "Failed to emit and mark build timing spans")
	case updated != nil:
		a.pipelineRun = updated
	}
}
Loading
Loading