Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,23 @@ Git sources are cloned into a per-repo cache directory. The cache root is create
| `PROVIDER_KUBECONFIG_CACHE_DIR` | `$XDG_CACHE_HOME/provider-kubeconfig` (else `$TMPDIR/provider-kubeconfig`) | Cache root. Point at a dedicated writable volume (e.g. an `emptyDir`) to keep clones off shared `/tmp`. |
| `PROVIDER_KUBECONFIG_CACHE_MAX_ENTRIES` | `32` | Max cached repo directories retained before LRU eviction. |

## Observability

### Metrics

Custom Prometheus metrics are exposed on the manager's existing `/metrics` endpoint alongside the standard controller-runtime and crossplane metrics:

| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `provider_kubeconfig_git_fetch_duration_seconds` | histogram | `repo`, `branch`, `operation`, `result` | Git clone/pull/revision latency. `operation` ∈ `clone\|pull\|revision`. |
| `provider_kubeconfig_git_cache_total` | counter | `repo`, `branch`, `operation` | Git source operations, distinguishing fresh clone from cache-hit pull. |
| `provider_kubeconfig_sops_decrypt_duration_seconds` | histogram | `format`, `result` | SOPS decrypt latency. |
| `provider_kubeconfig_reconcile_errors_total` | counter | `stage` | Reconcile errors by stage (`git\|decrypt\|secret\|downstream`). |

### Tracing

The reconcile hot path emits OpenTelemetry spans (`git.EnsureCloned`, `git.ReadFile`, `sops.Decrypt`) so traces show which phase dominates. Tracing is **off by default** and activates when a standard OTLP endpoint is configured — e.g. set `OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317`. Standard `OTEL_*` env vars (headers, TLS, sampling) are honored.

## Building

### Prerequisites
Expand Down
11 changes: 11 additions & 0 deletions cmd/provider/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ import (
"github.com/stuttgart-things/provider-kubeconfig/apis"
kubeconfig "github.com/stuttgart-things/provider-kubeconfig/internal/controller"
rbacpkg "github.com/stuttgart-things/provider-kubeconfig/internal/rbac"
"github.com/stuttgart-things/provider-kubeconfig/internal/tracing"
"github.com/stuttgart-things/provider-kubeconfig/internal/version"
)

Expand Down Expand Up @@ -85,6 +86,16 @@ func main() {
ctrl.SetLogger(zap.New(zap.WriteTo(io.Discard)))
}

// Optional OpenTelemetry tracing. No-op unless an OTEL endpoint is set; a
// bad/unreachable endpoint must not stop the provider, so failures only log.
shutdownTracing, tracingOn, err := tracing.Setup(context.Background(), version.Version)
if err != nil {
log.Info("OpenTelemetry tracing setup failed; continuing without tracing", "error", err)
} else if tracingOn {
log.Info("OpenTelemetry tracing enabled")
defer func() { _ = shutdownTracing(context.Background()) }()
}

cfg, err := ctrl.GetConfig()
kingpin.FatalIfError(err, "Cannot get API server rest config")

Expand Down
13 changes: 9 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ require (
github.com/google/go-cmp v0.7.0
github.com/hashicorp/vault/api v1.23.0
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.23.2
go.opentelemetry.io/otel v1.43.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0
go.opentelemetry.io/otel/sdk v1.43.0
go.opentelemetry.io/otel/trace v1.43.0
google.golang.org/grpc v1.81.1
k8s.io/api v0.36.2
k8s.io/apiextensions-apiserver v0.36.2
Expand Down Expand Up @@ -76,6 +81,7 @@ require (
github.com/blang/semver v3.5.1+incompatible // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cloudflare/circl v1.6.3 // indirect
github.com/cncf/xds/go v0.0.0-20260202195803-dba9d589def2 // indirect
Expand Down Expand Up @@ -127,6 +133,7 @@ require (
github.com/googleapis/enterprise-certificate-proxy v0.3.15 // indirect
github.com/googleapis/gax-go/v2 v2.22.0 // indirect
github.com/goware/prefixer v0.0.0-20160118172347-395022866408 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-cleanhttp v0.5.2 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
Expand Down Expand Up @@ -156,7 +163,6 @@ require (
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.23.2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.5 // indirect
github.com/prometheus/procfs v0.20.1 // indirect
Expand All @@ -179,11 +185,10 @@ require (
go.opentelemetry.io/contrib/detectors/gcp v1.43.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 // indirect
go.opentelemetry.io/otel v1.43.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 // indirect
go.opentelemetry.io/otel/metric v1.43.0 // indirect
go.opentelemetry.io/otel/sdk v1.43.0 // indirect
go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect
go.opentelemetry.io/otel/trace v1.43.0 // indirect
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.1 // indirect
go.yaml.in/yaml/v2 v2.4.4 // indirect
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,6 @@ github.com/crossplane/crossplane-runtime/v2 v2.3.2 h1:gjfJmr0PTf3/Ccg4iasogXKIRj
github.com/crossplane/crossplane-runtime/v2 v2.3.2/go.mod h1:POGt8DSTcxQJlTww+3yGeeXuEdLyjZ61vZ3ap5tTxhE=
github.com/crossplane/crossplane-tools v0.0.0-20251017183449-dd4517244339 h1:MPbMxSlY+82UsjrLUAGyXlh/iX1tL5WNj8W9SOaq/nk=
github.com/crossplane/crossplane-tools v0.0.0-20251017183449-dd4517244339/go.mod h1:8etxwmP4cZwJDwen4+PQlnc1tggltAhEfyyigmdHulQ=
github.com/crossplane/crossplane/apis/v2 v2.0.0-20260424160951-8f231230ebb6 h1:9ki6AJQgBJIcLNjK+scUZp2ZDenuAo18d0JSNOlkY2Y=
github.com/crossplane/crossplane/apis/v2 v2.0.0-20260424160951-8f231230ebb6/go.mod h1:h7KE74Z4TFs1L/FFv3RdsiG9Uax7L56oHpcggSZnONg=
github.com/crossplane/crossplane/apis/v2 v2.3.2 h1:Drs3xz59qT3zFfaszxQWqr51a0leAx20DBL4TqMnqi0=
github.com/crossplane/crossplane/apis/v2 v2.3.2/go.mod h1:o+D0ktZQKJCFcpfzMKA4n53aTo2sFqqDsADBNIRuIyE=
github.com/cyphar/filepath-securejoin v0.6.1 h1:5CeZ1jPXEiYt3+Z6zqprSAgSWiggmpVyciv8syjIpVE=
Expand Down
85 changes: 73 additions & 12 deletions internal/controller/remotecluster/remotecluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,17 @@ import (
"crypto/sha256"
"encoding/json"
"fmt"
"time"

"github.com/crossplane/crossplane-runtime/v2/pkg/feature"
xpv2 "github.com/crossplane/crossplane/apis/v2/core/v2"

vaultapi "github.com/hashicorp/vault/api"
"github.com/pkg/errors"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
corev1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -50,9 +55,13 @@ import (
clusterpkg "github.com/stuttgart-things/provider-kubeconfig/internal/cluster"
decryptpkg "github.com/stuttgart-things/provider-kubeconfig/internal/decrypt"
gitpkg "github.com/stuttgart-things/provider-kubeconfig/internal/git"
providermetrics "github.com/stuttgart-things/provider-kubeconfig/internal/metrics"
vaultpkg "github.com/stuttgart-things/provider-kubeconfig/internal/vault"
)

// tracerName identifies the provider's spans in OpenTelemetry traces.
const tracerName = "github.com/stuttgart-things/provider-kubeconfig"

const (
errNotRemoteCluster = "managed resource is not a RemoteCluster custom resource"
errTrackPCUsage = "cannot track ProviderConfig usage"
Expand Down Expand Up @@ -416,35 +425,82 @@ func (c *external) readFromVault(ctx context.Context, path, key string) ([]byte,

func (c *external) cloneAndReadFile(ctx context.Context, filePath string) ([]byte, error) {
log := ctrl.LoggerFrom(ctx)

repo := gitpkg.NewRepo(c.providerSpec.Git.URL, c.providerSpec.Git.Branch, c.providerSpec.Git.Revision, c.gitToken)

if _, err := repo.EnsureCloned(ctx); err != nil {
log.Info("Git clone/pull failed", "url", c.providerSpec.Git.URL, "branch", c.providerSpec.Git.Branch, "revision", c.providerSpec.Git.Revision, "error", err)
return nil, errors.Wrap(err, errCloneRepo)
}
log.V(1).Info("Git repo ready", "url", c.providerSpec.Git.URL, "branch", c.providerSpec.Git.Branch, "revision", c.providerSpec.Git.Revision)

tracer := otel.Tracer(tracerName)

repoURL := c.providerSpec.Git.URL
branch := c.providerSpec.Git.Branch
revision := c.providerSpec.Git.Revision
repo := gitpkg.NewRepo(repoURL, branch, revision, c.gitToken)

// --- git fetch (clone/pull/revision) ---
fetchCtx, span := tracer.Start(ctx, "git.EnsureCloned", trace.WithAttributes(
attribute.String("git.repo", repoURL),
attribute.String("git.branch", branch),
attribute.String("git.revision", revision),
))
start := time.Now()
_, op, err := repo.EnsureCloned(fetchCtx)
span.SetAttributes(attribute.String("git.operation", string(op)))
providermetrics.GitFetchDuration.WithLabelValues(repoURL, branch, string(op), result(err)).Observe(time.Since(start).Seconds())
providermetrics.GitCacheOps.WithLabelValues(repoURL, branch, string(op)).Inc()
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "git fetch failed")
span.End()
providermetrics.ReconcileErrors.WithLabelValues(providermetrics.StageGit).Inc()
log.Info("Git clone/pull failed", "url", repoURL, "branch", branch, "revision", revision, "error", err)
return nil, errors.Wrapf(err, "%s (url=%q branch=%q revision=%q)", errCloneRepo, repoURL, branch, revision)
}
span.End()
log.V(1).Info("Git repo ready", "url", repoURL, "branch", branch, "revision", revision, "operation", op)

// --- read file ---
_, readSpan := tracer.Start(ctx, "git.ReadFile", trace.WithAttributes(attribute.String("git.path", filePath)))
content, err := repo.ReadFile(filePath)
if err != nil {
readSpan.RecordError(err)
readSpan.SetStatus(codes.Error, "read file failed")
readSpan.End()
providermetrics.ReconcileErrors.WithLabelValues(providermetrics.StageGit).Inc()
log.Info("Failed to read file from git repo", "path", filePath, "error", err)
return nil, errors.Wrap(err, errReadFile)
return nil, errors.Wrapf(err, "%s (url=%q path=%q)", errReadFile, repoURL, filePath)
}
readSpan.End()

// Decrypt if an age key is available
// --- decrypt (if an age key is available) ---
if c.ageKey != "" {
format := decryptpkg.FormatFromPath(filePath)
_, decSpan := tracer.Start(ctx, "sops.Decrypt", trace.WithAttributes(
attribute.String("sops.format", format),
attribute.String("git.path", filePath),
))
start := time.Now()
decrypted, err := decryptpkg.SOPSDecrypt(content, filePath, c.ageKey)
providermetrics.SOPSDecryptDuration.WithLabelValues(format, result(err)).Observe(time.Since(start).Seconds())
if err != nil {
decSpan.RecordError(err)
decSpan.SetStatus(codes.Error, "decrypt failed")
decSpan.End()
providermetrics.ReconcileErrors.WithLabelValues(providermetrics.StageDecrypt).Inc()
log.Info("SOPS decryption failed", "path", filePath, "error", err)
return nil, errors.Wrap(err, errDecryptFile)
return nil, errors.Wrapf(err, "%s (url=%q path=%q)", errDecryptFile, repoURL, filePath)
}
decSpan.End()
log.V(1).Info("Decrypted kubeconfig", "path", filePath)
return decrypted, nil
}

return content, nil
}

// result maps an error to a Prometheus result label value.
func result(err error) string {
if err != nil {
return providermetrics.ResultError
}
return providermetrics.ResultSuccess
}

// buildDownstreamProviderConfig builds an unstructured downstream ProviderConfig
// for provider-kubernetes or provider-helm using the resolved providerConfigMeta.
func buildDownstreamProviderConfig(meta providerConfigMeta, pcName, secretName, secretNamespace, crName string) (*unstructured.Unstructured, error) {
Expand Down Expand Up @@ -892,12 +948,14 @@ func (c *external) Create(ctx context.Context, mg resource.Managed) (managed.Ext
}

if err := c.kube.Create(ctx, secret); err != nil {
providermetrics.ReconcileErrors.WithLabelValues(providermetrics.StageSecret).Inc()
return managed.ExternalCreation{}, errors.Wrap(err, errCreateSecret)
}
log.Info("Created kubeconfig secret", "cluster", cr.GetName(), "secret", name, "namespace", ns)

// Create downstream ProviderConfigs and ArgoCD secrets
if err := c.ensureDownstreamProviderConfigs(ctx, cr, name, ns, content); err != nil {
providermetrics.ReconcileErrors.WithLabelValues(providermetrics.StageDownstream).Inc()
log.Info("Failed to create downstream ProviderConfigs", "cluster", cr.GetName(), "error", err)
return managed.ExternalCreation{}, err
}
Expand Down Expand Up @@ -947,6 +1005,7 @@ func (c *external) Update(ctx context.Context, mg resource.Managed) (managed.Ext
// Fetch and update the existing Secret
secret := &corev1.Secret{}
if err := c.kube.Get(ctx, types.NamespacedName{Name: name, Namespace: ns}, secret); err != nil {
providermetrics.ReconcileErrors.WithLabelValues(providermetrics.StageSecret).Inc()
return managed.ExternalUpdate{}, errors.Wrap(err, errGetSecret)
}

Expand All @@ -958,11 +1017,13 @@ func (c *external) Update(ctx context.Context, mg resource.Managed) (managed.Ext
cr.Status.AtProvider.VaultSecretVersion = vaultVersion

if err := c.kube.Update(ctx, secret); err != nil {
providermetrics.ReconcileErrors.WithLabelValues(providermetrics.StageSecret).Inc()
return managed.ExternalUpdate{}, errors.Wrap(err, errUpdateSecret)
}

// Reconcile downstream ProviderConfigs and ArgoCD secrets (create missing, delete stale)
if err := c.ensureDownstreamProviderConfigs(ctx, cr, name, ns, content); err != nil {
providermetrics.ReconcileErrors.WithLabelValues(providermetrics.StageDownstream).Inc()
return managed.ExternalUpdate{}, err
}

Expand Down
6 changes: 3 additions & 3 deletions internal/decrypt/decrypt.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func SOPSDecrypt(data []byte, filePath string, ageKey string) ([]byte, error) {
}
defer func() { _ = os.Setenv("SOPS_AGE_KEY", prev) }()

format := formatFromPath(filePath)
format := FormatFromPath(filePath)
cleartext, err := decrypt.Data(data, format)
if err != nil {
return nil, errors.Wrap(err, "cannot decrypt SOPS data")
Expand All @@ -63,8 +63,8 @@ func SOPSDecrypt(data []byte, filePath string, ageKey string) ([]byte, error) {
return cleartext, nil
}

// formatFromPath returns the SOPS format string based on file extension.
func formatFromPath(path string) string {
// FormatFromPath returns the SOPS format string based on file extension.
func FormatFromPath(path string) string {
ext := strings.ToLower(filepath.Ext(path))
switch ext {
case ".json":
Expand Down
4 changes: 2 additions & 2 deletions internal/decrypt/decrypt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ func TestFormatFromPath(t *testing.T) {

for name, tc := range cases {
t.Run(name, func(t *testing.T) {
got := formatFromPath(tc.path)
got := FormatFromPath(tc.path)
if diff := cmp.Diff(tc.want, got); diff != "" {
t.Errorf("formatFromPath(%q): -want, +got:\n%s", tc.path, diff)
t.Errorf("FormatFromPath(%q): -want, +got:\n%s", tc.path, diff)
}
})
}
Expand Down
38 changes: 27 additions & 11 deletions internal/git/git.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,20 @@ func repoMutex(key string) *sync.Mutex {
return mu
}

// Operation describes which git action EnsureCloned performed, so callers can
// distinguish a fresh clone from a cache-hit pull or a pinned-revision checkout
// (e.g. for metrics) without importing any observability dependency here.
type Operation string

const (
// OpClone means the repo was (re)cloned from scratch.
OpClone Operation = "clone"
// OpPull means an existing cache was updated via pull.
OpPull Operation = "pull"
// OpRevision means a pinned revision was ensured (full clone or cache hit).
OpRevision Operation = "revision"
)

// Repo manages cloning and pulling a Git repository to a local cache directory.
type Repo struct {
url string
Expand Down Expand Up @@ -199,41 +213,43 @@ func (r *Repo) auth() *http.BasicAuth {
// revision is pinned it delegates to ensureRevision, which checks out the exact
// commit/tag instead of tracking the branch tip. On success it marks the cache
// as recently used and evicts least-recently-used entries beyond the cap.
func (r *Repo) EnsureCloned(ctx context.Context) (string, error) {
func (r *Repo) EnsureCloned(ctx context.Context) (string, Operation, error) {
mu := repoMutex(r.cacheDir)
mu.Lock()
defer mu.Unlock()

dir, err := r.ensure(ctx)
dir, op, err := r.ensure(ctx)
if err != nil {
return "", err
return "", op, err
}

touch(dir)
evictCache(filepath.Dir(r.cacheDir), r.cacheDir, maxCacheEntries())
return dir, nil
return dir, op, nil
}

// ensure performs the clone/pull (or pinned-revision checkout) and returns the
// cache directory, without the LRU bookkeeping handled by EnsureCloned.
func (r *Repo) ensure(ctx context.Context) (string, error) {
// cache directory and which Operation it performed, without the LRU bookkeeping
// handled by EnsureCloned.
func (r *Repo) ensure(ctx context.Context) (string, Operation, error) {
if r.revision != "" {
return r.ensureRevision(ctx)
dir, err := r.ensureRevision(ctx)
return dir, OpRevision, err
}

refName := plumbing.NewBranchReferenceName(r.branch)

if _, err := os.Stat(filepath.Join(r.cacheDir, ".git")); err == nil {
dir, pullErr := r.pull(ctx, refName)
if pullErr == nil {
return dir, nil
return dir, OpPull, nil
}
// Pull failed (e.g. stale shallow clone) — remove cache and re-clone
_ = os.RemoveAll(r.cacheDir)
}

if err := ensureCacheRoot(filepath.Dir(r.cacheDir)); err != nil {
return "", err
return "", OpClone, err
}

opts := &git.CloneOptions{
Expand All @@ -247,10 +263,10 @@ func (r *Repo) ensure(ctx context.Context) (string, error) {
if _, err := git.PlainCloneContext(ctx, r.cacheDir, false, opts); err != nil {
// Clean up partial clone on failure
_ = os.RemoveAll(r.cacheDir)
return "", errors.Wrap(err, "cannot clone git repository")
return "", OpClone, errors.Wrap(err, "cannot clone git repository")
}

return r.cacheDir, nil
return r.cacheDir, OpClone, nil
}

// ensureRevision clones the full repository (no shallow/single-branch limits, so
Expand Down
Loading