From 250ecac3a417e4a2c48568aba369e076160b681b Mon Sep 17 00:00:00 2001 From: Xavi Garcia Date: Tue, 3 Mar 2026 15:46:43 +0100 Subject: [PATCH 1/3] Adds fleet-event-monitor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `fleet-event-monitor` is a separate binary, Docker image, and Helm chart containing **read-only monitoring controllers**. These controllers: - Mirror the exact watch configuration of Fleet's production controllers (same `SetupWithManager` logic) - Log detailed diffs (spec, status, annotations, labels) when controllers are triggered - Perform **no reconciliation or write operations** - Are enabled/disabled per controller via environment variables or Helm values - Use read-only RBAC permissions (only `get`, `list`, `watch` — except leases for leader election) **Problem solved**: Understanding why Fleet controllers are triggered repeatedly or what specific changes cause reconciliation loops, without impacting production workloads. --- ``` fleet/ ├── cmd/fleeteventmonitor/main.go # Entry point ├── internal/cmd/monitor/ │ ├── root.go # CLI / cobra setup, env var parsing │ ├── operator.go # controller-runtime manager, reconciler wiring │ └── reconciler/ │ ├── monitor.go # Shared logging utilities (logSpecChange, logStatusChange, etc.) 
│ ├── stats.go # EventType constants, StatsTracker, Summary (JSON) │ ├── filter.go # EventTypeFilters struct, ResourceFilter, ShouldLog/ShouldLogTrigger logic │ ├── cache.go # ObjectCache (thread-safe, namespace/name keyed) │ ├── predicate.go # TypedResourceVersionUnchangedPredicate │ ├── bundle_monitor.go # Bundle controller (watches BD + Cluster) │ ├── bundle_query.go # BundleQuery interface + impl (cluster→bundle mapping) │ ├── cluster_monitor.go # Cluster controller (watches BD) │ ├── bundledeployment_monitor.go # BundleDeployment controller │ ├── gitrepo_monitor.go # GitRepo controller (watches Job) │ └── helmop_monitor.go # HelmOp controller ├── package/Dockerfile.event-monitor # Multi-arch Docker image (BCI 15.7, non-root) ├── charts/fleet-event-monitor/ │ ├── Chart.yaml │ ├── values.yaml │ └── templates/ │ ├── _helpers.tpl │ ├── deployment.yaml │ └── rbac.yaml └── .goreleaser.yaml # Build/release config (fleet-event-monitor added) ``` **Original production controllers (for watch pattern reference)**: - `internal/cmd/controller/reconciler/bundle_controller.go` - `internal/cmd/controller/reconciler/cluster_controller.go` - `internal/cmd/controller/reconciler/bundledeployment_controller.go` - `internal/cmd/controller/gitops/reconciler/gitjob_controller.go` - `internal/cmd/controller/helmops/reconciler/helmapp_controller.go` --- Each monitor controller: 1. Copies `SetupWithManager()` from the original controller (identical watches, predicates, event filters) 2. Replaces `Reconcile()` with logging-only logic 3. Uses an in-memory `ObjectCache` to detect changes between events 1. Reconcile triggered by watch event 2. `Get()` current object from Kubernetes API 3. Look up previous version from `ObjectCache` 4. If first time → log "create", cache object, return 5. If seen before → compare and log: spec diff, status diff, annotation/label/resourceVersion changes 6. 
Update cache with new version | Controller | Primary Watch | Secondary Watches | |---|---|---| | Bundle | Bundle | BundleDeployment (status changes), Cluster (all changes) | | Cluster | Cluster | BundleDeployment (spec/status changes) | | BundleDeployment | BundleDeployment | — | | GitRepo | GitRepo | Job (status changes) | | HelmOp | HelmOp | — | --- Each controller independently operates in one of two modes, controlled by a per-controller `detailed` flag: | Mode | `detailed` value | Behavior | |---|---|---| | **Summary** (default) | `false` | Counts events; prints periodic JSON summaries. No per-event log lines. | | **Detailed** | `true` | Emits a structured log line for every event with diffs included. | The summary printer **always runs** regardless of mode, so you always get aggregate statistics. Setting `detailed=true` adds verbose per-event logs on top. **Default**: all controllers default to `false` (summary only). | Environment Variable | Helm Value | Default | |---|---|---| | `FLEET_EVENT_MONITOR_BUNDLE_DETAILED` | `logging.bundle.detailed` | `false` | | `FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_DETAILED` | `logging.bundleDeployment.detailed` | `false` | | `FLEET_EVENT_MONITOR_CLUSTER_DETAILED` | `logging.cluster.detailed` | `false` | | `FLEET_EVENT_MONITOR_GITREPO_DETAILED` | `logging.gitRepo.detailed` | `false` | | `FLEET_EVENT_MONITOR_HELMOP_DETAILED` | `logging.helmOp.detailed` | `false` | > **Note**: The wrangler command framework does not parse boolean env vars automatically. They are manually parsed in `root.go` using `strconv.ParseBool()`. Valid values: `true`/`false`, `1`/`0`, `True`/`False`, `TRUE`/`FALSE`. 
| Environment Variable | Helm Value | Default | Description | |---|---|---|---| | `FLEET_EVENT_MONITOR_SUMMARY_INTERVAL` | `logging.summary.interval` | `"30s"` | How often to print the JSON summary | | `FLEET_EVENT_MONITOR_SUMMARY_RESET` | `logging.summary.resetOnPrint` | `false` | Reset counters after each print (false = cumulative) | | Event Type | Env var suffix / Helm key | Description | |---|---|---| | `generation-change` | `GENERATION_CHANGE` / `generationChange` | Spec modifications (generation bump) | | `status-change` | `STATUS_CHANGE` / `statusChange` | Status field updates | | `annotation-change` | `ANNOTATION_CHANGE` / `annotationChange` | Annotation modifications | | `label-change` | `LABEL_CHANGE` / `labelChange` | Label modifications | | `resourceversion-change` | `RESVER_CHANGE` / `resourceVersionChange` | Cache sync / metadata updates (finalizers, ownerRefs, managedFields) | | `triggered-by` | `TRIGGERED_BY` / `triggeredBy` | Trigger source breakdown by resource type | | `deletion` | `DELETION` / `deletion` | Resource being deleted | | `not-found` | `NOT_FOUND` / `notFound` | Resource not found (likely deleted) | | `create` | `CREATE` / `create` | First observation of resource | ```json { "timestamp": "2026-02-09T10:00:30Z", "interval_seconds": 30, "summary": { "Bundle": { "fleet-local/test-bundle": { "generation-change": 5, "status-change": 20, "triggered-by": { "BundleDeployment": 12, "Cluster": 3 }, "total_events": 41 } } }, "totals": { "total_resources_monitored": 3, "total_events": 63 } } ``` ```bash kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | \ grep "Fleet Monitor Summary" | tail -1 | \ jq -r '.summary.Bundle | to_entries[] | select(.value.total_events > 50) | "\(.key): \(.value.total_events) events"' kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | \ grep "Fleet Monitor Summary" | tail -1 | \ jq '.summary.Bundle["fleet-local/test-bundle"]["triggered-by"]' kubectl logs -n cattle-fleet-system 
deploy/fleet-event-monitor | \ grep "Fleet Monitor Summary" | tail -1 | \ jq -r '.summary | to_entries[] | .key as $t | .value | to_entries[] | select(.value["status-change"] > 0) | "\($t)/\(.key): \(.value["status-change"]) status changes"' kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | grep "parsed per-controller" kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | grep "registered monitor controller" ``` --- When a controller is in detailed mode (`detailed=true`), event type filters let you restrict which event types produce a log line. **Statistics are always tracked** regardless of filters — filters only affect the verbosity of the per-event log output. **Default behavior**: if all event filter flags are `false`, **all event types are logged** (backwards compatible). To restrict output, set the specific types you want to `true`. Any `true` flag activates selective filtering. `EventTypeFilters.IsEmpty()` returns true when all fields are false → `ShouldLog()` returns true for every event type. Once any field is set to `true`, only enabled types pass through. The env var pattern is: - Bundle: `FLEET_EVENT_MONITOR_BUNDLE_EVENT_<TYPE>` - BundleDeployment: `FLEET_EVENT_MONITOR_BD_EVENT_<TYPE>` - Cluster: `FLEET_EVENT_MONITOR_CLUSTER_EVENT_<TYPE>` - GitRepo: `FLEET_EVENT_MONITOR_GITREPO_EVENT_<TYPE>` - HelmOp: `FLEET_EVENT_MONITOR_HELMOP_EVENT_<TYPE>` Where `<TYPE>` is one of: `GENERATION_CHANGE`, `STATUS_CHANGE`, `ANNOTATION_CHANGE`, `LABEL_CHANGE`, `RESVER_CHANGE`, `DELETION`, `NOT_FOUND`, `CREATE`, `TRIGGERED_BY`.
```yaml logging: bundle: detailed: true # Must be true for event filters to have any effect eventFilters: generationChange: false # Set true to see spec diffs statusChange: false # Set true to see status diffs annotationChange: false labelChange: false resourceVersionChange: false # Set true to see cache-sync/metadata events deletion: false notFound: false create: false triggeredBy: false # Set true to see which resource triggered reconciliation ``` The same structure applies for `bundleDeployment`, `cluster`, `gitRepo`, and `helmOp`. **Example 1: Only watch generation changes (spec diffs) for Bundle** ```bash helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ --set logging.bundle.detailed=true \ --set logging.bundle.eventFilters.generationChange=true ``` **Example 2: Focus on reconciliation trigger sources only** ```bash helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ --set logging.bundle.detailed=true \ --set logging.bundle.eventFilters.triggeredBy=true ``` **Example 3: See everything for Bundle (all filters false = log all)** ```bash helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ --set logging.bundle.detailed=true ``` **Example 4: Via environment variables** ```bash FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true \ FLEET_EVENT_MONITOR_BUNDLE_EVENT_GENERATION_CHANGE=true \ FLEET_EVENT_MONITOR_BUNDLE_EVENT_TRIGGERED_BY=true \ ./fleeteventmonitor --kubeconfig ~/.kube/config ``` **Example 5: Debug only cache-sync/metadata noise** ```bash helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ --set logging.cluster.detailed=true \ --set logging.cluster.eventFilters.resourceVersionChange=true ``` --- Resource filters allow you to restrict monitoring to a specific subset of resources by namespace and/or name. This is useful in large deployments (100+ bundles) where you only care about specific resources and want to reduce log volume. 
**Filters apply to both detailed logs AND statistics** — filtered-out resources do not appear in the JSON summary either. At the top of each controller's `Reconcile()`, the resource namespace and name are tested against the compiled regex patterns. Resources that do not match are skipped entirely — no logs, no statistics. - Both patterns are **regular expressions** (Go `regexp` syntax) - An **empty pattern matches all** values for that field (backwards compatible) - Patterns are compiled at startup; an **invalid regex causes the binary to exit** with a clear error message - Namespace and name patterns are ANDed — a resource must match both to be monitored - Filters are orthogonal to event type filtering — both can be combined | Controller | Namespace Pattern | Name Pattern | |---|---|---| | Bundle | `FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME` | | BundleDeployment | `FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_RESOURCE_FILTER_NAME` | | Cluster | `FLEET_EVENT_MONITOR_CLUSTER_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_CLUSTER_RESOURCE_FILTER_NAME` | | GitRepo | `FLEET_EVENT_MONITOR_GITREPO_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_GITREPO_RESOURCE_FILTER_NAME` | | HelmOp | `FLEET_EVENT_MONITOR_HELMOP_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_HELMOP_RESOURCE_FILTER_NAME` | ```yaml logging: bundle: resourceFilter: namespace: "" # Regular expression for namespace matching (e.g., "^fleet-local$") name: "" # Regular expression for name matching (e.g., "^test-.*") ``` The same structure applies for `bundleDeployment`, `cluster`, `gitRepo`, and `helmOp`. 
**Example 1: Monitor only a specific bundle** ```bash helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ --set logging.bundle.detailed=true \ --set "logging.bundle.resourceFilter.namespace=^fleet-local$" \ --set "logging.bundle.resourceFilter.name=^my-app$" ``` **Example 2: Monitor all bundles in a namespace** ```bash helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ --set logging.bundle.detailed=true \ --set "logging.bundle.resourceFilter.namespace=^fleet-local$" ``` **Example 3: Monitor bundles matching a name prefix** ```bash helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ --set logging.bundle.detailed=true \ --set "logging.bundle.resourceFilter.name=^payment-.*" ``` **Example 4: Via environment variables** ```bash FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true \ FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE="^fleet-local$" \ FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME="^my-app$" \ ./fleeteventmonitor --kubeconfig ~/.kube/config ``` **Example 5: Combine resource filter with event type filter** ```bash FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true \ FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE="^fleet-local$" \ FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME="^my-app$" \ FLEET_EVENT_MONITOR_BUNDLE_EVENT_STATUS_CHANGE=true \ ./fleeteventmonitor --kubeconfig ~/.kube/config ``` --- The Bundle monitor's Cluster watch handler queries which bundles are affected by a cluster change, logging the correct bundle name and namespace in trigger events. `internal/cmd/monitor/reconciler/bundle_query.go` — adapted from `internal/cmd/controller/target/`: ```go type BundleQuery interface { BundlesForCluster(context.Context, *fleet.Cluster) ([]*fleet.Bundle, []*fleet.Bundle, error) } ``` Supports: basic targeting, label-based cluster matching, ClusterGroups, BundleNamespaceMapping (cross-namespace), Fleet agent bundles, deduplicated results. 
**Without the query**: `Bundle reconciliation triggered Bundle= Namespace= Name= TriggeredBy=Cluster:my-cluster:fleet-default` **With the query**: `Bundle reconciliation triggered Bundle=fleet-default/my-app Namespace=fleet-default Name=my-app TriggeredBy=Cluster:my-cluster:fleet-default` --- Full `values.yaml` structure as shipped: ```yaml image: repository: rancher/fleet-event-monitor tag: dev imagePullPolicy: IfNotPresent namespace: cattle-fleet-system controllers: bundle: false bundledeployment: false cluster: false gitrepo: false helmop: false workers: bundle: 5 bundledeployment: 5 cluster: 5 gitrepo: 5 helmop: 5 logFormat: json logLevel: info debug: false debugLevel: 0 shardID: "" nodeSelector: {} tolerations: [] priorityClassName: "" leaderElection: enabled: true leaseDuration: 30s retryPeriod: 10s renewDeadline: 25s resources: limits: cpu: 500m memory: 256Mi requests: cpu: 100m memory: 128Mi securityContext: runAsNonRoot: true runAsUser: 1000 runAsGroup: 1000 fsGroup: 1000 extraEnv: [] logging: bundle: detailed: false resourceFilter: namespace: "" name: "" eventFilters: generationChange: false statusChange: false annotationChange: false labelChange: false resourceVersionChange: false deletion: false notFound: false create: false triggeredBy: false bundleDeployment: detailed: false resourceFilter: namespace: "" name: "" eventFilters: generationChange: false statusChange: false annotationChange: false labelChange: false resourceVersionChange: true deletion: false notFound: false create: false triggeredBy: false cluster: detailed: false resourceFilter: namespace: "" name: "" eventFilters: generationChange: false statusChange: false annotationChange: false labelChange: false resourceVersionChange: false deletion: false notFound: false create: false triggeredBy: false gitRepo: detailed: false resourceFilter: namespace: "" name: "" eventFilters: # all false helmOp: detailed: false resourceFilter: namespace: "" name: "" eventFilters: # all false summary: interval: 
"30s" resetOnPrint: false ``` --- ```bash go build -o bin/fleeteventmonitor ./cmd/fleeteventmonitor ``` Set environment variables before running. At minimum, enable at least one controller: ```bash export ENABLE_BUNDLE_EVENT_MONITOR=true export NAMESPACE=cattle-fleet-system ./bin/fleeteventmonitor --kubeconfig ~/.kube/config ``` For detailed logging with filters: ```bash export ENABLE_BUNDLE_EVENT_MONITOR=true export NAMESPACE=cattle-fleet-system export FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true export FLEET_EVENT_MONITOR_BUNDLE_EVENT_STATUS_CHANGE=true export FLEET_EVENT_MONITOR_BUNDLE_EVENT_TRIGGERED_BY=true ./bin/fleeteventmonitor --kubeconfig ~/.kube/config ``` To narrow down to a specific resource: ```bash export ENABLE_BUNDLE_EVENT_MONITOR=true export NAMESPACE=cattle-fleet-system export FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true export FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE="^fleet-local$" export FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME="^my-app$" ./bin/fleeteventmonitor --kubeconfig ~/.kube/config ``` ```bash helm install fleet-event-monitor ./charts/fleet-event-monitor \ --namespace cattle-fleet-system \ --set controllers.bundle=true \ --set controllers.bundledeployment=true \ --set controllers.cluster=true \ --set controllers.gitrepo=true \ --set controllers.helmop=true helm install fleet-event-monitor ./charts/fleet-event-monitor \ --namespace cattle-fleet-system \ --set controllers.bundle=true \ --set logging.bundle.detailed=true helm install fleet-event-monitor ./charts/fleet-event-monitor \ --namespace cattle-fleet-system \ --set controllers.bundle=true \ --set logging.bundle.detailed=true \ --set logging.bundle.eventFilters.generationChange=true \ --set logging.bundle.eventFilters.triggeredBy=true ``` ```bash helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ --reuse-values \ --set logging.cluster.detailed=true \ --set logging.cluster.eventFilters.statusChange=true ``` ```bash helm install fleet-event-monitor-shard0 
./charts/fleet-event-monitor --set shardID=shard0 helm install fleet-event-monitor-shard1 ./charts/fleet-event-monitor --set shardID=shard1 ``` --- ClusterRole: `get`, `list`, `watch` on Fleet resources, core resources, RBAC resources, Jobs, Deployments. Role (namespaced): `get`, `list`, `watch`, `create`, `update`, `patch`, `delete` on `coordination.k8s.io/leases` (leader election only). No write access to any other Fleet or Kubernetes resources. --- | Limitation | Workaround | |---|---| | Controller-runtime doesn't expose which watch triggered a reconciliation | Log at fan-out mapping functions (Cluster→Bundle handler, BD→Bundle handler) | | `TypedResourceVersionUnchangedPredicate` causes cache-sync noise | In detailed mode, enable only the event filters you need and leave `eventFilters.resourceVersionChange=false` to suppress it (if every filter is `false`, all event types are still logged) | --- Several scripts in `dev/` help parse and visualize monitor output. They all read from stdin, a pipe, or a file argument and require `jq`. Parses all `Fleet Monitor Summary` lines from a log stream and renders the last (or cumulative) summary as a human-readable table. Also computes the time range covered if multiple summaries are present.
```bash kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | ./dev/format-monitor-summary.sh ./dev/format-monitor-summary.sh logs.json ``` Output example: ``` ================================================================================ FLEET MONITOR SUMMARY ================================================================================ Timestamp: 2026-02-09T10:00:30Z Interval: 30s Total Resources: 3 Total Events: 63 ================================================================================ ▼ Bundle ------------------------------------------------------------------------------- RESOURCE CREATE DELETE N-FOUND STATUS GEN-CHG ANNOT LABEL RESVER EVENTS ---------------------- ------ -------- ------- -------- ------- ----- ----- ------ ------ fleet-local/my-app 1 0 0 20 5 0 0 0 41 └─ triggered-by: BundleDeployment = 12 └─ triggered-by: Cluster = 3 ================================================================================ Time range: ... ================================================================================ ``` Filters `status-change` events from detailed log output and renders each diff with colour-coded `+`/`-` lines. Requires the controller to be running in detailed mode with `statusChange` enabled: ```bash export FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true export FLEET_EVENT_MONITOR_BUNDLE_EVENT_STATUS_CHANGE=true ``` Usage: ```bash kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | ./dev/parse-status-log.sh ``` Filters `resourceversion-change` events and renders each event with version numbers, change reason, metadata change list, and colour-coded diff output. Useful for identifying which SSA managers or finalizer changes are causing metadata-only reconciliation loops. 
Requires the controller to be running in detailed mode with `resourceVersionChange` enabled: ```bash export FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true export FLEET_EVENT_MONITOR_BUNDLE_EVENT_RESVER_CHANGE=true ``` Usage: ```bash kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | ./dev/parse-resourceversion-log.sh ``` The output includes a `Changed:` line listing which metadata fields changed (`finalizers`, `ownerReferences`, `managedFields`) and, for `managedFields`, a manager-level summary (added/removed/changed SSA managers) followed by a field-level diff of their `FieldsV1` entries. Signed-off-by: Xavi Garcia --- EVENT-MONITOR.md | 682 ++++++++++++++++++ charts/fleet-event-monitor/Chart.yaml | 9 + .../templates/_helpers.tpl | 34 + .../templates/deployment.yaml | 243 +++++++ .../fleet-event-monitor/templates/rbac.yaml | 131 ++++ charts/fleet-event-monitor/values.yaml | 173 +++++ cmd/fleeteventmonitor/main.go | 19 + dev/build-event-monitor | 23 + dev/format-monitor-summary.sh | 147 ++++ dev/parse-resourceversion-log.sh | 70 ++ dev/parse-status-log.sh | 52 ++ internal/cmd/monitor/operator.go | 380 ++++++++++ .../cmd/monitor/reconciler/bundle_monitor.go | 261 +++++++ .../monitor/reconciler/bundle_monitor_test.go | 339 +++++++++ .../cmd/monitor/reconciler/bundle_query.go | 257 +++++++ .../reconciler/bundledeployment_monitor.go | 109 +++ internal/cmd/monitor/reconciler/cache.go | 45 ++ internal/cmd/monitor/reconciler/cache_test.go | 117 +++ .../cmd/monitor/reconciler/cluster_monitor.go | 175 +++++ internal/cmd/monitor/reconciler/filter.go | 130 ++++ .../cmd/monitor/reconciler/filter_test.go | 285 ++++++++ .../cmd/monitor/reconciler/gitrepo_monitor.go | 251 +++++++ .../cmd/monitor/reconciler/helmop_monitor.go | 116 +++ internal/cmd/monitor/reconciler/monitor.go | 328 +++++++++ internal/cmd/monitor/reconciler/predicate.go | 260 +++++++ .../cmd/monitor/reconciler/predicate_test.go | 348 +++++++++ internal/cmd/monitor/reconciler/stats.go | 222 ++++++ 
internal/cmd/monitor/reconciler/stats_test.go | 252 +++++++ internal/cmd/monitor/root.go | 376 ++++++++++ package/Dockerfile.event-monitor | 29 + 30 files changed, 5863 insertions(+) create mode 100644 EVENT-MONITOR.md create mode 100644 charts/fleet-event-monitor/Chart.yaml create mode 100644 charts/fleet-event-monitor/templates/_helpers.tpl create mode 100644 charts/fleet-event-monitor/templates/deployment.yaml create mode 100644 charts/fleet-event-monitor/templates/rbac.yaml create mode 100644 charts/fleet-event-monitor/values.yaml create mode 100644 cmd/fleeteventmonitor/main.go create mode 100755 dev/build-event-monitor create mode 100755 dev/format-monitor-summary.sh create mode 100755 dev/parse-resourceversion-log.sh create mode 100755 dev/parse-status-log.sh create mode 100644 internal/cmd/monitor/operator.go create mode 100644 internal/cmd/monitor/reconciler/bundle_monitor.go create mode 100644 internal/cmd/monitor/reconciler/bundle_monitor_test.go create mode 100644 internal/cmd/monitor/reconciler/bundle_query.go create mode 100644 internal/cmd/monitor/reconciler/bundledeployment_monitor.go create mode 100644 internal/cmd/monitor/reconciler/cache.go create mode 100644 internal/cmd/monitor/reconciler/cache_test.go create mode 100644 internal/cmd/monitor/reconciler/cluster_monitor.go create mode 100644 internal/cmd/monitor/reconciler/filter.go create mode 100644 internal/cmd/monitor/reconciler/filter_test.go create mode 100644 internal/cmd/monitor/reconciler/gitrepo_monitor.go create mode 100644 internal/cmd/monitor/reconciler/helmop_monitor.go create mode 100644 internal/cmd/monitor/reconciler/monitor.go create mode 100644 internal/cmd/monitor/reconciler/predicate.go create mode 100644 internal/cmd/monitor/reconciler/predicate_test.go create mode 100644 internal/cmd/monitor/reconciler/stats.go create mode 100644 internal/cmd/monitor/reconciler/stats_test.go create mode 100644 internal/cmd/monitor/root.go create mode 100644 package/Dockerfile.event-monitor 
diff --git a/EVENT-MONITOR.md b/EVENT-MONITOR.md new file mode 100644 index 0000000000..d03ed1018a --- /dev/null +++ b/EVENT-MONITOR.md @@ -0,0 +1,682 @@ +# Fleet Event Monitor + +`fleet-event-monitor` is a separate binary, Docker image, and Helm chart containing **read-only monitoring controllers**. These controllers: + +- Mirror the exact watch configuration of Fleet's production controllers (same `SetupWithManager` logic) +- Log detailed diffs (spec, status, annotations, labels) when controllers are triggered +- Perform **no reconciliation or write operations** +- Are enabled/disabled per controller via environment variables or Helm values +- Use read-only RBAC permissions (only `get`, `list`, `watch` — except leases for leader election) + +**Problem solved**: Understanding why Fleet controllers are triggered repeatedly or what specific changes cause reconciliation loops, without impacting production workloads. + +--- + +## Codebase Layout + +``` +fleet/ +├── cmd/fleeteventmonitor/main.go # Entry point +├── internal/cmd/monitor/ +│ ├── root.go # CLI / cobra setup, env var parsing +│ ├── operator.go # controller-runtime manager, reconciler wiring +│ └── reconciler/ +│ ├── monitor.go # Shared logging utilities (logSpecChange, logStatusChange, etc.) 
+│ ├── stats.go # EventType constants, StatsTracker, Summary (JSON) +│ ├── filter.go # EventTypeFilters struct, ResourceFilter, ShouldLog/ShouldLogTrigger logic +│ ├── cache.go # ObjectCache (thread-safe, namespace/name keyed) +│ ├── predicate.go # TypedResourceVersionUnchangedPredicate +│ ├── bundle_monitor.go # Bundle controller (watches BD + Cluster) +│ ├── bundle_query.go # BundleQuery interface + impl (cluster→bundle mapping) +│ ├── cluster_monitor.go # Cluster controller (watches BD) +│ ├── bundledeployment_monitor.go # BundleDeployment controller +│ ├── gitrepo_monitor.go # GitRepo controller (watches Job) +│ └── helmop_monitor.go # HelmOp controller +├── package/Dockerfile.event-monitor # Multi-arch Docker image (BCI 15.7, non-root) +├── charts/fleet-event-monitor/ +│ ├── Chart.yaml +│ ├── values.yaml +│ └── templates/ +│ ├── _helpers.tpl +│ ├── deployment.yaml +│ └── rbac.yaml +└── .goreleaser.yaml # Build/release config (fleet-event-monitor added) +``` + +**Original production controllers (for watch pattern reference)**: +- `internal/cmd/controller/reconciler/bundle_controller.go` +- `internal/cmd/controller/reconciler/cluster_controller.go` +- `internal/cmd/controller/reconciler/bundledeployment_controller.go` +- `internal/cmd/controller/gitops/reconciler/gitjob_controller.go` +- `internal/cmd/controller/helmops/reconciler/helmapp_controller.go` + +--- + +## Architecture & Design Principles + +### Design Pattern: Reuse Watch Logic, Replace Reconcile Logic + +Each monitor controller: +1. Copies `SetupWithManager()` from the original controller (identical watches, predicates, event filters) +2. Replaces `Reconcile()` with logging-only logic +3. Uses an in-memory `ObjectCache` to detect changes between events + +### Change Detection Flow + +1. Reconcile triggered by watch event +2. `Get()` current object from Kubernetes API +3. Look up previous version from `ObjectCache` +4. If first time → log "create", cache object, return +5. 
If seen before → compare and log: spec diff, status diff, annotation/label/resourceVersion changes +6. Update cache with new version + +### Watch Configurations (per controller) + +| Controller | Primary Watch | Secondary Watches | +|---|---|---| +| Bundle | Bundle | BundleDeployment (status changes), Cluster (all changes) | +| Cluster | Cluster | BundleDeployment (spec/status changes) | +| BundleDeployment | BundleDeployment | — | +| GitRepo | GitRepo | Job (status changes) | +| HelmOp | HelmOp | — | + +--- + +## Logging System + +### Two Modes per Controller + +Each controller independently operates in one of two modes, controlled by a per-controller `detailed` flag: + +| Mode | `detailed` value | Behavior | +|---|---|---| +| **Summary** (default) | `false` | Counts events; prints periodic JSON summaries. No per-event log lines. | +| **Detailed** | `true` | Emits a structured log line for every event with diffs included. | + +The summary printer **always runs** regardless of mode, so you always get aggregate statistics. Setting `detailed=true` adds verbose per-event logs on top. + +### Per-Controller Detailed Logging + +**Default**: all controllers default to `false` (summary only). + +| Environment Variable | Helm Value | Default | +|---|---|---| +| `FLEET_EVENT_MONITOR_BUNDLE_DETAILED` | `logging.bundle.detailed` | `false` | +| `FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_DETAILED` | `logging.bundleDeployment.detailed` | `false` | +| `FLEET_EVENT_MONITOR_CLUSTER_DETAILED` | `logging.cluster.detailed` | `false` | +| `FLEET_EVENT_MONITOR_GITREPO_DETAILED` | `logging.gitRepo.detailed` | `false` | +| `FLEET_EVENT_MONITOR_HELMOP_DETAILED` | `logging.helmOp.detailed` | `false` | + +> **Note**: The wrangler command framework does not parse boolean env vars automatically. They are manually parsed in `root.go` using `strconv.ParseBool()`. Valid values: `true`/`false`, `1`/`0`, `True`/`False`, `TRUE`/`FALSE`. 
+ +### Summary Configuration + +| Environment Variable | Helm Value | Default | Description | +|---|---|---|---| +| `FLEET_EVENT_MONITOR_SUMMARY_INTERVAL` | `logging.summary.interval` | `"30s"` | How often to print the JSON summary | +| `FLEET_EVENT_MONITOR_SUMMARY_RESET` | `logging.summary.resetOnPrint` | `false` | Reset counters after each print (false = cumulative) | + +### Event Types Tracked + +| Event Type | Env var suffix / Helm key | Description | +|---|---|---| +| `generation-change` | `GENERATION_CHANGE` / `generationChange` | Spec modifications (generation bump) | +| `status-change` | `STATUS_CHANGE` / `statusChange` | Status field updates | +| `annotation-change` | `ANNOTATION_CHANGE` / `annotationChange` | Annotation modifications | +| `label-change` | `LABEL_CHANGE` / `labelChange` | Label modifications | +| `resourceversion-change` | `RESVER_CHANGE` / `resourceVersionChange` | Cache sync / metadata updates (finalizers, ownerRefs, managedFields) | +| `triggered-by` | `TRIGGERED_BY` / `triggeredBy` | Trigger source breakdown by resource type | +| `deletion` | `DELETION` / `deletion` | Resource being deleted | +| `not-found` | `NOT_FOUND` / `notFound` | Resource not found (likely deleted) | +| `create` | `CREATE` / `create` | First observation of resource | + +### Summary Output Format (JSON) + +```json +{ + "timestamp": "2026-02-09T10:00:30Z", + "interval_seconds": 30, + "summary": { + "Bundle": { + "fleet-local/test-bundle": { + "generation-change": 5, + "status-change": 20, + "triggered-by": { "BundleDeployment": 12, "Cluster": 3 }, + "total_events": 41 + } + } + }, + "totals": { "total_resources_monitored": 3, "total_events": 63 } +} +``` + +### Useful Log Queries + +```bash +# Find high-churn resources +kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | \ + grep "Fleet Monitor Summary" | tail -1 | \ + jq -r '.summary.Bundle | to_entries[] | select(.value.total_events > 50) | "\(.key): \(.value.total_events) events"' + +# Analyze 
trigger sources for a bundle +kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | \ + grep "Fleet Monitor Summary" | tail -1 | \ + jq '.summary.Bundle["fleet-local/test-bundle"]["triggered-by"]' + +# Get all status-change counts across all resources +kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | \ + grep "Fleet Monitor Summary" | tail -1 | \ + jq -r '.summary | to_entries[] | .key as $t | .value | to_entries[] | select(.value["status-change"] > 0) | "\($t)/\(.key): \(.value["status-change"]) status changes"' + +# Verify env vars are parsed correctly at startup +kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | grep "parsed per-controller" + +# Verify which controllers registered and in which mode +kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | grep "registered monitor controller" +``` + +--- + +## Event Type Filtering + +When a controller is in detailed mode (`detailed=true`), event type filters let you restrict which event types produce a log line. **Statistics are always tracked** regardless of filters — filters only affect the verbosity of the per-event log output. + +**Default behavior**: if all event filter flags are `false`, **all event types are logged** (backwards compatible). To restrict output, set the specific types you want to `true`. Any `true` flag activates selective filtering. + +### How It Works + +`EventTypeFilters.IsEmpty()` returns true when all fields are false → `ShouldLog()` returns true for every event type. Once any field is set to `true`, only enabled types pass through. 
+ +### Env Var Reference (per controller) + +The env var pattern is: +- Bundle: `FLEET_EVENT_MONITOR_BUNDLE_EVENT_<TYPE>` +- BundleDeployment: `FLEET_EVENT_MONITOR_BD_EVENT_<TYPE>` +- Cluster: `FLEET_EVENT_MONITOR_CLUSTER_EVENT_<TYPE>` +- GitRepo: `FLEET_EVENT_MONITOR_GITREPO_EVENT_<TYPE>` +- HelmOp: `FLEET_EVENT_MONITOR_HELMOP_EVENT_<TYPE>` + +Where `<TYPE>` is one of: `GENERATION_CHANGE`, `STATUS_CHANGE`, `ANNOTATION_CHANGE`, `LABEL_CHANGE`, `RESVER_CHANGE`, `DELETION`, `NOT_FOUND`, `CREATE`, `TRIGGERED_BY`. + +### Helm Values Reference + +```yaml +logging: + bundle: + detailed: true # Must be true for event filters to have any effect + eventFilters: + generationChange: false # Set true to see spec diffs + statusChange: false # Set true to see status diffs + annotationChange: false + labelChange: false + resourceVersionChange: false # Set true to see cache-sync/metadata events + deletion: false + notFound: false + create: false + triggeredBy: false # Set true to see which resource triggered reconciliation +``` + +The same structure applies for `bundleDeployment`, `cluster`, `gitRepo`, and `helmOp`. 
+ +### Usage Examples + +**Example 1: Only watch generation changes (spec diffs) for Bundle** +```bash +helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ + --set logging.bundle.detailed=true \ + --set logging.bundle.eventFilters.generationChange=true +# All other event types are counted in summary but not logged +``` + +**Example 2: Focus on reconciliation trigger sources only** +```bash +helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ + --set logging.bundle.detailed=true \ + --set logging.bundle.eventFilters.triggeredBy=true +# Shows which BD or Cluster changes are causing bundle reconciliations +``` + +**Example 3: See everything for Bundle (all filters false = log all)** +```bash +helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ + --set logging.bundle.detailed=true +# All eventFilters default to false → all events are logged +``` + +**Example 4: Via environment variables** +```bash +FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true \ +FLEET_EVENT_MONITOR_BUNDLE_EVENT_GENERATION_CHANGE=true \ +FLEET_EVENT_MONITOR_BUNDLE_EVENT_TRIGGERED_BY=true \ +./fleeteventmonitor --kubeconfig ~/.kube/config +``` + +**Example 5: Debug only cache-sync/metadata noise** +```bash +# See what managedFields/finalizer changes are causing resourceversion-only events +helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ + --set logging.cluster.detailed=true \ + --set logging.cluster.eventFilters.resourceVersionChange=true +``` + +--- + +## Resource Filtering + +Resource filters allow you to restrict monitoring to a specific subset of resources by namespace and/or name. This is useful in large deployments (100+ bundles) where you only care about specific resources and want to reduce log volume. + +**Filters apply to both detailed logs AND statistics** — filtered-out resources do not appear in the JSON summary either. 
+ +### How It Works + +At the top of each controller's `Reconcile()`, the resource namespace and name are tested against the compiled regex patterns. Resources that do not match are skipped entirely — no logs, no statistics. + +- Both patterns are **regular expressions** (Go `regexp` syntax) +- An **empty pattern matches all** values for that field (backwards compatible) +- Patterns are compiled at startup; an **invalid regex causes the binary to exit** with a clear error message +- Namespace and name patterns are ANDed — a resource must match both to be monitored +- Filters are orthogonal to event type filtering — both can be combined + +### Env Var Reference (per controller) + +| Controller | Namespace Pattern | Name Pattern | +|---|---|---| +| Bundle | `FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME` | +| BundleDeployment | `FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_RESOURCE_FILTER_NAME` | +| Cluster | `FLEET_EVENT_MONITOR_CLUSTER_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_CLUSTER_RESOURCE_FILTER_NAME` | +| GitRepo | `FLEET_EVENT_MONITOR_GITREPO_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_GITREPO_RESOURCE_FILTER_NAME` | +| HelmOp | `FLEET_EVENT_MONITOR_HELMOP_RESOURCE_FILTER_NAMESPACE` | `FLEET_EVENT_MONITOR_HELMOP_RESOURCE_FILTER_NAME` | + +### Helm Values Reference + +```yaml +logging: + bundle: + resourceFilter: + namespace: "" # Regular expression for namespace matching (e.g., "^fleet-local$") + name: "" # Regular expression for name matching (e.g., "^test-.*") +``` + +The same structure applies for `bundleDeployment`, `cluster`, `gitRepo`, and `helmOp`. 
+ +### Usage Examples + +**Example 1: Monitor only a specific bundle** +```bash +helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ + --set logging.bundle.detailed=true \ + --set "logging.bundle.resourceFilter.namespace=^fleet-local$" \ + --set "logging.bundle.resourceFilter.name=^my-app$" +``` + +**Example 2: Monitor all bundles in a namespace** +```bash +helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ + --set logging.bundle.detailed=true \ + --set "logging.bundle.resourceFilter.namespace=^fleet-local$" +``` + +**Example 3: Monitor bundles matching a name prefix** +```bash +helm upgrade fleet-event-monitor ./charts/fleet-event-monitor \ + --set logging.bundle.detailed=true \ + --set "logging.bundle.resourceFilter.name=^payment-.*" +``` + +**Example 4: Via environment variables** +```bash +FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true \ +FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE="^fleet-local$" \ +FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME="^my-app$" \ +./fleeteventmonitor --kubeconfig ~/.kube/config +``` + +**Example 5: Combine resource filter with event type filter** +```bash +# Monitor only status changes for a specific bundle in fleet-local +FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true \ +FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE="^fleet-local$" \ +FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME="^my-app$" \ +FLEET_EVENT_MONITOR_BUNDLE_EVENT_STATUS_CHANGE=true \ +./fleeteventmonitor --kubeconfig ~/.kube/config +``` + +--- + +## BundleQuery (Cluster → Bundle Mapping) + +The Bundle monitor's Cluster watch handler queries which bundles are affected by a cluster change, logging the correct bundle name and namespace in trigger events. 
+ +`internal/cmd/monitor/reconciler/bundle_query.go` — adapted from `internal/cmd/controller/target/`: + +```go +type BundleQuery interface { + BundlesForCluster(context.Context, *fleet.Cluster) ([]*fleet.Bundle, []*fleet.Bundle, error) +} +``` + +Supports: basic targeting, label-based cluster matching, ClusterGroups, BundleNamespaceMapping (cross-namespace), Fleet agent bundles, deduplicated results. + +**Without the query**: `Bundle reconciliation triggered Bundle= Namespace= Name= TriggeredBy=Cluster:my-cluster:fleet-default` + +**With the query**: `Bundle reconciliation triggered Bundle=fleet-default/my-app Namespace=fleet-default Name=my-app TriggeredBy=Cluster:my-cluster:fleet-default` + +--- + +## Configuration Reference (Helm Values) + +Full `values.yaml` structure as shipped: + +```yaml +image: + repository: rancher/fleet-event-monitor + tag: dev + imagePullPolicy: IfNotPresent + +namespace: cattle-fleet-system + +# Enable/disable individual controllers +controllers: + bundle: false + bundledeployment: false + cluster: false + gitrepo: false + helmop: false + +# Worker counts per controller +workers: + bundle: 5 + bundledeployment: 5 + cluster: 5 + gitrepo: 5 + helmop: 5 + +# Logging level / format +logFormat: json +logLevel: info +debug: false +debugLevel: 0 + +# Sharding (same as fleet controller) +shardID: "" + +# Node selector and tolerations +nodeSelector: {} +tolerations: [] +priorityClassName: "" + +# Leader election +leaderElection: + enabled: true + leaseDuration: 30s + retryPeriod: 10s + renewDeadline: 25s + +# Resource limits +resources: + limits: + cpu: 500m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + +# Security context +securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + +# Extra env vars injected verbatim +extraEnv: [] + +# Per-controller logging configuration +logging: + bundle: + detailed: false + resourceFilter: + namespace: "" + name: "" + eventFilters: + generationChange: false + 
statusChange: false + annotationChange: false + labelChange: false + resourceVersionChange: false + deletion: false + notFound: false + create: false + triggeredBy: false + + bundleDeployment: + detailed: false + resourceFilter: + namespace: "" + name: "" + eventFilters: + generationChange: false + statusChange: false + annotationChange: false + labelChange: false + resourceVersionChange: false + deletion: false + notFound: false + create: false + triggeredBy: false + + cluster: + detailed: false + resourceFilter: + namespace: "" + name: "" + eventFilters: + generationChange: false + statusChange: false + annotationChange: false + labelChange: false + resourceVersionChange: false + deletion: false + notFound: false + create: false + triggeredBy: false + + gitRepo: + detailed: false + resourceFilter: + namespace: "" + name: "" + eventFilters: + # all false + + helmOp: + detailed: false + resourceFilter: + namespace: "" + name: "" + eventFilters: + # all false + + summary: + interval: "30s" + resetOnPrint: false +``` + +--- + +## Quick Start + +### Build + +```bash +go build -o bin/fleeteventmonitor ./cmd/fleeteventmonitor +``` + +### Run locally (standalone) + +Set environment variables before running. 
At minimum, enable at least one controller: + +```bash +# Enable the bundle controller in summary mode +export ENABLE_BUNDLE_EVENT_MONITOR=true +export NAMESPACE=cattle-fleet-system + +./bin/fleeteventmonitor --kubeconfig ~/.kube/config +``` + +For detailed logging with filters: + +```bash +export ENABLE_BUNDLE_EVENT_MONITOR=true +export NAMESPACE=cattle-fleet-system +export FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true +export FLEET_EVENT_MONITOR_BUNDLE_EVENT_STATUS_CHANGE=true +export FLEET_EVENT_MONITOR_BUNDLE_EVENT_TRIGGERED_BY=true + +./bin/fleeteventmonitor --kubeconfig ~/.kube/config +``` + +To narrow down to a specific resource: + +```bash +export ENABLE_BUNDLE_EVENT_MONITOR=true +export NAMESPACE=cattle-fleet-system +export FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true +export FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE="^fleet-local$" +export FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME="^my-app$" + +./bin/fleeteventmonitor --kubeconfig ~/.kube/config +``` + +### Deploy with Helm + +```bash +# All controllers in summary mode +helm install fleet-event-monitor ./charts/fleet-event-monitor \ + --namespace cattle-fleet-system \ + --set controllers.bundle=true \ + --set controllers.bundledeployment=true \ + --set controllers.cluster=true \ + --set controllers.gitrepo=true \ + --set controllers.helmop=true + +# Bundle controller in detailed mode, all event types +helm install fleet-event-monitor ./charts/fleet-event-monitor \ + --namespace cattle-fleet-system \ + --set controllers.bundle=true \ + --set logging.bundle.detailed=true + +# Only log generation-change and triggered-by events for Bundle +helm install fleet-event-monitor ./charts/fleet-event-monitor \ + --namespace cattle-fleet-system \ + --set controllers.bundle=true \ + --set logging.bundle.detailed=true \ + --set logging.bundle.eventFilters.generationChange=true \ + --set logging.bundle.eventFilters.triggeredBy=true +``` + +### Upgrade and change config without rebuilding + +```bash +helm upgrade 
fleet-event-monitor ./charts/fleet-event-monitor \ + --reuse-values \ + --set logging.cluster.detailed=true \ + --set logging.cluster.eventFilters.statusChange=true +``` + +### With sharding + +```bash +helm install fleet-event-monitor-shard0 ./charts/fleet-event-monitor --set shardID=shard0 +helm install fleet-event-monitor-shard1 ./charts/fleet-event-monitor --set shardID=shard1 +``` + +--- + +## RBAC + +ClusterRole: `get`, `list`, `watch` on Fleet resources, core resources, RBAC resources, Jobs, Deployments. +Role (namespaced): `get`, `list`, `watch`, `create`, `update`, `patch`, `delete` on `coordination.k8s.io/leases` (leader election only). +No write access to any other Fleet or Kubernetes resource. + +--- + +## Known Limitations + +| Limitation | Workaround | +|---|---| +| Controller-runtime doesn't expose which watch triggered a reconciliation | Log at fan-out mapping functions (Cluster→Bundle handler, BD→Bundle handler) | +| `TypedResourceVersionUnchangedPredicate` causes cache-sync noise | In detailed mode, set the event filters you care about to `true` and leave `eventFilters.resourceVersionChange=false`; with all filters `false`, every event type (including these) is still logged | + +--- + +## Dev Scripts + +Several scripts in `dev/` help parse and visualize monitor output. They all read from stdin, a pipe, or a file argument and require `jq`. + +### `dev/format-monitor-summary.sh` — pretty-print the JSON summary + +Parses all `Fleet Monitor Summary` lines from a log stream and renders the last (or cumulative) summary as a human-readable table. Also computes the time range covered if multiple summaries are present. 
+ +```bash +# From a running pod +kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | ./dev/format-monitor-summary.sh + +# From a saved log file +./dev/format-monitor-summary.sh logs.json +``` + +Output example: +``` +================================================================================ + FLEET MONITOR SUMMARY +================================================================================ + Timestamp: 2026-02-09T10:00:30Z + Interval: 30s + Total Resources: 3 + Total Events: 63 +================================================================================ + +▼ Bundle +------------------------------------------------------------------------------- + RESOURCE CREATE DELETE N-FOUND STATUS GEN-CHG ANNOT LABEL RESVER EVENTS + ---------------------- ------ -------- ------- -------- ------- ----- ----- ------ ------ + fleet-local/my-app 1 0 0 20 5 0 0 0 41 + └─ triggered-by: BundleDeployment = 12 + └─ triggered-by: Cluster = 3 +================================================================================ + Time range: ... +================================================================================ +``` + +### `dev/parse-status-log.sh` — visualize status change diffs + +Filters `status-change` events from detailed log output and renders each diff with colour-coded `+`/`-` lines. + +Requires the controller to be running in detailed mode with `statusChange` enabled: +```bash +export FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true +export FLEET_EVENT_MONITOR_BUNDLE_EVENT_STATUS_CHANGE=true +``` + +Usage: +```bash +kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | ./dev/parse-status-log.sh +``` + +### `dev/parse-resourceversion-log.sh` — visualize resource version change diffs + +Filters `resourceversion-change` events and renders each event with version numbers, change reason, metadata change list, and colour-coded diff output. 
Useful for identifying which SSA managers or finalizer changes are causing metadata-only reconciliation loops. + +Requires the controller to be running in detailed mode with `resourceVersionChange` enabled: +```bash +export FLEET_EVENT_MONITOR_BUNDLE_DETAILED=true +export FLEET_EVENT_MONITOR_BUNDLE_EVENT_RESVER_CHANGE=true +``` + +Usage: +```bash +kubectl logs -n cattle-fleet-system deploy/fleet-event-monitor | ./dev/parse-resourceversion-log.sh +``` + +The output includes a `Changed:` line listing which metadata fields changed (`finalizers`, `ownerReferences`, `managedFields`) and, for `managedFields`, a manager-level summary (added/removed/changed SSA managers) followed by a field-level diff of their `FieldsV1` entries. diff --git a/charts/fleet-event-monitor/Chart.yaml b/charts/fleet-event-monitor/Chart.yaml new file mode 100644 index 0000000000..030fd94688 --- /dev/null +++ b/charts/fleet-event-monitor/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: fleet-event-monitor +version: 0.0.0 +appVersion: 0.0.0 +description: Fleet Event Monitor - Read-only monitoring for Fleet controllers +icon: https://charts.rancher.io/assets/logos/fleet.svg +annotations: + catalog.cattle.io/namespace: cattle-fleet-system + catalog.cattle.io/kube-version: '>= 1.28.0-0 < 1.35.0-0' diff --git a/charts/fleet-event-monitor/templates/_helpers.tpl b/charts/fleet-event-monitor/templates/_helpers.tpl new file mode 100644 index 0000000000..c40c6b9467 --- /dev/null +++ b/charts/fleet-event-monitor/templates/_helpers.tpl @@ -0,0 +1,34 @@ +{{- define "fleet-event-monitor.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{- define "fleet-event-monitor.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf 
"%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{- define "fleet-event-monitor.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{- define "fleet-event-monitor.labels" -}} +helm.sh/chart: {{ include "fleet-event-monitor.chart" . }} +{{ include "fleet-event-monitor.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{- define "fleet-event-monitor.selectorLabels" -}} +app.kubernetes.io/name: {{ include "fleet-event-monitor.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} diff --git a/charts/fleet-event-monitor/templates/deployment.yaml b/charts/fleet-event-monitor/templates/deployment.yaml new file mode 100644 index 0000000000..0291111372 --- /dev/null +++ b/charts/fleet-event-monitor/templates/deployment.yaml @@ -0,0 +1,243 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fleet-event-monitor{{- if .Values.shardID }}-shard-{{ .Values.shardID }}{{- end }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "fleet-event-monitor.labels" . | nindent 4 }} + {{- if .Values.shardID }} + fleet.cattle.io/shard-id: {{ .Values.shardID }} + {{- end }} +spec: + replicas: 1 + selector: + matchLabels: + {{- include "fleet-event-monitor.selectorLabels" . | nindent 6 }} + {{- if .Values.shardID }} + fleet.cattle.io/shard-id: {{ .Values.shardID }} + {{- end }} + template: + metadata: + labels: + {{- include "fleet-event-monitor.selectorLabels" . | nindent 8 }} + {{- if .Values.shardID }} + fleet.cattle.io/shard-id: {{ .Values.shardID }} + {{- end }} + spec: + serviceAccountName: fleet-event-monitor + {{- with .Values.securityContext }} + securityContext: + {{- toYaml . 
| nindent 8 }} + {{- end }} + containers: + - name: fleet-event-monitor + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.imagePullPolicy }} + command: + - fleeteventmonitor + {{- if .Values.shardID }} + - --shard-id={{ .Values.shardID }} + {{- end }} + {{- if .Values.debug }} + - --debug + - --debug-level={{ .Values.debugLevel }} + {{- end }} + env: + - name: NAMESPACE + value: {{ .Values.namespace | quote }} + - name: ENABLE_BUNDLE_EVENT_MONITOR + value: {{ .Values.controllers.bundle | quote }} + - name: ENABLE_BUNDLEDEPLOYMENT_EVENT_MONITOR + value: {{ .Values.controllers.bundledeployment | quote }} + - name: ENABLE_CLUSTER_EVENT_MONITOR + value: {{ .Values.controllers.cluster | quote }} + - name: ENABLE_GITREPO_EVENT_MONITOR + value: {{ .Values.controllers.gitrepo | quote }} + - name: ENABLE_HELMOP_EVENT_MONITOR + value: {{ .Values.controllers.helmop | quote }} + - name: BUNDLE_RECONCILER_WORKERS + value: {{ .Values.workers.bundle | quote }} + - name: BUNDLEDEPLOYMENT_RECONCILER_WORKERS + value: {{ .Values.workers.bundledeployment | quote }} + - name: CLUSTER_RECONCILER_WORKERS + value: {{ .Values.workers.cluster | quote }} + - name: GITREPO_RECONCILER_WORKERS + value: {{ .Values.workers.gitrepo | quote }} + - name: HELMOP_RECONCILER_WORKERS + value: {{ .Values.workers.helmop | quote }} + {{- if .Values.debug }} + - name: CATTLE_DEV_MODE + value: "true" + {{- end }} + {{- if .Values.leaderElection.enabled }} + - name: CATTLE_ELECTION_LEASE_DURATION + value: {{ .Values.leaderElection.leaseDuration | quote }} + - name: CATTLE_ELECTION_RETRY_PERIOD + value: {{ .Values.leaderElection.retryPeriod | quote }} + - name: CATTLE_ELECTION_RENEW_DEADLINE + value: {{ .Values.leaderElection.renewDeadline | quote }} + {{- end }} + # Per-controller detailed logging flags + - name: FLEET_EVENT_MONITOR_BUNDLE_DETAILED + value: {{ .Values.logging.bundle.detailed | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_DETAILED 
+ value: {{ .Values.logging.bundleDeployment.detailed | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_DETAILED + value: {{ .Values.logging.cluster.detailed | quote }} + - name: FLEET_EVENT_MONITOR_GITREPO_DETAILED + value: {{ .Values.logging.gitRepo.detailed | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_DETAILED + value: {{ .Values.logging.helmOp.detailed | quote }} + # Bundle resource filters + - name: FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE + value: {{ .Values.logging.bundle.resourceFilter.namespace | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME + value: {{ .Values.logging.bundle.resourceFilter.name | quote }} + # Bundle event filters + - name: FLEET_EVENT_MONITOR_BUNDLE_EVENT_GENERATION_CHANGE + value: {{ .Values.logging.bundle.eventFilters.generationChange | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLE_EVENT_STATUS_CHANGE + value: {{ .Values.logging.bundle.eventFilters.statusChange | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLE_EVENT_ANNOTATION_CHANGE + value: {{ .Values.logging.bundle.eventFilters.annotationChange | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLE_EVENT_LABEL_CHANGE + value: {{ .Values.logging.bundle.eventFilters.labelChange | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLE_EVENT_RESVER_CHANGE + value: {{ .Values.logging.bundle.eventFilters.resourceVersionChange | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLE_EVENT_DELETION + value: {{ .Values.logging.bundle.eventFilters.deletion | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLE_EVENT_NOT_FOUND + value: {{ .Values.logging.bundle.eventFilters.notFound | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLE_EVENT_CREATE + value: {{ .Values.logging.bundle.eventFilters.create | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLE_EVENT_TRIGGERED_BY + value: {{ .Values.logging.bundle.eventFilters.triggeredBy | quote }} + # BundleDeployment resource filters + - name: FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_RESOURCE_FILTER_NAMESPACE + value: {{ 
.Values.logging.bundleDeployment.resourceFilter.namespace | quote }} + - name: FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_RESOURCE_FILTER_NAME + value: {{ .Values.logging.bundleDeployment.resourceFilter.name | quote }} + # BundleDeployment event filters + - name: FLEET_EVENT_MONITOR_BD_EVENT_GENERATION_CHANGE + value: {{ .Values.logging.bundleDeployment.eventFilters.generationChange | quote }} + - name: FLEET_EVENT_MONITOR_BD_EVENT_STATUS_CHANGE + value: {{ .Values.logging.bundleDeployment.eventFilters.statusChange | quote }} + - name: FLEET_EVENT_MONITOR_BD_EVENT_ANNOTATION_CHANGE + value: {{ .Values.logging.bundleDeployment.eventFilters.annotationChange | quote }} + - name: FLEET_EVENT_MONITOR_BD_EVENT_LABEL_CHANGE + value: {{ .Values.logging.bundleDeployment.eventFilters.labelChange | quote }} + - name: FLEET_EVENT_MONITOR_BD_EVENT_RESVER_CHANGE + value: {{ .Values.logging.bundleDeployment.eventFilters.resourceVersionChange | quote }} + - name: FLEET_EVENT_MONITOR_BD_EVENT_DELETION + value: {{ .Values.logging.bundleDeployment.eventFilters.deletion | quote }} + - name: FLEET_EVENT_MONITOR_BD_EVENT_NOT_FOUND + value: {{ .Values.logging.bundleDeployment.eventFilters.notFound | quote }} + - name: FLEET_EVENT_MONITOR_BD_EVENT_CREATE + value: {{ .Values.logging.bundleDeployment.eventFilters.create | quote }} + - name: FLEET_EVENT_MONITOR_BD_EVENT_TRIGGERED_BY + value: {{ .Values.logging.bundleDeployment.eventFilters.triggeredBy | quote }} + # Cluster resource filters + - name: FLEET_EVENT_MONITOR_CLUSTER_RESOURCE_FILTER_NAMESPACE + value: {{ .Values.logging.cluster.resourceFilter.namespace | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_RESOURCE_FILTER_NAME + value: {{ .Values.logging.cluster.resourceFilter.name | quote }} + # Cluster event filters + - name: FLEET_EVENT_MONITOR_CLUSTER_EVENT_GENERATION_CHANGE + value: {{ .Values.logging.cluster.eventFilters.generationChange | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_EVENT_STATUS_CHANGE + value: {{ 
.Values.logging.cluster.eventFilters.statusChange | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_EVENT_ANNOTATION_CHANGE + value: {{ .Values.logging.cluster.eventFilters.annotationChange | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_EVENT_LABEL_CHANGE + value: {{ .Values.logging.cluster.eventFilters.labelChange | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_EVENT_RESVER_CHANGE + value: {{ .Values.logging.cluster.eventFilters.resourceVersionChange | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_EVENT_DELETION + value: {{ .Values.logging.cluster.eventFilters.deletion | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_EVENT_NOT_FOUND + value: {{ .Values.logging.cluster.eventFilters.notFound | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_EVENT_CREATE + value: {{ .Values.logging.cluster.eventFilters.create | quote }} + - name: FLEET_EVENT_MONITOR_CLUSTER_EVENT_TRIGGERED_BY + value: {{ .Values.logging.cluster.eventFilters.triggeredBy | quote }} + # GitRepo resource filters + - name: FLEET_EVENT_MONITOR_GITREPO_RESOURCE_FILTER_NAMESPACE + value: {{ .Values.logging.gitRepo.resourceFilter.namespace | quote }} + - name: FLEET_EVENT_MONITOR_GITREPO_RESOURCE_FILTER_NAME + value: {{ .Values.logging.gitRepo.resourceFilter.name | quote }} + # GitRepo event filters + - name: FLEET_EVENT_MONITOR_GITREPO_EVENT_GENERATION_CHANGE + value: {{ .Values.logging.gitRepo.eventFilters.generationChange | quote }} + - name: FLEET_EVENT_MONITOR_GITREPO_EVENT_STATUS_CHANGE + value: {{ .Values.logging.gitRepo.eventFilters.statusChange | quote }} + - name: FLEET_EVENT_MONITOR_GITREPO_EVENT_ANNOTATION_CHANGE + value: {{ .Values.logging.gitRepo.eventFilters.annotationChange | quote }} + - name: FLEET_EVENT_MONITOR_GITREPO_EVENT_LABEL_CHANGE + value: {{ .Values.logging.gitRepo.eventFilters.labelChange | quote }} + - name: FLEET_EVENT_MONITOR_GITREPO_EVENT_RESVER_CHANGE + value: {{ .Values.logging.gitRepo.eventFilters.resourceVersionChange | quote }} + - name: 
FLEET_EVENT_MONITOR_GITREPO_EVENT_DELETION + value: {{ .Values.logging.gitRepo.eventFilters.deletion | quote }} + - name: FLEET_EVENT_MONITOR_GITREPO_EVENT_NOT_FOUND + value: {{ .Values.logging.gitRepo.eventFilters.notFound | quote }} + - name: FLEET_EVENT_MONITOR_GITREPO_EVENT_CREATE + value: {{ .Values.logging.gitRepo.eventFilters.create | quote }} + - name: FLEET_EVENT_MONITOR_GITREPO_EVENT_TRIGGERED_BY + value: {{ .Values.logging.gitRepo.eventFilters.triggeredBy | quote }} + # HelmOp resource filters + - name: FLEET_EVENT_MONITOR_HELMOP_RESOURCE_FILTER_NAMESPACE + value: {{ .Values.logging.helmOp.resourceFilter.namespace | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_RESOURCE_FILTER_NAME + value: {{ .Values.logging.helmOp.resourceFilter.name | quote }} + # HelmOp event filters + - name: FLEET_EVENT_MONITOR_HELMOP_EVENT_GENERATION_CHANGE + value: {{ .Values.logging.helmOp.eventFilters.generationChange | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_EVENT_STATUS_CHANGE + value: {{ .Values.logging.helmOp.eventFilters.statusChange | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_EVENT_ANNOTATION_CHANGE + value: {{ .Values.logging.helmOp.eventFilters.annotationChange | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_EVENT_LABEL_CHANGE + value: {{ .Values.logging.helmOp.eventFilters.labelChange | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_EVENT_RESVER_CHANGE + value: {{ .Values.logging.helmOp.eventFilters.resourceVersionChange | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_EVENT_DELETION + value: {{ .Values.logging.helmOp.eventFilters.deletion | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_EVENT_NOT_FOUND + value: {{ .Values.logging.helmOp.eventFilters.notFound | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_EVENT_CREATE + value: {{ .Values.logging.helmOp.eventFilters.create | quote }} + - name: FLEET_EVENT_MONITOR_HELMOP_EVENT_TRIGGERED_BY + value: {{ .Values.logging.helmOp.eventFilters.triggeredBy | quote }} + # Summary configuration + - name: 
FLEET_EVENT_MONITOR_SUMMARY_INTERVAL + value: {{ .Values.logging.summary.interval | quote }} + - name: FLEET_EVENT_MONITOR_SUMMARY_RESET + value: {{ .Values.logging.summary.resetOnPrint | quote }} + {{- with .Values.extraEnv }} + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + resources: + {{- toYaml .Values.resources | nindent 10 }} + volumeMounts: + - name: tmp + mountPath: /tmp + volumes: + - name: tmp + emptyDir: {} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} diff --git a/charts/fleet-event-monitor/templates/rbac.yaml b/charts/fleet-event-monitor/templates/rbac.yaml new file mode 100644 index 0000000000..10ad1b04e2 --- /dev/null +++ b/charts/fleet-event-monitor/templates/rbac.yaml @@ -0,0 +1,131 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fleet-event-monitor + namespace: {{ .Release.Namespace }} + labels: + {{- include "fleet-event-monitor.labels" . | nindent 4 }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: fleet-event-monitor + labels: + {{- include "fleet-event-monitor.labels" . 
| nindent 4 }} +rules: +# Fleet resources - READ ONLY +- apiGroups: + - fleet.cattle.io + resources: + - bundles + - bundledeployments + - bundlenamespacemappings + - clusters + - clustergroups + - gitrepos + - imagescans + - helmops + - contents + verbs: + - get + - list + - watch + +# Core resources - READ ONLY +- apiGroups: + - "" + resources: + - namespaces + - secrets + - configmaps + - serviceaccounts + verbs: + - get + - list + - watch + +# RBAC resources - READ ONLY +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - clusterrolebindings + - roles + - rolebindings + verbs: + - get + - list + - watch + +# Jobs - READ ONLY +- apiGroups: + - batch + resources: + - jobs + verbs: + - get + - list + - watch + +# Apps - READ ONLY +- apiGroups: + - apps + resources: + - deployments + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fleet-event-monitor + labels: + {{- include "fleet-event-monitor.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: fleet-event-monitor +subjects: +- kind: ServiceAccount + name: fleet-event-monitor + namespace: {{ .Release.Namespace }} +--- +# Role for leader election (write access to leases only) +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: fleet-event-monitor + namespace: {{ .Release.Namespace }} + labels: + {{- include "fleet-event-monitor.labels" . | nindent 4 }} +rules: +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: fleet-event-monitor + namespace: {{ .Release.Namespace }} + labels: + {{- include "fleet-event-monitor.labels" . 
| nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: fleet-event-monitor +subjects: +- kind: ServiceAccount + name: fleet-event-monitor + namespace: {{ .Release.Namespace }} diff --git a/charts/fleet-event-monitor/values.yaml b/charts/fleet-event-monitor/values.yaml new file mode 100644 index 0000000000..a0efe9282e --- /dev/null +++ b/charts/fleet-event-monitor/values.yaml @@ -0,0 +1,173 @@ +image: + repository: rancher/fleet-event-monitor + tag: dev + imagePullPolicy: IfNotPresent + +# Namespace to watch +namespace: cattle-fleet-system + +# Which controllers to enable +controllers: + bundle: false + bundledeployment: false + cluster: false + gitrepo: false + helmop: false + +# Worker counts (lower than production controllers) +workers: + bundle: 5 + bundledeployment: 5 + cluster: 5 + gitrepo: 5 + helmop: 5 + +# Logging configuration +logFormat: json +logLevel: info +debug: false +debugLevel: 0 + +# Sharding support (same as fleet controller) +shardID: "" + +# Node selector and tolerations +nodeSelector: {} +tolerations: [] +priorityClassName: "" + +# Leader election +leaderElection: + enabled: true + leaseDuration: 30s + retryPeriod: 10s + renewDeadline: 25s + +# Resource limits (monitors should be lightweight) +resources: + limits: + cpu: 500m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + +# Security context +securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + +# Extra environment variables +extraEnv: [] +# Example: +# - name: CUSTOM_VAR +# value: "custom_value" + +# Per-controller logging configuration +logging: + # Bundle controller logging + bundle: + detailed: false # true = detailed logs, false = summary only + + # Resource filter (optional) - filter which resources to monitor + # Empty strings match all resources (backwards compatible) + resourceFilter: + namespace: "" # Regular expression for namespace matching (e.g., "^fleet-local$") + name: "" # Regular expression for 
name matching (e.g., "^test-.*") + + eventFilters: + generationChange: false + statusChange: false + annotationChange: false + labelChange: false + resourceVersionChange: false + deletion: false + notFound: false + create: false + triggeredBy: false + + # BundleDeployment controller logging + bundleDeployment: + detailed: false + + resourceFilter: + namespace: "" + name: "" + + eventFilters: + generationChange: false + statusChange: false + annotationChange: false + labelChange: false + resourceVersionChange: false + deletion: false + notFound: false + create: false + triggeredBy: false + + # Cluster controller logging + cluster: + detailed: false + + resourceFilter: + namespace: "" + name: "" + + eventFilters: + generationChange: false + statusChange: false + annotationChange: false + labelChange: false + resourceVersionChange: false + deletion: false + notFound: false + create: false + triggeredBy: false + + # GitRepo controller logging + gitRepo: + detailed: false + + resourceFilter: + namespace: "" + name: "" + + eventFilters: + generationChange: false + statusChange: false + annotationChange: false + labelChange: false + resourceVersionChange: false + deletion: false + notFound: false + create: false + triggeredBy: false + + # HelmOp controller logging + helmOp: + detailed: false + + resourceFilter: + namespace: "" + name: "" + + eventFilters: + generationChange: false + statusChange: false + annotationChange: false + labelChange: false + resourceVersionChange: false + deletion: false + notFound: false + create: false + triggeredBy: false + + # Summary configuration (applies to all controllers in summary mode) + summary: + # How often to emit summary in duration format (5s, 30s, 1m, etc.) 
+ interval: "30s" + + # Reset counters after each summary (false = cumulative counts) + resetOnPrint: false diff --git a/cmd/fleeteventmonitor/main.go b/cmd/fleeteventmonitor/main.go new file mode 100644 index 0000000000..719c90a732 --- /dev/null +++ b/cmd/fleeteventmonitor/main.go @@ -0,0 +1,19 @@ +// Package main provides the entrypoint for the fleet-event-monitor binary. +package main + +import ( + _ "net/http/pprof" + + "github.com/rancher/wrangler/v3/pkg/signals" + "github.com/sirupsen/logrus" + + "github.com/rancher/fleet/internal/cmd/monitor" +) + +func main() { + ctx := signals.SetupSignalContext() + cmd := monitor.App() + if err := cmd.ExecuteContext(ctx); err != nil { + logrus.Fatal(err) + } +} diff --git a/dev/build-event-monitor b/dev/build-event-monitor new file mode 100755 index 0000000000..a8a183ec16 --- /dev/null +++ b/dev/build-event-monitor @@ -0,0 +1,23 @@ +#!/bin/bash +# Description: build fleet binary and image with debug flags + +set -euxo pipefail + +if [ ! -d ./cmd/fleetcontroller ]; then + echo "please change the current directory to the fleet repo checkout" + exit 1 +fi + +export GOARCH="${GOARCH:-amd64}" +export CGO_ENABLED=0 + +# re-generate code +if ! git diff --quiet HEAD origin/main -- pkg/apis/fleet.cattle.io/v1alpha1; then + go generate +fi + +export GOOS=linux + +# fleet-monitor +go build -gcflags='all=-N -l' -o "bin/fleeteventmonitor-linux-$GOARCH" ./cmd/fleeteventmonitor +docker build -f package/Dockerfile.event-monitor -t rancher/fleet-event-monitor:dev --build-arg="ARCH=$GOARCH" . 
\ No newline at end of file diff --git a/dev/format-monitor-summary.sh b/dev/format-monitor-summary.sh new file mode 100755 index 0000000000..dcddc949f8 --- /dev/null +++ b/dev/format-monitor-summary.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# format-monitor-summary.sh - Parse and display Fleet Monitor summary in a readable format +# +# Usage: +# cat logfile.log | ./format-monitor-summary.sh +# ./format-monitor-summary.sh < logfile.log +# ./format-monitor-summary.sh logfile.log + +set -euo pipefail + +# Check if jq is installed +if ! command -v jq &> /dev/null; then + echo "Error: jq is required but not installed. Please install jq first." >&2 + exit 1 +fi + +# Read input (either from file argument, pipe, or stdin) +if [ $# -gt 0 ]; then + input=$(cat "$1") +else + input=$(cat) +fi + +# Find all lines with "Fleet Monitor Summary" and write to a temp file +tmp_summaries=$(mktemp) +trap 'rm -f "$tmp_summaries"' EXIT +echo "$input" | grep '"msg":"Fleet Monitor Summary"' > "$tmp_summaries" || true + +if [ ! 
-s "$tmp_summaries" ]; then + echo "Error: No 'Fleet Monitor Summary' log line found in input" >&2 + exit 1 +fi + +# Extract first and last summary lines (using file to avoid SIGPIPE with pipefail) +first_json=$(head -1 "$tmp_summaries" | grep -o '{"level":"info".*}') +json=$(tail -1 "$tmp_summaries" | grep -o '{"level":"info".*}') + +# Calculate time range across all summaries +first_ts=$(echo "$first_json" | jq -r '.summary.timestamp') +last_ts=$(echo "$json" | jq -r '.summary.timestamp') +summary_count=$(wc -l < "$tmp_summaries" | tr -d ' ') + +# Extract summary data +summary=$(echo "$json" | jq -r '.msg') +timestamp=$(echo "$json" | jq -r '.summary.timestamp') +interval=$(echo "$json" | jq -r '.summary.interval_seconds') +total_resources=$(echo "$json" | jq -r '.summary.totals.total_resources_monitored') +total_events=$(echo "$json" | jq -r '.summary.totals.total_events') + +# Print header +echo "================================================================================" +echo " FLEET MONITOR SUMMARY" +echo "================================================================================" +echo " Timestamp: $timestamp" +echo " Interval: ${interval}s" +echo " Total Resources: $total_resources" +echo " Total Events: $total_events" +echo "================================================================================" +echo + +# Function to print a resource type table +print_resource_table() { + local resource_type=$1 + local data=$(echo "$json" | jq -r ".summary.summary.\"$resource_type\"") + + if [ "$data" = "null" ] || [ -z "$data" ]; then + return + fi + + echo "▼ $resource_type" + echo "-------------------------------------------------------------------------------" + + # Get all resource names + local resources=$(echo "$data" | jq -r 'keys[]') + + if [ -z "$resources" ]; then + echo " No resources" + echo + return + fi + + # Calculate maximum resource name length + local max_len=8 # Minimum width for "RESOURCE" header + while IFS= read -r resource; do 
+ local len=${#resource} + if [ $len -gt $max_len ]; then + max_len=$len + fi + done <<< "$resources" + + # Add some padding + max_len=$((max_len + 2)) + + # Print table header + printf " %-${max_len}s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n" "RESOURCE" "CREATE" "DELETE" "N-FOUND" "STATUS" "GEN-CHG" "ANNOT" "LABEL" "RESVER" "EVENTS" + local separator=$(printf '%*s' $max_len | tr ' ' '-') + printf " %-${max_len}s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n" "$separator" "------" "------" "-------" "------" "-------" "-----" "-----" "------" "------" + + # Print each resource + while IFS= read -r resource; do + local create=$(echo "$data" | jq -r ".\"$resource\".create // 0") + local deletion=$(echo "$data" | jq -r ".\"$resource\".deletion // 0") + local not_found=$(echo "$data" | jq -r ".\"$resource\".\"not-found\" // 0") + local status_change=$(echo "$data" | jq -r ".\"$resource\".\"status-change\" // 0") + local gen_change=$(echo "$data" | jq -r ".\"$resource\".\"generation-change\" // 0") + local annot_change=$(echo "$data" | jq -r ".\"$resource\".\"annotation-change\" // 0") + local label_change=$(echo "$data" | jq -r ".\"$resource\".\"label-change\" // 0") + local resver_change=$(echo "$data" | jq -r ".\"$resource\".\"resourceversion-change\" // 0") + local total_events=$(echo "$data" | jq -r ".\"$resource\".total_events // 0") + + printf " %-${max_len}s %8d %8d %8d %8d %8d %8d %8d %8d %8d\n" \ + "$resource" "$create" "$deletion" "$not_found" "$status_change" "$gen_change" "$annot_change" "$label_change" "$resver_change" "$total_events" + + # Print triggered-by if present + local triggered_by=$(echo "$data" | jq -r ".\"$resource\".\"triggered-by\" // null") + if [ "$triggered_by" != "null" ]; then + echo "$triggered_by" | jq -r 'to_entries[] | " └─ triggered-by: \(.key) = \(.value)"' + fi + done <<< "$resources" + + echo +} + +# Print tables for each resource type +resource_types=$(echo "$json" | jq -r '.summary.summary | keys[]') + +while IFS= read -r resource_type; 
do + print_resource_table "$resource_type" +done <<< "$resource_types" + +echo "================================================================================" + +# Calculate and display time range +if [ "$first_ts" != "$last_ts" ]; then + first_epoch=$(date -d "$first_ts" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%S" "${first_ts%%.*}" +%s 2>/dev/null) + last_epoch=$(date -d "$last_ts" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%S" "${last_ts%%.*}" +%s 2>/dev/null) + duration_s=$(( last_epoch - first_epoch )) + hours=$(( duration_s / 3600 )) + minutes=$(( (duration_s % 3600) / 60 )) + seconds=$(( duration_s % 60 )) + echo " Time range: $first_ts" + echo " -> $last_ts" + printf " Duration: %02dh %02dm %02ds (%d summaries)\n" "$hours" "$minutes" "$seconds" "$summary_count" +else + echo " Time range: $first_ts (single summary)" +fi +echo "================================================================================" diff --git a/dev/parse-resourceversion-log.sh b/dev/parse-resourceversion-log.sh new file mode 100755 index 0000000000..21b5d517f7 --- /dev/null +++ b/dev/parse-resourceversion-log.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# parse-resourceversion-log.sh - Summarize resource version change diffs from bundle-monitor logs +# +# Usage: +# cat logs.json | ./dev/parse-resourceversion-log.sh +# kubectl logs | ./dev/parse-resourceversion-log.sh +# ./dev/parse-resourceversion-log.sh < logs.json + +set -euo pipefail + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +CYAN='\033[0;36m' +YELLOW='\033[1;33m' +MAGENTA='\033[0;35m' +BOLD='\033[1m' +DIM='\033[2m' +RESET='\033[0m' + +# Disable colors if not a terminal +if [[ ! 
-t 1 ]]; then + RED='' GREEN='' CYAN='' YELLOW='' MAGENTA='' BOLD='' DIM='' RESET='' +fi + +while IFS= read -r line; do + # Skip lines that aren't resourceversion-change events + event=$(echo "$line" | jq -r '.event // empty' 2>/dev/null) || continue + [[ "$event" == "resourceversion-change" ]] || continue + + ts=$(echo "$line" | jq -r '.ts // "?"') + bundle=$(echo "$line" | jq -r '.bundle // "?"') + gitrepo=$(echo "$line" | jq -r '.gitrepo // "?"') + commit=$(echo "$line" | jq -r '.commit // "?"') + old_rv=$(echo "$line" | jq -r '.oldResourceVersion // "?"') + new_rv=$(echo "$line" | jq -r '.newResourceVersion // "?"') + reason=$(echo "$line" | jq -r '.reason // "?"') + metadata_changes=$(echo "$line" | jq -r '(.metadataChanges // []) | join(", ")') + diff=$(echo "$line" | jq -r '.diff // empty') + + echo -e "${BOLD}${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}" + echo -e "${BOLD}Time:${RESET} $ts" + echo -e "${BOLD}Bundle:${RESET} $bundle" + echo -e "${BOLD}GitRepo:${RESET} $gitrepo" + echo -e "${BOLD}Commit:${RESET} ${DIM}${commit:0:12}${RESET}" + echo -e "${BOLD}Version:${RESET} ${YELLOW}$old_rv${RESET} → ${GREEN}$new_rv${RESET}" + echo -e "${BOLD}Reason:${RESET} $reason" + [[ -n "$metadata_changes" ]] && echo -e "${BOLD}Changed:${RESET} ${MAGENTA}$metadata_changes${RESET}" + + if [[ -n "$diff" ]]; then + echo -e "${BOLD}Diff:${RESET}" + echo "$diff" | while IFS= read -r dline; do + if [[ "$dline" =~ ^changed: ]]; then + echo -e " ${YELLOW}$dline${RESET}" + elif [[ "$dline" =~ ^added: ]]; then + echo -e " ${GREEN}$dline${RESET}" + elif [[ "$dline" =~ ^removed: ]]; then + echo -e " ${RED}$dline${RESET}" + elif [[ "$dline" =~ ^-[[:space:]] ]]; then + echo -e " ${RED}$dline${RESET}" + elif [[ "$dline" =~ ^\+[[:space:]] ]]; then + echo -e " ${GREEN}$dline${RESET}" + else + echo -e " ${DIM}$dline${RESET}" + fi + done + fi + + echo "" +done diff --git a/dev/parse-status-log.sh b/dev/parse-status-log.sh new file mode 100755 index 
0000000000..10ab6a43fc --- /dev/null +++ b/dev/parse-status-log.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# parse-status-log.sh - Summarize status change diffs from bundle-monitor logs +# +# Usage: +# cat logs.json | ./dev/parse-status-log.sh +# kubectl logs | ./dev/parse-status-log.sh +# ./dev/parse-status-log.sh < logs.json + +set -euo pipefail + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +CYAN='\033[0;36m' +YELLOW='\033[1;33m' +BOLD='\033[1m' +RESET='\033[0m' + +# Disable colors if not a terminal +if [[ ! -t 1 ]]; then + RED='' GREEN='' CYAN='' YELLOW='' BOLD='' RESET='' +fi + +while IFS= read -r line; do + # Skip lines that aren't status-change events + event=$(echo "$line" | jq -r '.event // empty' 2>/dev/null) || continue + [[ "$event" == "status-change" ]] || continue + + ts=$(echo "$line" | jq -r '.ts // "?"') + bundle=$(echo "$line" | jq -r '.bundle // "?"') + gitrepo=$(echo "$line" | jq -r '.gitrepo // "?"') + diff=$(echo "$line" | jq -r '.diff // empty') + + [[ -z "$diff" ]] && continue + + echo -e "${BOLD}${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}" + echo -e "${BOLD}Time:${RESET} $ts" + echo -e "${BOLD}Bundle:${RESET} $bundle" + echo -e "${BOLD}GitRepo:${RESET} $gitrepo" + echo -e "${BOLD}Changes:${RESET}" + + # Extract the meaningful diff lines (- and + prefixed) with context + echo "$diff" | while IFS= read -r dline; do + if [[ "$dline" =~ ^-[[:space:]] ]]; then + echo -e " ${RED}$dline${RESET}" + elif [[ "$dline" =~ ^\+[[:space:]] ]]; then + echo -e " ${GREEN}$dline${RESET}" + fi + done + + echo "" +done diff --git a/internal/cmd/monitor/operator.go b/internal/cmd/monitor/operator.go new file mode 100644 index 0000000000..b88c0ba8a3 --- /dev/null +++ b/internal/cmd/monitor/operator.go @@ -0,0 +1,380 @@ +package monitor + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/rancher/fleet/internal/cmd" + "github.com/rancher/fleet/internal/cmd/monitor/reconciler" + 
"github.com/rancher/fleet/internal/config" + "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" +) + +var ( + scheme = runtime.NewScheme() +) + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha1.AddToScheme(scheme)) +} + +// ControllerLogConfig holds logging configuration for a single controller +type ControllerLogConfig struct { + Detailed bool // true = detailed logs, false = summary only + EventFilters reconciler.EventTypeFilters // Which event types to show in detailed mode + ResourceFilter *reconciler.ResourceFilter // Which resources to monitor (namespace/name patterns) +} + +// ControllerLoggingConfig holds logging configuration for all controllers +type ControllerLoggingConfig struct { + Bundle ControllerLogConfig + BundleDeployment ControllerLogConfig + Cluster ControllerLogConfig + GitRepo ControllerLogConfig + HelmOp ControllerLogConfig +} + +type MonitorOptions struct { + EnableBundle bool + EnableBundleDeployment bool + EnableCluster bool + EnableGitRepo bool + EnableHelmOp bool + Workers MonitorReconcilerWorkers + + // Per-controller logging configuration + ControllerLogging ControllerLoggingConfig + + // Summary configuration + SummaryInterval time.Duration + SummaryReset bool +} + +func start( + ctx context.Context, + systemNamespace string, + config *rest.Config, + leaderOpts cmd.LeaderElectionOptions, + monitorOpts MonitorOptions, + shardID string, +) error { + // Compile resource filters and check for errors + if err := compileResourceFilters(&monitorOpts.ControllerLogging); err != nil { + return fmt.Errorf("invalid resource 
filter configuration: %w", err) + } + + setupLog.Info("starting fleet monitor", + "namespace", systemNamespace, + "shardID", shardID, + "enableBundle", monitorOpts.EnableBundle, + "enableBundleDeployment", monitorOpts.EnableBundleDeployment, + "enableCluster", monitorOpts.EnableCluster, + "enableGitRepo", monitorOpts.EnableGitRepo, + "enableHelmOp", monitorOpts.EnableHelmOp, + "bundleDetailedLogs", monitorOpts.ControllerLogging.Bundle.Detailed, + "bundleDeploymentDetailedLogs", monitorOpts.ControllerLogging.BundleDeployment.Detailed, + "clusterDetailedLogs", monitorOpts.ControllerLogging.Cluster.Detailed, + "gitRepoDetailedLogs", monitorOpts.ControllerLogging.GitRepo.Detailed, + "helmOpDetailedLogs", monitorOpts.ControllerLogging.HelmOp.Detailed, + "summaryInterval", monitorOpts.SummaryInterval, + "summaryReset", monitorOpts.SummaryReset, + ) + + // Log resource filter configuration if any filters are set + logResourceFilters(&monitorOpts.ControllerLogging) + + // Start summary printer (always runs, prints stats for all controllers) + go startSummaryPrinter(ctx, monitorOpts.SummaryInterval, monitorOpts.SummaryReset) + + // No metrics for monitoring controllers + metricServerOptions := metricsserver.Options{BindAddress: "0"} + + var leaderElectionSuffix string + if shardID != "" { + leaderElectionSuffix = fmt.Sprintf("-%s", shardID) + } + + mgr, err := ctrl.NewManager(config, ctrl.Options{ + Scheme: scheme, + Metrics: metricServerOptions, + HealthProbeBindAddress: "0", // No health probes + LeaderElection: true, + LeaderElectionID: fmt.Sprintf("fleet-event-monitor-leader-election-shard%s", leaderElectionSuffix), + LeaderElectionNamespace: systemNamespace, + LeaseDuration: &leaderOpts.LeaseDuration, + RenewDeadline: &leaderOpts.RenewDeadline, + RetryPeriod: &leaderOpts.RetryPeriod, + }) + if err != nil { + setupLog.Error(err, "unable to start manager") + return err + } + + // Add field indexers required by the monitor controllers + if monitorOpts.EnableBundle { + if 
err := addBundleDownstreamResourceIndexer(ctx, mgr); err != nil { + setupLog.Error(err, "unable to add Bundle downstream resource indexer") + return err + } + } + + if monitorOpts.EnableGitRepo { + if err := addGitRepoSecretIndexers(ctx, mgr); err != nil { + setupLog.Error(err, "unable to add GitRepo secret indexers") + return err + } + } + + // Register enabled monitor controllers with per-controller logging mode + if monitorOpts.EnableBundle { + if err := (&reconciler.BundleMonitorReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + ShardID: shardID, + Workers: monitorOpts.Workers.Bundle, + Query: reconciler.NewBundleQuery(mgr.GetClient()), + DetailedLogs: monitorOpts.ControllerLogging.Bundle.Detailed, + EventFilters: monitorOpts.ControllerLogging.Bundle.EventFilters, + ResourceFilter: monitorOpts.ControllerLogging.Bundle.ResourceFilter, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create monitor controller", "controller", "Bundle") + return err + } + setupLog.Info("registered monitor controller", "controller", "Bundle", "workers", monitorOpts.Workers.Bundle, "mode", reconciler.LogMode(monitorOpts.ControllerLogging.Bundle.Detailed)) + } + + if monitorOpts.EnableCluster { + if err := (&reconciler.ClusterMonitorReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + ShardID: shardID, + Workers: monitorOpts.Workers.Cluster, + DetailedLogs: monitorOpts.ControllerLogging.Cluster.Detailed, + EventFilters: monitorOpts.ControllerLogging.Cluster.EventFilters, + ResourceFilter: monitorOpts.ControllerLogging.Cluster.ResourceFilter, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create monitor controller", "controller", "Cluster") + return err + } + setupLog.Info("registered monitor controller", "controller", "Cluster", "workers", monitorOpts.Workers.Cluster, "mode", reconciler.LogMode(monitorOpts.ControllerLogging.Cluster.Detailed)) + } + + if monitorOpts.EnableBundleDeployment { + if err := 
(&reconciler.BundleDeploymentMonitorReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + ShardID: shardID, + Workers: monitorOpts.Workers.BundleDeployment, + DetailedLogs: monitorOpts.ControllerLogging.BundleDeployment.Detailed, + EventFilters: monitorOpts.ControllerLogging.BundleDeployment.EventFilters, + ResourceFilter: monitorOpts.ControllerLogging.BundleDeployment.ResourceFilter, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create monitor controller", "controller", "BundleDeployment") + return err + } + setupLog.Info("registered monitor controller", "controller", "BundleDeployment", "workers", monitorOpts.Workers.BundleDeployment, "mode", reconciler.LogMode(monitorOpts.ControllerLogging.BundleDeployment.Detailed)) + } + + if monitorOpts.EnableGitRepo { + if err := (&reconciler.GitRepoMonitorReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + ShardID: shardID, + Workers: monitorOpts.Workers.GitRepo, + DetailedLogs: monitorOpts.ControllerLogging.GitRepo.Detailed, + EventFilters: monitorOpts.ControllerLogging.GitRepo.EventFilters, + ResourceFilter: monitorOpts.ControllerLogging.GitRepo.ResourceFilter, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create monitor controller", "controller", "GitRepo") + return err + } + setupLog.Info("registered monitor controller", "controller", "GitRepo", "workers", monitorOpts.Workers.GitRepo, "mode", reconciler.LogMode(monitorOpts.ControllerLogging.GitRepo.Detailed)) + } + + if monitorOpts.EnableHelmOp { + if err := (&reconciler.HelmOpMonitorReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + ShardID: shardID, + Workers: monitorOpts.Workers.HelmOp, + DetailedLogs: monitorOpts.ControllerLogging.HelmOp.Detailed, + EventFilters: monitorOpts.ControllerLogging.HelmOp.EventFilters, + ResourceFilter: monitorOpts.ControllerLogging.HelmOp.ResourceFilter, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create 
monitor controller", "controller", "HelmOp") + return err + } + setupLog.Info("registered monitor controller", "controller", "HelmOp", "workers", monitorOpts.Workers.HelmOp, "mode", reconciler.LogMode(monitorOpts.ControllerLogging.HelmOp.Detailed)) + } + + setupLog.Info("starting manager") + if err := mgr.Start(ctx); err != nil { + setupLog.Error(err, "problem running manager") + return err + } + + return nil +} + +// compileResourceFilters compiles all resource filter regex patterns +// Returns error if any pattern is invalid +func compileResourceFilters(cfg *ControllerLoggingConfig) error { + if err := cfg.Bundle.ResourceFilter.Compile(); err != nil { + return fmt.Errorf("Bundle resource filter: %w", err) + } + if err := cfg.BundleDeployment.ResourceFilter.Compile(); err != nil { + return fmt.Errorf("BundleDeployment resource filter: %w", err) + } + if err := cfg.Cluster.ResourceFilter.Compile(); err != nil { + return fmt.Errorf("Cluster resource filter: %w", err) + } + if err := cfg.GitRepo.ResourceFilter.Compile(); err != nil { + return fmt.Errorf("GitRepo resource filter: %w", err) + } + if err := cfg.HelmOp.ResourceFilter.Compile(); err != nil { + return fmt.Errorf("HelmOp resource filter: %w", err) + } + return nil +} + +// logResourceFilters logs resource filter configuration for debugging +func logResourceFilters(cfg *ControllerLoggingConfig) { + logFilter := func(controller string, filter *reconciler.ResourceFilter) { + if filter != nil && (filter.NamespacePattern != "" || filter.NamePattern != "") { + setupLog.Info("resource filter configured", + "controller", controller, + "namespacePattern", filter.NamespacePattern, + "namePattern", filter.NamePattern, + ) + } + } + + logFilter("Bundle", cfg.Bundle.ResourceFilter) + logFilter("BundleDeployment", cfg.BundleDeployment.ResourceFilter) + logFilter("Cluster", cfg.Cluster.ResourceFilter) + logFilter("GitRepo", cfg.GitRepo.ResourceFilter) + logFilter("HelmOp", cfg.HelmOp.ResourceFilter) +} + +// 
startSummaryPrinter periodically prints statistics summary +func startSummaryPrinter(ctx context.Context, interval time.Duration, reset bool) { + ticker := time.NewTicker(interval) + defer ticker.Stop() + + statsTracker := reconciler.GetStatsTracker() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + summary := statsTracker.GetSummary() + + // Convert to JSON and print + jsonStr, err := summary.ToJSON() + if err != nil { + setupLog.Error(err, "failed to marshal summary to JSON") + continue + } + + // Print as structured log (will be formatted as JSON by zap) + setupLog.Info("Fleet Monitor Summary", "summary", json.RawMessage(jsonStr)) + + // Reset or just update timestamp + if reset { + statsTracker.Reset() + } else { + statsTracker.UpdateLastSummaryTime() + } + } + } +} + +// addBundleDownstreamResourceIndexer indexes Bundles by their DownstreamResources (secrets and configmaps). +// Required for the bundle monitor's Secret/ConfigMap watches. +func addBundleDownstreamResourceIndexer(ctx context.Context, mgr manager.Manager) error { + return mgr.GetFieldIndexer().IndexField( + ctx, + &v1alpha1.Bundle{}, + config.BundleDownstreamResourceIndex, + func(obj client.Object) []string { + bundle, ok := obj.(*v1alpha1.Bundle) + if !ok { + return nil + } + + var resources []string + for _, dr := range bundle.Spec.DownstreamResources { + lowerKind := strings.ToLower(dr.Kind) + if lowerKind == "secret" || lowerKind == "configmap" { + resources = append(resources, fmt.Sprintf("%s/%s", lowerKind, dr.Name)) + } + } + return resources + }, + ) +} + +// addGitRepoSecretIndexers adds field indexers for GitRepo secret fields. +// Required for the gitrepo monitor's Secret watch. 
+func addGitRepoSecretIndexers(ctx context.Context, mgr manager.Manager) error { + if err := mgr.GetFieldIndexer().IndexField( + ctx, + &v1alpha1.GitRepo{}, + config.GitRepoClientSecretNameIndex, + func(obj client.Object) []string { + gitRepo, ok := obj.(*v1alpha1.GitRepo) + if !ok || gitRepo.Spec.ClientSecretName == "" { + return nil + } + return []string{gitRepo.Spec.ClientSecretName} + }, + ); err != nil { + return fmt.Errorf("GitRepoClientSecretName indexer: %w", err) + } + + if err := mgr.GetFieldIndexer().IndexField( + ctx, + &v1alpha1.GitRepo{}, + config.GitRepoHelmSecretNameIndex, + func(obj client.Object) []string { + gitRepo, ok := obj.(*v1alpha1.GitRepo) + if !ok || gitRepo.Spec.HelmSecretName == "" { + return nil + } + return []string{gitRepo.Spec.HelmSecretName} + }, + ); err != nil { + return fmt.Errorf("GitRepoHelmSecretName indexer: %w", err) + } + + if err := mgr.GetFieldIndexer().IndexField( + ctx, + &v1alpha1.GitRepo{}, + config.GitRepoHelmSecretNameForPathsIndex, + func(obj client.Object) []string { + gitRepo, ok := obj.(*v1alpha1.GitRepo) + if !ok || gitRepo.Spec.HelmSecretNameForPaths == "" { + return nil + } + return []string{gitRepo.Spec.HelmSecretNameForPaths} + }, + ); err != nil { + return fmt.Errorf("GitRepoHelmSecretNameForPaths indexer: %w", err) + } + + return nil +} diff --git a/internal/cmd/monitor/reconciler/bundle_monitor.go b/internal/cmd/monitor/reconciler/bundle_monitor.go new file mode 100644 index 0000000000..38b43b0575 --- /dev/null +++ b/internal/cmd/monitor/reconciler/bundle_monitor.go @@ -0,0 +1,261 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "context" + "fmt" + "strings" + + "github.com/rancher/fleet/internal/cmd/controller/target" + "github.com/rancher/fleet/internal/config" + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "github.com/rancher/fleet/pkg/sharding" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + 
ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +// BundleMonitorReconciler monitors Bundle reconciliations +type BundleMonitorReconciler struct { + client.Client + Scheme *runtime.Scheme + ShardID string + Workers int + + // BundleQuery for cluster->bundle mapping + Query BundleQuery + + // Cache to store previous state + cache *ObjectCache + + // Per-controller logging mode + DetailedLogs bool + EventFilters EventTypeFilters + ResourceFilter *ResourceFilter +} + +// SetupWithManager sets up the controller - mirrors BundleReconciler.SetupWithManager +func (r *BundleMonitorReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.cache = NewObjectCache() + + return ctrl.NewControllerManagedBy(mgr). + For(&fleet.Bundle{}, + builder.WithPredicates( + // do not trigger for bundle status changes (except for cache sync) + predicate.Or( + TypedResourceVersionUnchangedPredicate[client.Object]{}, + predicate.GenerationChangedPredicate{}, + predicate.AnnotationChangedPredicate{}, + predicate.LabelChangedPredicate{}, + ), + ), + ). + // Note: Maybe improve with WatchesMetadata, does it have access to labels? 
+ Watches( + // Fan out from bundledeployment to bundle + &fleet.BundleDeployment{}, + handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, a client.Object) []ctrl.Request { + bd := a.(*fleet.BundleDeployment) + labels := bd.GetLabels() + if labels == nil { + return nil + } + + ns, name := target.BundleFromDeployment(labels) + if ns != "" && name != "" { + // Check resource filter before logging + if r.ResourceFilter.Matches(ns, name) { + // Log trigger source + logger := log.FromContext(ctx) + logRelatedResourceTrigger(logger, r.DetailedLogs, r.EventFilters, "Bundle", ns, name, "BundleDeployment", a.GetName(), a.GetNamespace()) + + return []ctrl.Request{{ + NamespacedName: types.NamespacedName{ + Namespace: ns, + Name: name, + }, + }} + } + } + + return nil + }), + builder.WithPredicates(bundleDeploymentStatusChangedPredicate()), + ). + Watches( + // Fan out from cluster to bundle + &fleet.Cluster{}, + handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, a client.Object) []ctrl.Request { + cluster := a.(*fleet.Cluster) + logger := log.FromContext(ctx) + + // Query which bundles are affected by this cluster + bundlesToRefresh, _, err := r.Query.BundlesForCluster(ctx, cluster) + if err != nil { + // Log error but don't fail - monitoring shouldn't crash on query errors + logger.Error(err, "Failed to query bundles for cluster", + "cluster", cluster.Name, + "namespace", cluster.Namespace) + return nil + } + + requests := []ctrl.Request{} + for _, bundle := range bundlesToRefresh { + if !sharding.ShouldProcess(bundle, r.ShardID) { + continue + } + // Check resource filter before logging and enqueueing + if r.ResourceFilter.Matches(bundle.Namespace, bundle.Name) { + // Log each bundle trigger with correct name/namespace + logRelatedResourceTrigger(logger, r.DetailedLogs, r.EventFilters, + "Bundle", bundle.Namespace, bundle.Name, + "Cluster", cluster.GetName(), cluster.GetNamespace()) + + requests = append(requests, ctrl.Request{ + NamespacedName: 
types.NamespacedName{ + Namespace: bundle.Namespace, + Name: bundle.Name, + }, + }) + } + } + + return requests + }), + builder.WithPredicates(clusterChangedPredicate()), + ). + Watches( + // Fan out from secret to bundle, reconcile bundles when a secret + // referenced in DownstreamResources changes. + &corev1.Secret{}, + handler.EnqueueRequestsFromMapFunc(r.downstreamResourceMapFunc("Secret")), + builder.WithPredicates(dataChangedPredicate()), + ). + Watches( + // Fan out from configmap to bundle, reconcile bundles when a configmap + // referenced in DownstreamResources changes. + &corev1.ConfigMap{}, + handler.EnqueueRequestsFromMapFunc(r.downstreamResourceMapFunc("ConfigMap")), + builder.WithPredicates(dataChangedPredicate()), + ). + WithEventFilter(sharding.FilterByShardID(r.ShardID)). + WithOptions(controller.Options{MaxConcurrentReconciles: r.Workers}). + Complete(r) +} + +// downstreamResourceMapFunc returns a function that maps a Secret or ConfigMap to Bundles +// that reference it in their DownstreamResources. 
+func (r *BundleMonitorReconciler) downstreamResourceMapFunc(kind string) func(ctx context.Context, obj client.Object) []ctrl.Request { + lowerKind := strings.ToLower(kind) + + return func(ctx context.Context, obj client.Object) []ctrl.Request { + // Create the index key for this resource (Kind/Name) + indexKey := fmt.Sprintf("%s/%s", lowerKind, obj.GetName()) + + // Find all bundles that reference this resource + bundleList := &fleet.BundleList{} + err := r.List(ctx, bundleList, + client.InNamespace(obj.GetNamespace()), + client.MatchingFields{config.BundleDownstreamResourceIndex: indexKey}, + ) + if err != nil { + return nil + } + + requests := make([]ctrl.Request, 0, len(bundleList.Items)) + for _, bundle := range bundleList.Items { + if !sharding.ShouldProcess(&bundle, r.ShardID) { + continue + } + if !r.ResourceFilter.Matches(bundle.Namespace, bundle.Name) { + continue + } + requests = append(requests, ctrl.Request{ + NamespacedName: types.NamespacedName{ + Namespace: bundle.Namespace, + Name: bundle.Name, + }, + }) + } + + return requests + } +} + +// Reconcile monitors bundle reconciliation events +func (r *BundleMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Check resource filter - skip if resource doesn't match + if !r.ResourceFilter.Matches(req.Namespace, req.Name) { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx).WithName("bundle-monitor") + logger = logger.WithValues( + "bundle", req.NamespacedName.String(), + "mode", LogMode(r.DetailedLogs), + ) + ctx = log.IntoContext(ctx, logger) + + bundle := &fleet.Bundle{} + if err := r.Get(ctx, req.NamespacedName, bundle); err != nil { + if client.IgnoreNotFound(err) == nil { + logNotFound(logger, r.DetailedLogs, r.EventFilters, "Bundle", req.Namespace, req.Name) + r.cache.Delete(req.NamespacedName) + } + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // Add gitrepo context if available + if bundle.Labels[fleet.RepoLabel] != "" { + 
logger = logger.WithValues( + "gitrepo", bundle.Labels[fleet.RepoLabel], + "commit", bundle.Labels[fleet.CommitLabel], + ) + } + + // Check for deletion + if !bundle.DeletionTimestamp.IsZero() { + logDeletion(logger, r.DetailedLogs, r.EventFilters, "Bundle", bundle.Namespace, bundle.Name, bundle.DeletionTimestamp.String()) + r.cache.Delete(req.NamespacedName) + return ctrl.Result{}, nil + } + + // Retrieve old object from cache + oldBundle, exists := r.cache.Get(req.NamespacedName) + if !exists { + logCreate(logger, r.DetailedLogs, r.EventFilters, "Bundle", bundle.Namespace, bundle.Name, bundle.Generation, bundle.ResourceVersion) + r.cache.Set(req.NamespacedName, bundle.DeepCopy()) + return ctrl.Result{}, nil + } + + oldBundleTyped := oldBundle.(*fleet.Bundle) + + // Detect what changed - pass DetailedLogs flag + logSpecChange(logger, r.DetailedLogs, r.EventFilters, "Bundle", bundle.Namespace, bundle.Name, oldBundleTyped.Spec, bundle.Spec, oldBundleTyped.Generation, bundle.Generation) + logStatusChange(logger, r.DetailedLogs, r.EventFilters, "Bundle", bundle.Namespace, bundle.Name, oldBundleTyped.Status, bundle.Status) + logResourceVersionChangeWithMetadata(logger, r.DetailedLogs, r.EventFilters, "Bundle", bundle.Namespace, bundle.Name, oldBundleTyped, bundle) + logAnnotationChange(logger, r.DetailedLogs, r.EventFilters, "Bundle", bundle.Namespace, bundle.Name, oldBundleTyped.Annotations, bundle.Annotations) + logLabelChange(logger, r.DetailedLogs, r.EventFilters, "Bundle", bundle.Namespace, bundle.Name, oldBundleTyped.Labels, bundle.Labels) + + // Update cache with new state + r.cache.Set(req.NamespacedName, bundle.DeepCopy()) + + return ctrl.Result{}, nil +} + +// LogMode returns "detailed" or "summary" based on the flag. 
+func LogMode(detailed bool) string { + if detailed { + return "detailed" + } + return "summary" +} diff --git a/internal/cmd/monitor/reconciler/bundle_monitor_test.go b/internal/cmd/monitor/reconciler/bundle_monitor_test.go new file mode 100644 index 0000000000..1225b4ee18 --- /dev/null +++ b/internal/cmd/monitor/reconciler/bundle_monitor_test.go @@ -0,0 +1,339 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "context" + "testing" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +func newBundleTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(s)) + utilruntime.Must(fleet.AddToScheme(s)) + return s +} + +// newBundleReconciler creates a BundleMonitorReconciler with a fresh cache and the given objects. 
+func newBundleReconciler(t *testing.T, sch *runtime.Scheme, filter *ResourceFilter, objs ...fleet.Bundle) (*BundleMonitorReconciler, func()) { + t.Helper() + builder := fake.NewClientBuilder().WithScheme(sch) + for i := range objs { + builder = builder.WithObjects(&objs[i]) + } + c := builder.Build() + r := &BundleMonitorReconciler{ + Client: c, + Scheme: sch, + cache: NewObjectCache(), + ResourceFilter: filter, + } + reset := func() { globalStatsTracker.Reset() } + globalStatsTracker.Reset() + return r, reset +} + +func TestBundleMonitorReconciler_ResourceFilterSkip(t *testing.T) { + sch := newBundleTestScheme(t) + filter := &ResourceFilter{NamespacePattern: "fleet-.*"} + _ = filter.Compile() + + r, cleanup := newBundleReconciler(t, sch, filter) + defer cleanup() + + ctx := context.Background() + req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "default", Name: "my-bundle"}} + result, err := r.Reconcile(ctx, req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result != (ctrl.Result{}) { + t.Errorf("expected empty result, got %v", result) + } + // No stats should have been recorded for a filtered resource + if len(globalStatsTracker.stats) != 0 { + t.Errorf("expected no stats recorded for filtered resource, got %d entries", len(globalStatsTracker.stats)) + } +} + +func TestBundleMonitorReconciler_NotFound(t *testing.T) { + sch := newBundleTestScheme(t) + r, cleanup := newBundleReconciler(t, sch, nil) + defer cleanup() + + ctx := context.Background() + req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "ns", Name: "missing"}} + result, err := r.Reconcile(ctx, req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result != (ctrl.Result{}) { + t.Errorf("expected empty result, got %v", result) + } + + key := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "missing"} + stats := globalStatsTracker.stats[key] + if stats == nil || stats.Counts[EventTypeNotFound] != 1 { + 
t.Error("expected NotFound event to be recorded") + } +} + +func TestBundleMonitorReconciler_FirstObservation(t *testing.T) { + sch := newBundleTestScheme(t) + bundle := fleet.Bundle{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-bundle", + Namespace: "ns", + ResourceVersion: "1", + Generation: 1, + }, + } + r, cleanup := newBundleReconciler(t, sch, nil, bundle) + defer cleanup() + + ctx := context.Background() + req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "ns", Name: "my-bundle"}} + result, err := r.Reconcile(ctx, req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result != (ctrl.Result{}) { + t.Errorf("expected empty result, got %v", result) + } + + // Object should be cached after first observation + _, exists := r.cache.Get(req.NamespacedName) + if !exists { + t.Error("expected bundle to be in cache after first observation") + } + + key := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "my-bundle"} + stats := globalStatsTracker.stats[key] + if stats == nil || stats.Counts[EventTypeCreate] != 1 { + t.Error("expected Create event to be recorded on first observation") + } +} + +func TestBundleMonitorReconciler_Deletion(t *testing.T) { + sch := newBundleTestScheme(t) + now := metav1.Now() + bundle := fleet.Bundle{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-bundle", + Namespace: "ns", + ResourceVersion: "1", + DeletionTimestamp: &now, + Finalizers: []string{"fleet.cattle.io/bundle-finalizer"}, + }, + } + r, cleanup := newBundleReconciler(t, sch, nil, bundle) + defer cleanup() + + // Pre-populate cache to simulate a prior observation + cacheKey := types.NamespacedName{Namespace: "ns", Name: "my-bundle"} + r.cache.Set(cacheKey, bundle.DeepCopy()) + + ctx := context.Background() + req := reconcile.Request{NamespacedName: cacheKey} + _, err := r.Reconcile(ctx, req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Cache entry should be removed on deletion + _, exists := 
r.cache.Get(cacheKey) + if exists { + t.Error("expected bundle to be removed from cache after deletion") + } + + statsKey := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "my-bundle"} + stats := globalStatsTracker.stats[statsKey] + if stats == nil || stats.Counts[EventTypeDeletion] != 1 { + t.Error("expected Deletion event to be recorded") + } +} + +func TestBundleMonitorReconciler_GenerationChange(t *testing.T) { + sch := newBundleTestScheme(t) + bundle := fleet.Bundle{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-bundle", + Namespace: "ns", + ResourceVersion: "2", + Generation: 2, + }, + } + r, cleanup := newBundleReconciler(t, sch, nil, bundle) + defer cleanup() + + // Put old version (generation=1) in cache + cacheKey := types.NamespacedName{Namespace: "ns", Name: "my-bundle"} + oldBundle := bundle.DeepCopy() + oldBundle.Generation = 1 + r.cache.Set(cacheKey, oldBundle) + + ctx := context.Background() + req := reconcile.Request{NamespacedName: cacheKey} + _, err := r.Reconcile(ctx, req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Cache should be updated to the new generation + cached, exists := r.cache.Get(cacheKey) + if !exists { + t.Fatal("expected bundle to remain in cache after update") + } + if cached.(*fleet.Bundle).Generation != 2 { + t.Errorf("expected generation 2 in cache, got %d", cached.(*fleet.Bundle).Generation) + } + + statsKey := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "my-bundle"} + stats := globalStatsTracker.stats[statsKey] + if stats == nil || stats.Counts[EventTypeGenerationChange] != 1 { + t.Errorf("expected GenerationChange event to be recorded, stats = %+v", stats) + } +} + +func TestBundleMonitorReconciler_StatusChange(t *testing.T) { + sch := newBundleTestScheme(t) + bundle := fleet.Bundle{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-bundle", + Namespace: "ns", + ResourceVersion: "2", + Generation: 1, + }, + Status: fleet.BundleStatus{ + Summary: fleet.BundleSummary{Ready: 3}, + 
}, + } + r, cleanup := newBundleReconciler(t, sch, nil, bundle) + defer cleanup() + + cacheKey := types.NamespacedName{Namespace: "ns", Name: "my-bundle"} + oldBundle := bundle.DeepCopy() + oldBundle.Status.Summary.Ready = 1 + r.cache.Set(cacheKey, oldBundle) + + ctx := context.Background() + req := reconcile.Request{NamespacedName: cacheKey} + _, err := r.Reconcile(ctx, req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + statsKey := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "my-bundle"} + stats := globalStatsTracker.stats[statsKey] + if stats == nil || stats.Counts[EventTypeStatusChange] != 1 { + t.Errorf("expected StatusChange event to be recorded, stats = %+v", stats) + } +} + +func TestBundleMonitorReconciler_AnnotationChange(t *testing.T) { + sch := newBundleTestScheme(t) + bundle := fleet.Bundle{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-bundle", + Namespace: "ns", + ResourceVersion: "2", + Generation: 1, + Annotations: map[string]string{"key": "new-value"}, + }, + } + r, cleanup := newBundleReconciler(t, sch, nil, bundle) + defer cleanup() + + cacheKey := types.NamespacedName{Namespace: "ns", Name: "my-bundle"} + oldBundle := bundle.DeepCopy() + oldBundle.Annotations["key"] = "old-value" + r.cache.Set(cacheKey, oldBundle) + + ctx := context.Background() + req := reconcile.Request{NamespacedName: cacheKey} + _, err := r.Reconcile(ctx, req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + statsKey := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "my-bundle"} + stats := globalStatsTracker.stats[statsKey] + if stats == nil || stats.Counts[EventTypeAnnotationChange] != 1 { + t.Errorf("expected AnnotationChange event to be recorded, stats = %+v", stats) + } +} + +func TestBundleMonitorReconciler_LabelChange(t *testing.T) { + sch := newBundleTestScheme(t) + bundle := fleet.Bundle{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-bundle", + Namespace: "ns", + ResourceVersion: "2", + Generation: 1, + 
Labels: map[string]string{"env": "production"}, + }, + } + r, cleanup := newBundleReconciler(t, sch, nil, bundle) + defer cleanup() + + cacheKey := types.NamespacedName{Namespace: "ns", Name: "my-bundle"} + oldBundle := bundle.DeepCopy() + oldBundle.Labels["env"] = "staging" + r.cache.Set(cacheKey, oldBundle) + + ctx := context.Background() + req := reconcile.Request{NamespacedName: cacheKey} + _, err := r.Reconcile(ctx, req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + statsKey := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "my-bundle"} + stats := globalStatsTracker.stats[statsKey] + if stats == nil || stats.Counts[EventTypeLabelChange] != 1 { + t.Errorf("expected LabelChange event to be recorded, stats = %+v", stats) + } +} + +func TestBundleMonitorReconciler_NoChange(t *testing.T) { + sch := newBundleTestScheme(t) + bundle := fleet.Bundle{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-bundle", + Namespace: "ns", + ResourceVersion: "1", + Generation: 1, + }, + } + r, cleanup := newBundleReconciler(t, sch, nil, bundle) + defer cleanup() + + cacheKey := types.NamespacedName{Namespace: "ns", Name: "my-bundle"} + r.cache.Set(cacheKey, bundle.DeepCopy()) + + ctx := context.Background() + req := reconcile.Request{NamespacedName: cacheKey} + _, err := r.Reconcile(ctx, req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // No change events expected + statsKey := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "my-bundle"} + stats := globalStatsTracker.stats[statsKey] + if stats != nil && stats.Total > 0 { + t.Errorf("expected no events recorded for unchanged bundle, got total=%d", stats.Total) + } +} diff --git a/internal/cmd/monitor/reconciler/bundle_query.go b/internal/cmd/monitor/reconciler/bundle_query.go new file mode 100644 index 0000000000..612b137173 --- /dev/null +++ b/internal/cmd/monitor/reconciler/bundle_query.go @@ -0,0 +1,257 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + 
+import ( + "context" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "github.com/rancher/fleet/internal/cmd/controller/target/matcher" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// BundleQuery interface for mapping clusters to bundles +// Copied from internal/cmd/controller/reconciler/bundle_controller.go +type BundleQuery interface { + // BundlesForCluster is used to map from a cluster to bundles + // Returns: bundlesToRefresh, bundlesToCleanup, error + BundlesForCluster(context.Context, *fleet.Cluster) ([]*fleet.Bundle, []*fleet.Bundle, error) +} + +// bundleQueryImpl implements BundleQuery for the monitor +type bundleQueryImpl struct { + client client.Client +} + +// NewBundleQuery creates a new BundleQuery implementation +func NewBundleQuery(c client.Client) BundleQuery { + return &bundleQueryImpl{client: c} +} + +// BundlesForCluster returns bundles affected by cluster changes +// Adapted from internal/cmd/controller/target/query.go:16-44 +func (q *bundleQueryImpl) BundlesForCluster(ctx context.Context, cluster *fleet.Cluster) (bundlesToRefresh, bundlesToCleanup []*fleet.Bundle, err error) { + bundles, err := q.getBundlesInScopeForCluster(ctx, cluster) + if err != nil { + return nil, nil, err + } + + logger := log.FromContext(ctx).WithName("bundle-query") + for _, bundle := range bundles { + bm, err := matcher.New(bundle) + if err != nil { + logger.Error(err, "ignore bad app bundle", "namespace", bundle.Namespace, "name", bundle.Name) + continue + } + + cgs, err := q.clusterGroupsForCluster(ctx, cluster) + if err != nil { + return nil, nil, err + } + + match := bm.Match(cluster.Name, clusterGroupsToLabelMap(cgs), cluster.Labels) + if match != nil { + bundlesToRefresh = append(bundlesToRefresh, 
bundle) + } else { + bundlesToCleanup = append(bundlesToCleanup, bundle) + } + } + + return +} + +// getBundlesInScopeForCluster returns all bundles that could target this cluster +// Adapted from internal/cmd/controller/target/query.go:46-89 +func (q *bundleQueryImpl) getBundlesInScopeForCluster(ctx context.Context, cluster *fleet.Cluster) ([]*fleet.Bundle, error) { + bundleSet := newBundleSet() + + // All bundles in the cluster namespace are in scope + // except for agent bundles of other clusters + bundles := &fleet.BundleList{} + err := q.client.List(ctx, bundles, client.InNamespace(cluster.Namespace)) + if err != nil { + return nil, err + } + for _, b := range bundles.Items { + b := b + if b.Annotations["objectset.rio.cattle.io/id"] == "fleet-manage-agent" { + if b.Name == "fleet-agent-"+cluster.Name { + bundleSet.insertSingle(&b) + } + } else { + bundleSet.insertSingle(&b) + } + } + + // Handle BundleNamespaceMapping for cross-namespace bundles + mappings := &fleet.BundleNamespaceMappingList{} + err = q.client.List(ctx, mappings) + if err != nil { + return nil, err + } + + logger := log.FromContext(ctx).WithName("bundle-query") + for _, mapping := range mappings.Items { + mapping := mapping + matcher, err := newBundleMapping(&mapping) + if err != nil { + logger.Error(err, "invalid BundleNamespaceMapping, skipping", "namespace", mapping.Namespace, "name", mapping.Name) + continue + } + if !matcher.MatchesNamespace(ctx, q.client, cluster.Namespace) { + continue + } + if err := bundleSet.insert(matcher.Bundles(ctx, q.client)); err != nil { + return nil, err + } + } + + return bundleSet.bundles(), nil +} + +// clusterGroupsForCluster returns ClusterGroups that match this cluster +// Adapted from internal/cmd/controller/target/query.go:91-116 +func (q *bundleQueryImpl) clusterGroupsForCluster(ctx context.Context, cluster *fleet.Cluster) (result []*fleet.ClusterGroup, _ error) { + cgs := &fleet.ClusterGroupList{} + err := q.client.List(ctx, cgs, 
client.InNamespace(cluster.Namespace)) + if err != nil { + return nil, err + } + + logger := log.FromContext(ctx).WithName("bundle-query") + for _, cg := range cgs.Items { + cg := cg + if cg.Spec.Selector == nil { + continue + } + sel, err := metav1.LabelSelectorAsSelector(cg.Spec.Selector) + if err != nil { + logger.Error(err, "invalid selector on clusterGroup", "namespace", cg.Namespace, "name", cg.Name, + "selector", cg.Spec.Selector) + continue + } + if sel.Matches(labels.Set(cluster.Labels)) { + result = append(result, &cg) + } + } + + return result, nil +} + +// clusterGroupsToLabelMap converts cluster groups to label map format +// Copied from internal/cmd/controller/target/query.go:118-124 +func clusterGroupsToLabelMap(cgs []*fleet.ClusterGroup) map[string]map[string]string { + result := map[string]map[string]string{} + for _, cg := range cgs { + result[cg.Name] = cg.Labels + } + return result +} + +// Bundle set helper - adapted from internal/cmd/controller/target/mapping.go:95-130 + +type bundleSet struct { + bundleKeys sets.Set[string] + bundleMap map[string]*fleet.Bundle +} + +func newBundleSet() *bundleSet { + return &bundleSet{ + bundleKeys: sets.New[string](), + bundleMap: map[string]*fleet.Bundle{}, + } +} + +func (b *bundleSet) bundles() []*fleet.Bundle { + var result []*fleet.Bundle + // list is sorted + for _, key := range sets.List(b.bundleKeys) { + result = append(result, b.bundleMap[key]) + } + return result +} + +func (b *bundleSet) insert(bundles []*fleet.Bundle, err error) error { + if err != nil { + return err + } + for _, bundle := range bundles { + b.insertSingle(bundle) + } + return nil +} + +func (b *bundleSet) insertSingle(bundle *fleet.Bundle) { + key := bundle.Namespace + "/" + bundle.Name + b.bundleMap[key] = bundle + b.bundleKeys.Insert(key) +} + +// BundleMapping helper - adapted from internal/cmd/controller/target/mapping.go:16-93 + +// BundleMapping is created from a BundleNamespaceMapping resource +type BundleMapping struct { 
+ namespace string + namespaceSelector labels.Selector + bundleSelector labels.Selector + noMatch bool +} + +func newBundleMapping(mapping *fleet.BundleNamespaceMapping) (*BundleMapping, error) { + var ( + result = &BundleMapping{ + namespace: mapping.Namespace, + } + err error + ) + + if mapping.BundleSelector == nil || mapping.NamespaceSelector == nil { + result.noMatch = true + return result, nil + } + + result.bundleSelector, err = metav1.LabelSelectorAsSelector(mapping.BundleSelector) + if err != nil { + return nil, err + } + + result.namespaceSelector, err = metav1.LabelSelectorAsSelector(mapping.NamespaceSelector) + if err != nil { + return nil, err + } + + return result, nil +} + +func (b *BundleMapping) Bundles(ctx context.Context, c client.Client) ([]*fleet.Bundle, error) { + if b.noMatch { + return nil, nil + } + list := &fleet.BundleList{} + err := c.List(ctx, list, client.InNamespace(b.namespace), client.MatchingLabelsSelector{Selector: b.bundleSelector}) + + bundles := make([]*fleet.Bundle, len(list.Items)) + for i := range list.Items { + bundles[i] = &list.Items[i] + } + return bundles, err +} + +func (b *BundleMapping) MatchesNamespace(ctx context.Context, c client.Client, namespace string) bool { + if b.noMatch { + return false + } + ns := &corev1.Namespace{} + err := c.Get(ctx, types.NamespacedName{Name: namespace}, ns) + if err != nil { + return false + } + return b.namespaceSelector.Matches(labels.Set(ns.Labels)) +} diff --git a/internal/cmd/monitor/reconciler/bundledeployment_monitor.go b/internal/cmd/monitor/reconciler/bundledeployment_monitor.go new file mode 100644 index 0000000000..36cede5df7 --- /dev/null +++ b/internal/cmd/monitor/reconciler/bundledeployment_monitor.go @@ -0,0 +1,109 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "context" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "github.com/rancher/fleet/pkg/sharding" + + "k8s.io/apimachinery/pkg/runtime" + ctrl 
"sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// BundleDeploymentMonitorReconciler monitors BundleDeployment reconciliations +type BundleDeploymentMonitorReconciler struct { + client.Client + Scheme *runtime.Scheme + ShardID string + Workers int + + // Cache to store previous state + cache *ObjectCache + + // Per-controller logging mode + DetailedLogs bool + EventFilters EventTypeFilters + ResourceFilter *ResourceFilter +} + +// SetupWithManager sets up the controller - IDENTICAL to BundleDeploymentReconciler.SetupWithManager +func (r *BundleDeploymentMonitorReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.cache = NewObjectCache() + + return ctrl.NewControllerManagedBy(mgr). + For(&fleet.BundleDeployment{}, builder.WithPredicates( + bundleDeploymentStatusChangedPredicate(), + )). + WithEventFilter(sharding.FilterByShardID(r.ShardID)). + WithOptions(controller.Options{MaxConcurrentReconciles: r.Workers}). 
+ Complete(r) +} + +// Reconcile monitors bundledeployment reconciliation events (READ-ONLY) +func (r *BundleDeploymentMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Check resource filter - skip if resource doesn't match + if !r.ResourceFilter.Matches(req.Namespace, req.Name) { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx).WithName("bundledeployment-monitor") + logger = logger.WithValues( + "bundledeployment", req.NamespacedName.String(), + ) + ctx = log.IntoContext(ctx, logger) + + bd := &fleet.BundleDeployment{} + if err := r.Get(ctx, req.NamespacedName, bd); err != nil { + if client.IgnoreNotFound(err) == nil { + logNotFound(logger, r.DetailedLogs, r.EventFilters, "BundleDeployment", req.Namespace, req.Name) + r.cache.Delete(req.NamespacedName) + } + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // Add bundle context if available from labels + if bd.Labels != nil { + bundleNS := bd.Labels["fleet.cattle.io/bundle-namespace"] + bundleName := bd.Labels["fleet.cattle.io/bundle"] + if bundleNS != "" && bundleName != "" { + logger = logger.WithValues( + "bundle", bundleNS+"/"+bundleName, + ) + } + } + + // Check for deletion + if !bd.DeletionTimestamp.IsZero() { + logDeletion(logger, r.DetailedLogs, r.EventFilters, "BundleDeployment", bd.Namespace, bd.Name, bd.DeletionTimestamp.String()) + r.cache.Delete(req.NamespacedName) + return ctrl.Result{}, nil + } + + // Retrieve old object from cache + oldBD, exists := r.cache.Get(req.NamespacedName) + if !exists { + logCreate(logger, r.DetailedLogs, r.EventFilters, "BundleDeployment", bd.Namespace, bd.Name, bd.Generation, bd.ResourceVersion) + r.cache.Set(req.NamespacedName, bd.DeepCopy()) + return ctrl.Result{}, nil + } + + oldBDTyped := oldBD.(*fleet.BundleDeployment) + + // Detect what changed + logSpecChange(logger, r.DetailedLogs, r.EventFilters, "BundleDeployment", bd.Namespace, bd.Name, oldBDTyped.Spec, bd.Spec, oldBDTyped.Generation, 
bd.Generation) + logStatusChange(logger, r.DetailedLogs, r.EventFilters, "BundleDeployment", bd.Namespace, bd.Name, oldBDTyped.Status, bd.Status) + logResourceVersionChangeWithMetadata(logger, r.DetailedLogs, r.EventFilters, "BundleDeployment", bd.Namespace, bd.Name, oldBDTyped, bd) + logAnnotationChange(logger, r.DetailedLogs, r.EventFilters, "BundleDeployment", bd.Namespace, bd.Name, oldBDTyped.Annotations, bd.Annotations) + logLabelChange(logger, r.DetailedLogs, r.EventFilters, "BundleDeployment", bd.Namespace, bd.Name, oldBDTyped.Labels, bd.Labels) + + // Update cache with new state + r.cache.Set(req.NamespacedName, bd.DeepCopy()) + + return ctrl.Result{}, nil +} diff --git a/internal/cmd/monitor/reconciler/cache.go b/internal/cmd/monitor/reconciler/cache.go new file mode 100644 index 0000000000..32b1d55543 --- /dev/null +++ b/internal/cmd/monitor/reconciler/cache.go @@ -0,0 +1,45 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "sync" + + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ObjectCache stores previous versions of objects for comparison +type ObjectCache struct { + mu sync.RWMutex + cache map[types.NamespacedName]client.Object +} + +// NewObjectCache creates a new ObjectCache +func NewObjectCache() *ObjectCache { + return &ObjectCache{ + cache: make(map[types.NamespacedName]client.Object), + } +} + +// Get retrieves an object from the cache +func (c *ObjectCache) Get(key types.NamespacedName) (client.Object, bool) { + c.mu.RLock() + defer c.mu.RUnlock() + obj, exists := c.cache[key] + return obj, exists +} + +// Set stores an object in the cache +func (c *ObjectCache) Set(key types.NamespacedName, obj client.Object) { + c.mu.Lock() + defer c.mu.Unlock() + c.cache[key] = obj +} + +// Delete removes an object from the cache +func (c *ObjectCache) Delete(key types.NamespacedName) { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.cache, key) +} diff --git 
a/internal/cmd/monitor/reconciler/cache_test.go b/internal/cmd/monitor/reconciler/cache_test.go new file mode 100644 index 0000000000..6fbde625b9 --- /dev/null +++ b/internal/cmd/monitor/reconciler/cache_test.go @@ -0,0 +1,117 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "testing" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +func TestObjectCache_GetFromEmpty(t *testing.T) { + cache := NewObjectCache() + key := types.NamespacedName{Namespace: "ns", Name: "name"} + _, exists := cache.Get(key) + if exists { + t.Error("expected false on Get from empty cache") + } +} + +func TestObjectCache_SetAndGet(t *testing.T) { + cache := NewObjectCache() + key := types.NamespacedName{Namespace: "ns", Name: "name"} + bundle := &fleet.Bundle{ + ObjectMeta: metav1.ObjectMeta{Name: "name", Namespace: "ns"}, + } + cache.Set(key, bundle) + + got, exists := cache.Get(key) + if !exists { + t.Fatal("expected true on Get after Set") + } + if got.GetName() != bundle.Name || got.GetNamespace() != bundle.Namespace { + t.Errorf("got %s/%s, want %s/%s", got.GetNamespace(), got.GetName(), bundle.Namespace, bundle.Name) + } +} + +func TestObjectCache_Delete(t *testing.T) { + cache := NewObjectCache() + key := types.NamespacedName{Namespace: "ns", Name: "name"} + bundle := &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{Name: "name", Namespace: "ns"}} + cache.Set(key, bundle) + + cache.Delete(key) + + _, exists := cache.Get(key) + if exists { + t.Error("expected false on Get after Delete") + } +} + +func TestObjectCache_DeleteMissingKey(t *testing.T) { + cache := NewObjectCache() + key := types.NamespacedName{Namespace: "ns", Name: "does-not-exist"} + // Should not panic + cache.Delete(key) +} + +func TestObjectCache_OverwriteExisting(t *testing.T) { + cache := NewObjectCache() + key := types.NamespacedName{Namespace: "ns", Name: "name"} + + original := 
&fleet.Bundle{ObjectMeta: metav1.ObjectMeta{Name: "name", Namespace: "ns", ResourceVersion: "1"}} + updated := &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{Name: "name", Namespace: "ns", ResourceVersion: "2"}} + + cache.Set(key, original) + cache.Set(key, updated) + + got, exists := cache.Get(key) + if !exists { + t.Fatal("expected entry to exist") + } + if got.GetResourceVersion() != "2" { + t.Errorf("got ResourceVersion %q, want %q", got.GetResourceVersion(), "2") + } +} + +func TestObjectCache_MultipleKeys(t *testing.T) { + cache := NewObjectCache() + keys := []types.NamespacedName{ + {Namespace: "ns", Name: "a"}, + {Namespace: "ns", Name: "b"}, + {Namespace: "other-ns", Name: "a"}, + } + + for _, k := range keys { + bundle := &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{Name: k.Name, Namespace: k.Namespace}} + cache.Set(k, bundle) + } + + for _, k := range keys { + got, exists := cache.Get(k) + if !exists { + t.Errorf("expected key %v to exist", k) + continue + } + if got.GetName() != k.Name || got.GetNamespace() != k.Namespace { + t.Errorf("got %s/%s, want %s/%s", got.GetNamespace(), got.GetName(), k.Namespace, k.Name) + } + } +} + +func TestObjectCache_IndependentKeys(t *testing.T) { + cache := NewObjectCache() + keyA := types.NamespacedName{Namespace: "ns", Name: "a"} + keyB := types.NamespacedName{Namespace: "ns", Name: "b"} + + cache.Set(keyA, &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{Name: "a", Namespace: "ns"}}) + cache.Delete(keyA) + + // Deleting A should not affect B (which was never set) + _, exists := cache.Get(keyB) + if exists { + t.Error("expected B to not exist after deleting A") + } +} diff --git a/internal/cmd/monitor/reconciler/cluster_monitor.go b/internal/cmd/monitor/reconciler/cluster_monitor.go new file mode 100644 index 0000000000..58046e18bf --- /dev/null +++ b/internal/cmd/monitor/reconciler/cluster_monitor.go @@ -0,0 +1,175 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "context" + "reflect" + + fleet 
"github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "github.com/rancher/fleet/pkg/sharding" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +// ClusterMonitorReconciler monitors Cluster reconciliations +type ClusterMonitorReconciler struct { + client.Client + Scheme *runtime.Scheme + ShardID string + Workers int + + // Cache to store previous state + cache *ObjectCache + + // Per-controller logging mode + DetailedLogs bool + EventFilters EventTypeFilters + ResourceFilter *ResourceFilter +} + +// SetupWithManager sets up the controller with the Manager - IDENTICAL to ClusterReconciler.SetupWithManager +func (r *ClusterMonitorReconciler) SetupWithManager(mgr ctrl.Manager) error { + // Initialize cache + r.cache = NewObjectCache() + + return ctrl.NewControllerManagedBy(mgr). + For(&fleet.Cluster{}). + // Watch bundledeployments so we can update the status fields + Watches( + &fleet.BundleDeployment{}, + handler.EnqueueRequestsFromMapFunc(r.mapBundleDeploymentToCluster), + builder.WithPredicates(predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return true + }, + // Triggering on every update would run into an + // endless loop with the agentmanagement + // cluster controller. + // We still need to update often enough to keep the + // status fields up to date. 
+ UpdateFunc: func(e event.UpdateEvent) bool { + n := e.ObjectNew.(*fleet.BundleDeployment) + o := e.ObjectOld.(*fleet.BundleDeployment) + if n == nil || o == nil { + return false + } + if !reflect.DeepEqual(n.Spec, o.Spec) { + return true + } + if n.Status.AppliedDeploymentID != o.Status.AppliedDeploymentID { + return true + } + if n.Status.Ready != o.Status.Ready { + return true + } + return false + }, + DeleteFunc: func(e event.DeleteEvent) bool { + o := e.Object.(*fleet.BundleDeployment) + if o == nil || o.Status.AppliedDeploymentID == "" { + return false + } + return true + }, + }), + ). + WithEventFilter(sharding.FilterByShardID(r.ShardID)). + WithOptions(controller.Options{MaxConcurrentReconciles: r.Workers}). + Complete(r) +} + +// Reconcile monitors cluster reconciliation events (read-only) +func (r *ClusterMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Check resource filter - skip if resource doesn't match + if !r.ResourceFilter.Matches(req.Namespace, req.Name) { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx).WithName("cluster-monitor") + logger = logger.WithValues( + "cluster", req.NamespacedName.String(), + ) + ctx = log.IntoContext(ctx, logger) + + cluster := &fleet.Cluster{} + err := r.Get(ctx, req.NamespacedName, cluster) + if err != nil { + if apierrors.IsNotFound(err) { + logNotFound(logger, r.DetailedLogs, r.EventFilters, "Cluster", req.Namespace, req.Name) + r.cache.Delete(req.NamespacedName) + } + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // Check for deletion + if !cluster.DeletionTimestamp.IsZero() { + logDeletion(logger, r.DetailedLogs, r.EventFilters, "Cluster", cluster.Namespace, cluster.Name, cluster.DeletionTimestamp.String()) + r.cache.Delete(req.NamespacedName) + return ctrl.Result{}, nil + } + + // Retrieve old object from cache + oldCluster, exists := r.cache.Get(req.NamespacedName) + if !exists { + logCreate(logger, r.DetailedLogs, r.EventFilters, 
"Cluster", cluster.Namespace, cluster.Name, cluster.Generation, cluster.ResourceVersion) + r.cache.Set(req.NamespacedName, cluster.DeepCopy()) + return ctrl.Result{}, nil + } + + oldClusterTyped := oldCluster.(*fleet.Cluster) + + // Detect what changed + logSpecChange(logger, r.DetailedLogs, r.EventFilters, "Cluster", cluster.Namespace, cluster.Name, oldClusterTyped.Spec, cluster.Spec, oldClusterTyped.Generation, cluster.Generation) + logStatusChange(logger, r.DetailedLogs, r.EventFilters, "Cluster", cluster.Namespace, cluster.Name, oldClusterTyped.Status, cluster.Status) + logResourceVersionChangeWithMetadata(logger, r.DetailedLogs, r.EventFilters, "Cluster", cluster.Namespace, cluster.Name, oldClusterTyped, cluster) + logAnnotationChange(logger, r.DetailedLogs, r.EventFilters, "Cluster", cluster.Namespace, cluster.Name, oldClusterTyped.Annotations, cluster.Annotations) + logLabelChange(logger, r.DetailedLogs, r.EventFilters, "Cluster", cluster.Namespace, cluster.Name, oldClusterTyped.Labels, cluster.Labels) + + // Update cache with new state + r.cache.Set(req.NamespacedName, cluster.DeepCopy()) + + return ctrl.Result{}, nil +} + +// mapBundleDeploymentToCluster maps BundleDeployment to Cluster - identical to cluster_controller.go +func (r *ClusterMonitorReconciler) mapBundleDeploymentToCluster(ctx context.Context, a client.Object) []ctrl.Request { + clusterNS := &corev1.Namespace{} + err := r.Get(ctx, types.NamespacedName{Name: a.GetNamespace()}, clusterNS) + if err != nil { + return nil + } + + ns := clusterNS.Annotations[fleet.ClusterNamespaceAnnotation] + name := clusterNS.Annotations[fleet.ClusterAnnotation] + if ns == "" || name == "" { + return nil + } + + // Check resource filter before logging + if !r.ResourceFilter.Matches(ns, name) { + return nil + } + + // Log trigger source + logger := log.FromContext(ctx).WithName("cluster-monitor-handler") + logRelatedResourceTrigger(logger, r.DetailedLogs, r.EventFilters, "Cluster", ns, name, "BundleDeployment", 
a.GetName(), a.GetNamespace()) + + return []ctrl.Request{{ + NamespacedName: types.NamespacedName{ + Namespace: ns, + Name: name, + }, + }} +} diff --git a/internal/cmd/monitor/reconciler/filter.go b/internal/cmd/monitor/reconciler/filter.go new file mode 100644 index 0000000000..06641816a3 --- /dev/null +++ b/internal/cmd/monitor/reconciler/filter.go @@ -0,0 +1,130 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "fmt" + "regexp" +) + +// EventTypeFilters controls which event types produce detailed logs +type EventTypeFilters struct { + GenerationChange bool // generation-change events + StatusChange bool // status-change events + AnnotationChange bool // annotation-change events + LabelChange bool // label-change events + ResourceVersionChange bool // resourceversion-change events + Deletion bool // deletion events + NotFound bool // not-found events + Create bool // create events + TriggeredBy bool // triggered-by events +} + +// IsEmpty returns true if no filters were explicitly set (use all events) +func (f EventTypeFilters) IsEmpty() bool { + return !f.GenerationChange && + !f.StatusChange && + !f.AnnotationChange && + !f.LabelChange && + !f.ResourceVersionChange && + !f.Deletion && + !f.NotFound && + !f.Create && + !f.TriggeredBy +} + +// ShouldLog returns true if the given event type should produce detailed logs +func (f EventTypeFilters) ShouldLog(eventType EventType) bool { + // If no filters set, log everything (backwards compatible) + if f.IsEmpty() { + return true + } + + switch eventType { + case EventTypeGenerationChange: + return f.GenerationChange + case EventTypeStatusChange: + return f.StatusChange + case EventTypeAnnotationChange: + return f.AnnotationChange + case EventTypeLabelChange: + return f.LabelChange + case EventTypeResourceVersionChange: + return f.ResourceVersionChange + case EventTypeDeletion: + return f.Deletion + case EventTypeNotFound: + return f.NotFound + case EventTypeCreate: + return f.Create + 
default: + return true // Unknown event types always logged + } +} + +// ShouldLogTrigger returns true if triggered-by events should produce detailed logs +func (f EventTypeFilters) ShouldLogTrigger() bool { + if f.IsEmpty() { + return true + } + return f.TriggeredBy +} + +// ResourceFilter defines namespace/name patterns for filtering monitored resources +type ResourceFilter struct { + // NamespacePattern is a regular expression for matching resource namespaces + // Empty string matches all namespaces + NamespacePattern string + + // NamePattern is a regular expression for matching resource names + // Empty string matches all names + NamePattern string + + // Compiled regex patterns (internal use) + namespaceRegex *regexp.Regexp + nameRegex *regexp.Regexp +} + +// Matches returns true if the resource namespace and name match the filter +// If filter is nil or both patterns are empty, returns true (match all) +func (f *ResourceFilter) Matches(namespace, name string) bool { + if f == nil { + return true + } + + // If both patterns are empty, match everything (backwards compatible) + if f.NamespacePattern == "" && f.NamePattern == "" { + return true + } + + // Empty patterns match everything + namespaceMatch := f.NamespacePattern == "" || (f.namespaceRegex != nil && f.namespaceRegex.MatchString(namespace)) + nameMatch := f.NamePattern == "" || (f.nameRegex != nil && f.nameRegex.MatchString(name)) + + return namespaceMatch && nameMatch +} + +// Compile prepares the regex patterns for use +// Returns error if any pattern is invalid +func (f *ResourceFilter) Compile() error { + if f == nil { + return nil + } + + var err error + if f.NamespacePattern != "" { + f.namespaceRegex, err = regexp.Compile(f.NamespacePattern) + if err != nil { + return fmt.Errorf("invalid namespace pattern %q: %w", f.NamespacePattern, err) + } + } + + if f.NamePattern != "" { + f.nameRegex, err = regexp.Compile(f.NamePattern) + if err != nil { + return fmt.Errorf("invalid name pattern %q: %w", 
f.NamePattern, err) + } + } + + return nil +} diff --git a/internal/cmd/monitor/reconciler/filter_test.go b/internal/cmd/monitor/reconciler/filter_test.go new file mode 100644 index 0000000000..2ed2ba01d4 --- /dev/null +++ b/internal/cmd/monitor/reconciler/filter_test.go @@ -0,0 +1,285 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import "testing" + +func TestEventTypeFilters_IsEmpty(t *testing.T) { + tests := []struct { + name string + filters EventTypeFilters + want bool + }{ + {name: "zero value", filters: EventTypeFilters{}, want: true}, + {name: "GenerationChange set", filters: EventTypeFilters{GenerationChange: true}, want: false}, + {name: "StatusChange set", filters: EventTypeFilters{StatusChange: true}, want: false}, + {name: "AnnotationChange set", filters: EventTypeFilters{AnnotationChange: true}, want: false}, + {name: "LabelChange set", filters: EventTypeFilters{LabelChange: true}, want: false}, + {name: "ResourceVersionChange set", filters: EventTypeFilters{ResourceVersionChange: true}, want: false}, + {name: "Deletion set", filters: EventTypeFilters{Deletion: true}, want: false}, + {name: "NotFound set", filters: EventTypeFilters{NotFound: true}, want: false}, + {name: "Create set", filters: EventTypeFilters{Create: true}, want: false}, + {name: "TriggeredBy set", filters: EventTypeFilters{TriggeredBy: true}, want: false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := tt.filters.IsEmpty(); got != tt.want { + t.Errorf("IsEmpty() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestEventTypeFilters_ShouldLog_EmptyFiltersLogAll(t *testing.T) { + f := EventTypeFilters{} + eventTypes := []EventType{ + EventTypeGenerationChange, + EventTypeStatusChange, + EventTypeAnnotationChange, + EventTypeLabelChange, + EventTypeResourceVersionChange, + EventTypeDeletion, + EventTypeNotFound, + EventTypeCreate, + } + for _, et := range eventTypes { + if !f.ShouldLog(et) { + t.Errorf("empty filters: 
ShouldLog(%q) = false, want true", et) + } + } +} + +func TestEventTypeFilters_ShouldLog_SpecificFilters(t *testing.T) { + tests := []struct { + name string + filters EventTypeFilters + eventType EventType + want bool + }{ + { + name: "GenerationChange enabled, query generation-change", + filters: EventTypeFilters{GenerationChange: true}, + eventType: EventTypeGenerationChange, + want: true, + }, + { + name: "GenerationChange enabled, query status-change", + filters: EventTypeFilters{GenerationChange: true}, + eventType: EventTypeStatusChange, + want: false, + }, + { + name: "StatusChange enabled", + filters: EventTypeFilters{StatusChange: true}, + eventType: EventTypeStatusChange, + want: true, + }, + { + name: "AnnotationChange enabled", + filters: EventTypeFilters{AnnotationChange: true}, + eventType: EventTypeAnnotationChange, + want: true, + }, + { + name: "LabelChange enabled", + filters: EventTypeFilters{LabelChange: true}, + eventType: EventTypeLabelChange, + want: true, + }, + { + name: "ResourceVersionChange enabled", + filters: EventTypeFilters{ResourceVersionChange: true}, + eventType: EventTypeResourceVersionChange, + want: true, + }, + { + name: "Deletion enabled", + filters: EventTypeFilters{Deletion: true}, + eventType: EventTypeDeletion, + want: true, + }, + { + name: "NotFound enabled", + filters: EventTypeFilters{NotFound: true}, + eventType: EventTypeNotFound, + want: true, + }, + { + name: "Create enabled", + filters: EventTypeFilters{Create: true}, + eventType: EventTypeCreate, + want: true, + }, + { + name: "unknown event type always logged when filters set", + filters: EventTypeFilters{GenerationChange: true}, + eventType: EventType("unknown"), + want: true, + }, + { + name: "multiple filters, only one matches", + filters: EventTypeFilters{GenerationChange: true, StatusChange: true}, + eventType: EventTypeAnnotationChange, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := 
tt.filters.ShouldLog(tt.eventType); got != tt.want { + t.Errorf("ShouldLog(%q) = %v, want %v", tt.eventType, got, tt.want) + } + }) + } +} + +func TestEventTypeFilters_ShouldLogTrigger(t *testing.T) { + tests := []struct { + name string + filters EventTypeFilters + want bool + }{ + {name: "empty filters log all", filters: EventTypeFilters{}, want: true}, + {name: "TriggeredBy true", filters: EventTypeFilters{TriggeredBy: true}, want: true}, + {name: "only other filters set, TriggeredBy false", filters: EventTypeFilters{GenerationChange: true}, want: false}, + {name: "TriggeredBy true with other filters", filters: EventTypeFilters{GenerationChange: true, TriggeredBy: true}, want: true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := tt.filters.ShouldLogTrigger(); got != tt.want { + t.Errorf("ShouldLogTrigger() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestResourceFilter_Compile(t *testing.T) { + tests := []struct { + name string + filter *ResourceFilter + wantError bool + }{ + {name: "nil filter", filter: nil, wantError: false}, + {name: "empty filter", filter: &ResourceFilter{}, wantError: false}, + {name: "valid namespace pattern", filter: &ResourceFilter{NamespacePattern: "fleet-.*"}, wantError: false}, + {name: "valid name pattern", filter: &ResourceFilter{NamePattern: "my-app-.*"}, wantError: false}, + {name: "both valid patterns", filter: &ResourceFilter{NamespacePattern: "fleet-.*", NamePattern: "my-.*"}, wantError: false}, + {name: "invalid namespace pattern", filter: &ResourceFilter{NamespacePattern: "["}, wantError: true}, + {name: "invalid name pattern", filter: &ResourceFilter{NamePattern: "["}, wantError: true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.filter.Compile() + if (err != nil) != tt.wantError { + t.Errorf("Compile() error = %v, wantError %v", err, tt.wantError) + } + }) + } +} + +func TestResourceFilter_Matches(t *testing.T) { + tests := []struct { + name 
string + filter *ResourceFilter + namespace string + resName string + want bool + }{ + { + name: "nil filter matches all", + filter: nil, + namespace: "any-ns", + resName: "any-name", + want: true, + }, + { + name: "empty filter matches all", + filter: &ResourceFilter{}, + namespace: "any-ns", + resName: "any-name", + want: true, + }, + { + name: "namespace pattern matches", + filter: &ResourceFilter{NamespacePattern: "fleet-.*"}, + namespace: "fleet-local", + resName: "anything", + want: true, + }, + { + name: "namespace pattern no match", + filter: &ResourceFilter{NamespacePattern: "fleet-.*"}, + namespace: "default", + resName: "anything", + want: false, + }, + { + name: "name pattern matches", + filter: &ResourceFilter{NamePattern: "my-app"}, + namespace: "default", + resName: "my-app", + want: true, + }, + { + name: "name pattern no match", + filter: &ResourceFilter{NamePattern: "^my-app$"}, + namespace: "default", + resName: "other-app", + want: false, + }, + { + name: "both patterns match", + filter: &ResourceFilter{NamespacePattern: "fleet-.*", NamePattern: "my-.*"}, + namespace: "fleet-local", + resName: "my-bundle", + want: true, + }, + { + name: "namespace matches but name does not", + filter: &ResourceFilter{NamespacePattern: "fleet-.*", NamePattern: "^my-.*"}, + namespace: "fleet-local", + resName: "other-bundle", + want: false, + }, + { + name: "name matches but namespace does not", + filter: &ResourceFilter{NamespacePattern: "fleet-.*", NamePattern: "my-.*"}, + namespace: "default", + resName: "my-bundle", + want: false, + }, + { + name: "regex partial match (substring)", + filter: &ResourceFilter{NamePattern: "bundle"}, + namespace: "default", + resName: "my-bundle", + want: true, + }, + { + name: "only namespace pattern set, name matches all", + filter: &ResourceFilter{NamespacePattern: "fleet-.*"}, + namespace: "fleet-local", + resName: "any-name", + want: true, + }, + { + name: "only name pattern set, namespace matches all", + filter: 
&ResourceFilter{NamePattern: "specific"}, + namespace: "any-ns", + resName: "specific", + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.filter != nil { + if err := tt.filter.Compile(); err != nil { + t.Fatalf("Compile() unexpected error: %v", err) + } + } + got := tt.filter.Matches(tt.namespace, tt.resName) + if got != tt.want { + t.Errorf("Matches(%q, %q) = %v, want %v", tt.namespace, tt.resName, got, tt.want) + } + }) + } +} diff --git a/internal/cmd/monitor/reconciler/gitrepo_monitor.go b/internal/cmd/monitor/reconciler/gitrepo_monitor.go new file mode 100644 index 0000000000..0a74bc965a --- /dev/null +++ b/internal/cmd/monitor/reconciler/gitrepo_monitor.go @@ -0,0 +1,251 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "context" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "github.com/rancher/fleet/internal/config" + "github.com/rancher/fleet/pkg/sharding" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +// GitRepoMonitorReconciler monitors GitRepo reconciliations +type GitRepoMonitorReconciler struct { + client.Client + Scheme *runtime.Scheme + ShardID string + Workers int + + // Cache to store previous state + cache *ObjectCache + + // Per-controller logging mode + DetailedLogs bool + EventFilters EventTypeFilters + ResourceFilter *ResourceFilter +} + +// SetupWithManager sets up the controller - mirrors GitJobReconciler.SetupWithManager +func (r *GitRepoMonitorReconciler) SetupWithManager(mgr 
ctrl.Manager) error { + r.cache = NewObjectCache() + + return ctrl.NewControllerManagedBy(mgr). + For(&fleet.GitRepo{}, + builder.WithPredicates( + // do not trigger for GitRepo status changes (except for commit changes and cache sync) + predicate.Or( + TypedResourceVersionUnchangedPredicate[client.Object]{}, + predicate.GenerationChangedPredicate{}, + // Use nonSecretAnnotationChangedPredicate instead of predicate.AnnotationChangedPredicate + // to avoid redundant reconciles when the controller updates secret data hash + // tracking annotations (e.g., fleet.cattle.io/client-secret-hash). + nonSecretAnnotationChangedPredicate(), + predicate.LabelChangedPredicate{}, + commitChangedPredicate(), + ), + ), + ). + Owns(&batchv1.Job{}, builder.WithPredicates(jobUpdatedPredicate())). + Watches( + // Fan out from secret to gitrepo, reconcile gitrepos when a secret + // referenced in ClientSecretName, HelmSecretName, or HelmSecretNameForPaths changes. + &corev1.Secret{}, + handler.EnqueueRequestsFromMapFunc(r.secretMapFunc()), + builder.WithPredicates(secretDataChangedPredicate()), + ). + WithEventFilter(sharding.FilterByShardID(r.ShardID)). + WithOptions(controller.Options{MaxConcurrentReconciles: r.Workers}). + Complete(r) +} + +// secretMapFunc returns a function that maps a Secret to GitRepos that reference it. +// Mirrors production gitjob_controller.go secretMapFunc. 
+func (r *GitRepoMonitorReconciler) secretMapFunc() func(ctx context.Context, obj client.Object) []ctrl.Request { + return func(ctx context.Context, obj client.Object) []ctrl.Request { + logger := log.FromContext(ctx).WithName("secret-watch") + secretName := obj.GetName() + namespace := obj.GetNamespace() + + // Use a map to deduplicate requests (same GitRepo might reference secret in multiple fields) + seen := make(map[types.NamespacedName]struct{}) + requests := make([]ctrl.Request, 0) + + addRequest := func(gitRepo *fleet.GitRepo) { + if !sharding.ShouldProcess(gitRepo, r.ShardID) { + return + } + if !r.ResourceFilter.Matches(gitRepo.Namespace, gitRepo.Name) { + return + } + key := types.NamespacedName{ + Namespace: gitRepo.Namespace, + Name: gitRepo.Name, + } + if _, exists := seen[key]; !exists { + seen[key] = struct{}{} + requests = append(requests, ctrl.Request{NamespacedName: key}) + } + } + + // Find GitRepos using this secret as ClientSecretName + gitRepoList := &fleet.GitRepoList{} + if err := r.List(ctx, gitRepoList, + client.InNamespace(namespace), + client.MatchingFields{config.GitRepoClientSecretNameIndex: secretName}, + ); err != nil { + logger.V(1).Error(err, "Failed to list GitRepos by ClientSecretName", "secret", secretName) + } else { + for i := range gitRepoList.Items { + addRequest(&gitRepoList.Items[i]) + } + } + + // Find GitRepos using this secret as HelmSecretName + gitRepoList = &fleet.GitRepoList{} + if err := r.List(ctx, gitRepoList, + client.InNamespace(namespace), + client.MatchingFields{config.GitRepoHelmSecretNameIndex: secretName}, + ); err != nil { + logger.V(1).Error(err, "Failed to list GitRepos by HelmSecretName", "secret", secretName) + } else { + for i := range gitRepoList.Items { + addRequest(&gitRepoList.Items[i]) + } + } + + // Find GitRepos using this secret as HelmSecretNameForPaths + gitRepoList = &fleet.GitRepoList{} + if err := r.List(ctx, gitRepoList, + client.InNamespace(namespace), + 
client.MatchingFields{config.GitRepoHelmSecretNameForPathsIndex: secretName}, + ); err != nil { + logger.V(1).Error(err, "Failed to list GitRepos by HelmSecretNameForPaths", "secret", secretName) + } else { + for i := range gitRepoList.Items { + addRequest(&gitRepoList.Items[i]) + } + } + + return requests + } +} + +// Reconcile monitors GitRepo reconciliation events (READ-ONLY) +func (r *GitRepoMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Check resource filter - skip if resource doesn't match + if !r.ResourceFilter.Matches(req.Namespace, req.Name) { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx).WithName("gitrepo-monitor") + logger = logger.WithValues( + "gitrepo", req.NamespacedName.String(), + "mode", LogMode(r.DetailedLogs)) + ctx = log.IntoContext(ctx, logger) + + gitrepo := &fleet.GitRepo{} + if err := r.Get(ctx, req.NamespacedName, gitrepo); err != nil { + if apierrors.IsNotFound(err) { + logNotFound(logger, r.DetailedLogs, r.EventFilters, "GitRepo", req.Namespace, req.Name) + r.cache.Delete(req.NamespacedName) + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + // Add more context to logger + logger = logger.WithValues("generation", gitrepo.Generation, "commit", gitrepo.Status.Commit) + if gitrepo.Labels[fleet.RepoLabel] != "" { + logger = logger.WithValues("repo", gitrepo.Labels[fleet.RepoLabel]) + } + ctx = log.IntoContext(ctx, logger) + + // Check for deletion + if !gitrepo.DeletionTimestamp.IsZero() { + logDeletion(logger, r.DetailedLogs, r.EventFilters, "GitRepo", gitrepo.Namespace, gitrepo.Name, gitrepo.DeletionTimestamp.String()) + r.cache.Delete(req.NamespacedName) + return ctrl.Result{}, nil + } + + // Retrieve old object from cache + oldGitRepo, exists := r.cache.Get(req.NamespacedName) + if !exists { + logCreate(logger, r.DetailedLogs, r.EventFilters, "GitRepo", gitrepo.Namespace, gitrepo.Name, gitrepo.Generation, gitrepo.ResourceVersion) + 
r.cache.Set(req.NamespacedName, gitrepo.DeepCopy()) + return ctrl.Result{}, nil + } + + oldGitRepoTyped := oldGitRepo.(*fleet.GitRepo) + + // Detect what changed + logSpecChange(logger, r.DetailedLogs, r.EventFilters, "GitRepo", gitrepo.Namespace, gitrepo.Name, oldGitRepoTyped.Spec, gitrepo.Spec, oldGitRepoTyped.Generation, gitrepo.Generation) + logStatusChange(logger, r.DetailedLogs, r.EventFilters, "GitRepo", gitrepo.Namespace, gitrepo.Name, oldGitRepoTyped.Status, gitrepo.Status) + logResourceVersionChangeWithMetadata(logger, r.DetailedLogs, r.EventFilters, "GitRepo", gitrepo.Namespace, gitrepo.Name, oldGitRepoTyped, gitrepo) + logAnnotationChange(logger, r.DetailedLogs, r.EventFilters, "GitRepo", gitrepo.Namespace, gitrepo.Name, oldGitRepoTyped.Annotations, gitrepo.Annotations) + logLabelChange(logger, r.DetailedLogs, r.EventFilters, "GitRepo", gitrepo.Namespace, gitrepo.Name, oldGitRepoTyped.Labels, gitrepo.Labels) + + // Log specific GitRepo changes (only in detailed mode) + if r.DetailedLogs { + if oldGitRepoTyped.Spec.Repo != gitrepo.Spec.Repo { + logger.Info("Repository URL changed", + "event", "repo-change", + "oldRepo", oldGitRepoTyped.Spec.Repo, + "newRepo", gitrepo.Spec.Repo, + ) + } + if oldGitRepoTyped.Spec.Branch != gitrepo.Spec.Branch { + logger.Info("Branch changed", + "event", "branch-change", + "oldBranch", oldGitRepoTyped.Spec.Branch, + "newBranch", gitrepo.Spec.Branch, + ) + } + if oldGitRepoTyped.Spec.Revision != gitrepo.Spec.Revision { + logger.Info("Revision changed", + "event", "revision-change", + "oldRevision", oldGitRepoTyped.Spec.Revision, + "newRevision", gitrepo.Spec.Revision, + ) + } + if oldGitRepoTyped.Status.Commit != gitrepo.Status.Commit { + logger.Info("Commit changed", + "event", "commit-change", + "oldCommit", oldGitRepoTyped.Status.Commit, + "newCommit", gitrepo.Status.Commit, + ) + } + if oldGitRepoTyped.Status.WebhookCommit != gitrepo.Status.WebhookCommit { + logger.Info("Webhook commit changed", + "event", 
"webhook-commit-change", + "oldWebhookCommit", oldGitRepoTyped.Status.WebhookCommit, + "newWebhookCommit", gitrepo.Status.WebhookCommit, + ) + } + if oldGitRepoTyped.Spec.ForceSyncGeneration != gitrepo.Spec.ForceSyncGeneration { + logger.Info("ForceSyncGeneration changed", + "event", "force-sync-change", + "oldForceSyncGeneration", oldGitRepoTyped.Spec.ForceSyncGeneration, + "newForceSyncGeneration", gitrepo.Spec.ForceSyncGeneration, + ) + } + } + + // Update cache with new state + r.cache.Set(req.NamespacedName, gitrepo.DeepCopy()) + + return ctrl.Result{}, nil +} diff --git a/internal/cmd/monitor/reconciler/helmop_monitor.go b/internal/cmd/monitor/reconciler/helmop_monitor.go new file mode 100644 index 0000000000..8990ec4dd4 --- /dev/null +++ b/internal/cmd/monitor/reconciler/helmop_monitor.go @@ -0,0 +1,116 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "context" + + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "github.com/rancher/fleet/pkg/sharding" +) + +// HelmOpMonitorReconciler monitors HelmOp reconciliations +type HelmOpMonitorReconciler struct { + client.Client + Scheme *runtime.Scheme + ShardID string + Workers int + + // Cache to store previous state + cache *ObjectCache + + // Per-controller logging mode + DetailedLogs bool + EventFilters EventTypeFilters + ResourceFilter *ResourceFilter +} + +// SetupWithManager sets up the controller - mirrors HelmOpReconciler.SetupWithManager +func (r *HelmOpMonitorReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.cache = NewObjectCache() + + return ctrl.NewControllerManagedBy(mgr). 
+ For(&fleet.HelmOp{}, + builder.WithPredicates( + predicate.Or( + // Note: These predicates prevent cache + // syncPeriod from triggering reconcile, since + // cache sync is an Update event. + predicate.GenerationChangedPredicate{}, + predicate.AnnotationChangedPredicate{}, + predicate.LabelChangedPredicate{}, + ), + ), + ). + WithEventFilter(sharding.FilterByShardID(r.ShardID)). + WithOptions(controller.Options{MaxConcurrentReconciles: r.Workers}). + Complete(r) +} + +// Reconcile monitors HelmOp reconciliation events +func (r *HelmOpMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Check resource filter - skip if resource doesn't match + if !r.ResourceFilter.Matches(req.Namespace, req.Name) { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx).WithName("helmop-monitor") + logger = logger.WithValues( + "helmop", req.NamespacedName.String(), + ) + ctx = log.IntoContext(ctx, logger) + + helmop := &fleet.HelmOp{} + if err := r.Get(ctx, req.NamespacedName, helmop); err != nil { + if client.IgnoreNotFound(err) == nil { + logNotFound(logger, r.DetailedLogs, r.EventFilters, "HelmOp", req.Namespace, req.Name) + r.cache.Delete(req.NamespacedName) + } + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // Add chart context if available + if helmop.Spec.Helm != nil && helmop.Spec.Helm.Chart != "" { + logger = logger.WithValues( + "chart", helmop.Spec.Helm.Chart, + "version", helmop.Spec.Helm.Version, + ) + } + + // Check for deletion + if !helmop.DeletionTimestamp.IsZero() { + logDeletion(logger, r.DetailedLogs, r.EventFilters, "HelmOp", helmop.Namespace, helmop.Name, helmop.DeletionTimestamp.String()) + r.cache.Delete(req.NamespacedName) + return ctrl.Result{}, nil + } + + // Retrieve old object from cache + oldHelmOp, exists := r.cache.Get(req.NamespacedName) + if !exists { + logCreate(logger, r.DetailedLogs, r.EventFilters, "HelmOp", helmop.Namespace, helmop.Name, helmop.Generation, 
helmop.ResourceVersion) + r.cache.Set(req.NamespacedName, helmop.DeepCopy()) + return ctrl.Result{}, nil + } + + oldHelmOpTyped := oldHelmOp.(*fleet.HelmOp) + + // Detect what changed + logSpecChange(logger, r.DetailedLogs, r.EventFilters, "HelmOp", helmop.Namespace, helmop.Name, oldHelmOpTyped.Spec, helmop.Spec, oldHelmOpTyped.Generation, helmop.Generation) + logStatusChange(logger, r.DetailedLogs, r.EventFilters, "HelmOp", helmop.Namespace, helmop.Name, oldHelmOpTyped.Status, helmop.Status) + logResourceVersionChangeWithMetadata(logger, r.DetailedLogs, r.EventFilters, "HelmOp", helmop.Namespace, helmop.Name, oldHelmOpTyped, helmop) + logAnnotationChange(logger, r.DetailedLogs, r.EventFilters, "HelmOp", helmop.Namespace, helmop.Name, oldHelmOpTyped.Annotations, helmop.Annotations) + logLabelChange(logger, r.DetailedLogs, r.EventFilters, "HelmOp", helmop.Namespace, helmop.Name, oldHelmOpTyped.Labels, helmop.Labels) + + // Update cache with new state + r.cache.Set(req.NamespacedName, helmop.DeepCopy()) + + return ctrl.Result{}, nil +} diff --git a/internal/cmd/monitor/reconciler/monitor.go b/internal/cmd/monitor/reconciler/monitor.go new file mode 100644 index 0000000000..f6cdbb6ca5 --- /dev/null +++ b/internal/cmd/monitor/reconciler/monitor.go @@ -0,0 +1,328 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "encoding/json" + "fmt" + "strings" + + "github.com/go-logr/logr" + "github.com/google/go-cmp/cmp" + "k8s.io/apimachinery/pkg/api/equality" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var globalStatsTracker = NewStatsTracker() + +// GetStatsTracker returns the global stats tracker +func GetStatsTracker() *StatsTracker { + return globalStatsTracker +} + +// recordEvent records an event in statistics (always, regardless of mode) +func recordEvent(resourceType, namespace, name string, eventType EventType) { + globalStatsTracker.RecordEvent(resourceType, namespace, name, eventType) +} 
+ +// logSpecChange logs the differences in spec between old and new objects +// detailedLogs parameter controls whether to emit detailed log lines +// eventFilters parameter controls which event types to show +func logSpecChange(logger logr.Logger, detailedLogs bool, eventFilters interface{ ShouldLog(EventType) bool }, resourceType, namespace, name string, oldSpec, newSpec interface{}, oldGen, newGen int64) { + if oldGen == newGen { + return + } + + // Always record in stats + recordEvent(resourceType, namespace, name, EventTypeGenerationChange) + + // Only log details if detailed mode enabled AND event type is enabled + if detailedLogs && eventFilters.ShouldLog(EventTypeGenerationChange) { + diff := cmp.Diff(oldSpec, newSpec) + if diff != "" { + logger.Info("Spec changed - Generation update detected", + "event", "generation-change", + "oldGeneration", oldGen, + "newGeneration", newGen, + "specDiff", diff, + ) + } else { + logger.Info("Generation changed but spec appears identical", + "event", "generation-change", + "oldGeneration", oldGen, + "newGeneration", newGen, + ) + } + } +} + +// logStatusChange logs differences in status between old and new objects +func logStatusChange(logger logr.Logger, detailedLogs bool, eventFilters interface{ ShouldLog(EventType) bool }, resourceType, namespace, name string, oldStatus, newStatus interface{}) { + if equality.Semantic.DeepEqual(oldStatus, newStatus) { + return + } + + // Always record in stats + recordEvent(resourceType, namespace, name, EventTypeStatusChange) + + // Only log details if detailed mode enabled AND event type is enabled + if detailedLogs && eventFilters.ShouldLog(EventTypeStatusChange) { + diff := cmp.Diff(oldStatus, newStatus) + + logger.Info("Status changed", + "event", "status-change", + "diff", diff, + ) + } +} + +// logResourceVersionChangeWithMetadata logs resource version changes and checks for metadata differences +func logResourceVersionChangeWithMetadata(logger logr.Logger, detailedLogs bool, 
eventFilters interface{ ShouldLog(EventType) bool }, resourceType, namespace, name string, oldObj, newObj client.Object) { + oldRV := oldObj.GetResourceVersion() + newRV := newObj.GetResourceVersion() + + if oldRV == newRV { + return + } + + // Always record in stats + recordEvent(resourceType, namespace, name, EventTypeResourceVersionChange) + + // Only log details if detailed mode enabled AND event type is enabled + if detailedLogs && eventFilters.ShouldLog(EventTypeResourceVersionChange) { + // Check for specific metadata changes + var metadataChanges []string + var diffs []string + + // Check finalizers + oldFinalizers := oldObj.GetFinalizers() + newFinalizers := newObj.GetFinalizers() + if !equality.Semantic.DeepEqual(oldFinalizers, newFinalizers) { + metadataChanges = append(metadataChanges, "finalizers") + diff := cmp.Diff(oldFinalizers, newFinalizers) + diffs = append(diffs, "Finalizers:\n"+diff) + } + + // Check owner references + oldOwners := oldObj.GetOwnerReferences() + newOwners := newObj.GetOwnerReferences() + if !equality.Semantic.DeepEqual(oldOwners, newOwners) { + metadataChanges = append(metadataChanges, "ownerReferences") + diff := cmp.Diff(oldOwners, newOwners) + diffs = append(diffs, "OwnerReferences:\n"+diff) + } + + // Check managed fields (common with Server-Side Apply). + // Use managedFieldsDiff as the sole detector: equality.Semantic.DeepEqual + // on slices is order-sensitive, so a mere reordering of SSA entries would + // trigger a false "changed" detection while producing an empty diff. 
+ if managedDiff := managedFieldsDiff(oldObj.GetManagedFields(), newObj.GetManagedFields()); managedDiff != "" { + metadataChanges = append(metadataChanges, "managedFields") + diffs = append(diffs, "ManagedFields:\n"+managedDiff) + } + + reason := "cache sync or unknown metadata update" + if len(metadataChanges) > 0 { + // Format metadataChanges as comma-separated list + var changeList string + for i, change := range metadataChanges { + if i > 0 { + changeList += ", " + } + changeList += change + } + reason = "metadata update: " + changeList + } + + logFields := []interface{}{ + "event", "resourceversion-change", + "oldResourceVersion", oldRV, + "newResourceVersion", newRV, + "reason", reason, + } + + if len(metadataChanges) > 0 { + logFields = append(logFields, "metadataChanges", metadataChanges) + if len(diffs) > 0 { + logFields = append(logFields, "diff", strings.Join(diffs, "\n")) + } + } + + logger.Info("Resource version changed", logFields...) + } +} + +// logAnnotationChange logs annotation changes +func logAnnotationChange(logger logr.Logger, detailedLogs bool, eventFilters interface{ ShouldLog(EventType) bool }, resourceType, namespace, name string, oldAnnotations, newAnnotations map[string]string) { + if equality.Semantic.DeepEqual(oldAnnotations, newAnnotations) { + return + } + + // Always record in stats + recordEvent(resourceType, namespace, name, EventTypeAnnotationChange) + + // Only log details if detailed mode enabled AND event type is enabled + if detailedLogs && eventFilters.ShouldLog(EventTypeAnnotationChange) { + diff := cmp.Diff(oldAnnotations, newAnnotations) + logger.Info("Annotations changed", + "event", "annotation-change", + "diff", diff, + ) + } +} + +// logLabelChange logs label changes +func logLabelChange(logger logr.Logger, detailedLogs bool, eventFilters interface{ ShouldLog(EventType) bool }, resourceType, namespace, name string, oldLabels, newLabels map[string]string) { + if equality.Semantic.DeepEqual(oldLabels, newLabels) { + 
return + } + + // Always record in stats + recordEvent(resourceType, namespace, name, EventTypeLabelChange) + + // Only log details if detailed mode enabled AND event type is enabled + if detailedLogs && eventFilters.ShouldLog(EventTypeLabelChange) { + diff := cmp.Diff(oldLabels, newLabels) + logger.Info("Labels changed", + "event", "label-change", + "diff", diff, + ) + } +} + +// logRelatedResourceTrigger logs when a reconciliation is triggered by a related resource +func logRelatedResourceTrigger(logger logr.Logger, detailedLogs bool, eventFilters interface{ ShouldLogTrigger() bool }, resourceType, namespace, name string, triggerType, triggerName, triggerNamespace string) { + // Always record in stats with breakdown by trigger type + globalStatsTracker.RecordTrigger(resourceType, namespace, name, triggerType) + + // Only log details if detailed mode enabled AND triggered-by events are enabled + if detailedLogs && eventFilters.ShouldLogTrigger() { + logger.Info("Triggered by related resource change", + "event", "related-resource-trigger", + "triggerResourceType", triggerType, + "triggerResourceName", triggerName, + "triggerResourceNamespace", triggerNamespace, + ) + } +} + +// logDeletion logs when a resource is being deleted +func logDeletion(logger logr.Logger, detailedLogs bool, eventFilters interface{ ShouldLog(EventType) bool }, resourceType, namespace, name string, deletionTimestamp string) { + // Always record in stats + recordEvent(resourceType, namespace, name, EventTypeDeletion) + + // Only log details if detailed mode enabled AND event type is enabled + if detailedLogs && eventFilters.ShouldLog(EventTypeDeletion) { + logger.Info("Resource deletion detected", + "event", "deletion", + "deletionTimestamp", deletionTimestamp, + ) + } +} + +// logNotFound logs when a resource is not found (deleted) +func logNotFound(logger logr.Logger, detailedLogs bool, eventFilters interface{ ShouldLog(EventType) bool }, resourceType, namespace, name string) { + // Always 
record in stats + recordEvent(resourceType, namespace, name, EventTypeNotFound) + + // Only log details if detailed mode enabled AND event type is enabled + if detailedLogs && eventFilters.ShouldLog(EventTypeNotFound) { + logger.Info("Resource not found - likely deleted", + "event", "not-found", + ) + } +} + +// logCreate logs first observation of a resource +func logCreate(logger logr.Logger, detailedLogs bool, eventFilters interface{ ShouldLog(EventType) bool }, resourceType, namespace, name string, generation int64, resourceVersion string) { + // Always record in stats + recordEvent(resourceType, namespace, name, EventTypeCreate) + + // Only log details if detailed mode enabled AND event type is enabled + if detailedLogs && eventFilters.ShouldLog(EventTypeCreate) { + logger.Info("First observation of resource", + "event", "create", + "generation", generation, + "resourceVersion", resourceVersion, + ) + } +} + +// managedFieldsDiff returns a human-readable summary of what changed in managedFields. +// It identifies which field managers were added, removed, or changed, and for changed +// managers it shows a diff of their owned fields (parsed from FieldsV1 JSON). 
+func managedFieldsDiff(old, new []metav1.ManagedFieldsEntry) string { + type entryKey struct { + Manager string + Operation metav1.ManagedFieldsOperationType + Subresource string + } + + oldMap := make(map[entryKey]metav1.ManagedFieldsEntry, len(old)) + for _, e := range old { + oldMap[entryKey{e.Manager, e.Operation, e.Subresource}] = e + } + + newMap := make(map[entryKey]metav1.ManagedFieldsEntry, len(new)) + for _, e := range new { + newMap[entryKey{e.Manager, e.Operation, e.Subresource}] = e + } + + var added, removed, changed []string + var fieldDiffs []string + + for k, newEntry := range newMap { + oldEntry, exists := oldMap[k] + if !exists { + added = append(added, fmt.Sprintf("%s(%s)", k.Manager, k.Operation)) + continue + } + if !equality.Semantic.DeepEqual(newEntry, oldEntry) { + label := fmt.Sprintf("%s(%s)", k.Manager, k.Operation) + changed = append(changed, label) + diff := diffFieldsV1(oldEntry.FieldsV1, newEntry.FieldsV1) + if diff != "" { + fieldDiffs = append(fieldDiffs, fmt.Sprintf("[%s]:\n%s", label, diff)) + } + } + } + + for k := range oldMap { + if _, exists := newMap[k]; !exists { + removed = append(removed, fmt.Sprintf("%s(%s)", k.Manager, k.Operation)) + } + } + + var sb strings.Builder + if len(added) > 0 { + sb.WriteString("added: " + strings.Join(added, ", ") + "\n") + } + if len(removed) > 0 { + sb.WriteString("removed: " + strings.Join(removed, ", ") + "\n") + } + if len(changed) > 0 { + sb.WriteString("changed: " + strings.Join(changed, ", ") + "\n") + } + for _, fd := range fieldDiffs { + sb.WriteString(fd + "\n") + } + + return sb.String() +} + +// diffFieldsV1 diffs two FieldsV1 values by parsing their raw JSON. +// Falls back to an empty string if both are nil or identical. 
+func diffFieldsV1(old, new *metav1.FieldsV1) string { + if old == nil && new == nil { + return "" + } + var oldParsed, newParsed interface{} + if old != nil { + _ = json.Unmarshal(old.Raw, &oldParsed) + } + if new != nil { + _ = json.Unmarshal(new.Raw, &newParsed) + } + return cmp.Diff(oldParsed, newParsed) +} diff --git a/internal/cmd/monitor/reconciler/predicate.go b/internal/cmd/monitor/reconciler/predicate.go new file mode 100644 index 0000000000..8996491e73 --- /dev/null +++ b/internal/cmd/monitor/reconciler/predicate.go @@ -0,0 +1,260 @@ +// Copyright (c) 2021-2026 SUSE LLC + +package reconciler + +import ( + "maps" + "reflect" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +// TypedResourceVersionUnchangedPredicate implements a update predicate to +// allow syncPeriod to trigger the reconciler +type TypedResourceVersionUnchangedPredicate[T metav1.Object] struct { + predicate.TypedFuncs[T] +} + +func isNil[T metav1.Object](arg T) bool { + return any(arg) == nil +} + +func (TypedResourceVersionUnchangedPredicate[T]) Create(e event.CreateEvent) bool { + return false +} + +func (TypedResourceVersionUnchangedPredicate[T]) Delete(e event.DeleteEvent) bool { + return false +} + +// Update implements default UpdateEvent filter for validating resource version change. 
+func (TypedResourceVersionUnchangedPredicate[T]) Update(e event.TypedUpdateEvent[T]) bool { + if isNil(e.ObjectOld) { + return false + } + if isNil(e.ObjectNew) { + return false + } + + return e.ObjectNew.GetResourceVersion() == e.ObjectOld.GetResourceVersion() +} + +func (TypedResourceVersionUnchangedPredicate[T]) Generic(e event.GenericEvent) bool { + return false +} + +// bundleDeploymentStatusChangedPredicate returns true if the bundledeployment +// status has changed, or the bundledeployment was created +func bundleDeploymentStatusChangedPredicate() predicate.Funcs { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return true + }, + UpdateFunc: func(e event.UpdateEvent) bool { + n := e.ObjectNew.(*fleet.BundleDeployment) + o := e.ObjectOld.(*fleet.BundleDeployment) + if n == nil || o == nil { + return false + } + return !n.DeletionTimestamp.IsZero() || !reflect.DeepEqual(n.Status, o.Status) + }, + } +} + +// jobUpdatedPredicate returns true if the job status has changed +func jobUpdatedPredicate() predicate.Funcs { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return false + }, + UpdateFunc: func(e event.UpdateEvent) bool { + n, isJob := e.ObjectNew.(*batchv1.Job) + if !isJob { + return false + } + o := e.ObjectOld.(*batchv1.Job) + if n == nil || o == nil { + return false + } + return !reflect.DeepEqual(n.Status, o.Status) || + (n.DeletionTimestamp != nil && o.DeletionTimestamp == nil) + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return false + }, + } +} + +// commitChangedPredicate returns true if the webhook or polling commit has changed. +// Mirrors production gitjob_controller.go commitChangedPredicate. 
+func commitChangedPredicate() predicate.Predicate { + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldGitRepo, ok := e.ObjectOld.(*fleet.GitRepo) + if !ok { + return true + } + newGitRepo, ok := e.ObjectNew.(*fleet.GitRepo) + if !ok { + return true + } + return (oldGitRepo.Status.WebhookCommit != newGitRepo.Status.WebhookCommit) || + (oldGitRepo.Status.PollingCommit != newGitRepo.Status.PollingCommit) + }, + } +} + +// clusterChangedPredicate filters cluster events that relate to bundle deployment creation. +// Mirrors production bundle_controller.go clusterChangedPredicate. +func clusterChangedPredicate() predicate.Funcs { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return true + }, + UpdateFunc: func(e event.UpdateEvent) bool { + n := e.ObjectNew.(*fleet.Cluster) + o := e.ObjectOld.(*fleet.Cluster) + // cluster deletion will eventually trigger a delete event + if n == nil || !n.DeletionTimestamp.IsZero() { + return true + } + // labels and annotations are used for templating and targeting + if !maps.Equal(n.Labels, o.Labels) { + return true + } + if !maps.Equal(n.Annotations, o.Annotations) { + return true + } + // spec templateValues is used in templating + if !reflect.DeepEqual(n.Spec, o.Spec) { + return true + } + // this namespace contains the bundledeployments + if n.Status.Namespace != o.Status.Namespace { + return true + } + // this namespace indicates the agent is running + if n.Status.Agent.Namespace != o.Status.Agent.Namespace { + return true + } + if n.Status.Scheduled != o.Status.Scheduled { + return true + } + if n.Status.ActiveSchedule != o.Status.ActiveSchedule { + return true + } + return false + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return true + }, + } +} + +// nonSecretAnnotationChangedPredicate returns true if annotations changed, +// excluding changes to only the secret data hash tracking annotations. 
+// Mirrors production gitjob_controller.go nonSecretAnnotationChangedPredicate. +func nonSecretAnnotationChangedPredicate() predicate.Funcs { + secretAnnotationKeys := map[string]struct{}{ + "fleet.cattle.io/client-secret-hash": {}, + "fleet.cattle.io/helm-secret-hash": {}, + "fleet.cattle.io/helm-secret-for-paths-hash": {}, + } + + annotationsChangedExcludingSecrets := func(oldAnnotations, newAnnotations map[string]string) bool { + // Check if any non-secret annotation was added, removed, or changed + for key, newVal := range newAnnotations { + if _, isSecretAnnotation := secretAnnotationKeys[key]; isSecretAnnotation { + continue + } + if oldVal, exists := oldAnnotations[key]; !exists || oldVal != newVal { + return true + } + } + // Check if any non-secret annotation was removed + for key := range oldAnnotations { + if _, isSecretAnnotation := secretAnnotationKeys[key]; isSecretAnnotation { + continue + } + if _, exists := newAnnotations[key]; !exists { + return true + } + } + return false + } + + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return false + }, + UpdateFunc: func(e event.UpdateEvent) bool { + return annotationsChangedExcludingSecrets( + e.ObjectOld.GetAnnotations(), + e.ObjectNew.GetAnnotations(), + ) + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return false + }, + } +} + +// dataChangedPredicate filters Secret and ConfigMap events to only trigger reconciliation +// when Data or BinaryData fields have changed. +// Mirrors production bundle_controller.go dataChangedPredicate. 
+func dataChangedPredicate() predicate.Funcs { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return true + }, + UpdateFunc: func(e event.UpdateEvent) bool { + switch new := e.ObjectNew.(type) { + case *corev1.Secret: + old, ok := e.ObjectOld.(*corev1.Secret) + if !ok { + return false + } + return !reflect.DeepEqual(new.Data, old.Data) + case *corev1.ConfigMap: + old, ok := e.ObjectOld.(*corev1.ConfigMap) + if !ok { + return false + } + return !maps.Equal(new.Data, old.Data) || !reflect.DeepEqual(new.BinaryData, old.BinaryData) + default: + return false + } + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return false + }, + } +} + +// secretDataChangedPredicate filters Secret events to only trigger reconciliation +// when Data field has changed, or when the secret is created or deleted. +// Mirrors production gitjob_controller.go secretDataChangedPredicate. +func secretDataChangedPredicate() predicate.Funcs { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return true + }, + UpdateFunc: func(e event.UpdateEvent) bool { + newSecret, newOk := e.ObjectNew.(*corev1.Secret) + oldSecret, oldOk := e.ObjectOld.(*corev1.Secret) + if !newOk || !oldOk { + return false + } + return !reflect.DeepEqual(newSecret.Data, oldSecret.Data) + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return true + }, + } +} diff --git a/internal/cmd/monitor/reconciler/predicate_test.go b/internal/cmd/monitor/reconciler/predicate_test.go new file mode 100644 index 0000000000..6d2db35e6c --- /dev/null +++ b/internal/cmd/monitor/reconciler/predicate_test.go @@ -0,0 +1,348 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "testing" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" +) + +func 
TestTypedResourceVersionUnchangedPredicate(t *testing.T) { + p := TypedResourceVersionUnchangedPredicate[client.Object]{} + + t.Run("Create returns false", func(t *testing.T) { + if p.Create(event.CreateEvent{Object: &fleet.Bundle{}}) { + t.Error("expected false for Create") + } + }) + + t.Run("Delete returns false", func(t *testing.T) { + if p.Delete(event.DeleteEvent{Object: &fleet.Bundle{}}) { + t.Error("expected false for Delete") + } + }) + + t.Run("Generic returns false", func(t *testing.T) { + if p.Generic(event.GenericEvent{Object: &fleet.Bundle{}}) { + t.Error("expected false for Generic") + } + }) + + t.Run("Update with same ResourceVersion returns true", func(t *testing.T) { + old := &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{ResourceVersion: "42"}} + new := &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{ResourceVersion: "42"}} + e := event.TypedUpdateEvent[client.Object]{ObjectOld: old, ObjectNew: new} + if !p.Update(e) { + t.Error("expected true when ResourceVersion unchanged") + } + }) + + t.Run("Update with changed ResourceVersion returns false", func(t *testing.T) { + old := &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{ResourceVersion: "1"}} + new := &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{ResourceVersion: "2"}} + e := event.TypedUpdateEvent[client.Object]{ObjectOld: old, ObjectNew: new} + if p.Update(e) { + t.Error("expected false when ResourceVersion changed") + } + }) + + t.Run("Update with nil old returns false", func(t *testing.T) { + new := &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{ResourceVersion: "1"}} + e := event.TypedUpdateEvent[client.Object]{ObjectOld: nil, ObjectNew: new} + if p.Update(e) { + t.Error("expected false for nil old object") + } + }) + + t.Run("Update with nil new returns false", func(t *testing.T) { + old := &fleet.Bundle{ObjectMeta: metav1.ObjectMeta{ResourceVersion: "1"}} + e := event.TypedUpdateEvent[client.Object]{ObjectOld: old, ObjectNew: nil} + if p.Update(e) { + t.Error("expected false for nil new object") + } + 
}) +} + +func TestClusterChangedPredicate(t *testing.T) { + p := clusterChangedPredicate() + + t.Run("Create returns true", func(t *testing.T) { + if !p.Create(event.CreateEvent{Object: &fleet.Cluster{}}) { + t.Error("expected true for Create") + } + }) + + t.Run("Delete returns true", func(t *testing.T) { + if !p.Delete(event.DeleteEvent{Object: &fleet.Cluster{}}) { + t.Error("expected true for Delete") + } + }) + + t.Run("Update with no change returns false", func(t *testing.T) { + cluster := &fleet.Cluster{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"env": "prod"}}, + Status: fleet.ClusterStatus{Namespace: "test-ns"}, + } + e := event.UpdateEvent{ObjectOld: cluster, ObjectNew: cluster.DeepCopy()} + if p.Update(e) { + t.Error("expected false when nothing changed") + } + }) + + t.Run("Update with label change returns true", func(t *testing.T) { + old := &fleet.Cluster{ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"env": "prod"}}} + new := old.DeepCopy() + new.Labels["env"] = "staging" + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true for label change") + } + }) + + t.Run("Update with annotation change returns true", func(t *testing.T) { + old := &fleet.Cluster{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{"key": "val"}}} + new := old.DeepCopy() + new.Annotations["key"] = "newval" + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true for annotation change") + } + }) + + t.Run("Update with deletion timestamp set returns true", func(t *testing.T) { + old := &fleet.Cluster{} + new := old.DeepCopy() + now := metav1.Now() + new.DeletionTimestamp = &now + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true when deletion timestamp set") + } + }) + + t.Run("Update with status namespace change returns true", func(t *testing.T) { + old := &fleet.Cluster{Status: fleet.ClusterStatus{Namespace: "old-ns"}} + new := 
old.DeepCopy() + new.Status.Namespace = "new-ns" + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true for status namespace change") + } + }) +} + +func TestNonSecretAnnotationChangedPredicate(t *testing.T) { + p := nonSecretAnnotationChangedPredicate() + + t.Run("Create returns false", func(t *testing.T) { + if p.Create(event.CreateEvent{Object: &fleet.GitRepo{}}) { + t.Error("expected false for Create") + } + }) + + t.Run("Delete returns false", func(t *testing.T) { + if p.Delete(event.DeleteEvent{Object: &fleet.GitRepo{}}) { + t.Error("expected false for Delete") + } + }) + + t.Run("Update with no annotation change returns false", func(t *testing.T) { + old := &fleet.GitRepo{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{"app": "v1"}}} + new := old.DeepCopy() + if p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected false when annotations unchanged") + } + }) + + t.Run("Update with regular annotation change returns true", func(t *testing.T) { + old := &fleet.GitRepo{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{"app": "v1"}}} + new := old.DeepCopy() + new.Annotations["app"] = "v2" + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true when regular annotation changed") + } + }) + + t.Run("Update adding regular annotation returns true", func(t *testing.T) { + old := &fleet.GitRepo{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{}}} + new := old.DeepCopy() + new.Annotations["new-key"] = "new-value" + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true when regular annotation added") + } + }) + + t.Run("Update removing regular annotation returns true", func(t *testing.T) { + old := &fleet.GitRepo{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{"app": "v1"}}} + new := old.DeepCopy() + delete(new.Annotations, "app") + if !p.Update(event.UpdateEvent{ObjectOld: old, 
ObjectNew: new}) { + t.Error("expected true when regular annotation removed") + } + }) + + t.Run("Update with only client-secret-hash change returns false", func(t *testing.T) { + old := &fleet.GitRepo{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + "fleet.cattle.io/client-secret-hash": "old-hash", + }}} + new := old.DeepCopy() + new.Annotations["fleet.cattle.io/client-secret-hash"] = "new-hash" + if p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected false when only client-secret-hash changed") + } + }) + + t.Run("Update with only helm-secret-hash change returns false", func(t *testing.T) { + old := &fleet.GitRepo{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + "fleet.cattle.io/helm-secret-hash": "old-hash", + }}} + new := old.DeepCopy() + new.Annotations["fleet.cattle.io/helm-secret-hash"] = "new-hash" + if p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected false when only helm-secret-hash changed") + } + }) + + t.Run("Update with only helm-secret-for-paths-hash change returns false", func(t *testing.T) { + old := &fleet.GitRepo{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + "fleet.cattle.io/helm-secret-for-paths-hash": "old-hash", + }}} + new := old.DeepCopy() + new.Annotations["fleet.cattle.io/helm-secret-for-paths-hash"] = "new-hash" + if p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected false when only helm-secret-for-paths-hash changed") + } + }) + + t.Run("Update with secret hash and regular annotation change returns true", func(t *testing.T) { + old := &fleet.GitRepo{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + "fleet.cattle.io/client-secret-hash": "old-hash", + "app": "v1", + }}} + new := old.DeepCopy() + new.Annotations["fleet.cattle.io/client-secret-hash"] = "new-hash" + new.Annotations["app"] = "v2" + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true 
when regular annotation also changed") + } + }) +} + +func TestDataChangedPredicate(t *testing.T) { + p := dataChangedPredicate() + + t.Run("Create returns true", func(t *testing.T) { + if !p.Create(event.CreateEvent{Object: &corev1.Secret{}}) { + t.Error("expected true for Secret Create") + } + }) + + t.Run("Delete returns false", func(t *testing.T) { + if p.Delete(event.DeleteEvent{Object: &corev1.Secret{}}) { + t.Error("expected false for Secret Delete") + } + }) + + t.Run("Secret Update with no data change returns false", func(t *testing.T) { + old := &corev1.Secret{Data: map[string][]byte{"key": []byte("val")}} + new := old.DeepCopy() + if p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected false when secret data unchanged") + } + }) + + t.Run("Secret Update with data change returns true", func(t *testing.T) { + old := &corev1.Secret{Data: map[string][]byte{"key": []byte("val")}} + new := old.DeepCopy() + new.Data["key"] = []byte("newval") + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true when secret data changed") + } + }) + + t.Run("Secret Update adding key returns true", func(t *testing.T) { + old := &corev1.Secret{Data: map[string][]byte{}} + new := old.DeepCopy() + new.Data["new-key"] = []byte("value") + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true when key added to secret") + } + }) + + t.Run("ConfigMap Update with no data change returns false", func(t *testing.T) { + old := &corev1.ConfigMap{Data: map[string]string{"key": "val"}} + new := old.DeepCopy() + if p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected false when configmap data unchanged") + } + }) + + t.Run("ConfigMap Update with data change returns true", func(t *testing.T) { + old := &corev1.ConfigMap{Data: map[string]string{"key": "val"}} + new := old.DeepCopy() + new.Data["key"] = "newval" + if !p.Update(event.UpdateEvent{ObjectOld: old, 
ObjectNew: new}) { + t.Error("expected true when configmap data changed") + } + }) + + t.Run("ConfigMap Update with BinaryData change returns true", func(t *testing.T) { + old := &corev1.ConfigMap{BinaryData: map[string][]byte{"key": []byte("val")}} + new := old.DeepCopy() + new.BinaryData["key"] = []byte("newval") + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true when configmap binary data changed") + } + }) + + t.Run("Update with wrong type returns false", func(t *testing.T) { + // Pod is not a Secret or ConfigMap + old := &corev1.Pod{} + new := &corev1.Pod{} + if p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected false for unsupported object type") + } + }) +} + +func TestSecretDataChangedPredicate(t *testing.T) { + p := secretDataChangedPredicate() + + t.Run("Create returns true", func(t *testing.T) { + if !p.Create(event.CreateEvent{Object: &corev1.Secret{}}) { + t.Error("expected true for Create") + } + }) + + t.Run("Delete returns true", func(t *testing.T) { + if !p.Delete(event.DeleteEvent{Object: &corev1.Secret{}}) { + t.Error("expected true for Delete") + } + }) + + t.Run("Update with no data change returns false", func(t *testing.T) { + old := &corev1.Secret{Data: map[string][]byte{"key": []byte("val")}} + new := old.DeepCopy() + if p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected false when secret data unchanged") + } + }) + + t.Run("Update with data change returns true", func(t *testing.T) { + old := &corev1.Secret{Data: map[string][]byte{"key": []byte("val")}} + new := old.DeepCopy() + new.Data["key"] = []byte("newval") + if !p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { + t.Error("expected true when secret data changed") + } + }) + + t.Run("Update with non-Secret objects returns false", func(t *testing.T) { + old := &corev1.ConfigMap{} + new := &corev1.ConfigMap{} + if p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: new}) { 
// EventType represents the type of reconciliation event. Its string value is
// used verbatim as the JSON key under which the count is reported.
type EventType string

// Event types that can be recorded with StatsTracker.RecordEvent.
const (
	EventTypeGenerationChange      EventType = "generation-change"
	EventTypeStatusChange          EventType = "status-change"
	EventTypeAnnotationChange      EventType = "annotation-change"
	EventTypeLabelChange           EventType = "label-change"
	EventTypeResourceVersionChange EventType = "resourceversion-change"
	EventTypeDeletion              EventType = "deletion"
	EventTypeNotFound              EventType = "not-found"
	EventTypeCreate                EventType = "create"
)

// ResourceKey identifies a Kubernetes resource.
type ResourceKey struct {
	ResourceType string // "Bundle", "Cluster", etc.
	Namespace    string
	Name         string
}

// String returns "namespace/name", or only the name when Namespace is empty.
// ResourceType is not part of the result.
func (r ResourceKey) String() string {
	if r.Namespace == "" {
		return r.Name
	}
	return r.Namespace + "/" + r.Name
}

// ResourceStats tracks event counts for a single resource.
//
// Counts is excluded from the default struct encoding and is instead
// flattened into the top-level JSON object by MarshalJSON.
type ResourceStats struct {
	Counts      map[EventType]int64 `json:"-"` // Internal tracking
	TriggeredBy map[string]int64    `json:"triggered-by,omitempty"`
	Total       int64               `json:"total_events"`
}

// MarshalJSON implements json.Marshaler. It emits one key per event type with
// a count greater than zero, a "triggered-by" object when at least one trigger
// was recorded, and "total_events".
//
// The receiver is a value so that the flattened form is also produced when a
// ResourceStats is encoded by value (for example as a map element); a
// pointer-receiver method would be skipped by encoding/json for
// non-addressable values. Pointers to ResourceStats still use this method.
func (rs ResourceStats) MarshalJSON() ([]byte, error) {
	// One slot per event type plus "triggered-by" and "total_events".
	m := make(map[string]any, len(rs.Counts)+2)

	for eventType, count := range rs.Counts {
		if count > 0 {
			m[string(eventType)] = count
		}
	}

	if len(rs.TriggeredBy) > 0 {
		m["triggered-by"] = rs.TriggeredBy
	}

	m["total_events"] = rs.Total

	return json.Marshal(m)
}

// StatsTracker aggregates reconciliation statistics per resource.
// All fields are guarded by mu.
type StatsTracker struct {
	mu              sync.RWMutex
	stats           map[ResourceKey]*ResourceStats
	startTime       time.Time
	lastSummaryTime time.Time
}

// NewStatsTracker creates a new statistics tracker whose start time and
// last-summary time are both set to the current time.
func NewStatsTracker() *StatsTracker {
	now := time.Now()
	return &StatsTracker{
		stats:           make(map[ResourceKey]*ResourceStats),
		startTime:       now,
		lastSummaryTime: now,
	}
}

// entry returns the stats record for key, creating it on first use. It also
// creates the tracker's map when it is nil, so that recording into a
// zero-value StatsTracker does not panic.
//
// The caller must hold s.mu for writing.
func (s *StatsTracker) entry(key ResourceKey) *ResourceStats {
	if s.stats == nil {
		s.stats = make(map[ResourceKey]*ResourceStats)
	}
	rs := s.stats[key]
	if rs == nil {
		rs = &ResourceStats{
			Counts:      make(map[EventType]int64),
			TriggeredBy: make(map[string]int64),
		}
		s.stats[key] = rs
	}
	return rs
}

// RecordEvent records one event of the given type for a resource and
// increments that resource's total.
func (s *StatsTracker) RecordEvent(resourceType, namespace, name string, eventType EventType) {
	s.mu.Lock()
	defer s.mu.Unlock()

	rs := s.entry(ResourceKey{
		ResourceType: resourceType,
		Namespace:    namespace,
		Name:         name,
	})
	rs.Counts[eventType]++
	rs.Total++
}

// RecordTrigger records that a resource was triggered by a resource of type
// triggerResourceType and increments that resource's total.
func (s *StatsTracker) RecordTrigger(resourceType, namespace, name string, triggerResourceType string) {
	s.mu.Lock()
	defer s.mu.Unlock()

	rs := s.entry(ResourceKey{
		ResourceType: resourceType,
		Namespace:    namespace,
		Name:         name,
	})
	rs.TriggeredBy[triggerResourceType]++
	rs.Total++
}

// Summary represents a snapshot of statistics. The Summary field is keyed by
// resource type and then by ResourceKey.String().
type Summary struct {
	Timestamp       time.Time                            `json:"timestamp"`
	IntervalSeconds float64                              `json:"interval_seconds"`
	Summary         map[string]map[string]*ResourceStats `json:"summary"`
	Totals          Totals                               `json:"totals"`
}

// Totals represents aggregate statistics.
type Totals struct {
	TotalResourcesMonitored int   `json:"total_resources_monitored"`
	TotalEvents             int64 `json:"total_events"`
}

// GetSummary returns a snapshot of all statistics grouped by resource type.
//
// IntervalSeconds is the time elapsed since the last-summary timestamp, which
// is set by NewStatsTracker, Reset and UpdateLastSummaryTime; GetSummary
// itself does not advance it. The returned stats are copies, so callers may
// modify them without affecting the tracker.
func (s *StatsTracker) GetSummary() Summary {
	s.mu.RLock()
	defer s.mu.RUnlock()

	now := time.Now()
	intervalSeconds := now.Sub(s.lastSummaryTime).Seconds()

	grouped := make(map[string]map[string]*ResourceStats)
	var totalEvents int64

	for key, rs := range s.stats {
		group := grouped[key.ResourceType]
		if group == nil {
			group = make(map[string]*ResourceStats)
			grouped[key.ResourceType] = group
		}

		// Deep copy so the snapshot shares no maps with the live tracker.
		snapshot := &ResourceStats{
			Counts:      make(map[EventType]int64, len(rs.Counts)),
			TriggeredBy: make(map[string]int64, len(rs.TriggeredBy)),
			Total:       rs.Total,
		}
		for eventType, count := range rs.Counts {
			snapshot.Counts[eventType] = count
		}
		for triggerType, count := range rs.TriggeredBy {
			snapshot.TriggeredBy[triggerType] = count
		}

		group[key.String()] = snapshot
		totalEvents += rs.Total
	}

	return Summary{
		Timestamp:       now,
		IntervalSeconds: intervalSeconds,
		Summary:         grouped,
		Totals: Totals{
			TotalResourcesMonitored: len(s.stats),
			TotalEvents:             totalEvents,
		},
	}
}

// Reset clears all statistics and sets the last-summary timestamp to now.
func (s *StatsTracker) Reset() {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.stats = make(map[ResourceKey]*ResourceStats)
	s.lastSummaryTime = time.Now()
}

// UpdateLastSummaryTime updates the last summary timestamp without resetting
// the recorded statistics.
func (s *StatsTracker) UpdateLastSummaryTime() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.lastSummaryTime = time.Now()
}

// ToJSON converts the summary to a compact JSON string.
func (s Summary) ToJSON() (string, error) {
	data, err := json.Marshal(s)
	if err != nil {
		return "", err
	}
	return string(data), nil
}

// ToJSONIndent converts the summary to an indented JSON string for
// readability.
func (s Summary) ToJSONIndent() (string, error) {
	data, err := json.MarshalIndent(s, "", " ")
	if err != nil {
		return "", err
	}
	return string(data), nil
}
+func (s *StatsTracker) Reset() { + s.mu.Lock() + defer s.mu.Unlock() + + s.stats = make(map[ResourceKey]*ResourceStats) + s.lastSummaryTime = time.Now() +} + +// UpdateLastSummaryTime updates the last summary timestamp without resetting +func (s *StatsTracker) UpdateLastSummaryTime() { + s.mu.Lock() + defer s.mu.Unlock() + s.lastSummaryTime = time.Now() +} + +// ToJSON converts summary to JSON string +func (s Summary) ToJSON() (string, error) { + bytes, err := json.Marshal(s) + if err != nil { + return "", err + } + return string(bytes), nil +} + +// ToJSONIndent converts summary to indented JSON string for readability +func (s Summary) ToJSONIndent() (string, error) { + bytes, err := json.MarshalIndent(s, "", " ") + if err != nil { + return "", err + } + return string(bytes), nil +} diff --git a/internal/cmd/monitor/reconciler/stats_test.go b/internal/cmd/monitor/reconciler/stats_test.go new file mode 100644 index 0000000000..028d7598eb --- /dev/null +++ b/internal/cmd/monitor/reconciler/stats_test.go @@ -0,0 +1,252 @@ +// Copyright (c) 2024-2026 SUSE LLC + +package reconciler + +import ( + "strings" + "testing" + "time" +) + +func TestResourceKey_String(t *testing.T) { + tests := []struct { + key ResourceKey + want string + }{ + { + key: ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "name"}, + want: "ns/name", + }, + { + key: ResourceKey{ResourceType: "Cluster", Namespace: "", Name: "cluster-a"}, + want: "cluster-a", + }, + } + for _, tt := range tests { + t.Run(tt.want, func(t *testing.T) { + got := tt.key.String() + if got != tt.want { + t.Errorf("String() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestStatsTracker_RecordEvent(t *testing.T) { + tracker := NewStatsTracker() + + tracker.RecordEvent("Bundle", "ns", "name", EventTypeCreate) + tracker.RecordEvent("Bundle", "ns", "name", EventTypeStatusChange) + tracker.RecordEvent("Bundle", "ns", "name", EventTypeCreate) // repeated + + key := ResourceKey{ResourceType: "Bundle", Namespace: 
"ns", Name: "name"} + stats := tracker.stats[key] + if stats == nil { + t.Fatal("expected stats entry for key") + } + if stats.Counts[EventTypeCreate] != 2 { + t.Errorf("Create count = %d, want 2", stats.Counts[EventTypeCreate]) + } + if stats.Counts[EventTypeStatusChange] != 1 { + t.Errorf("StatusChange count = %d, want 1", stats.Counts[EventTypeStatusChange]) + } + if stats.Total != 3 { + t.Errorf("Total = %d, want 3", stats.Total) + } +} + +func TestStatsTracker_RecordEvent_DifferentResources(t *testing.T) { + tracker := NewStatsTracker() + + tracker.RecordEvent("Bundle", "ns", "bundle-a", EventTypeCreate) + tracker.RecordEvent("Bundle", "ns", "bundle-b", EventTypeCreate) + tracker.RecordEvent("Cluster", "ns", "cluster-a", EventTypeStatusChange) + + if len(tracker.stats) != 3 { + t.Errorf("expected 3 stats entries, got %d", len(tracker.stats)) + } +} + +func TestStatsTracker_RecordTrigger(t *testing.T) { + tracker := NewStatsTracker() + + tracker.RecordTrigger("Bundle", "ns", "name", "Cluster") + tracker.RecordTrigger("Bundle", "ns", "name", "Cluster") + tracker.RecordTrigger("Bundle", "ns", "name", "BundleDeployment") + + key := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "name"} + stats := tracker.stats[key] + if stats == nil { + t.Fatal("expected stats entry for key") + } + if stats.TriggeredBy["Cluster"] != 2 { + t.Errorf("Cluster trigger count = %d, want 2", stats.TriggeredBy["Cluster"]) + } + if stats.TriggeredBy["BundleDeployment"] != 1 { + t.Errorf("BundleDeployment trigger count = %d, want 1", stats.TriggeredBy["BundleDeployment"]) + } + if stats.Total != 3 { + t.Errorf("Total = %d, want 3", stats.Total) + } +} + +func TestStatsTracker_GetSummary(t *testing.T) { + tracker := NewStatsTracker() + + tracker.RecordEvent("Bundle", "ns", "bundle-a", EventTypeCreate) + tracker.RecordEvent("Bundle", "ns", "bundle-b", EventTypeCreate) + tracker.RecordEvent("Cluster", "ns", "cluster-a", EventTypeStatusChange) + + summary := tracker.GetSummary() + + 
if len(summary.Summary) == 0 { + t.Error("expected non-empty summary") + } + + bundleStats := summary.Summary["Bundle"] + if bundleStats == nil { + t.Fatal("expected Bundle group in summary") + } + if len(bundleStats) != 2 { + t.Errorf("expected 2 Bundle entries in summary, got %d", len(bundleStats)) + } + + clusterStats := summary.Summary["Cluster"] + if clusterStats == nil { + t.Fatal("expected Cluster group in summary") + } + + if summary.Totals.TotalResourcesMonitored != 3 { + t.Errorf("TotalResourcesMonitored = %d, want 3", summary.Totals.TotalResourcesMonitored) + } + if summary.Totals.TotalEvents != 3 { + t.Errorf("TotalEvents = %d, want 3", summary.Totals.TotalEvents) + } +} + +func TestStatsTracker_GetSummary_DeepCopy(t *testing.T) { + tracker := NewStatsTracker() + tracker.RecordEvent("Bundle", "ns", "name", EventTypeCreate) + + summary := tracker.GetSummary() + + // Mutate the summary copy - should not affect tracker + bundleStats := summary.Summary["Bundle"]["ns/name"] + bundleStats.Counts[EventTypeCreate] = 999 + + key := ResourceKey{ResourceType: "Bundle", Namespace: "ns", Name: "name"} + if tracker.stats[key].Counts[EventTypeCreate] != 1 { + t.Error("mutating summary copy should not affect original tracker") + } +} + +func TestStatsTracker_Reset(t *testing.T) { + tracker := NewStatsTracker() + tracker.RecordEvent("Bundle", "ns", "name", EventTypeCreate) + + before := tracker.lastSummaryTime + time.Sleep(time.Millisecond) + tracker.Reset() + + if len(tracker.stats) != 0 { + t.Error("expected empty stats after Reset") + } + if !tracker.lastSummaryTime.After(before) { + t.Error("expected lastSummaryTime to be updated after Reset") + } +} + +func TestStatsTracker_UpdateLastSummaryTime(t *testing.T) { + tracker := NewStatsTracker() + before := tracker.lastSummaryTime + time.Sleep(time.Millisecond) + tracker.UpdateLastSummaryTime() + if !tracker.lastSummaryTime.After(before) { + t.Error("expected lastSummaryTime to be updated") + } +} + +func 
TestSummary_ToJSON(t *testing.T) { + tracker := NewStatsTracker() + tracker.RecordEvent("Bundle", "ns", "name", EventTypeCreate) + summary := tracker.GetSummary() + + jsonStr, err := summary.ToJSON() + if err != nil { + t.Fatalf("ToJSON() error = %v", err) + } + if jsonStr == "" { + t.Error("ToJSON() returned empty string") + } + if !strings.Contains(jsonStr, "Bundle") { + t.Error("expected JSON to contain 'Bundle'") + } + if !strings.Contains(jsonStr, "total_events") { + t.Error("expected JSON to contain 'total_events'") + } +} + +func TestSummary_ToJSONIndent(t *testing.T) { + tracker := NewStatsTracker() + tracker.RecordEvent("Cluster", "ns", "cluster-a", EventTypeStatusChange) + summary := tracker.GetSummary() + + jsonStr, err := summary.ToJSONIndent() + if err != nil { + t.Fatalf("ToJSONIndent() error = %v", err) + } + if !strings.Contains(jsonStr, "\n") { + t.Error("expected indented JSON to contain newlines") + } + if !strings.Contains(jsonStr, "Cluster") { + t.Error("expected JSON to contain 'Cluster'") + } +} + +func TestResourceStats_MarshalJSON(t *testing.T) { + rs := &ResourceStats{ + Counts: map[EventType]int64{ + EventTypeCreate: 3, + EventTypeStatusChange: 1, + }, + TriggeredBy: map[string]int64{ + "Cluster": 2, + }, + Total: 4, + } + + data, err := rs.MarshalJSON() + if err != nil { + t.Fatalf("MarshalJSON() error = %v", err) + } + jsonStr := string(data) + if !strings.Contains(jsonStr, "create") { + t.Error("expected JSON to contain 'create'") + } + if !strings.Contains(jsonStr, "total_events") { + t.Error("expected JSON to contain 'total_events'") + } + if !strings.Contains(jsonStr, "triggered-by") { + t.Error("expected JSON to contain 'triggered-by'") + } +} + +func TestResourceStats_MarshalJSON_ZeroCountsOmitted(t *testing.T) { + rs := &ResourceStats{ + Counts: map[EventType]int64{ + EventTypeCreate: 0, + }, + TriggeredBy: map[string]int64{}, + Total: 0, + } + + data, err := rs.MarshalJSON() + if err != nil { + t.Fatalf("MarshalJSON() error = 
%v", err) + } + // Zero-count events should be omitted + jsonStr := string(data) + if strings.Contains(jsonStr, "create") { + t.Error("expected zero-count events to be omitted from JSON") + } +} diff --git a/internal/cmd/monitor/root.go b/internal/cmd/monitor/root.go new file mode 100644 index 0000000000..eb70bb494b --- /dev/null +++ b/internal/cmd/monitor/root.go @@ -0,0 +1,376 @@ +// Package monitor starts the fleet monitor. +package monitor + +import ( + "flag" + "fmt" + "os" + "strconv" + "time" + + "github.com/spf13/cobra" + + ctrl "sigs.k8s.io/controller-runtime" + clog "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + command "github.com/rancher/fleet/internal/cmd" + "github.com/rancher/fleet/internal/cmd/monitor/reconciler" + "github.com/rancher/fleet/pkg/version" +) + +type FleetMonitor struct { + command.DebugConfig + Kubeconfig string `usage:"Kubeconfig file"` + Namespace string `usage:"namespace to watch" default:"cattle-fleet-system" env:"NAMESPACE"` + ShardID string `usage:"only monitor resources labeled with a specific shard ID" name:"shard-id"` + + // Controller toggles (env vars parsed manually in Run — see parseBoolEnv) + EnableBundleMonitor bool `usage:"Enable bundle monitoring"` + EnableBundleDeploymentMonitor bool `usage:"Enable bundledeployment monitoring"` + EnableClusterMonitor bool `usage:"Enable cluster monitoring"` + EnableGitRepoMonitor bool `usage:"Enable gitrepo monitoring"` + EnableHelmOpMonitor bool `usage:"Enable helmop monitoring"` + + // Per-controller logging modes (env vars parsed manually in Run — see parseBoolEnv) + BundleDetailedLogs bool `usage:"Enable detailed logging for Bundle controller"` + BundleDeploymentDetailedLogs bool `usage:"Enable detailed logging for BundleDeployment controller"` + ClusterDetailedLogs bool `usage:"Enable detailed logging for Cluster controller"` + GitRepoDetailedLogs bool `usage:"Enable detailed logging for GitRepo controller"` + HelmOpDetailedLogs bool 
`usage:"Enable detailed logging for HelmOp controller"` + + // Bundle event filters (env vars parsed manually in Run — see parseBoolEnv) + BundleEventFilterGenerationChange bool `usage:"Show generation-change events for Bundle"` + BundleEventFilterStatusChange bool `usage:"Show status-change events for Bundle"` + BundleEventFilterAnnotationChange bool `usage:"Show annotation-change events for Bundle"` + BundleEventFilterLabelChange bool `usage:"Show label-change events for Bundle"` + BundleEventFilterResourceVersionChange bool `usage:"Show resourceversion-change events for Bundle"` + BundleEventFilterDeletion bool `usage:"Show deletion events for Bundle"` + BundleEventFilterNotFound bool `usage:"Show not-found events for Bundle"` + BundleEventFilterCreate bool `usage:"Show create events for Bundle"` + BundleEventFilterTriggeredBy bool `usage:"Show triggered-by events for Bundle"` + + // BundleDeployment event filters (env vars parsed manually in Run — see parseBoolEnv) + BundleDeploymentEventFilterGenerationChange bool `usage:"Show generation-change events for BundleDeployment"` + BundleDeploymentEventFilterStatusChange bool `usage:"Show status-change events for BundleDeployment"` + BundleDeploymentEventFilterAnnotationChange bool `usage:"Show annotation-change events for BundleDeployment"` + BundleDeploymentEventFilterLabelChange bool `usage:"Show label-change events for BundleDeployment"` + BundleDeploymentEventFilterResourceVersionChange bool `usage:"Show resourceversion-change events for BundleDeployment"` + BundleDeploymentEventFilterDeletion bool `usage:"Show deletion events for BundleDeployment"` + BundleDeploymentEventFilterNotFound bool `usage:"Show not-found events for BundleDeployment"` + BundleDeploymentEventFilterCreate bool `usage:"Show create events for BundleDeployment"` + BundleDeploymentEventFilterTriggeredBy bool `usage:"Show triggered-by events for BundleDeployment"` + + // Cluster event filters (env vars parsed manually in Run — see 
parseBoolEnv) + ClusterEventFilterGenerationChange bool `usage:"Show generation-change events for Cluster"` + ClusterEventFilterStatusChange bool `usage:"Show status-change events for Cluster"` + ClusterEventFilterAnnotationChange bool `usage:"Show annotation-change events for Cluster"` + ClusterEventFilterLabelChange bool `usage:"Show label-change events for Cluster"` + ClusterEventFilterResourceVersionChange bool `usage:"Show resourceversion-change events for Cluster"` + ClusterEventFilterDeletion bool `usage:"Show deletion events for Cluster"` + ClusterEventFilterNotFound bool `usage:"Show not-found events for Cluster"` + ClusterEventFilterCreate bool `usage:"Show create events for Cluster"` + ClusterEventFilterTriggeredBy bool `usage:"Show triggered-by events for Cluster"` + + // GitRepo event filters (env vars parsed manually in Run — see parseBoolEnv) + GitRepoEventFilterGenerationChange bool `usage:"Show generation-change events for GitRepo"` + GitRepoEventFilterStatusChange bool `usage:"Show status-change events for GitRepo"` + GitRepoEventFilterAnnotationChange bool `usage:"Show annotation-change events for GitRepo"` + GitRepoEventFilterLabelChange bool `usage:"Show label-change events for GitRepo"` + GitRepoEventFilterResourceVersionChange bool `usage:"Show resourceversion-change events for GitRepo"` + GitRepoEventFilterDeletion bool `usage:"Show deletion events for GitRepo"` + GitRepoEventFilterNotFound bool `usage:"Show not-found events for GitRepo"` + GitRepoEventFilterCreate bool `usage:"Show create events for GitRepo"` + GitRepoEventFilterTriggeredBy bool `usage:"Show triggered-by events for GitRepo"` + + // HelmOp event filters (env vars parsed manually in Run — see parseBoolEnv) + HelmOpEventFilterGenerationChange bool `usage:"Show generation-change events for HelmOp"` + HelmOpEventFilterStatusChange bool `usage:"Show status-change events for HelmOp"` + HelmOpEventFilterAnnotationChange bool `usage:"Show annotation-change events for HelmOp"` + 
HelmOpEventFilterLabelChange bool `usage:"Show label-change events for HelmOp"` + HelmOpEventFilterResourceVersionChange bool `usage:"Show resourceversion-change events for HelmOp"` + HelmOpEventFilterDeletion bool `usage:"Show deletion events for HelmOp"` + HelmOpEventFilterNotFound bool `usage:"Show not-found events for HelmOp"` + HelmOpEventFilterCreate bool `usage:"Show create events for HelmOp"` + HelmOpEventFilterTriggeredBy bool `usage:"Show triggered-by events for HelmOp"` + + SummaryInterval string `usage:"How often to print summary (e.g., 5s, 30s, 1m)" env:"FLEET_EVENT_MONITOR_SUMMARY_INTERVAL" default:"30s"` + SummaryReset bool `usage:"Reset counters after each summary"` +} + +type MonitorReconcilerWorkers struct { + Bundle int + BundleDeployment int + Cluster int + GitRepo int + HelmOp int +} + +var ( + setupLog = ctrl.Log.WithName("setup") + zopts = &zap.Options{ + Development: true, + } +) + +func (f *FleetMonitor) PersistentPre(_ *cobra.Command, _ []string) error { + if err := f.SetupDebug(); err != nil { + return fmt.Errorf("failed to setup debug logging: %w", err) + } + zopts = f.OverrideZapOpts(zopts) + + return nil +} + +func (f *FleetMonitor) Run(cmd *cobra.Command, args []string) error { + ctrl.SetLogger(zap.New(zap.UseFlagOptions(zopts))) + ctx := clog.IntoContext(cmd.Context(), ctrl.Log) + + kubeconfig := ctrl.GetConfigOrDie() + workersOpts := MonitorReconcilerWorkers{} + + leaderOpts, err := command.NewLeaderElectionOptions() + if err != nil { + return err + } + + if d := os.Getenv("BUNDLE_RECONCILER_WORKERS"); d != "" { + w, err := strconv.Atoi(d) + if err != nil { + setupLog.Error(err, "failed to parse BUNDLE_RECONCILER_WORKERS", "value", d) + } + workersOpts.Bundle = w + } + + if d := os.Getenv("BUNDLEDEPLOYMENT_RECONCILER_WORKERS"); d != "" { + w, err := strconv.Atoi(d) + if err != nil { + setupLog.Error(err, "failed to parse BUNDLEDEPLOYMENT_RECONCILER_WORKERS", "value", d) + } + workersOpts.BundleDeployment = w + } + + if d := 
os.Getenv("CLUSTER_RECONCILER_WORKERS"); d != "" { + w, err := strconv.Atoi(d) + if err != nil { + setupLog.Error(err, "failed to parse CLUSTER_RECONCILER_WORKERS", "value", d) + } + workersOpts.Cluster = w + } + + if d := os.Getenv("GITREPO_RECONCILER_WORKERS"); d != "" { + w, err := strconv.Atoi(d) + if err != nil { + setupLog.Error(err, "failed to parse GITREPO_RECONCILER_WORKERS", "value", d) + } + workersOpts.GitRepo = w + } + + if d := os.Getenv("HELMOP_RECONCILER_WORKERS"); d != "" { + w, err := strconv.Atoi(d) + if err != nil { + setupLog.Error(err, "failed to parse HELMOP_RECONCILER_WORKERS", "value", d) + } + workersOpts.HelmOp = w + } + + // The wrangler command framework does not reliably parse boolean env vars, + // so all boolean env vars are parsed manually here. The struct fields + // above intentionally omit env: tags to avoid a dual source of truth. + parseBoolEnv := func(key string, defaultValue bool) bool { + if val := os.Getenv(key); val != "" { + b, err := strconv.ParseBool(val) + if err != nil { + setupLog.Error(err, "failed to parse boolean env var", "key", key, "value", val) + return defaultValue + } + return b + } + return defaultValue + } + + // Parse controller enable flags + enableBundle := parseBoolEnv("ENABLE_BUNDLE_EVENT_MONITOR", f.EnableBundleMonitor) + enableBundleDeployment := parseBoolEnv("ENABLE_BUNDLEDEPLOYMENT_EVENT_MONITOR", f.EnableBundleDeploymentMonitor) + enableCluster := parseBoolEnv("ENABLE_CLUSTER_EVENT_MONITOR", f.EnableClusterMonitor) + enableGitRepo := parseBoolEnv("ENABLE_GITREPO_EVENT_MONITOR", f.EnableGitRepoMonitor) + enableHelmOp := parseBoolEnv("ENABLE_HELMOP_EVENT_MONITOR", f.EnableHelmOpMonitor) + + bundleDetailed := parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_DETAILED", f.BundleDetailedLogs) + bundleDeploymentDetailed := parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_DETAILED", f.BundleDeploymentDetailedLogs) + clusterDetailed := parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_DETAILED", f.ClusterDetailedLogs) 
+ gitRepoDetailed := parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_DETAILED", f.GitRepoDetailedLogs) + helmOpDetailed := parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_DETAILED", f.HelmOpDetailedLogs) + + // Parse event filters for each controller + bundleEventFilters := reconciler.EventTypeFilters{ + GenerationChange: parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_EVENT_GENERATION_CHANGE", f.BundleEventFilterGenerationChange), + StatusChange: parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_EVENT_STATUS_CHANGE", f.BundleEventFilterStatusChange), + AnnotationChange: parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_EVENT_ANNOTATION_CHANGE", f.BundleEventFilterAnnotationChange), + LabelChange: parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_EVENT_LABEL_CHANGE", f.BundleEventFilterLabelChange), + ResourceVersionChange: parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_EVENT_RESVER_CHANGE", f.BundleEventFilterResourceVersionChange), + Deletion: parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_EVENT_DELETION", f.BundleEventFilterDeletion), + NotFound: parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_EVENT_NOT_FOUND", f.BundleEventFilterNotFound), + Create: parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_EVENT_CREATE", f.BundleEventFilterCreate), + TriggeredBy: parseBoolEnv("FLEET_EVENT_MONITOR_BUNDLE_EVENT_TRIGGERED_BY", f.BundleEventFilterTriggeredBy), + } + + bundleDeploymentEventFilters := reconciler.EventTypeFilters{ + GenerationChange: parseBoolEnv("FLEET_EVENT_MONITOR_BD_EVENT_GENERATION_CHANGE", f.BundleDeploymentEventFilterGenerationChange), + StatusChange: parseBoolEnv("FLEET_EVENT_MONITOR_BD_EVENT_STATUS_CHANGE", f.BundleDeploymentEventFilterStatusChange), + AnnotationChange: parseBoolEnv("FLEET_EVENT_MONITOR_BD_EVENT_ANNOTATION_CHANGE", f.BundleDeploymentEventFilterAnnotationChange), + LabelChange: parseBoolEnv("FLEET_EVENT_MONITOR_BD_EVENT_LABEL_CHANGE", f.BundleDeploymentEventFilterLabelChange), + ResourceVersionChange: parseBoolEnv("FLEET_EVENT_MONITOR_BD_EVENT_RESVER_CHANGE", 
f.BundleDeploymentEventFilterResourceVersionChange), + Deletion: parseBoolEnv("FLEET_EVENT_MONITOR_BD_EVENT_DELETION", f.BundleDeploymentEventFilterDeletion), + NotFound: parseBoolEnv("FLEET_EVENT_MONITOR_BD_EVENT_NOT_FOUND", f.BundleDeploymentEventFilterNotFound), + Create: parseBoolEnv("FLEET_EVENT_MONITOR_BD_EVENT_CREATE", f.BundleDeploymentEventFilterCreate), + TriggeredBy: parseBoolEnv("FLEET_EVENT_MONITOR_BD_EVENT_TRIGGERED_BY", f.BundleDeploymentEventFilterTriggeredBy), + } + + clusterEventFilters := reconciler.EventTypeFilters{ + GenerationChange: parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_EVENT_GENERATION_CHANGE", f.ClusterEventFilterGenerationChange), + StatusChange: parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_EVENT_STATUS_CHANGE", f.ClusterEventFilterStatusChange), + AnnotationChange: parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_EVENT_ANNOTATION_CHANGE", f.ClusterEventFilterAnnotationChange), + LabelChange: parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_EVENT_LABEL_CHANGE", f.ClusterEventFilterLabelChange), + ResourceVersionChange: parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_EVENT_RESVER_CHANGE", f.ClusterEventFilterResourceVersionChange), + Deletion: parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_EVENT_DELETION", f.ClusterEventFilterDeletion), + NotFound: parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_EVENT_NOT_FOUND", f.ClusterEventFilterNotFound), + Create: parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_EVENT_CREATE", f.ClusterEventFilterCreate), + TriggeredBy: parseBoolEnv("FLEET_EVENT_MONITOR_CLUSTER_EVENT_TRIGGERED_BY", f.ClusterEventFilterTriggeredBy), + } + + gitRepoEventFilters := reconciler.EventTypeFilters{ + GenerationChange: parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_EVENT_GENERATION_CHANGE", f.GitRepoEventFilterGenerationChange), + StatusChange: parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_EVENT_STATUS_CHANGE", f.GitRepoEventFilterStatusChange), + AnnotationChange: parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_EVENT_ANNOTATION_CHANGE", f.GitRepoEventFilterAnnotationChange), + 
LabelChange: parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_EVENT_LABEL_CHANGE", f.GitRepoEventFilterLabelChange), + ResourceVersionChange: parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_EVENT_RESVER_CHANGE", f.GitRepoEventFilterResourceVersionChange), + Deletion: parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_EVENT_DELETION", f.GitRepoEventFilterDeletion), + NotFound: parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_EVENT_NOT_FOUND", f.GitRepoEventFilterNotFound), + Create: parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_EVENT_CREATE", f.GitRepoEventFilterCreate), + TriggeredBy: parseBoolEnv("FLEET_EVENT_MONITOR_GITREPO_EVENT_TRIGGERED_BY", f.GitRepoEventFilterTriggeredBy), + } + + helmOpEventFilters := reconciler.EventTypeFilters{ + GenerationChange: parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_EVENT_GENERATION_CHANGE", f.HelmOpEventFilterGenerationChange), + StatusChange: parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_EVENT_STATUS_CHANGE", f.HelmOpEventFilterStatusChange), + AnnotationChange: parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_EVENT_ANNOTATION_CHANGE", f.HelmOpEventFilterAnnotationChange), + LabelChange: parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_EVENT_LABEL_CHANGE", f.HelmOpEventFilterLabelChange), + ResourceVersionChange: parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_EVENT_RESVER_CHANGE", f.HelmOpEventFilterResourceVersionChange), + Deletion: parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_EVENT_DELETION", f.HelmOpEventFilterDeletion), + NotFound: parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_EVENT_NOT_FOUND", f.HelmOpEventFilterNotFound), + Create: parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_EVENT_CREATE", f.HelmOpEventFilterCreate), + TriggeredBy: parseBoolEnv("FLEET_EVENT_MONITOR_HELMOP_EVENT_TRIGGERED_BY", f.HelmOpEventFilterTriggeredBy), + } + + // Parse resource filters for each controller + bundleResourceFilter := &reconciler.ResourceFilter{ + NamespacePattern: os.Getenv("FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAMESPACE"), + NamePattern: os.Getenv("FLEET_EVENT_MONITOR_BUNDLE_RESOURCE_FILTER_NAME"), + } 
+ + bundleDeploymentResourceFilter := &reconciler.ResourceFilter{ + NamespacePattern: os.Getenv("FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_RESOURCE_FILTER_NAMESPACE"), + NamePattern: os.Getenv("FLEET_EVENT_MONITOR_BUNDLEDEPLOYMENT_RESOURCE_FILTER_NAME"), + } + + clusterResourceFilter := &reconciler.ResourceFilter{ + NamespacePattern: os.Getenv("FLEET_EVENT_MONITOR_CLUSTER_RESOURCE_FILTER_NAMESPACE"), + NamePattern: os.Getenv("FLEET_EVENT_MONITOR_CLUSTER_RESOURCE_FILTER_NAME"), + } + + gitRepoResourceFilter := &reconciler.ResourceFilter{ + NamespacePattern: os.Getenv("FLEET_EVENT_MONITOR_GITREPO_RESOURCE_FILTER_NAMESPACE"), + NamePattern: os.Getenv("FLEET_EVENT_MONITOR_GITREPO_RESOURCE_FILTER_NAME"), + } + + helmOpResourceFilter := &reconciler.ResourceFilter{ + NamespacePattern: os.Getenv("FLEET_EVENT_MONITOR_HELMOP_RESOURCE_FILTER_NAMESPACE"), + NamePattern: os.Getenv("FLEET_EVENT_MONITOR_HELMOP_RESOURCE_FILTER_NAME"), + } + + // Log the parsed configuration for debugging + setupLog.Info("parsed per-controller logging configuration", + "bundle", bundleDetailed, + "bundleDeployment", bundleDeploymentDetailed, + "cluster", clusterDetailed, + "gitRepo", gitRepoDetailed, + "helmOp", helmOpDetailed, + ) + + // Parse summary interval + summaryInterval, err := time.ParseDuration(f.SummaryInterval) + if err != nil { + setupLog.Error(err, "invalid summary interval, using default 30s", "value", f.SummaryInterval) + summaryInterval = 30 * time.Second + } + + monitorOpts := MonitorOptions{ + EnableBundle: enableBundle, + EnableBundleDeployment: enableBundleDeployment, + EnableCluster: enableCluster, + EnableGitRepo: enableGitRepo, + EnableHelmOp: enableHelmOp, + Workers: workersOpts, + + // Per-controller logging configuration + ControllerLogging: ControllerLoggingConfig{ + Bundle: ControllerLogConfig{ + Detailed: bundleDetailed, + EventFilters: bundleEventFilters, + ResourceFilter: bundleResourceFilter, + }, + BundleDeployment: ControllerLogConfig{ + Detailed: 
bundleDeploymentDetailed, + EventFilters: bundleDeploymentEventFilters, + ResourceFilter: bundleDeploymentResourceFilter, + }, + Cluster: ControllerLogConfig{ + Detailed: clusterDetailed, + EventFilters: clusterEventFilters, + ResourceFilter: clusterResourceFilter, + }, + GitRepo: ControllerLogConfig{ + Detailed: gitRepoDetailed, + EventFilters: gitRepoEventFilters, + ResourceFilter: gitRepoResourceFilter, + }, + HelmOp: ControllerLogConfig{ + Detailed: helmOpDetailed, + EventFilters: helmOpEventFilters, + ResourceFilter: helmOpResourceFilter, + }, + }, + + SummaryInterval: summaryInterval, + SummaryReset: parseBoolEnv("FLEET_EVENT_MONITOR_SUMMARY_RESET", f.SummaryReset), + } + + if err := start( + ctx, + f.Namespace, + kubeconfig, + leaderOpts, + monitorOpts, + f.ShardID, + ); err != nil { + return err + } + + <-cmd.Context().Done() + return nil +} + +func App() *cobra.Command { + root := command.Command(&FleetMonitor{}, cobra.Command{ + Version: version.FriendlyVersion(), + Use: "fleeteventmonitor", + Short: "Fleet read-only monitoring controllers", + }) + fs := flag.NewFlagSet("", flag.ExitOnError) + zopts.BindFlags(fs) + ctrl.RegisterFlags(fs) + root.Flags().AddGoFlagSet(fs) + + return root +} diff --git a/package/Dockerfile.event-monitor b/package/Dockerfile.event-monitor new file mode 100644 index 0000000000..758f3d95a1 --- /dev/null +++ b/package/Dockerfile.event-monitor @@ -0,0 +1,29 @@ +ARG BUILD_ENV=dapper +ARG ARCH + +FROM --platform=linux/$ARCH registry.suse.com/bci/bci-base:15.7 AS base +RUN zypper rm -y container-suseconnect && \ + zypper ar --priority=500 https://download.opensuse.org/repositories/Virtualization:containers/5.5/Virtualization:containers.repo && \ + zypper --gpg-auto-import-keys ref && \ + zypper -n update && \ + zypper -n install --no-recommends tini && \ + zypper -n clean -a && \ + rm -fr /var/log/zypp* /usr/share/doc + +FROM base AS copy_dapper +ONBUILD ARG ARCH +ONBUILD COPY bin/fleeteventmonitor-linux-$ARCH 
/usr/bin/fleeteventmonitor + +FROM base AS copy_buildx +ONBUILD ARG TARGETARCH +ONBUILD COPY bin/fleeteventmonitor-linux-$TARGETARCH /usr/bin/fleeteventmonitor + +FROM base AS copy_goreleaser +ONBUILD ARG ARCH +ONBUILD COPY fleeteventmonitor-linux-$ARCH /usr/bin/fleeteventmonitor + +FROM copy_${BUILD_ENV} +RUN useradd -u 1000 user +USER 1000 +ENTRYPOINT ["tini", "--"] +CMD ["fleeteventmonitor"] From 868a1fa83088203a1750930f660f360f703b02cc Mon Sep 17 00:00:00 2001 From: Xavi Garcia Date: Fri, 13 Mar 2026 11:26:05 +0100 Subject: [PATCH 2/3] linter issues fixed Signed-off-by: Xavi Garcia --- cmd/fleeteventmonitor/main.go | 2 -- internal/cmd/monitor/operator.go | 10 +++++----- internal/cmd/monitor/reconciler/bundle_monitor.go | 2 +- internal/cmd/monitor/reconciler/bundle_query.go | 5 +---- .../cmd/monitor/reconciler/bundledeployment_monitor.go | 2 +- internal/cmd/monitor/reconciler/cluster_monitor.go | 2 +- internal/cmd/monitor/reconciler/gitrepo_monitor.go | 6 ++---- internal/cmd/monitor/reconciler/helmop_monitor.go | 2 +- internal/cmd/monitor/reconciler/predicate_test.go | 2 +- internal/cmd/monitor/reconciler/stats_test.go | 2 +- 10 files changed, 14 insertions(+), 21 deletions(-) diff --git a/cmd/fleeteventmonitor/main.go b/cmd/fleeteventmonitor/main.go index 719c90a732..ca3e6be2ca 100644 --- a/cmd/fleeteventmonitor/main.go +++ b/cmd/fleeteventmonitor/main.go @@ -2,8 +2,6 @@ package main import ( - _ "net/http/pprof" - "github.com/rancher/wrangler/v3/pkg/signals" "github.com/sirupsen/logrus" diff --git a/internal/cmd/monitor/operator.go b/internal/cmd/monitor/operator.go index b88c0ba8a3..483ff62259 100644 --- a/internal/cmd/monitor/operator.go +++ b/internal/cmd/monitor/operator.go @@ -233,19 +233,19 @@ func start( // Returns error if any pattern is invalid func compileResourceFilters(cfg *ControllerLoggingConfig) error { if err := cfg.Bundle.ResourceFilter.Compile(); err != nil { - return fmt.Errorf("Bundle resource filter: %w", err) + return fmt.Errorf("bundle 
resource filter: %w", err) } if err := cfg.BundleDeployment.ResourceFilter.Compile(); err != nil { - return fmt.Errorf("BundleDeployment resource filter: %w", err) + return fmt.Errorf("bundleDeployment resource filter: %w", err) } if err := cfg.Cluster.ResourceFilter.Compile(); err != nil { - return fmt.Errorf("Cluster resource filter: %w", err) + return fmt.Errorf("cluster resource filter: %w", err) } if err := cfg.GitRepo.ResourceFilter.Compile(); err != nil { - return fmt.Errorf("GitRepo resource filter: %w", err) + return fmt.Errorf("gitRepo resource filter: %w", err) } if err := cfg.HelmOp.ResourceFilter.Compile(); err != nil { - return fmt.Errorf("HelmOp resource filter: %w", err) + return fmt.Errorf("helmOp resource filter: %w", err) } return nil } diff --git a/internal/cmd/monitor/reconciler/bundle_monitor.go b/internal/cmd/monitor/reconciler/bundle_monitor.go index 38b43b0575..cee54478df 100644 --- a/internal/cmd/monitor/reconciler/bundle_monitor.go +++ b/internal/cmd/monitor/reconciler/bundle_monitor.go @@ -200,7 +200,7 @@ func (r *BundleMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Reques logger := log.FromContext(ctx).WithName("bundle-monitor") logger = logger.WithValues( - "bundle", req.NamespacedName.String(), + "bundle", req.String(), "mode", LogMode(r.DetailedLogs), ) ctx = log.IntoContext(ctx, logger) diff --git a/internal/cmd/monitor/reconciler/bundle_query.go b/internal/cmd/monitor/reconciler/bundle_query.go index 612b137173..0208a79a2f 100644 --- a/internal/cmd/monitor/reconciler/bundle_query.go +++ b/internal/cmd/monitor/reconciler/bundle_query.go @@ -5,8 +5,8 @@ package reconciler import ( "context" - fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "github.com/rancher/fleet/internal/cmd/controller/target/matcher" + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -80,7 +80,6 @@ func (q *bundleQueryImpl) 
getBundlesInScopeForCluster(ctx context.Context, clust return nil, err } for _, b := range bundles.Items { - b := b if b.Annotations["objectset.rio.cattle.io/id"] == "fleet-manage-agent" { if b.Name == "fleet-agent-"+cluster.Name { bundleSet.insertSingle(&b) @@ -99,7 +98,6 @@ func (q *bundleQueryImpl) getBundlesInScopeForCluster(ctx context.Context, clust logger := log.FromContext(ctx).WithName("bundle-query") for _, mapping := range mappings.Items { - mapping := mapping matcher, err := newBundleMapping(&mapping) if err != nil { logger.Error(err, "invalid BundleNamespaceMapping, skipping", "namespace", mapping.Namespace, "name", mapping.Name) @@ -127,7 +125,6 @@ func (q *bundleQueryImpl) clusterGroupsForCluster(ctx context.Context, cluster * logger := log.FromContext(ctx).WithName("bundle-query") for _, cg := range cgs.Items { - cg := cg if cg.Spec.Selector == nil { continue } diff --git a/internal/cmd/monitor/reconciler/bundledeployment_monitor.go b/internal/cmd/monitor/reconciler/bundledeployment_monitor.go index 36cede5df7..1d2ae7f92c 100644 --- a/internal/cmd/monitor/reconciler/bundledeployment_monitor.go +++ b/internal/cmd/monitor/reconciler/bundledeployment_monitor.go @@ -54,7 +54,7 @@ func (r *BundleDeploymentMonitorReconciler) Reconcile(ctx context.Context, req c logger := log.FromContext(ctx).WithName("bundledeployment-monitor") logger = logger.WithValues( - "bundledeployment", req.NamespacedName.String(), + "bundledeployment", req.String(), ) ctx = log.IntoContext(ctx, logger) diff --git a/internal/cmd/monitor/reconciler/cluster_monitor.go b/internal/cmd/monitor/reconciler/cluster_monitor.go index 58046e18bf..5024062aed 100644 --- a/internal/cmd/monitor/reconciler/cluster_monitor.go +++ b/internal/cmd/monitor/reconciler/cluster_monitor.go @@ -99,7 +99,7 @@ func (r *ClusterMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Reque logger := log.FromContext(ctx).WithName("cluster-monitor") logger = logger.WithValues( - "cluster", 
req.NamespacedName.String(), + "cluster", req.String(), ) ctx = log.IntoContext(ctx, logger) diff --git a/internal/cmd/monitor/reconciler/gitrepo_monitor.go b/internal/cmd/monitor/reconciler/gitrepo_monitor.go index 0a74bc965a..c7eeddb8f9 100644 --- a/internal/cmd/monitor/reconciler/gitrepo_monitor.go +++ b/internal/cmd/monitor/reconciler/gitrepo_monitor.go @@ -5,8 +5,8 @@ package reconciler import ( "context" - fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "github.com/rancher/fleet/internal/config" + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "github.com/rancher/fleet/pkg/sharding" batchv1 "k8s.io/api/batch/v1" @@ -153,7 +153,7 @@ func (r *GitRepoMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Reque logger := log.FromContext(ctx).WithName("gitrepo-monitor") logger = logger.WithValues( - "gitrepo", req.NamespacedName.String(), + "gitrepo", req.String(), "mode", LogMode(r.DetailedLogs)) ctx = log.IntoContext(ctx, logger) @@ -172,8 +172,6 @@ func (r *GitRepoMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Reque if gitrepo.Labels[fleet.RepoLabel] != "" { logger = logger.WithValues("repo", gitrepo.Labels[fleet.RepoLabel]) } - ctx = log.IntoContext(ctx, logger) - // Check for deletion if !gitrepo.DeletionTimestamp.IsZero() { logDeletion(logger, r.DetailedLogs, r.EventFilters, "GitRepo", gitrepo.Namespace, gitrepo.Name, gitrepo.DeletionTimestamp.String()) diff --git a/internal/cmd/monitor/reconciler/helmop_monitor.go b/internal/cmd/monitor/reconciler/helmop_monitor.go index 8990ec4dd4..74e3bc168d 100644 --- a/internal/cmd/monitor/reconciler/helmop_monitor.go +++ b/internal/cmd/monitor/reconciler/helmop_monitor.go @@ -64,7 +64,7 @@ func (r *HelmOpMonitorReconciler) Reconcile(ctx context.Context, req ctrl.Reques logger := log.FromContext(ctx).WithName("helmop-monitor") logger = logger.WithValues( - "helmop", req.NamespacedName.String(), + "helmop", req.String(), ) ctx = log.IntoContext(ctx, logger) diff 
--git a/internal/cmd/monitor/reconciler/predicate_test.go b/internal/cmd/monitor/reconciler/predicate_test.go index 6d2db35e6c..a47fe3e74e 100644 --- a/internal/cmd/monitor/reconciler/predicate_test.go +++ b/internal/cmd/monitor/reconciler/predicate_test.go @@ -218,7 +218,7 @@ func TestNonSecretAnnotationChangedPredicate(t *testing.T) { t.Run("Update with secret hash and regular annotation change returns true", func(t *testing.T) { old := &fleet.GitRepo{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ "fleet.cattle.io/client-secret-hash": "old-hash", - "app": "v1", + "app": "v1", }}} new := old.DeepCopy() new.Annotations["fleet.cattle.io/client-secret-hash"] = "new-hash" diff --git a/internal/cmd/monitor/reconciler/stats_test.go b/internal/cmd/monitor/reconciler/stats_test.go index 028d7598eb..0eef30a3b3 100644 --- a/internal/cmd/monitor/reconciler/stats_test.go +++ b/internal/cmd/monitor/reconciler/stats_test.go @@ -206,7 +206,7 @@ func TestSummary_ToJSONIndent(t *testing.T) { func TestResourceStats_MarshalJSON(t *testing.T) { rs := &ResourceStats{ Counts: map[EventType]int64{ - EventTypeCreate: 3, + EventTypeCreate: 3, EventTypeStatusChange: 1, }, TriggeredBy: map[string]int64{ From 7738a89a66602671b6c51e9485e8eea0be35b420 Mon Sep 17 00:00:00 2001 From: Xavi Garcia Date: Fri, 13 Mar 2026 12:58:00 +0100 Subject: [PATCH 3/3] Code review changes Signed-off-by: Xavi Garcia --- EVENT-MONITOR.md | 2 +- .../templates/deployment.yaml | 2 + dev/build-event-monitor | 2 +- internal/cmd/monitor/operator.go | 22 +++++++--- internal/cmd/monitor/root.go | 42 +++++++++++-------- package/Dockerfile.event-monitor | 3 +- 6 files changed, 46 insertions(+), 27 deletions(-) diff --git a/EVENT-MONITOR.md b/EVENT-MONITOR.md index d03ed1018a..0883008ed8 100644 --- a/EVENT-MONITOR.md +++ b/EVENT-MONITOR.md @@ -459,7 +459,7 @@ logging: statusChange: false annotationChange: false labelChange: false - resourceVersionChange: true + resourceVersionChange: false deletion: false 
notFound: false create: false diff --git a/charts/fleet-event-monitor/templates/deployment.yaml b/charts/fleet-event-monitor/templates/deployment.yaml index 0291111372..f26723e06d 100644 --- a/charts/fleet-event-monitor/templates/deployment.yaml +++ b/charts/fleet-event-monitor/templates/deployment.yaml @@ -69,6 +69,8 @@ spec: - name: CATTLE_DEV_MODE value: "true" {{- end }} + - name: FLEET_LEADER_ELECTION_ENABLED + value: {{ .Values.leaderElection.enabled | quote }} {{- if .Values.leaderElection.enabled }} - name: CATTLE_ELECTION_LEASE_DURATION value: {{ .Values.leaderElection.leaseDuration | quote }} diff --git a/dev/build-event-monitor b/dev/build-event-monitor index a8a183ec16..c0803a2c72 100755 --- a/dev/build-event-monitor +++ b/dev/build-event-monitor @@ -12,7 +12,7 @@ export GOARCH="${GOARCH:-amd64}" export CGO_ENABLED=0 # re-generate code -if ! git diff --quiet HEAD origin/main -- pkg/apis/fleet.cattle.io/v1alpha1; then +if git rev-parse --verify origin/main >/dev/null 2>&1 && ! 
git diff --quiet HEAD origin/main -- pkg/apis/fleet.cattle.io/v1alpha1; then go generate fi diff --git a/internal/cmd/monitor/operator.go b/internal/cmd/monitor/operator.go index 483ff62259..c1c8c11b4a 100644 --- a/internal/cmd/monitor/operator.go +++ b/internal/cmd/monitor/operator.go @@ -55,6 +55,8 @@ type MonitorOptions struct { EnableHelmOp bool Workers MonitorReconcilerWorkers + LeaderElectionEnabled bool + // Per-controller logging configuration ControllerLogging ControllerLoggingConfig @@ -107,17 +109,20 @@ func start( leaderElectionSuffix = fmt.Sprintf("-%s", shardID) } - mgr, err := ctrl.NewManager(config, ctrl.Options{ + managerOpts := ctrl.Options{ Scheme: scheme, Metrics: metricServerOptions, HealthProbeBindAddress: "0", // No health probes - LeaderElection: true, + LeaderElection: monitorOpts.LeaderElectionEnabled, LeaderElectionID: fmt.Sprintf("fleet-event-monitor-leader-election-shard%s", leaderElectionSuffix), LeaderElectionNamespace: systemNamespace, - LeaseDuration: &leaderOpts.LeaseDuration, - RenewDeadline: &leaderOpts.RenewDeadline, - RetryPeriod: &leaderOpts.RetryPeriod, - }) + } + if monitorOpts.LeaderElectionEnabled { + managerOpts.LeaseDuration = &leaderOpts.LeaseDuration + managerOpts.RenewDeadline = &leaderOpts.RenewDeadline + managerOpts.RetryPeriod = &leaderOpts.RetryPeriod + } + mgr, err := ctrl.NewManager(config, managerOpts) if err != nil { setupLog.Error(err, "unable to start manager") return err @@ -271,6 +276,11 @@ func logResourceFilters(cfg *ControllerLoggingConfig) { // startSummaryPrinter periodically prints statistics summary func startSummaryPrinter(ctx context.Context, interval time.Duration, reset bool) { + const defaultInterval = 30 * time.Second + if interval <= 0 { + setupLog.Info("summary interval is zero or negative, using default", "default", defaultInterval) + interval = defaultInterval + } ticker := time.NewTicker(interval) defer ticker.Stop() diff --git a/internal/cmd/monitor/root.go b/internal/cmd/monitor/root.go 
index eb70bb494b..9c55347f68 100644 --- a/internal/cmd/monitor/root.go +++ b/internal/cmd/monitor/root.go @@ -129,9 +129,29 @@ func (f *FleetMonitor) Run(cmd *cobra.Command, args []string) error { kubeconfig := ctrl.GetConfigOrDie() workersOpts := MonitorReconcilerWorkers{} - leaderOpts, err := command.NewLeaderElectionOptions() - if err != nil { - return err + // The wrangler command framework does not reliably parse boolean env vars, + // so all boolean env vars are parsed manually here. The struct fields + // above intentionally omit env: tags to avoid a dual source of truth. + parseBoolEnv := func(key string, defaultValue bool) bool { + if val := os.Getenv(key); val != "" { + b, err := strconv.ParseBool(val) + if err != nil { + setupLog.Error(err, "failed to parse boolean env var", "key", key, "value", val) + return defaultValue + } + return b + } + return defaultValue + } + + leaderElectionEnabled := parseBoolEnv("FLEET_LEADER_ELECTION_ENABLED", true) + var leaderOpts command.LeaderElectionOptions + if leaderElectionEnabled { + var err error + leaderOpts, err = command.NewLeaderElectionOptions() + if err != nil { + return err + } } if d := os.Getenv("BUNDLE_RECONCILER_WORKERS"); d != "" { @@ -174,21 +194,6 @@ func (f *FleetMonitor) Run(cmd *cobra.Command, args []string) error { workersOpts.HelmOp = w } - // The wrangler command framework does not reliably parse boolean env vars, - // so all boolean env vars are parsed manually here. The struct fields - // above intentionally omit env: tags to avoid a dual source of truth. 
- parseBoolEnv := func(key string, defaultValue bool) bool { - if val := os.Getenv(key); val != "" { - b, err := strconv.ParseBool(val) - if err != nil { - setupLog.Error(err, "failed to parse boolean env var", "key", key, "value", val) - return defaultValue - } - return b - } - return defaultValue - } - // Parse controller enable flags enableBundle := parseBoolEnv("ENABLE_BUNDLE_EVENT_MONITOR", f.EnableBundleMonitor) enableBundleDeployment := parseBoolEnv("ENABLE_BUNDLEDEPLOYMENT_EVENT_MONITOR", f.EnableBundleDeploymentMonitor) @@ -312,6 +317,7 @@ func (f *FleetMonitor) Run(cmd *cobra.Command, args []string) error { EnableGitRepo: enableGitRepo, EnableHelmOp: enableHelmOp, Workers: workersOpts, + LeaderElectionEnabled: leaderElectionEnabled, // Per-controller logging configuration ControllerLogging: ControllerLoggingConfig{ diff --git a/package/Dockerfile.event-monitor b/package/Dockerfile.event-monitor index 758f3d95a1..df9eaebff0 100644 --- a/package/Dockerfile.event-monitor +++ b/package/Dockerfile.event-monitor @@ -1,7 +1,8 @@ ARG BUILD_ENV=dapper ARG ARCH +ARG TARGETARCH -FROM --platform=linux/$ARCH registry.suse.com/bci/bci-base:15.7 AS base +FROM --platform=linux/${TARGETARCH:-${ARCH:-amd64}} registry.suse.com/bci/bci-base:15.7 AS base RUN zypper rm -y container-suseconnect && \ zypper ar --priority=500 https://download.opensuse.org/repositories/Virtualization:containers/5.5/Virtualization:containers.repo && \ zypper --gpg-auto-import-keys ref && \