ironcore-dev · xkonni · Mar 11, 2026 · Mar 2, 2026 · Mar 3, 2026 · Mar 6, 2026
diff --git a/README.md b/README.md
@@ -97,6 +97,27 @@ Users can just run kubectl apply -f <URL for YAML BUNDLE> to install the project
 kubectl apply -f https://raw.githubusercontent.com/<org>/metal-operator/<tag or branch>/dist/install.yaml
 ```
 
+## Monitoring
+
+The metal-operator exposes custom Prometheus metrics for monitoring server state, power operations, and reconciliation performance. Metrics are available at the `/metrics` endpoint and include:
+
+- **Server State Distribution** - Count of servers by state (Available, Reserved, Error, etc.)
+- **Server Power State** - Count of servers by power state (On, Off, PoweringOn, etc.)
+- **Server Conditions** - Health status of server conditions (Ready, Discovered, etc.)
+- **Reconciliation Metrics** - Success/error counts for reconciliation operations
+
+For detailed metrics documentation, example queries, and alerting rules, see [docs/metrics.md](docs/metrics.md).
+
+### Quick Example
+
+```bash
+# Port-forward to metrics endpoint
+kubectl -n metal-operator-system port-forward deployment/metal-operator-controller-manager 8443:8443
+
+# Query server metrics (if metrics authentication is enabled, also pass -H "Authorization: Bearer <token>")
+curl -k https://localhost:8443/metrics | grep metal_server
+```
+
 ## Contributing
 
 **NOTE:** Run `make help` for more information on all potential `make` targets

@@ -28,6 +28,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/certwatcher"
 	"sigs.k8s.io/controller-runtime/pkg/healthz"
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+	ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
 	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
 	"sigs.k8s.io/controller-runtime/pkg/webhook"
@@ -38,6 +39,7 @@ import (
 	metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1"
 	"github.com/ironcore-dev/metal-operator/internal/api/macdb"
 	"github.com/ironcore-dev/metal-operator/internal/controller"
+	metalmetrics "github.com/ironcore-dev/metal-operator/internal/metrics"
 	"github.com/ironcore-dev/metal-operator/internal/registry"
 	// +kubebuilder:scaffold:imports
 )
@@ -321,6 +323,11 @@ func main() { // nolint: gocyclo
 		os.Exit(1)
 	}
 
+	// Register custom Prometheus metrics collector for server states
+	serverCollector := metalmetrics.NewServerStateCollector(mgr.GetClient())
+	ctrlmetrics.Registry.MustRegister(serverCollector)
+	setupLog.Info("Registered custom server metrics collector")
+
 	if err = (&controller.EndpointReconciler{
 		Client:      mgr.GetClient(),
 		Scheme:      mgr.GetScheme(),

@@ -0,0 +1,88 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: metal-operator-server-alerts
+  namespace: metal-operator-system
+  labels:
+    control-plane: controller-manager
+spec:
+  groups:
+  - name: metal_operator_servers
+    interval: 30s
+    rules:
+    - alert: NoAvailableServers
+      expr: sum(metal_server_state{state="Available"} or on() vector(0)) < 1
+      for: 5m
+      annotations:
+        summary: "No available servers in the fleet"
+        description: "No server is in the Available state; all servers are Reserved, in Maintenance, in Error, or otherwise unavailable"
+      labels:
+        severity: warning
+
+    - alert: ServersInErrorState
+      expr: metal_server_state{state="Error"} > 0
+      for: 2m
+      annotations:
+        summary: "Servers are in Error state"
+        description: "{{ $value }} server(s) are in Error state and require attention"
+      labels:
+        severity: critical
+
+    - alert: ServersPoweringOnTooLong
+      expr: metal_server_power_state{power_state="PoweringOn"} > 0
+      for: 10m
+      annotations:
+        summary: "Servers stuck in PoweringOn state"
+        description: "{{ $value }} server(s) have been in PoweringOn state for over 10 minutes"
+      labels:
+        severity: warning
+
+    - alert: ServersPoweringOffTooLong
+      expr: metal_server_power_state{power_state="PoweringOff"} > 0
+      for: 10m
+      annotations:
+        summary: "Servers stuck in PoweringOff state"
+        description: "{{ $value }} server(s) have been in PoweringOff state for over 10 minutes"
+      labels:
+        severity: warning
+
+    - alert: HighReconciliationErrorRate
+      expr: rate(metal_server_reconciliation_total{result=~"error_.*"}[5m]) > 0.1
+      for: 5m
+      annotations:
+        summary: "High server reconciliation error rate"
+        description: "Server reconciliation errors are occurring at {{ $value | humanize }} per second"
+      labels:
+        severity: warning
+
+    - alert: LowAvailableServerCapacity
+      expr: sum(metal_server_state{state="Available"} or on() vector(0)) < 2
+      for: 5m
+      annotations:
+        summary: "Low available server capacity"
+        description: "Only {{ $value }} server(s) are available"
+      labels:
+        severity: warning
+
+    - alert: ServerMetricsMissing
+      expr: absent(metal_server_state)
+      for: 5m
+      annotations:
+        summary: "Server metrics are not being collected"
+        description: "The metal-operator metrics endpoint is not reporting server state metrics"
+      labels:
+        severity: critical
+
+    - alert: ServerReconciliationFailureSpike
+      expr: |
+        (
+          sum(rate(metal_server_reconciliation_total{result=~"error_.*"}[5m]))
+          /
+          sum(rate(metal_server_reconciliation_total[5m]))
+        ) > 0.5
+      for: 10m
+      annotations:
+        summary: "High rate of server reconciliation failures"
+        description: "More than 50% of server reconciliations are failing"
+      labels:
+        severity: critical