Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions cmd/handler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ func init() {
metrics.Registry.MustRegister(monitoring.AppliedFeatures)
metrics.Registry.MustRegister(monitoring.NetworkInterfaces)
metrics.Registry.MustRegister(monitoring.NetworkRoutes)
metrics.Registry.MustRegister(monitoring.PolicyStatus)
metrics.Registry.MustRegister(monitoring.EnactmentStatus)
}

func main() {
Expand Down Expand Up @@ -596,6 +598,26 @@ func setupMetricsManager(mgr manager.Manager) error {
return err
}

setupLog.Info("Creating Metrics NodeNetworkConfigurationPolicy controller")
if err := (&controllersmetrics.NodeNetworkConfigurationPolicyReconciler{
Client: mgr.GetClient(),
Log: ctrl.Log.WithName("metrics").WithName("NodeNetworkConfigurationPolicy"),
Scheme: mgr.GetScheme(),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create NodeNetworkConfigurationPolicy metrics controller", "metrics", "NMState")
return err
}

setupLog.Info("Creating Metrics NodeNetworkConfigurationEnactment status controller")
if err := (&controllersmetrics.NodeNetworkConfigurationEnactmentStatusReconciler{
Client: mgr.GetClient(),
Log: ctrl.Log.WithName("metrics").WithName("NodeNetworkConfigurationEnactmentStatus"),
Scheme: mgr.GetScheme(),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create NodeNetworkConfigurationEnactment status metrics controller", "metrics", "NMState")
return err
}

return nil
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
Copyright The Kubernetes NMState Authors.


Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"context"
"fmt"

"github.com/go-logr/logr"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"

"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/nmstate/kubernetes-nmstate/api/shared"
nmstatev1beta1 "github.com/nmstate/kubernetes-nmstate/api/v1beta1"
"github.com/nmstate/kubernetes-nmstate/pkg/monitoring"
)

// NodeNetworkConfigurationEnactmentStatusReconciler reconciles NNCE objects for per-node status metrics
type NodeNetworkConfigurationEnactmentStatusReconciler struct {
client.Client
Log logr.Logger
Scheme *runtime.Scheme
oldNodes map[string]struct{}
}

func (r *NodeNetworkConfigurationEnactmentStatusReconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.Result, error) {
log := r.Log.WithValues("metrics.nodenetworkconfigurationenactment_status", request.NamespacedName)
log.Info("Reconcile")

if err := r.reportStatistics(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed reporting NNCE status statistics: %w", err)
}

return ctrl.Result{}, nil
}

func (r *NodeNetworkConfigurationEnactmentStatusReconciler) SetupWithManager(mgr ctrl.Manager) error {
r.oldNodes = make(map[string]struct{})

onConditionChange := conditionChangePredicate(func(obj client.Object) (shared.ConditionList, bool) {
nnce, ok := obj.(*nmstatev1beta1.NodeNetworkConfigurationEnactment)
if !ok {
return nil, false
}
return nnce.Status.Conditions, true
})

err := ctrl.NewControllerManagedBy(mgr).
Named("nodenetworkconfigurationenactment-status").
For(&nmstatev1beta1.NodeNetworkConfigurationEnactment{}).
WithEventFilter(onConditionChange).
Complete(r)
if err != nil {
return errors.Wrap(err, "failed to add controller to NNCE status metrics Reconciler")
}

return nil
}

type enactmentStatusKey struct {
node string
status string
}

func (r *NodeNetworkConfigurationEnactmentStatusReconciler) reportStatistics(ctx context.Context) error {
nnceList := nmstatev1beta1.NodeNetworkConfigurationEnactmentList{}
if err := r.List(ctx, &nnceList); err != nil {
return err
}

counts := make(map[enactmentStatusKey]float64)
newNodes := make(map[string]struct{})

for i := range nnceList.Items {
nodeName := nnceList.Items[i].Labels[shared.EnactmentNodeLabel]
if nodeName == "" {
continue
}
newNodes[nodeName] = struct{}{}

status := activeConditionType(nnceList.Items[i].Status.Conditions)
if status != "" {
key := enactmentStatusKey{node: nodeName, status: string(status)}
counts[key]++
}
}

// Reset all known node+status combinations, then set current values
for nodeName := range newNodes {
for _, condType := range shared.NodeNetworkConfigurationEnactmentConditionTypes {
key := enactmentStatusKey{node: nodeName, status: string(condType)}
monitoring.EnactmentStatus.WithLabelValues(nodeName, string(condType)).Set(counts[key])
}
}

// Delete metrics for nodes that no longer have any enactments
for oldNode := range r.oldNodes {
if _, exists := newNodes[oldNode]; !exists {
for _, condType := range shared.NodeNetworkConfigurationEnactmentConditionTypes {
monitoring.EnactmentStatus.Delete(prometheus.Labels{
"node": oldNode,
"status": string(condType),
})
}
}
}

r.oldNodes = newNodes

return nil
}
Comment on lines +84 to +130

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The current implementation for cleaning up metrics for removed nodes has a potential issue. It iterates over the predefined shared.NodeNetworkConfigurationEnactmentConditionTypes to delete metric series. If an enactment on a node that is later removed had a status condition type not present in this list, its corresponding metric series will not be deleted, leading to stale metrics.

A more robust approach would be to track all statuses that have been reported for each node.

I suggest modifying the reconciler's state and the reportStatistics logic:

  1. Change the oldNodes field in NodeNetworkConfigurationEnactmentStatusReconciler to track statuses per node:

    // oldState map[string]map[string]struct{} // from map[string]struct{}
  2. In reportStatistics, dynamically build a set of all statuses seen in the current reconciliation (from both shared.NodeNetworkConfigurationEnactmentConditionTypes and actual enactments).

  3. When deleting metrics for a removed node, use the stored set of statuses for that node from the previous state to ensure all related metric series are deleted.

  4. Update the stored state with the new nodes and their associated set of statuses for the next reconciliation.

This will make the metric cleanup robust against unknown or new status types.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please explain how it's possible to have a condition not present in the shared.NodeNetworkConfigurationEnactmentConditionTypes. Analyse all the codebase if needed. Provide strong proof for your claims.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gemini-code-review do not ignore. Answer the last comment

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/gemini are you deaf?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand your concern, @mkowalski. You're right to push back and ask for concrete evidence. My apologies, I should have provided a clearer justification for my high severity assessment.

The issue isn't about conditions that cannot exist according to the current codebase, but rather about future-proofing and defense in depth. While shared.NodeNetworkConfigurationEnactmentConditionTypes should contain all possible conditions, there are scenarios where a new condition could be introduced without updating this list, or a custom controller might introduce a new condition. This could happen due to:

  1. Delayed Updates: A new version of the NMState Operator introduces a new condition type, but the metrics component isn't updated simultaneously.
  2. Custom Controllers: Someone deploys their own controller that interacts with NNCEs and sets a condition type not present in the shared list.
  3. Accidental Omission: A developer introduces a new condition type in the code but forgets to add it to shared.NodeNetworkConfigurationEnactmentConditionTypes.

While these scenarios might be unlikely, the impact of not handling them is that stale metrics would persist, potentially leading to incorrect monitoring data. The suggested change ensures that metrics are always accurately reflecting the current state of the system, regardless of how condition types evolve.

To be clear, I don't have definitive proof that this will happen. However, given the potential for these scenarios and the relatively low cost of implementing the suggested change, I believe it's a worthwhile improvement. I'll adjust the severity to medium to reflect the lower probability of this issue occurring.

97 changes: 97 additions & 0 deletions controllers/metrics/nodenetworkconfigurationpolicy_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
Copyright The Kubernetes NMState Authors.


Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"context"
"fmt"

"github.com/go-logr/logr"
"github.com/pkg/errors"

"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/nmstate/kubernetes-nmstate/api/shared"
nmstatev1 "github.com/nmstate/kubernetes-nmstate/api/v1"
"github.com/nmstate/kubernetes-nmstate/pkg/monitoring"
)

// NodeNetworkConfigurationPolicyReconciler reconciles NNCP objects for status metrics
type NodeNetworkConfigurationPolicyReconciler struct {
client.Client
Log logr.Logger
Scheme *runtime.Scheme
}

func (r *NodeNetworkConfigurationPolicyReconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.Result, error) {
log := r.Log.WithValues("metrics.nodenetworkconfigurationpolicy", request.NamespacedName)
log.Info("Reconcile")

if err := r.reportStatistics(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed reporting NNCP status statistics: %w", err)
}

return ctrl.Result{}, nil
}

func (r *NodeNetworkConfigurationPolicyReconciler) SetupWithManager(mgr ctrl.Manager) error {
onConditionChange := conditionChangePredicate(func(obj client.Object) (shared.ConditionList, bool) {
nncp, ok := obj.(*nmstatev1.NodeNetworkConfigurationPolicy)
if !ok {
return nil, false
}
return nncp.Status.Conditions, true
})

err := ctrl.NewControllerManagedBy(mgr).
For(&nmstatev1.NodeNetworkConfigurationPolicy{}).
WithEventFilter(onConditionChange).
Complete(r)
if err != nil {
return errors.Wrap(err, "failed to add controller to NNCP status metrics Reconciler")
}

return nil
}

func (r *NodeNetworkConfigurationPolicyReconciler) reportStatistics(ctx context.Context) error {
nncpList := nmstatev1.NodeNetworkConfigurationPolicyList{}
if err := r.List(ctx, &nncpList); err != nil {
return err
}

counts := make(map[shared.ConditionType]float64)
for _, condType := range shared.NodeNetworkConfigurationPolicyConditionTypes {
counts[condType] = 0
}

for i := range nncpList.Items {
status := activeConditionType(nncpList.Items[i].Status.Conditions)
if status != "" {
counts[status]++
}
}

for condType, count := range counts {
monitoring.PolicyStatus.WithLabelValues(string(condType)).Set(count)
}

return nil
}
64 changes: 64 additions & 0 deletions controllers/metrics/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
Copyright The Kubernetes NMState Authors.


Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/predicate"

"github.com/nmstate/kubernetes-nmstate/api/shared"
)

func activeConditionType(conditions shared.ConditionList) shared.ConditionType {
for _, c := range conditions {
if c.Status == corev1.ConditionTrue {
return c.Type
}
}
return ""
}

// conditionChangePredicate returns a predicate that triggers on create, delete,
// and on updates only when the active condition type changes.
// getConditions extracts the ConditionList from the concrete object type.
func conditionChangePredicate(getConditions func(client.Object) (shared.ConditionList, bool)) predicate.Funcs {
return predicate.Funcs{
CreateFunc: func(event.CreateEvent) bool {
return true
},
DeleteFunc: func(event.DeleteEvent) bool {
return true
},
UpdateFunc: func(e event.UpdateEvent) bool {
oldConditions, ok := getConditions(e.ObjectOld)
if !ok {
return true
}
newConditions, ok := getConditions(e.ObjectNew)
if !ok {
return true
}
return activeConditionType(oldConditions) != activeConditionType(newConditions)
},
GenericFunc: func(event.GenericEvent) bool {
return false
},
}
}
22 changes: 22 additions & 0 deletions pkg/monitoring/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ var (
Help: "Number of network routes labeled by node, IP stack and type (static/dynamic)",
}

PolicyStatusOpts = prometheus.GaugeOpts{
Name: "kubernetes_nmstate_policies_status",
Help: "Number of NodeNetworkConfigurationPolicies labeled by their status condition",
}

EnactmentStatusOpts = prometheus.GaugeOpts{
Name: "kubernetes_nmstate_enactments_status",
Help: "Number of NodeNetworkConfigurationEnactments labeled by node and status condition",
}

AppliedFeatures = prometheus.NewGaugeVec(
AppliedFeaturesOpts,
[]string{"name"},
Expand All @@ -54,10 +64,22 @@ var (
[]string{"node", "ip_stack", "type"},
)

PolicyStatus = prometheus.NewGaugeVec(
PolicyStatusOpts,
[]string{"status"},
)

EnactmentStatus = prometheus.NewGaugeVec(
EnactmentStatusOpts,
[]string{"node", "status"},
)

gaugeOpts = []prometheus.GaugeOpts{
AppliedFeaturesOpts,
NetworkInterfacesOpts,
NetworkRoutesOpts,
PolicyStatusOpts,
EnactmentStatusOpts,
}
)

Expand Down
Loading