Skip to content

Commit 3ff1b85

Browse files
committed
metrics: Add node label to sriov_* metrics
It might happen that two SR-IOV pods, deployed on different nodes, are using devices with the same PCI address. In such cases, the query suggested [1] by the sriov-network-metrics-exporter produces the error: ``` Error loading values found duplicate series for the match group {pciAddr="0000:3b:02.4"} on the right hand-side of the operation: [ { __name__="sriov_kubepoddevice", container="test", dev_type="openshift.io/intelnetdevice", endpoint="sriov-network-metrics", instance="10.1.98.60:9110", job="sriov-network-metrics-exporter-service", namespace="cnf-4916", pciAddr="0000:3b:02.4", pod="pod-cnfdr22.telco5g.eng.rdu2.redhat.com", prometheus="openshift-monitoring/k8s", service="sriov-network-metrics-exporter-service" }, { __name__="sriov_kubepoddevice", container="test", dev_type="openshift.io/intelnetdevice", endpoint="sriov-network-metrics", instance="10.1.98.230:9110", job="sriov-network-metrics-exporter-service", namespace="cnf-4916", pciAddr="0000:3b:02.4", pod="pod-dhcp-98-230.telco5g.eng.rdu2.redhat.com", prometheus="openshift-monitoring/k8s", service="sriov-network-metrics-exporter-service" } ];many-to-many matching not allowed: matching labels must be unique on one side ``` Configure the ServiceMonitor resource to add a `node` label to all metrics. The correct query to get metrics, as updated in the PrometheusRule, is: ``` sriov_vf_tx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice ``` Also remove the `pod`, `namespace` and `container` labels from the `sriov_vf_*` metrics, as they were wrongly set to `sriov-network-metrics-exporter-zj2n9`, `openshift-sriov-network-operator` and `kube-rbac-proxy`, respectively. [1] https://github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/blob/0f6a784f377ede87b95f31e569116ceb9775b5b9/README.md?plain=1#L38 Signed-off-by: Andrea Panattoni <[email protected]>
1 parent f17bb2a commit 3ff1b85

File tree

4 files changed

+111
-31
lines changed

4 files changed

+111
-31
lines changed

bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,28 +11,28 @@ spec:
1111
interval: 30s
1212
rules:
1313
- expr: |
14-
sriov_vf_tx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
14+
sriov_vf_tx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
1515
record: network:sriov_vf_tx_packets
1616
- expr: |
17-
sriov_vf_rx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
17+
sriov_vf_rx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
1818
record: network:sriov_vf_rx_packets
1919
- expr: |
20-
sriov_vf_tx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
20+
sriov_vf_tx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
2121
record: network:sriov_vf_tx_bytes
2222
- expr: |
23-
sriov_vf_rx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
23+
sriov_vf_rx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
2424
record: network:sriov_vf_rx_bytes
2525
- expr: |
26-
sriov_vf_tx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
26+
sriov_vf_tx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
2727
record: network:sriov_vf_tx_dropped
2828
- expr: |
29-
sriov_vf_rx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
29+
sriov_vf_rx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
3030
record: network:sriov_vf_rx_dropped
3131
- expr: |
32-
sriov_vf_rx_broadcast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
32+
sriov_vf_rx_broadcast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
3333
record: network:sriov_vf_rx_broadcast
3434
- expr: |
35-
sriov_vf_rx_multicast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
35+
sriov_vf_rx_multicast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
3636
record: network:sriov_vf_rx_multicast
3737
{{ end }}
3838

bindata/manifests/metrics-exporter/metrics-prometheus.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,17 @@ spec:
1212
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token"
1313
scheme: "https"
1414
honorLabels: true
15+
relabelings:
16+
- action: replace
17+
sourceLabels:
18+
- __meta_kubernetes_endpoint_node_name
19+
targetLabel: node
20+
- action: labeldrop
21+
regex: pod
22+
- action: labeldrop
23+
regex: container
24+
- action: labeldrop
25+
regex: namespace
1526
tlsConfig:
1627
serverName: sriov-network-metrics-exporter-service.{{.Namespace}}.svc
1728
caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt

test/conformance/tests/test_exporter_metrics.go

Lines changed: 72 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,18 @@ import (
1919
"github.com/prometheus/common/model"
2020

2121
corev1 "k8s.io/api/core/v1"
22+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2223
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2324

2425
. "github.com/onsi/ginkgo/v2"
2526
. "github.com/onsi/gomega"
2627
)
2728

28-
var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
29+
var _ = Describe("[sriov] Metrics Exporter", Ordered, ContinueOnFailure, func() {
2930
var node string
3031
var nic *sriovv1.InterfaceExt
3132

3233
BeforeAll(func() {
33-
if cluster.VirtualCluster() {
34-
Skip("IGB driver does not support VF statistics")
35-
}
36-
3734
err := namespaces.Create(namespaces.Test, clients)
3835
Expect(err).ToNot(HaveOccurred())
3936

@@ -73,6 +70,9 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
7370
})
7471

7572
It("collects metrics regarding receiving traffic via VF", func() {
73+
if cluster.VirtualCluster() {
74+
Skip("IGB driver does not support VF statistics")
75+
}
7676

7777
pod := createTestPod(node, []string{"test-me-network"})
7878
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)
@@ -98,27 +98,76 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
9898
Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets))
9999
})
100100

101-
It("PrometheusRule should provide namespaced metrics", func() {
102-
pod := createTestPod(node, []string{"test-me-network"})
103-
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)
101+
Context("When Prometheus operator is available", func() {
102+
BeforeEach(func() {
103+
_, err := clients.ServiceMonitors(operatorNamespace).List(context.Background(), metav1.ListOptions{})
104+
if k8serrors.IsNotFound(err) {
105+
Skip("Prometheus operator not available in the cluster")
106+
}
107+
})
104108

105-
namespacedMetricNames := []string{
106-
"network:sriov_vf_rx_bytes",
107-
"network:sriov_vf_tx_bytes",
108-
"network:sriov_vf_rx_packets",
109-
"network:sriov_vf_tx_packets",
110-
"network:sriov_vf_rx_dropped",
111-
"network:sriov_vf_tx_dropped",
112-
"network:sriov_vf_rx_broadcast",
113-
"network:sriov_vf_rx_multicast",
114-
}
109+
It("PrometheusRule should provide namespaced metrics", func() {
110+
pod := createTestPod(node, []string{"test-me-network"})
111+
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)
112+
113+
namespacedMetricNames := []string{
114+
"network:sriov_vf_rx_bytes",
115+
"network:sriov_vf_tx_bytes",
116+
"network:sriov_vf_rx_packets",
117+
"network:sriov_vf_tx_packets",
118+
"network:sriov_vf_rx_dropped",
119+
"network:sriov_vf_tx_dropped",
120+
"network:sriov_vf_rx_broadcast",
121+
"network:sriov_vf_rx_multicast",
122+
}
115123

116-
Eventually(func(g Gomega) {
117-
for _, metricName := range namespacedMetricNames {
118-
values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name))
119-
g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName)
124+
Eventually(func(g Gomega) {
125+
for _, metricName := range namespacedMetricNames {
126+
values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name))
127+
g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName)
128+
}
129+
}, "90s", "1s").Should(Succeed())
130+
})
131+
132+
It("Metrics should have the correct labels", func() {
133+
pod := createTestPod(node, []string{"test-me-network"})
134+
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)
135+
136+
metricsName := []string{
137+
"sriov_vf_rx_bytes",
138+
"sriov_vf_tx_bytes",
139+
"sriov_vf_rx_packets",
140+
"sriov_vf_tx_packets",
141+
"sriov_vf_rx_dropped",
142+
"sriov_vf_tx_dropped",
143+
"sriov_vf_rx_broadcast",
144+
"sriov_vf_rx_multicast",
120145
}
121-
}, "40s", "1s").Should(Succeed())
146+
147+
Eventually(func(g Gomega) {
148+
for _, metricName := range metricsName {
149+
samples := runPromQLQuery(metricName)
150+
g.Expect(samples).ToNot(BeEmpty(), "no value for metric %s", metricName)
151+
g.Expect(samples[0].Metric).To(And(
152+
HaveKey(model.LabelName("pciAddr")),
153+
HaveKey(model.LabelName("node")),
154+
HaveKey(model.LabelName("pf")),
155+
HaveKey(model.LabelName("vf")),
156+
))
157+
}
158+
}, "90s", "1s").Should(Succeed())
159+
160+
// sriov_kubepoddevice has a different set of labels than the statistics metrics
161+
samples := runPromQLQuery(fmt.Sprintf(`sriov_kubepoddevice{namespace="%s",pod="%s"}`, pod.Namespace, pod.Name))
162+
Expect(samples).ToNot(BeEmpty(), "no value for metric sriov_kubepoddevice")
163+
Expect(samples[0].Metric).To(And(
164+
HaveKey(model.LabelName("pciAddr")),
165+
HaveKeyWithValue(model.LabelName("node"), model.LabelValue(pod.Spec.NodeName)),
166+
HaveKeyWithValue(model.LabelName("dev_type"), model.LabelValue("openshift.io/metricsResource")),
167+
HaveKeyWithValue(model.LabelName("namespace"), model.LabelValue(pod.Namespace)),
168+
HaveKeyWithValue(model.LabelName("pod"), model.LabelValue(pod.Name)),
169+
))
170+
})
122171
})
123172
})
124173

test/util/k8sreporter/reporter.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ import (
1010

1111
sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
1212
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces"
13+
14+
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
15+
rbacv1 "k8s.io/api/rbac/v1"
1316
)
1417

1518
func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) {
@@ -18,6 +21,17 @@ func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) {
1821
if err != nil {
1922
return err
2023
}
24+
25+
err = monitoringv1.AddToScheme(s)
26+
if err != nil {
27+
return err
28+
}
29+
30+
err = rbacv1.AddToScheme(s)
31+
if err != nil {
32+
return err
33+
}
34+
2135
return nil
2236
}
2337

@@ -38,6 +52,8 @@ func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) {
3852
return true
3953
case multusNamespace != "" && ns == multusNamespace:
4054
return true
55+
case ns == "openshift-monitoring":
56+
return true
4157
}
4258
return false
4359
}
@@ -47,6 +63,10 @@ func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) {
4763
{Cr: &sriovv1.SriovNetworkNodePolicyList{}},
4864
{Cr: &sriovv1.SriovNetworkList{}},
4965
{Cr: &sriovv1.SriovOperatorConfigList{}},
66+
{Cr: &monitoringv1.ServiceMonitorList{}, Namespace: &operatorNamespace},
67+
{Cr: &monitoringv1.PrometheusRuleList{}, Namespace: &operatorNamespace},
68+
{Cr: &rbacv1.RoleList{}, Namespace: &operatorNamespace},
69+
{Cr: &rbacv1.RoleBindingList{}, Namespace: &operatorNamespace},
5070
}
5171

5272
err := os.Mkdir(reportPath, 0755)

0 commit comments

Comments
 (0)