Skip to content

Commit c6489eb

Browse files
Eoghan RussellEoghan1232eoghanlawlessIpawlikxnhennigan
authored
Refactor and updates (#22)
* Updates - Update README - Update to go1.18 - Update to k8s v1.25.5 - Reworked vfstats collector - Implemented endpoint unit tests - Add netlink support detection - Add image building to Makefile - Remove deprecated references - Add Mellanox driver to drivers DB - Refactor code to enable testing - Support for NFD SR-IOV feature label - Changes to ensure more uniform Makefile - Implemented initial unit tests - Implemented vfstats package unit tests Co-Authored-By: Eoghan1232 <[email protected]> Co-Authored-By: eoghanlawless <[email protected]> Co-Authored-By: Ipawlikx <[email protected]> Co-Authored-By: nhennigan <[email protected]> * fixing incorrect flag * fixing typos * Adding in github action workflow * Addressed comments * Fixing vulnerability * Updating action workflow to run on ubuntu-latest * fixing go version in action * Fixing Hadolint scan in action * testing ginkgo issue in action * Revert "testing ginkgo issue in action" This reverts commit 6343cb8. * Updating Makefile to print coverage per function --------- Co-authored-by: Eoghan1232 <[email protected]> Co-authored-by: eoghanlawless <[email protected]> Co-authored-by: Ipawlikx <[email protected]> Co-authored-by: nhennigan <[email protected]>
1 parent 165b761 commit c6489eb

26 files changed

+2615
-1082
lines changed
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
name: build-test-lint
2+
on: [push, pull_request]
3+
jobs:
4+
# build compiles the exporter once per matrix combination via `make build`.
build:
  name: build
  strategy:
    matrix:
      go-version: [1.18.x]
      goarch: [amd64]
      # goos must be defined here: the Build step reads matrix.goos, and an
      # undefined matrix key silently expands to an empty string.
      goos: [linux]
      os: [ubuntu-latest]
  runs-on: ${{ matrix.os }}
  steps:
    - name: Set up Go matrix
      uses: actions/setup-go@v3
      with:
        go-version: ${{ matrix.go-version }}

    - name: Check out code into the Go module directory
      uses: actions/checkout@v2

    - name: Build
      env:
        # Target architecture and OS handed to the Go toolchain by `make build`.
        GOARCH: ${{ matrix.goarch }}
        GOOS: ${{ matrix.goos }}
      run: make build
26+
27+
test:
28+
runs-on: ubuntu-latest
29+
needs: build
30+
name: test
31+
steps:
32+
- name: Set up Go
33+
uses: actions/setup-go@v3
34+
with:
35+
go-version: 1.18.x
36+
37+
- name: Check out code into the Go module directory
38+
uses: actions/checkout@v2
39+
40+
- name: Install hwdata
41+
run: sudo apt-get install hwdata -y
42+
43+
- name: Go test
44+
run: make test
45+
46+
test-coverage:
47+
runs-on: ubuntu-latest
48+
needs: build
49+
name: test-coverage
50+
steps:
51+
- name: Set up Go
52+
uses: actions/setup-go@v3
53+
with:
54+
go-version: 1.18.x
55+
56+
- uses: actions/checkout@v2
57+
58+
- name: Install hwdata
59+
run: sudo apt-get install hwdata -y
60+
61+
- name: Go test with coverage
62+
run: make test-coverage
63+
64+
golangci:
65+
name: Golangci-lint
66+
runs-on: ubuntu-latest
67+
steps:
68+
- name: Set up Go
69+
uses: actions/setup-go@v3
70+
with:
71+
go-version: 1.18.x
72+
- uses: actions/checkout@v2
73+
- name: golangci-lint
74+
uses: golangci/golangci-lint-action@v3
75+
with:
76+
# Required: the version of golangci-lint to use; pinned here to an exact release, including the patch version.
77+
version: v1.46.2
78+
79+
hadolint:
80+
runs-on: ubuntu-latest
81+
name: Hadolint
82+
steps:
83+
- uses: actions/checkout@v2
84+
- uses: brpaz/[email protected]
85+
name: Run Hadolint
86+
with:
87+
dockerfile: ./Dockerfile
88+
ignore: DL3018 # DL3018: GH issue 368
89+
90+
go-check:
91+
runs-on: ubuntu-latest
92+
steps:
93+
- uses: actions/checkout@v2
94+
95+
- name: Set up Go
96+
uses: actions/setup-go@v3
97+
with:
98+
go-version: 1.18.x
99+
100+
# if this fails, run go mod tidy
101+
- name: Check if module files are consistent with code
102+
run: go mod tidy && git diff --exit-code
103+
104+
# if this fails, run go mod vendor
105+
- name: Check if vendor directory is consistent with go modules
106+
run: go mod vendor && git diff --exit-code

Dockerfile

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
FROM golang:alpine as builder
22

3-
ENV HTTP_PROXY $http_proxy
4-
ENV HTTPS_PROXY $https_proxy
53
RUN apk add --no-cache --virtual build-dependencies build-base linux-headers git
64
COPY ./ /usr/src/sriov-network-metrics-exporter
75
WORKDIR /usr/src/sriov-network-metrics-exporter

Makefile

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,47 @@
1+
IMAGE_REGISTRY?=localhost:5000/
2+
IMAGE_VERSION?=latest
3+
4+
IMAGE_NAME?=$(IMAGE_REGISTRY)sriov-metrics-exporter:$(IMAGE_VERSION)
5+
IMAGE_BUILDER?=docker
6+
7+
DOCKERARGS?=
8+
ifdef HTTP_PROXY
9+
DOCKERARGS += --build-arg http_proxy=$(HTTP_PROXY)
10+
endif
11+
ifdef HTTPS_PROXY
12+
DOCKERARGS += --build-arg https_proxy=$(HTTPS_PROXY)
13+
endif
14+
15+
all: build image-build test
16+
117
clean:
218
rm -rf bin
3-
go clean --modcache
4-
19+
go clean -modcache -testcache
20+
521
build:
22+
GO111MODULE=on go build -ldflags "-s -w" -buildmode=pie -o bin/sriov-exporter cmd/sriov-network-metrics-exporter.go
23+
24+
# image-build builds the container image $(IMAGE_NAME) from ./Dockerfile with
# $(IMAGE_BUILDER), forwarding any proxy build args collected in $(DOCKERARGS).
image-build:
	@echo "Building container image $(IMAGE_NAME)"
	$(IMAGE_BUILDER) build -f Dockerfile -t $(IMAGE_NAME) $(DOCKERARGS) .
27+
28+
image-push:
29+
$(IMAGE_BUILDER) push $(IMAGE_NAME)
30+
31+
test:
32+
go test ./... -count=1
33+
34+
test-coverage:
35+
go test ./... -coverprofile cover.out
36+
go tool cover -func cover.out
37+
38+
go-lint-install:
639
go install github.com/golangci/golangci-lint/cmd/[email protected]
40+
41+
go-lint: go-lint-install
742
go mod tidy
843
go fmt ./...
9-
golangci-lint run
10-
GO111MODULE=on go build -ldflags "-s -w" -buildmode=pie -o bin/sriov-exporter cmd/sriov-network-metrics-exporter.go
44+
golangci-lint run --color always -v ./...
45+
46+
# go-lint-report runs golangci-lint and writes both stdout and stderr to
# golangci-lint.txt. The POSIX `> file 2>&1` form is used because make runs
# recipes with /bin/sh, where the bash-only `&>` is parsed as "run in the
# background" followed by a plain redirection, which leaves the report empty.
go-lint-report: go-lint-install
	golangci-lint run --color always -v ./... > golangci-lint.txt 2>&1

README.md

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,14 @@ The SR-IOV Network Metrics Exporter is designed with the Kubernetes SR-IOV stack
66
**This software is a pre-production alpha version and should not be deployed to production servers.**
77

88
## Hardware support
9-
The default netlink implementation for Virtual Function telemetry relies on driver support and a kernel version of 4.4 or higher. This version requires i40e driver of 2.11+ for Intel® 700 series NICs. Updated i40e drivers can be fould at the [Intel Download Center](https://downloadcenter.intel.com/download/24411/Intel-Network-Adapter-Driver-for-PCIe-40-Gigabit-Ethernet-Network-Connections-under-Linux-?v=t)
9+
The sysfs collector for Virtual Function telemetry supports NICs with drivers that implement the SR-IOV sysfs management interface e.g. ice, i40e, mlnx_en and mlnx_ofed.
1010

11-
For kernels older than 4.4 a driver specific collector is enabled which is compatible with Intel® 700 series NICs using and i40e driver of 2.11 or above. To check your current driver version run: ``modinfo i40e | grep ^version``
12-
To upgrade visit the [official driver download site](https://downloadcenter.intel.com/download/24411/Intel-Network-Adapter-Driver-for-PCIe-40-Gigabit-Ethernet-Network-Connections-Under-Linux-).
13-
To use this version the flag collector.netlink must be set to "false".
11+
The netlink collector relies on driver support and a kernel version of 4.4 or higher.
12+
To support netlink, we recommend these driver versions: i40e driver 2.11 or higher for Intel® 700 series NICs and ice driver 1.2 or higher for Intel® 800 series NICs.
13+
14+
To check your current driver version run: `modinfo <driver> | grep ^version` where driver is `i40e` or `ice`\
15+
i40e drivers: [Intel Download Center](https://downloadcenter.intel.com/download/18026/), [Source Forge](https://sourceforge.net/projects/e1000/files/i40e%20stable/)\
16+
ice drivers: [Intel Download Center](https://www.intel.com/content/www/us/en/download/19630/), [Source Forge](https://sourceforge.net/projects/e1000/files/ice%20stable/)
1417

1518
## Metrics
1619
This exporter will make the following metrics available:
@@ -42,17 +45,35 @@ Once available through Prometheus VF metrics can be used by metrics applications
4245

4346
## Installation
4447
### Kubernetes installation
48+
49+
#### Building images
4550
Typical deployment is as a daemonset in a cluster. A daemonset requires the image to be available on each node in the cluster or at a registry accessible from each node.
4651
The following assumes a local Docker registry available at localhost:5000, and assumes Docker is being used to build and manage containers in the cluster.
4752

4853
In order to build the container and load it to a local registry run:
4954

5055
```
5156
docker build . -t localhost:5000/sriov-metrics-exporter && docker push localhost:5000/sriov-metrics-exporter
57+
58+
or
59+
60+
make image-build && make image-push
5261
```
5362

5463
The above assumes a registry available across the cluster at localhost:5000, for example by using the [Docker Registry Proxy](https://github.com/kubernetes-sigs/kubespray/blob/master/roles/kubernetes-apps/registry/README.md). If your registry is at a different address the image name will need to be changed to reflect that in the [Kubernetes daemonset](/deployment/daemonset.yaml).
5564

65+
#### Labeling nodes
66+
67+
SR-IOV Network Metrics Exporter will only be deployed on nodes labeled with the `"feature.node.kubernetes.io/network-sriov.capable": "true"` label. You can label the nodes automatically using [Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery), or manually by executing the following `kubectl` command:
68+
69+
```
70+
kubectl label node <nodename> feature.node.kubernetes.io/network-sriov.capable="true"
71+
```
72+
73+
If you prefer to use the `Node Feature Discovery` you can refer to the [Quick-start guide](https://github.com/kubernetes-sigs/node-feature-discovery#quick-start--the-short-short-version) on the project's repository.
74+
75+
#### Deploying SR-IOV Network Metrics Exporter
76+
5677
Create monitoring namespace:
5778
```
5879
kubectl create namespace monitoring
@@ -98,7 +119,7 @@ In order to expose these metrics to Prometheus we need to configure the database
98119
```
99120
The above should be added to the Prometheus configuration as a new target. For more about configuring Prometheus see the [official guide.](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) Once Prometheus is started with this included in its config sriov-metrics should appear on the "Targets page". Metrics should be available by querying the Prometheus API or in the web interface.
100121

101-
In this mode it will serve stats on an endpoint inside the cluster. Prometheus will detect the label on the service endpoint throught the above configuration.
122+
In this mode it will serve stats on an endpoint inside the cluster. Prometheus will detect the label on the service endpoint through the above configuration.
102123

103124
### Standalone installation to an endpoint on the host.
104125

@@ -145,21 +166,25 @@ The above should be added to the Prometheus configuration as a new target. For m
145166
### Configuration
146167
A number of configuration flags can be passed to the SR-IOV Network Metrics Exporter in order to change enabled collectors, the paths it reads from and some properties of its web endpoint.
147168

169+
The `collector.vfstatspriority` flag defines the priority of the VF stats collectors; each PF will use the first supported collector in the list.\
170+
Example: using the priority "sysfs,netlink" with Intel® 700 and 800 series NICs installed and VFs initialized, the sysfs collector will be used for the 700 series NIC. The 800 series NIC doesn't support sysfs collection, so it falls back to the netlink collector.
171+
148172
| Flag | Type | Description | Default Value |
149173
|----|:----|:----|:----|
150174
| collector.kubepodcpu | boolean | Enables the kubepodcpu collector | false |
151175
| collector.kubepoddevice | boolean | Enables the kubepoddevice collector | false |
152-
| collector.vfstats | boolean |Enables the vfstats collector | true |
153-
| collector.netlink | boolean |Enables using netlink for vfstats collection | true |
176+
| collector.vfstatspriority | string | Sets the priority of vfstats collectors | sysfs,netlink |
177+
| collector.sysfs | boolean | Enables using sr-iov sysfs for vfstats collection | true |
178+
| collector.netlink | boolean | Enables using netlink for vfstats collection | true |
154179
| path.cpucheckpoint | string | Path for location of cpu manager checkpoint file | /var/lib/kubelet/cpu_manager_state |
155-
| path.kubecgroup |string | Path for location of kubernetes cgroups on the host system | /sys/fs/cgroup/cpuset/kubepods/|
156-
| path.kubeletSocket | string | Path to kubelet resources socket | /var/lib/kubelet/pod-resources/kubelet.sock |
180+
| path.kubecgroup | string | Path for location of kubernetes cgroups on the host system | /sys/fs/cgroup/cpuset/kubepods/ |
181+
| path.kubeletsocket | string | Path to kubelet resources socket | /var/lib/kubelet/pod-resources/kubelet.sock |
157182
| path.nodecpuinfo | string | Path for location of system cpu information | /sys/devices/system/node/ |
158183
| path.sysbuspci | string | Path to sys/bus/pci on host | /sys/bus/pci/devices |
159184
| path.sysclassnet | string | Path to sys/class/net on host | /sys/class/net/ |
160-
| web.listen-address | string | Address to listen on for web interface and telemetry. | :9808 |
161-
| web.rate-burst | int | Maximum per second burst rate for requests. | 10 |
162-
| web.rate-limit | int | Limit for requests per second. | 1 |
185+
| web.listen-address | string | Address to listen on for web interface and telemetry | :9808 |
186+
| web.rate-burst | int | Maximum per second burst rate for requests | 10 |
187+
| web.rate-limit | int | Limit for requests per second | 1 |
163188

164189
## Communication and contribution
165190

cmd/sriov-network-metrics-exporter.go

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,37 @@
11
// The SR-IOV networks exporter makes metrics from SR-IOV Virtual Functions available in a prometheus format.
22
// Different classes of metrics are implemented as individual collectors.
3+
34
package main
45

56
import (
67
"flag"
78
"log"
89
"net/http"
9-
"sriov-network-metrics-exporter/collectors"
1010

11-
"github.com/prometheus/client_golang/prometheus/promhttp"
11+
"github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/collectors"
1212

1313
"github.com/prometheus/client_golang/prometheus"
14+
"github.com/prometheus/client_golang/prometheus/promhttp"
1415
"golang.org/x/time/rate"
1516
)
1617

1718
// Command-line flags and fixed settings for the exporter's web endpoint.
var (
	// addr is the listen address handed to http.ListenAndServe. The value is an
	// address such as ":9808", not a bare port number, so the help text says
	// "Address" to match the flag name.
	addr = flag.String("web.listen-address", ":9808", "Address to listen on for web interface and telemetry.")
	// rateLimit is the permitted number of requests per second.
	rateLimit = flag.Int("web.rate-limit", 1, "Limit for requests per second.")
	// rateBurst is the maximum per-second burst of requests.
	rateBurst = flag.Int("web.rate-burst", 10, "Maximum per second burst rate for requests.")
	// metricsEndpoint is the URL path of the metrics endpoint.
	metricsEndpoint = "/metrics"
)
2324

2425
func main() {
25-
flag.Parse()
26-
verifyFlags()
27-
enabledCollectors := collectors.Enabled()
28-
err := prometheus.Register(enabledCollectors)
26+
parseAndVerifyFlags()
27+
28+
err := prometheus.Register(collectors.Enabled())
2929
if err != nil {
3030
log.Fatalf("collector could not be registered: %v", err)
3131
return
3232
}
33-
//Use the default promhttp handler wrapped with middleware to serve at the metrics endpoint
33+
34+
// Use the default promhttp handler wrapped with middleware to serve at the metrics endpoint
3435
handlerWithMiddleware := limitRequests(
3536
getOnly(
3637
endpointOnly(
@@ -41,7 +42,12 @@ func main() {
4142
log.Fatalf("ListenAndServe error: %v", http.ListenAndServe(*addr, handlerWithMiddleware))
4243
}
4344

44-
//enpointOnly restricts all responses to 404 where the passed endpoint isn't used. Used to minimize the possible outputs of the server.
45+
// parseAndVerifyFlags parses the command-line flags and then runs verifyFlags,
// so the flag values are populated before they are checked.
func parseAndVerifyFlags() {
46+
flag.Parse()
47+
verifyFlags()
48+
}
49+
50+
// endpointOnly restricts all responses to 404 where the passed endpoint isn't used. Used to minimize the possible outputs of the server.
4551
func endpointOnly(next http.Handler, endpoint string) http.Handler {
4652
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
4753
if r.URL.Path != endpoint {
@@ -56,7 +62,7 @@ func endpointOnly(next http.Handler, endpoint string) http.Handler {
5662
})
5763
}
5864

59-
//getOnly restricts the possible verbs used in a http request to GET only
65+
// getOnly restricts the possible verbs used in a http request to GET only
6066
func getOnly(next http.Handler) http.Handler {
6167
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
6268
if r.Method != http.MethodGet {
@@ -71,7 +77,7 @@ func getOnly(next http.Handler) http.Handler {
7177
})
7278
}
7379

74-
//noBody returns a 400 to any request that contains a body
80+
// noBody returns a 400 to any request that contains a body
7581
func noBody(next http.Handler) http.Handler {
7682
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
7783
if r.Body != http.NoBody {
@@ -86,20 +92,20 @@ func noBody(next http.Handler) http.Handler {
8692
})
8793
}
8894

89-
//limitRequests sets a rate limit and a burst limit for requests to the endpoint
95+
// limitRequests sets a rate limit and a burst limit for requests to the endpoint
9096
func limitRequests(next http.Handler, rateLimit rate.Limit, burstLimit int) http.Handler {
9197
limiter := rate.NewLimiter(rateLimit, burstLimit)
9298
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
9399
if !limiter.Allow() {
94-
http.Error(w, http.StatusText(429), http.StatusTooManyRequests)
100+
http.Error(w, http.StatusText(http.StatusTooManyRequests), http.StatusTooManyRequests)
95101
return
96102
}
97103
next.ServeHTTP(w, r)
98104
})
99105
}
100106

101107
// verifyFlags resolves the configured file paths through collectors.ResolveFilepaths.
// If resolution returns an error, it logs the error and panics via log.Panicf.
func verifyFlags() {
102-
collectors.ResolveSriovDevFilepaths()
103-
collectors.ResolveKubePodCPUFilepaths()
104-
collectors.ResolveKubePodDeviceFilepaths()
108+
if err := collectors.ResolveFilepaths(); err != nil {
109+
log.Panicf("failed to resolve paths\n%v", err)
110+
}
105111
}

0 commit comments

Comments
 (0)