Skip to content

Commit f09883d

Browse files
committed
Add sysfs class for Amazon Elastic Fabric Adapter
Signed-off-by: Pierre-Yves Aquilanti <[email protected]>
1 parent b5312c5 commit f09883d

File tree

1 file changed

+262
-0
lines changed

1 file changed

+262
-0
lines changed

Diff for: sysfs/class_amazon_efa.go

+262
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
// Copyright 2022 Amazon Web Services
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build linux
15+
// +build linux
16+
17+
package sysfs
18+
19+
import (
20+
"fmt"
21+
"os"
22+
"path/filepath"
23+
"strconv"
24+
"strings"
25+
26+
"github.com/prometheus/procfs/internal/util"
27+
)
28+
29+
// Amazon Elastic Fabric Adapter counters are exposed similarly
30+
// to InfiniBand counters in SysFS. The same structure is used
31+
// in this class for consistency.
32+
33+
// AmazonEfaPath is the directory, relative to the sysfs root, that holds one
// entry per Amazon Elastic Fabric Adapter device. It is the same directory
// that is used for InfiniBand devices.
const AmazonEfaPath = "class/infiniband"
34+
35+
// AmazonEfaCounters holds the hardware counters of a single port of one
// Amazon Elastic Fabric Adapter device. The trailing comment on each field
// names the matching file below
// /sys/class/infiniband/<Name>/ports/<Port>.
//
// Every field is a pointer; a nil field means that no value has been stored
// for that counter.
type AmazonEfaCounters struct {
	AllocPdErr        *uint64 // hw_counters/alloc_pd_err
	AllocUcontextErr  *uint64 // hw_counters/alloc_ucontext_err
	CmdsErr           *uint64 // hw_counters/cmds_err
	CompletedCmds     *uint64 // hw_counters/completed_cmds
	CreateAhErr       *uint64 // hw_counters/create_ah_err
	CreateCqErr       *uint64 // hw_counters/create_cq_err
	CreateQpErr       *uint64 // hw_counters/create_qp_err
	KeepAliveRcvd     *uint64 // hw_counters/keep_alive_rcvd
	Lifespan          *uint64 // hw_counters/lifespan
	MmapErr           *uint64 // hw_counters/mmap_err
	NoCompletionCmds  *uint64 // hw_counters/no_completion_cmds
	RdmaReadBytes     *uint64 // hw_counters/rdma_read_bytes
	RdmaReadRespBytes *uint64 // hw_counters/rdma_read_resp_bytes
	RdmaReadWrErr     *uint64 // hw_counters/rdma_read_wr_err
	RdmaReadWrs       *uint64 // hw_counters/rdma_read_wrs
	RecvBytes         *uint64 // hw_counters/recv_bytes
	RecvWrs           *uint64 // hw_counters/recv_wrs
	RegMrErr          *uint64 // hw_counters/reg_mr_err
	RxBytes           *uint64 // hw_counters/rx_bytes
	RxDrops           *uint64 // hw_counters/rx_drops
	RxPkts            *uint64 // hw_counters/rx_pkts
	SendBytes         *uint64 // hw_counters/send_bytes
	SendWrs           *uint64 // hw_counters/send_wrs
	SubmittedCmds     *uint64 // hw_counters/submitted_cmds
	TxBytes           *uint64 // hw_counters/tx_bytes
	TxPkts            *uint64 // hw_counters/tx_pkts
}
66+
67+
// AmazonEfaPort contains info from files in
68+
// /sys/class/infiniband/<Name>/ports/<Port>
69+
// for a single port of one Amazon Elastic Fabric Adapter device.
70+
type AmazonEfaPort struct {
71+
Name string
72+
Port uint
73+
State string // String representation from /sys/class/infiniband/<Name>/ports/<Port>/state
74+
StateID uint // ID from /sys/class/infiniband/<Name>/ports/<Port>/state
75+
PhysState string // String representation from /sys/class/infiniband/<Name>/ports/<Port>/phys_state
76+
PhysStateID uint // String representation from /sys/class/infiniband/<Name>/ports/<Port>/phys_state
77+
Rate uint64 // in bytes/second from /sys/class/infiniband/<Name>/ports/<Port>/rate
78+
Counters AmazonEfaCounters
79+
}
80+
81+
// AmazonEfaDevice contains info from files in /sys/class/infiniband for a
82+
// single Amazon Elastic Fabric Adapter (EFA) device.
83+
type AmazonEfaDevice struct {
84+
Name string
85+
Ports map[uint]AmazonEfaPort
86+
}
87+
88+
// AmazonEfaClass is a collection of every Amazon Elastic Fabric Adapter (EFA) device in
89+
// /sys/class/infiniband.
90+
//
91+
// The map keys are the names of the Amazon Elastic Fabric Adapter (EFA) devices.
92+
type AmazonEfaClass map[string]AmazonEfaDevice
93+
94+
// AmazonEfaClass returns info for all Amazon Elastic Fabric Adapter (EFA) devices read from
95+
// /sys/class/infiniband.
96+
func (fs FS) AmazonEfaClass() (AmazonEfaClass, error) {
97+
path := fs.sys.Path(AmazonEfaPath)
98+
99+
dirs, err := os.ReadDir(path)
100+
if err != nil {
101+
return nil, err
102+
}
103+
104+
aec := make(AmazonEfaClass, len(dirs))
105+
for _, d := range dirs {
106+
device, err := fs.parseAmazonEfaDevice(d.Name())
107+
if err != nil {
108+
return nil, err
109+
}
110+
111+
aec[device.Name] = *device
112+
}
113+
114+
return aec, nil
115+
}
116+
117+
// Parse one AmazonEfa device.
118+
func (fs FS) parseAmazonEfaDevice(name string) (*AmazonEfaDevice, error) {
119+
path := fs.sys.Path(AmazonEfaPath, name)
120+
device := AmazonEfaDevice{Name: name}
121+
122+
portsPath := filepath.Join(path, "ports")
123+
ports, err := os.ReadDir(portsPath)
124+
if err != nil {
125+
return nil, fmt.Errorf("failed to list AmazonEfa ports at %q: %w", portsPath, err)
126+
}
127+
128+
device.Ports = make(map[uint]AmazonEfaPort, len(ports))
129+
for _, d := range ports {
130+
port, err := fs.parseAmazonEfaPort(name, d.Name())
131+
if err != nil {
132+
return nil, err
133+
}
134+
135+
device.Ports[port.Port] = *port
136+
}
137+
138+
return &device, nil
139+
}
140+
141+
// Scans predefined files in /sys/class/infiniband/<device>/ports/<port>
142+
// directory and gets their contents.
143+
func (fs FS) parseAmazonEfaPort(name string, port string) (*AmazonEfaPort, error) {
144+
portNumber, err := strconv.ParseUint(port, 10, 32)
145+
if err != nil {
146+
return nil, fmt.Errorf("failed to convert %s into uint", port)
147+
}
148+
aep := AmazonEfaPort{Name: name, Port: uint(portNumber)}
149+
150+
portPath := fs.sys.Path(AmazonEfaPath, name, "ports", port)
151+
content, err := os.ReadFile(filepath.Join(portPath, "state"))
152+
if err != nil {
153+
return nil, err
154+
}
155+
id, name, err := parseState(string(content))
156+
if err != nil {
157+
return nil, fmt.Errorf("could not parse state file in %q: %w", portPath, err)
158+
}
159+
aep.State = name
160+
aep.StateID = id
161+
162+
content, err = os.ReadFile(filepath.Join(portPath, "phys_state"))
163+
if err != nil {
164+
return nil, err
165+
}
166+
id, name, err = parseState(string(content))
167+
if err != nil {
168+
return nil, fmt.Errorf("could not parse phys_state file in %q: %w", portPath, err)
169+
}
170+
aep.PhysState = name
171+
aep.PhysStateID = id
172+
173+
content, err = os.ReadFile(filepath.Join(portPath, "rate"))
174+
if err != nil {
175+
return nil, err
176+
}
177+
aep.Rate, err = parseRate(string(content))
178+
if err != nil {
179+
return nil, fmt.Errorf("could not parse rate file in %q: %w", portPath, err)
180+
}
181+
182+
counters, err := parseAmazonEfaCounters(portPath)
183+
if err != nil {
184+
return nil, err
185+
}
186+
aep.Counters = *counters
187+
188+
return &aep, nil
189+
}
190+
191+
func parseAmazonEfaCounters(portPath string) (*AmazonEfaCounters, error) {
192+
var counters AmazonEfaCounters
193+
194+
path := filepath.Join(portPath, "hw_counters")
195+
files, err := os.ReadDir(path)
196+
if err != nil {
197+
return nil, err
198+
}
199+
200+
for _, f := range files {
201+
if !f.Type().IsRegular() {
202+
continue
203+
}
204+
205+
name := filepath.Join(path, f.Name())
206+
value, err := util.SysReadFile(name)
207+
if err != nil {
208+
if os.IsNotExist(err) || os.IsPermission(err) || err.Error() == "operation not supported" || err.Error() == "invalid argument" {
209+
continue
210+
}
211+
return nil, fmt.Errorf("failed to read file %q: %w", name, err)
212+
}
213+
214+
vp := util.NewValueParser(value)
215+
216+
switch f.Name() {
217+
218+
case "lifespan":
219+
counters.Lifespan = vp.PUInt64()
220+
case "rdma_read_bytes":
221+
counters.RdmaReadBytes = vp.PUInt64()
222+
case "rdma_read_resp_bytes":
223+
counters.RdmaReadRespBytes = vp.PUInt64()
224+
case "rdma_read_wr_err":
225+
counters.RdmaReadWrErr = vp.PUInt64()
226+
case "rdma_read_wrs":
227+
counters.RdmaReadWrs = vp.PUInt64()
228+
case "recv_bytes":
229+
counters.RecvBytes = vp.PUInt64()
230+
case "recv_wrs":
231+
counters.RecvWrs = vp.PUInt64()
232+
case "rx_bytes":
233+
counters.RxBytes = vp.PUInt64()
234+
case "rx_drops":
235+
counters.RxDrops = vp.PUInt64()
236+
case "rx_pkts":
237+
counters.RxPkts = vp.PUInt64()
238+
case "send_bytes":
239+
counters.SendBytes = vp.PUInt64()
240+
case "send_wrs":
241+
counters.SendWrs = vp.PUInt64()
242+
case "tx_bytes":
243+
counters.TxBytes = vp.PUInt64()
244+
case "tx_pkts":
245+
counters.TxPkts = vp.PUInt64()
246+
247+
if err != nil {
248+
// Ugly workaround for handling https://github.com/prometheus/node_exporter/issues/966
249+
// when counters are `N/A (not available)`.
250+
// This was already patched and submitted, see
251+
// https://www.spinics.net/lists/linux-rdma/msg68596.html
252+
// Remove this as soon as the fix lands in the enterprise distros.
253+
if strings.Contains(value, "N/A (no PMA)") {
254+
continue
255+
}
256+
return nil, err
257+
}
258+
}
259+
}
260+
261+
return &counters, nil
262+
}

0 commit comments

Comments
 (0)