|
| 1 | +// Copyright 2022 Amazon Web Services |
| 2 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 3 | +// you may not use this file except in compliance with the License. |
| 4 | +// You may obtain a copy of the License at |
| 5 | +// |
| 6 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 7 | +// |
| 8 | +// Unless required by applicable law or agreed to in writing, software |
| 9 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 10 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 11 | +// See the License for the specific language governing permissions and |
| 12 | +// limitations under the License. |
| 13 | + |
| 14 | +//go:build linux |
| 15 | +// +build linux |
| 16 | + |
| 17 | +package sysfs |
| 18 | + |
| 19 | +import ( |
| 20 | + "fmt" |
| 21 | + "os" |
| 22 | + "path/filepath" |
| 23 | + "strconv" |
| 24 | + "strings" |
| 25 | + |
| 26 | + "github.com/prometheus/procfs/internal/util" |
| 27 | +) |
| 28 | + |
// Amazon Elastic Fabric Adapter counters are exposed similarly
// to InfiniBand counters in SysFS. The same structure is used
// in this file for consistency.

// AmazonEfaPath is the directory, relative to the sysfs root, that is
// scanned for Amazon Elastic Fabric Adapter (EFA) devices.
const AmazonEfaPath = "class/infiniband"
| 34 | + |
// AmazonEfaCounters contains counter values from files in
// /sys/class/infiniband/<Name>/ports/<Port>/hw_counters
// for a single port of one Amazon Elastic Fabric Adapter device.
//
// Every field is a pointer; a nil field means that no value was stored
// for that counter. The trailing comment on each field names the file the
// counter corresponds to.
type AmazonEfaCounters struct {
	AllocPdErr        *uint64 // hw_counters/alloc_pd_err
	AllocUcontextErr  *uint64 // hw_counters/alloc_ucontext_err
	CmdsErr           *uint64 // hw_counters/cmds_err
	CompletedCmds     *uint64 // hw_counters/completed_cmds
	CreateAhErr       *uint64 // hw_counters/create_ah_err
	CreateCqErr       *uint64 // hw_counters/create_cq_err
	CreateQpErr       *uint64 // hw_counters/create_qp_err
	KeepAliveRcvd     *uint64 // hw_counters/keep_alive_rcvd
	Lifespan          *uint64 // hw_counters/lifespan
	MmapErr           *uint64 // hw_counters/mmap_err
	NoCompletionCmds  *uint64 // hw_counters/no_completion_cmds
	RdmaReadBytes     *uint64 // hw_counters/rdma_read_bytes
	RdmaReadRespBytes *uint64 // hw_counters/rdma_read_resp_bytes
	RdmaReadWrErr     *uint64 // hw_counters/rdma_read_wr_err
	RdmaReadWrs       *uint64 // hw_counters/rdma_read_wrs
	RecvBytes         *uint64 // hw_counters/recv_bytes
	RecvWrs           *uint64 // hw_counters/recv_wrs
	RegMrErr          *uint64 // hw_counters/reg_mr_err
	RxBytes           *uint64 // hw_counters/rx_bytes
	RxDrops           *uint64 // hw_counters/rx_drops
	RxPkts            *uint64 // hw_counters/rx_pkts
	SendBytes         *uint64 // hw_counters/send_bytes
	SendWrs           *uint64 // hw_counters/send_wrs
	SubmittedCmds     *uint64 // hw_counters/submitted_cmds
	TxBytes           *uint64 // hw_counters/tx_bytes
	TxPkts            *uint64 // hw_counters/tx_pkts
}
| 66 | + |
// AmazonEfaPort contains info from files in
// /sys/class/infiniband/<Name>/ports/<Port>
// for a single port of one Amazon Elastic Fabric Adapter device.
type AmazonEfaPort struct {
	Name        string            // Name of the device this port belongs to
	Port        uint              // Port number, parsed from the port's directory name
	State       string            // String representation from /sys/class/infiniband/<Name>/ports/<Port>/state
	StateID     uint              // ID from /sys/class/infiniband/<Name>/ports/<Port>/state
	PhysState   string            // String representation from /sys/class/infiniband/<Name>/ports/<Port>/phys_state
	PhysStateID uint              // ID from /sys/class/infiniband/<Name>/ports/<Port>/phys_state
	Rate        uint64            // in bytes/second from /sys/class/infiniband/<Name>/ports/<Port>/rate
	Counters    AmazonEfaCounters // Values from /sys/class/infiniband/<Name>/ports/<Port>/hw_counters
}
| 80 | + |
// AmazonEfaDevice contains info from files in /sys/class/infiniband for a
// single Amazon Elastic Fabric Adapter (EFA) device.
type AmazonEfaDevice struct {
	Name  string                 // Device name, i.e. its directory name under /sys/class/infiniband
	Ports map[uint]AmazonEfaPort // Ports of the device, keyed by port number
}
| 87 | + |
// AmazonEfaClass is a collection of every Amazon Elastic Fabric Adapter (EFA)
// device found in /sys/class/infiniband.
//
// The map keys are the names of the Amazon Elastic Fabric Adapter (EFA) devices.
type AmazonEfaClass map[string]AmazonEfaDevice
| 93 | + |
| 94 | +// AmazonEfaClass returns info for all Amazon Elastic Fabric Adapter (EFA) devices read from |
| 95 | +// /sys/class/infiniband. |
| 96 | +func (fs FS) AmazonEfaClass() (AmazonEfaClass, error) { |
| 97 | + path := fs.sys.Path(AmazonEfaPath) |
| 98 | + |
| 99 | + dirs, err := os.ReadDir(path) |
| 100 | + if err != nil { |
| 101 | + return nil, err |
| 102 | + } |
| 103 | + |
| 104 | + aec := make(AmazonEfaClass, len(dirs)) |
| 105 | + for _, d := range dirs { |
| 106 | + device, err := fs.parseAmazonEfaDevice(d.Name()) |
| 107 | + if err != nil { |
| 108 | + return nil, err |
| 109 | + } |
| 110 | + |
| 111 | + aec[device.Name] = *device |
| 112 | + } |
| 113 | + |
| 114 | + return aec, nil |
| 115 | +} |
| 116 | + |
| 117 | +// Parse one AmazonEfa device. |
| 118 | +func (fs FS) parseAmazonEfaDevice(name string) (*AmazonEfaDevice, error) { |
| 119 | + path := fs.sys.Path(AmazonEfaPath, name) |
| 120 | + device := AmazonEfaDevice{Name: name} |
| 121 | + |
| 122 | + portsPath := filepath.Join(path, "ports") |
| 123 | + ports, err := os.ReadDir(portsPath) |
| 124 | + if err != nil { |
| 125 | + return nil, fmt.Errorf("failed to list AmazonEfa ports at %q: %w", portsPath, err) |
| 126 | + } |
| 127 | + |
| 128 | + device.Ports = make(map[uint]AmazonEfaPort, len(ports)) |
| 129 | + for _, d := range ports { |
| 130 | + port, err := fs.parseAmazonEfaPort(name, d.Name()) |
| 131 | + if err != nil { |
| 132 | + return nil, err |
| 133 | + } |
| 134 | + |
| 135 | + device.Ports[port.Port] = *port |
| 136 | + } |
| 137 | + |
| 138 | + return &device, nil |
| 139 | +} |
| 140 | + |
| 141 | +// Scans predefined files in /sys/class/infiniband/<device>/ports/<port> |
| 142 | +// directory and gets their contents. |
| 143 | +func (fs FS) parseAmazonEfaPort(name string, port string) (*AmazonEfaPort, error) { |
| 144 | + portNumber, err := strconv.ParseUint(port, 10, 32) |
| 145 | + if err != nil { |
| 146 | + return nil, fmt.Errorf("failed to convert %s into uint", port) |
| 147 | + } |
| 148 | + aep := AmazonEfaPort{Name: name, Port: uint(portNumber)} |
| 149 | + |
| 150 | + portPath := fs.sys.Path(AmazonEfaPath, name, "ports", port) |
| 151 | + content, err := os.ReadFile(filepath.Join(portPath, "state")) |
| 152 | + if err != nil { |
| 153 | + return nil, err |
| 154 | + } |
| 155 | + id, name, err := parseState(string(content)) |
| 156 | + if err != nil { |
| 157 | + return nil, fmt.Errorf("could not parse state file in %q: %w", portPath, err) |
| 158 | + } |
| 159 | + aep.State = name |
| 160 | + aep.StateID = id |
| 161 | + |
| 162 | + content, err = os.ReadFile(filepath.Join(portPath, "phys_state")) |
| 163 | + if err != nil { |
| 164 | + return nil, err |
| 165 | + } |
| 166 | + id, name, err = parseState(string(content)) |
| 167 | + if err != nil { |
| 168 | + return nil, fmt.Errorf("could not parse phys_state file in %q: %w", portPath, err) |
| 169 | + } |
| 170 | + aep.PhysState = name |
| 171 | + aep.PhysStateID = id |
| 172 | + |
| 173 | + content, err = os.ReadFile(filepath.Join(portPath, "rate")) |
| 174 | + if err != nil { |
| 175 | + return nil, err |
| 176 | + } |
| 177 | + aep.Rate, err = parseRate(string(content)) |
| 178 | + if err != nil { |
| 179 | + return nil, fmt.Errorf("could not parse rate file in %q: %w", portPath, err) |
| 180 | + } |
| 181 | + |
| 182 | + counters, err := parseAmazonEfaCounters(portPath) |
| 183 | + if err != nil { |
| 184 | + return nil, err |
| 185 | + } |
| 186 | + aep.Counters = *counters |
| 187 | + |
| 188 | + return &aep, nil |
| 189 | +} |
| 190 | + |
| 191 | +func parseAmazonEfaCounters(portPath string) (*AmazonEfaCounters, error) { |
| 192 | + var counters AmazonEfaCounters |
| 193 | + |
| 194 | + path := filepath.Join(portPath, "hw_counters") |
| 195 | + files, err := os.ReadDir(path) |
| 196 | + if err != nil { |
| 197 | + return nil, err |
| 198 | + } |
| 199 | + |
| 200 | + for _, f := range files { |
| 201 | + if !f.Type().IsRegular() { |
| 202 | + continue |
| 203 | + } |
| 204 | + |
| 205 | + name := filepath.Join(path, f.Name()) |
| 206 | + value, err := util.SysReadFile(name) |
| 207 | + if err != nil { |
| 208 | + if os.IsNotExist(err) || os.IsPermission(err) || err.Error() == "operation not supported" || err.Error() == "invalid argument" { |
| 209 | + continue |
| 210 | + } |
| 211 | + return nil, fmt.Errorf("failed to read file %q: %w", name, err) |
| 212 | + } |
| 213 | + |
| 214 | + vp := util.NewValueParser(value) |
| 215 | + |
| 216 | + switch f.Name() { |
| 217 | + |
| 218 | + case "lifespan": |
| 219 | + counters.Lifespan = vp.PUInt64() |
| 220 | + case "rdma_read_bytes": |
| 221 | + counters.RdmaReadBytes = vp.PUInt64() |
| 222 | + case "rdma_read_resp_bytes": |
| 223 | + counters.RdmaReadRespBytes = vp.PUInt64() |
| 224 | + case "rdma_read_wr_err": |
| 225 | + counters.RdmaReadWrErr = vp.PUInt64() |
| 226 | + case "rdma_read_wrs": |
| 227 | + counters.RdmaReadWrs = vp.PUInt64() |
| 228 | + case "recv_bytes": |
| 229 | + counters.RecvBytes = vp.PUInt64() |
| 230 | + case "recv_wrs": |
| 231 | + counters.RecvWrs = vp.PUInt64() |
| 232 | + case "rx_bytes": |
| 233 | + counters.RxBytes = vp.PUInt64() |
| 234 | + case "rx_drops": |
| 235 | + counters.RxDrops = vp.PUInt64() |
| 236 | + case "rx_pkts": |
| 237 | + counters.RxPkts = vp.PUInt64() |
| 238 | + case "send_bytes": |
| 239 | + counters.SendBytes = vp.PUInt64() |
| 240 | + case "send_wrs": |
| 241 | + counters.SendWrs = vp.PUInt64() |
| 242 | + case "tx_bytes": |
| 243 | + counters.TxBytes = vp.PUInt64() |
| 244 | + case "tx_pkts": |
| 245 | + counters.TxPkts = vp.PUInt64() |
| 246 | + |
| 247 | + if err != nil { |
| 248 | + // Ugly workaround for handling https://github.com/prometheus/node_exporter/issues/966 |
| 249 | + // when counters are `N/A (not available)`. |
| 250 | + // This was already patched and submitted, see |
| 251 | + // https://www.spinics.net/lists/linux-rdma/msg68596.html |
| 252 | + // Remove this as soon as the fix lands in the enterprise distros. |
| 253 | + if strings.Contains(value, "N/A (no PMA)") { |
| 254 | + continue |
| 255 | + } |
| 256 | + return nil, err |
| 257 | + } |
| 258 | + } |
| 259 | + } |
| 260 | + |
| 261 | + return &counters, nil |
| 262 | +} |
0 commit comments