Skip to content

Commit 313f8b1

Browse files
SchSeba and cursoragent committed
Fix "message too long" error for InfiniBand devices with many VFs
When discovering or configuring InfiniBand devices with many VFs, netlink.LinkByName() can fail with EMSGSIZE (message too long) because the kernel's netlink response exceeds the message limit (~16KB). This happens because LinkByName requests VF info, which generates large messages for IB devices with their GUIDs.

This commit adds two new methods to the netlink wrapper:

1. LinkByNameForSetVf: Returns a minimal link object (name and index only) by reading from sysfs. Sufficient for VF operations like LinkSetVfNodeGUID.
2. LinkByNameWithBasicInfo: Returns a link with basic info (name, index, MTU, MAC, EncapType). If netlink fails with EMSGSIZE, it falls back to reading these values from sysfs.

The fix is applied to:

- DiscoverSriovDevices: Uses LinkByNameWithBasicInfo for PF discovery
- getVfInfo: Uses LinkByNameWithBasicInfo for VF info discovery
- GetLinkType: Uses LinkByNameWithBasicInfo for link type detection
- configSriovVFDevices: Falls back to LinkByNameForSetVf on EMSGSIZE
- configSriovDevice: Falls back to LinkByNameForSetVf on EMSGSIZE
- GetNetdevMTU: Uses LinkByNameWithBasicInfo
- GetNetDevMac: Uses LinkByNameWithBasicInfo
- SetNetdevMTU: Falls back to LinkByNameForSetVf on EMSGSIZE

Includes path traversal protection for interface names to prevent security issues when reading from sysfs.

Signed-off-by: User <user@example.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 3ef4f0d commit 313f8b1

File tree

6 files changed

+237
-19
lines changed

6 files changed

+237
-19
lines changed

pkg/host/internal/lib/netlink/mock/mock_netlink.go

Lines changed: 30 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/host/internal/lib/netlink/netlink.go

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,16 @@
11
package netlink
22

33
import (
4+
"errors"
5+
"fmt"
46
"net"
7+
"os"
8+
"strconv"
9+
"strings"
10+
"syscall"
511

612
"github.com/vishvananda/netlink"
13+
"sigs.k8s.io/controller-runtime/pkg/log"
714
)
815

916
func New() NetlinkLib {
@@ -24,6 +31,14 @@ type NetlinkLib interface {
2431
LinkSetVfPortGUID(link Link, vf int, portguid net.HardwareAddr) error
2532
// LinkByName finds a link by name and returns a pointer to the object.
2633
LinkByName(name string) (Link, error)
34+
// LinkByNameForSetVf returns a minimal link object (name and index only) that can be used
35+
// for VF operations like LinkSetVfNodeGUID. This reads from sysfs instead of netlink
36+
// to avoid the "message too long" error for InfiniBand devices with many VFs.
37+
LinkByNameForSetVf(name string) (Link, error)
38+
// LinkByNameWithBasicInfo returns a link with basic info (name, index, MTU, MAC, EncapType).
39+
// If standard netlink query fails with EMSGSIZE (common for IB devices with many VFs),
40+
// it falls back to reading from sysfs.
41+
LinkByNameWithBasicInfo(name string) (Link, error)
2742
// LinkByIndex finds a link by index and returns a pointer to the object.
2843
LinkByIndex(index int) (Link, error)
2944
// LinkList gets a list of link devices.
@@ -91,6 +106,128 @@ func (w *libWrapper) LinkByName(name string) (Link, error) {
91106
return netlink.LinkByName(name)
92107
}
93108

109+
// LinkByNameForSetVf returns a minimal link object (name and index only) that can be used
110+
// for VF operations like LinkSetVfNodeGUID. This reads from sysfs instead of netlink
111+
// to avoid the "message too long" error for InfiniBand devices with many VFs.
112+
func (w *libWrapper) LinkByNameForSetVf(name string) (Link, error) {
113+
log.Log.V(2).Info("LinkByNameForSetVf(): getting minimal link info from sysfs", "name", name)
114+
115+
// Sanitize the interface name to prevent path traversal
116+
if strings.ContainsAny(name, "/\\") {
117+
return nil, fmt.Errorf("invalid interface name, contains path separators: %s", name)
118+
}
119+
120+
// Read interface index from sysfs
121+
indexPath := fmt.Sprintf("/sys/class/net/%s/ifindex", name)
122+
indexData, err := os.ReadFile(indexPath)
123+
if err != nil {
124+
return nil, fmt.Errorf("failed to read interface index from %s: %w", indexPath, err)
125+
}
126+
127+
index, err := strconv.Atoi(strings.TrimSpace(string(indexData)))
128+
if err != nil {
129+
return nil, fmt.Errorf("failed to parse interface index: %w", err)
130+
}
131+
132+
// Create a minimal Device link with just name and index
133+
// This is sufficient for VF operations like LinkSetVfNodeGUID
134+
link := &netlink.Device{}
135+
link.Attrs().Name = name
136+
link.Attrs().Index = index
137+
138+
log.Log.V(2).Info("LinkByNameForSetVf(): created minimal link", "name", name, "index", index)
139+
return link, nil
140+
}
141+
142+
// LinkByNameWithBasicInfo returns a link with basic info (name, index, MTU, MAC, EncapType).
143+
// If standard netlink query fails with EMSGSIZE (common for IB devices with many VFs),
144+
// it falls back to reading from sysfs.
145+
func (w *libWrapper) LinkByNameWithBasicInfo(name string) (Link, error) {
146+
// First, try standard LinkByName
147+
link, err := netlink.LinkByName(name)
148+
if err == nil {
149+
return link, nil
150+
}
151+
152+
// If it's not EMSGSIZE, return the original error
153+
if !errors.Is(err, syscall.EMSGSIZE) {
154+
return nil, err
155+
}
156+
157+
log.Log.V(2).Info("LinkByNameWithBasicInfo(): LinkByName failed with EMSGSIZE, using sysfs fallback", "name", name)
158+
159+
// Sanitize the interface name to prevent path traversal
160+
if strings.ContainsAny(name, "/\\") {
161+
return nil, fmt.Errorf("invalid interface name, contains path separators: %s", name)
162+
}
163+
164+
basePath := fmt.Sprintf("/sys/class/net/%s", name)
165+
166+
// Read interface index
167+
indexData, err := os.ReadFile(basePath + "/ifindex")
168+
if err != nil {
169+
return nil, fmt.Errorf("failed to read interface index: %w", err)
170+
}
171+
index, err := strconv.Atoi(strings.TrimSpace(string(indexData)))
172+
if err != nil {
173+
return nil, fmt.Errorf("failed to parse interface index: %w", err)
174+
}
175+
176+
// Read MTU
177+
mtuData, err := os.ReadFile(basePath + "/mtu")
178+
if err != nil {
179+
return nil, fmt.Errorf("failed to read MTU: %w", err)
180+
}
181+
mtu, err := strconv.Atoi(strings.TrimSpace(string(mtuData)))
182+
if err != nil {
183+
return nil, fmt.Errorf("failed to parse MTU: %w", err)
184+
}
185+
186+
// Read MAC address
187+
macData, err := os.ReadFile(basePath + "/address")
188+
if err != nil {
189+
return nil, fmt.Errorf("failed to read MAC address: %w", err)
190+
}
191+
mac, err := net.ParseMAC(strings.TrimSpace(string(macData)))
192+
if err != nil {
193+
return nil, fmt.Errorf("failed to parse MAC address: %w", err)
194+
}
195+
196+
// Read interface type to determine encap type
197+
typeData, err := os.ReadFile(basePath + "/type")
198+
if err != nil {
199+
return nil, fmt.Errorf("failed to read interface type: %w", err)
200+
}
201+
ifType, err := strconv.Atoi(strings.TrimSpace(string(typeData)))
202+
if err != nil {
203+
return nil, fmt.Errorf("failed to parse interface type: %w", err)
204+
}
205+
206+
// Map ARPHRD type to encap type string
207+
// See linux/if_arp.h for ARPHRD_* constants
208+
var encapType string
209+
switch ifType {
210+
case 1: // ARPHRD_ETHER
211+
encapType = "ether"
212+
case 32: // ARPHRD_INFINIBAND
213+
encapType = "infiniband"
214+
default:
215+
encapType = fmt.Sprintf("unknown(%d)", ifType)
216+
}
217+
218+
// Create a link with the basic info
219+
link = &netlink.Device{}
220+
link.Attrs().Name = name
221+
link.Attrs().Index = index
222+
link.Attrs().MTU = mtu
223+
link.Attrs().HardwareAddr = mac
224+
link.Attrs().EncapType = encapType
225+
226+
log.Log.V(2).Info("LinkByNameWithBasicInfo(): created link from sysfs",
227+
"name", name, "index", index, "mtu", mtu, "mac", mac.String(), "encapType", encapType)
228+
return link, nil
229+
}
230+
94231
// LinkByIndex finds a link by index and returns a pointer to the object.
95232
func (w *libWrapper) LinkByIndex(index int) (Link, error) {
96233
return netlink.LinkByIndex(index)

pkg/host/internal/network/network.go

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"path/filepath"
99
"strconv"
1010
"strings"
11+
"syscall"
1112
"time"
1213

1314
"github.com/cenkalti/backoff"
@@ -161,7 +162,9 @@ func (n *network) GetNetdevMTU(pciAddr string) int {
161162
return 0
162163
}
163164

164-
link, err := n.netlinkLib.LinkByName(ifaceName)
165+
// Use LinkByNameWithBasicInfo which falls back to sysfs on EMSGSIZE.
166+
// This handles InfiniBand devices with many VFs.
167+
link, err := n.netlinkLib.LinkByNameWithBasicInfo(ifaceName)
165168
if err != nil {
166169
log.Log.Error(err, "GetNetdevMTU(): fail to get Link ", "device", ifaceName)
167170
return 0
@@ -184,10 +187,21 @@ func (n *network) SetNetdevMTU(pciAddr string, mtu int) error {
184187
return fmt.Errorf("failed to get netdevice for device %s", pciAddr)
185188
}
186189

190+
// Try standard LinkByName first. For InfiniBand devices with many VFs, this may fail
191+
// with EMSGSIZE because the kernel's netlink response exceeds the message limit.
192+
// In that case, fall back to LinkByNameForSetVf which reads minimal info from sysfs.
193+
// LinkSetMTU only needs the link index.
187194
link, err := n.netlinkLib.LinkByName(ifaceName)
188195
if err != nil {
189-
log.Log.Error(err, "SetNetdevMTU(): fail to get Link ", "device", ifaceName)
190-
return err
196+
if errors.Is(err, syscall.EMSGSIZE) {
197+
log.Log.V(2).Info("SetNetdevMTU(): LinkByName failed with EMSGSIZE, using sysfs fallback",
198+
"device", ifaceName)
199+
link, err = n.netlinkLib.LinkByNameForSetVf(ifaceName)
200+
}
201+
if err != nil {
202+
log.Log.Error(err, "SetNetdevMTU(): fail to get Link ", "device", ifaceName)
203+
return err
204+
}
191205
}
192206
return n.netlinkLib.LinkSetMTU(link, mtu)
193207
}, backoff.WithMaxRetries(b, 10))
@@ -203,7 +217,9 @@ func (n *network) SetNetdevMTU(pciAddr string, mtu int) error {
203217
// retrieved.
204218
func (n *network) GetNetDevMac(ifaceName string) string {
205219
log.Log.V(2).Info("GetNetDevMac(): get Mac", "device", ifaceName)
206-
link, err := n.netlinkLib.LinkByName(ifaceName)
220+
// Use LinkByNameWithBasicInfo which falls back to sysfs on EMSGSIZE.
221+
// This handles InfiniBand devices with many VFs.
222+
link, err := n.netlinkLib.LinkByNameWithBasicInfo(ifaceName)
207223
if err != nil {
208224
log.Log.Error(err, "GetNetDevMac(): failed to get Link", "device", ifaceName)
209225
return ""

pkg/host/internal/network/network_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -358,14 +358,14 @@ var _ = Describe("Network", func() {
358358
})
359359
It("should return 0 if not able to get interface by name", func() {
360360
dputilsLibMock.EXPECT().GetNetNames("0000:d8:00.0").Return([]string{"eno1"}, nil)
361-
netlinkLibMock.EXPECT().LinkByName("eno1").Return(nil, fmt.Errorf("failed to get interface"))
361+
netlinkLibMock.EXPECT().LinkByNameWithBasicInfo("eno1").Return(nil, fmt.Errorf("failed to get interface"))
362362
mtu := n.GetNetdevMTU("0000:d8:00.0")
363363
Expect(mtu).To(Equal(0))
364364
})
365365
It("should return mtu for interface", func() {
366366
dputilsLibMock.EXPECT().GetNetNames("0000:d8:00.0").Return([]string{"eno1"}, nil)
367367
link := &netlink.GenericLink{LinkType: "PF", LinkAttrs: netlink.LinkAttrs{Name: "eno1", MTU: 1500}}
368-
netlinkLibMock.EXPECT().LinkByName("eno1").Return(link, nil)
368+
netlinkLibMock.EXPECT().LinkByNameWithBasicInfo("eno1").Return(link, nil)
369369
mtu := n.GetNetdevMTU("0000:d8:00.0")
370370
Expect(mtu).To(Equal(1500))
371371
})
@@ -386,13 +386,13 @@ var _ = Describe("Network", func() {
386386
})
387387
Context("GetNetDevMac", func() {
388388
It("should return empty mac address if not able to get interface by link", func() {
389-
netlinkLibMock.EXPECT().LinkByName("eno1").Return(nil, fmt.Errorf("failed to find intreface"))
389+
netlinkLibMock.EXPECT().LinkByNameWithBasicInfo("eno1").Return(nil, fmt.Errorf("failed to find intreface"))
390390
mac := n.GetNetDevMac("eno1")
391391
Expect(mac).To(BeEmpty())
392392
})
393393
It("should return interface mac address", func() {
394394
link := &netlink.GenericLink{LinkType: "PF", LinkAttrs: netlink.LinkAttrs{Name: "eno1", HardwareAddr: net.HardwareAddr{0x00, 0x00, 0x5e, 0x00, 0x53, 0x01}}}
395-
netlinkLibMock.EXPECT().LinkByName("eno1").Return(link, nil)
395+
netlinkLibMock.EXPECT().LinkByNameWithBasicInfo("eno1").Return(link, nil)
396396
mac := n.GetNetDevMac("eno1")
397397
Expect(mac).To(Equal("00:00:5e:00:53:01"))
398398
})

pkg/host/internal/sriov/sriov.go

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,9 @@ func (s *sriov) getVfInfo(vfAddr string, pfName string, eswitchMode string, devi
149149
}
150150

151151
if name := s.networkHelper.TryGetInterfaceName(vfAddr); name != "" {
152-
link, err := s.netlinkLib.LinkByName(name)
152+
// Use LinkByNameWithBasicInfo which falls back to sysfs on EMSGSIZE.
153+
// This handles InfiniBand VFs where netlink can fail.
154+
link, err := s.netlinkLib.LinkByNameWithBasicInfo(name)
153155
if err != nil {
154156
log.Log.Error(err, "getVfInfo(): unable to get VF Link Object", "name", name, "device", vfAddr)
155157
} else {
@@ -270,7 +272,10 @@ func (s *sriov) DiscoverSriovDevices(storeManager store.ManagerInterface) ([]sri
270272
continue
271273
}
272274

273-
link, err := s.netlinkLib.LinkByName(pfNetName)
275+
// Use LinkByNameWithBasicInfo which falls back to sysfs if netlink fails with EMSGSIZE.
276+
// This handles InfiniBand devices with many VFs where the netlink response can exceed
277+
// the kernel's message size limit.
278+
link, err := s.netlinkLib.LinkByNameWithBasicInfo(pfNetName)
274279
if err != nil {
275280
log.Log.Error(err, "DiscoverSriovDevices(): unable to get Link for device, skipping", "device", device.Address)
276281
continue
@@ -507,10 +512,20 @@ func (s *sriov) configSriovVFDevices(iface *sriovnetworkv1.Interface) error {
507512
if err != nil {
508513
log.Log.Error(err, "configSriovVFDevices(): unable to parse VFs for device", "device", iface.PciAddress)
509514
}
515+
// Try standard LinkByName first. For InfiniBand devices with many VFs, this may fail
516+
// with EMSGSIZE because the kernel's netlink response exceeds the message limit.
517+
// In that case, fall back to LinkByNameForSetVf which reads minimal info from sysfs.
510518
pfLink, err := s.netlinkLib.LinkByName(iface.Name)
511519
if err != nil {
512-
log.Log.Error(err, "configSriovVFDevices(): unable to get PF link for device", "device", iface)
513-
return err
520+
if errors.Is(err, syscall.EMSGSIZE) {
521+
log.Log.V(2).Info("configSriovVFDevices(): LinkByName failed with EMSGSIZE, using sysfs fallback",
522+
"device", iface.Name)
523+
pfLink, err = s.netlinkLib.LinkByNameForSetVf(iface.Name)
524+
}
525+
if err != nil {
526+
log.Log.Error(err, "configSriovVFDevices(): unable to get PF link for device", "device", iface)
527+
return err
528+
}
514529
}
515530

516531
for _, addr := range vfAddrs {
@@ -655,10 +670,23 @@ func (s *sriov) configSriovDevice(iface *sriovnetworkv1.Interface, skipVFConfigu
655670
return err
656671
}
657672
// Set PF link up
673+
// Try standard LinkByName first. For InfiniBand devices with many VFs, this may fail
674+
// with EMSGSIZE because the kernel's netlink response exceeds the message limit.
675+
// In that case, fall back to LinkByNameForSetVf which reads minimal info from sysfs.
658676
pfLink, err := s.netlinkLib.LinkByName(iface.Name)
659677
if err != nil {
660-
return err
678+
if errors.Is(err, syscall.EMSGSIZE) {
679+
log.Log.V(2).Info("configSriovDevice(): LinkByName failed with EMSGSIZE, using sysfs fallback",
680+
"device", iface.Name)
681+
pfLink, err = s.netlinkLib.LinkByNameForSetVf(iface.Name)
682+
}
683+
if err != nil {
684+
return err
685+
}
661686
}
687+
// Note: With the sysfs fallback, IsLinkAdminStateUp will always return false because
688+
// the minimal link object doesn't have the Flags field populated. However, LinkSetUp
689+
// is idempotent - calling it on an already-up link is a no-op.
662690
if !s.netlinkLib.IsLinkAdminStateUp(pfLink) {
663691
err = s.netlinkLib.LinkSetUp(pfLink)
664692
if err != nil {
@@ -1042,7 +1070,9 @@ func (s *sriov) SetNicSriovMode(pciAddress string, mode string) error {
10421070

10431071
func (s *sriov) GetLinkType(name string) string {
10441072
log.Log.V(2).Info("GetLinkType()", "name", name)
1045-
link, err := s.netlinkLib.LinkByName(name)
1073+
// Use LinkByNameWithBasicInfo which falls back to sysfs if netlink fails with EMSGSIZE.
1074+
// This handles InfiniBand devices with many VFs.
1075+
link, err := s.netlinkLib.LinkByNameWithBasicInfo(name)
10461076
if err != nil {
10471077
log.Log.Error(err, "GetLinkType(): failed to get link", "device", name)
10481078
return ""

0 commit comments

Comments
 (0)