Skip to content

Commit bc7ebe5

Browse files
winsopc, Zhen Wang
authored and committed
feat: Add PF VFIO support
This change enables ib-sriov-cni to support Physical Function (PF) passthrough in addition to Virtual Functions (VF), particularly for KubeVirt GPU workloads requiring InfiniBand RDMA. Closes #159 Signed-off-by: Zhen(Winson) Wang <zhewang@nvidia.com>
1 parent aecb881 commit bc7ebe5

File tree

5 files changed

+166
-66
lines changed

5 files changed

+166
-66
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,9 @@ echo 8 > /sys/class/net/ib0/device/sriov_numvfs
207207
* `rdmaIsolation` (boolean, optional): Enable RDMA network namespace isolation for RDMA workloads. More information
208208
about the system requirements to support this mode of operation can be found [here](https://github.com/Mellanox/rdma-cni)
209209
* `ibKubernetesEnabled` (bool, optional): Enforces ib-sriov-cni to work with [ib-kubernetes](https://www.github.com/Mellanox/ib-kubernetes).
210-
* `vfioPciMode` (boolean, optional): Enable VFIO mode for VF devices bound to vfio-pci driver. When enabled, the CNI skips network interface configuration as VFIO devices are used for direct device assignment (e.g., for kubevirt/VM workloads). Defaults to false. If not explicitly set, the mode is auto-detected based on the VF's driver binding.
210+
* `vfioPciMode` (boolean, optional): Enable VFIO mode for devices (VF or PF) bound to the vfio-pci driver. When enabled, the CNI skips network interface configuration, since VFIO devices are used for direct device assignment (e.g., for KubeVirt/VM workloads). Defaults to false. If not explicitly set, the mode is auto-detected based on the device's driver binding.
211+
212+
> *__Note__*: PF passthrough is only supported in VFIO mode. When using a PF device, it must be bound to the vfio-pci driver and `vfioPciMode` must be enabled (or auto-detected). Moving a PF's InfiniBand interface into a pod network namespace is not supported.
211213
212214
> *__Note__*: If `rdmaIsolation` is set to _true_, [`rdma-cni`](https://github.com/Mellanox/rdma-cni) should not be used.
213215

cmd/ib-sriov-cni/main.go

Lines changed: 117 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,6 @@ func unlockCNIExecution(lock *flock.Flock) {
9696
}
9797

9898
func handleVfioPciDetection(netConf *localtypes.NetConf) error {
99-
if netConf.DeviceID == "" {
100-
return fmt.Errorf("device ID is required for VFIO PCI detection")
101-
}
102-
10399
isVfioPci, err := utils.IsVfioPciDevice(netConf.DeviceID)
104100
if err != nil {
105101
return fmt.Errorf("failed to check vfio-pci driver binding for device %s: %v", netConf.DeviceID, err)
@@ -132,29 +128,45 @@ func getNetConfNetns(args *skel.CmdArgs) (*localtypes.NetConf, ns.NetNS, error)
132128
infiniBandAnnotation, configuredInfiniBand)
133129
}
134130

135-
netConf.GUID = getGUIDFromConf(netConf)
136-
137-
// Ensure GUID was provided if ib-kubernetes integration is enabled
138-
if netConf.IBKubernetesEnabled && netConf.GUID == "" {
139-
return nil, nil, fmt.Errorf(
140-
"infiniband SRIOV-CNI failed, Unexpected error. GUID must be provided by ib-kubernetes")
141-
}
142-
143131
if netConf.RdmaIsolation {
144132
err = utils.EnsureRdmaSystemMode()
145133
if err != nil {
146134
return nil, nil, err
147135
}
148136
}
149137

138+
// Validate deviceID is provided
139+
if netConf.DeviceID == "" {
140+
return nil, nil, fmt.Errorf("deviceID is required")
141+
}
142+
150143
// Handle vfio-pci detection
151144
if err := handleVfioPciDetection(netConf); err != nil {
152145
return nil, nil, err
153146
}
154147

155-
err = config.LoadDeviceInfo(netConf)
148+
// Check if device is PF or VF to load appropriate device info
149+
isVF, err := utils.IsVirtualFunction(netConf.DeviceID)
156150
if err != nil {
157-
return nil, nil, fmt.Errorf("failed to get device specific information. %v", err)
151+
return nil, nil, fmt.Errorf("failed to determine if device %s is VF or PF: %v", netConf.DeviceID, err)
152+
}
153+
netConf.IsVFDevice = isVF
154+
155+
netConf.GUID = getGUIDFromConf(netConf)
156+
157+
// Ensure GUID was provided if ib-kubernetes integration is enabled
158+
// Note: PF devices already have their own GUID, so only check for VF devices
159+
if netConf.IBKubernetesEnabled && netConf.IsVFDevice && netConf.GUID == "" {
160+
return nil, nil, fmt.Errorf(
161+
"infiniband SRIOV-CNI failed, Unexpected error. GUID must be provided by ib-kubernetes")
162+
}
163+
164+
// Only load VF device info for VF devices (a PF device bound to vfio-pci doesn't need this)
165+
if netConf.IsVFDevice {
166+
err = config.LoadDeviceInfo(netConf)
167+
if err != nil {
168+
return nil, nil, fmt.Errorf("failed to get VF device information: %v", err)
169+
}
158170
}
159171

160172
netns, err := ns.GetNS(args.Netns)
@@ -249,23 +261,11 @@ func runIPAMPlugin(stdinData []byte, netConf *localtypes.NetConf) (_ *current.Re
249261
return newResult, nil
250262
}
251263

252-
func cmdAdd(args *skel.CmdArgs) (retErr error) {
253-
netConf, netns, err := getNetConfNetns(args)
254-
if err != nil {
255-
return err
256-
}
257-
defer func() { _ = netns.Close() }()
258-
264+
// handleVFAdd handles VF device configuration in cmdAdd
265+
func handleVFAdd(args *skel.CmdArgs, netConf *localtypes.NetConf, netns ns.NetNS, result *current.Result) (retErr error) {
259266
sm := sriov.NewSriovManager()
260267

261-
// Lock CNI operation to serialize the operation
262-
lock, err := lockCNIExecution()
263-
if err != nil {
264-
return err
265-
}
266-
defer unlockCNIExecution(lock)
267-
268-
err = doVFConfig(sm, netConf, netns, args)
268+
err := doVFConfig(sm, netConf, netns, args)
269269
if err != nil {
270270
return err
271271
}
@@ -284,12 +284,6 @@ func cmdAdd(args *skel.CmdArgs) (retErr error) {
284284
}
285285
}()
286286

287-
result := &current.Result{}
288-
result.Interfaces = []*current.Interface{{
289-
Name: args.IfName,
290-
Sandbox: netns.Path(),
291-
}}
292-
293287
// VFIO devices don't have network interfaces, skip IPAM configuration
294288
if netConf.IPAM.Type != "" && !netConf.VfioPciMode {
295289
var newResult *current.Result
@@ -318,12 +312,54 @@ func cmdAdd(args *skel.CmdArgs) (retErr error) {
318312
return err
319313
}
320314

321-
result = newResult
315+
// Copy the new result into the caller's result struct so the caller observes it
316+
*result = *newResult
322317
}
323318

324319
// Cache NetConf for CmdDel
325320
if err = utils.SaveNetConf(args.ContainerID, config.DefaultCNIDir, args.IfName, netConf); err != nil {
326-
return fmt.Errorf("error saving NetConf %q", err)
321+
return fmt.Errorf("error saving NetConf: %v", err)
322+
}
323+
324+
return nil
325+
}
326+
327+
func cmdAdd(args *skel.CmdArgs) (retErr error) {
328+
netConf, netns, err := getNetConfNetns(args)
329+
if err != nil {
330+
return err
331+
}
332+
defer func() { _ = netns.Close() }()
333+
334+
// Lock CNI operation to serialize the operation
335+
lock, err := lockCNIExecution()
336+
if err != nil {
337+
return err
338+
}
339+
defer unlockCNIExecution(lock)
340+
341+
result := &current.Result{}
342+
result.Interfaces = []*current.Interface{{
343+
Name: args.IfName,
344+
Sandbox: netns.Path(),
345+
}}
346+
347+
// Check if device is PF (Physical Function) - flag was set in getNetConfNetns
348+
// PF passthrough devices don't need VF configuration
349+
if !netConf.IsVFDevice {
350+
if !netConf.VfioPciMode {
351+
return fmt.Errorf("PF device %s requires vfioPciMode to be enabled", netConf.DeviceID)
352+
}
353+
// PF device - just cache config and return success
354+
if err = utils.SaveNetConf(args.ContainerID, config.DefaultCNIDir, args.IfName, netConf); err != nil {
355+
return fmt.Errorf("error saving NetConf: %v", err)
356+
}
357+
} else {
358+
// VF device - continue with normal VF configuration
359+
err = handleVFAdd(args, netConf, netns, result)
360+
if err != nil {
361+
return err
362+
}
327363
}
328364

329365
return types.PrintResult(result, netConf.CNIVersion)
@@ -340,6 +376,38 @@ func handleIPAMCleanup(netConf *localtypes.NetConf, stdinData []byte) error {
340376
return ipam.ExecDel(netConf.IPAM.Type, stdinData)
341377
}
342378

379+
// handleVFCleanup performs VF-specific cleanup operations
380+
func handleVFCleanup(sm localtypes.Manager, netConf *localtypes.NetConf, args *skel.CmdArgs, netns ns.NetNS) error {
381+
// VFIO devices don't have network interfaces to release
382+
if !netConf.VfioPciMode {
383+
err := sm.ReleaseVF(netConf, args.IfName, args.ContainerID, netns)
384+
if err != nil {
385+
return err
386+
}
387+
}
388+
389+
// Move RDMA device to default namespace
390+
// Note(adrianc): Due to some unintuitive kernel behavior (which I hope will change), moving an RDMA device
391+
// to namespace causes all of its associated ULP devices (IPoIB) to be recreated in the default namespace.
392+
// we strategically place this here to allow:
393+
// 1. netdev cleanup during ReleaseVF.
394+
// 2. rdma dev netns cleanup as ResetVFConfig will rebind the VF.
395+
// Doing anything would have yielded the same results however ResetVFConfig will eventually not trigger VF rebind.
396+
if netConf.RdmaIsolation {
397+
err := utils.MoveRdmaDevFromNs(netConf.RdmaNetState.ContainerRdmaDevName, netns)
398+
if err != nil {
399+
return fmt.Errorf(
400+
"failed to restore RDMA device %s to default namespace. %v",
401+
netConf.RdmaNetState.ContainerRdmaDevName, err)
402+
}
403+
}
404+
405+
if err := sm.ResetVFConfig(netConf); err != nil {
406+
return fmt.Errorf("cmdDel() error resetting VF: %v", err)
407+
}
408+
return nil
409+
}
410+
343411
func cmdDel(args *skel.CmdArgs) (retErr error) {
344412
// https://github.com/kubernetes/kubernetes/pull/35240
345413
if args.Netns == "" {
@@ -388,41 +456,25 @@ func cmdDel(args *skel.CmdArgs) (retErr error) {
388456
}
389457
defer func() { _ = netns.Close() }()
390458

391-
// Lock CNI operation to serialize the operation
392-
lock, err := lockCNIExecution()
459+
// Detect if device is VF or PF at runtime during Del
460+
isVF, err := utils.IsVirtualFunction(netConf.DeviceID)
393461
if err != nil {
394-
return err
462+
return fmt.Errorf("failed to determine if device %s is VF or PF: %v", netConf.DeviceID, err)
395463
}
396-
defer unlockCNIExecution(lock)
397464

398-
// VFIO devices don't have network interfaces to release
399-
if !netConf.VfioPciMode {
400-
err = sm.ReleaseVF(netConf, args.IfName, args.ContainerID, netns)
401-
if err != nil {
402-
return err
403-
}
465+
// PF devices don't need VF cleanup
466+
if !isVF {
467+
return nil
404468
}
405469

406-
// Move RDMA device to default namespace
407-
// Note(adrianc): Due to some un-intuitive kernel behavior (which i hope will change), moving an RDMA device
408-
// to namespace causes all of its associated ULP devices (IPoIB) to be recreated in the default namespace.
409-
// we strategically place this here to allow:
410-
// 1. netedv cleanup during ReleaseVF.
411-
// 2. rdma dev netns cleanup as ResetVFConfig will rebind the VF.
412-
// Doing anything would have yielded the same results however ResetVFConfig will eventually not trigger VF rebind.
413-
if netConf.RdmaIsolation {
414-
err = utils.MoveRdmaDevFromNs(netConf.RdmaNetState.ContainerRdmaDevName, netns)
415-
if err != nil {
416-
return fmt.Errorf(
417-
"failed to restore RDMA device %s to default namespace. %v",
418-
netConf.RdmaNetState.ContainerRdmaDevName, err)
419-
}
470+
// Lock CNI operation to serialize the operation
471+
lock, err := lockCNIExecution()
472+
if err != nil {
473+
return err
420474
}
475+
defer unlockCNIExecution(lock)
421476

422-
if err = sm.ResetVFConfig(netConf); err != nil {
423-
return fmt.Errorf("cmdDel() error reseting VF: %q", err)
424-
}
425-
return nil
477+
return handleVFCleanup(sm, netConf, args, netns)
426478
}
427479

428480
func cmdCheck(args *skel.CmdArgs) error {

pkg/types/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ type IbSriovNetConf struct {
3131
RdmaIsolation bool `json:"rdmaIsolation,omitempty"`
3232
IBKubernetesEnabled bool `json:"ibKubernetesEnabled,omitempty"`
3333
VfioPciMode bool `json:"vfioPciMode,omitempty"` // Skip SR-IOV network setup, default false
34+
IsVFDevice bool `json:"-"` // Runtime flag: true if device is VF, false if PF
3435
RdmaNetState rdmatypes.RdmaNetState
3536
RuntimeConfig RuntimeConf `json:"runtimeConfig,omitempty"`
3637
Args struct {

pkg/utils/utils.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,3 +262,22 @@ func IsVfioPciDevice(pciAddr string) (bool, error) {
262262
driverName := filepath.Base(linkTarget)
263263
return driverName == VfioPciDriverName, nil
264264
}
265+
266+
// IsVirtualFunction reports whether a PCI device is a VF by checking for a physfn symlink; a missing path (including a non-existent device) yields false with no error
267+
func IsVirtualFunction(pciAddr string) (bool, error) {
268+
physfnPath := filepath.Join(SysBusPci, pciAddr, "physfn")
269+
270+
// Check if physfn symlink exists
271+
_, err := os.Lstat(physfnPath)
272+
if err != nil {
273+
if os.IsNotExist(err) {
274+
// physfn doesn't exist, so this is not a VF (likely a PF)
275+
return false, nil
276+
}
277+
// Other error occurred
278+
return false, err
279+
}
280+
281+
// physfn exists, so this is a VF
282+
return true, nil
283+
}

pkg/utils/utils_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,32 @@ var _ = Describe("Utils", func() {
8181
Expect(guid).To(Equal(""))
8282
})
8383
})
84+
Context("Checking IsVirtualFunction function", func() {
85+
It("Assuming VF device (has physfn)", func() {
86+
// This test assumes 0000:af:06.0 is a VF with physfn symlink
87+
result, err := IsVirtualFunction("0000:af:06.0")
88+
Expect(err).NotTo(HaveOccurred(), "Should not return error for valid PCI address")
89+
Expect(result).To(Equal(true), "VF device should return true")
90+
})
91+
It("Assuming PF device (no physfn)", func() {
92+
// Test with the actual PF device 0000:af:00.1 (ib0) from fixture
93+
result, err := IsVirtualFunction("0000:af:00.1")
94+
Expect(err).NotTo(HaveOccurred(), "Should not return error for valid PCI address")
95+
Expect(result).To(Equal(false), "PF device should return false")
96+
})
97+
It("Assuming VFIO VF device (has physfn)", func() {
98+
// Test with VFIO VF 0000:af:06.1 - should still be detected as VF
99+
result, err := IsVirtualFunction("0000:af:06.1")
100+
Expect(err).NotTo(HaveOccurred(), "Should not return error for valid PCI address")
101+
Expect(result).To(Equal(true), "VFIO VF device should still return true")
102+
})
103+
It("Assuming non-existing device", func() {
104+
// This should return false and no error for non-existing device
105+
result, err := IsVirtualFunction("0000:ff:ff.f")
106+
Expect(err).NotTo(HaveOccurred(), "Should not return error for non-existing device")
107+
Expect(result).To(Equal(false), "Non-existing device should return false")
108+
})
109+
})
84110
Context("Checking IsVfioPciDevice function", func() {
85111
It("Assuming device bound to vfio-pci driver", func() {
86112
// Test with VF (0000:af:06.1) that is bound to vfio-pci in the mock

0 commit comments

Comments
 (0)