Skip to content

Commit 8dba049

Browse files
committed
feat: Add PF VFIO support
This change enables ib-sriov-cni to support Physical Function (PF) passthrough in addition to Virtual Functions (VF), particularly for KubeVirt GPU workloads requiring InfiniBand RDMA. Closes #159 Signed-off-by: Zhen(Winson) Wang <zhewang@nvidia.com>
1 parent 223358e commit 8dba049

File tree

5 files changed

+153
-65
lines changed

5 files changed

+153
-65
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ echo 8 > /sys/class/net/ib0/device/sriov_numvfs
207207
* `rdmaIsolation` (boolean, optional): Enable RDMA network namespace isolation for RDMA workloads. More information
208208
about the system requirements to support this mode of operation can be found [here](https://github.com/Mellanox/rdma-cni)
209209
* `ibKubernetesEnabled` (bool, optional): Enforces ib-sriov-cni to work with [ib-kubernetes](https://www.github.com/Mellanox/ib-kubernetes).
210-
* `vfioPciMode` (boolean, optional): Enable VFIO mode for VF devices bound to vfio-pci driver. When enabled, the CNI skips network interface configuration as VFIO devices are used for direct device assignment (e.g., for kubevirt/VM workloads). Defaults to false. If not explicitly set, the mode is auto-detected based on the VF's driver binding.
210+
* `vfioPciMode` (boolean, optional): Enable VFIO mode for devices (VF or PF) bound to the `vfio-pci` driver. When enabled, the CNI skips network interface configuration, because VFIO devices are used for direct device assignment (e.g., for KubeVirt/VM workloads). Defaults to false. If not explicitly set, the mode is auto-detected from the device's driver binding.
211211

212212
> *__Note__*: If `rdmaIsolation` is set to _true_, [`rdma-cni`](https://github.com/Mellanox/rdma-cni) should not be used.
213213

cmd/ib-sriov-cni/main.go

Lines changed: 106 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,6 @@ func unlockCNIExecution(lock *flock.Flock) {
9696
}
9797

9898
func handleVfioPciDetection(netConf *localtypes.NetConf) error {
99-
if netConf.DeviceID == "" {
100-
return fmt.Errorf("device ID is required for VFIO PCI detection")
101-
}
102-
10399
isVfioPci, err := utils.IsVfioPciDevice(netConf.DeviceID)
104100
if err != nil {
105101
return fmt.Errorf("failed to check vfio-pci driver binding for device %s: %v", netConf.DeviceID, err)
@@ -132,29 +128,45 @@ func getNetConfNetns(args *skel.CmdArgs) (*localtypes.NetConf, ns.NetNS, error)
132128
infiniBandAnnotation, configuredInfiniBand)
133129
}
134130

135-
netConf.GUID = getGUIDFromConf(netConf)
136-
137-
// Ensure GUID was provided if ib-kubernetes integration is enabled
138-
if netConf.IBKubernetesEnabled && netConf.GUID == "" {
139-
return nil, nil, fmt.Errorf(
140-
"infiniband SRIOV-CNI failed, Unexpected error. GUID must be provided by ib-kubernetes")
141-
}
142-
143131
if netConf.RdmaIsolation {
144132
err = utils.EnsureRdmaSystemMode()
145133
if err != nil {
146134
return nil, nil, err
147135
}
148136
}
149137

138+
// Validate deviceID is provided
139+
if netConf.DeviceID == "" {
140+
return nil, nil, fmt.Errorf("deviceID is required")
141+
}
142+
150143
// Handle vfio-pci detection
151144
if err := handleVfioPciDetection(netConf); err != nil {
152145
return nil, nil, err
153146
}
154147

155-
err = config.LoadDeviceInfo(netConf)
148+
// Check if device is PF or VF to load appropriate device info
149+
isVF, err := utils.IsVirtualFunction(netConf.DeviceID)
156150
if err != nil {
157-
return nil, nil, fmt.Errorf("failed to get device specific information. %v", err)
151+
return nil, nil, fmt.Errorf("failed to determine if device %s is VF or PF: %v", netConf.DeviceID, err)
152+
}
153+
netConf.IsVFDevice = isVF
154+
155+
netConf.GUID = getGUIDFromConf(netConf)
156+
157+
// Ensure GUID was provided if ib-kubernetes integration is enabled
158+
// Note: PF devices already have their own GUID, so only check for VF devices
159+
if netConf.IBKubernetesEnabled && netConf.IsVFDevice && netConf.GUID == "" {
160+
return nil, nil, fmt.Errorf(
161+
"infiniband SRIOV-CNI failed, Unexpected error. GUID must be provided by ib-kubernetes")
162+
}
163+
164+
// Only load VF device info for VF devices (PF devices don't need this)
165+
if netConf.IsVFDevice {
166+
err = config.LoadDeviceInfo(netConf)
167+
if err != nil {
168+
return nil, nil, fmt.Errorf("failed to get VF device information: %v", err)
169+
}
158170
}
159171

160172
netns, err := ns.GetNS(args.Netns)
@@ -249,23 +261,11 @@ func runIPAMPlugin(stdinData []byte, netConf *localtypes.NetConf) (_ *current.Re
249261
return newResult, nil
250262
}
251263

252-
func cmdAdd(args *skel.CmdArgs) (retErr error) {
253-
netConf, netns, err := getNetConfNetns(args)
254-
if err != nil {
255-
return err
256-
}
257-
defer netns.Close()
258-
264+
// handleVFAdd handles VF device configuration in cmdAdd
265+
func handleVFAdd(args *skel.CmdArgs, netConf *localtypes.NetConf, netns ns.NetNS, result *current.Result) (retErr error) {
259266
sm := sriov.NewSriovManager()
260267

261-
// Lock CNI operation to serialize the operation
262-
lock, err := lockCNIExecution()
263-
if err != nil {
264-
return err
265-
}
266-
defer unlockCNIExecution(lock)
267-
268-
err = doVFConfig(sm, netConf, netns, args)
268+
err := doVFConfig(sm, netConf, netns, args)
269269
if err != nil {
270270
return err
271271
}
@@ -284,12 +284,6 @@ func cmdAdd(args *skel.CmdArgs) (retErr error) {
284284
}
285285
}()
286286

287-
result := &current.Result{}
288-
result.Interfaces = []*current.Interface{{
289-
Name: args.IfName,
290-
Sandbox: netns.Path(),
291-
}}
292-
293287
// VFIO devices don't have network interfaces, skip IPAM configuration
294288
if netConf.IPAM.Type != "" && !netConf.VfioPciMode {
295289
var newResult *current.Result
@@ -318,14 +312,53 @@ func cmdAdd(args *skel.CmdArgs) (retErr error) {
318312
return err
319313
}
320314

321-
result = newResult
315+
// Update result pointer to point to the new result
316+
*result = *newResult
322317
}
323318

324319
// Cache NetConf for CmdDel
325320
if err = utils.SaveNetConf(args.ContainerID, config.DefaultCNIDir, args.IfName, netConf); err != nil {
326321
return fmt.Errorf("error saving NetConf %q", err)
327322
}
328323

324+
return nil
325+
}
326+
327+
func cmdAdd(args *skel.CmdArgs) (retErr error) {
328+
netConf, netns, err := getNetConfNetns(args)
329+
if err != nil {
330+
return err
331+
}
332+
defer netns.Close()
333+
334+
// Lock CNI operation to serialize the operation
335+
lock, err := lockCNIExecution()
336+
if err != nil {
337+
return err
338+
}
339+
defer unlockCNIExecution(lock)
340+
341+
result := &current.Result{}
342+
result.Interfaces = []*current.Interface{{
343+
Name: args.IfName,
344+
Sandbox: netns.Path(),
345+
}}
346+
347+
// Check if device is PF (Physical Function) - flag was set in getNetConfNetns
348+
// PF passthrough devices don't need VF configuration
349+
if !netConf.IsVFDevice {
350+
// PF device - just cache config and return success
351+
if err = utils.SaveNetConf(args.ContainerID, config.DefaultCNIDir, args.IfName, netConf); err != nil {
352+
return fmt.Errorf("error saving NetConf %q", err)
353+
}
354+
} else {
355+
// VF device - continue with normal VF configuration
356+
err = handleVFAdd(args, netConf, netns, result)
357+
if err != nil {
358+
return err
359+
}
360+
}
361+
329362
return types.PrintResult(result, netConf.CNIVersion)
330363
}
331364

@@ -340,6 +373,38 @@ func handleIPAMCleanup(netConf *localtypes.NetConf, stdinData []byte) error {
340373
return ipam.ExecDel(netConf.IPAM.Type, stdinData)
341374
}
342375

376+
// handleVFCleanup performs VF-specific cleanup operations
377+
func handleVFCleanup(sm localtypes.Manager, netConf *localtypes.NetConf, args *skel.CmdArgs, netns ns.NetNS) error {
378+
// VFIO devices don't have network interfaces to release
379+
if !netConf.VfioPciMode {
380+
err := sm.ReleaseVF(netConf, args.IfName, args.ContainerID, netns)
381+
if err != nil {
382+
return err
383+
}
384+
}
385+
386+
// Move RDMA device to default namespace
387+
// Note(adrianc): Due to some un-intuitive kernel behavior (which i hope will change), moving an RDMA device
388+
// to namespace causes all of its associated ULP devices (IPoIB) to be recreated in the default namespace.
389+
// we strategically place this here to allow:
390+
// 1. netedv cleanup during ReleaseVF.
391+
// 2. rdma dev netns cleanup as ResetVFConfig will rebind the VF.
392+
// Doing anything would have yielded the same results however ResetVFConfig will eventually not trigger VF rebind.
393+
if netConf.RdmaIsolation {
394+
err := utils.MoveRdmaDevFromNs(netConf.RdmaNetState.ContainerRdmaDevName, netns)
395+
if err != nil {
396+
return fmt.Errorf(
397+
"failed to restore RDMA device %s to default namespace. %v",
398+
netConf.RdmaNetState.ContainerRdmaDevName, err)
399+
}
400+
}
401+
402+
if err := sm.ResetVFConfig(netConf); err != nil {
403+
return fmt.Errorf("cmdDel() error reseting VF: %q", err)
404+
}
405+
return nil
406+
}
407+
343408
func cmdDel(args *skel.CmdArgs) (retErr error) {
344409
// https://github.com/kubernetes/kubernetes/pull/35240
345410
if args.Netns == "" {
@@ -388,41 +453,18 @@ func cmdDel(args *skel.CmdArgs) (retErr error) {
388453
}
389454
defer netns.Close()
390455

456+
if !netConf.IsVFDevice {
457+
return nil
458+
}
459+
391460
// Lock CNI operation to serialize the operation
392461
lock, err := lockCNIExecution()
393462
if err != nil {
394463
return err
395464
}
396465
defer unlockCNIExecution(lock)
397466

398-
// VFIO devices don't have network interfaces to release
399-
if !netConf.VfioPciMode {
400-
err = sm.ReleaseVF(netConf, args.IfName, args.ContainerID, netns)
401-
if err != nil {
402-
return err
403-
}
404-
}
405-
406-
// Move RDMA device to default namespace
407-
// Note(adrianc): Due to some un-intuitive kernel behavior (which i hope will change), moving an RDMA device
408-
// to namespace causes all of its associated ULP devices (IPoIB) to be recreated in the default namespace.
409-
// we strategically place this here to allow:
410-
// 1. netedv cleanup during ReleaseVF.
411-
// 2. rdma dev netns cleanup as ResetVFConfig will rebind the VF.
412-
// Doing anything would have yielded the same results however ResetVFConfig will eventually not trigger VF rebind.
413-
if netConf.RdmaIsolation {
414-
err = utils.MoveRdmaDevFromNs(netConf.RdmaNetState.ContainerRdmaDevName, netns)
415-
if err != nil {
416-
return fmt.Errorf(
417-
"failed to restore RDMA device %s to default namespace. %v",
418-
netConf.RdmaNetState.ContainerRdmaDevName, err)
419-
}
420-
}
421-
422-
if err = sm.ResetVFConfig(netConf); err != nil {
423-
return fmt.Errorf("cmdDel() error reseting VF: %q", err)
424-
}
425-
return nil
467+
return handleVFCleanup(sm, netConf, args, netns)
426468
}
427469

428470
func cmdCheck(args *skel.CmdArgs) error {

pkg/types/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ type IbSriovNetConf struct {
3131
RdmaIsolation bool `json:"rdmaIsolation,omitempty"`
3232
IBKubernetesEnabled bool `json:"ibKubernetesEnabled,omitempty"`
3333
VfioPciMode bool `json:"vfioPciMode,omitempty"` // Skip SR-IOV network setup, default false
34+
IsVFDevice bool `json:"-"` // Runtime flag: true if device is VF, false if PF
3435
RdmaNetState rdmatypes.RdmaNetState
3536
RuntimeConfig RuntimeConf `json:"runtimeConfig,omitempty"`
3637
Args struct {

pkg/utils/utils.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,3 +262,22 @@ func IsVfioPciDevice(pciAddr string) (bool, error) {
262262
driverName := filepath.Base(linkTarget)
263263
return driverName == VfioPciDriverName, nil
264264
}
265+
266+
// IsVirtualFunction checks if a PCI device is a VF by checking for physfn symlink
267+
func IsVirtualFunction(pciAddr string) (bool, error) {
268+
physfnPath := filepath.Join(SysBusPci, pciAddr, "physfn")
269+
270+
// Check if physfn symlink exists
271+
_, err := os.Lstat(physfnPath)
272+
if err != nil {
273+
if os.IsNotExist(err) {
274+
// physfn doesn't exist, so this is not a VF (likely a PF)
275+
return false, nil
276+
}
277+
// Other error occurred
278+
return false, err
279+
}
280+
281+
// physfn exists, so this is a VF
282+
return true, nil
283+
}

pkg/utils/utils_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,32 @@ var _ = Describe("Utils", func() {
8181
Expect(guid).To(Equal(""))
8282
})
8383
})
84+
Context("Checking IsVirtualFunction function", func() {
85+
It("Assuming VF device (has physfn)", func() {
86+
// This test assumes 0000:af:06.0 is a VF with physfn symlink
87+
result, err := IsVirtualFunction("0000:af:06.0")
88+
Expect(err).NotTo(HaveOccurred(), "Should not return error for valid PCI address")
89+
Expect(result).To(Equal(true), "VF device should return true")
90+
})
91+
It("Assuming PF device (no physfn)", func() {
92+
// Test with the actual PF device 0000:af:00.1 (ib0) from fixture
93+
result, err := IsVirtualFunction("0000:af:00.1")
94+
Expect(err).NotTo(HaveOccurred(), "Should not return error for valid PCI address")
95+
Expect(result).To(Equal(false), "PF device should return false")
96+
})
97+
It("Assuming VFIO VF device (has physfn)", func() {
98+
// Test with VFIO VF 0000:af:06.1 - should still be detected as VF
99+
result, err := IsVirtualFunction("0000:af:06.1")
100+
Expect(err).NotTo(HaveOccurred(), "Should not return error for valid PCI address")
101+
Expect(result).To(Equal(true), "VFIO VF device should still return true")
102+
})
103+
It("Assuming non-existing device", func() {
104+
// This should return false and no error for non-existing device
105+
result, err := IsVirtualFunction("0000:ff:ff.f")
106+
Expect(err).NotTo(HaveOccurred(), "Should not return error for non-existing device")
107+
Expect(result).To(Equal(false), "Non-existing device should return false")
108+
})
109+
})
84110
Context("Checking IsVfioPciDevice function", func() {
85111
It("Assuming device bound to vfio-pci driver", func() {
86112
// Test with VF (0000:af:06.1) that is bound to vfio-pci in the mock

0 commit comments

Comments
 (0)