Skip to content

Commit ff07f0c

Browse files
authored
Merge pull request #313 from zeeke/us/ocpbugs-45028
Avoid resetting the VF if the netns does not exist
2 parents d1f41e4 + 9332ce0 commit ff07f0c

File tree

5 files changed

+80
-3
lines changed

5 files changed

+80
-3
lines changed

.github/workflows/buildtest.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ jobs:
8080
env:
8181
LOCAL_SRIOV_CNI_IMAGE: ghaction-sriov-cni:pr-${{github.event.pull_request.number}}
8282

83-
- uses: actions/upload-artifact@v3
83+
- uses: actions/upload-artifact@v4
8484
if: always()
8585
with:
8686
name: ${{ env.TEST_REPORT_PATH }}

cmd/sriov/main.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,17 @@ func cmdDel(args *skel.CmdArgs) error {
245245
return nil
246246
}
247247

248+
allocator := utils.NewPCIAllocator(config.DefaultCNIDir)
249+
250+
err = allocator.Lock(netConf.DeviceID)
251+
if err != nil {
252+
return fmt.Errorf("cmdDel() error obtaining lock for device [%s]: %w", netConf.DeviceID, err)
253+
}
254+
255+
logging.Debug("Acquired device lock",
256+
"func", "cmdDel",
257+
"DeviceID", netConf.DeviceID)
258+
248259
defer func() {
249260
if err == nil && cRefPath != "" {
250261
_ = utils.CleanCachedNetConf(cRefPath)
@@ -270,6 +281,9 @@ func cmdDel(args *skel.CmdArgs) error {
270281

271282
sm := sriov.NewSriovManager()
272283

284+
logging.Debug("Reset VF configuration",
285+
"func", "cmdDel",
286+
"netConf.DeviceID", netConf.DeviceID)
273287
/* ResetVFConfig resets a VF administratively. We must run ResetVFConfig
274288
before ReleaseVF because some drivers will error out if we try to
275289
reset netdev VF with trust off. So, reset VF MAC address via PF first.
@@ -288,13 +302,22 @@ func cmdDel(args *skel.CmdArgs) error {
288302
// IPAM resources
289303
_, ok := err.(ns.NSPathNotExistErr)
290304
if ok {
305+
logging.Debug("Exiting as the network namespace does not exists anymore",
306+
"func", "cmdDel",
307+
"netConf.DeviceID", netConf.DeviceID,
308+
"args.Netns", args.Netns)
291309
return nil
292310
}
293311

294312
return fmt.Errorf("failed to open netns %s: %q", netns, err)
295313
}
296314
defer netns.Close()
297315

316+
logging.Debug("Release the VF",
317+
"func", "cmdDel",
318+
"netConf.DeviceID", netConf.DeviceID,
319+
"args.Netns", args.Netns,
320+
"args.IfName", args.IfName)
298321
if err = sm.ReleaseVF(netConf, args.IfName, netns); err != nil {
299322
return err
300323
}
@@ -305,7 +328,6 @@ func cmdDel(args *skel.CmdArgs) error {
305328
"func", "cmdDel",
306329
"config.DefaultCNIDir", config.DefaultCNIDir,
307330
"netConf.DeviceID", netConf.DeviceID)
308-
allocator := utils.NewPCIAllocator(config.DefaultCNIDir)
309331
if err = allocator.DeleteAllocatedPCI(netConf.DeviceID); err != nil {
310332
return fmt.Errorf("error cleaning the pci allocation for vf pci address %s: %v", netConf.DeviceID, err)
311333
}

pkg/config/config.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,15 @@ func LoadConf(bytes []byte) (*sriovtypes.NetConf, error) {
4848
return nil, fmt.Errorf("LoadConf(): VF pci addr is required")
4949
}
5050

51+
allocator := utils.NewPCIAllocator(DefaultCNIDir)
52+
err := allocator.Lock(n.DeviceID)
53+
if err != nil {
54+
return nil, err
55+
}
56+
logging.Debug("Acquired device lock",
57+
"func", "LoadConf",
58+
"DeviceID", n.DeviceID)
59+
5160
// Check if the device is already allocated.
5261
// This is to prevent issues where kubelet request to delete a pod and in the same time a new pod using the same
5362
// vf is started. we can have an issue where the cmdDel of the old pod is called AFTER the cmdAdd of the new one
@@ -56,7 +65,6 @@ func LoadConf(bytes []byte) (*sriovtypes.NetConf, error) {
5665
"func", "LoadConf",
5766
"DefaultCNIDir", DefaultCNIDir,
5867
"n.DeviceID", n.DeviceID)
59-
allocator := utils.NewPCIAllocator(DefaultCNIDir)
6068
isAllocated, err := allocator.IsAllocated(n.DeviceID)
6169
if err != nil {
6270
return n, err

pkg/config/config_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ import (
1313
)
1414

1515
var _ = Describe("Config", func() {
16+
BeforeEach(func() {
17+
DeferCleanup(func(x string) { DefaultCNIDir = x }, DefaultCNIDir)
18+
DefaultCNIDir = GinkgoT().TempDir()
19+
})
20+
1621
Context("Checking LoadConf function", func() {
1722
It("Assuming correct config file - existing DeviceID", func() {
1823
conf := []byte(`{
@@ -80,6 +85,7 @@ var _ = Describe("Config", func() {
8085
invalidProto := "802"
8186
DescribeTable("Vlan ID, QoS and Proto",
8287
func(vlanID *int, vlanQoS *int, vlanProto *string, failure bool) {
88+
8389
s := `{
8490
"name": "mynet",
8591
"type": "sriov",

pkg/utils/pci_allocator.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,18 @@ package utils
33
import (
44
"fmt"
55
"os"
6+
"path"
67
"path/filepath"
8+
"time"
79

810
"github.com/containernetworking/plugins/pkg/ns"
911
"github.com/k8snetworkplumbingwg/sriov-cni/pkg/logging"
12+
13+
"golang.org/x/sys/unix"
1014
)
1115

16+
const pciLockAcquireTimeout = 60 * time.Second
17+
1218
type PCIAllocation interface {
1319
SaveAllocatedPCI(string, string) error
1420
DeleteAllocatedPCI(string) error
@@ -25,6 +31,41 @@ func NewPCIAllocator(dataDir string) *PCIAllocator {
2531
return &PCIAllocator{dataDir: filepath.Join(dataDir, "pci")}
2632
}
2733

34+
// Lock gets an exclusive lock on the given PCI address, ensuring there is no other process configuring / or de-configuring the same device.
35+
func (p *PCIAllocator) Lock(pciAddress string) error {
36+
lockDir := path.Join(p.dataDir, "vf_lock")
37+
if err := os.MkdirAll(lockDir, 0600); err != nil {
38+
return fmt.Errorf("failed to create the sriov lock directory(%q): %v", lockDir, err)
39+
}
40+
41+
path := filepath.Join(lockDir, fmt.Sprintf("%s.lock", pciAddress))
42+
43+
// unix.O_CREAT - Create the file if it doesn't exist
44+
// unix.O_RDONLY - Open the file for read
45+
// unix.O_CLOEXEC - Automatically close the file on exit. This is useful to keep the flock until the process exits
46+
fd, err := unix.Open(path, unix.O_CREAT|unix.O_RDONLY|unix.O_CLOEXEC, 0600)
47+
if err != nil {
48+
return fmt.Errorf("failed to open PCI file [%s] for locking: %w", path, err)
49+
}
50+
51+
errCh := make(chan error)
52+
go func() {
53+
// unix.LOCK_EX - Exclusive lock
54+
errCh <- unix.Flock(fd, unix.LOCK_EX)
55+
}()
56+
57+
select {
58+
case err = <-errCh:
59+
if err != nil {
60+
return fmt.Errorf("failed to flock PCI file [%s]: %w", path, err)
61+
}
62+
return nil
63+
64+
case <-time.After(pciLockAcquireTimeout):
65+
return fmt.Errorf("time out while waiting to acquire exclusive lock on [%s]", path)
66+
}
67+
}
68+
2869
// SaveAllocatedPCI creates a file with the pci address as a name and the network namespace as the content
2970
// return error if the file was not created
3071
func (p *PCIAllocator) SaveAllocatedPCI(pciAddress, ns string) error {

0 commit comments

Comments
 (0)