Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 136 additions & 4 deletions pkg/pillar/hypervisor/hypervisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,132 @@ func logError(format string, a ...interface{}) error {
return fmt.Errorf(format, a...)
}

// iommuGroupContext holds sysfs paths for IOMMU group operations.
// Production code uses defaultIOMMUGroupCtx; tests create custom instances
// with temporary directories.
type iommuGroupContext struct {
pciDevicesDir string
iommuGroupsDir string
vfioDriverDir string
driversProbe string
}

var defaultIOMMUGroupCtx = iommuGroupContext{
pciDevicesDir: sysfsPciDevices,
iommuGroupsDir: "/sys/kernel/iommu_groups",
vfioDriverDir: vfioDriverPath,
driversProbe: sysfsPciDriversProbe,
}

// getIOMMUGroup returns the IOMMU group number for a PCI device by reading
// the iommu_group symlink in sysfs.
func (ctx *iommuGroupContext) getIOMMUGroup(long string) (string, error) {
iommuGroupLink := filepath.Join(ctx.pciDevicesDir, long, "iommu_group")
iommuPath, err := os.Readlink(iommuGroupLink)
if err != nil {
return "", fmt.Errorf("can't determine iommu group for %s (%v)", long, err)
}
return filepath.Base(iommuPath), nil
}

// getMembers returns all PCI device addresses in the same IOMMU group
// by reading /sys/kernel/iommu_groups/<group>/devices/
func (ctx *iommuGroupContext) getMembers(long string) ([]string, error) {
group, err := ctx.getIOMMUGroup(long)
if err != nil {
return nil, err
}
devicesPath := filepath.Join(ctx.iommuGroupsDir, group, "devices")
entries, err := os.ReadDir(devicesPath)
if err != nil {
return nil, fmt.Errorf("cannot read IOMMU group %s devices: %v", group, err)
}
var members []string
for _, entry := range entries {
members = append(members, entry.Name())
}
return members, nil
}

// isBoundToVfioPci returns true if the device is currently bound to the vfio-pci driver.
func (ctx *iommuGroupContext) isBoundToVfioPci(long string) bool {
driverPath := filepath.Join(ctx.pciDevicesDir, long, "driver")
driverPathInfo, driverPathErr := os.Stat(driverPath)
vfioDriverPathInfo, vfioDriverPathErr := os.Stat(ctx.vfioDriverDir)
return driverPathErr == nil && vfioDriverPathErr == nil &&
os.SameFile(driverPathInfo, vfioDriverPathInfo)
}

// unbindSiblings unbinds kernel drivers from all devices in the same
// IOMMU group as the given device. This is necessary because VFIO requires
// exclusive DMA ownership of the entire IOMMU group — if any sibling device
// has a kernel driver bound, it claims DMA ownership via
// iommu_device_use_default_domain() and the group becomes non-viable.
// Sibling devices are only unbound (not bound to vfio-pci) since they are
// not the passthrough target.
func (ctx *iommuGroupContext) unbindSiblings(long string) {
members, err := ctx.getMembers(long)
if err != nil {
logrus.Warnf("unbindIOMMUGroupSiblings: cannot get IOMMU group members for %s: %v", long, err)
return
}
for _, member := range members {
if member == long {
continue
}
if ctx.isBoundToVfioPci(member) {
continue
}
unbindFile := filepath.Join(ctx.pciDevicesDir, member, "driver/unbind")
if _, err := os.Stat(unbindFile); err != nil {
// No driver bound, nothing to do
continue
}
logrus.Infof("unbindIOMMUGroupSiblings: unbinding driver from IOMMU group sibling %s (sibling of %s)", member, long)
if err := os.WriteFile(unbindFile, []byte(member), 0644); err != nil {
logrus.Warnf("unbindIOMMUGroupSiblings: failed to unbind driver from %s: %v", member, err)
}
}
}

// reprobeSiblings re-probes sibling devices in the IOMMU group
// so their original kernel drivers can rebind after VFIO release.
func (ctx *iommuGroupContext) reprobeSiblings(long string) {
members, err := ctx.getMembers(long)
if err != nil {
logrus.Warnf("reprobeIOMMUGroupSiblings: cannot get IOMMU group members for %s: %v", long, err)
return
}
for _, member := range members {
if member == long {
continue
}
driverPath := filepath.Join(ctx.pciDevicesDir, member, "driver")
if _, err := os.Stat(driverPath); err == nil {
// Already has a driver bound, skip
continue
}
logrus.Infof("reprobeIOMMUGroupSiblings: re-probing IOMMU group sibling %s (sibling of %s)", member, long)
if err := os.WriteFile(ctx.driversProbe, []byte(member), 0644); err != nil {
logrus.Warnf("reprobeIOMMUGroupSiblings: failed to re-probe %s: %v", member, err)
}
}
}

// Public wrapper functions using default sysfs paths.

func isBoundToVfioPci(long string) bool {
return defaultIOMMUGroupCtx.isBoundToVfioPci(long)
}

func unbindIOMMUGroupSiblings(long string) {
defaultIOMMUGroupCtx.unbindSiblings(long)
}

func reprobeIOMMUGroupSiblings(long string) {
defaultIOMMUGroupCtx.reprobeSiblings(long)
}

// PCIReserveGeneric : Common Reserve function used by both EVE kvm and 'k'
func PCIReserveGeneric(long string) error {
logrus.Infof("PCIReserve long addr is %s", long)
Expand All @@ -168,14 +294,16 @@ func PCIReserveGeneric(long string) error {
unbindFile := filepath.Join(driverPath, "unbind")

//Check if already bound to vfio-pci
driverPathInfo, driverPathErr := os.Stat(driverPath)
vfioDriverPathInfo, vfioDriverPathErr := os.Stat(vfioDriverPath)
if driverPathErr == nil && vfioDriverPathErr == nil &&
os.SameFile(driverPathInfo, vfioDriverPathInfo) {
if isBoundToVfioPci(long) {
logrus.Infof("Driver for %s is already bound to vfio-pci, skipping unbind", long)
return nil
}

// Unbind kernel drivers from all sibling devices in the IOMMU group.
// VFIO requires exclusive DMA ownership of the entire group; any sibling
// with a kernel driver makes the group non-viable for passthrough.
unbindIOMMUGroupSiblings(long)

//map vfio-pci as the driver_override for the device
if err := os.WriteFile(overrideFile, []byte("vfio-pci"), 0644); err != nil {
return logError("driver_override failure for PCI device %s: %v",
Expand Down Expand Up @@ -226,6 +354,10 @@ func PCIReleaseGeneric(long string) error {
long, err)
}

// Re-probe IOMMU group siblings so their original drivers can rebind.
// These were unbound during PCIReserveGeneric to make the group viable.
reprobeIOMMUGroupSiblings(long)

return nil
}

Expand Down
215 changes: 215 additions & 0 deletions pkg/pillar/hypervisor/iommu_group_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
// Copyright (c) 2026 Zededa, Inc.
// SPDX-License-Identifier: Apache-2.0

package hypervisor

import (
"os"
"path/filepath"
"sort"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// setupFakeIOMMUSysfs creates a temporary sysfs tree for IOMMU group tests
// and returns an iommuGroupContext pointing at it.
func setupFakeIOMMUSysfs(t *testing.T) (iommuGroupContext, string) {
t.Helper()
tmpDir, err := os.MkdirTemp("", "iommu-test-*")
require.NoError(t, err)
t.Cleanup(func() { os.RemoveAll(tmpDir) })

pciDevicesDir := filepath.Join(tmpDir, "bus/pci/devices")
iommuGroupsDir := filepath.Join(tmpDir, "kernel/iommu_groups")
vfioDriverDir := filepath.Join(tmpDir, "bus/pci/drivers/vfio-pci")
driversProbe := filepath.Join(tmpDir, "bus/pci/drivers_probe")

require.NoError(t, os.MkdirAll(pciDevicesDir, 0755))
require.NoError(t, os.MkdirAll(iommuGroupsDir, 0755))
require.NoError(t, os.MkdirAll(vfioDriverDir, 0755))
require.NoError(t, os.WriteFile(driversProbe, nil, 0644))

ctx := iommuGroupContext{
pciDevicesDir: pciDevicesDir,
iommuGroupsDir: iommuGroupsDir,
vfioDriverDir: vfioDriverDir,
driversProbe: driversProbe,
}
return ctx, tmpDir
}

// addDeviceToGroup creates a fake PCI device directory with an iommu_group
// symlink and registers it in the group's devices directory.
func addDeviceToGroup(t *testing.T, ctx *iommuGroupContext, addr, group string) {
t.Helper()
devDir := filepath.Join(ctx.pciDevicesDir, addr)
require.NoError(t, os.MkdirAll(devDir, 0755))

groupDevicesDir := filepath.Join(ctx.iommuGroupsDir, group, "devices")
require.NoError(t, os.MkdirAll(groupDevicesDir, 0755))

// iommu_group symlink — os.Readlink returns the target, filepath.Base extracts group number
iommuGroupTarget := filepath.Join(ctx.iommuGroupsDir, group)
require.NoError(t, os.Symlink(iommuGroupTarget, filepath.Join(devDir, "iommu_group")))

// Register device in group's devices directory
require.NoError(t, os.WriteFile(filepath.Join(groupDevicesDir, addr), nil, 0644))
}

// bindToKernelDriver simulates a kernel driver binding by creating a driver
// directory with an unbind file and symlinking device/driver to it.
func bindToKernelDriver(t *testing.T, ctx *iommuGroupContext, tmpDir, addr, driverName string) {
t.Helper()
driverDir := filepath.Join(tmpDir, "bus/pci/drivers", driverName)
require.NoError(t, os.MkdirAll(driverDir, 0755))

unbindFile := filepath.Join(driverDir, "unbind")
if _, err := os.Stat(unbindFile); err != nil {
require.NoError(t, os.WriteFile(unbindFile, nil, 0644))
}
require.NoError(t, os.Symlink(driverDir, filepath.Join(ctx.pciDevicesDir, addr, "driver")))
}

// bindToVfioPci simulates vfio-pci driver binding.
func bindToVfioPci(t *testing.T, ctx *iommuGroupContext, addr string) {
t.Helper()
require.NoError(t, os.Symlink(ctx.vfioDriverDir, filepath.Join(ctx.pciDevicesDir, addr, "driver")))
}

func TestGetIOMMUGroup(t *testing.T) {
ctx, _ := setupFakeIOMMUSysfs(t)
addDeviceToGroup(t, &ctx, "0000:80:1f.6", "19")

group, err := ctx.getIOMMUGroup("0000:80:1f.6")
require.NoError(t, err)
assert.Equal(t, "19", group)
}

func TestGetIOMMUGroupNoSymlink(t *testing.T) {
ctx, _ := setupFakeIOMMUSysfs(t)
// Device exists but has no iommu_group symlink
require.NoError(t, os.MkdirAll(filepath.Join(ctx.pciDevicesDir, "0000:00:01.0"), 0755))

_, err := ctx.getIOMMUGroup("0000:00:01.0")
assert.Error(t, err)
}

func TestGetMembers(t *testing.T) {
ctx, _ := setupFakeIOMMUSysfs(t)
addDeviceToGroup(t, &ctx, "0000:80:1f.0", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.4", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.6", "19")

members, err := ctx.getMembers("0000:80:1f.6")
require.NoError(t, err)
sort.Strings(members)
assert.Equal(t, []string{"0000:80:1f.0", "0000:80:1f.4", "0000:80:1f.6"}, members)
}

func TestGetMembersSingleDevice(t *testing.T) {
ctx, _ := setupFakeIOMMUSysfs(t)
addDeviceToGroup(t, &ctx, "0000:01:00.0", "5")

members, err := ctx.getMembers("0000:01:00.0")
require.NoError(t, err)
assert.Equal(t, []string{"0000:01:00.0"}, members)
}

func TestIsBoundToVfioPci(t *testing.T) {
ctx, tmpDir := setupFakeIOMMUSysfs(t)
addDeviceToGroup(t, &ctx, "0000:80:1f.6", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.4", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.0", "19")

// Bound to vfio-pci
bindToVfioPci(t, &ctx, "0000:80:1f.6")
assert.True(t, ctx.isBoundToVfioPci("0000:80:1f.6"))

// Bound to a kernel driver
bindToKernelDriver(t, &ctx, tmpDir, "0000:80:1f.4", "i801_smbus")
assert.False(t, ctx.isBoundToVfioPci("0000:80:1f.4"))

// Not bound to any driver
assert.False(t, ctx.isBoundToVfioPci("0000:80:1f.0"))
}

func TestUnbindSiblings(t *testing.T) {
ctx, tmpDir := setupFakeIOMMUSysfs(t)
addDeviceToGroup(t, &ctx, "0000:80:1f.0", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.4", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.5", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.6", "19")

// Target device bound to vfio-pci (should be skipped)
bindToVfioPci(t, &ctx, "0000:80:1f.6")
// Kernel driver siblings (should be unbound)
bindToKernelDriver(t, &ctx, tmpDir, "0000:80:1f.0", "lpc_ich")
bindToKernelDriver(t, &ctx, tmpDir, "0000:80:1f.4", "i801_smbus")
// 80:1f.5 has no driver (should be skipped)

ctx.unbindSiblings("0000:80:1f.6")

// Verify unbind was written for kernel driver siblings
lpcUnbind := filepath.Join(tmpDir, "bus/pci/drivers/lpc_ich/unbind")
content, err := os.ReadFile(lpcUnbind)
require.NoError(t, err)
assert.Equal(t, "0000:80:1f.0", string(content))

i801Unbind := filepath.Join(tmpDir, "bus/pci/drivers/i801_smbus/unbind")
content, err = os.ReadFile(i801Unbind)
require.NoError(t, err)
assert.Equal(t, "0000:80:1f.4", string(content))
}

func TestUnbindSiblingsSkipsVfioPci(t *testing.T) {
ctx, _ := setupFakeIOMMUSysfs(t)
addDeviceToGroup(t, &ctx, "0000:80:1f.4", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.6", "19")

// Both bound to vfio-pci — no unbind should happen
bindToVfioPci(t, &ctx, "0000:80:1f.6")
bindToVfioPci(t, &ctx, "0000:80:1f.4")

// Should not panic or error
ctx.unbindSiblings("0000:80:1f.6")
}

func TestReprobeSiblings(t *testing.T) {
ctx, tmpDir := setupFakeIOMMUSysfs(t)
addDeviceToGroup(t, &ctx, "0000:80:1f.0", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.4", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.6", "19")

// 80:1f.6 is the target (skip)
// 80:1f.0 already has a driver rebound (skip)
someDriverDir := filepath.Join(tmpDir, "bus/pci/drivers/some_driver")
require.NoError(t, os.MkdirAll(someDriverDir, 0755))
require.NoError(t, os.Symlink(someDriverDir, filepath.Join(ctx.pciDevicesDir, "0000:80:1f.0", "driver")))
// 80:1f.4 has no driver (should be re-probed)

ctx.reprobeSiblings("0000:80:1f.6")

// Verify drivers_probe was written with the unbound device address
content, err := os.ReadFile(ctx.driversProbe)
require.NoError(t, err)
assert.Equal(t, "0000:80:1f.4", string(content))
}

func TestReprobeSiblingsAllBound(t *testing.T) {
ctx, tmpDir := setupFakeIOMMUSysfs(t)
addDeviceToGroup(t, &ctx, "0000:80:1f.4", "19")
addDeviceToGroup(t, &ctx, "0000:80:1f.6", "19")

// Both have drivers — nothing to reprobe
bindToKernelDriver(t, &ctx, tmpDir, "0000:80:1f.4", "i801_smbus")
bindToVfioPci(t, &ctx, "0000:80:1f.6")

ctx.reprobeSiblings("0000:80:1f.6")

// drivers_probe should remain empty
content, err := os.ReadFile(ctx.driversProbe)
require.NoError(t, err)
assert.Empty(t, content)
}
Loading