@@ -56,6 +56,24 @@ import (
5656 "gopkg.in/yaml.v2"
5757)
5858
59+ // waitForPCINetworkInterface polls sysfs for a network interface to appear
60+ // on the given PCI device after releasing it from vfio-pci. The kernel
61+ // driver needs a moment to rebind and register the netdev. Returns the
62+ // actual kernel interface name, which may differ from the boot-time name
63+ // stored in IoBundle.Ifname (the kernel's ethN counter only goes up, so
64+ // after unbind+rebind the interface often gets a new sequential name).
65+ func waitForPCINetworkInterface (pciAddr string , timeout time.Duration ) (string , error ) {
66+ deadline := time .Now ().Add (timeout )
67+ for time .Now ().Before (deadline ) {
68+ if found , ifname := types .PciLongToIfname (log , pciAddr ); found {
69+ log .Noticef ("waitForPCINetworkInterface: PCI %s appeared as %s" , pciAddr , ifname )
70+ return ifname , nil
71+ }
72+ time .Sleep (500 * time .Millisecond )
73+ }
74+ return "" , fmt .Errorf ("no network interface appeared for PCI %s within %v" , pciAddr , timeout )
75+ }
76+
5977const (
6078 agentName = "domainmgr"
6179 runDirname = "/run/" + agentName
@@ -1545,20 +1563,59 @@ func doAssignIoAdaptersToDomain(ctx *domainContext, config types.DomainConfig,
15451563 log .Functionf ("Assigning %s (%s) to %s" ,
15461564 ib .Phylabel , ib .UsbAddr , status .DomainName )
15471565 assignmentsUsb = addNoDuplicate (assignmentsUsb , ib .UsbAddr )
1548- } else if ib .PciLong != "" && ! ib .IsPCIBack {
1549- if ! (ctx .hvTypeKube && config .VirtualizationMode == types .NOHYPER ) || ib .Type != types .IoNetEth {
1550- log .Functionf ("Assigning %s (%s) to %s" ,
1551- ib .Phylabel , ib .PciLong , status .DomainName )
1552- assignmentsPci = addNoDuplicate (assignmentsPci , ib .PciLong )
1553- ib .IsPCIBack = true
1554- } else {
1555- // For native container with ethernet IO passthrough, we use the NAD for the Multus
1556- // for the container to directly access the ethernet port through network mechanism
1557- log .Noticef ("doAssignIoAdaptersToDomain: skip IO assign %v" , ib )
1566+ } else if ib .PciLong != "" && config .VirtualizationMode == types .NOHYPER && ib .Type == types .IoNetEth {
1567+ // NOHYPE containers cannot do PCI passthrough (no shim VM).
1568+ // For kube we use Multus/NAD; for non-kube we move the
1569+ // physical interface into the container's network namespace
1570+ // via an OCI prestart hook (direct-net.sh).
1571+ //
1572+ // The adapter may already be bound to vfio-pci from the
1573+ // early pciback assignment in updatePortAndPciBackIoMember.
1574+ // Release it so the original network driver rebinds and
1575+ // the kernel interface (e.g. eth1) reappears.
1576+ if ib .IsPCIBack {
1577+ log .Noticef ("doAssignIoAdaptersToDomain: releasing %s (%s) from pciback for NOHYPE IoNetEth" ,
1578+ ib .Phylabel , ib .PciLong )
1579+ // Use PCIReleaseGeneric directly instead of hyper.PCIRelease
1580+ // because the containerd hypervisor's PCIRelease is a no-op
1581+ // (just bookkeeping) — it does not actually unbind vfio-pci
1582+ // via sysfs. We need the real sysfs operations regardless
1583+ // of which hypervisor is active.
1584+ if err := hypervisor .PCIReleaseGeneric (ib .PciLong ); err != nil {
1585+ log .Errorf ("doAssignIoAdaptersToDomain: PCIRelease failed for %s: %v" , ib .PciLong , err )
1586+ return fmt .Errorf ("failed to release %s from pciback for NOHYPE: %v" ,
1587+ ib .Phylabel , err )
1588+ }
1589+ ib .IsPCIBack = false
1590+ publishAssignableAdapters = true
1591+ // Wait for the network driver to rebind and the
1592+ // interface to reappear after releasing from vfio-pci.
1593+ // Poll by PCI address rather than interface name because
1594+ // after unbind+rebind the kernel may assign a different
1595+ // name (e.g. eth2 instead of eth1).
1596+ actualIfname , err := waitForPCINetworkInterface (ib .PciLong , 10 * time .Second )
1597+ if err != nil {
1598+ log .Warnf ("doAssignIoAdaptersToDomain: interface for %s (%s) did not appear after PCIRelease: %v" ,
1599+ ib .Phylabel , ib .PciLong , err )
1600+ } else if actualIfname != ib .Ifname {
1601+ // The kernel assigned a different name after driver
1602+ // rebind. Rename it back to the expected name so the
1603+ // OCI prestart hook (direct-net.sh) can find it.
1604+ log .Noticef ("doAssignIoAdaptersToDomain: interface for %s reappeared as %s, renaming to %s" ,
1605+ ib .PciLong , actualIfname , ib .Ifname )
1606+ types .IfRename (log , actualIfname , ib .Ifname )
1607+ }
15581608 }
1609+ log .Noticef ("doAssignIoAdaptersToDomain: skip PCI assign for NOHYPE IoNetEth %s (%s)" ,
1610+ ib .Phylabel , ib .PciLong )
1611+ } else if ib .PciLong != "" && ! ib .IsPCIBack {
1612+ log .Functionf ("Assigning %s (%s) to %s" ,
1613+ ib .Phylabel , ib .PciLong , status .DomainName )
1614+ assignmentsPci = addNoDuplicate (assignmentsPci , ib .PciLong )
1615+ ib .IsPCIBack = true
15591616 }
15601617 }
1561- publishAssignableAdapters = len (assignmentsUsb ) > 0 || len (assignmentsPci ) > 0
1618+ publishAssignableAdapters = publishAssignableAdapters || len (assignmentsUsb ) > 0 || len (assignmentsPci ) > 0
15621619 }
15631620
15641621 for i , long := range assignmentsPci {
@@ -2125,7 +2182,27 @@ func releaseAdapters(ctx *domainContext, ioAdapterList []types.IoAdapter,
21252182 if ib == nil {
21262183 continue
21272184 }
2128- if ctx .hvTypeKube && status != nil && status .VirtualizationMode == types .NOHYPER && ib .Type == types .IoNetEth {
2185+ if status != nil && status .VirtualizationMode == types .NOHYPER && ib .Type == types .IoNetEth {
2186+ // For NOHYPE containers, the adapter was released from
2187+ // pciback at domain activation time and the interface was
2188+ // moved into the container netns. It returns to the root
2189+ // netns automatically when the container exits.
2190+ // Re-assign to pciback so it is ready for the next user
2191+ // (could be a VM or another container).
2192+ if ib .PciLong != "" && ! ib .IsPCIBack {
2193+ log .Noticef ("releaseAdapters: returning %s (%s) to pciback after NOHYPE use" ,
2194+ ib .Phylabel , ib .PciLong )
2195+ // Use PCIReserveGeneric directly (same reason as the
2196+ // PCIReleaseGeneric call in doAssignIoAdaptersToDomain:
2197+ // the containerd hypervisor's PCIReserve is a no-op).
2198+ if err := hypervisor .PCIReserveGeneric (ib .PciLong ); err != nil {
2199+ log .Errorf ("releaseAdapters: PCIReserve failed for %s: %v" ,
2200+ ib .PciLong , err )
2201+ } else {
2202+ ib .IsPCIBack = true
2203+ }
2204+ }
2205+ ib .UsedByUUID = nilUUID
21292206 continue
21302207 }
21312208 if ib .UsedByUUID != myUUID {
@@ -2314,8 +2391,13 @@ func reserveAdapters(ctx *domainContext, config types.DomainConfig) *types.Error
23142391 }
23152392 log .Functionf ("reserveAdapters processing adapter %d %s phylabel %s" ,
23162393 adapter .Type , adapter .Name , ibp .Phylabel )
2317- if ctx .hvTypeKube && config .VirtualizationMode == types .NOHYPER && ibp .Type == types .IoNetEth {
2318- log .Noticef ("reserveAdapters: ethernet io, skip reserve" )
2394+ if config .VirtualizationMode == types .NOHYPER && ibp .Type == types .IoNetEth {
2395+ // For NOHYPE containers we do not need IO virtualization
2396+ // for ethernet adapters — we move the kernel interface
2397+ // into the container's network namespace instead.
2398+ // Still reserve (UsedByUUID) below, but skip PCI checks.
2399+ log .Noticef ("reserveAdapters: NOHYPE ethernet io, skip PCI/IOV check for %s" ,
2400+ ibp .Phylabel )
23192401 continue
23202402 }
23212403 if ibp .AssignmentGroup == "" {
@@ -3434,7 +3516,19 @@ func updatePortAndPciBackIoMember(ctx *domainContext, ib *types.IoBundle, isPort
34343516 }
34353517
34363518 if ! ib .KeepInHost && ! ib .IsPCIBack {
3437- if ! ib .Error .Empty () {
3519+ if ib .UsedByUUID != nilUUID {
3520+ // Adapter is in active use by a domain — don't rebind to
3521+ // pciback. For NOHYPE containers the adapter was
3522+ // intentionally released from pciback so the kernel
3523+ // network interface can be moved into the container's
3524+ // network namespace. Without this guard,
3525+ // updatePortAndPciBackIoBundleAll (triggered by DNS or
3526+ // adapter-list changes) would immediately rebind the
3527+ // device to vfio-pci, destroying the interface before
3528+ // the OCI prestart hook can move it.
3529+ log .Functionf ("Not assigning %s (%s) to pciback — in use by %s" ,
3530+ ib .Phylabel , ib .PciLong , ib .UsedByUUID )
3531+ } else if ! ib .Error .Empty () {
34383532 log .Warningf ("Not assigning %s (%s) to pciback due to error: %s at %s" ,
34393533 ib .Phylabel , ib .PciLong , ib .Error .String (), ib .Error .ErrorTime ())
34403534 } else if ctx .deviceNetworkStatus .Testing && ib .Type .IsNet () {
0 commit comments