Skip to content

Commit 77e3459

Browse files
rucoderrene
authored and committed
pillar: NOHYPE direct-attach network interface passthrough
Implement physical network interface passthrough for NOHYPE (containerd/runc) containers by moving the interface into the container's network namespace using 'ip link set <dev> netns <pid>', instead of PCI passthrough which requires a shim VM that NOHYPE containers don't have. Changes: - domainmgr: release adapter from vfio-pci at activation time for NOHYPE IoNetEth adapters using PCIReleaseGeneric (not the containerd no-op), wait for kernel driver rebind, rename interface if needed - domainmgr: re-bind adapter to vfio-pci on container release via PCIReserveGeneric so it's ready for the next user - domainmgr: extend kube-only NOHYPE skip to all NOHYPE in reserveAdapters - domainmgr: guard in updatePortAndPciBackIoMember to prevent rebinding adapters in active use (UsedByUUID != nilUUID) - domainmgr: fix publishAssignableAdapters overwrite (||= instead of =) - oci: detect IoNetEth adapters for NOHYPE containers and add OCI prestart/poststop hooks to move the interface into/out of the container netns - oci: grant CAP_NET_ADMIN and CAP_NET_RAW for direct-attach containers - Add direct-net.sh OCI hook script (follows veth.sh pattern) - Dockerfile: install direct-net.sh Signed-off-by: Mikhail Malyshev <mike.malyshev@gmail.com>
1 parent dc36bd4 commit 77e3459

File tree

4 files changed

+328
-16
lines changed

4 files changed

+328
-16
lines changed

pkg/pillar/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ COPY scripts/device-steps.sh \
183183
scripts/handlezedserverconfig.sh \
184184
scripts/veth.sh \
185185
scripts/dhcpcd.sh \
186+
scripts/direct-net.sh \
186187
scripts/copy-image-to-qcow.sh \
187188
scripts/check-eval-state.sh \
188189
/out/opt/zededa/bin/

pkg/pillar/cmd/domainmgr/domainmgr.go

Lines changed: 109 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,24 @@ import (
5656
"gopkg.in/yaml.v2"
5757
)
5858

59+
// waitForPCINetworkInterface polls sysfs for a network interface to appear
60+
// on the given PCI device after releasing it from vfio-pci. The kernel
61+
// driver needs a moment to rebind and register the netdev. Returns the
62+
// actual kernel interface name, which may differ from the boot-time name
63+
// stored in IoBundle.Ifname (the kernel's ethN counter only goes up, so
64+
// after unbind+rebind the interface often gets a new sequential name).
65+
func waitForPCINetworkInterface(pciAddr string, timeout time.Duration) (string, error) {
66+
deadline := time.Now().Add(timeout)
67+
for time.Now().Before(deadline) {
68+
if found, ifname := types.PciLongToIfname(log, pciAddr); found {
69+
log.Noticef("waitForPCINetworkInterface: PCI %s appeared as %s", pciAddr, ifname)
70+
return ifname, nil
71+
}
72+
time.Sleep(500 * time.Millisecond)
73+
}
74+
return "", fmt.Errorf("no network interface appeared for PCI %s within %v", pciAddr, timeout)
75+
}
76+
5977
const (
6078
agentName = "domainmgr"
6179
runDirname = "/run/" + agentName
@@ -1545,20 +1563,59 @@ func doAssignIoAdaptersToDomain(ctx *domainContext, config types.DomainConfig,
15451563
log.Functionf("Assigning %s (%s) to %s",
15461564
ib.Phylabel, ib.UsbAddr, status.DomainName)
15471565
assignmentsUsb = addNoDuplicate(assignmentsUsb, ib.UsbAddr)
1548-
} else if ib.PciLong != "" && !ib.IsPCIBack {
1549-
if !(ctx.hvTypeKube && config.VirtualizationMode == types.NOHYPER) || ib.Type != types.IoNetEth {
1550-
log.Functionf("Assigning %s (%s) to %s",
1551-
ib.Phylabel, ib.PciLong, status.DomainName)
1552-
assignmentsPci = addNoDuplicate(assignmentsPci, ib.PciLong)
1553-
ib.IsPCIBack = true
1554-
} else {
1555-
// For native container with ethernet IO passthrough, we use the NAD for the Multus
1556-
// for the container to directly access the ethernet port through network mechanism
1557-
log.Noticef("doAssignIoAdaptersToDomain: skip IO assign %v", ib)
1566+
} else if ib.PciLong != "" && config.VirtualizationMode == types.NOHYPER && ib.Type == types.IoNetEth {
1567+
// NOHYPE containers cannot do PCI passthrough (no shim VM).
1568+
// For kube we use Multus/NAD; for non-kube we move the
1569+
// physical interface into the container's network namespace
1570+
// via an OCI prestart hook (direct-net.sh).
1571+
//
1572+
// The adapter may already be bound to vfio-pci from the
1573+
// early pciback assignment in updatePortAndPciBackIoMember.
1574+
// Release it so the original network driver rebinds and
1575+
// the kernel interface (e.g. eth1) reappears.
1576+
if ib.IsPCIBack {
1577+
log.Noticef("doAssignIoAdaptersToDomain: releasing %s (%s) from pciback for NOHYPE IoNetEth",
1578+
ib.Phylabel, ib.PciLong)
1579+
// Use PCIReleaseGeneric directly instead of hyper.PCIRelease
1580+
// because the containerd hypervisor's PCIRelease is a no-op
1581+
// (just bookkeeping) — it does not actually unbind vfio-pci
1582+
// via sysfs. We need the real sysfs operations regardless
1583+
// of which hypervisor is active.
1584+
if err := hypervisor.PCIReleaseGeneric(ib.PciLong); err != nil {
1585+
log.Errorf("doAssignIoAdaptersToDomain: PCIRelease failed for %s: %v", ib.PciLong, err)
1586+
return fmt.Errorf("failed to release %s from pciback for NOHYPE: %v",
1587+
ib.Phylabel, err)
1588+
}
1589+
ib.IsPCIBack = false
1590+
publishAssignableAdapters = true
1591+
// Wait for the network driver to rebind and the
1592+
// interface to reappear after releasing from vfio-pci.
1593+
// Poll by PCI address rather than interface name because
1594+
// after unbind+rebind the kernel may assign a different
1595+
// name (e.g. eth2 instead of eth1).
1596+
actualIfname, err := waitForPCINetworkInterface(ib.PciLong, 10*time.Second)
1597+
if err != nil {
1598+
log.Warnf("doAssignIoAdaptersToDomain: interface for %s (%s) did not appear after PCIRelease: %v",
1599+
ib.Phylabel, ib.PciLong, err)
1600+
} else if actualIfname != ib.Ifname {
1601+
// The kernel assigned a different name after driver
1602+
// rebind. Rename it back to the expected name so the
1603+
// OCI prestart hook (direct-net.sh) can find it.
1604+
log.Noticef("doAssignIoAdaptersToDomain: interface for %s reappeared as %s, renaming to %s",
1605+
ib.PciLong, actualIfname, ib.Ifname)
1606+
types.IfRename(log, actualIfname, ib.Ifname)
1607+
}
15581608
}
1609+
log.Noticef("doAssignIoAdaptersToDomain: skip PCI assign for NOHYPE IoNetEth %s (%s)",
1610+
ib.Phylabel, ib.PciLong)
1611+
} else if ib.PciLong != "" && !ib.IsPCIBack {
1612+
log.Functionf("Assigning %s (%s) to %s",
1613+
ib.Phylabel, ib.PciLong, status.DomainName)
1614+
assignmentsPci = addNoDuplicate(assignmentsPci, ib.PciLong)
1615+
ib.IsPCIBack = true
15591616
}
15601617
}
1561-
publishAssignableAdapters = len(assignmentsUsb) > 0 || len(assignmentsPci) > 0
1618+
publishAssignableAdapters = publishAssignableAdapters || len(assignmentsUsb) > 0 || len(assignmentsPci) > 0
15621619
}
15631620

15641621
for i, long := range assignmentsPci {
@@ -2125,7 +2182,27 @@ func releaseAdapters(ctx *domainContext, ioAdapterList []types.IoAdapter,
21252182
if ib == nil {
21262183
continue
21272184
}
2128-
if ctx.hvTypeKube && status != nil && status.VirtualizationMode == types.NOHYPER && ib.Type == types.IoNetEth {
2185+
if status != nil && status.VirtualizationMode == types.NOHYPER && ib.Type == types.IoNetEth {
2186+
// For NOHYPE containers, the adapter was released from
2187+
// pciback at domain activation time and the interface was
2188+
// moved into the container netns. It returns to the root
2189+
// netns automatically when the container exits.
2190+
// Re-assign to pciback so it is ready for the next user
2191+
// (could be a VM or another container).
2192+
if ib.PciLong != "" && !ib.IsPCIBack {
2193+
log.Noticef("releaseAdapters: returning %s (%s) to pciback after NOHYPE use",
2194+
ib.Phylabel, ib.PciLong)
2195+
// Use PCIReserveGeneric directly (same reason as the
2196+
// PCIReleaseGeneric call in doAssignIoAdaptersToDomain:
2197+
// the containerd hypervisor's PCIReserve is a no-op).
2198+
if err := hypervisor.PCIReserveGeneric(ib.PciLong); err != nil {
2199+
log.Errorf("releaseAdapters: PCIReserve failed for %s: %v",
2200+
ib.PciLong, err)
2201+
} else {
2202+
ib.IsPCIBack = true
2203+
}
2204+
}
2205+
ib.UsedByUUID = nilUUID
21292206
continue
21302207
}
21312208
if ib.UsedByUUID != myUUID {
@@ -2314,8 +2391,13 @@ func reserveAdapters(ctx *domainContext, config types.DomainConfig) *types.Error
23142391
}
23152392
log.Functionf("reserveAdapters processing adapter %d %s phylabel %s",
23162393
adapter.Type, adapter.Name, ibp.Phylabel)
2317-
if ctx.hvTypeKube && config.VirtualizationMode == types.NOHYPER && ibp.Type == types.IoNetEth {
2318-
log.Noticef("reserveAdapters: ethernet io, skip reserve")
2394+
if config.VirtualizationMode == types.NOHYPER && ibp.Type == types.IoNetEth {
2395+
// For NOHYPE containers we do not need IO virtualization
2396+
// for ethernet adapters — we move the kernel interface
2397+
// into the container's network namespace instead.
2398+
// Still reserve (UsedByUUID) below, but skip PCI checks.
2399+
log.Noticef("reserveAdapters: NOHYPE ethernet io, skip PCI/IOV check for %s",
2400+
ibp.Phylabel)
23192401
continue
23202402
}
23212403
if ibp.AssignmentGroup == "" {
@@ -3434,7 +3516,19 @@ func updatePortAndPciBackIoMember(ctx *domainContext, ib *types.IoBundle, isPort
34343516
}
34353517

34363518
if !ib.KeepInHost && !ib.IsPCIBack {
3437-
if !ib.Error.Empty() {
3519+
if ib.UsedByUUID != nilUUID {
3520+
// Adapter is in active use by a domain — don't rebind to
3521+
// pciback. For NOHYPE containers the adapter was
3522+
// intentionally released from pciback so the kernel
3523+
// network interface can be moved into the container's
3524+
// network namespace. Without this guard,
3525+
// updatePortAndPciBackIoBundleAll (triggered by DNS or
3526+
// adapter-list changes) would immediately rebind the
3527+
// device to vfio-pci, destroying the interface before
3528+
// the OCI prestart hook can move it.
3529+
log.Functionf("Not assigning %s (%s) to pciback — in use by %s",
3530+
ib.Phylabel, ib.PciLong, ib.UsedByUUID)
3531+
} else if !ib.Error.Empty() {
34383532
log.Warningf("Not assigning %s (%s) to pciback due to error: %s at %s",
34393533
ib.Phylabel, ib.PciLong, ib.Error.String(), ib.Error.ErrorTime())
34403534
} else if ctx.deviceNetworkStatus.Testing && ib.Type.IsNet() {

pkg/pillar/containerd/oci.go

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2020 Zededa, Inc.
1+
// Copyright (c) 2020-2026 Zededa, Inc.
22
// SPDX-License-Identifier: Apache-2.0
33
//
44
// This is basically re-implementation of:
@@ -39,6 +39,8 @@ var vethScript = []string{"eve", "exec", "pillar", "/opt/zededa/bin/veth.sh"}
3939

4040
var dhcpcdScript = []string{"eve", "exec", "pillar", "/opt/zededa/bin/dhcpcd.sh"}
4141

42+
// directNetScript is the argument prefix used to invoke direct-net.sh;
// the OCI hooks built from it append the action ("up" or "down") and the
// host and guest interface names.
var directNetScript = []string{"eve", "exec", "pillar", "/opt/zededa/bin/direct-net.sh"}
43+
4244
// ociSpec is kept private (with all the actions done by getters and setters
4345
// This is because we expect the implementation to still evolve quite a bit
4446
// for all the different task usecases
@@ -321,10 +323,20 @@ func (s *ociSpec) AdjustMemLimit(dom types.DomainConfig, addMemory int64) {
321323
}
322324
}
323325

326+
// directNetIf pairs the two names of a direct-attach network adapter that
// is to be moved into the container's network namespace: the name it has on
// the host and the name it should carry inside the container.
type directNetIf struct {
	// hostIfname is the kernel interface name on the host (e.g. "eth1").
	hostIfname string
	// guestIfname is the interface name inside the container
	// (e.g. "ethercat").
	guestIfname string
}
332+
324333
func (s *ociSpec) UpdateWithIoBundles(config *types.DomainConfig, aa *types.AssignableAdapters, domainID int) error {
325334
// Process I/O adapters
326335
devList := []string{}
327336
cdiList := []string{}
337+
var directNets []directNetIf
338+
isNOHYPE := config.VirtualizationMode == types.NOHYPER
339+
328340
for _, adapter := range config.IoAdapterList {
329341
logrus.Debugf("processing adapter %d %s\n", adapter.Type, adapter.Name)
330342
list := aa.LookupIoBundleAny(adapter.Name)
@@ -365,6 +377,23 @@ func (s *ociSpec) UpdateWithIoBundles(config *types.DomainConfig, aa *types.Assi
365377
logrus.Infof("Adding generic device %s\n", ib.Ifname)
366378
devList = append(devList, ib.Ifname)
367379
}
380+
381+
// Direct-attach network interfaces for NOHYPE containers.
382+
// Instead of PCI passthrough (which requires a shim VM),
383+
// we move the physical interface into the container's
384+
// network namespace via an OCI prestart hook.
385+
if isNOHYPE && ib.Type == types.IoNetEth && ib.Ifname != "" {
386+
guestName := ib.Ifname
387+
if ib.Logicallabel != "" {
388+
guestName = ib.Logicallabel
389+
}
390+
logrus.Infof("Adding direct-attach net %s (guest: %s) for NOHYPE container\n",
391+
ib.Ifname, guestName)
392+
directNets = append(directNets, directNetIf{
393+
hostIfname: ib.Ifname,
394+
guestIfname: guestName,
395+
})
396+
}
368397
}
369398
}
370399

@@ -403,6 +432,42 @@ func (s *ociSpec) UpdateWithIoBundles(config *types.DomainConfig, aa *types.Assi
403432
}
404433
s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, *cgrDev)
405434
}
435+
436+
// Add OCI hooks for direct-attach network interfaces (NOHYPE only).
437+
// The prestart hook moves the physical interface into the container's
438+
// network namespace; the poststop hook moves it back on teardown.
439+
if len(directNets) > 0 {
440+
if s.Hooks == nil {
441+
s.Hooks = &specs.Hooks{}
442+
}
443+
timeout := 60
444+
for _, dn := range directNets {
445+
s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
446+
Path: eveScript,
447+
Args: append(directNetScript, "up", dn.hostIfname, dn.guestIfname),
448+
Timeout: &timeout,
449+
})
450+
s.Hooks.Poststop = append(s.Hooks.Poststop, specs.Hook{
451+
Path: eveScript,
452+
Args: append(directNetScript, "down", dn.hostIfname, dn.guestIfname),
453+
Timeout: &timeout,
454+
})
455+
}
456+
457+
// Grant CAP_NET_ADMIN so the container can configure the
458+
// direct-attach interface (ip addr/route, ethtool, etc.)
459+
// and CAP_NET_RAW for raw socket access (required by protocols
460+
// like EtherCAT that operate directly on L2 frames).
461+
if s.Process != nil && s.Process.Capabilities != nil {
462+
for _, cap := range []string{"CAP_NET_ADMIN", "CAP_NET_RAW"} {
463+
s.Process.Capabilities.Bounding = appendCapIfMissing(s.Process.Capabilities.Bounding, cap)
464+
s.Process.Capabilities.Effective = appendCapIfMissing(s.Process.Capabilities.Effective, cap)
465+
s.Process.Capabilities.Permitted = appendCapIfMissing(s.Process.Capabilities.Permitted, cap)
466+
s.Process.Capabilities.Ambient = appendCapIfMissing(s.Process.Capabilities.Ambient, cap)
467+
}
468+
}
469+
}
470+
406471
return nil
407472
}
408473

@@ -483,6 +548,16 @@ func (s *ociSpec) UpdateFromDomain(dom *types.DomainConfig, status *types.Domain
483548
s.Annotations[EVEOCIVNCPasswordLabel] = dom.VncPasswd
484549
}
485550

551+
// appendCapIfMissing returns caps with capName added at the end, unless
// capName is already one of its elements, in which case caps is returned
// unchanged.
func appendCapIfMissing(caps []string, capName string) []string {
	present := false
	for i := 0; i < len(caps) && !present; i++ {
		present = caps[i] == capName
	}
	if present {
		return caps
	}
	return append(caps, capName)
}
560+
486561
// UpdateFromVolume updates values in the OCI spec based on the location
487562
// of an EVE volume. EVE volume's are expected to be structured as directories
488563
// in the filesystem with either config.json containing the full OCI runtime

0 commit comments

Comments
 (0)