Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ pub struct PhysicalEndpointState {
pub vendor_id: String,
pub device_id: String,
pub hard_addr: String,
pub is_vf: bool,
pub iface_name: String,
/// "pci" or "vmbus"
pub bus_type: String,
}

#[derive(Serialize, Deserialize, Clone, Default)]
Expand Down
197 changes: 146 additions & 51 deletions src/runtime-rs/crates/resource/src/network/endpoint/physical_endpoint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,23 @@
// SPDX-License-Identifier: Apache-2.0
//

use std::io::{self, Error};
use std::path::Path;
use std::sync::Arc;

use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use hypervisor::device::device_manager::{do_handle_device, DeviceManager};
use hypervisor::device::DeviceConfig;
use hypervisor::{device::driver, Hypervisor};
use hypervisor::device::driver::NetworkConfig;
use hypervisor::device::{DeviceConfig, DeviceType};
use hypervisor::{device::driver, Hypervisor, NetworkDevice};
use hypervisor::{get_vfio_device, VfioConfig};
use tokio::sync::RwLock;

use super::endpoint_persist::{EndpointState, PhysicalEndpointState};
use super::Endpoint;
use crate::network::network_pair::NetworkPair;
use crate::network::utils::{self, link};
pub const SYS_PCI_DEVICES_PATH: &str = "/sys/bus/pci/devices";

#[derive(Debug)]
pub struct VendorDevice {
Expand Down Expand Up @@ -53,29 +55,68 @@ pub struct PhysicalEndpoint {
bdf: String,
driver: String,
vendor_device_id: VendorDevice,
is_vf: bool,
bus_type: link::BusType,
net_pair: NetworkPair,
d: Arc<RwLock<DeviceManager>>,
}

impl PhysicalEndpoint {
pub fn new(name: &str, hardware_addr: &[u8], d: Arc<RwLock<DeviceManager>>) -> Result<Self> {
let driver_info = link::get_driver_info(name).context("get driver info")?;
let bdf = driver_info.bus_info;
let sys_pci_devices_path = Path::new(SYS_PCI_DEVICES_PATH);
// get driver by following symlink /sys/bus/pci/devices/$bdf/driver
let driver_path = sys_pci_devices_path.join(&bdf).join("driver");
let link = driver_path.read_link().context("read link")?;
let driver = link
/// Create a new PhysicalEndpoint.
///
/// For VF (SR-IOV Virtual Function) devices, the NIC will be passed through
/// to the VM via VFIO — no tap/bridge pair is needed, so we create a minimal
/// stub NetworkPair.
///
/// For non-VF physical NICs, we need a real tap+bridge pair (like veth
/// endpoints use) so that traffic can flow between the host NIC and the VM
/// through the hypervisor's TAP backend. The `handle`, `idx`, `model`, and
/// `queues` parameters are required to create this pair.
pub async fn new(
handle: &rtnetlink::Handle,
name: &str,
hardware_addr: &[u8],
idx: u32,
model: &str,
queues: usize,
d: Arc<RwLock<DeviceManager>>,
) -> Result<Self> {
// Determine bus type (PCI or VMBus) and resolve the sysfs device path.
// For PCI: uses ethtool to get BDF, path = /sys/bus/pci/devices/<bdf>
// For VMBus: resolves device symlink, path = /sys/bus/vmbus/devices/<guid>
let (sys_iface_device_path, bdf, bus_type) =
link::get_iface_device_path(name).context("get iface device path")?;
let sys_device_path = Path::new(&sys_iface_device_path);

// Get driver by following symlink <device_path>/driver
let driver_path = sys_device_path.join("driver");
let link_target = driver_path.read_link().context("read link")?;
let driver = link_target
.file_name()
.map_or(String::new(), |v| v.to_str().unwrap().to_owned());

// get vendor and device id from pci space (sys/bus/pci/devices/$bdf)
let iface_device_path = sys_pci_devices_path.join(&bdf).join("device");
// Get vendor and device id from sysfs device path
let iface_device_path = sys_device_path.join("device");
let device_id = std::fs::read_to_string(&iface_device_path)
.with_context(|| format!("read device path {:?}", &iface_device_path))?;

let iface_vendor_path = sys_pci_devices_path.join(&bdf).join("vendor");
let iface_vendor_path = sys_device_path.join("vendor");
let vendor_id = std::fs::read_to_string(&iface_vendor_path)
.with_context(|| format!("read vendor path {:?}", &iface_vendor_path))?;
let is_vf = link::is_vf(name).context("check if is vf")?;

// VF devices use VFIO passthrough — no real tap/bridge needed.
// Non-VF devices need a real tap+bridge pair for the hypervisor to
// connect the host NIC traffic into the VM, matching Go's
// createNetworkInterfacePair() call in createPhysicalEndpoint().
let net_pair = if is_vf {
NetworkPair::new_for_physical(name, hardware_addr, is_vf)
.context("new network pair for physical vf endpoint")?
} else {
NetworkPair::new(handle, idx, name, model, queues)
.await
.context("new network pair for physical non-vf endpoint")?
};

Ok(Self {
iface_name: name.to_string(),
Expand All @@ -84,9 +125,34 @@ impl PhysicalEndpoint {
.context("new vendor device")?,
driver,
bdf,
is_vf,
bus_type,
net_pair,
d,
})
}

/// Returns a shared reference to the [`NetworkPair`] held by this endpoint.
// NOTE(review): `dead_code` is allowed here presumably because nothing calls
// this accessor yet — confirm, and drop the attribute once a caller exists.
#[allow(dead_code)]
pub fn network_pair(&self) -> &NetworkPair {
&self.net_pair
}

/// Builds the [`NetworkConfig`] handed to the device manager / hypervisor
/// for this endpoint's network pair.
///
/// `host_dev_name` and `guest_mac` come from the pair's tap interface, while
/// `virt_iface_name` comes from the pair's virtual interface. All remaining
/// fields keep their `Default` values.
///
/// # Errors
///
/// Fails with an `InvalidData` I/O error when `utils::parse_mac` cannot parse
/// the tap interface's `hard_addr`.
fn get_network_config(&self) -> Result<NetworkConfig> {
    let pair = &self.net_pair;
    let tap = &pair.tap.tap_iface;

    let mac = match utils::parse_mac(&tap.hard_addr) {
        Some(mac) => mac,
        None => {
            let reason = format!("hard_addr {}", &tap.hard_addr);
            return Err(Error::new(io::ErrorKind::InvalidData, reason).into());
        }
    };

    let config = NetworkConfig {
        host_dev_name: tap.name.clone(),
        virt_iface_name: pair.virt_iface.name.clone(),
        guest_mac: Some(mac),
        ..Default::default()
    };
    Ok(config)
}
}

#[async_trait]
Expand All @@ -100,51 +166,74 @@ impl Endpoint for PhysicalEndpoint {
}

async fn attach(&self) -> Result<()> {
// unbind the physical interface from its host driver and bind it to vfio
driver::bind_device_to_vfio(
&self.bdf,
&self.driver,
&self.vendor_device_id.vendor_device_id(),
)
.with_context(|| format!("bind physical endpoint from {} to vfio", &self.driver))?;

let vfio_device = get_vfio_device(self.bdf.clone()).context("get vfio device failed.")?;
let vfio_dev_config = &mut VfioConfig {
host_path: vfio_device.clone(),
dev_type: "pci".to_string(),
hostdev_prefix: "physical_nic_".to_owned(),
..Default::default()
};
if self.is_vf {
// unbind the physical interface from its host driver and bind it to vfio
driver::bind_device_to_vfio(
&self.bdf,
&self.driver,
&self.vendor_device_id.vendor_device_id(),
)
.with_context(|| format!("bind physical endpoint from {} to vfio", &self.driver))?;

// create and insert VFIO device into Kata VM
do_handle_device(&self.d, &DeviceConfig::VfioCfg(vfio_dev_config.clone()))
.await
.context("do handle device failed.")?;
let vfio_device =
get_vfio_device(self.bdf.clone()).context("get vfio device failed.")?;
let vfio_dev_config = &mut VfioConfig {
host_path: vfio_device.clone(),
dev_type: "pci".to_string(),
hostdev_prefix: "physical_nic_".to_owned(),
..Default::default()
};

Ok(())
// create and insert VFIO device into Kata VM
do_handle_device(&self.d, &DeviceConfig::VfioCfg(vfio_dev_config.clone()))
.await
.context("do handle device failed.")?;

Ok(())
} else {
self.net_pair
.add_network_model()
.await
.context("add network model")?;
let config = self.get_network_config().context("get network config")?;
do_handle_device(&self.d, &DeviceConfig::NetworkCfg(config))
.await
.context("do handle network Physical endpoint device failed.")?;
Ok(())
}
}

// detach for physical endpoint unbinds the physical network interface from vfio-pci
// and binds it back to the saved host driver.
async fn detach(&self, _hypervisor: &dyn Hypervisor) -> Result<()> {
// bind back the physical network interface to host.
// we need to do this even if a new network namespace has not
// been created by virt-containers.

// we do not need to enter the network namespace to bind back the
// physical interface to host driver.
driver::bind_device_to_host(
&self.bdf,
&self.driver,
&self.vendor_device_id.vendor_device_id(),
)
.with_context(|| {
format!(
"bind physical endpoint device from vfio to {}",
&self.driver
if self.is_vf {
driver::bind_device_to_host(
&self.bdf,
&self.driver,
&self.vendor_device_id.vendor_device_id(),
)
})?;
Ok(())
.with_context(|| {
format!(
"bind physical endpoint device from vfio to {}",
&self.driver
)
})?;
Ok(())
} else {
self.net_pair
.del_network_model()
.await
.context("del network model")?;
let config = self.get_network_config().context("get network config")?;
_hypervisor
.remove_device(DeviceType::Network(NetworkDevice {
config,
..Default::default()
}))
.await
.context("remove Physical endpoint device by hypervisor failed.")?;
Ok(())
}
}

async fn save(&self) -> Option<EndpointState> {
Expand All @@ -155,6 +244,12 @@ impl Endpoint for PhysicalEndpoint {
vendor_id: self.vendor_device_id.vendor_id.clone(),
device_id: self.vendor_device_id.device_id.clone(),
hard_addr: self.hard_addr.clone(),
is_vf: self.is_vf,
iface_name: self.iface_name.clone(),
bus_type: match self.bus_type {
link::BusType::Pci => "pci".to_string(),
link::BusType::Vmbus => "vmbus".to_string(),
},
}),
..Default::default()
})
Expand Down
58 changes: 58 additions & 0 deletions src/runtime-rs/crates/resource/src/network/network_pair.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,35 @@ pub struct NetworkPair {
}

impl NetworkPair {
/// Builds a [`NetworkPair`] for a physical interface from its name and
/// hardware address.
///
/// The resulting pair has:
/// - a tap whose `id` is a freshly generated UUID and whose own `name` is empty;
/// - a `tap_iface` whose name is empty when `is_vf` is true and `name` otherwise;
/// - a `virt_iface` that always carries `name`;
/// - the same MAC string (from `utils::get_mac_addr`) on both interfaces;
/// - the `"none"` network model and `network_qos` disabled.
///
/// # Errors
///
/// Fails if the `"none"` model cannot be created or if `hardware_addr`
/// cannot be converted by `utils::get_mac_addr`.
pub(crate) fn new_for_physical(name: &str, hardware_addr: &[u8], is_vf: bool) -> Result<Self> {
    let tap_id = kata_sys_util::rand::UUID::new();
    let none_model = network_model::new("none").context("new none model")?;
    let mac = utils::get_mac_addr(hardware_addr).context("get mac addr")?;

    // Only the non-VF case names the tap-side interface after the host NIC.
    let tap_iface_name = if is_vf {
        String::new()
    } else {
        name.to_string()
    };

    let tap_iface = NetworkInterface {
        name: tap_iface_name,
        hard_addr: mac.clone(),
        ..Default::default()
    };
    let virt_iface = NetworkInterface {
        name: name.to_string(),
        hard_addr: mac,
        ..Default::default()
    };
    let tap = TapInterface {
        id: String::from(&tap_id),
        name: String::new(),
        tap_iface,
    };

    Ok(Self {
        tap,
        virt_iface,
        model: none_model,
        network_qos: false,
    })
}

pub(crate) async fn new(
handle: &rtnetlink::Handle,
idx: u32,
Expand Down Expand Up @@ -266,4 +295,33 @@ mod tests {
}
}
}

/// `new_for_physical` with `is_vf == true`: the tap-side interface is left
/// unnamed, the virtual interface keeps the host name, and both sides share
/// the same non-empty MAC string with QoS disabled.
#[test]
fn test_new_for_physical_vf() {
    let iface = "enp3s0f1";
    let mac_bytes: [u8; 6] = [0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff];

    let vf_pair = NetworkPair::new_for_physical(iface, &mac_bytes, true).unwrap();
    let tap_side = &vf_pair.tap.tap_iface;
    let guest_side = &vf_pair.virt_iface;

    // VF: no name on the tap-side interface.
    assert!(tap_side.name.is_empty());
    // The virtual interface still carries the host interface name.
    assert_eq!(guest_side.name.as_str(), iface);
    // The MAC is populated and identical on both sides.
    assert!(!tap_side.hard_addr.is_empty());
    assert_eq!(tap_side.hard_addr, guest_side.hard_addr);
    assert!(!vf_pair.network_qos);
}

/// `new_for_physical` with `is_vf == false`: both the tap-side and virtual
/// interfaces are named after the host interface and share the same
/// non-empty MAC string.
#[test]
fn test_new_for_physical_non_vf() {
    let iface = "enp4s0";
    let mac_bytes: [u8; 6] = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66];

    let plain_pair = NetworkPair::new_for_physical(iface, &mac_bytes, false).unwrap();
    let tap_side = &plain_pair.tap.tap_iface;
    let guest_side = &plain_pair.virt_iface;

    // Non-VF: both sides are named after the host interface.
    assert_eq!(tap_side.name.as_str(), iface);
    assert_eq!(guest_side.name.as_str(), iface);
    // The MAC is populated and identical on both sides.
    assert!(!tap_side.hard_addr.is_empty());
    assert_eq!(tap_side.hard_addr, guest_side.hard_addr);
}
}
21 changes: 14 additions & 7 deletions src/runtime-rs/crates/resource/src/network/network_with_netns.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,17 @@ async fn create_endpoint(
&attrs.name,
nix::unistd::gettid()
);
let t = PhysicalEndpoint::new(&attrs.name, &attrs.hardware_addr, d)
.context("new physical endpoint")?;
let t = PhysicalEndpoint::new(
handle,
&attrs.name,
&attrs.hardware_addr,
idx,
&config.network_model,
config.queues,
d,
)
.await
.context("new physical endpoint")?;
Arc::new(t)
} else {
info!(
Expand Down Expand Up @@ -297,9 +306,7 @@ fn is_physical_iface(name: &str) -> Result<bool> {
if name == "lo" {
return Ok(false);
}
let driver_info = link::get_driver_info(name)?;
if driver_info.bus_info.split(':').count() != 3 {
return Ok(false);
}
Ok(true)
// A physical interface is one backed by a PCI or VMBus device.
// This mirrors Go's isPhysicalIface which checks ParentDevBus == "pci" || "vmbus".
Ok(link::get_bus_type(name)?.is_some())
}
Loading