diff --git a/.gitignore b/.gitignore index 796ab4953..8fab5d65c 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,7 @@ utils/reports/ utils/logs/ utils/collect_iio_occ plots/ +multi_vms/generated/ *.log *.out diff --git a/multi_vms/run_vms.sh b/multi_vms/run_vms.sh new file mode 100755 index 000000000..b60c0bc68 --- /dev/null +++ b/multi_vms/run_vms.sh @@ -0,0 +1,638 @@ +#!/bin/bash +set -euo pipefail + +# ============================================================ +# Configuration +# ============================================================ + +# GUEST_CMD_LINE_NESTED="root=/dev/vda2 ro console=ttyS0,115200 earlyprintk=serial,ttyS0,115200 intel_iommu=on,sm_on iommu.strict=1 intel_iommu_pinned=on intel_iommu_dfp=on" +GUEST_CMD_LINE_NESTED="root=/dev/vda2 ro console=ttyS0,115200 earlyprintk=serial,ttyS0,115200 intel_iommu=on,sm_on iommu.strict=1" +GUEST_CMD_LINE_OFF="root=/dev/vda2 ro console=ttyS0,115200 earlyprintk=serial,ttyS0,115200 intel_iommu=off" + +# GUEST_KERNEL="6.12.9-iommufd-nested-iova-contig-cb-opt" +# GUEST_KERNEL_PATH="/boot-VM/vmlinuz-$GUEST_KERNEL" +# GUEST_INITRD_PATH="/boot-VM/initrd.img-$GUEST_KERNEL" +GUEST_KERNEL="6.12.9-iommufd" +GUEST_KERNEL_PATH="/boot/vmlinuz-$GUEST_KERNEL" +GUEST_INITRD_PATH="/boot/initrd.img-$GUEST_KERNEL" +GUEST_VIOMMU="off" # nested/off +NUM_VMS=12 +NUM_CORES="2" +NUM_IPRF="1" +NUM_FLOWS="1" +REUSE=0 + +# --- Hardcoded experiment config --- +GIT_REPO="/home/schai/viommu" +GIT_BRANCH="many-vm-setup" +VM_SCRIPT="cd /home/schai/viommu/scripts/sosp24-experiments; ./many_vm_flows_exp.sh" + +# --- Host paths (this script runs ON the host) --- +HOST_HOME="/home/lbalara" +HOST_FandS_REL="viommu/ManyVM-FandS" +HOST_SETUP_DIR="${HOST_HOME}/${HOST_FandS_REL}/utils" + +# --- Client machine config --- +CLIENT_HOME="/home/siyuanc3" +CLIENT_FandS_REL="Fast-and-Safe-IO-Memory-Protection" +CLIENT_SETUP_DIR_REL="utils" +CLIENT_INTF="ens1006np0" +CLIENT_IP="192.168.101.3" + +# Client SSH (from host -> client) 
+CLIENT_SSH_UNAME="siyuanc3" +CLIENT_SSH_HOST="nexus03.csl.illinois.edu" +CLIENT_SSH_PASSWORD="${CLIENT_SSH_PASSWORD:-}" # secret is taken from the environment, never committed; only consulted when CLIENT_USE_PASS_AUTH=1 +CLIENT_USE_PASS_AUTH=0 +CLIENT_SSH_IDENTITY_FILE="/home/lbalara/.ssh/id_rsa" + +# Client kernel validation +CLIENT_EXPECTED_KERNEL="6.12.9" +CLIENT_EXPECTED_IOMMU="intel_iommu=off" + +# --- Network & experiment parameters --- +MTU=4000 +DDIO_ENABLED=1 +RING_BUFFER_SIZE=512 +TCP_SOCKET_BUF_MB=1 + +# --- VM SSH settings (host -> guest VMs) --- +SSH_USER="schai" +SSH_KEY="/home/lbalara/.ssh/id_rsa" +SSH_OPTS="-i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -o LogLevel=ERROR" + +XML_DIR="./generated" + + +EXP_NAME="" +IP_BASE="192.168.122.100" + +# --- SR-IOV settings --- +VF_DRIVER="vfio-pci" + +# --- Timeouts --- +BOOT_TIMEOUT=120 # seconds to poll SSH +NIC_WAIT=120 # seconds to wait for guest NIC + +# --- Guest NIC interface name (SR-IOV VF) --- +GUEST_NIC="enp0s1" + +# --- Host-side in-tree modules to push into guests --- +HOST_MLX5_CORE="/lib/modules/$GUEST_KERNEL/kernel/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.ko" +HOST_MLXFW="/lib/modules/$GUEST_KERNEL/kernel/drivers/net/ethernet/mellanox/mlxfw/mlxfw.ko" + +# Save original directory +ORIG_DIR="$(pwd)" + +# ============================================================ +# Argument parsing +# ============================================================ + +usage() { + cat <<-USAGE + Usage: $0 [OPTIONS] + --num-cores N Number of cores per VM experiment (default: $NUM_CORES) + --num-flows N Number of flows per VM experiment (default: $NUM_FLOWS) + --num-vms N Number of VMs (default: $NUM_VMS) + --reuse Reuse already-defined VMs (skip undefine/define, just start) + --boot-timeout N Seconds to wait for SSH per VM (default: $BOOT_TIMEOUT) + -h, --help Show this help + USAGE +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --num-vms) NUM_VMS="$2"; shift 2 ;; + --num-cores) NUM_CORES="$2"; shift 2 ;; + --num-flows) NUM_FLOWS="$2"; shift 2 ;; + --reuse) REUSE=1; shift ;; + --boot-timeout) BOOT_TIMEOUT="$2"; shift 
2 ;; + -h|--help) usage; exit 0 ;; + *) echo "Error: unknown option '$1'" >&2; usage >&2; exit 1 ;; + esac +done + +if [[ -z "$NUM_CORES" || -z "$NUM_FLOWS" || -z "$NUM_VMS" || -z "$NUM_IPRF" ]]; then + echo "Error: --num-cores, --num-flows, --num-iperf, and --num-vms are required" >&2 + usage >&2 + exit 1 +fi + +# --- Build client SSH command --- +CLIENT_SETUP_DIR="${CLIENT_HOME}/${CLIENT_FandS_REL}/${CLIENT_SETUP_DIR_REL}" + +if [ "$CLIENT_USE_PASS_AUTH" -eq 1 ]; then + SSH_CLIENT_CMD="sshpass -p $CLIENT_SSH_PASSWORD ssh ${CLIENT_SSH_UNAME}@${CLIENT_SSH_HOST}" +else + SSH_CLIENT_CMD="ssh -i $CLIENT_SSH_IDENTITY_FILE ${CLIENT_SSH_UNAME}@${CLIENT_SSH_HOST}" +fi + +GUEST_CMD_LINE="" +if [[ "$GUEST_VIOMMU" == "nested" ]]; then + GUEST_CMD_LINE="$GUEST_CMD_LINE_NESTED" +else + GUEST_CMD_LINE="$GUEST_CMD_LINE_OFF" +fi + +# ============================================================ +# Helper functions +# ============================================================ + +log_info() { + echo "[INFO-$(date +%Y-%m-%d-%H:%M:%S)] [host] $1" +} + +log_error() { + echo "[ERROR-$(date +%Y-%m-%d-%H:%M:%S)] [host] $1" >&2 +} + +parse_iommu_mode() { + local cmdline="${1:-$(/dev/null | grep -v '^$'); do + echo " Destroying: $vm" + virsh destroy "$vm" 2>/dev/null || true + done + sleep 2 +} + +wait_for_ssh() { + local ip=$1 + local name=$2 + local elapsed=0 + + while [[ $elapsed -lt $BOOT_TIMEOUT ]]; do + if ssh $SSH_OPTS "$SSH_USER@$ip" true 2>/dev/null; then + echo " ${name} (${ip}): SSH reachable after ${elapsed}s" + return 0 + fi + sleep 5 + elapsed=$((elapsed + 5)) + done + + log_error "${name} (${ip}): SSH not reachable after ${BOOT_TIMEOUT}s" + return 1 +} + +wait_for_nic() { + local ip=$1 + local name=$2 + + if ssh $SSH_OPTS "$SSH_USER@$ip" "ip link show $GUEST_NIC" &>/dev/null; then + echo " ${name} (${ip}): $GUEST_NIC already up" + return 0 + fi + + echo " ${name} (${ip}): $GUEST_NIC not found, pushing in-tree mlx5 modules..." 
+ scp $SSH_OPTS "$HOST_MLX5_CORE" "$HOST_MLXFW" "$SSH_USER@$ip:/tmp/" &>/dev/null + scp $SSH_OPTS /tmp/modules.tar.gz "$SSH_USER@$ip:/tmp/" &>/dev/null # unpacked by the remote script below; every \$(uname -r) there is escaped so it resolves to the guest's kernel, not the host's + + ssh $SSH_OPTS "$SSH_USER@$ip" " + sudo tar xzf /tmp/modules.tar.gz -C /lib/modules/\$(uname -r)/ + sudo rm -f /lib/modules/\$(uname -r)/updates/dkms/mlx5_core.ko \ + /lib/modules/\$(uname -r)/updates/dkms/mlx5-vfio-pci.ko \ + /lib/modules/\$(uname -r)/updates/dkms/mlxfw.ko \ + /lib/modules/\$(uname -r)/updates/dkms/mlx_compat.ko \ + /lib/modules/\$(uname -r)/updates/dkms/mlxdevm.ko + sudo mkdir -p /lib/modules/\$(uname -r)/kernel/drivers/net/ethernet/mellanox/mlx5/core + sudo mkdir -p /lib/modules/\$(uname -r)/kernel/drivers/net/ethernet/mellanox/mlxfw + sudo cp /tmp/mlx5_core.ko /lib/modules/\$(uname -r)/kernel/drivers/net/ethernet/mellanox/mlx5/core/ + sudo cp /tmp/mlxfw.ko /lib/modules/\$(uname -r)/kernel/drivers/net/ethernet/mellanox/mlxfw/ + sudo depmod -a 2>/dev/null + sudo modprobe mlx5_core 2>/dev/null + sudo modprobe msr 2>/dev/null + " &>/dev/null + + local elapsed=0 + while [[ $elapsed -lt $NIC_WAIT ]]; do + if ssh $SSH_OPTS "$SSH_USER@$ip" "ip link show $GUEST_NIC" &>/dev/null; then + echo " ${name} (${ip}): $GUEST_NIC ready after ${elapsed}s" + return 0 + fi + sleep 5 + elapsed=$((elapsed + 5)) + echo " ${name} (${ip}): waiting for $GUEST_NIC... (${elapsed}s/${NIC_WAIT}s)" + done + + log_error "${name} (${ip}): $GUEST_NIC not found after ${NIC_WAIT}s" + return 1 +} + +sync_git_repo() { + local ip=$1 + local name=$2 + + echo " ${name} (${ip}): git checkout ${GIT_BRANCH} + pull..." + ssh $SSH_OPTS "$SSH_USER@$ip" " + cd ${GIT_REPO} && \ + git fetch --all && \ + git reset --hard HEAD && \ + git checkout ${GIT_BRANCH} && \ + git reset --hard origin/${GIT_BRANCH} + " + if [[ $? 
-ne 0 ]]; then + log_error "${name} (${ip}): git sync failed" + return 1 + fi + echo " ${name} (${ip}): git repo synced" +} + +# --- Client kernel check (run once from host) --- +check_client_kernel() { + log_info "Checking client kernel and IOMMU config..." + + local client_kernel + client_kernel=$($SSH_CLIENT_CMD 'uname -r') + local client_cmdline + client_cmdline=$($SSH_CLIENT_CMD 'cat /proc/cmdline') + + if [[ "$client_kernel" != *"$CLIENT_EXPECTED_KERNEL"* ]]; then + log_error "Client kernel mismatch. Expected: $CLIENT_EXPECTED_KERNEL, Actual: $client_kernel" + return 1 + fi + + if [[ "$client_cmdline" != *"$CLIENT_EXPECTED_IOMMU"* ]]; then + log_error "Client IOMMU mismatch. Expected: $CLIENT_EXPECTED_IOMMU, Actual: $client_cmdline" + return 1 + fi + + log_info "Client kernel check PASSED (kernel=$client_kernel)" +} + +# --- Client environment setup (run once from host before experiments) --- +setup_client() { + log_info "Setting up client environment on $CLIENT_SSH_HOST..." + + # Disable flow control + log_info "Disabling TX/RX pause on client interface $CLIENT_INTF" + $SSH_CLIENT_CMD "sudo ethtool --pause $CLIENT_INTF tx off rx off" + + # Disable SMT + log_info "Disabling SMT on client" + $SSH_CLIENT_CMD "echo off | sudo tee /sys/devices/system/cpu/smt/control" + + # Run client setup-envir.sh + log_info "Running client setup-envir.sh..." + $SSH_CLIENT_CMD "cd '$CLIENT_SETUP_DIR'; \ + sudo bash setup-envir.sh \ + --dep '$CLIENT_HOME' \ + --intf '$CLIENT_INTF' \ + --ip '$CLIENT_IP' \ + -m '$MTU' \ + -d '$DDIO_ENABLED' \ + -r '$RING_BUFFER_SIZE' \ + --socket-buf '$TCP_SOCKET_BUF_MB' \ + --hwpref 1 --rdma 0 --pfc 0 --ecn 1 --opt 1" + + $SSH_CLIENT_CMD "sudo pkill -9 -f iperf" + + log_info "Client setup complete" +} + +# --- Host environment setup (run locally, this IS the host) --- +setup_host() { + log_info "Setting up host environment..." 
+ + cd "$HOST_SETUP_DIR" || { log_error "Failed to cd to $HOST_SETUP_DIR"; return 1; } + sudo bash setup-host.sh \ + -m "$MTU" \ + --socket-buf "$TCP_SOCKET_BUF_MB" \ + --hwpref 1 --rdma 0 --ecn 1 + cd "$ORIG_DIR" + + log_info "Host setup complete" +} + +# --- Cleanup (post-experiment) --- +cleanup() { + echo "" + log_info "=== Cleanup ===" + + # Sync guest filesystems + log_info "Syncing VM filesystems..." + for ((i = 0; i < NUM_VMS; i++)); do + local ip + ip=$(vm_ip "$i") + ssh $SSH_OPTS "$SSH_USER@$ip" "sync" 2>/dev/null || true + done + sleep 2 + + # Destroy VMs and kill console loggers + log_info "Destroying VMs..." + for ((i = 0; i < NUM_VMS; i++)); do + echo " Destroying: ${vm_names[$i]}" + virsh destroy "${vm_names[$i]}" 2>/dev/null || true + if [[ -n "${console_pids[$i]:-}" ]]; then + kill "${console_pids[$i]}" 2>/dev/null || true + wait "${console_pids[$i]}" 2>/dev/null || true + fi + done + + # Reset SR-IOV + log_info "Resetting SR-IOV..." + sudo ./sriov_undo.sh + + # Host reset + log_info "Running host reset..." 
+ cd "$HOST_SETUP_DIR" || { log_error "Failed to cd to $HOST_SETUP_DIR"; return; } + sudo ./reset-host.sh + cd "$ORIG_DIR" + + log_info "=== Cleanup complete ===" +} + +# ============================================================ +# Main +# ============================================================ + +host_cmdline=$(cat /proc/cmdline) +host_iommu_config=$(parse_iommu_mode "$host_cmdline") + +iommu_config="host-${host_iommu_config}-guest-${GUEST_VIOMMU}" +echo "iommu_config: $iommu_config" + +log_info "Doing sriov undo" +sudo ./sriov_undo.sh + +# --- Step 1: Generate XML files --- +if [[ $REUSE -eq 0 ]]; then # regenerate into XML_DIR so the directory wiped, written and later scanned is one and the same + rm -rf -- "${XML_DIR:?}" + ./xml_generator.sh --kernel "$GUEST_KERNEL_PATH" --initrd "$GUEST_INITRD_PATH" --cmdline "$GUEST_CMD_LINE" \ + --vcpus "$NUM_CORES" --num-vms "$NUM_VMS" --viommu "$GUEST_VIOMMU" --output-dir "$XML_DIR" +fi + +timestamp=$(date '+%Y-%m-%d-%H-%M-%S') +EXP_NAME="${timestamp}-$GUEST_KERNEL-MANY-flow${NUM_FLOWS}-${iommu_config}-${NUM_CORES}cores-${NUM_IPRF}iprf" + +echo "============================================================" +echo " VM Benchmark Runner" +echo "============================================================" +echo " VMs: ${NUM_VMS}" +echo " XML dir: ${XML_DIR}" +echo " Reuse: ${REUSE}" +echo " Branch: ${GIT_BRANCH}" +echo " Cores: ${NUM_CORES}" +echo " Iperf: ${NUM_IPRF}" +echo " Flows: ${NUM_FLOWS}" +echo " Exp name: ${EXP_NAME}" +echo " MTU: ${MTU}" +echo " Ring buf: ${RING_BUFFER_SIZE}" +echo " Socket buf: ${TCP_SOCKET_BUF_MB} MB" +echo "============================================================" +echo "" + +# --- Step 2: Discover XML files and VM names --- +log_info "Step 2: Discovering VM XML files..." 
+xml_files=() +for f in "${XML_DIR}"/*.xml; do + [[ -f "$f" ]] || continue + xml_files+=("$f") +done + +if [[ ${#xml_files[@]} -lt $NUM_VMS ]]; then + log_error "Found ${#xml_files[@]} XML files in ${XML_DIR}, need ${NUM_VMS}" + exit 1 +fi + +IFS=$'\n' xml_files=($(sort <<<"${xml_files[*]}")); unset IFS + +vm_names=() +for ((i = 0; i < NUM_VMS; i++)); do + xml="${xml_files[$i]}" + vm_name=$(grep -oP '(?<=)[^<]+' "$xml") + vm_names+=("$vm_name") + echo " VM${i}: ${vm_name}" +done + +# --- Step 3: Destroy any running VMs --- +echo "" +log_info "Step 3: Destroying any running VMs..." +destroy_all_running_vms + +# --- Step 4: Define or reuse VMs --- +echo "" +if [[ $REUSE -eq 0 ]]; then + log_info "Step 4: Defining VMs from ${XML_DIR}..." + for ((i = 0; i < NUM_VMS; i++)); do + xml="${xml_files[$i]}" + name="${vm_names[$i]}" + + if virsh dominfo "$name" &>/dev/null; then + echo " ${name}: already exists, undefining..." + virsh undefine "$name" 2>/dev/null || true + fi + + virsh define "$xml" + echo " Defined: ${name} (from ${xml})" + done +else + log_info "Step 4: Reusing already-defined VMs..." + for ((i = 0; i < NUM_VMS; i++)); do + name="${vm_names[$i]}" + if ! virsh dominfo "$name" &>/dev/null; then + log_error "${name} is not defined. Run without --reuse first." + exit 1 + fi + echo " Reusing: ${name}" + done +fi + +# --- Step 5: Check client kernel --- +echo "" +log_info "Step 5: Checking client kernel..." +if ! check_client_kernel; then + log_error "Client kernel check failed, aborting" + cleanup + exit 1 +fi + +# --- Step 6: Setup client environment --- +# Setup before so we have access to device +echo "" +log_info "Step 6: Setting up client environment..." +if ! setup_client; then + log_error "Client setup failed, aborting" + cleanup + exit 1 +fi + +# --- Step 7: Configure SR-IOV --- +echo "" +log_info "Step 7: Configuring SR-IOV (${NUM_VMS} VFs)..." 
+sudo ./sriov.sh "$NUM_VMS" + +# --- Step 8: Start all VMs + attach console loggers --- +echo "" +log_info "Step 8: Starting VMs..." +console_pids=() +for ((i = 0; i < NUM_VMS; i++)); do + name="${vm_names[$i]}" + virsh start "$name" + sleep 2 + script -q -c "virsh console $name" "${name}.log" > /dev/null 2>&1 & + console_pids+=($!) + echo " Started: ${name} (console -> ${name}.log)" +done + +# --- Step 9: Wait for SSH on all VMs --- +echo "" +log_info "Step 9: Waiting for SSH (timeout: ${BOOT_TIMEOUT}s per VM)..." +failed=0 +for ((i = 0; i < NUM_VMS; i++)); do + ip=$(vm_ip "$i") + if ! wait_for_ssh "$ip" "${vm_names[$i]}"; then + failed=$((failed + 1)) + fi +done + +if [[ $failed -gt 0 ]]; then + log_error "${failed} VM(s) not reachable via SSH, aborting" + cleanup + exit 1 +fi + +# --- Step 10: Fix mlx5 modules and wait for guest NIC --- +echo "" +log_info "Step 10: Waiting for guest NIC ($GUEST_NIC)..." +failed=0 +tar czf /tmp/modules.tar.gz -C /lib/modules/$GUEST_KERNEL . +for ((i = 0; i < NUM_VMS; i++)); do + ip=$(vm_ip "$i") + if ! wait_for_nic "$ip" "${vm_names[$i]}"; then + failed=$((failed + 1)) + fi +done + +if [[ $failed -gt 0 ]]; then + log_error "${failed} VM(s) missing $GUEST_NIC, aborting" + cleanup + exit 1 +fi + +# --- Step 11: Sync git repo on all VMs --- +echo "" +log_info "Step 11: Syncing git repo (${GIT_BRANCH})..." +failed=0 +for ((i = 0; i < NUM_VMS; i++)); do + ip=$(vm_ip "$i") + if ! sync_git_repo "$ip" "${vm_names[$i]}"; then + failed=$((failed + 1)) + fi +done + +if [[ $failed -gt 0 ]]; then + log_error "${failed} VM(s) failed git sync, aborting" + cleanup + exit 1 +fi + + +# --- Step 12: Setup host environment --- +echo "" +log_info "Step 12: Setting up host environment..." +if ! setup_host; then + log_error "Host setup failed, aborting" + cleanup + exit 1 +fi + +# --- Step 13: Run experiment on all VMs simultaneously --- +echo "" +log_info "Step 13: Launching experiments on all VMs..." 
+echo " Script: ${VM_SCRIPT}" +echo " Cores: ${NUM_CORES}" +echo " Iperf: ${NUM_IPRF}" +echo " Flows: ${NUM_FLOWS}" +echo " Exp name: ${EXP_NAME}" +echo "" + +ssh_pids=() +for ((i = 0; i < NUM_VMS; i++)); do + ip=$(vm_ip "$i") + name="${vm_names[$i]}" + + vm_cmd="${VM_SCRIPT} --vm-name ${name} --num-cores ${NUM_IPRF} --num-flows ${NUM_FLOWS} --exp-name ${EXP_NAME}-${name}" # NOTE(review): --num-cores receives NUM_IPRF, not NUM_CORES — confirm this is intended + ssh $SSH_OPTS "$SSH_USER@$ip" "$vm_cmd" &>"${name}_experiment.log" & + ssh_pids+=($!) + echo " Launched on ${name} (${ip}), log -> ${name}_experiment.log" +done + +echo "" +log_info "All experiments launched. Waiting for completion..." + +any_failed=false +for ((i = 0; i < NUM_VMS; i++)); do + name="${vm_names[$i]}" + rc=0 + wait "${ssh_pids[$i]}" || rc=$? # a bare failing wait would trip set -e and exit before the FAILED report and cleanup + if [[ $rc -eq 0 ]]; then + echo " ${name}: finished (exit 0)" + else + echo " ${name}: FAILED (exit $rc)" + any_failed=true + fi +done + +# --- Step 14: Cleanup --- +echo "" +log_info "Step 14: Cleanup..." +cleanup + +if [[ "$any_failed" == true ]]; then + echo "" + log_info "=== Experiment finished with ERRORS (check *_experiment.log files) ===" + exit 1 +else + echo "" + log_info "=== Experiment finished successfully ===" + exit 0 +fi \ No newline at end of file diff --git a/multi_vms/sriov.sh b/multi_vms/sriov.sh new file mode 100755 index 000000000..e22f5198e --- /dev/null +++ b/multi_vms/sriov.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -euo pipefail + +PF_BDF="0000:98:00.0" +NUM_VFS="${1:?Usage: $0 NUM_VFS}" +PF_SYSFS="/sys/bus/pci/devices/$PF_BDF" + +log_info() { echo "[INFO] $1"; } +log_warn() { echo "[WARN] $1"; } +log_error() { echo "[ERROR] $1"; } + +[[ $EUID -eq 0 ]] || { echo "Run as root."; exit 1; } + +get_drv() { + local d="/sys/bus/pci/devices/$1/driver" + if [[ -L "$d" ]]; then + basename "$(readlink "$d")" + else + echo "none" + fi +} + +modprobe -q mlx5_core || true +pf_drv=$(get_drv "$PF_BDF") +echo "PF driver: $pf_drv" + +if [[ "$pf_drv" != "mlx5_core" ]]; then + if [[ "$pf_drv" != "none" ]]; then + echo "$PF_BDF" > 
"/sys/bus/pci/drivers/$pf_drv/unbind" + fi + echo "mlx5_core" > "$PF_SYSFS/driver_override" + echo "$PF_BDF" > /sys/bus/pci/drivers_probe + echo "" > "$PF_SYSFS/driver_override" + echo "PF bound to mlx5_core" +fi + +echo 0 > "$PF_SYSFS/sriov_numvfs" +echo "$NUM_VFS" > "$PF_SYSFS/sriov_numvfs" +echo "Created $NUM_VFS VFs" + +modprobe -q vfio-pci +echo 1 | sudo tee /sys/module/vfio_pci/parameters/enable_sriov + +for ((i = 0; i < NUM_VFS; i++)); do + vf_bdf=$(basename "$(readlink -f "$PF_SYSFS/virtfn${i}")") + vf_drv=$(get_drv "$vf_bdf") # readlink -f still prints a path when the driver link is absent, so it never produced "none"; get_drv checks the link itself + [[ "$vf_drv" != "none" ]] && echo "$vf_bdf" > "/sys/bus/pci/drivers/$vf_drv/unbind" + echo "vfio-pci" > "/sys/bus/pci/devices/$vf_bdf/driver_override" + echo "$vf_bdf" > /sys/bus/pci/drivers_probe + echo "" > "/sys/bus/pci/devices/$vf_bdf/driver_override" + grp=$(basename "$(readlink -f "/sys/bus/pci/devices/$vf_bdf/iommu_group")") + echo "VF $((i+1)): $vf_bdf iommu_group=$grp" +done diff --git a/multi_vms/sriov_undo.sh b/multi_vms/sriov_undo.sh new file mode 100755 index 000000000..55232132e --- /dev/null +++ b/multi_vms/sriov_undo.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +PF_BDF="0000:98:00.0" +PF_SYSFS="/sys/bus/pci/devices/$PF_BDF" + +get_drv() { + local d="/sys/bus/pci/devices/$1/driver" + if [[ -L "$d" ]]; then + basename "$(readlink "$d")" + else + echo "none" + fi +} + +modprobe -q mlx5_core || true +pf_drv=$(get_drv "$PF_BDF") +echo "PF driver: $pf_drv" + + +if [[ "$pf_drv" != "mlx5_core" ]]; then + if [[ "$pf_drv" != "none" ]]; then + echo "$PF_BDF" > "/sys/bus/pci/drivers/$pf_drv/unbind" + fi + echo "mlx5_core" > "$PF_SYSFS/driver_override" + echo "$PF_BDF" > /sys/bus/pci/drivers_probe + echo "" > "$PF_SYSFS/driver_override" + echo "PF bound to mlx5_core" +fi + +# Destroy VFs +echo 0 > "$PF_SYSFS/sriov_numvfs" + +# Rebind physical function for passthrough +# echo "0000:98:00.0" | sudo tee /sys/bus/pci/drivers/mlx5_core/unbind +# echo "vfio-pci" | sudo 
tee /sys/bus/pci/devices/0000:98:00.0/driver_override +# echo "0000:98:00.0" | sudo tee /sys/bus/pci/drivers/vfio-pci/bind + diff --git a/multi_vms/xml_generator.sh b/multi_vms/xml_generator.sh new file mode 100755 index 000000000..018d1974a --- /dev/null +++ b/multi_vms/xml_generator.sh @@ -0,0 +1,299 @@ +#!/bin/bash +# Generate libvirt XML configs for nested vIOMMU VMs with VFIO passthrough. +# +# Each VM i (0-based) gets: +# - VF at 0000:98:DD.F, where DD = (i+1)/8 as two hex digits and F = (i+1)%8 +# - Disk at /data/server_small.qcow2 for VM 0, /data/server_smallN.qcow2 with N = i+1 otherwise +# - vCPUs pinned starting at cpuset 64 + i * vcpus +# - Unique MAC address +# +# Usage: +# ./xml_generator.sh --kernel /boot/vmlinuz-6.12.9-iommufd \ +# --initrd /boot/initrd.img-6.12.9-iommufd \ +# --vcpus 16 --num-vms 4 --viommu nested \ +# --cmdline "root=/dev/vda2 ro console=ttyS0,115200 intel_iommu=on,sm_on iommu.strict=1" + +set -euo pipefail + +# --- Defaults --- +NUM_VMS=2 +VCPUS=16 +VIOMMU="nested" +KERNEL="/boot/vmlinuz-6.12.9-iommufd" +INITRD="/boot/initrd.img-6.12.9-iommufd" +CMDLINE="root=/dev/vda2 ro console=ttyS0,115200 earlyprintk=serial,ttyS0,115200 intel_iommu=on,sm_on iommu.strict=1" +TAG="vanilla" +OUTPUT_DIR="./generated" + +QEMU_BIN="/home/lbalara/viommu/qemu-nested/build/qemu-system-x86_64" +BIOS="/home/lbalara/viommu/qemu-nested/pc-bios/bios-256k.bin" + +# --- Mostly static configs --- +CPU_START=64 +NUMA_NODE=2 +MEMORY_KIB=14680064 # 14 GiB (14 * 1024 * 1024 KiB) + +MAX_VMS=16 + +usage() { + cat <<-USAGE + Usage: $0 [OPTIONS] + + Required: + --kernel PATH Full path to kernel image + --initrd PATH Full path to initrd image + --cmdline STRING Full kernel command line + + Optional: + --vcpus N vCPUs per VM (default: 16) + --num-vms N Number of VMs to generate, max $MAX_VMS (default: 2) + --viommu nested|off Enable vIOMMU + iommufd passthrough (default: nested) + --qemu PATH QEMU binary path (default: $QEMU_BIN) + --bios PATH BIOS image path (default: $BIOS) + --output-dir DIR Output directory for XML files (default: $OUTPUT_DIR) 
+ --cpu-start N First physical CPU for pinning (default: $CPU_START) + --numa-node N NUMA node for memory binding (default: $NUMA_NODE) + -h, --help Show this help + USAGE +} + +# --- Parse arguments --- +while [[ $# -gt 0 ]]; do + case "$1" in + --kernel) KERNEL="$2"; shift 2 ;; + --initrd) INITRD="$2"; shift 2 ;; + --cmdline) CMDLINE="$2"; shift 2 ;; + --vcpus) VCPUS="$2"; shift 2 ;; + --num-vms) NUM_VMS="$2"; shift 2 ;; + --viommu) VIOMMU="$2"; shift 2 ;; + --qemu) QEMU_BIN="$2"; shift 2 ;; + --bios) BIOS="$2"; shift 2 ;; + --output-dir) OUTPUT_DIR="$2"; shift 2 ;; + --cpu-start) CPU_START="$2"; shift 2 ;; + --numa-node) NUMA_NODE="$2"; shift 2 ;; + --tag) TAG="$2"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) echo "Error: unknown option '$1'" >&2; usage >&2; exit 1 ;; + esac +done + +# --- Validate --- +err=0 +for var in KERNEL INITRD CMDLINE; do + if [[ -z "${!var}" ]]; then + echo "Error: --$(echo "$var" | tr '[:upper:]' '[:lower:]') is required" >&2 + err=1 + fi +done +[[ $err -ne 0 ]] && exit 1 + +if [[ "$VIOMMU" != "nested" && "$VIOMMU" != "off" ]]; then + echo "Error: --viommu must be 'nested' or 'off'" >&2 + exit 1 +fi + +if [[ "$NUM_VMS" -lt 1 || "$NUM_VMS" -gt "$MAX_VMS" ]]; then + echo "Error: --num-vms must be between 1 and $MAX_VMS" >&2 + exit 1 +fi + +mkdir -p "$OUTPUT_DIR" + +# --- Helper: generate vcpupin entries --- +gen_vcpupin() { + local cpu_base=$1 + local nvcpus=$2 + local i + + for ((i = 0; i < nvcpus; i++)); do + printf " \n" "$i" "$((cpu_base + i))" + done +} + +# --- Helper: generate pcie-root-port controllers --- +gen_pcie_root_ports() { + local idx + local port + local slot + local func + + for ((idx = 1; idx <= 14; idx++)); do + port=$((0x0f + idx)) + if [[ $idx -le 8 ]]; then + slot=2 + func=$((idx - 1)) + else + slot=3 + func=$((idx - 9)) + fi + + local mf="" + if [[ $func -eq 0 ]]; then + mf=" multifunction='on'" + fi + + cat <<-PORT + + + +
+ + PORT + done +} + +# --- Generate VM XMLs --- +for ((vm = 0; vm < NUM_VMS; vm++)); do + vm_name="${TAG}-generated-iommufd-${VIOMMU}-vcpu${VCPUS}-vm${vm}" + vf_num=$((vm + 1)) + vf_dev=$(printf '%02x' $((vf_num / 8))) + vf_func=$((vf_num % 8)) + vf_pci="0000:98:${vf_dev}.${vf_func}" + if [[ $vm -eq 0 ]]; then + disk="/data/server_small.qcow2" + else + disk="/data/server_small${vf_num}.qcow2" + fi + cpu_base=$((CPU_START + vm * VCPUS)) + mac_last=$(printf '%02x' $(( (0xe2 + vm) & 0xff ))) + mac="52:54:00:14:26:${mac_last}" + outfile="${OUTPUT_DIR}/${TAG}-iommufd-${VIOMMU}-vcpu${VCPUS}-vm${vm}.xml" + + cat > "$outfile" < + ${vm_name} + + + + + + ${MEMORY_KIB} + ${MEMORY_KIB} + ${VCPUS} + + + + +$(gen_vcpupin "$cpu_base" "$VCPUS") + + + /machine + + + hvm + ${KERNEL} + ${INITRD} + ${CMDLINE} + ${BIOS} + + + + + + + + + + + + + destroy + restart + destroy + + + + + + ${QEMU_BIN} + + + + + + +
+ + + + + +
+ + +
+ + +$(gen_pcie_root_ports) + +
+ + +
+ + + + + +
+ + + + + + + + + + + + + +
+ + + +
+ + + + + + + +