From eca54a4b970fc8725b074ca0ffd4c23674f4b56e Mon Sep 17 00:00:00 2001 From: Saul Paredes Date: Thu, 26 Mar 2026 09:04:04 -0700 Subject: [PATCH 1/9] node-builder: also build runtime-rs build runtime-rs binary and debug config Signed-off-by: Saul Paredes --- .../node-builder/azure-linux/common.sh | 7 ++ .../node-builder/azure-linux/package_build.sh | 76 ++++++++++++++----- 2 files changed, 64 insertions(+), 19 deletions(-) diff --git a/tools/osbuilder/node-builder/azure-linux/common.sh b/tools/osbuilder/node-builder/azure-linux/common.sh index 8b0665c47aa2..50a9d297edb6 100755 --- a/tools/osbuilder/node-builder/azure-linux/common.sh +++ b/tools/osbuilder/node-builder/azure-linux/common.sh @@ -12,6 +12,13 @@ OS_VERSION=$(sort -r /etc/*-release | gawk 'match($0, /^(VERSION_ID=(.*))$/, a) ([[ "${OS_VERSION}" == "2.0" ]] || [[ "${OS_VERSION}" == "3.0" ]]) || die "OS_VERSION: value '${OS_VERSION}' must equal 3.0 (default) or 2.0" +SHIM_CONFIG_FILE_NAME_RUNTIME_GO="configuration-clh.toml" +SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_GO="configuration-clh-debug.toml" +CONFIG_DIR_RUNTIME_GO="src/runtime/config" +SHIM_CONFIG_FILE_NAME_RUNTIME_RS="configuration-cloud-hypervisor.toml" +SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_RS="configuration-cloud-hypervisor-debug.toml" +CONFIG_DIR_RUNTIME_RS="src/runtime-rs/config" + if [ "${CONF_PODS}" == "yes" ]; then INSTALL_PATH_PREFIX="/opt/confidential-containers" UVM_TOOLS_PATH_OSB="${INSTALL_PATH_PREFIX}/uvm/tools/osbuilder" diff --git a/tools/osbuilder/node-builder/azure-linux/package_build.sh b/tools/osbuilder/node-builder/azure-linux/package_build.sh index 346ba5a9f092..6d0f85174147 100755 --- a/tools/osbuilder/node-builder/azure-linux/package_build.sh +++ b/tools/osbuilder/node-builder/azure-linux/package_build.sh @@ -19,9 +19,28 @@ repo_dir="${script_dir}/../../../../" common_file="common.sh" source "${common_file}" -# these options ensure we produce the proper CLH config file -runtime_make_flags="SKIP_GO_VERSION_CHECK=1 QEMUCMD= FCCMD= 
ACRNCMD= STRATOVIRTCMD= DEFAULT_HYPERVISOR=cloud-hypervisor - DEFMEMSZ=0 DEFSTATICSANDBOXWORKLOADMEM=512 DEFVCPUS=0 DEFSTATICSANDBOXWORKLOADVCPUS=1 DEFVIRTIOFSDAEMON=${VIRTIOFSD_BINARY_LOCATION} PREFIX=${INSTALL_PATH_PREFIX}" +runtime_go_make_flags="SKIP_GO_VERSION_CHECK=1 \ + QEMUCMD= \ + FCCMD= \ + ACRNCMD= \ + STRATOVIRTCMD= \ + DEFAULT_HYPERVISOR=cloud-hypervisor \ + DEFMEMSZ=0 \ + DEFSTATICSANDBOXWORKLOADMEM=512 \ + DEFVCPUS=0 \ + DEFSTATICSANDBOXWORKLOADVCPUS=1 \ + DEFVIRTIOFSDAEMON=${VIRTIOFSD_BINARY_LOCATION} \ + PREFIX=${INSTALL_PATH_PREFIX}" + +runtime_rs_make_flags="BUILD_TYPE=release \ + LIBC=gnu \ + HYPERVISOR=cloud-hypervisor \ + OPENSSL_NO_VENDOR=Y \ + USE_BUILDIN_DB=false \ + QEMUCMD= \ + FCCMD= \ + DEFVIRTIOFSDAEMON=${VIRTIOFSD_BINARY_LOCATION} \ + PREFIX=${INSTALL_PATH_PREFIX}" # - for vanilla Kata we use the kernel binary. For ConfPods we use IGVM, so no need to provide kernel path. # - for vanilla Kata we explicitly set DEFSTATICRESOURCEMGMT_CLH. For ConfPods, @@ -29,14 +48,17 @@ runtime_make_flags="SKIP_GO_VERSION_CHECK=1 QEMUCMD= FCCMD= ACRNCMD= STRATOVIRTC # - for ConfPods we explicitly set the cloud-hypervisor path. 
The path is independent of the PREFIX variable # as we have a single CLH binary for both vanilla Kata and ConfPods if [ "${CONF_PODS}" == "no" ]; then - runtime_make_flags+=" DEFSTATICRESOURCEMGMT_CLH=true KERNELPATH_CLH=${KERNEL_BINARY_LOCATION} DEFSANDBOXWORKLOADMEMMIN=128" + runtime_go_make_flags+=" DEFSTATICRESOURCEMGMT_CLH=true KERNELPATH_CLH=${KERNEL_BINARY_LOCATION} DEFSANDBOXWORKLOADMEMMIN=128" + runtime_rs_make_flags+=" DEFSTATICRESOURCEMGMT_CLH=true KERNELPATH_CLH=${KERNEL_BINARY_LOCATION}" else - runtime_make_flags+=" CLHPATH=${CLOUD_HYPERVISOR_LOCATION} DEFSANDBOXWORKLOADMEMMIN=192" + runtime_go_make_flags+=" CLHPATH=${CLOUD_HYPERVISOR_LOCATION} DEFSANDBOXWORKLOADMEMMIN=192" + runtime_rs_make_flags+=" CLHPATH=${CLOUD_HYPERVISOR_LOCATION}" fi # On Mariner 3.0 we use cgroupsv2 with a single sandbox cgroup if [ "${OS_VERSION}" == "3.0" ]; then - runtime_make_flags+=" DEFSANDBOXCGROUPONLY=true" + runtime_go_make_flags+=" DEFSANDBOXCGROUPONLY=true" + runtime_rs_make_flags+=" DEFSANDBOXCGROUPONLY_CLH=true" fi agent_make_flags="LIBC=gnu OPENSSL_NO_VENDOR=Y DESTDIR=${AGENT_INSTALL_DIR} BUILD_TYPE=${AGENT_BUILD_TYPE}" @@ -65,29 +87,45 @@ if [ "${CONF_PODS}" == "yes" ]; then popd fi -echo "Building shim binary and configuration" +echo "Building runtime-go shim binary" pushd src/runtime/ if [ "${CONF_PODS}" == "yes" ] || [ "${OS_VERSION}" == "3.0" ]; then - make ${runtime_make_flags} + make ${runtime_go_make_flags} else # Mariner 2 pod sandboxing uses cgroupsv1 - note: cannot add the kernelparams in above assignments, # leads to quotation issue. 
Hence, implementing the conditional check right here at the time of the make command - make ${runtime_make_flags} KERNELPARAMS="systemd.legacy_systemd_cgroup_controller=yes systemd.unified_cgroup_hierarchy=0" + make ${runtime_go_make_flags} KERNELPARAMS="systemd.legacy_systemd_cgroup_controller=yes systemd.unified_cgroup_hierarchy=0" fi popd -pushd src/runtime/config/ -echo "Creating shim debug configuration" -cp "${SHIM_CONFIG_FILE_NAME}" "${SHIM_DBG_CONFIG_FILE_NAME}" -sed -i '/^#enable_debug =/s|^#||g' "${SHIM_DBG_CONFIG_FILE_NAME}" -sed -i '/^#debug_console_enabled =/s|^#||g' "${SHIM_DBG_CONFIG_FILE_NAME}" - -if [ "${CONF_PODS}" == "yes" ]; then - echo "Adding debug igvm to SNP shim debug configuration" - sed -i "s|${IGVM_FILE_NAME}|${IGVM_DBG_FILE_NAME}|g" "${SHIM_DBG_CONFIG_FILE_NAME}" -fi +echo "Building runtime-rs shim binary" +pushd src/runtime-rs/ +make ${runtime_rs_make_flags} popd +create_debug_shim_config() { + local config_dir="$1" + local release_cfg="$2" + local debug_cfg="$3" + + pushd "${config_dir}" + echo "Creating shim debug configuration: ${debug_cfg}" + cp "${release_cfg}" "${debug_cfg}" + # Ensure debug is enabled in the shim config, regardless of whether the + # template uses commented or uncommented keys. 
+ sed -i -E 's|^#?[[:space:]]*enable_debug[[:space:]]*=.*$|enable_debug = true|' "${debug_cfg}" + sed -i -E 's|^#?[[:space:]]*debug_console_enabled[[:space:]]*=.*$|debug_console_enabled = true|' "${debug_cfg}" + + if [ "${CONF_PODS}" == "yes" ]; then + echo "Adding debug igvm to SNP shim debug configuration" + sed -i "s|${IGVM_FILE_NAME}|${IGVM_DBG_FILE_NAME}|g" "${debug_cfg}" + fi + popd +} + +create_debug_shim_config "${CONFIG_DIR_RUNTIME_GO}" "${SHIM_CONFIG_FILE_NAME_RUNTIME_GO}" "${SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_GO}" +create_debug_shim_config "${CONFIG_DIR_RUNTIME_RS}" "${SHIM_CONFIG_FILE_NAME_RUNTIME_RS}" "${SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_RS}" + echo "Building agent binary and generating service files" pushd src/agent/ make ${agent_make_flags} From fd3559cdafb196dabb875cd4ffe5ebc03f44a09b Mon Sep 17 00:00:00 2001 From: Saul Paredes Date: Thu, 26 Mar 2026 09:24:49 -0700 Subject: [PATCH 2/9] node-builder: install runtime-rs Install both runtime-rs and runtime-go configs and binaries side by side: - runtime-go: /usr/local/bin/containerd-shim-kata-v2-go /usr/local/share/defaults/kata-containers/configuration-clh.toml /usr/local/share/defaults/kata-containers/configuration-clh-debug.toml - runtime-rs: /usr/local/bin/containerd-shim-kata-v2-rs /usr/local/share/defaults/kata-containers/configuration-cloud-hypervisor.toml /usr/local/share/defaults/kata-containers/configuration-cloud-hypervisor-debug.toml Also add USE_RUNTIME_RS variable and default to "no". This controls which runtime binary and configuration will be installed to /usr/local/bin/containerd-shim-kata-v2 and /usr/local/share/defaults/kata-containers/configuration.toml respectively. 
Signed-off-by: Saul Paredes --- .../node-builder/azure-linux/Makefile | 1 + .../node-builder/azure-linux/common.sh | 12 +++++- .../azure-linux/package_install.sh | 40 +++++++++++++++---- 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/tools/osbuilder/node-builder/azure-linux/Makefile b/tools/osbuilder/node-builder/azure-linux/Makefile index 85ebf59e2114..74283db511f3 100644 --- a/tools/osbuilder/node-builder/azure-linux/Makefile +++ b/tools/osbuilder/node-builder/azure-linux/Makefile @@ -5,6 +5,7 @@ BUILD_TYPE := release export SHIM_REDEPLOY_CONFIG := yes +export USE_RUNTIME_RS := no ifeq ($(BUILD_TYPE),debug) export AGENT_BUILD_TYPE := debug diff --git a/tools/osbuilder/node-builder/azure-linux/common.sh b/tools/osbuilder/node-builder/azure-linux/common.sh index 50a9d297edb6..aa53562dbdc1 100755 --- a/tools/osbuilder/node-builder/azure-linux/common.sh +++ b/tools/osbuilder/node-builder/azure-linux/common.sh @@ -38,15 +38,23 @@ if [ "${CONF_PODS}" == "yes" ]; then SHIM_BINARIES_PATH="/usr/local/bin" SHIM_BINARY_NAME="containerd-shim-kata-cc-v2" else + + # Toggle the default shim implementation installed + if [ "${USE_RUNTIME_RS}" == "yes" ]; then + SHIM_CONFIG_FILE_NAME="${SHIM_CONFIG_FILE_NAME_RUNTIME_RS}" + SHIM_DBG_CONFIG_FILE_NAME="${SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_RS}" + else # runtime-go + SHIM_CONFIG_FILE_NAME="${SHIM_CONFIG_FILE_NAME_RUNTIME_GO}" + SHIM_DBG_CONFIG_FILE_NAME="${SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_GO}" + fi + INSTALL_PATH_PREFIX="/usr" UVM_TOOLS_PATH_OSB="/opt/kata-containers/uvm/tools/osbuilder" UVM_TOOLS_PATH_SRC="/opt/kata-containers/uvm/src" UVM_PATH_DEFAULT="${INSTALL_PATH_PREFIX}/share/kata-containers" IMG_FILE_NAME="kata-containers.img" SHIM_CONFIG_PATH="${INSTALL_PATH_PREFIX}/share/defaults/kata-containers" - SHIM_CONFIG_FILE_NAME="configuration-clh.toml" SHIM_CONFIG_INST_FILE_NAME="configuration.toml" - SHIM_DBG_CONFIG_FILE_NAME="configuration-clh-debug.toml" 
SHIM_DBG_CONFIG_INST_FILE_NAME="${SHIM_DBG_CONFIG_FILE_NAME}" DEBUGGING_BINARIES_PATH="${INSTALL_PATH_PREFIX}/local/bin" SHIM_BINARIES_PATH="${INSTALL_PATH_PREFIX}/local/bin" diff --git a/tools/osbuilder/node-builder/azure-linux/package_install.sh b/tools/osbuilder/node-builder/azure-linux/package_install.sh index 791cff5d92d2..d1d22733140e 100755 --- a/tools/osbuilder/node-builder/azure-linux/package_install.sh +++ b/tools/osbuilder/node-builder/azure-linux/package_install.sh @@ -29,6 +29,12 @@ mkdir -p "${PREFIX}/${SHIM_CONFIG_PATH}" mkdir -p "${PREFIX}/${DEBUGGING_BINARIES_PATH}" mkdir -p "${PREFIX}/${SHIM_BINARIES_PATH}" +RUNTIME_GO_SHIM="src/runtime/containerd-shim-kata-v2" +RUNTIME_RS_TARGET="target/x86_64-unknown-linux-gnu/release" +RUNTIME_RS_SHIM="${RUNTIME_RS_TARGET}/containerd-shim-kata-v2" +SHIM_BINARY_RUNTIME_GO="${SHIM_BINARY_NAME}-go" +SHIM_BINARY_RUNTIME_RS="${SHIM_BINARY_NAME}-rs" + if [ "${CONF_PODS}" == "yes" ]; then echo "Installing tardev-snapshotter binaries and service file" mkdir -p ${PREFIX}/usr/sbin @@ -45,26 +51,44 @@ if [ "${CONF_PODS}" == "yes" ]; then fi fi -echo "Installing diagnosability binaries (monitor, runtime, collect-data script)" +echo "Installing diagnosability binaries from runtime-go (monitor, runtime, collect-data script)" cp -a --backup=numbered src/runtime/kata-monitor "${PREFIX}/${DEBUGGING_BINARIES_PATH}" cp -a --backup=numbered src/runtime/kata-runtime "${PREFIX}/${DEBUGGING_BINARIES_PATH}" chmod +x src/runtime/data/kata-collect-data.sh cp -a --backup=numbered src/runtime/data/kata-collect-data.sh "${PREFIX}/${DEBUGGING_BINARIES_PATH}" -echo "Installing shim binary" -cp -a --backup=numbered src/runtime/containerd-shim-kata-v2 "${PREFIX}/${SHIM_BINARIES_PATH}"/"${SHIM_BINARY_NAME}" +echo "Installing shim binaries side by side" +cp -a --backup=numbered "${RUNTIME_GO_SHIM}" "${PREFIX}/${SHIM_BINARIES_PATH}/${SHIM_BINARY_RUNTIME_GO}" +cp -a --backup=numbered "${RUNTIME_RS_SHIM}" 
"${PREFIX}/${SHIM_BINARIES_PATH}/${SHIM_BINARY_RUNTIME_RS}" + +default_shim_binary="${SHIM_BINARY_RUNTIME_GO}" +shim_config_src_dir="${CONFIG_DIR_RUNTIME_GO}" + +if [ "${USE_RUNTIME_RS}" == "yes" ]; then + default_shim_binary="${SHIM_BINARY_RUNTIME_RS}" + shim_config_src_dir="${CONFIG_DIR_RUNTIME_RS}" +fi + +echo "Installing default shim binary: ${default_shim_binary}" +cp -a --backup=numbered "${PREFIX}/${SHIM_BINARIES_PATH}/${default_shim_binary}" "${PREFIX}/${SHIM_BINARIES_PATH}/${SHIM_BINARY_NAME}" if [ "${SHIM_REDEPLOY_CONFIG}" == "yes" ]; then - echo "Installing shim configuration" - cp -a --backup=numbered src/runtime/config/"${SHIM_CONFIG_FILE_NAME}" "${PREFIX}/${SHIM_CONFIG_PATH}/${SHIM_CONFIG_INST_FILE_NAME}" - cp -a --backup=numbered src/runtime/config/"${SHIM_DBG_CONFIG_FILE_NAME}" "${PREFIX}/${SHIM_CONFIG_PATH}/${SHIM_DBG_CONFIG_INST_FILE_NAME}" + + echo "Installing configurations side by side" + cp -a --backup=numbered "${CONFIG_DIR_RUNTIME_GO}/${SHIM_CONFIG_FILE_NAME_RUNTIME_GO}" "${PREFIX}/${SHIM_CONFIG_PATH}/${SHIM_CONFIG_FILE_NAME_RUNTIME_GO}" + cp -a --backup=numbered "${CONFIG_DIR_RUNTIME_RS}/${SHIM_CONFIG_FILE_NAME_RUNTIME_RS}" "${PREFIX}/${SHIM_CONFIG_PATH}/${SHIM_CONFIG_FILE_NAME_RUNTIME_RS}" + cp -a --backup=numbered "${CONFIG_DIR_RUNTIME_GO}/${SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_GO}" "${PREFIX}/${SHIM_CONFIG_PATH}/${SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_GO}" + cp -a --backup=numbered "${CONFIG_DIR_RUNTIME_RS}/${SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_RS}" "${PREFIX}/${SHIM_CONFIG_PATH}/${SHIM_DBG_CONFIG_FILE_NAME_RUNTIME_RS}" + + echo "Installing default shim configuration: ${SHIM_CONFIG_FILE_NAME}" + cp -a --backup=numbered "${shim_config_src_dir}/${SHIM_CONFIG_FILE_NAME}" "${PREFIX}/${SHIM_CONFIG_PATH}/${SHIM_CONFIG_INST_FILE_NAME}" if [ "${SHIM_USE_DEBUG_CONFIG}" == "yes" ]; then # We simply override the release config with the debug config, # which is probably fine when debugging. 
Not symlinking as that # would create cycles the next time this script is called. - echo "Overriding shim configuration with debug configuration" - cp -a --backup=numbered src/runtime/config/"${SHIM_DBG_CONFIG_FILE_NAME}" "${PREFIX}/${SHIM_CONFIG_PATH}/${SHIM_CONFIG_INST_FILE_NAME}" + echo "Overriding shim configuration with debug configuration: ${SHIM_DBG_CONFIG_FILE_NAME}" + cp -a --backup=numbered "${shim_config_src_dir}/${SHIM_DBG_CONFIG_FILE_NAME}" "${PREFIX}/${SHIM_CONFIG_PATH}/${SHIM_CONFIG_INST_FILE_NAME}" fi else echo "Skipping installation of shim configuration" From d2a870635c1ff2bcff9f388c90e9e2b3653ec4e4 Mon Sep 17 00:00:00 2001 From: Saul Paredes Date: Mon, 30 Mar 2026 08:47:58 -0700 Subject: [PATCH 3/9] runtime-rs: Resolve high UVM memory footprint This is a port from https://github.com/microsoft/kata-containers/commit/b03db3e3c0e2708e009aa108500601f533ae7b20 into runtime-rs Bug: https://microsoft.visualstudio.com/OS/_workitems/edit/43668151 Rationale: This is a temporary solution for optimizing memory usage for the current mechanism of requesting resources through pod Limit annotations: - if no Limits are specified and hence WorkloadMemMB is 0, set a default value 'StaticWorkloadDefaultMem' to allocate a default amount of memory for use for containers in the sandbox in addition to the base memory - if Limits are specified, the base memory and the sum of Limits are allocated. 
The end user needs to be aware of the minimum memory requirements for their pods, otherwise the pod will be stuck in the ContainerCreating state Testing: Manual testing, creating pods with Limits and without limits, and with two containers where each container has a limit, tested with integration in a SPEC file where the config variables were set via environment variables via the make command Signed-off-by: Saul Paredes --- src/libs/kata-types/src/config/runtime.rs | 4 ++++ src/runtime-rs/Makefile | 5 ++++- .../config/configuration-cloud-hypervisor.toml.in | 5 +++++ src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs | 7 +++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/libs/kata-types/src/config/runtime.rs b/src/libs/kata-types/src/config/runtime.rs index 077c6b96db1a..6c6752efe202 100644 --- a/src/libs/kata-types/src/config/runtime.rs +++ b/src/libs/kata-types/src/config/runtime.rs @@ -123,6 +123,10 @@ pub struct Runtime { #[serde(default)] pub static_sandbox_resource_mgmt: bool, + /// Memory to allocate for workloads within the sandbox when workload memory is unspecified + #[serde(default)] + pub static_sandbox_default_workload_mem: u32, + /// Determines whether container seccomp profiles are passed to the virtual machine and /// applied by the kata agent. If set to true, seccomp is not applied within the guest. 
#[serde(default)] diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index eea49c54f9ab..9b025640973c 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -163,7 +163,7 @@ DEFVCPUS := 1 ##VAR DEFMAXVCPUS= Default maximum number of vCPUs DEFMAXVCPUS := 0 ##VAR DEFMEMSZ= Default memory size in MiB -DEFMEMSZ := 2048 +DEFMEMSZ ?= 2048 ##VAR DEFMEMSLOTS= Default memory slots # Cases to consider : # - nvdimm rootfs image @@ -214,6 +214,8 @@ DEFVFIOMODE := guest-kernel DEFBINDMOUNTS := [] DEFDANCONF := /run/kata-containers/dans DEFFORCEGUESTPULL := false +# Default memory for workloads within the sandbox when no workload memory is requested. +DEFSTATICSANDBOXWORKLOADMEM ?= 2048 QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT := 4050 # Create Container Timeout in seconds @@ -622,6 +624,7 @@ USER_VARS += KATA_INSTALL_OWNER USER_VARS += KATA_INSTALL_CFG_PERMS USER_VARS += DEFDANCONF USER_VARS += DEFFORCEGUESTPULL +USER_VARS += DEFSTATICSANDBOXWORKLOADMEM USER_VARS += QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT USER_VARS += DEFCREATECONTAINERTIMEOUT USER_VARS += DEFCREATECONTAINERTIMEOUT_COCO diff --git a/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in b/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in index 4b981d47c099..864477da99d0 100644 --- a/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in +++ b/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in @@ -522,6 +522,11 @@ enable_pprof = false # - When running single containers using a tool like ctr, container sizing information will be available. static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_CLH@ +# If set, the runtime will use the value as the default workload memory in MB for the sandbox when no workload memory request is passed +# down to the shim via the OCI when static sandbox resource management is enabled. With this, we ensure that workloads have a proper +# default amount of memory available within the sandbox. 
+static_sandbox_default_workload_mem = @DEFSTATICSANDBOXWORKLOADMEM@ + # If specified, sandbox_bind_mounts identifieds host paths to be mounted(ro, rw) into the sandboxes shared path. # This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. # If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index 9b56af9734fa..f5d5e2809ca3 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -143,6 +143,13 @@ impl InitialSizeManager { if self.resource.vcpu > 0.0 { info!(sl!(), "resource with vcpu {}", self.resource.vcpu); } + + if config.runtime.static_sandbox_resource_mgmt { + if self.resource.mem_mb == 0 { + self.resource.mem_mb = config.runtime.static_sandbox_default_workload_mem; + } + } + self.resource.orig_toml_default_mem = hv.memory_info.default_memory; if self.resource.mem_mb > 0 { // since the memory overhead introduced by kata-agent and system components From 6d2d0d00efc7963e6c46ed9bf77fbb713b5a8038 Mon Sep 17 00:00:00 2001 From: Saul Paredes Date: Mon, 30 Mar 2026 08:52:47 -0700 Subject: [PATCH 4/9] runtime-rs: improved memory overhead management This is a port from https://github.com/microsoft/kata-containers/commit/7ddec33642c9a307d4af79078c2f15be887bec86 into runtime-rs After these changes: 1. The value of the K8s runtime class memory overhead: - Covers the memory usage from all the Host-side components (mainly the Kata Shim and the VMM). - Doesn't include the memory usage from any Guest-side components. 2. The value of a pod memory limit specified by the user: - Is equal to the memory size of the Pod VM. 
- Includes the memory usage from all the Guest-side components (mainly user's workload, the Guest kernel, and the Kata Agent) - Doesn't include the memory usage from any Host-side components. Signed-off-by: Saul Paredes --- src/libs/kata-types/src/config/default.rs | 2 +- src/libs/kata-types/src/config/hypervisor/ch.rs | 3 --- src/libs/kata-types/src/config/hypervisor/mod.rs | 9 ++------- .../osbuilder/node-builder/azure-linux/package_build.sh | 4 +++- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/libs/kata-types/src/config/default.rs b/src/libs/kata-types/src/config/default.rs index 7a4e42b80eb3..2839de928d5a 100644 --- a/src/libs/kata-types/src/config/default.rs +++ b/src/libs/kata-types/src/config/default.rs @@ -93,7 +93,7 @@ pub const DEFAULT_CH_MEMORY_SLOTS: u32 = 128; pub const DEFAULT_CH_PCI_BRIDGES: u32 = 2; pub const MAX_CH_PCI_BRIDGES: u32 = 5; pub const MAX_CH_VCPUS: u32 = 256; -pub const MIN_CH_MEMORY_SIZE_MB: u32 = 64; +pub const MIN_CH_MEMORY_SIZE_MB: u32 = 0; //Default configuration for firecracker pub const DEFAULT_FIRECRACKER_ENTROPY_SOURCE: &str = "/dev/urandom"; diff --git a/src/libs/kata-types/src/config/hypervisor/ch.rs b/src/libs/kata-types/src/config/hypervisor/ch.rs index 2d017ae211ce..15432277fa11 100644 --- a/src/libs/kata-types/src/config/hypervisor/ch.rs +++ b/src/libs/kata-types/src/config/hypervisor/ch.rs @@ -79,9 +79,6 @@ impl ConfigPlugin for CloudHypervisorConfig { ch.machine_info.entropy_source = default::DEFAULT_CH_ENTROPY_SOURCE.to_string(); } - if ch.memory_info.default_memory == 0 { - ch.memory_info.default_memory = default::DEFAULT_CH_MEMORY_SIZE_MB; - } if ch.memory_info.memory_slots == 0 { ch.memory_info.memory_slots = default::DEFAULT_CH_MEMORY_SLOTS; } diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs index 20be1909b2b8..894d825bc6e2 100644 --- a/src/libs/kata-types/src/config/hypervisor/mod.rs +++ 
b/src/libs/kata-types/src/config/hypervisor/mod.rs @@ -1031,19 +1031,14 @@ impl MemoryInfo { /// Validates the memory configuration information. /// - /// This ensures that critical memory parameters like `default_memory` - /// and `memory_slots` are non-zero, and checks the validity of + /// This ensures that critical memory parameters like `memory_slots` are + /// non-zero, and checks the validity of /// the memory backend file path. pub fn validate(&self) -> Result<()> { validate_path!( self.file_mem_backend, "Memory backend file {} is invalid: {}" )?; - if self.default_memory == 0 { - return Err(std::io::Error::other( - "Configured memory size for guest VM is zero", - )); - } if self.memory_slots == 0 { return Err(std::io::Error::other( "Configured memory slots for guest VM are zero", diff --git a/tools/osbuilder/node-builder/azure-linux/package_build.sh b/tools/osbuilder/node-builder/azure-linux/package_build.sh index 6d0f85174147..762340866d67 100755 --- a/tools/osbuilder/node-builder/azure-linux/package_build.sh +++ b/tools/osbuilder/node-builder/azure-linux/package_build.sh @@ -40,7 +40,9 @@ runtime_rs_make_flags="BUILD_TYPE=release \ QEMUCMD= \ FCCMD= \ DEFVIRTIOFSDAEMON=${VIRTIOFSD_BINARY_LOCATION} \ - PREFIX=${INSTALL_PATH_PREFIX}" + PREFIX=${INSTALL_PATH_PREFIX}" \ + DEFMEMSZ=0 \ + DEFSTATICSANDBOXWORKLOADMEM=512 # - for vanilla Kata we use the kernel binary. For ConfPods we use IGVM, so no need to provide kernel path. # - for vanilla Kata we explicitly set DEFSTATICRESOURCEMGMT_CLH. For ConfPods, From 701b396a2c7e6c95b65d8e161ec80b8d68ed00b9 Mon Sep 17 00:00:00 2001 From: Saul Paredes Date: Mon, 30 Mar 2026 08:54:05 -0700 Subject: [PATCH 5/9] runtime-rs: Allocate default workload vcpus This is a port from https://github.com/microsoft/kata-containers/commit/9af9844bc7331d0a37d0159f299ab3fb0e443669 Plus ports an existing behaviour from runtime-go to also add the vcpus. 
See https://github.com/fidencio/kata-containers/blob/e2476f587c472d5d217df9c75cdb80193dd85994/src/runtime/pkg/oci/utils.go#L1232 - similar to the static_sandbox_default_workload_mem option, assign a default number of vcpus to the VM when no limits are given, 1 vcpu in this case - similar to commit c7b8ee9, do not allocate additional vcpus when limits are provided Signed-off-by: Saul Paredes --- src/libs/kata-types/src/config/default.rs | 2 +- src/libs/kata-types/src/config/runtime.rs | 5 +++++ src/runtime-rs/Makefile | 6 ++++-- .../config/configuration-cloud-hypervisor.toml.in | 5 +++++ .../crates/resource/src/cpu_mem/initial_size.rs | 8 ++++++++ tools/osbuilder/node-builder/azure-linux/package_build.sh | 6 ++++-- 6 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/libs/kata-types/src/config/default.rs b/src/libs/kata-types/src/config/default.rs index 2839de928d5a..8cbe988a2d49 100644 --- a/src/libs/kata-types/src/config/default.rs +++ b/src/libs/kata-types/src/config/default.rs @@ -54,7 +54,7 @@ pub const MAX_SHARED_9PFS_SIZE_MB: u32 = 8 * 1024 * 1024; pub const DEFAULT_GUEST_HOOK_PATH: &str = "/opt/kata/hooks"; pub const DEFAULT_GUEST_DNS_FILE: &str = "/etc/resolv.conf"; -pub const DEFAULT_GUEST_VCPUS: u32 = 1; +pub const DEFAULT_GUEST_VCPUS: u32 = 0; // Default configuration for dragonball pub const DEFAULT_DRAGONBALL_GUEST_KERNEL_IMAGE: &str = "vmlinuz"; diff --git a/src/libs/kata-types/src/config/runtime.rs b/src/libs/kata-types/src/config/runtime.rs index 6c6752efe202..1d91010ef4a6 100644 --- a/src/libs/kata-types/src/config/runtime.rs +++ b/src/libs/kata-types/src/config/runtime.rs @@ -127,6 +127,11 @@ pub struct Runtime { #[serde(default)] pub static_sandbox_default_workload_mem: u32, + /// Default workload vcpus added to the sandbox when static resource management + /// is enabled and no explicit workload vcpu limit was provided. 
+ #[serde(default)] + pub static_sandbox_default_workload_vcpus: f32, + /// Determines whether container seccomp profiles are passed to the virtual machine and /// applied by the kata agent. If set to true, seccomp is not applied within the guest. #[serde(default)] diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index 9b025640973c..1de2febcecf5 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -159,7 +159,7 @@ FIRMWARE_SNP_PATH := $(PREFIXDEPS)/share/ovmf/AMDSEV.fd FIRMWARE_VOLUME_SNP_PATH := ##VAR DEFVCPUS= Default number of vCPUs -DEFVCPUS := 1 +DEFVCPUS ?= 1 ##VAR DEFMAXVCPUS= Default maximum number of vCPUs DEFMAXVCPUS := 0 ##VAR DEFMEMSZ= Default memory size in MiB @@ -214,8 +214,9 @@ DEFVFIOMODE := guest-kernel DEFBINDMOUNTS := [] DEFDANCONF := /run/kata-containers/dans DEFFORCEGUESTPULL := false -# Default memory for workloads within the sandbox when no workload memory is requested. +# Default memory and vcpus for workloads within the sandbox when no workload values are requested. DEFSTATICSANDBOXWORKLOADMEM ?= 2048 +DEFSTATICSANDBOXWORKLOADVCPUS ?= 1 QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT := 4050 # Create Container Timeout in seconds @@ -625,6 +626,7 @@ USER_VARS += KATA_INSTALL_CFG_PERMS USER_VARS += DEFDANCONF USER_VARS += DEFFORCEGUESTPULL USER_VARS += DEFSTATICSANDBOXWORKLOADMEM +USER_VARS += DEFSTATICSANDBOXWORKLOADVCPUS USER_VARS += QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT USER_VARS += DEFCREATECONTAINERTIMEOUT USER_VARS += DEFCREATECONTAINERTIMEOUT_COCO diff --git a/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in b/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in index 864477da99d0..7497525845d8 100644 --- a/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in +++ b/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in @@ -527,6 +527,11 @@ static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_CLH@ # default amount of memory available within the sandbox. 
static_sandbox_default_workload_mem = @DEFSTATICSANDBOXWORKLOADMEM@ +# If set, the runtime will use the value as the default number of vcpus for the sandbox when no workload vcpu request is passed +# down to the shim via the OCI when static sandbox resource management is enabled. With this, we ensure that workloads have a proper +# default amount of vcpus available within the sandbox. +static_sandbox_default_workload_vcpus = @DEFSTATICSANDBOXWORKLOADVCPUS@ + # If specified, sandbox_bind_mounts identifieds host paths to be mounted(ro, rw) into the sandboxes shared path. # This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. # If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index f5d5e2809ca3..92a8749f097d 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -148,6 +148,10 @@ impl InitialSizeManager { if self.resource.mem_mb == 0 { self.resource.mem_mb = config.runtime.static_sandbox_default_workload_mem; } + + if self.resource.vcpu == 0.0 { + self.resource.vcpu = config.runtime.static_sandbox_default_workload_vcpus; + } } self.resource.orig_toml_default_mem = hv.memory_info.default_memory; @@ -159,6 +163,10 @@ impl InitialSizeManager { // use memory as they orignally expected, it would be easy to OOM.) 
hv.memory_info.default_memory += self.resource.mem_mb; } + + if self.resource.vcpu > 0.0 { + hv.cpu_info.default_vcpus += self.resource.vcpu; + } Ok(()) } diff --git a/tools/osbuilder/node-builder/azure-linux/package_build.sh b/tools/osbuilder/node-builder/azure-linux/package_build.sh index 762340866d67..1b08ea295382 100755 --- a/tools/osbuilder/node-builder/azure-linux/package_build.sh +++ b/tools/osbuilder/node-builder/azure-linux/package_build.sh @@ -40,9 +40,11 @@ runtime_rs_make_flags="BUILD_TYPE=release \ QEMUCMD= \ FCCMD= \ DEFVIRTIOFSDAEMON=${VIRTIOFSD_BINARY_LOCATION} \ - PREFIX=${INSTALL_PATH_PREFIX}" \ + PREFIX=${INSTALL_PATH_PREFIX} \ DEFMEMSZ=0 \ - DEFSTATICSANDBOXWORKLOADMEM=512 + DEFSTATICSANDBOXWORKLOADMEM=512 \ + DEFVCPUS=0 \ + DEFSTATICSANDBOXWORKLOADVCPUS=1" # - for vanilla Kata we use the kernel binary. For ConfPods we use IGVM, so no need to provide kernel path. # - for vanilla Kata we explicitly set DEFSTATICRESOURCEMGMT_CLH. For ConfPods, From fdd1d73959b548d2aa1f2904b1aeda5f48dda229 Mon Sep 17 00:00:00 2001 From: Saul Paredes Date: Mon, 30 Mar 2026 14:59:59 -0700 Subject: [PATCH 6/9] runtime-rs: add test coverage for static resource management If using static management and: - initial size manager uses 0 for CPU or memory, we add default static values to the hv config - if initial size manager uses non-zero values for CPU or memory, we add those values to the hv config Signed-off-by: Saul Paredes --- .../resource/src/cpu_mem/initial_size.rs | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index 92a8749f097d..9b9118ec92ce 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -212,6 +212,7 @@ fn get_sizing_info(annotation: Annotation) -> Result<(u64, i64, i64)> { mod tests { use super::*; use 
kata_types::annotations::cri_containerd; + use kata_types::config::Hypervisor; use oci_spec::runtime::{LinuxBuilder, LinuxMemory, LinuxMemoryBuilder, LinuxResourcesBuilder}; use std::collections::HashMap; #[derive(Clone)] @@ -381,4 +382,72 @@ mod tests { ); } } + + fn get_config_for_setup_tests( + base_vcpus: f32, + base_mem_mb: u32, + static_mgmt: bool, + default_workload_vcpus: f32, + default_workload_mem_mb: u32, + ) -> TomlConfig { + let hypervisor_name = "test-hv".to_string(); + let mut config = TomlConfig::default(); + config.runtime.hypervisor_name = hypervisor_name.clone(); + config.runtime.static_sandbox_resource_mgmt = static_mgmt; + config.runtime.static_sandbox_default_workload_vcpus = default_workload_vcpus; + config.runtime.static_sandbox_default_workload_mem = default_workload_mem_mb; + + let mut hv = Hypervisor::default(); + hv.cpu_info.default_vcpus = base_vcpus; + hv.memory_info.default_memory = base_mem_mb; + config.hypervisor.insert(hypervisor_name, hv); + + config + } + + #[test] + fn test_setup_config_static_defaults_unset_resources() { + let mut manager = InitialSizeManager { + resource: InitialSize { + vcpu: 0.0, + mem_mb: 0, + orig_toml_default_mem: 0, + }, + }; + let mut config = get_config_for_setup_tests(2.0, 256, true, 1.0, 512); + + manager.setup_config(&mut config).unwrap(); + + let hv = config + .hypervisor + .get(&config.runtime.hypervisor_name) + .unwrap(); + assert_eq!(hv.cpu_info.default_vcpus, 3.0); + assert_eq!(hv.memory_info.default_memory, 768); + assert_eq!(manager.get_orig_toml_default_mem(), 256); + assert_eq!(manager.workload_mem_mb(), 512); + } + + #[test] + fn test_setup_config_static_preserves_explicit_resources() { + let mut manager = InitialSizeManager { + resource: InitialSize { + vcpu: 1.5, + mem_mb: 1024, + orig_toml_default_mem: 0, + }, + }; + let mut config = get_config_for_setup_tests(2.0, 256, true, 3.0, 512); + + manager.setup_config(&mut config).unwrap(); + + let hv = config + .hypervisor + 
.get(&config.runtime.hypervisor_name) + .unwrap(); + assert_eq!(hv.cpu_info.default_vcpus, 3.5); + assert_eq!(hv.memory_info.default_memory, 1280); + assert_eq!(manager.get_orig_toml_default_mem(), 256); + assert_eq!(manager.workload_mem_mb(), 1024); + } } From e7aad900bd4c0e62ab26f64c50e50361769c13b3 Mon Sep 17 00:00:00 2001 From: Saul Paredes Date: Thu, 19 Mar 2026 12:08:50 -0700 Subject: [PATCH 7/9] runtime-rs: Enforce that OCI memory limit exceeds 128MB baseline This is a port from https://github.com/microsoft/kata-containers/commit/c06b4704fbb730106341b33ce55ae2198ee300e4 into runtime-rs For our Kata UVM, we know we need at least 128MB of memory to prevent instability in the guest. Enforce this constraint with a descriptive error to prevent users from destabilizing the UVM with faulty k8s configurations. Signed-off-by: Saul Paredes --- src/libs/kata-types/src/config/runtime.rs | 6 ++++++ src/runtime-rs/Makefile | 3 +++ .../config/configuration-cloud-hypervisor.toml.in | 5 +++++ .../crates/resource/src/cpu_mem/initial_size.rs | 8 ++++++++ src/runtime-rs/crates/runtimes/src/manager.rs | 12 +++++++++++- .../node-builder/azure-linux/package_build.sh | 4 ++-- 6 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/libs/kata-types/src/config/runtime.rs b/src/libs/kata-types/src/config/runtime.rs index 1d91010ef4a6..906fcc78cce3 100644 --- a/src/libs/kata-types/src/config/runtime.rs +++ b/src/libs/kata-types/src/config/runtime.rs @@ -132,6 +132,12 @@ pub struct Runtime { #[serde(default)] pub static_sandbox_default_workload_vcpus: f32, + /// Minimum memory (in MiB) to enforce for pods that explicitly set a memory limit via + /// resources.limits.memory. If the requested memory is below this value the sandbox + /// creation will fail with a descriptive error. 0 (the default) disables the check. 
+ #[serde(default)] + pub sandbox_workload_mem_min: u32, + /// Determines whether container seccomp profiles are passed to the virtual machine and /// applied by the kata agent. If set to true, seccomp is not applied within the guest. #[serde(default)] diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index 1de2febcecf5..bfed402c228e 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -217,6 +217,8 @@ DEFFORCEGUESTPULL := false # Default memory and vcpus for workloads within the sandbox when no workload values are requested. DEFSTATICSANDBOXWORKLOADMEM ?= 2048 DEFSTATICSANDBOXWORKLOADVCPUS ?= 1 +# Minimum memory (in MiB) a pod must request when explicitly setting a memory limit. +DEFSANDBOXWORKLOADMEMMIN ?= 128 QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT := 4050 # Create Container Timeout in seconds @@ -627,6 +629,7 @@ USER_VARS += DEFDANCONF USER_VARS += DEFFORCEGUESTPULL USER_VARS += DEFSTATICSANDBOXWORKLOADMEM USER_VARS += DEFSTATICSANDBOXWORKLOADVCPUS +USER_VARS += DEFSANDBOXWORKLOADMEMMIN USER_VARS += QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT USER_VARS += DEFCREATECONTAINERTIMEOUT USER_VARS += DEFCREATECONTAINERTIMEOUT_COCO diff --git a/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in b/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in index 7497525845d8..65bfefeef535 100644 --- a/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in +++ b/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in @@ -532,6 +532,11 @@ static_sandbox_default_workload_mem = @DEFSTATICSANDBOXWORKLOADMEM@ # default amount of vcpus available within the sandbox. static_sandbox_default_workload_vcpus = @DEFSTATICSANDBOXWORKLOADVCPUS@ +# The runtime will enforce that pods explicitly setting memory limits using +# resources.limits.memory allow at least this amount of memory in MiB so +# that the sandbox can properly start. 
+sandbox_workload_mem_min = @DEFSANDBOXWORKLOADMEMMIN@ + # If specified, sandbox_bind_mounts identifieds host paths to be mounted(ro, rw) into the sandboxes shared path. # This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. # If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index 9b9118ec92ce..b7d003c30fd5 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -173,6 +173,14 @@ impl InitialSizeManager { pub fn get_orig_toml_default_mem(&self) -> u32 { self.resource.orig_toml_default_mem } + + /// Returns the effective workload memory for the pod/container (in MiB). + /// This may be either explicitly requested in the spec or defaulted by static + /// sandbox resource management. 0 means no explicit limit was set and no default + /// workload memory was applied. 
+    pub fn workload_mem_mb(&self) -> u32 {
+        self.resource.mem_mb
+    }
 }
 
 fn get_nr_vcpu(resource: &LinuxContainerCpuResources) -> f32 {
diff --git a/src/runtime-rs/crates/runtimes/src/manager.rs b/src/runtime-rs/crates/runtimes/src/manager.rs
index e65944e62ac1..eb91cf5265b7 100644
--- a/src/runtime-rs/crates/runtimes/src/manager.rs
+++ b/src/runtime-rs/crates/runtimes/src/manager.rs
@@ -33,13 +33,13 @@ use netns_rs::{Env, NetNs};
 use nix::{sys::statfs, unistd::User};
 use oci_spec::runtime as oci;
 use persist::sandbox_persist::Persist;
+use protobuf::Message as ProtobufMessage;
 use resource::{
     cpu_mem::initial_size::InitialSizeManager,
     network::{dan_config_path, generate_netns_name},
 };
 use runtime_spec as spec;
 use shim_interface::shim_mgmt::ERR_NO_SHIM_SERVER;
-use protobuf::Message as ProtobufMessage;
 use std::{
     collections::HashMap,
     env,
@@ -218,6 +218,16 @@ impl RuntimeHandlerManagerInner {
             .setup_config(&mut config)
             .context("failed to setup static resource mgmt config")?;
 
+        // A workload memory of 0 MiB means no limit was requested or defaulted,
+        // so only a non-zero value below the configured minimum is rejected.
+        let mem_min = config.runtime.sandbox_workload_mem_min;
+        let workload_mem = initial_size_manager.workload_mem_mb();
+        if workload_mem > 0 && workload_mem < mem_min {
+            return Err(anyhow!(
+                "pod memory limit too low: minimum {mem_min}MiB, got {workload_mem}MiB"
+            ));
+        }
+
         update_component_log_level(&config);
 
         let dan_path = dan_config_path(&config, &self.id);
diff --git a/tools/osbuilder/node-builder/azure-linux/package_build.sh b/tools/osbuilder/node-builder/azure-linux/package_build.sh
index 1b08ea295382..00364734a2b5 100755
--- a/tools/osbuilder/node-builder/azure-linux/package_build.sh
+++ b/tools/osbuilder/node-builder/azure-linux/package_build.sh
@@ -53,10 +53,10 @@ runtime_rs_make_flags="BUILD_TYPE=release \
 # as we have a single CLH binary for both vanilla Kata and ConfPods
 if [ "${CONF_PODS}" == "no" ]; then
 	runtime_go_make_flags+=" DEFSTATICRESOURCEMGMT_CLH=true KERNELPATH_CLH=${KERNEL_BINARY_LOCATION} DEFSANDBOXWORKLOADMEMMIN=128"
-	runtime_rs_make_flags+=" DEFSTATICRESOURCEMGMT_CLH=true
KERNELPATH_CLH=${KERNEL_BINARY_LOCATION}" + runtime_rs_make_flags+=" DEFSTATICRESOURCEMGMT_CLH=true KERNELPATH_CLH=${KERNEL_BINARY_LOCATION} DEFSANDBOXWORKLOADMEMMIN=128" else runtime_go_make_flags+=" CLHPATH=${CLOUD_HYPERVISOR_LOCATION} DEFSANDBOXWORKLOADMEMMIN=192" - runtime_rs_make_flags+=" CLHPATH=${CLOUD_HYPERVISOR_LOCATION}" + runtime_rs_make_flags+=" CLHPATH=${CLOUD_HYPERVISOR_LOCATION} DEFSANDBOXWORKLOADMEMMIN=192" fi # On Mariner 3.0 we use cgroupsv2 with a single sandbox cgroup From e87f816cf6eee9d00c6dcc4ba398d7829fc5271b Mon Sep 17 00:00:00 2001 From: Saul Paredes Date: Thu, 19 Mar 2026 13:14:30 -0700 Subject: [PATCH 8/9] network: preseed default-gateway neighbor This is a port from https://github.com/microsoft/kata-containers/commit/a136359ce5eea316f5b89889b2a7ea93736ad767 into runtime-rs This change mirrors host networking into the guest as before, but now also includes the default gateway neighbor entry for each interface. Pods using overlay/synthetic gateways (e.g., 169.254.1.1) can hit a first-connect race while the guest performs the initial ARP. Preseeding the gateway neighbor removes that latency and makes early connections (e.g., to the API Service) deterministic. 
Signed-off-by: Saul Paredes
---
 .../network_info/network_info_from_link.rs | 62 ++++++++++++++++---
 1 file changed, 55 insertions(+), 7 deletions(-)

diff --git a/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs b/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs
index 32c38c24bda1..e046a72230b6 100644
--- a/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs
+++ b/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs
@@ -5,6 +5,7 @@
 //
 
 use std::{
+    collections::HashSet,
     convert::TryFrom,
     net::{IpAddr, Ipv4Addr, Ipv6Addr},
 };
@@ -42,6 +43,12 @@ impl NetworkInfoFromLink {
     ) -> Result<Self> {
         let attrs = link.attrs();
         let name = &attrs.name;
+        let routes = handle_routes(handle, attrs)
+            .await
+            .context("handle routes")?;
+        let neighs = handle_neighbors(handle, attrs, &routes)
+            .await
+            .context("handle neighbours")?;
 
         Ok(Self {
             interface: Interface {
@@ -54,12 +61,8 @@ impl NetworkInfoFromLink {
                 field_type: link.r#type().to_string(),
                 raw_flags: attrs.flags & libc::IFF_NOARP as u32,
             },
-            neighs: handle_neighbors(handle, attrs)
-                .await
-                .context("handle neighbours")?,
-            routes: handle_routes(handle, attrs)
-                .await
-                .context("handle routes")?,
+            neighs,
+            routes,
         })
     }
 }
@@ -147,11 +150,53 @@ fn generate_neigh(name: &str, n: &NeighbourMessage) -> Result<ARPNeighbor> {
     Ok(neigh)
 }
 
+/// Collects the gateway addresses of all default routes in `routes`.
+///
+/// A route is treated as a default route when its destination is empty or
+/// all-zero ("0.0.0.0" or "::"). Routes without a gateway are skipped.
+fn gateway_set_from_routes(routes: &[Route]) -> HashSet<String> {
+    routes
+        .iter()
+        .filter(|route| {
+            // Default routes may be represented with an empty destination string or
+            // an all-zero destination.
+            !route.gateway.is_empty()
+                && (route.dest.is_empty() || route.dest == "0.0.0.0" || route.dest == "::")
+        })
+        .map(|route| route.gateway.clone())
+        .collect()
+}
+
+/// Returns whether `neigh` should be kept in the list built by `handle_neighbors`.
+///
+/// Entries without a link-layer address are rejected; of the rest, permanent
+/// entries and entries whose IP address is in `gateway_set` are accepted.
+fn valid_guest_neighbor(neigh: &ARPNeighbor, gateway_set: &HashSet<String>) -> bool {
+    // We need a MAC address in the guest ARP table.
+    if neigh.ll_addr.is_empty() {
+        return false;
+    }
+
+    // Keep all static entries.
+    if neigh.state == libc::NUD_PERMANENT as i32 {
+        return true;
+    }
+
+    // Gateway-only exception: allow default-gateway neighbors.
+    neigh
+        .to_ip_address
+        .as_ref()
+        .map(|ip| gateway_set.contains(&ip.address))
+        .unwrap_or(false)
+}
+
 async fn handle_neighbors(
     handle: &rtnetlink::Handle,
     attrs: &LinkAttrs,
+    routes: &[Route],
 ) -> Result<Vec<ARPNeighbor>> {
     let name = &attrs.name;
+    let gateway_set = gateway_set_from_routes(routes);
     let mut neighs = vec![];
     let mut neigh_msg_list = handle.neighbours().get().execute();
     while let Some(neigh) = neigh_msg_list
@@ -161,7 +206,10 @@ async fn handle_neighbors(
     {
         // get neigh filter with index
         if neigh.header.ifindex == attrs.index {
-            neighs.push(generate_neigh(name, &neigh).context("generate neigh")?)
+            let neigh = generate_neigh(name, &neigh).context("generate neigh")?;
+            if valid_guest_neighbor(&neigh, &gateway_set) {
+                neighs.push(neigh);
+            }
         }
     }
     Ok(neighs)

From a326d5b242b7ff1fe502cafd3ffec92f05c7d674 Mon Sep 17 00:00:00 2001
From: Saul Paredes
Date: Thu, 26 Mar 2026 16:52:47 -0700
Subject: [PATCH 9/9] TEMP: build-runtime-rs by default

so we can test end to end.
To remove this commit before merging.

Signed-off-by: Saul Paredes
---
 tools/osbuilder/node-builder/azure-linux/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/osbuilder/node-builder/azure-linux/Makefile b/tools/osbuilder/node-builder/azure-linux/Makefile
index 74283db511f3..30318bc80234 100644
--- a/tools/osbuilder/node-builder/azure-linux/Makefile
+++ b/tools/osbuilder/node-builder/azure-linux/Makefile
@@ -5,7 +5,7 @@ BUILD_TYPE := release
 
 export SHIM_REDEPLOY_CONFIG := yes
-export USE_RUNTIME_RS := no
+export USE_RUNTIME_RS := yes
 
 ifeq ($(BUILD_TYPE),debug)
 	export AGENT_BUILD_TYPE := debug