Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions scripts/perftune.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,8 @@ def slaves(self, nic):
"""
return iter(self.__slaves[nic])

AzurePerfTuner(self.nics).tune()
Copy link

Copilot AI Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line is unreachable code. It appears after a return statement in the slaves method and will never be executed. This line should be moved to the tune method of the NetPerfTuner class (after line 666) to actually invoke the Azure tuning functionality.

Copilot uses AI. Check for mistakes.

#### Protected methods ##########################
def _get_irqs(self):
"""
Expand Down Expand Up @@ -1206,6 +1208,139 @@ def __get_rx_queue_count(self, iface):

return min(self.__max_rx_queue_count(iface), rx_queues_count)

class AzurePerfTuner(object):
def __init__(self, nics):
self.nics = nics
# Known drivers for Azure Accelerated Networking VFs
self.vf_drivers = {'mlx4_core', 'mlx4_en', 'mlx5_core', 'mana'}

def __get_driver_name(self, nic):
"""
Returns the driver name for a given interface using ethtool.
"""
try:
# run_ethtool returns a list of strings; we scan it line by line for the "driver:" field.
# Output format is usually: "driver: mlx5_core\nversion: ..."
output = run_ethtool(['-i', nic])
for line in output:
if line.startswith('driver:'):
return line.split(':')[1].strip()
except Exception:
Copy link

Copilot AI Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'except' clause does nothing but pass and there is no explanatory comment.

Copilot uses AI. Check for mistakes.
pass
return None
Comment on lines +1217 to +1230
Copy link

Copilot AI Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method duplicates the __get_driver_name implementation already present in NetPerfTuner class (line 955). Consider refactoring to reuse the existing implementation by either making NetPerfTuner's version accessible or extracting it to a shared utility function.

Copilot uses AI. Check for mistakes.

def __is_azure_vm(self):
    """
    Tells whether the DMI system vendor string names Microsoft.

    Reads /sys/class/dmi/id/sys_vendor and returns True if the stripped
    contents contain one of the Microsoft vendor markers. Returns False if
    they do not, or if the file cannot be read (OSError).

    NOTE(review): this matches any vendor string containing "Microsoft";
    presumably other Microsoft-branded platforms would match as well -
    confirm whether a stricter Azure check is needed.
    """
    vendor_markers = ("Microsoft Corporation", "Microsoft")
    try:
        with open('/sys/class/dmi/id/sys_vendor', 'r') as dmi_file:
            sys_vendor = dmi_file.read().strip()
    except OSError:
        # Missing or unreadable DMI data is treated as "not Azure".
        return False
    return any(marker in sys_vendor for marker in vendor_markers)

def __is_accelerated_nic(self, nic):
"""
Checks if a specific network interface has Azure Accelerated Networking enabled.
"""
driver = self.__get_driver_name(nic)

# Check 1: Is the NIC itself a VF? (e.g. running on bare metal or direct assignment)
Copy link

Copilot AI Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment mentions 'running on bare metal or direct assignment' but this scenario is unlikely on Azure VMs with Accelerated Networking, which typically use the hv_netvsc driver with a bonded VF. Consider clarifying that this check handles edge cases or non-standard configurations where the VF driver is directly exposed.

Suggested change
# Check 1: Is the NIC itself a VF? (e.g. running on bare metal or direct assignment)
# Check 1: Is the NIC itself a VF? This covers edge/non-standard setups where the
# VF driver is bound directly to the interface (for example, bare metal
# or direct assignment). On Azure VMs with Accelerated Networking, the
# typical configuration is the synthetic hv_netvsc NIC bonding a VF,
# which is detected in Check 2 below.

Copilot uses AI. Check for mistakes.
if driver in self.vf_drivers:
return True

# Check 2: Is this the synthetic NIC (hv_netvsc) bonding a VF? (Standard Azure AN)
if driver == 'hv_netvsc':
try:
# Look for slave/lower interfaces (e.g., /sys/class/net/eth0/lower_*)
# We need to find if any 'lower' interface uses a VF driver.
lower_devs = glob.glob(f"/sys/class/net/{nic}/lower_*")

for path in lower_devs:
# Extract interface name: /sys/class/net/eth0/lower_eth1 -> eth1
slave_iface = os.path.basename(path).replace("lower_", "")

slave_driver = self.__get_driver_name(slave_iface)
if slave_driver in self.vf_drivers:
return True
except Exception:
pass
Copy link

Copilot AI Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'except' clause does nothing but pass and there is no explanatory comment.

Suggested change
pass
logging.debug(
"Failed to inspect lower devices for NIC %s while checking for Azure Accelerated Networking",
nic,
exc_info=True,
)

Copilot uses AI. Check for mistakes.

return False

def tune(self):
"""
Applies Azure-specific network optimizations if running on an Azure VM.
"""
if not self.__is_azure_vm():
return

perftune_print("Azure VM detected. Checking for Accelerated Networking...")

an_enabled_any = False
Copy link

Copilot AI Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable name 'an_enabled_any' is ambiguous. The abbreviation 'an' could stand for multiple things. Consider renaming to 'accelerated_networking_enabled' or 'azure_an_enabled' for better clarity.

Copilot uses AI. Check for mistakes.

# 1. Interface-specific Tuning
for nic in self.nics:
if not self.__is_accelerated_nic(nic):
perftune_print(f"Interface {nic}: No Accelerated Networking detected. Skipping Azure tuning.")
continue

an_enabled_any = True
perftune_print(f"Interface {nic}: Accelerated Networking detected. Tuning...")

# Optimization: Increase Ring Buffers to 1024
# We use check=False because some drivers/versions are noisy even on success
run_one_command(['ethtool', '-G', nic, 'rx', '1024', 'tx', '1024'], check=False)

# Optimization: Increase TX Queue Length to 10000
run_one_command(['ip', 'link', 'set', nic, 'txqueuelen', '10000'], check=False)
Comment on lines +1293 to +1297
Copy link

Copilot AI Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment states 'check=False because some drivers/versions are noisy even on success', but using check=False silently ignores all errors including genuine failures. Consider logging when these commands fail to help with debugging, even if the failures are non-fatal. The dry_run_mode already prints the commands, but in normal mode failures are completely silent.

Suggested change
# We use check=False because some drivers/versions are noisy even on success
run_one_command(['ethtool', '-G', nic, 'rx', '1024', 'tx', '1024'], check=False)
# Optimization: Increase TX Queue Length to 10000
run_one_command(['ip', 'link', 'set', nic, 'txqueuelen', '10000'], check=False)
# We keep failures non-fatal but log them for debugging.
try:
run_one_command(['ethtool', '-G', nic, 'rx', '1024', 'tx', '1024'], check=True)
except subprocess.CalledProcessError as e:
perftune_print(f"Warning: failed to set ring buffers on interface {nic}: {e}")
# Optimization: Increase TX Queue Length to 10000
try:
run_one_command(['ip', 'link', 'set', nic, 'txqueuelen', '10000'], check=True)
except subprocess.CalledProcessError as e:
perftune_print(f"Warning: failed to set txqueuelen on interface {nic}: {e}")

Copilot uses AI. Check for mistakes.

# 2. Global Sysctl Tuning (Only if AN is present)
if an_enabled_any:
perftune_print("Applying global kernel optimizations for Azure High Throughput...")

sysctl_params = {
# Memory Buffers
'net.ipv4.tcp_rmem': '4096 87380 67108864',
'net.ipv4.tcp_wmem': '4096 65536 67108864',
'net.core.rmem_default': '33554432',
'net.core.wmem_default': '33554432',
'net.core.rmem_max': '134217728',
'net.core.wmem_max': '134217728',
'net.ipv4.udp_wmem_min': '16384',
'net.ipv4.udp_rmem_min': '16384',

# Latency & Throughput Extras
'net.core.busy_poll': '50',
'net.core.busy_read': '50',
'net.ipv4.tcp_timestamps': '1',
'net.ipv4.tcp_tw_reuse': '1',
'net.core.netdev_budget': '1000',
'net.core.optmem_max': '65535',

# Connection Backlogs
Copy link

Copilot AI Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential conflict with existing sysctl settings. The NetPerfTuner.tune method sets net.core.somaxconn to 4096 (line 660), but AzurePerfTuner.tune sets it to 32768 (line 1323). Whenever AzurePerfTuner runs after NetPerfTuner — which is exactly what happens once the misplaced call on line 700 is moved to the end of NetPerfTuner.tune — the Azure value overwrites the generic one. That precedence may be intentional for Azure VMs, but consider documenting this behavior or extracting the common sysctl tuning to avoid conflicts.

Suggested change
# Connection Backlogs
# Connection Backlogs
# NOTE: This Azure-specific value intentionally overrides the generic
# net.core.somaxconn setting applied by NetPerfTuner.tune to support
# higher connection backlogs on Azure high-throughput VMs.

Copilot uses AI. Check for mistakes.
'net.core.somaxconn': '32768',
'net.core.netdev_max_backlog': '32768',

# Queue Discipline
'net.core.default_qdisc': 'fq'
}

# Enable BBR if kernel version >= 4.19
try:
kernel_ver = platform.release().split('-')[0]
major, minor = map(int, kernel_ver.split('.')[:2])
if major > 4 or (major == 4 and minor >= 19):
sysctl_params['net.ipv4.tcp_congestion_control'] = 'bbr'
except Exception:
pass
Comment on lines +1331 to +1337
Copy link

Copilot AI Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

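The kernel version parsing assumes a specific format and may fail for certain kernel release strings. For example, a release such as '5.15+' (a suffix attached directly to the minor number) would fail when trying to convert '15+' to an integer. A string like '5.10.0-rc1', by contrast, parses correctly, because everything after the first '-' is dropped and only the first two dot-separated fields are converted. While the exception is caught, consider adding more robust parsing or validating the format before conversion to avoid unnecessary exception handling.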

Suggested change
try:
kernel_ver = platform.release().split('-')[0]
major, minor = map(int, kernel_ver.split('.')[:2])
if major > 4 or (major == 4 and minor >= 19):
sysctl_params['net.ipv4.tcp_congestion_control'] = 'bbr'
except Exception:
pass
kernel_ver = platform.release().split('-')[0]
match = re.match(r'^(\d+)\.(\d+)', kernel_ver)
if match:
major = int(match.group(1))
minor = int(match.group(2))
if major > 4 or (major == 4 and minor >= 19):
sysctl_params['net.ipv4.tcp_congestion_control'] = 'bbr'

Copilot uses AI. Check for mistakes.

# Apply sysctls
for param, value in sysctl_params.items():
path = f"/proc/sys/{param.replace('.', '/')}"
fwriteln_and_log(path, value, log_errors=False)



class ClocksourceManager:
Expand Down