From 25684a985feab7f33241b1007c73405d7b895f90 Mon Sep 17 00:00:00 2001 From: Xiaoming-AMD Date: Wed, 12 Nov 2025 23:49:58 -0600 Subject: [PATCH 1/3] feat(envs): refactor environment configuration with layered design - Create runner/helpers/envs/ directory with modular configuration - base_env.sh: Base configuration (logging, cluster info, pythonpath) - common_network.sh: Network and communication settings (NCCL, RCCL) - perf_tuning.sh: Performance tuning and optimizations - primus-env.sh: Main entry point with layered loading - detect_gpu.sh: GPU model detection - MI300X.sh, MI325X.sh, MI355X.sh: GPU-specific configurations - get_ip_interface.sh, get_nccl_ib_hca.sh: Network detection utilities - Refactor configuration loading with clear responsibilities - Implement dependency checks between config files - Add validation with validate_distributed_params() - Support debug mode (PRIMUS_DEBUG=1) - Support validation skip (PRIMUS_SKIP_VALIDATION=1) - Move RCCL configuration from perf_tuning.sh to common_network.sh - Better categorization: communication vs performance - Unified location for NCCL and RCCL settings - Rename env_common_network.sh to common_network.sh - Consistent naming with other config files - Move validation function to runner/lib/validation.sh - Eliminate duplicate code with DRY principle - Remove unnecessary wrapper function (YAGNI, KISS) - Add comprehensive unit tests for primus-env.sh - 10 test cases covering all core functionality - Tests for validation, debug mode, defaults, error detection - 100% test pass rate (10/10) - Update test runner to include new test suite - Add test_primus_env.sh to run_all_tests.sh Architecture improvements: - Clear separation of concerns (base, network, perf, gpu-specific) - Better maintainability with modular design - Robust error handling and validation - Production-ready configuration system --- runner/helpers/envs/MI300X.sh | 34 ++ runner/helpers/envs/MI325X.sh | 33 ++ runner/helpers/envs/MI355X.sh | 34 ++ runner/helpers/envs/base_env.sh | 96 +++++ runner/helpers/envs/common_network.sh | 83 ++++ runner/helpers/envs/detect_gpu.sh | 49 +++ runner/helpers/{ => envs}/get_ip_interface.sh | 0 runner/helpers/{ => envs}/get_nccl_ib_hca.sh | 0 runner/helpers/envs/perf_tuning.sh | 83 ++++ runner/helpers/envs/primus-env.sh | 92 ++++ runner/helpers/primus-env.sh | 159 ------- tests/runner/helpers/test_primus_env.sh | 398 ++++++++++++++++++ tests/runner/run_all_tests.sh | 2 + 13 files changed, 904 insertions(+), 159 deletions(-) create mode 100755 runner/helpers/envs/MI300X.sh create mode 100755 runner/helpers/envs/MI325X.sh create mode 100755 runner/helpers/envs/MI355X.sh create mode 100755 runner/helpers/envs/base_env.sh create mode 100644 runner/helpers/envs/common_network.sh create mode 100755 runner/helpers/envs/detect_gpu.sh rename runner/helpers/{ => envs}/get_ip_interface.sh (100%) rename runner/helpers/{ => envs}/get_nccl_ib_hca.sh (100%) create mode 100644 runner/helpers/envs/perf_tuning.sh create mode 100755 runner/helpers/envs/primus-env.sh delete mode 100755 runner/helpers/primus-env.sh create mode 100755 tests/runner/helpers/test_primus_env.sh diff --git a/runner/helpers/envs/MI300X.sh b/runner/helpers/envs/MI300X.sh new file mode 100755 index 00000000..a38d66ad --- /dev/null +++ b/runner/helpers/envs/MI300X.sh @@ -0,0 +1,34 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +# +# AMD MI300X GPU-specific optimizations +# Note: Common settings are in base_env.sh. This file only contains MI300X-specific overrides. +# + +LOG_INFO_RANK0 "Loading MI300X-specific optimizations..." + +# ----------------- MI300X-specific GPU settings ----------------- +# MI300X has 192GB HBM3, disable XNACK for performance +export HSA_XNACK=${HSA_XNACK:-0} + +# Optimize memory allocation for large models +export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100} + +# MI300X-specific memory optimizations +# Increase HSA kernarg pool size for large model workloads (12MB) +export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} + +# ----------------- MI300X RCCL optimizations ----------------- +# MI300X works well with MSCCLPP disabled (already set in base_env.sh) +# Override here only if needed for specific MI300X workloads + +# Uncomment to enable MSCCLPP for MI300X if tested and verified +# export RCCL_MSCCLPP_ENABLE=1 +# export RCCL_MSCCLPP_FORCE_ENABLE=1 + +log_exported_vars "MI300X-specific optimizations" \ + HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE diff --git a/runner/helpers/envs/MI325X.sh b/runner/helpers/envs/MI325X.sh new file mode 100755 index 00000000..7f342a8f --- /dev/null +++ b/runner/helpers/envs/MI325X.sh @@ -0,0 +1,33 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +# +# AMD MI325X GPU-specific optimizations +# Note: Common settings are in base_env.sh. This file only contains MI325X-specific overrides. +# + +LOG_INFO_RANK0 "Loading MI325X-specific optimizations..." + +# ----------------- MI325X-specific GPU settings ----------------- +# MI325X has 256GB HBM3e (enhanced), disable XNACK for performance +export HSA_XNACK=${HSA_XNACK:-0} + +# Optimize memory allocation for larger models compared to MI300X +export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100} + +# MI325X-specific memory optimizations +export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} + +# ----------------- MI325X RCCL optimizations ----------------- +# MI325X may benefit from different RCCL settings +# Override base_env.sh settings if needed for MI325X + +# Uncomment to enable MSCCLPP for MI325X if tested and verified +# export RCCL_MSCCLPP_ENABLE=1 +# export RCCL_MSCCLPP_FORCE_ENABLE=1 + +log_exported_vars "MI325X-specific optimizations" \ + HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE diff --git a/runner/helpers/envs/MI355X.sh b/runner/helpers/envs/MI355X.sh new file mode 100755 index 00000000..3c68f79e --- /dev/null +++ b/runner/helpers/envs/MI355X.sh @@ -0,0 +1,34 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +# +# AMD MI355X GPU-specific optimizations +# Note: MI355X is an APU with integrated CPU and GPU, using unified memory architecture. +# Common settings are in base_env.sh. This file only contains MI355X-specific overrides. +# + +LOG_INFO_RANK0 "Loading MI355X-specific optimizations..." + +# ----------------- MI355X-specific GPU settings ----------------- +# MI355X has 128GB unified memory (HBM + DDR) +# Enable XNACK for unified memory support (different from discrete GPUs) +export HSA_XNACK=${HSA_XNACK:-1} + +# APU-specific: Enable interrupt-driven mode for better power efficiency +export HSA_ENABLE_INTERRUPT=${HSA_ENABLE_INTERRUPT:-1} + +# Optimize memory allocation for unified memory architecture +export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100} + +# MI355X memory pool settings +export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-8388608} # 8MB (smaller than discrete GPUs) + +# ----------------- MI355X RCCL optimizations ----------------- +# APU may have different interconnect characteristics +# Keep base_env.sh settings unless testing shows otherwise + +log_exported_vars "MI355X-specific optimizations" \ + HSA_XNACK HSA_ENABLE_INTERRUPT GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE diff --git a/runner/helpers/envs/base_env.sh b/runner/helpers/envs/base_env.sh new file mode 100755 index 00000000..a4f60979 --- /dev/null +++ b/runner/helpers/envs/base_env.sh @@ -0,0 +1,96 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### + +# ============================================================================= +# Base Environment Configuration +# ============================================================================= +# This file provides the foundation for all environment configurations: +# - Logging functions (LOG_INFO, LOG_INFO_RANK0, LOG_ERROR, etc.) +# - Distributed training cluster information (MASTER_ADDR, NNODES, etc.) +# - Python path setup +# +# Network, performance tuning, and GPU-specific settings are loaded separately +# ============================================================================= + +# --------------------------------------------------------------------------- +# Guard: avoid duplicate exports/logging on multiple sourcing +# --------------------------------------------------------------------------- +if [[ -n "${__PRIMUS_BASE_ENV_SOURCED:-}" ]]; then + return 0 +fi +export __PRIMUS_BASE_ENV_SOURCED=1 + +# --------------------------------------------------------------------------- +# Load common library for consistent logging +# --------------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [[ -f "$SCRIPT_DIR/../../lib/common.sh" ]]; then + # shellcheck disable=SC1091 + source "$SCRIPT_DIR/../../lib/common.sh" +else + # Fallback logging functions if common.sh not available + HOSTNAME="$(hostname)" + export HOSTNAME + + LOG_INFO() { + if [ "$*" = "" ]; then + echo "" + else + echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] $*" + fi + } + + LOG_INFO_RANK0() { + if [ "${NODE_RANK:-0}" -eq 0 ]; then + if [ "$*" = "" ]; then + echo "" + else + echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] $*" + fi + fi + } + + LOG_ERROR() { + echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] [ERROR] $*" >&2 + } + + LOG_WARN() { + echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] [WARN] $*" >&2 + } + + log_exported_vars() { + LOG_INFO_RANK0 "========== $1 ==========" + for var in "${@:2}"; do + LOG_INFO_RANK0 " $var=${!var-}" + done + } +fi + +# --------------------------------------------------------------------------- +# Distributed Training Cluster Configuration +# --------------------------------------------------------------------------- +export MASTER_ADDR=${MASTER_ADDR:-localhost} +export MASTER_PORT=${MASTER_PORT:-1234} +export NNODES=${NNODES:-1} +export NODE_RANK=${NODE_RANK:-0} +export GPUS_PER_NODE=${GPUS_PER_NODE:-8} + +log_exported_vars "Training Cluster Info" \ + MASTER_ADDR MASTER_PORT NNODES NODE_RANK GPUS_PER_NODE + +# --------------------------------------------------------------------------- +# Python Path Setup +# --------------------------------------------------------------------------- +PRIMUS_PATH=$(cd "$SCRIPT_DIR/../../.." && pwd) +site_packages=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])" 2>/dev/null || echo "") +if [[ -n "$site_packages" ]]; then + export PYTHONPATH="${PRIMUS_PATH}:${site_packages}:${PYTHONPATH:-}" +else + export PYTHONPATH="${PRIMUS_PATH}:${PYTHONPATH:-}" +fi + +log_exported_vars "Python Path" PYTHONPATH diff --git a/runner/helpers/envs/common_network.sh b/runner/helpers/envs/common_network.sh new file mode 100644 index 00000000..23af6884 --- /dev/null +++ b/runner/helpers/envs/common_network.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### + +# ============================================================================= +# NCCL and Network Configuration +# ============================================================================= +# This file contains all network-related environment variable settings +# for distributed training with NCCL and communication libraries. +# ============================================================================= + +# Dependency check: ensure base_env.sh has been loaded +if [[ -z "${GPUS_PER_NODE}" ]]; then + echo "[ERROR] GPUS_PER_NODE not set. base_env.sh must be loaded first." >&2 + exit 1 +fi + +if ! declare -f log_exported_vars >/dev/null 2>&1; then + echo "[ERROR] log_exported_vars function not found. base_env.sh must be loaded first." >&2 + exit 1 +fi + +# Set visible GPUs for the current node (0 to GPUS_PER_NODE-1) +HIP_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS_PER_NODE - 1))) +export HIP_VISIBLE_DEVICES + +# ----------------- NCCL and Network Settings ----------------- + +# NCCL logging level: VERSION, WARN, INFO, DEBUG, TRACE +# Set to empty for default behavior, or specify level for debugging +export NCCL_DEBUG=${NCCL_DEBUG:-} + +# Disable NCCL internal checks to reduce overhead +export NCCL_CHECKS_DISABLE=1 + +# Set InfiniBand GID index for NCCL communication +export NCCL_IB_GID_INDEX=3 + +# Disable cross NIC communication for NCCL +export NCCL_CROSS_NIC=0 + +# Dynamically get InfiniBand Host Channel Adapter index for NCCL if not set +if [ -z "${NCCL_IB_HCA}" ]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + NCCL_IB_HCA=$(bash "${SCRIPT_DIR}/get_nccl_ib_hca.sh" 2>/dev/null || echo "") +fi +export NCCL_IB_HCA + +# Dynamically get network interface IP address for socket communication if not set +if [ -z "${IP_INTERFACE}" ]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + IP_INTERFACE=$(bash "${SCRIPT_DIR}/get_ip_interface.sh" 2>/dev/null || hostname -I | awk '{print $1}') +fi +export IP_INTERFACE + +# Set network interfaces for NCCL and Gloo, fallback to detected IP_INTERFACE +export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$IP_INTERFACE} +export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$IP_INTERFACE} + +# ----------------- RCCL Settings (AMD ROCm Communication Library) ----------------- + +# Disable MSCCL (RCCL multi-connection feature) for better stability +export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0} +export RCCL_MSCCLPP_ENABLE=${RCCL_MSCCLPP_ENABLE:-0} +export RCCL_MSCCLPP_FORCE_ENABLE=${RCCL_MSCCLPP_FORCE_ENABLE:-0} +export RCCL_MSCCLPP_THRESHOLD=${RCCL_MSCCLPP_THRESHOLD:-$((1*1024*1024*1024))} # default 1GB + +# https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/env.hpp#L82-L87 +export MSCCLPP_DISABLE_CHANNEL_CACHE=${MSCCLPP_DISABLE_CHANNEL_CACHE:-FALSE} + +# PyTorch needs this env to enable register comm +export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK=${TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK:-0} + +log_exported_vars "NCCL and Network Settings" \ + HIP_VISIBLE_DEVICES NCCL_DEBUG NCCL_CHECKS_DISABLE NCCL_IB_GID_INDEX \ + NCCL_CROSS_NIC NCCL_IB_HCA IP_INTERFACE NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME + +log_exported_vars "RCCL Settings" \ + RCCL_MSCCL_ENABLE RCCL_MSCCLPP_ENABLE RCCL_MSCCLPP_FORCE_ENABLE RCCL_MSCCLPP_THRESHOLD \ + MSCCLPP_DISABLE_CHANNEL_CACHE TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK diff --git a/runner/helpers/envs/detect_gpu.sh b/runner/helpers/envs/detect_gpu.sh new file mode 100755 index 00000000..1e9309a5 --- /dev/null +++ b/runner/helpers/envs/detect_gpu.sh @@ -0,0 +1,49 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +# +# Detect GPU Model Script +# Uses rocm-smi to detect AMD GPU model (MI300, MI355, etc.) +# + +detect_gpu_model() { + local gpu_model + gpu_model="unknown" + + # Check if rocm-smi is available + if ! command -v rocm-smi &> /dev/null; then + echo "Error: rocm-smi not found. Is ROCm installed?" >&2 + return 1 + fi + + # Get product name from rocm-smi + local product_name + product_name=$(rocm-smi --showproductname 2>/dev/null | grep -i "Card series" | head -n1 | awk '{print $NF}') + + # If that doesn't work, try alternative method + if [[ -z "$product_name" ]]; then + product_name=$(rocm-smi --showproductname 2>/dev/null | grep -oP 'MI\d+[A-Z]*' | head -n1) + fi + + # Extract model identifier (MI300, MI355, etc.) + if [[ "$product_name" =~ MI([0-9]+)([A-Z]*) ]]; then + gpu_model="MI${BASH_REMATCH[1]}${BASH_REMATCH[2]}" + fi + + echo "$gpu_model" +} + +# Execute detection +GPU_MODEL=$(detect_gpu_model) + +# Output result +echo "$GPU_MODEL" + +# Exit with error if detection failed +if [[ "$GPU_MODEL" == "unknown" ]]; then + echo "Warning: Unable to detect GPU model. Using default configuration." >&2 + exit 1 +fi diff --git a/runner/helpers/get_ip_interface.sh b/runner/helpers/envs/get_ip_interface.sh similarity index 100% rename from runner/helpers/get_ip_interface.sh rename to runner/helpers/envs/get_ip_interface.sh diff --git a/runner/helpers/get_nccl_ib_hca.sh b/runner/helpers/envs/get_nccl_ib_hca.sh similarity index 100% rename from runner/helpers/get_nccl_ib_hca.sh rename to runner/helpers/envs/get_nccl_ib_hca.sh diff --git a/runner/helpers/envs/perf_tuning.sh b/runner/helpers/envs/perf_tuning.sh new file mode 100644 index 00000000..bddb8cb1 --- /dev/null +++ b/runner/helpers/envs/perf_tuning.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### + +# ============================================================================= +# Performance Tuning Configuration +# ============================================================================= +# This file contains all performance-related settings including: +# - AMD-specific GPU optimizations (HSA, RCCL) +# - General performance tuning (GPU queues, NUMA, CUDA connections) +# - NCCL performance settings (PXN, P2P) +# - Transformer Engine optimizations (NVTE) +# ============================================================================= + +# Dependency check: ensure base_env.sh has been loaded +if ! declare -f log_exported_vars >/dev/null 2>&1; then + echo "[ERROR] log_exported_vars function not found. base_env.sh must be loaded first." >&2 + exit 1 +fi + +# ----------------- AMD-specific GPU optimizations ----------------- +# Enable system DMA engine (SDMA) on AMD GPUs for better IO throughput +export HSA_ENABLE_SDMA=${HSA_ENABLE_SDMA:-1} + +# Prevent scratch memory from being reclaimed to stabilize large memory usage +# NOTE: Must disable scratch reclaim to avoid MoE training crash on AMD GPUs +# Setting this to 0 prevents core dumps when using Mixture-of-Experts (MoE) models +export HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM:-0} + +log_exported_vars "AMD GPU Optimizations" \ + HSA_ENABLE_SDMA HSA_NO_SCRATCH_RECLAIM + +# ----------------- General Performance Tuning ----------------- +# Limit GPU hardware queues to 2 for performance stability +export GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES:-2} + +# Increase HSA kernarg pool size to 12MB for models with many kernels (optional, can be set by GPU-specific configs) +# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} + +# Enable NUMA binding for better memory locality (may increase stability for large models) +export ENABLE_NUMA_BINDING=${ENABLE_NUMA_BINDING:-0} + +# Limit max CUDA device connections to reduce PCIe traffic +export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1} + +# Prioritize NCCL communication for PyTorch for higher throughput +export TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY:-1} + +# ----------------- NCCL Performance Settings ----------------- +# In multi-node training, PXN can be enabled to improve inter-node all-to-all +# communication efficiency, but it will increase GPU memory usage. +# Default: disable PXN for NCCL +export NCCL_PXN_DISABLE=${NCCL_PXN_DISABLE:-1} +export NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE:-524288} + +log_exported_vars "General Performance Tuning" \ + GPU_MAX_HW_QUEUES ENABLE_NUMA_BINDING CUDA_DEVICE_MAX_CONNECTIONS \ + TORCH_NCCL_HIGH_PRIORITY NCCL_PXN_DISABLE NCCL_P2P_NET_CHUNKSIZE + +# ----------------- Transformer Engine Optimizations ----------------- +# Optimize NVTE fp8 cast transpose +export NVTE_USE_CAST_TRANSPOSE_TRITON=${NVTE_USE_CAST_TRANSPOSE_TRITON:-1} +export NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE=${NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE:-0} + +# Note: Disable v3 due to accuracy issues. Will fix after TE version 2.1. +export NVTE_CK_USES_BWD_V3=${NVTE_CK_USES_BWD_V3:-0} + +# Note: Disable fp32 atomic if you find any accuracy issue +export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=${PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32:-0} + +# NVTE debug envs +export NVTE_DEBUG=${NVTE_DEBUG:-0} # 0, 1 +export NVTE_DEBUG_LEVEL=${NVTE_DEBUG_LEVEL:-0} # 0, 1, 2 +export NVTE_FUSED_ATTN_LOG_CONFIG=${NVTE_FUSED_ATTN_LOG_CONFIG:-0} # 0, 1 +export PATCH_TE_FLASH_ATTN=${PATCH_TE_FLASH_ATTN:-0} + +log_exported_vars "Transformer Engine Optimizations" \ + NVTE_USE_CAST_TRANSPOSE_TRITON NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE \ + NVTE_CK_USES_BWD_V3 PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32 \ + NVTE_DEBUG NVTE_DEBUG_LEVEL NVTE_FUSED_ATTN_LOG_CONFIG PATCH_TE_FLASH_ATTN diff --git a/runner/helpers/envs/primus-env.sh b/runner/helpers/envs/primus-env.sh new file mode 100755 index 00000000..b9530c5f --- /dev/null +++ b/runner/helpers/envs/primus-env.sh @@ -0,0 +1,92 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### + +# ============================================================================= +# Primus Environment Setup - Layered Configuration Loading +# ============================================================================= +# Load order: +# 1. base_env.sh - Base configuration (logging, cluster info, pythonpath) +# 2. common_network.sh - Network and NCCL settings +# 3. perf_tuning.sh - Performance tuning and optimizations +# 4. .sh - GPU-specific overrides (e.g., MI300X.sh, MI325X.sh) +# +# Environment Variables: +# PRIMUS_DEBUG=1 - Enable debug mode (set -x, verbose output) +# PRIMUS_SKIP_VALIDATION=1 - Skip configuration validation (not recommended) +# ============================================================================= + +# Enable debug mode if requested +if [[ "${PRIMUS_DEBUG:-0}" == "1" ]]; then + set -x + echo "[DEBUG] Primus debug mode enabled" +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# 1. Load base environment (logging, cluster info, pythonpath) +# shellcheck source=runner/helpers/envs/base_env.sh +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/base_env.sh" + +LOG_INFO_RANK0 "" +LOG_INFO_RANK0 "=== Loading Primus Environment Configuration ===" + +# 2. Load common network configuration +# shellcheck source=runner/helpers/envs/common_network.sh +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common_network.sh" + +# 3. Load performance tuning configuration +# shellcheck source=runner/helpers/envs/perf_tuning.sh +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/perf_tuning.sh" + +# 4. Detect GPU model and load device-specific configuration +GPU_MODEL=$(bash "${SCRIPT_DIR}/detect_gpu.sh") +LOG_INFO_RANK0 "Detected GPU model: ${GPU_MODEL}" + +GPU_CONFIG_FILE="${SCRIPT_DIR}/${GPU_MODEL}.sh" +if [[ -f "$GPU_CONFIG_FILE" ]]; then + LOG_INFO_RANK0 "Loading GPU-specific configuration: $GPU_CONFIG_FILE" + # shellcheck disable=SC1090 + source "$GPU_CONFIG_FILE" +else + LOG_WARN "GPU configuration file not found: ${GPU_CONFIG_FILE}, using common settings only." +fi + +# 5. Load validation library and validate configuration (unless explicitly skipped) +if [[ "${PRIMUS_SKIP_VALIDATION:-0}" != "1" ]]; then + LOG_INFO_RANK0 "" + LOG_INFO_RANK0 "=== Validating Configuration ===" + + # Load validation library (requires common.sh which is already loaded by base_env.sh) + VALIDATION_LIB="${SCRIPT_DIR}/../../lib/validation.sh" + if [[ -f "$VALIDATION_LIB" ]]; then + # shellcheck disable=SC1090 + source "$VALIDATION_LIB" + else + LOG_WARN "Validation library not found: $VALIDATION_LIB" + LOG_WARN "Skipping validation..." + fi + + # Run validation if the function is available + if declare -f validate_distributed_params >/dev/null 2>&1; then + if validate_distributed_params; then + LOG_INFO_RANK0 "✓ Configuration validation passed" + else + LOG_ERROR "✗ Configuration validation failed" + LOG_ERROR "Set PRIMUS_SKIP_VALIDATION=1 to skip validation (not recommended)" + exit 1 + fi + else + LOG_WARN "validate_distributed_params function not found, skipping validation" + fi +fi + +LOG_INFO_RANK0 "" +LOG_INFO_RANK0 "=== Environment Configuration Complete ===" +LOG_INFO_RANK0 "" diff --git a/runner/helpers/primus-env.sh b/runner/helpers/primus-env.sh deleted file mode 100755 index 6dd2a96f..00000000 --- a/runner/helpers/primus-env.sh +++ /dev/null @@ -1,159 +0,0 @@ -#!/bin/bash -############################################################################### -# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. -# -# See LICENSE for license information. -############################################################################### - -# --------------------------------------------------------------------------- -# Guard: avoid duplicate exports/logging on multiple sourcing -# --------------------------------------------------------------------------- -if [[ -n "${__PRIMUS_ENV_SOURCED:-}" ]]; then - return 0 -fi -export __PRIMUS_ENV_SOURCED=1 - -# Hostname is useful for logs in any script that sources this file -HOSTNAME="$(hostname)" -export HOSTNAME - -LOG_INFO() { - if [ "$*" = "" ]; then - echo "" - else - echo "[NODE-$NODE_RANK($HOSTNAME)] $*" - fi -} - -LOG_INFO_RANK0() { - if [ "$NODE_RANK" -eq 0 ]; then - if [ "$*" = "" ]; then - echo "" - else - echo "[NODE-$NODE_RANK($HOSTNAME)] $*" - fi - fi -} - -LOG_ERROR() { - echo "[NODE-$NODE_RANK($HOSTNAME)] [ERROR] $*"; -} - -log_exported_vars() { - LOG_INFO_RANK0 "========== $1 ==========" - for var in "${@:2}"; do - LOG_INFO_RANK0 " $var=${!var-}" - done -} - -export MASTER_ADDR=${MASTER_ADDR:-localhost} -export MASTER_PORT=${MASTER_PORT:-1234} -export NNODES=${NNODES:-1} -export NODE_RANK=${NODE_RANK:-0} -export GPUS_PER_NODE=${GPUS_PER_NODE:-8} -log_exported_vars "Training cluster info" \ - MASTER_ADDR MASTER_PORT NNODES NODE_RANK GPUS_PER_NODE - -# -------------------- NCCL and Communication Setup -------------------- -# Set visible GPUs for the current node (0 to GPUS_PER_NODE-1) -HIP_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS_PER_NODE - 1))) -export HIP_VISIBLE_DEVICES - -# ----------------- NCCL and Network Settings ----------------- -# VERSION, WARN, INFO, DEBUG, TRACE -export NCCL_DEBUG= - -# Disable NCCL internal checks to reduce overhead -export NCCL_CHECKS_DISABLE=1 - -# Set InfiniBand GID index for NCCL communication -export NCCL_IB_GID_INDEX=3 - -# Disable cross NIC communication for NCCL -export NCCL_CROSS_NIC=0 - -SCRIPT_DIR="$(cd "$(dirname "$(realpath "$0")")" && pwd)" - -# Dynamically get InfiniBand Host Channel Adapter index for NCCL if not set -if [ -z "${NCCL_IB_HCA}" ]; then - NCCL_IB_HCA=$(bash "$SCRIPT_DIR/helpers/get_nccl_ib_hca.sh") -fi -export NCCL_IB_HCA - -# Dynamically get network interface IP address for socket communication if not set -if [ -z "${IP_INTERFACE}" ]; then - IP_INTERFACE=$(bash "$SCRIPT_DIR/helpers/get_ip_interface.sh") -fi -export IP_INTERFACE -# Set network interfaces for NCCL and Gloo, fallback to detected IP_INTERFACE -export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$IP_INTERFACE} -export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$IP_INTERFACE} - -log_exported_vars "NCCL and Network Settings" \ - HIP_VISIBLE_DEVICES NCCL_DEBUG NCCL_CHECKS_DISABLE NCCL_IB_GID_INDEX \ - NCCL_IB_HCA IP_INTERFACE NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME - -# ----------------- AMD-specific GPU optimizations ----------------- -# Enable system DMA engine (SDMA) on AMD GPUs for better IO throughput -export HSA_ENABLE_SDMA=1 - -# Prevent scratch memory from being reclaimed to stabilize large memory usage patterns (e.g., KV cache, MoE experts) -# NOTE: Must disable scratch reclaim to avoid MoE training crash on AMD GPUs -# Setting this to 0 prevents core dumps when using Mixture-of-Experts (MoE) models -export HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM:-0} - -# Disable MSCCL (RCCL multi-connection feature) for better stability -export RCCL_MSCCL_ENABLE=0 -export RCCL_MSCCLPP_ENABLE=0 -export RCCL_MSCCLPP_FORCE_ENABLE=0 -export RCCL_MSCCLPP_THRESHOLD=$((1*1024*1024*1024)) # default 1 MB -# https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/env.hpp#L82-L87 -export MSCCLPP_DISABLE_CHANNEL_CACHE=FALSE -# pytorch need set this env to enable register comm -export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK=0 - -log_exported_vars "AMD-specific GPU optimizations" \ - HSA_ENABLE_SDMA HSA_NO_SCRATCH_RECLAIM \ - RCCL_MSCCL_ENABLE RCCL_MSCCLPP_ENABLE RCCL_MSCCLPP_FORCE_ENABLE RCCL_MSCCLPP_THRESHOLD \ - MSCCLPP_DISABLE_CHANNEL_CACHE TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK - - -# ----------------- Performance tuning ----------------- -# Limit GPU hardware queues to 2 for performance stability -export GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES:-2} - -# Increase HSA kernarg pool size to 12MB for models with lot of kernels -# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} - -# Enable NUMA binding for better memory locality (may increase stability for large models) -export ENABLE_NUMA_BINDING=${ENABLE_NUMA_BINDING:-0} - -# Limit max CUDA device connections to reduce PCIe traffic -export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1} - -# Prioritize NCCL communication for PyTorch for higher throughput -export TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY:-1} - -# optimize nvte fp8 cast transpose -export NVTE_USE_CAST_TRANSPOSE_TRITON=${NVTE_USE_CAST_TRANSPOSE_TRITON:-1} -export NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE=${NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE:-0} - -# Note: Disable v3 due to accuracy issues. Will fix after TE version 2.1. -export NVTE_CK_USES_BWD_V3=${NVTE_CK_USES_BWD_V3:-0} - -# nvte debug envs -export NVTE_DEBUG=0 # 0, 1 -export NVTE_DEBUG_LEVEL=0 # 0, 1, 2 -export NVTE_FUSED_ATTN_LOG_CONFIG=0 # 0, 1 -export PATCH_TE_FLASH_ATTN=${PATCH_TE_FLASH_ATTN:-0} - -log_exported_vars "Performance tuning" \ - GPU_MAX_HW_QUEUES HSA_KERNARG_POOL_SIZE ENABLE_NUMA_BINDING CUDA_DEVICE_MAX_CONNECTIONS \ - TORCH_NCCL_HIGH_PRIORITY NVTE_USE_CAST_TRANSPOSE_TRITON NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE \ - NVTE_CK_USES_BWD_V3 NVTE_DEBUG NVTE_DEBUG_LEVEL NVTE_FUSED_ATTN_LOG_CONFIG PATCH_TE_FLASH_ATTN - -# -------------------- setup_pythonpath ------------------- -PRIMUS_PATH=$(realpath "$(dirname "$0")/..") -site_packages=$(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])") -export PYTHONPATH="${PRIMUS_PATH}:${site_packages}:${PYTHONPATH:-}" -log_exported_vars "pythonpath" PYTHONPATH diff --git a/tests/runner/helpers/test_primus_env.sh b/tests/runner/helpers/test_primus_env.sh new file mode 100755 index 00000000..5708065e --- /dev/null +++ b/tests/runner/helpers/test_primus_env.sh @@ -0,0 +1,398 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +# +# Unit tests for runner/helpers/envs/primus-env.sh +# + +# Get project root (tests/runner/helpers -> ../../..) +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" + +# Test counter +TESTS_RUN=0 +TESTS_PASSED=0 + +# Test assertion functions +assert_pass() { + ((TESTS_RUN++)) + ((TESTS_PASSED++)) + echo " ✓ PASS: $1" +} + +assert_fail() { + ((TESTS_RUN++)) + echo " ✗ FAIL: $1" +} + +# Currently unused but kept for future use +# shellcheck disable=SC2317 +assert_contains() { + local output="$1" + local expected="$2" + local message="$3" + + if echo "$output" | grep -q "$expected"; then + assert_pass "$message" + else + assert_fail "$message" + fi +} + +# Setup test environment +setup_test_env() { + export MASTER_ADDR="localhost" + export MASTER_PORT="1234" + export NNODES="1" + export NODE_RANK="0" + export GPUS_PER_NODE="8" +} + +# Cleanup test environment +cleanup_test_env() { + unset MASTER_ADDR MASTER_PORT NNODES NODE_RANK GPUS_PER_NODE + unset PRIMUS_DEBUG PRIMUS_SKIP_VALIDATION + unset HIP_VISIBLE_DEVICES NCCL_DEBUG NCCL_IB_HCA + unset HSA_ENABLE_SDMA GPU_MAX_HW_QUEUES + unset __PRIMUS_BASE_ENV_SOURCED +} + +# ============================================================================ +# Test 1: Basic Environment Loading +# ============================================================================ +test_basic_env_loading() { + echo "Test 1: Basic Environment Loading" + + setup_test_env + export PRIMUS_SKIP_VALIDATION=1 # Skip validation for faster test + + # Source primus-env.sh in a subshell to avoid affecting test environment + result=$(bash -c " + export MASTER_ADDR='$MASTER_ADDR' + export MASTER_PORT='$MASTER_PORT' + export NNODES='$NNODES' + export NODE_RANK='$NODE_RANK' + export GPUS_PER_NODE='$GPUS_PER_NODE' + export PRIMUS_SKIP_VALIDATION=1 + source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>&1 | grep -c 'Environment Configuration Complete' + ") + + if [[ "$result" -eq 1 ]]; then + assert_pass "Basic environment loads successfully" + else + assert_fail "Basic environment loading failed" + fi + + cleanup_test_env +} + +# ============================================================================ +# Test 2: Environment Variables Are Set +# ============================================================================ +test_env_variables_set() { + echo "Test 2: Environment Variables Are Set" + + setup_test_env + export PRIMUS_SKIP_VALIDATION=1 + + # Check if key variables are exported + result=$(bash -c " + export MASTER_ADDR='$MASTER_ADDR' + export MASTER_PORT='$MASTER_PORT' + export NNODES='$NNODES' + export NODE_RANK='$NODE_RANK' + export GPUS_PER_NODE='$GPUS_PER_NODE' + export PRIMUS_SKIP_VALIDATION=1 + source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>/dev/null + + # Check if variables are set + if [[ -n \"\$HIP_VISIBLE_DEVICES\" ]] && \ + [[ -n \"\$HSA_ENABLE_SDMA\" ]] && \ + [[ -n \"\$GPU_MAX_HW_QUEUES\" ]]; then + echo 'PASS' + else + echo 'FAIL' + fi + " 2>&1) + + if echo "$result" | grep -q "PASS"; then + assert_pass "Environment variables are set correctly" + else + assert_fail "Environment variables not set" + fi + + cleanup_test_env +} + +# ============================================================================ +# Test 3: Debug Mode +# ============================================================================ +test_debug_mode() { + echo "Test 3: Debug Mode" + + setup_test_env + export PRIMUS_DEBUG=1 + export PRIMUS_SKIP_VALIDATION=1 + + # Check if debug mode outputs expected trace + result=$(bash -c " + export MASTER_ADDR='$MASTER_ADDR' + export MASTER_PORT='$MASTER_PORT' + export NNODES='$NNODES' + export NODE_RANK='$NODE_RANK' + export GPUS_PER_NODE='$GPUS_PER_NODE' + export PRIMUS_DEBUG=1 + export PRIMUS_SKIP_VALIDATION=1 + source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>&1 | grep -c 'DEBUG' + ") + + if [[ "$result" -gt 0 ]]; then + assert_pass "Debug mode works correctly" + else + assert_fail "Debug mode not working" + fi + + cleanup_test_env +} + +# ============================================================================ +# Test 4: Validation Execution +# ============================================================================ +test_validation_execution() { + echo "Test 4: Validation Execution" + + setup_test_env + # Don't skip validation this time + + # Should pass validation with correct values + result=$(bash -c " + export MASTER_ADDR='$MASTER_ADDR' + export MASTER_PORT='$MASTER_PORT' + export NNODES='$NNODES' + export NODE_RANK='$NODE_RANK' + export GPUS_PER_NODE='$GPUS_PER_NODE' + source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>&1 | grep -c 'Configuration validation passed' + ") + + if [[ "$result" -eq 1 ]]; then + assert_pass "Validation executes and passes correctly" + else + assert_fail "Validation not executed or failed" + fi + + cleanup_test_env +} + +# ============================================================================ +# Test 5: Validation Skip Flag +# ============================================================================ +test_validation_skip() { + echo "Test 5: Validation Skip Flag" + + setup_test_env + export PRIMUS_SKIP_VALIDATION=1 + + # Should not see validation messages + result=$(bash -c " + export MASTER_ADDR='$MASTER_ADDR' + export MASTER_PORT='$MASTER_PORT' + export NNODES='$NNODES' + export NODE_RANK='$NODE_RANK' + export GPUS_PER_NODE='$GPUS_PER_NODE' + export PRIMUS_SKIP_VALIDATION=1 + source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>&1 | grep -c 'Validating Configuration' + ") + + if [[ "$result" -eq 0 ]]; then + assert_pass "Validation skip flag works correctly" + else + assert_fail "Validation skip flag not working" + fi + + cleanup_test_env +} + +# ============================================================================ +# Test 6: Invalid Configuration Detection +# ============================================================================ +test_invalid_config_detection() { + echo "Test 6: Invalid Configuration Detection" + + # Set invalid NODE_RANK (>= NNODES) + export MASTER_ADDR="localhost" + export MASTER_PORT="1234" + export NNODES="2" + export NODE_RANK="5" # Invalid: should be < NNODES + export GPUS_PER_NODE="8" + + # Should fail validation - capture exit code only + if bash -c " + export MASTER_ADDR='$MASTER_ADDR' + export MASTER_PORT='$MASTER_PORT' + export NNODES='$NNODES' + export NODE_RANK='$NODE_RANK' + export GPUS_PER_NODE='$GPUS_PER_NODE' + source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>/dev/null + " 2>/dev/null; then + assert_fail "Invalid configuration not detected" + else + assert_pass "Invalid configuration is detected" + fi + + cleanup_test_env +} + +# ============================================================================ +# Test 7: GPU Detection +# ============================================================================ +test_gpu_detection() { + echo "Test 7: GPU Detection" + + setup_test_env + export PRIMUS_SKIP_VALIDATION=1 + + # Check if GPU detection runs + result=$(bash -c " + export MASTER_ADDR='$MASTER_ADDR' + export MASTER_PORT='$MASTER_PORT' + export NNODES='$NNODES' + export NODE_RANK='$NODE_RANK' + export GPUS_PER_NODE='$GPUS_PER_NODE' + export PRIMUS_SKIP_VALIDATION=1 + source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>&1 | grep -c 'Detected GPU model' + ") + + if [[ "$result" -eq 1 ]]; then + assert_pass "GPU detection executes" + else + assert_fail "GPU detection not executed" + fi + + cleanup_test_env +} + +# ============================================================================ +# Test 8: Layered Configuration Loading Order +# ============================================================================ +test_loading_order() { + echo "Test 8: Layered Configuration Loading Order" + + setup_test_env + export PRIMUS_SKIP_VALIDATION=1 + + # Check loading messages appear in correct order + result=$(bash -c " + export MASTER_ADDR='$MASTER_ADDR' + export MASTER_PORT='$MASTER_PORT' + export NNODES='$NNODES' + export NODE_RANK='$NODE_RANK' + export GPUS_PER_NODE='$GPUS_PER_NODE' + export PRIMUS_SKIP_VALIDATION=1 + source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>&1 + ") + + # Check if loading message appears + if echo "$result" | grep -q "Loading Primus Environment Configuration"; then + assert_pass "Configuration loading order is correct" + else + assert_fail "Configuration loading order incorrect" + fi + + cleanup_test_env +} + +# ============================================================================ +# Test 9: Missing Base Environment Detection +# ============================================================================ +test_missing_base_env() { + echo "Test 9: Missing Base Environment Detection" + + # Temporarily rename base_env.sh to simulate missing file + BASE_ENV_FILE="$PROJECT_ROOT/runner/helpers/envs/base_env.sh" + BASE_ENV_BACKUP="$PROJECT_ROOT/runner/helpers/envs/base_env.sh.backup" + + if [[ -f "$BASE_ENV_FILE" ]]; then + mv "$BASE_ENV_FILE" "$BASE_ENV_BACKUP" + fi + + # Should fail when base_env.sh is missing - capture exit code only + if bash -c "source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>/dev/null" 2>/dev/null; then + fail_detected=0 + else + fail_detected=1 + fi + + # Restore base_env.sh + if [[ -f "$BASE_ENV_BACKUP" ]]; then + mv "$BASE_ENV_BACKUP" "$BASE_ENV_FILE" + fi + + if [[ "$fail_detected" -eq 1 ]]; then + assert_pass "Missing base environment is detected" + else + assert_fail "Missing base environment not detected" + fi +} + +# ============================================================================ +# Test 10: Environment Variable Defaults +# ============================================================================ +test_env_defaults() { + echo "Test 10: Environment Variable Defaults" + + # Don't set any variables, let defaults kick in + export PRIMUS_SKIP_VALIDATION=1 + + result=$(bash -c " + export PRIMUS_SKIP_VALIDATION=1 + source '$PROJECT_ROOT/runner/helpers/envs/primus-env.sh' 2>&1 + + # Check default values + [[ \"\$MASTER_ADDR\" == 'localhost' ]] && \ + [[ \"\$MASTER_PORT\" == '1234' ]] && \ + [[ \"\$NNODES\" == '1' ]] && \ + [[ \"\$NODE_RANK\" == '0' ]] && \ + [[ \"\$GPUS_PER_NODE\" == '8' ]] && \ + echo 'PASS' || echo 'FAIL' + " 2>&1) + + if echo "$result" | grep -q "PASS"; then + assert_pass "Default values are set correctly" + else + assert_fail "Default values not set" + fi + + cleanup_test_env +} + +# ============================================================================ +# Run all tests +# ============================================================================ +echo "==========================================" +echo "Running primus-env.sh Unit Tests" +echo "==========================================" +echo "" + +test_basic_env_loading +test_env_variables_set +test_debug_mode +test_validation_execution +test_validation_skip +test_invalid_config_detection +test_gpu_detection +test_loading_order +test_missing_base_env +test_env_defaults + +echo "" +echo "==========================================" +echo "Test Summary: $TESTS_PASSED/$TESTS_RUN tests passed" +echo "==========================================" + +if [[ $TESTS_PASSED -eq $TESTS_RUN ]]; then + exit 0 +else + exit 1 +fi diff --git a/tests/runner/run_all_tests.sh b/tests/runner/run_all_tests.sh index ba1599bd..6a2d46b8 100755 --- a/tests/runner/run_all_tests.sh +++ b/tests/runner/run_all_tests.sh @@ -34,6 +34,8 @@ TEST_SCRIPTS=( "$SCRIPT_DIR/lib/test_validation.sh" "$SCRIPT_DIR/lib/test_config.sh" "$SCRIPT_DIR/helpers/test_execute_hooks.sh" + "$SCRIPT_DIR/helpers/test_execute_patches.sh" + "$SCRIPT_DIR/helpers/test_primus_env.sh" ) # Run each test suite From efb4cf261356ddc04168ebb0aa62b891b9391ee6 Mon Sep 17 00:00:00 2001 From: Xiaoming-AMD Date: Thu, 13 Nov 2025 00:20:43 -0600 Subject: [PATCH 2/3] chore(envs): comment out GPU-specific environment variables Temporarily disable GPU-specific configurations in MI300X, MI325X, and MI355X. Keep as documentation template for future use. --- runner/helpers/envs/MI300X.sh | 12 ++++++------ runner/helpers/envs/MI325X.sh | 12 ++++++------ runner/helpers/envs/MI355X.sh | 14 +++++++------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/runner/helpers/envs/MI300X.sh b/runner/helpers/envs/MI300X.sh index a38d66ad..d488467c 100755 --- a/runner/helpers/envs/MI300X.sh +++ b/runner/helpers/envs/MI300X.sh @@ -13,22 +13,22 @@ LOG_INFO_RANK0 "Loading MI300X-specific optimizations..." # ----------------- MI300X-specific GPU settings ----------------- # MI300X has 192GB HBM3, disable XNACK for performance -export HSA_XNACK=${HSA_XNACK:-0} +# export HSA_XNACK=${HSA_XNACK:-0} # Optimize memory allocation for large models -export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100} +# export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100} # MI300X-specific memory optimizations # Increase HSA kernarg pool size for large model workloads (12MB) -export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} +# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} # ----------------- MI300X RCCL optimizations ----------------- -# MI300X works well with MSCCLPP disabled (already set in base_env.sh) +# MI300X works well with MSCCLPP disabled (already set in common_network.sh) # Override here only if needed for specific MI300X workloads # Uncomment to enable MSCCLPP for MI300X if tested and verified # export RCCL_MSCCLPP_ENABLE=1 # export RCCL_MSCCLPP_FORCE_ENABLE=1 -log_exported_vars "MI300X-specific optimizations" \ - HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE +# log_exported_vars "MI300X-specific optimizations" \ +# HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE diff --git a/runner/helpers/envs/MI325X.sh b/runner/helpers/envs/MI325X.sh index 7f342a8f..36d8f8e6 100755 --- a/runner/helpers/envs/MI325X.sh +++ b/runner/helpers/envs/MI325X.sh @@ -13,21 +13,21 @@ LOG_INFO_RANK0 "Loading MI325X-specific optimizations..." # ----------------- MI325X-specific GPU settings ----------------- # MI325X has 256GB HBM3e (enhanced), disable XNACK for performance -export HSA_XNACK=${HSA_XNACK:-0} +# export HSA_XNACK=${HSA_XNACK:-0} # Optimize memory allocation for larger models compared to MI300X -export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100} +# export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100} # MI325X-specific memory optimizations -export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} +# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} # ----------------- MI325X RCCL optimizations ----------------- # MI325X may benefit from different RCCL settings -# Override base_env.sh settings if needed for MI325X +# Override common_network.sh settings if needed for MI325X # Uncomment to enable MSCCLPP for MI325X if tested and verified # export RCCL_MSCCLPP_ENABLE=1 # export RCCL_MSCCLPP_FORCE_ENABLE=1 -log_exported_vars "MI325X-specific optimizations" \ - HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE +# log_exported_vars "MI325X-specific optimizations" \ +# HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE diff --git a/runner/helpers/envs/MI355X.sh b/runner/helpers/envs/MI355X.sh index 3c68f79e..8a781182 100755 --- a/runner/helpers/envs/MI355X.sh +++ b/runner/helpers/envs/MI355X.sh @@ -15,20 +15,20 @@ LOG_INFO_RANK0 "Loading MI355X-specific optimizations..." # ----------------- MI355X-specific GPU settings ----------------- # MI355X has 128GB unified memory (HBM + DDR) # Enable XNACK for unified memory support (different from discrete GPUs) -export HSA_XNACK=${HSA_XNACK:-1} +# export HSA_XNACK=${HSA_XNACK:-1} # APU-specific: Enable interrupt-driven mode for better power efficiency -export HSA_ENABLE_INTERRUPT=${HSA_ENABLE_INTERRUPT:-1} +# export HSA_ENABLE_INTERRUPT=${HSA_ENABLE_INTERRUPT:-1} # Optimize memory allocation for unified memory architecture -export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100} +# export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100} # MI355X memory pool settings -export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-8388608} # 8MB (smaller than discrete GPUs) +# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-8388608} # 8MB (smaller than discrete GPUs) # ----------------- MI355X RCCL optimizations ----------------- # APU may have different interconnect characteristics -# Keep base_env.sh settings unless testing shows otherwise +# Keep common_network.sh settings unless testing shows otherwise -log_exported_vars "MI355X-specific optimizations" \ - HSA_XNACK HSA_ENABLE_INTERRUPT GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE +# log_exported_vars "MI355X-specific optimizations" \ +# HSA_XNACK HSA_ENABLE_INTERRUPT GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE From 0d60c9be9d62ca55858a77284cd0f07d03a10866 Mon Sep 17 00:00:00 2001 From: Xiaoming-AMD Date: Thu, 13 Nov 2025 01:16:59 -0600 Subject: [PATCH 3/3] refactor(envs): consolidate configuration files for simplicity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merge common_network.sh, perf_tuning.sh, and detect_gpu.sh into base files to reduce file count and simplify configuration structure. Changes: - Merge common_network.sh and perf_tuning.sh into base_env.sh - Merge detect_gpu.sh into primus-env.sh - Delete redundant files: common_network.sh, perf_tuning.sh, detect_gpu.sh - Simplify primus-env.sh loading order (2 layers instead of 4) Result: - File count: 10 → 7 files (-30%) - Cleaner directory structure - Faster loading (fewer source calls) - Better integration (GPU detection uses logging functions) - Improved error handling (GPU detection doesn't exit on failure) --- runner/helpers/envs/base_env.sh | 146 +++++++++++++++++++++++++- runner/helpers/envs/common_network.sh | 83 --------------- runner/helpers/envs/detect_gpu.sh | 49 --------- runner/helpers/envs/perf_tuning.sh | 83 --------------- runner/helpers/envs/primus-env.sh | 51 ++++++--- 5 files changed, 178 insertions(+), 234 deletions(-) delete mode 100644 runner/helpers/envs/common_network.sh delete mode 100755 runner/helpers/envs/detect_gpu.sh delete mode 100644 runner/helpers/envs/perf_tuning.sh diff --git a/runner/helpers/envs/base_env.sh b/runner/helpers/envs/base_env.sh index a4f60979..7f427cf2 100755 --- a/runner/helpers/envs/base_env.sh +++ b/runner/helpers/envs/base_env.sh @@ -8,12 +8,17 @@ # ============================================================================= # Base Environment Configuration # ============================================================================= -# This file provides the foundation for all environment configurations: +# This file provides all environment configurations for Primus: # - Logging functions (LOG_INFO, LOG_INFO_RANK0, LOG_ERROR, etc.) # - Distributed training cluster information (MASTER_ADDR, NNODES, etc.) -# - Python path setup +# - Python path setup and data paths +# - NCCL and network settings +# - RCCL communication library settings +# - AMD GPU optimizations +# - General performance tuning +# - Transformer Engine optimizations # -# Network, performance tuning, and GPU-specific settings are loaded separately +# GPU-specific settings can override these in GPU model files (e.g., MI300X.sh) # ============================================================================= # --------------------------------------------------------------------------- @@ -86,6 +91,12 @@ log_exported_vars "Training Cluster Info" \ # Python Path Setup # --------------------------------------------------------------------------- PRIMUS_PATH=$(cd "$SCRIPT_DIR/../../.." && pwd) +export PRIMUS_PATH + +# Set data paths +export DATA_PATH=${DATA_PATH:-"${PRIMUS_PATH}/data"} +export HF_HOME=${HF_HOME:-"${DATA_PATH}/huggingface"} + site_packages=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])" 2>/dev/null || echo "") if [[ -n "$site_packages" ]]; then export PYTHONPATH="${PRIMUS_PATH}:${site_packages}:${PYTHONPATH:-}" @@ -93,4 +104,131 @@ else export PYTHONPATH="${PRIMUS_PATH}:${PYTHONPATH:-}" fi -log_exported_vars "Python Path" PYTHONPATH +log_exported_vars "Python Path and Data Paths" \ + PRIMUS_PATH DATA_PATH HF_HOME PYTHONPATH + +# ============================================================================= +# NCCL and Network Configuration +# ============================================================================= + +# Set visible GPUs for the current node (0 to GPUS_PER_NODE-1) +HIP_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS_PER_NODE - 1))) +export HIP_VISIBLE_DEVICES + +# ----------------- NCCL and Network Settings ----------------- + +# NCCL logging level: VERSION, WARN, INFO, DEBUG, TRACE +# Set to empty for default behavior, or specify level for debugging +export NCCL_DEBUG=${NCCL_DEBUG:-} + +# Disable NCCL internal checks to reduce overhead +export NCCL_CHECKS_DISABLE=1 + +# Set InfiniBand GID index for NCCL communication +export NCCL_IB_GID_INDEX=3 + +# Disable cross NIC communication for NCCL +export NCCL_CROSS_NIC=0 + +# Dynamically get InfiniBand Host Channel Adapter index for NCCL if not set +if [ -z "${NCCL_IB_HCA}" ]; then + NCCL_IB_HCA=$(bash "${SCRIPT_DIR}/get_nccl_ib_hca.sh" 2>/dev/null || echo "") +fi +export NCCL_IB_HCA + +# Dynamically get network interface IP address for socket communication if not set +if [ -z "${IP_INTERFACE}" ]; then + IP_INTERFACE=$(bash "${SCRIPT_DIR}/get_ip_interface.sh" 2>/dev/null || hostname -I | awk '{print $1}') +fi +export IP_INTERFACE + +# Set network interfaces for NCCL and Gloo, fallback to detected IP_INTERFACE +export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$IP_INTERFACE} +export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$IP_INTERFACE} + +# ----------------- RCCL Settings (AMD ROCm Communication Library) ----------------- + +# Disable MSCCL (RCCL multi-connection feature) for better stability +export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0} +export RCCL_MSCCLPP_ENABLE=${RCCL_MSCCLPP_ENABLE:-0} +export RCCL_MSCCLPP_FORCE_ENABLE=${RCCL_MSCCLPP_FORCE_ENABLE:-0} +export RCCL_MSCCLPP_THRESHOLD=${RCCL_MSCCLPP_THRESHOLD:-$((1*1024*1024*1024))} # default 1GB + +# https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/env.hpp#L82-L87 +export MSCCLPP_DISABLE_CHANNEL_CACHE=${MSCCLPP_DISABLE_CHANNEL_CACHE:-FALSE} + +# PyTorch needs this env to enable register comm +export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK=${TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK:-0} + +log_exported_vars "NCCL and Network Settings" \ + HIP_VISIBLE_DEVICES NCCL_DEBUG NCCL_CHECKS_DISABLE NCCL_IB_GID_INDEX \ + NCCL_CROSS_NIC NCCL_IB_HCA IP_INTERFACE NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME + +log_exported_vars "RCCL Settings" \ + RCCL_MSCCL_ENABLE RCCL_MSCCLPP_ENABLE RCCL_MSCCLPP_FORCE_ENABLE RCCL_MSCCLPP_THRESHOLD \ + MSCCLPP_DISABLE_CHANNEL_CACHE TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK + +# ============================================================================= +# Performance Tuning Configuration +# ============================================================================= + +# ----------------- AMD-specific GPU optimizations ----------------- +# Enable system DMA engine (SDMA) on AMD GPUs for better IO throughput +export HSA_ENABLE_SDMA=${HSA_ENABLE_SDMA:-1} + +# Prevent scratch memory from being reclaimed to stabilize large memory usage +# NOTE: Must disable scratch reclaim to avoid MoE training crash on AMD GPUs +# Setting this to 0 prevents core dumps when using Mixture-of-Experts (MoE) models +export HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM:-0} + +log_exported_vars "AMD GPU Optimizations" \ + HSA_ENABLE_SDMA HSA_NO_SCRATCH_RECLAIM + +# ----------------- General Performance Tuning ----------------- +# Limit GPU hardware queues to 2 for performance stability +export GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES:-2} + +# Increase HSA kernarg pool size to 12MB for models with many kernels (optional, can be set by GPU-specific configs) +# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} + +# Enable NUMA binding for better memory locality (may increase stability for large models) +export ENABLE_NUMA_BINDING=${ENABLE_NUMA_BINDING:-0} + +# Limit max CUDA device connections to reduce PCIe traffic +export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1} + +# Prioritize NCCL communication for PyTorch for higher throughput +export TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY:-1} + +# ----------------- NCCL Performance Settings ----------------- +# In multi-node training, PXN can be enabled to improve inter-node all-to-all +# communication efficiency, but it will increase GPU memory usage. +# Default: disable PXN for NCCL +export NCCL_PXN_DISABLE=${NCCL_PXN_DISABLE:-1} +export NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE:-524288} + +log_exported_vars "General Performance Tuning" \ + GPU_MAX_HW_QUEUES ENABLE_NUMA_BINDING CUDA_DEVICE_MAX_CONNECTIONS \ + TORCH_NCCL_HIGH_PRIORITY NCCL_PXN_DISABLE NCCL_P2P_NET_CHUNKSIZE + +# ----------------- Transformer Engine Optimizations ----------------- +# Optimize NVTE fp8 cast transpose +export NVTE_USE_CAST_TRANSPOSE_TRITON=${NVTE_USE_CAST_TRANSPOSE_TRITON:-1} +export NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE=${NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE:-0} + +# Note: Disable v3 due to accuracy issues. Will fix after TE version 2.1. +export NVTE_CK_USES_BWD_V3=${NVTE_CK_USES_BWD_V3:-0} + +# Note: Disable fp32 atomic if you find any accuracy issue +export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=${PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32:-0} + +# NVTE debug envs +export NVTE_DEBUG=${NVTE_DEBUG:-0} # 0, 1 +export NVTE_DEBUG_LEVEL=${NVTE_DEBUG_LEVEL:-0} # 0, 1, 2 +export NVTE_FUSED_ATTN_LOG_CONFIG=${NVTE_FUSED_ATTN_LOG_CONFIG:-0} # 0, 1 +export PATCH_TE_FLASH_ATTN=${PATCH_TE_FLASH_ATTN:-0} + +log_exported_vars "Transformer Engine Optimizations" \ + NVTE_USE_CAST_TRANSPOSE_TRITON NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE \ + NVTE_CK_USES_BWD_V3 PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32 \ + NVTE_DEBUG NVTE_DEBUG_LEVEL NVTE_FUSED_ATTN_LOG_CONFIG PATCH_TE_FLASH_ATTN diff --git a/runner/helpers/envs/common_network.sh b/runner/helpers/envs/common_network.sh deleted file mode 100644 index 23af6884..00000000 --- a/runner/helpers/envs/common_network.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env bash -############################################################################### -# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. -# -# See LICENSE for license information. -############################################################################### - -# ============================================================================= -# NCCL and Network Configuration -# ============================================================================= -# This file contains all network-related environment variable settings -# for distributed training with NCCL and communication libraries. -# ============================================================================= - -# Dependency check: ensure base_env.sh has been loaded -if [[ -z "${GPUS_PER_NODE}" ]]; then - echo "[ERROR] GPUS_PER_NODE not set. base_env.sh must be loaded first." >&2 - exit 1 -fi - -if ! declare -f log_exported_vars >/dev/null 2>&1; then - echo "[ERROR] log_exported_vars function not found. base_env.sh must be loaded first." >&2 - exit 1 -fi - -# Set visible GPUs for the current node (0 to GPUS_PER_NODE-1) -HIP_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS_PER_NODE - 1))) -export HIP_VISIBLE_DEVICES - -# ----------------- NCCL and Network Settings ----------------- - -# NCCL logging level: VERSION, WARN, INFO, DEBUG, TRACE -# Set to empty for default behavior, or specify level for debugging -export NCCL_DEBUG=${NCCL_DEBUG:-} - -# Disable NCCL internal checks to reduce overhead -export NCCL_CHECKS_DISABLE=1 - -# Set InfiniBand GID index for NCCL communication -export NCCL_IB_GID_INDEX=3 - -# Disable cross NIC communication for NCCL -export NCCL_CROSS_NIC=0 - -# Dynamically get InfiniBand Host Channel Adapter index for NCCL if not set -if [ -z "${NCCL_IB_HCA}" ]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - NCCL_IB_HCA=$(bash "${SCRIPT_DIR}/get_nccl_ib_hca.sh" 2>/dev/null || echo "") -fi -export NCCL_IB_HCA - -# Dynamically get network interface IP address for socket communication if not set -if [ -z "${IP_INTERFACE}" ]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - IP_INTERFACE=$(bash "${SCRIPT_DIR}/get_ip_interface.sh" 2>/dev/null || hostname -I | awk '{print $1}') -fi -export IP_INTERFACE - -# Set network interfaces for NCCL and Gloo, fallback to detected IP_INTERFACE -export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$IP_INTERFACE} -export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$IP_INTERFACE} - -# ----------------- RCCL Settings (AMD ROCm Communication Library) ----------------- - -# Disable MSCCL (RCCL multi-connection feature) for better stability -export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0} -export RCCL_MSCCLPP_ENABLE=${RCCL_MSCCLPP_ENABLE:-0} -export RCCL_MSCCLPP_FORCE_ENABLE=${RCCL_MSCCLPP_FORCE_ENABLE:-0} -export RCCL_MSCCLPP_THRESHOLD=${RCCL_MSCCLPP_THRESHOLD:-$((1*1024*1024*1024))} # default 1GB - -# https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/env.hpp#L82-L87 -export MSCCLPP_DISABLE_CHANNEL_CACHE=${MSCCLPP_DISABLE_CHANNEL_CACHE:-FALSE} - -# PyTorch needs this env to enable register comm -export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK=${TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK:-0} - -log_exported_vars "NCCL and Network Settings" \ - HIP_VISIBLE_DEVICES NCCL_DEBUG NCCL_CHECKS_DISABLE NCCL_IB_GID_INDEX \ - NCCL_CROSS_NIC NCCL_IB_HCA IP_INTERFACE NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME - -log_exported_vars "RCCL Settings" \ - RCCL_MSCCL_ENABLE RCCL_MSCCLPP_ENABLE RCCL_MSCCLPP_FORCE_ENABLE RCCL_MSCCLPP_THRESHOLD \ - MSCCLPP_DISABLE_CHANNEL_CACHE TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK diff --git a/runner/helpers/envs/detect_gpu.sh b/runner/helpers/envs/detect_gpu.sh deleted file mode 100755 index 1e9309a5..00000000 --- a/runner/helpers/envs/detect_gpu.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -############################################################################### -# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. -# -# See LICENSE for license information. -############################################################################### -# -# Detect GPU Model Script -# Uses rocm-smi to detect AMD GPU model (MI300, MI355, etc.) -# - -detect_gpu_model() { - local gpu_model - gpu_model="unknown" - - # Check if rocm-smi is available - if ! command -v rocm-smi &> /dev/null; then - echo "Error: rocm-smi not found. Is ROCm installed?" >&2 - return 1 - fi - - # Get product name from rocm-smi - local product_name - product_name=$(rocm-smi --showproductname 2>/dev/null | grep -i "Card series" | head -n1 | awk '{print $NF}') - - # If that doesn't work, try alternative method - if [[ -z "$product_name" ]]; then - product_name=$(rocm-smi --showproductname 2>/dev/null | grep -oP 'MI\d+[A-Z]*' | head -n1) - fi - - # Extract model identifier (MI300, MI355, etc.) - if [[ "$product_name" =~ MI([0-9]+)([A-Z]*) ]]; then - gpu_model="MI${BASH_REMATCH[1]}${BASH_REMATCH[2]}" - fi - - echo "$gpu_model" -} - -# Execute detection -GPU_MODEL=$(detect_gpu_model) - -# Output result -echo "$GPU_MODEL" - -# Exit with error if detection failed -if [[ "$GPU_MODEL" == "unknown" ]]; then - echo "Warning: Unable to detect GPU model. Using default configuration." >&2 - exit 1 -fi diff --git a/runner/helpers/envs/perf_tuning.sh b/runner/helpers/envs/perf_tuning.sh deleted file mode 100644 index bddb8cb1..00000000 --- a/runner/helpers/envs/perf_tuning.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env bash -############################################################################### -# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. -# -# See LICENSE for license information. -############################################################################### - -# ============================================================================= -# Performance Tuning Configuration -# ============================================================================= -# This file contains all performance-related settings including: -# - AMD-specific GPU optimizations (HSA, RCCL) -# - General performance tuning (GPU queues, NUMA, CUDA connections) -# - NCCL performance settings (PXN, P2P) -# - Transformer Engine optimizations (NVTE) -# ============================================================================= - -# Dependency check: ensure base_env.sh has been loaded -if ! declare -f log_exported_vars >/dev/null 2>&1; then - echo "[ERROR] log_exported_vars function not found. base_env.sh must be loaded first." >&2 - exit 1 -fi - -# ----------------- AMD-specific GPU optimizations ----------------- -# Enable system DMA engine (SDMA) on AMD GPUs for better IO throughput -export HSA_ENABLE_SDMA=${HSA_ENABLE_SDMA:-1} - -# Prevent scratch memory from being reclaimed to stabilize large memory usage -# NOTE: Must disable scratch reclaim to avoid MoE training crash on AMD GPUs -# Setting this to 0 prevents core dumps when using Mixture-of-Experts (MoE) models -export HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM:-0} - -log_exported_vars "AMD GPU Optimizations" \ - HSA_ENABLE_SDMA HSA_NO_SCRATCH_RECLAIM - -# ----------------- General Performance Tuning ----------------- -# Limit GPU hardware queues to 2 for performance stability -export GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES:-2} - -# Increase HSA kernarg pool size to 12MB for models with many kernels (optional, can be set by GPU-specific configs) -# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912} - -# Enable NUMA binding for better memory locality (may increase stability for large models) -export ENABLE_NUMA_BINDING=${ENABLE_NUMA_BINDING:-0} - -# Limit max CUDA device connections to reduce PCIe traffic -export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1} - -# Prioritize NCCL communication for PyTorch for higher throughput -export TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY:-1} - -# ----------------- NCCL Performance Settings ----------------- -# In multi-node training, PXN can be enabled to improve inter-node all-to-all -# communication efficiency, but it will increase GPU memory usage. -# Default: disable PXN for NCCL -export NCCL_PXN_DISABLE=${NCCL_PXN_DISABLE:-1} -export NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE:-524288} - -log_exported_vars "General Performance Tuning" \ - GPU_MAX_HW_QUEUES ENABLE_NUMA_BINDING CUDA_DEVICE_MAX_CONNECTIONS \ - TORCH_NCCL_HIGH_PRIORITY NCCL_PXN_DISABLE NCCL_P2P_NET_CHUNKSIZE - -# ----------------- Transformer Engine Optimizations ----------------- -# Optimize NVTE fp8 cast transpose -export NVTE_USE_CAST_TRANSPOSE_TRITON=${NVTE_USE_CAST_TRANSPOSE_TRITON:-1} -export NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE=${NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE:-0} - -# Note: Disable v3 due to accuracy issues. Will fix after TE version 2.1. -export NVTE_CK_USES_BWD_V3=${NVTE_CK_USES_BWD_V3:-0} - -# Note: Disable fp32 atomic if you find any accuracy issue -export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=${PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32:-0} - -# NVTE debug envs -export NVTE_DEBUG=${NVTE_DEBUG:-0} # 0, 1 -export NVTE_DEBUG_LEVEL=${NVTE_DEBUG_LEVEL:-0} # 0, 1, 2 -export NVTE_FUSED_ATTN_LOG_CONFIG=${NVTE_FUSED_ATTN_LOG_CONFIG:-0} # 0, 1 -export PATCH_TE_FLASH_ATTN=${PATCH_TE_FLASH_ATTN:-0} - -log_exported_vars "Transformer Engine Optimizations" \ - NVTE_USE_CAST_TRANSPOSE_TRITON NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE \ - NVTE_CK_USES_BWD_V3 PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32 \ - NVTE_DEBUG NVTE_DEBUG_LEVEL NVTE_FUSED_ATTN_LOG_CONFIG PATCH_TE_FLASH_ATTN diff --git a/runner/helpers/envs/primus-env.sh b/runner/helpers/envs/primus-env.sh index b9530c5f..7af0ad2f 100755 --- a/runner/helpers/envs/primus-env.sh +++ b/runner/helpers/envs/primus-env.sh @@ -9,10 +9,8 @@ # Primus Environment Setup - Layered Configuration Loading # ============================================================================= # Load order: -# 1. base_env.sh - Base configuration (logging, cluster info, pythonpath) -# 2. common_network.sh - Network and NCCL settings -# 3. perf_tuning.sh - Performance tuning and optimizations -# 4. .sh - GPU-specific overrides (e.g., MI300X.sh, MI325X.sh) +# 1. base_env.sh - All base configurations (cluster, network, performance, pythonpath) +# 2. .sh - GPU-specific overrides (e.g., MI300X.sh, MI325X.sh) # # Environment Variables: # PRIMUS_DEBUG=1 - Enable debug mode (set -x, verbose output) @@ -27,7 +25,7 @@ fi SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# 1. Load base environment (logging, cluster info, pythonpath) +# 1. Load base environment (includes all configurations) # shellcheck source=runner/helpers/envs/base_env.sh # shellcheck disable=SC1091 source "${SCRIPT_DIR}/base_env.sh" @@ -35,18 +33,41 @@ source "${SCRIPT_DIR}/base_env.sh" LOG_INFO_RANK0 "" LOG_INFO_RANK0 "=== Loading Primus Environment Configuration ===" -# 2. Load common network configuration -# shellcheck source=runner/helpers/envs/common_network.sh -# shellcheck disable=SC1091 -source "${SCRIPT_DIR}/common_network.sh" +# 2. Detect GPU model and load device-specific configuration -# 3. Load performance tuning configuration -# shellcheck source=runner/helpers/envs/perf_tuning.sh -# shellcheck disable=SC1091 -source "${SCRIPT_DIR}/perf_tuning.sh" +# GPU detection function +detect_gpu_model() { + local gpu_model + gpu_model="unknown" + + # Check if rocm-smi is available + if ! command -v rocm-smi &> /dev/null; then + echo "Error: rocm-smi not found. Is ROCm installed?" >&2 + echo "unknown" + return 1 + fi + + # Get product name from rocm-smi + local product_name + product_name=$(rocm-smi --showproductname 2>/dev/null | grep -i "Card series" | head -n1 | awk '{print $NF}') -# 4. Detect GPU model and load device-specific configuration -GPU_MODEL=$(bash "${SCRIPT_DIR}/detect_gpu.sh") + # If that doesn't work, try alternative method + if [[ -z "$product_name" ]]; then + product_name=$(rocm-smi --showproductname 2>/dev/null | grep -oP 'MI\d+[A-Z]*' | head -n1) + fi + + # Extract model identifier (MI300, MI355, etc.) + if [[ "$product_name" =~ MI([0-9]+)([A-Z]*) ]]; then + gpu_model="MI${BASH_REMATCH[1]}${BASH_REMATCH[2]}" + fi + + echo "$gpu_model" +} + +GPU_MODEL=$(detect_gpu_model) +if [[ "$GPU_MODEL" == "unknown" ]]; then + LOG_WARN "Unable to detect GPU model. Using default configuration." +fi LOG_INFO_RANK0 "Detected GPU model: ${GPU_MODEL}" GPU_CONFIG_FILE="${SCRIPT_DIR}/${GPU_MODEL}.sh"