#!/bin/bash
###############################################################################
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################

# =============================================================================
# Base Environment Configuration
# =============================================================================
# This file provides all environment configurations for Primus:
# - Logging functions (LOG_INFO, LOG_INFO_RANK0, LOG_ERROR, etc.)
# - Distributed training cluster information (MASTER_ADDR, NNODES, etc.)
# - Python path setup and data paths
# - NCCL and network settings
# - RCCL communication library settings
# - AMD GPU optimizations
# - General performance tuning
# - Transformer Engine optimizations
#
# GPU-specific settings can override these in GPU model files (e.g., MI300X.sh);
# a commented example follows below.
# =============================================================================
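
# Example (hypothetical layout; file name and values below are illustrative only):
# a GPU-specific file such as MI300X.sh would typically source this base
# environment file and then override individual settings, e.g.:
#
#   #!/bin/bash
#   source "$(dirname "${BASH_SOURCE[0]}")/base_env.sh"
#   export GPU_MAX_HW_QUEUES=4               # override the base default of 2
#   export HSA_KERNARG_POOL_SIZE=12582912    # 12MB kernarg pool (see below)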

# ---------------------------------------------------------------------------
# Guard: avoid duplicate exports/logging on multiple sourcing
# ---------------------------------------------------------------------------
if [[ -n "${__PRIMUS_BASE_ENV_SOURCED:-}" ]]; then
    return 0
fi
export __PRIMUS_BASE_ENV_SOURCED=1

# ---------------------------------------------------------------------------
# Load common library for consistent logging
# ---------------------------------------------------------------------------
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -f "$SCRIPT_DIR/../../lib/common.sh" ]]; then
    # shellcheck disable=SC1091
    source "$SCRIPT_DIR/../../lib/common.sh"
else
    # Fallback logging functions if common.sh is not available
    HOSTNAME="$(hostname)"
    export HOSTNAME

    LOG_INFO() {
        if [ "$*" = "" ]; then
            echo ""
        else
            echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] $*"
        fi
    }

    LOG_INFO_RANK0() {
        if [ "${NODE_RANK:-0}" -eq 0 ]; then
            if [ "$*" = "" ]; then
                echo ""
            else
                echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] $*"
            fi
        fi
    }

    LOG_ERROR() {
        echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] [ERROR] $*" >&2
    }

    LOG_WARN() {
        echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] [WARN] $*" >&2
    }

    log_exported_vars() {
        LOG_INFO_RANK0 "========== $1 =========="
        for var in "${@:2}"; do
            LOG_INFO_RANK0 "  $var=${!var-}"
        done
    }
fi

# ---------------------------------------------------------------------------
# Distributed Training Cluster Configuration
# ---------------------------------------------------------------------------
export MASTER_ADDR=${MASTER_ADDR:-localhost}
export MASTER_PORT=${MASTER_PORT:-1234}
export NNODES=${NNODES:-1}
export NODE_RANK=${NODE_RANK:-0}
export GPUS_PER_NODE=${GPUS_PER_NODE:-8}

log_exported_vars "Training Cluster Info" \
    MASTER_ADDR MASTER_PORT NNODES NODE_RANK GPUS_PER_NODE
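
# Example launch (illustrative only; the launcher script name is hypothetical):
# on the second node of a 2-node job these variables would be set before this
# file is sourced, e.g.:
#
#   MASTER_ADDR=10.0.0.1 MASTER_PORT=29500 NNODES=2 NODE_RANK=1 GPUS_PER_NODE=8 \
#       bash your_training_launcher.sh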

# ---------------------------------------------------------------------------
# Python Path Setup
# ---------------------------------------------------------------------------
PRIMUS_PATH=$(cd "$SCRIPT_DIR/../../.." && pwd)
export PRIMUS_PATH

# Set data paths
export DATA_PATH=${DATA_PATH:-"${PRIMUS_PATH}/data"}
export HF_HOME=${HF_HOME:-"${DATA_PATH}/huggingface"}

site_packages=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])" 2>/dev/null || echo "")
if [[ -n "$site_packages" ]]; then
    export PYTHONPATH="${PRIMUS_PATH}:${site_packages}:${PYTHONPATH:-}"
else
    export PYTHONPATH="${PRIMUS_PATH}:${PYTHONPATH:-}"
fi
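
# Optional sanity check (assumes the repository's top-level package imports as
# "primus"; adjust the module name if it differs):
#
#   python -c "import primus; print('PYTHONPATH OK')"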

log_exported_vars "Python Path and Data Paths" \
    PRIMUS_PATH DATA_PATH HF_HOME PYTHONPATH

# =============================================================================
# NCCL and Network Configuration
# =============================================================================

# Set visible GPUs for the current node (0 to GPUS_PER_NODE-1)
HIP_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS_PER_NODE - 1)))
export HIP_VISIBLE_DEVICES
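
# For example, with the default GPUS_PER_NODE=8 this expands to:
#   HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7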

# ----------------- NCCL and Network Settings -----------------

# NCCL logging level: VERSION, WARN, INFO, DEBUG, TRACE
# Set to empty for default behavior, or specify a level for debugging
export NCCL_DEBUG=${NCCL_DEBUG:-}

# Disable NCCL internal checks to reduce overhead
export NCCL_CHECKS_DISABLE=1

# Set InfiniBand GID index for NCCL communication
export NCCL_IB_GID_INDEX=3

# Disable cross-NIC communication for NCCL
export NCCL_CROSS_NIC=0

# Dynamically detect the InfiniBand Host Channel Adapter (HCA) list for NCCL if not set
if [ -z "${NCCL_IB_HCA:-}" ]; then
    NCCL_IB_HCA=$(bash "${SCRIPT_DIR}/get_nccl_ib_hca.sh" 2>/dev/null || echo "")
fi
export NCCL_IB_HCA
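
# The detected value is cluster-dependent; on InfiniBand systems NCCL_IB_HCA is
# commonly a device list such as "mlx5_0,mlx5_1" (illustrative, not a required value).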

# Dynamically get the network interface IP address for socket communication if not set
if [ -z "${IP_INTERFACE:-}" ]; then
    IP_INTERFACE=$(bash "${SCRIPT_DIR}/get_ip_interface.sh" 2>/dev/null || hostname -I | awk '{print $1}')
fi
export IP_INTERFACE

# Set network interfaces for NCCL and Gloo, fallback to detected IP_INTERFACE
export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$IP_INTERFACE}
export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$IP_INTERFACE}
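
# To pin the interfaces manually, export them before sourcing this file, e.g.
# (the interface name here is illustrative and machine-specific):
#
#   export NCCL_SOCKET_IFNAME=eth0
#   export GLOO_SOCKET_IFNAME=eth0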

# ----------------- RCCL Settings (AMD ROCm Communication Collectives Library) -----------------

# Disable MSCCL and MSCCL++ (Microsoft collective communication extensions in RCCL)
# by default for better stability
export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
export RCCL_MSCCLPP_ENABLE=${RCCL_MSCCLPP_ENABLE:-0}
export RCCL_MSCCLPP_FORCE_ENABLE=${RCCL_MSCCLPP_FORCE_ENABLE:-0}
export RCCL_MSCCLPP_THRESHOLD=${RCCL_MSCCLPP_THRESHOLD:-$((1*1024*1024*1024))} # default 1GB
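
# Example override (illustrative; the exact semantics of this threshold depend on
# the RCCL/MSCCL++ version in use):
#
#   export RCCL_MSCCLPP_THRESHOLD=$((512*1024*1024))   # 512MB instead of the 1GB default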

# https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/env.hpp#L82-L87
export MSCCLPP_DISABLE_CHANNEL_CACHE=${MSCCLPP_DISABLE_CHANNEL_CACHE:-FALSE}

# PyTorch uses this env to register caching-allocator tensors with NCCL
# (communication buffer registration); disabled by default here
export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK=${TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK:-0}

log_exported_vars "NCCL and Network Settings" \
    HIP_VISIBLE_DEVICES NCCL_DEBUG NCCL_CHECKS_DISABLE NCCL_IB_GID_INDEX \
    NCCL_CROSS_NIC NCCL_IB_HCA IP_INTERFACE NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME

log_exported_vars "RCCL Settings" \
    RCCL_MSCCL_ENABLE RCCL_MSCCLPP_ENABLE RCCL_MSCCLPP_FORCE_ENABLE RCCL_MSCCLPP_THRESHOLD \
    MSCCLPP_DISABLE_CHANNEL_CACHE TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK

# =============================================================================
# Performance Tuning Configuration
# =============================================================================

# ----------------- AMD-specific GPU optimizations -----------------
# Enable system DMA engine (SDMA) on AMD GPUs for better IO throughput
export HSA_ENABLE_SDMA=${HSA_ENABLE_SDMA:-1}

# HSA_NO_SCRATCH_RECLAIM controls whether the ROCm runtime reclaims GPU scratch memory:
# 1 keeps scratch allocated (no reclaim), 0 allows reclaim. Preventing scratch reclaim
# has been needed to stabilize large-memory Mixture-of-Experts (MoE) training on AMD
# GPUs and avoid crashes/core dumps; the default here leaves reclaim enabled, and
# GPU-specific or workload-specific configs may override it.
export HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM:-0}

log_exported_vars "AMD GPU Optimizations" \
    HSA_ENABLE_SDMA HSA_NO_SCRATCH_RECLAIM

# ----------------- General Performance Tuning -----------------
# Limit GPU hardware queues to 2 for performance stability
export GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES:-2}

# Increase HSA kernarg pool size to 12MB for models with many kernels (optional, can be set by GPU-specific configs)
# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912}

# Enable NUMA binding for better memory locality (may increase stability for large models)
export ENABLE_NUMA_BINDING=${ENABLE_NUMA_BINDING:-0}

# Limit max CUDA device connections to reduce PCIe traffic
export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1}

# Run PyTorch's NCCL communication on high-priority CUDA streams for higher throughput
export TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY:-1}

# ----------------- NCCL Performance Settings -----------------
# In multi-node training, PXN can be enabled to improve inter-node all-to-all
# communication efficiency, but it will increase GPU memory usage.
# Default: disable PXN for NCCL
export NCCL_PXN_DISABLE=${NCCL_PXN_DISABLE:-1}
export NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE:-524288}

log_exported_vars "General Performance Tuning" \
    GPU_MAX_HW_QUEUES ENABLE_NUMA_BINDING CUDA_DEVICE_MAX_CONNECTIONS \
    TORCH_NCCL_HIGH_PRIORITY NCCL_PXN_DISABLE NCCL_P2P_NET_CHUNKSIZE

# ----------------- Transformer Engine Optimizations -----------------
# Optimize NVTE fp8 cast transpose
export NVTE_USE_CAST_TRANSPOSE_TRITON=${NVTE_USE_CAST_TRANSPOSE_TRITON:-1}
export NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE=${NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE:-0}

# Note: The CK bwd v3 kernel is disabled due to accuracy issues; to be fixed after TE version 2.1.
export NVTE_CK_USES_BWD_V3=${NVTE_CK_USES_BWD_V3:-0}

# Note: Disable fp32 atomics if you encounter any accuracy issues
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=${PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32:-0}

# NVTE debug envs
export NVTE_DEBUG=${NVTE_DEBUG:-0}                                 # 0, 1
export NVTE_DEBUG_LEVEL=${NVTE_DEBUG_LEVEL:-0}                     # 0, 1, 2
export NVTE_FUSED_ATTN_LOG_CONFIG=${NVTE_FUSED_ATTN_LOG_CONFIG:-0} # 0, 1
export PATCH_TE_FLASH_ATTN=${PATCH_TE_FLASH_ATTN:-0}
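
# Example (illustrative; the launcher script name is hypothetical): enable verbose
# Transformer Engine logging for a single run without editing this file:
#
#   NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 NVTE_FUSED_ATTN_LOG_CONFIG=1 bash your_training_launcher.sh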

log_exported_vars "Transformer Engine Optimizations" \
    NVTE_USE_CAST_TRANSPOSE_TRITON NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE \
    NVTE_CK_USES_BWD_V3 PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32 \
    NVTE_DEBUG NVTE_DEBUG_LEVEL NVTE_FUSED_ATTN_LOG_CONFIG PATCH_TE_FLASH_ATTN