
Commit bdcc2de

feature(cli): refactor environment configuration with layered design (#280)
- Create runner/helpers/envs/ directory with modular configuration
  - base_env.sh: Base configuration (logging, cluster info, PYTHONPATH)
  - primus-env.sh: Main entry point with layered loading
  - MI300X.sh, MI325X.sh, MI355X.sh: GPU-specific configurations
  - get_ip_interface.sh, get_nccl_ib_hca.sh: Network detection utilities
- Refactor configuration loading with clear responsibilities
  - Implement dependency checks between config files
  - Add validation with validate_distributed_params()
  - Support debug mode (PRIMUS_DEBUG=1)
  - Support validation skip (PRIMUS_SKIP_VALIDATION=1)
- Add comprehensive unit tests for primus-env.sh
  - 10 test cases covering all core functionality
  - Tests for validation, debug mode, defaults, error detection
  - 100% test pass rate (10/10)
- Update test runner to include new test suite
  - Add test_primus_env.sh to run_all_tests.sh

Architecture improvements:
- Clear separation of concerns (base, network, perf, GPU-specific)
- Better maintainability with modular design
- Robust error handling and validation
- Production-ready configuration system
1 parent 5a03df4 commit bdcc2de
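
How the layered entry point is meant to be used, per the commit message above. This is a minimal sketch: the runner's exact wiring is not shown in this diff, and the load order (base, then network, then GPU-specific) is inferred from the description.

    # Normal use: source the single entry point; it layers base_env.sh and the
    # GPU-specific file on top.
    source runner/helpers/envs/primus-env.sh

    # Debug mode: print detailed information about the layered loading.
    export PRIMUS_DEBUG=1
    source runner/helpers/envs/primus-env.sh

    # Skip validate_distributed_params(), e.g., for single-node smoke tests.
    export PRIMUS_SKIP_VALIDATION=1
    source runner/helpers/envs/primus-env.sh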

File tree

10 files changed: +848 −159 lines changed

runner/helpers/envs/MI300X.sh

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
#!/bin/bash
###############################################################################
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
#
# AMD MI300X GPU-specific optimizations
# Note: Common settings are in base_env.sh. This file only contains MI300X-specific overrides.
#

LOG_INFO_RANK0 "Loading MI300X-specific optimizations..."

# ----------------- MI300X-specific GPU settings -----------------
# MI300X has 192GB HBM3, disable XNACK for performance
# export HSA_XNACK=${HSA_XNACK:-0}

# Optimize memory allocation for large models
# export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100}

# MI300X-specific memory optimizations
# Increase HSA kernarg pool size for large model workloads (12MB)
# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912}

# ----------------- MI300X RCCL optimizations -----------------
# MI300X works well with MSCCLPP disabled (already set in common_network.sh)
# Override here only if needed for specific MI300X workloads

# Uncomment to enable MSCCLPP for MI300X if tested and verified
# export RCCL_MSCCLPP_ENABLE=1
# export RCCL_MSCCLPP_FORCE_ENABLE=1

# log_exported_vars "MI300X-specific optimizations" \
#     HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE
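
The settings in these GPU files and in base_env.sh all use the ${VAR:-default} idiom, so values exported before sourcing take precedence over the files' defaults. A small sketch of the intended override flow (paths as in this commit; the chosen value is illustrative only):

    # Pre-set a variable, then source the environment: the :-default form does
    # not clobber an existing value.
    export GPU_MAX_HW_QUEUES=4            # user override (illustrative value)
    source runner/helpers/envs/primus-env.sh
    echo "$GPU_MAX_HW_QUEUES"             # prints 4, not the base_env.sh default of 2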

runner/helpers/envs/MI325X.sh

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
#!/bin/bash
###############################################################################
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
#
# AMD MI325X GPU-specific optimizations
# Note: Common settings are in base_env.sh. This file only contains MI325X-specific overrides.
#

LOG_INFO_RANK0 "Loading MI325X-specific optimizations..."

# ----------------- MI325X-specific GPU settings -----------------
# MI325X has 256GB HBM3e (enhanced), disable XNACK for performance
# export HSA_XNACK=${HSA_XNACK:-0}

# Optimize memory allocation for larger models compared to MI300X
# export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100}

# MI325X-specific memory optimizations
# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912}

# ----------------- MI325X RCCL optimizations -----------------
# MI325X may benefit from different RCCL settings
# Override common_network.sh settings if needed for MI325X

# Uncomment to enable MSCCLPP for MI325X if tested and verified
# export RCCL_MSCCLPP_ENABLE=1
# export RCCL_MSCCLPP_FORCE_ENABLE=1

# log_exported_vars "MI325X-specific optimizations" \
#     HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE

runner/helpers/envs/MI355X.sh

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
#!/bin/bash
###############################################################################
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
#
# AMD MI355X GPU-specific optimizations
# Note: MI355X is an APU with integrated CPU and GPU, using unified memory architecture.
# Common settings are in base_env.sh. This file only contains MI355X-specific overrides.
#

LOG_INFO_RANK0 "Loading MI355X-specific optimizations..."

# ----------------- MI355X-specific GPU settings -----------------
# MI355X has 128GB unified memory (HBM + DDR)
# Enable XNACK for unified memory support (different from discrete GPUs)
# export HSA_XNACK=${HSA_XNACK:-1}

# APU-specific: Enable interrupt-driven mode for better power efficiency
# export HSA_ENABLE_INTERRUPT=${HSA_ENABLE_INTERRUPT:-1}

# Optimize memory allocation for unified memory architecture
# export GPU_MAX_HEAP_SIZE=${GPU_MAX_HEAP_SIZE:-100}

# MI355X memory pool settings
# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-8388608} # 8MB (smaller than discrete GPUs)

# ----------------- MI355X RCCL optimizations -----------------
# APU may have different interconnect characteristics
# Keep common_network.sh settings unless testing shows otherwise

# log_exported_vars "MI355X-specific optimizations" \
#     HSA_XNACK HSA_ENABLE_INTERRUPT GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE
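
primus-env.sh itself is not included in this diff; the sketch below is a hypothetical reconstruction of the layered dispatch the commit message describes. The GPU_MODEL variable and its detection are assumptions, not confirmed by this commit.

    # Hypothetical layered loader: common settings first, GPU-specific overrides last.
    ENVS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    source "${ENVS_DIR}/base_env.sh"

    GPU_MODEL=${GPU_MODEL:-MI300X}        # assume the caller or a detection step sets this
    if [[ -f "${ENVS_DIR}/${GPU_MODEL}.sh" ]]; then
        source "${ENVS_DIR}/${GPU_MODEL}.sh"
    else
        LOG_WARN "No GPU-specific config for ${GPU_MODEL}; using base settings only"
    fi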

runner/helpers/envs/base_env.sh

Lines changed: 234 additions & 0 deletions
@@ -0,0 +1,234 @@
#!/bin/bash
###############################################################################
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################

# =============================================================================
# Base Environment Configuration
# =============================================================================
# This file provides all environment configurations for Primus:
# - Logging functions (LOG_INFO, LOG_INFO_RANK0, LOG_ERROR, etc.)
# - Distributed training cluster information (MASTER_ADDR, NNODES, etc.)
# - Python path setup and data paths
# - NCCL and network settings
# - RCCL communication library settings
# - AMD GPU optimizations
# - General performance tuning
# - Transformer Engine optimizations
#
# GPU-specific settings can override these in GPU model files (e.g., MI300X.sh)
# =============================================================================

# ---------------------------------------------------------------------------
# Guard: avoid duplicate exports/logging on multiple sourcing
# ---------------------------------------------------------------------------
if [[ -n "${__PRIMUS_BASE_ENV_SOURCED:-}" ]]; then
    return 0
fi
export __PRIMUS_BASE_ENV_SOURCED=1

# ---------------------------------------------------------------------------
# Load common library for consistent logging
# ---------------------------------------------------------------------------
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -f "$SCRIPT_DIR/../../lib/common.sh" ]]; then
    # shellcheck disable=SC1091
    source "$SCRIPT_DIR/../../lib/common.sh"
else
    # Fallback logging functions if common.sh not available
    HOSTNAME="$(hostname)"
    export HOSTNAME

    LOG_INFO() {
        if [ "$*" = "" ]; then
            echo ""
        else
            echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] $*"
        fi
    }

    LOG_INFO_RANK0() {
        if [ "${NODE_RANK:-0}" -eq 0 ]; then
            if [ "$*" = "" ]; then
                echo ""
            else
                echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] $*"
            fi
        fi
    }

    LOG_ERROR() {
        echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] [ERROR] $*" >&2
    }

    LOG_WARN() {
        echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] [WARN] $*" >&2
    }

    log_exported_vars() {
        LOG_INFO_RANK0 "========== $1 =========="
        for var in "${@:2}"; do
            LOG_INFO_RANK0 "  $var=${!var-}"
        done
    }
fi

# ---------------------------------------------------------------------------
# Distributed Training Cluster Configuration
# ---------------------------------------------------------------------------
export MASTER_ADDR=${MASTER_ADDR:-localhost}
export MASTER_PORT=${MASTER_PORT:-1234}
export NNODES=${NNODES:-1}
export NODE_RANK=${NODE_RANK:-0}
export GPUS_PER_NODE=${GPUS_PER_NODE:-8}

log_exported_vars "Training Cluster Info" \
    MASTER_ADDR MASTER_PORT NNODES NODE_RANK GPUS_PER_NODE

# ---------------------------------------------------------------------------
# Python Path Setup
# ---------------------------------------------------------------------------
PRIMUS_PATH=$(cd "$SCRIPT_DIR/../../.." && pwd)
export PRIMUS_PATH

# Set data paths
export DATA_PATH=${DATA_PATH:-"${PRIMUS_PATH}/data"}
export HF_HOME=${HF_HOME:-"${DATA_PATH}/huggingface"}

site_packages=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])" 2>/dev/null || echo "")
if [[ -n "$site_packages" ]]; then
    export PYTHONPATH="${PRIMUS_PATH}:${site_packages}:${PYTHONPATH:-}"
else
    export PYTHONPATH="${PRIMUS_PATH}:${PYTHONPATH:-}"
fi

log_exported_vars "Python Path and Data Paths" \
    PRIMUS_PATH DATA_PATH HF_HOME PYTHONPATH

# =============================================================================
# NCCL and Network Configuration
# =============================================================================

# Set visible GPUs for the current node (0 to GPUS_PER_NODE-1)
HIP_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS_PER_NODE - 1)))
export HIP_VISIBLE_DEVICES

# ----------------- NCCL and Network Settings -----------------

# NCCL logging level: VERSION, WARN, INFO, DEBUG, TRACE
# Set to empty for default behavior, or specify level for debugging
export NCCL_DEBUG=${NCCL_DEBUG:-}

# Disable NCCL internal checks to reduce overhead
export NCCL_CHECKS_DISABLE=1

# Set InfiniBand GID index for NCCL communication
export NCCL_IB_GID_INDEX=3

# Disable cross NIC communication for NCCL
export NCCL_CROSS_NIC=0

# Dynamically get InfiniBand Host Channel Adapter index for NCCL if not set
if [ -z "${NCCL_IB_HCA}" ]; then
    NCCL_IB_HCA=$(bash "${SCRIPT_DIR}/get_nccl_ib_hca.sh" 2>/dev/null || echo "")
fi
export NCCL_IB_HCA

# Dynamically get network interface IP address for socket communication if not set
if [ -z "${IP_INTERFACE}" ]; then
    IP_INTERFACE=$(bash "${SCRIPT_DIR}/get_ip_interface.sh" 2>/dev/null || hostname -I | awk '{print $1}')
fi
export IP_INTERFACE

# Set network interfaces for NCCL and Gloo, fallback to detected IP_INTERFACE
export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$IP_INTERFACE}
export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$IP_INTERFACE}

# ----------------- RCCL Settings (AMD ROCm Communication Library) -----------------

# Disable MSCCL (RCCL multi-connection feature) for better stability
export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
export RCCL_MSCCLPP_ENABLE=${RCCL_MSCCLPP_ENABLE:-0}
export RCCL_MSCCLPP_FORCE_ENABLE=${RCCL_MSCCLPP_FORCE_ENABLE:-0}
export RCCL_MSCCLPP_THRESHOLD=${RCCL_MSCCLPP_THRESHOLD:-$((1*1024*1024*1024))} # default 1GB

# https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/env.hpp#L82-L87
export MSCCLPP_DISABLE_CHANNEL_CACHE=${MSCCLPP_DISABLE_CHANNEL_CACHE:-FALSE}

# PyTorch needs this env to enable register comm
export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK=${TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK:-0}

log_exported_vars "NCCL and Network Settings" \
    HIP_VISIBLE_DEVICES NCCL_DEBUG NCCL_CHECKS_DISABLE NCCL_IB_GID_INDEX \
    NCCL_CROSS_NIC NCCL_IB_HCA IP_INTERFACE NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME

log_exported_vars "RCCL Settings" \
    RCCL_MSCCL_ENABLE RCCL_MSCCLPP_ENABLE RCCL_MSCCLPP_FORCE_ENABLE RCCL_MSCCLPP_THRESHOLD \
    MSCCLPP_DISABLE_CHANNEL_CACHE TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK

# =============================================================================
# Performance Tuning Configuration
# =============================================================================

# ----------------- AMD-specific GPU optimizations -----------------
# Enable system DMA engine (SDMA) on AMD GPUs for better IO throughput
export HSA_ENABLE_SDMA=${HSA_ENABLE_SDMA:-1}

# Prevent scratch memory from being reclaimed to stabilize large memory usage
# NOTE: Must disable scratch reclaim to avoid MoE training crash on AMD GPUs
# Setting this to 0 prevents core dumps when using Mixture-of-Experts (MoE) models
export HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM:-0}

log_exported_vars "AMD GPU Optimizations" \
    HSA_ENABLE_SDMA HSA_NO_SCRATCH_RECLAIM

# ----------------- General Performance Tuning -----------------
# Limit GPU hardware queues to 2 for performance stability
export GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES:-2}

# Increase HSA kernarg pool size to 12MB for models with many kernels (optional, can be set by GPU-specific configs)
# export HSA_KERNARG_POOL_SIZE=${HSA_KERNARG_POOL_SIZE:-12582912}

# Enable NUMA binding for better memory locality (may increase stability for large models)
export ENABLE_NUMA_BINDING=${ENABLE_NUMA_BINDING:-0}

# Limit max CUDA device connections to reduce PCIe traffic
export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1}

# Prioritize NCCL communication for PyTorch for higher throughput
export TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY:-1}

# ----------------- NCCL Performance Settings -----------------
# In multi-node training, PXN can be enabled to improve inter-node all-to-all
# communication efficiency, but it will increase GPU memory usage.
# Default: disable PXN for NCCL
export NCCL_PXN_DISABLE=${NCCL_PXN_DISABLE:-1}
export NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE:-524288}

log_exported_vars "General Performance Tuning" \
    GPU_MAX_HW_QUEUES ENABLE_NUMA_BINDING CUDA_DEVICE_MAX_CONNECTIONS \
    TORCH_NCCL_HIGH_PRIORITY NCCL_PXN_DISABLE NCCL_P2P_NET_CHUNKSIZE

# ----------------- Transformer Engine Optimizations -----------------
# Optimize NVTE fp8 cast transpose
export NVTE_USE_CAST_TRANSPOSE_TRITON=${NVTE_USE_CAST_TRANSPOSE_TRITON:-1}
export NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE=${NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE:-0}

# Note: Disable v3 due to accuracy issues. Will fix after TE version 2.1.
export NVTE_CK_USES_BWD_V3=${NVTE_CK_USES_BWD_V3:-0}

# Note: Disable fp32 atomic if you find any accuracy issue
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=${PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32:-0}

# NVTE debug envs
export NVTE_DEBUG=${NVTE_DEBUG:-0} # 0, 1
export NVTE_DEBUG_LEVEL=${NVTE_DEBUG_LEVEL:-0} # 0, 1, 2
export NVTE_FUSED_ATTN_LOG_CONFIG=${NVTE_FUSED_ATTN_LOG_CONFIG:-0} # 0, 1
export PATCH_TE_FLASH_ATTN=${PATCH_TE_FLASH_ATTN:-0}

log_exported_vars "Transformer Engine Optimizations" \
    NVTE_USE_CAST_TRANSPOSE_TRITON NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE \
    NVTE_CK_USES_BWD_V3 PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32 \
    NVTE_DEBUG NVTE_DEBUG_LEVEL NVTE_FUSED_ATTN_LOG_CONFIG PATCH_TE_FLASH_ATTN
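
The validate_distributed_params() function referenced in the commit message lives in primus-env.sh and is not part of this diff. Below is a plausible sketch of the checks it might perform on the cluster variables exported above; the function body and the bypass wiring are assumptions, not the actual implementation.

    # Hypothetical sketch; only the function name and PRIMUS_SKIP_VALIDATION
    # come from the commit message.
    validate_distributed_params() {
        # Ranks and node counts must be non-negative integers.
        if ! [[ "${NNODES}" =~ ^[0-9]+$ && "${NODE_RANK}" =~ ^[0-9]+$ ]]; then
            LOG_ERROR "NNODES/NODE_RANK must be non-negative integers"
            return 1
        fi
        # Each node's rank must fall inside the cluster size.
        if (( NODE_RANK >= NNODES )); then
            LOG_ERROR "NODE_RANK (${NODE_RANK}) must be < NNODES (${NNODES})"
            return 1
        fi
        # The rendezvous port must be a valid TCP port.
        if ! [[ "${MASTER_PORT}" =~ ^[0-9]+$ ]] || (( MASTER_PORT < 1 || MASTER_PORT > 65535 )); then
            LOG_ERROR "MASTER_PORT (${MASTER_PORT}) must be a valid TCP port"
            return 1
        fi
        return 0
    }

    # Callers can bypass the check, per the commit message:
    # [[ "${PRIMUS_SKIP_VALIDATION:-0}" == "1" ]] || validate_distributed_params || exit 1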
File renamed without changes.
File renamed without changes.
