Skip to content

Commit 25684a9

Browse files
committed
feat(envs): refactor environment configuration with layered design
- Create runner/helpers/envs/ directory with modular configuration - base_env.sh: Base configuration (logging, cluster info, pythonpath) - common_network.sh: Network and communication settings (NCCL, RCCL) - perf_tuning.sh: Performance tuning and optimizations - primus-env.sh: Main entry point with layered loading - detect_gpu.sh: GPU model detection - MI300X.sh, MI325X.sh, MI355X.sh: GPU-specific configurations - get_ip_interface.sh, get_nccl_ib_hca.sh: Network detection utilities - Refactor configuration loading with clear responsibilities - Implement dependency checks between config files - Add validation with validate_distributed_params() - Support debug mode (PRIMUS_DEBUG=1) - Support validation skip (PRIMUS_SKIP_VALIDATION=1) - Move RCCL configuration from perf_tuning.sh to common_network.sh - Better categorization: communication vs performance - Unified location for NCCL and RCCL settings - Rename env_common_network.sh to common_network.sh - Consistent naming with other config files - Move validation function to runner/lib/validation.sh - Eliminate duplicate code with DRY principle - Remove unnecessary wrapper function (YAGNI, KISS) - Add comprehensive unit tests for primus-env.sh - 10 test cases covering all core functionality - Tests for validation, debug mode, defaults, error detection - 100% test pass rate (10/10) - Update test runner to include new test suite - Add test_primus_env.sh to run_all_tests.sh Architecture improvements: - Clear separation of concerns (base, network, perf, gpu-specific) - Better maintainability with modular design - Robust error handling and validation - Production-ready configuration system
1 parent 7f74177 commit 25684a9

File tree

13 files changed

+904
-159
lines changed

13 files changed

+904
-159
lines changed

runner/helpers/envs/MI300X.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
###############################################################################
3+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
4+
#
5+
# See LICENSE for license information.
6+
###############################################################################
7+
#
8+
# AMD MI300X GPU-specific optimizations
9+
# Note: Common settings are in base_env.sh. This file only contains MI300X-specific overrides.
10+
#
11+
12+
LOG_INFO_RANK0 "Loading MI300X-specific optimizations..."

# ----------------- MI300X-specific GPU settings -----------------
# Every value below is only a default: a non-empty value already present in
# the environment is kept as-is.

# XNACK off by default (file rationale: MI300X has 192GB HBM3, so XNACK is
# disabled for performance).
: "${HSA_XNACK:=0}"

# Heap size setting used for large-model memory allocation.
: "${GPU_MAX_HEAP_SIZE:=100}"

# HSA kernarg pool size: 12582912 bytes = 12 * 1024 * 1024 (12MB), sized for
# large model workloads.
: "${HSA_KERNARG_POOL_SIZE:=12582912}"

export HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE

# ----------------- MI300X RCCL optimizations -----------------
# MSCCLPP stays disabled by default; that default is set together with the
# other RCCL settings in common_network.sh. Override here only if a specific
# MI300X workload needs it.

# Uncomment to enable MSCCLPP for MI300X if tested and verified
# export RCCL_MSCCLPP_ENABLE=1
# export RCCL_MSCCLPP_FORCE_ENABLE=1

log_exported_vars "MI300X-specific optimizations" \
  HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE

runner/helpers/envs/MI325X.sh

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
2+
###############################################################################
3+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
4+
#
5+
# See LICENSE for license information.
6+
###############################################################################
7+
#
8+
# AMD MI325X GPU-specific optimizations
9+
# Note: Common settings are in base_env.sh. This file only contains MI325X-specific overrides.
10+
#
11+
12+
LOG_INFO_RANK0 "Loading MI325X-specific optimizations..."

# ----------------- MI325X-specific GPU settings -----------------
# Every value below is only a default: a non-empty value already present in
# the environment is kept as-is.

# XNACK off by default (file rationale: MI325X has 256GB HBM3e, so XNACK is
# disabled for performance).
: "${HSA_XNACK:=0}"

# Heap size setting used for large-model memory allocation (same default as
# the MI300X profile).
: "${GPU_MAX_HEAP_SIZE:=100}"

# HSA kernarg pool size: 12582912 bytes = 12 * 1024 * 1024 (12MB).
: "${HSA_KERNARG_POOL_SIZE:=12582912}"

export HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE

# ----------------- MI325X RCCL optimizations -----------------
# MI325X may benefit from different RCCL settings. The shared RCCL defaults
# are set in common_network.sh; override them here if needed for MI325X.

# Uncomment to enable MSCCLPP for MI325X if tested and verified
# export RCCL_MSCCLPP_ENABLE=1
# export RCCL_MSCCLPP_FORCE_ENABLE=1

log_exported_vars "MI325X-specific optimizations" \
  HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE

runner/helpers/envs/MI355X.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
###############################################################################
3+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
4+
#
5+
# See LICENSE for license information.
6+
###############################################################################
7+
#
8+
# AMD MI355X GPU-specific optimizations
9+
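# Note: This file treats MI355X as an APU with integrated CPU and GPU, using unified memory architecture.
# NOTE(review): AMD's public MI355X specifications describe a discrete accelerator with 288GB HBM3E; the APU/unified-memory description matches MI300A instead - confirm before relying on these defaults.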
10+
# Common settings are in base_env.sh. This file only contains MI355X-specific overrides.
11+
#
12+
13+
LOG_INFO_RANK0 "Loading MI355X-specific optimizations..."

# ----------------- MI355X-specific GPU settings -----------------
# Every value below is only a default: a non-empty value already present in
# the environment is kept as-is.
# NOTE(review): these defaults were chosen on the premise that MI355X is a
# unified-memory APU (128GB, HBM + DDR). AMD's public specs list MI355X as a
# discrete accelerator with 288GB HBM3E - confirm that XNACK=1 and the smaller
# kernarg pool are really intended.

# XNACK enabled by default (the MI300X/MI325X profiles default it to 0).
: "${HSA_XNACK:=1}"

# Interrupt-driven mode enabled by default (file rationale: better power
# efficiency).
: "${HSA_ENABLE_INTERRUPT:=1}"

# Heap size setting used for memory allocation.
: "${GPU_MAX_HEAP_SIZE:=100}"

# HSA kernarg pool size: 8388608 bytes = 8 * 1024 * 1024 (8MB), smaller than
# the 12MB used by the MI300X/MI325X profiles.
: "${HSA_KERNARG_POOL_SIZE:=8388608}"

export HSA_XNACK HSA_ENABLE_INTERRUPT GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE

# ----------------- MI355X RCCL optimizations -----------------
# No RCCL overrides for MI355X: the shared defaults stay in effect unless
# testing shows otherwise.

log_exported_vars "MI355X-specific optimizations" \
  HSA_XNACK HSA_ENABLE_INTERRUPT GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE

runner/helpers/envs/base_env.sh

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/bin/bash
2+
###############################################################################
3+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
4+
#
5+
# See LICENSE for license information.
6+
###############################################################################
7+
8+
# =============================================================================
9+
# Base Environment Configuration
10+
# =============================================================================
11+
# This file provides the foundation for all environment configurations:
12+
# - Logging functions (LOG_INFO, LOG_INFO_RANK0, LOG_ERROR, etc.)
13+
# - Distributed training cluster information (MASTER_ADDR, NNODES, etc.)
14+
# - Python path setup
15+
#
16+
# Network, performance tuning, and GPU-specific settings are loaded separately
17+
# =============================================================================
18+
19+
# ---------------------------------------------------------------------------
20+
# Guard: avoid duplicate exports/logging on multiple sourcing
21+
# ---------------------------------------------------------------------------
22+
# Stop here when this file has already been sourced in this environment;
# otherwise set (and export) the marker so later sourcing becomes a no-op.
[[ -z "${__PRIMUS_BASE_ENV_SOURCED:-}" ]] || return 0
__PRIMUS_BASE_ENV_SOURCED=1
export __PRIMUS_BASE_ENV_SOURCED
26+
27+
# ---------------------------------------------------------------------------
28+
# Load common library for consistent logging
29+
# ---------------------------------------------------------------------------
30+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
31+
if [[ -f "$SCRIPT_DIR/../../lib/common.sh" ]]; then
32+
# shellcheck disable=SC1091
33+
source "$SCRIPT_DIR/../../lib/common.sh"
34+
else
35+
# Fallback logging functions if common.sh not available
36+
HOSTNAME="$(hostname)"
37+
export HOSTNAME
38+
39+
#######################################
# Print an informational message prefixed with the node rank and hostname.
# Globals:   NODE_RANK (read, defaults to 0), HOSTNAME (read)
# Arguments: message words; with no (or only empty) text a blank line is printed
# Outputs:   one line on stdout
#######################################
LOG_INFO() {
  local message="$*"

  # Empty message: emit a bare separator line without the node prefix.
  if [[ -z "$message" ]]; then
    printf '\n'
    return
  fi

  printf '%s\n' "[NODE-${NODE_RANK:-0}($HOSTNAME)] $message"
}
46+
47+
#######################################
# Like an informational log line, but only emitted when NODE_RANK is 0.
# Globals:   NODE_RANK (read, defaults to 0), HOSTNAME (read)
# Arguments: message words; empty text prints a blank line
# Outputs:   one line on stdout on rank 0, nothing on other ranks
#######################################
LOG_INFO_RANK0() {
  # Non-zero ranks stay silent. `[` (not `[[`) is kept on purpose so the
  # comparison behaves exactly as before for unusual NODE_RANK values.
  [ "${NODE_RANK:-0}" -eq 0 ] || return 0

  local message="$*"
  if [[ -n "$message" ]]; then
    printf '%s\n' "[NODE-${NODE_RANK:-0}($HOSTNAME)] $message"
  else
    printf '\n'
  fi
}
56+
57+
#######################################
# Print an error message, tagged [ERROR] and prefixed with node rank/hostname.
# Globals:   NODE_RANK (read, defaults to 0), HOSTNAME (read)
# Arguments: message words
# Outputs:   one line on stderr
#######################################
LOG_ERROR() {
  local prefix="[NODE-${NODE_RANK:-0}($HOSTNAME)] [ERROR]"
  printf '%s\n' "$prefix $*" >&2
}
60+
61+
#######################################
# Print a warning message, tagged [WARN] and prefixed with node rank/hostname.
# Globals:   NODE_RANK (read, defaults to 0), HOSTNAME (read)
# Arguments: message words
# Outputs:   one line on stderr
#######################################
LOG_WARN() {
  local prefix="[NODE-${NODE_RANK:-0}($HOSTNAME)] [WARN]"
  printf '%s\n' "$prefix $*" >&2
}
64+
65+
#######################################
# Log a titled list of variables and their current values (rank 0 only, via
# LOG_INFO_RANK0).
# Globals:   reads each variable named in the arguments
# Arguments: $1   - section title
#            $2.. - names of the variables to print
# Outputs:   a "========== title ==========" line followed by one
#            " NAME=value" line per variable; unset variables print as empty
#######################################
log_exported_vars() {
  # Locals carry a prefix so they are unlikely to shadow a variable the caller
  # asks to print, and so the loop no longer overwrites a global named `var`.
  local _lev_title=${1-}
  local _lev_name

  LOG_INFO_RANK0 "========== ${_lev_title} =========="
  for _lev_name in "${@:2}"; do
    # ${!name-} is an indirect read that yields "" for unset variables.
    LOG_INFO_RANK0 " ${_lev_name}=${!_lev_name-}"
  done
}
71+
fi
72+
73+
# ---------------------------------------------------------------------------
74+
# Distributed Training Cluster Configuration
75+
# ---------------------------------------------------------------------------
76+
# Fill in single-node defaults for anything the launcher did not provide
# (unset or empty), then export the whole set for child processes.
: "${MASTER_ADDR:=localhost}"
: "${MASTER_PORT:=1234}"
: "${NNODES:=1}"
: "${NODE_RANK:=0}"
: "${GPUS_PER_NODE:=8}"
export MASTER_ADDR MASTER_PORT NNODES NODE_RANK GPUS_PER_NODE

log_exported_vars "Training Cluster Info" \
  MASTER_ADDR MASTER_PORT NNODES NODE_RANK GPUS_PER_NODE
84+
85+
# ---------------------------------------------------------------------------
86+
# Python Path Setup
87+
# ---------------------------------------------------------------------------
88+
# Repository root: three directory levels above this script's directory.
PRIMUS_PATH=$(cd "$SCRIPT_DIR/../../.." && pwd)

# Best effort: ask the active `python` for its site-packages directory; an
# unavailable or failing interpreter simply yields an empty string.
site_packages=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])" 2>/dev/null || echo "")

# Prepend PRIMUS_PATH (and site-packages, when known) to any existing
# PYTHONPATH. Only non-empty parts are joined, so an unset/empty PYTHONPATH no
# longer leaves a trailing ':' (an empty search-path entry) in the result.
PYTHONPATH="${PRIMUS_PATH}${site_packages:+:${site_packages}}${PYTHONPATH:+:${PYTHONPATH}}"
export PYTHONPATH

log_exported_vars "Python Path" PYTHONPATH
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/env bash
2+
###############################################################################
3+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
4+
#
5+
# See LICENSE for license information.
6+
###############################################################################
7+
8+
# =============================================================================
9+
# NCCL and Network Configuration
10+
# =============================================================================
11+
# This file contains all network-related environment variable settings
12+
# for distributed training with NCCL and communication libraries.
13+
# =============================================================================
14+
15+
# Dependency check: ensure base_env.sh has been loaded
16+
# GPUS_PER_NODE is exported by base_env.sh. The ":-" form keeps this check
# usable when the caller runs with `set -u`: the friendly message below is
# printed instead of bash's "unbound variable" abort.
# Note: `exit` (not `return`) is used, so when this file is sourced the
# calling shell is terminated as well.
if [[ -z "${GPUS_PER_NODE:-}" ]]; then
  echo "[ERROR] GPUS_PER_NODE not set. base_env.sh must be loaded first." >&2
  exit 1
fi

# log_exported_vars is the logging helper this file calls further down;
# declare -F succeeds only if a function with that name is defined.
if ! declare -F log_exported_vars >/dev/null 2>&1; then
  echo "[ERROR] log_exported_vars function not found. base_env.sh must be loaded first." >&2
  exit 1
fi
25+
26+
# Set visible GPUs for the current node (0 to GPUS_PER_NODE-1)
27+
# Comma-separated device list "0,1,...,GPUS_PER_NODE-1".
HIP_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS_PER_NODE - 1)))
export HIP_VISIBLE_DEVICES

# ----------------- NCCL and Network Settings -----------------

# NCCL logging level: VERSION, WARN, INFO, DEBUG, TRACE
# Left empty for default behavior unless the caller specifies a level.
export NCCL_DEBUG=${NCCL_DEBUG:-}

# Disable NCCL internal checks to reduce overhead
export NCCL_CHECKS_DISABLE=1

# Set InfiniBand GID index for NCCL communication
export NCCL_IB_GID_INDEX=3

# Disable cross NIC communication for NCCL
export NCCL_CROSS_NIC=0

# The detection helpers live next to this file; resolve the directory once,
# and only when at least one value actually has to be detected.
# The ":-" expansions keep these checks safe under `set -u` when the
# variables were never set, which is the case this block exists to handle.
if [[ -z "${NCCL_IB_HCA:-}" || -z "${IP_INTERFACE:-}" ]]; then
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
fi

# Dynamically get InfiniBand Host Channel Adapter index for NCCL if not set.
# A failing helper yields an empty value rather than an error.
if [[ -z "${NCCL_IB_HCA:-}" ]]; then
  NCCL_IB_HCA=$(bash "${SCRIPT_DIR}/get_nccl_ib_hca.sh" 2>/dev/null || echo "")
fi
export NCCL_IB_HCA

# Dynamically determine IP_INTERFACE for socket communication if not set.
# If the helper fails, fall back to the first address printed by `hostname -I`.
# NOTE(review): that fallback is an IP address, while NCCL_SOCKET_IFNAME and
# GLOO_SOCKET_IFNAME below are interface-name settings - confirm what
# get_ip_interface.sh prints and whether the fallback is usable.
if [[ -z "${IP_INTERFACE:-}" ]]; then
  IP_INTERFACE=$(bash "${SCRIPT_DIR}/get_ip_interface.sh" 2>/dev/null || hostname -I | awk '{print $1}')
fi
export IP_INTERFACE

# Set network interfaces for NCCL and Gloo, fallback to detected IP_INTERFACE
export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$IP_INTERFACE}
export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$IP_INTERFACE}
62+
63+
# ----------------- RCCL Settings (AMD ROCm Communication Library) -----------------
64+
65+
# MSCCL and MSCCL++ code paths in RCCL are off by default (chosen for
# stability); each value can be overridden from the environment.
: "${RCCL_MSCCL_ENABLE:=0}"
: "${RCCL_MSCCLPP_ENABLE:=0}"
: "${RCCL_MSCCLPP_FORCE_ENABLE:=0}"
# Default threshold: 1 * 1024 * 1024 * 1024 = 1GB.
: "${RCCL_MSCCLPP_THRESHOLD:=$((1 * 1024 * 1024 * 1024))}"
export RCCL_MSCCL_ENABLE RCCL_MSCCLPP_ENABLE RCCL_MSCCLPP_FORCE_ENABLE RCCL_MSCCLPP_THRESHOLD

# https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/env.hpp#L82-L87
: "${MSCCLPP_DISABLE_CHANNEL_CACHE:=FALSE}"
export MSCCLPP_DISABLE_CHANNEL_CACHE

# Defaults to 0. The original note says PyTorch needs this variable to enable
# register comm, so it presumably has to be set to 1 to opt in - confirm.
: "${TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK:=0}"
export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK

# Summaries of everything configured by this file.
log_exported_vars "NCCL and Network Settings" \
  HIP_VISIBLE_DEVICES NCCL_DEBUG NCCL_CHECKS_DISABLE NCCL_IB_GID_INDEX \
  NCCL_CROSS_NIC NCCL_IB_HCA IP_INTERFACE NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME

log_exported_vars "RCCL Settings" \
  RCCL_MSCCL_ENABLE RCCL_MSCCLPP_ENABLE RCCL_MSCCLPP_FORCE_ENABLE RCCL_MSCCLPP_THRESHOLD \
  MSCCLPP_DISABLE_CHANNEL_CACHE TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK

runner/helpers/envs/detect_gpu.sh

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/bin/bash
2+
###############################################################################
3+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
4+
#
5+
# See LICENSE for license information.
6+
###############################################################################
7+
#
8+
# Detect GPU Model Script
9+
# Uses rocm-smi to detect AMD GPU model (MI300, MI355, etc.)
10+
#
11+
12+
#######################################
# Detect the AMD GPU model (e.g. MI300X) from `rocm-smi --showproductname`.
# Globals:   none
# Arguments: none
# Outputs:   the model identifier on stdout, or "unknown" when no model could
#            be determined (including when rocm-smi is missing); diagnostics
#            go to stderr
# Returns:   1 if rocm-smi is not available, 0 otherwise
#######################################
detect_gpu_model() {
  local gpu_model="unknown"
  local smi_output card_line
  # "MI", digits, then an optional upper-case suffix (MI300X, MI210, ...).
  local model_re='MI([0-9]+)([A-Z]*)'

  # Check if rocm-smi is available. "unknown" is still printed so callers
  # that only capture stdout see the same sentinel as for any other failure.
  if ! command -v rocm-smi &>/dev/null; then
    echo "Error: rocm-smi not found. Is ROCm installed?" >&2
    echo "$gpu_model"
    return 1
  fi

  # Query rocm-smi once and reuse the output for both strategies.
  smi_output=$(rocm-smi --showproductname 2>/dev/null)
  card_line=$(grep -i "Card series" <<<"$smi_output" | head -n1)

  if [[ "$card_line" =~ $model_re ]]; then
    # Preferred: model found anywhere on the first "Card series" line, so a
    # trailing word after the model name no longer hides it.
    gpu_model="MI${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
  elif [[ "$smi_output" =~ $model_re ]]; then
    # Fallback: first model-looking token anywhere in the output.
    gpu_model="MI${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
  fi

  echo "$gpu_model"
}
38+
39+
# Execute detection
40+
# Treat a failing detector or empty output the same as an undetected model,
# so a missing rocm-smi no longer produces an empty line and exit status 0.
if ! GPU_MODEL=$(detect_gpu_model) || [[ -z "$GPU_MODEL" ]]; then
  GPU_MODEL="unknown"
fi

# Output result (the model identifier, or "unknown")
echo "$GPU_MODEL"

# Exit with error if detection failed
if [[ "$GPU_MODEL" == "unknown" ]]; then
  echo "Warning: Unable to detect GPU model. Using default configuration." >&2
  exit 1
fi
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)