AMD-AGI
diff --git a/‎runner/helpers/envs/MI300X.sh‎
Lines changed: 34 additions & 0 deletions b/‎runner/helpers/envs/MI300X.sh‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎runner/helpers/envs/MI325X.sh‎
Lines changed: 33 additions & 0 deletions b/‎runner/helpers/envs/MI325X.sh‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎runner/helpers/envs/MI355X.sh‎
Lines changed: 34 additions & 0 deletions b/‎runner/helpers/envs/MI355X.sh‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎runner/helpers/envs/base_env.sh‎
Lines changed: 96 additions & 0 deletions b/‎runner/helpers/envs/base_env.sh‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎runner/helpers/envs/common_network.sh‎
Lines changed: 83 additions & 0 deletions b/‎runner/helpers/envs/common_network.sh‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎runner/helpers/envs/detect_gpu.sh‎
Lines changed: 49 additions & 0 deletions b/‎runner/helpers/envs/detect_gpu.sh‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎runner/helpers/get_ip_interface.sh‎ renamed to ‎runner/helpers/envs/get_ip_interface.sh‎ b/‎runner/helpers/get_ip_interface.sh‎ renamed to ‎runner/helpers/envs/get_ip_interface.sh‎
diff --git a/‎runner/helpers/get_nccl_ib_hca.sh‎ renamed to ‎runner/helpers/envs/get_nccl_ib_hca.sh‎ b/‎runner/helpers/get_nccl_ib_hca.sh‎ renamed to ‎runner/helpers/envs/get_nccl_ib_hca.sh‎
@@ -0,0 +1,34 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+#
+# AMD MI300X GPU-specific optimizations
+# Note: Common settings are in base_env.sh. This file only contains MI300X-specific overrides.
+#
+
+LOG_INFO_RANK0 "Loading MI300X-specific optimizations..."
+
+# ----------------- MI300X-specific GPU settings -----------------
+# Every value below is only a default: a non-empty value that is already
+# present in the environment is kept untouched.
+
+# MI300X has 192GB HBM3; XNACK is disabled by default for performance.
+: "${HSA_XNACK:=0}"
+
+# Memory allocation tuning for large models.
+: "${GPU_MAX_HEAP_SIZE:=100}"
+
+# HSA kernarg pool size for large model workloads: 12582912 bytes (12MB).
+: "${HSA_KERNARG_POOL_SIZE:=12582912}"
+
+export HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE
+
+# ----------------- MI300X RCCL optimizations -----------------
+# The MSCCLPP defaults (disabled) are set in common_network.sh.
+# Override them here only if a specific MI300X workload needs it.
+
+# Uncomment to enable MSCCLPP for MI300X if tested and verified
+# export RCCL_MSCCLPP_ENABLE=1
+# export RCCL_MSCCLPP_FORCE_ENABLE=1
+
+log_exported_vars "MI300X-specific optimizations" \
+    HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE
@@ -0,0 +1,33 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+#
+# AMD MI325X GPU-specific optimizations
+# Note: Common settings are in base_env.sh. This file only contains MI325X-specific overrides.
+#
+
+LOG_INFO_RANK0 "Loading MI325X-specific optimizations..."
+
+# ----------------- MI325X-specific GPU settings -----------------
+# Every value below is only a default: a non-empty value that is already
+# present in the environment is kept untouched.
+
+# MI325X has 256GB HBM3e; XNACK is disabled by default for performance.
+: "${HSA_XNACK:=0}"
+
+# Memory allocation tuning for large models (same default as the MI300X file).
+: "${GPU_MAX_HEAP_SIZE:=100}"
+
+# HSA kernarg pool size: 12582912 bytes (12MB).
+: "${HSA_KERNARG_POOL_SIZE:=12582912}"
+
+export HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE
+
+# ----------------- MI325X RCCL optimizations -----------------
+# No RCCL overrides are applied for MI325X; the shared defaults stay in effect.
+# Add overrides here if MI325X turns out to need different RCCL settings.
+
+# Uncomment to enable MSCCLPP for MI325X if tested and verified
+# export RCCL_MSCCLPP_ENABLE=1
+# export RCCL_MSCCLPP_FORCE_ENABLE=1
+
+log_exported_vars "MI325X-specific optimizations" \
+    HSA_XNACK GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE
@@ -0,0 +1,34 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+#
+# AMD MI355X GPU-specific optimizations
+# NOTE(review): this file was written on the premise that MI355X is an APU with
+# 128GB of unified (HBM + DDR) memory. AMD's public MI355X material describes a
+# discrete accelerator, and the APU description looks like MI300A instead.
+# TODO confirm the intended target before relying on the defaults below.
+# Common settings are in base_env.sh. This file only contains MI355X-specific overrides.
+#
+
+LOG_INFO_RANK0 "Loading MI355X-specific optimizations..."
+
+# ----------------- MI355X-specific GPU settings -----------------
+# Every value below is only a default: a non-empty value that is already
+# present in the environment is kept untouched.
+
+# XNACK is enabled by default here, unlike the MI300X/MI325X files which
+# default it to 0. NOTE(review): chosen for unified-memory support; confirm it
+# is still wanted if MI355X is a discrete part.
+: "${HSA_XNACK:=1}"
+
+# Interrupt-driven mode, picked with APU power efficiency in mind.
+: "${HSA_ENABLE_INTERRUPT:=1}"
+
+# Memory allocation tuning (same default as the other GPU files).
+: "${GPU_MAX_HEAP_SIZE:=100}"
+
+# HSA kernarg pool size: 8388608 bytes (8MB), smaller than the 12MB used in
+# the MI300X/MI325X files.
+: "${HSA_KERNARG_POOL_SIZE:=8388608}"
+
+export HSA_XNACK HSA_ENABLE_INTERRUPT GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE
+
+# ----------------- MI355X RCCL optimizations -----------------
+# No RCCL overrides are applied; keep the shared settings unless testing
+# shows that MI355X needs something different.
+
+log_exported_vars "MI355X-specific optimizations" \
+    HSA_XNACK HSA_ENABLE_INTERRUPT GPU_MAX_HEAP_SIZE HSA_KERNARG_POOL_SIZE
@@ -0,0 +1,96 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+
+# =============================================================================
+# Base Environment Configuration
+# =============================================================================
+# This file provides the foundation for all environment configurations:
+#   - Logging functions (LOG_INFO, LOG_INFO_RANK0, LOG_ERROR, etc.)
+#   - Distributed training cluster information (MASTER_ADDR, NNODES, etc.)
+#   - Python path setup
+#
+# Network, performance tuning, and GPU-specific settings are loaded separately
+# =============================================================================
+
+# ---------------------------------------------------------------------------
+# Guard: avoid duplicate exports/logging on multiple sourcing
+# ---------------------------------------------------------------------------
+# Already sourced once? Then stop here; the marker is exported, so it is also
+# visible to child processes.
+[[ -z "${__PRIMUS_BASE_ENV_SOURCED:-}" ]] || return 0
+__PRIMUS_BASE_ENV_SOURCED=1
+export __PRIMUS_BASE_ENV_SOURCED
+
+# ---------------------------------------------------------------------------
+# Load common library for consistent logging
+# ---------------------------------------------------------------------------
+# Directory containing this file; also used further down for the Python path.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+if [[ -f "$SCRIPT_DIR/../../lib/common.sh" ]]; then
+    # shellcheck disable=SC1091
+    source "$SCRIPT_DIR/../../lib/common.sh"
+else
+    # Fallback logging functions if common.sh not available
+    HOSTNAME="$(hostname)"
+    export HOSTNAME
+
+    # LOG_INFO <message...>
+    # Prints the message to stdout prefixed with "[NODE-<rank>(<hostname>)]".
+    # With no message (or an empty one) it prints a bare empty line.
+    LOG_INFO() {
+        if [ "$*" = "" ]; then
+            echo ""
+        else
+            echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] $*"
+        fi
+    }
+
+    # LOG_INFO_RANK0 <message...>
+    # Same output as LOG_INFO, but only when NODE_RANK (default 0) equals 0.
+    LOG_INFO_RANK0() {
+        if [ "${NODE_RANK:-0}" -eq 0 ]; then
+            LOG_INFO "$@"
+        fi
+    }
+
+    # LOG_ERROR <message...>
+    # Prints the message to stderr with the node prefix and an [ERROR] tag.
+    LOG_ERROR() {
+        echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] [ERROR] $*" >&2
+    }
+
+    # LOG_WARN <message...>
+    # Prints the message to stderr with the node prefix and a [WARN] tag.
+    LOG_WARN() {
+        echo "[NODE-${NODE_RANK:-0}($HOSTNAME)] [WARN] $*" >&2
+    }
+
+    # log_exported_vars <title> <var-name...>
+    # Logs (rank 0 only) a titled section followed by one "name=value" line
+    # per variable name; unset variables are shown with an empty value.
+    log_exported_vars() {
+        # Keep the loop variable local: this file is sourced, so a global
+        # "var" would silently overwrite a caller's variable of that name.
+        local var
+        LOG_INFO_RANK0 "========== $1 =========="
+        for var in "${@:2}"; do
+            LOG_INFO_RANK0 "    $var=${!var-}"
+        done
+    }
+fi
+
+# ---------------------------------------------------------------------------
+# Distributed Training Cluster Configuration
+# ---------------------------------------------------------------------------
+# A non-empty value already in the environment wins; otherwise the
+# single-node defaults below are used.
+: "${MASTER_ADDR:=localhost}"
+: "${MASTER_PORT:=1234}"
+: "${NNODES:=1}"
+: "${NODE_RANK:=0}"
+: "${GPUS_PER_NODE:=8}"
+export MASTER_ADDR MASTER_PORT NNODES NODE_RANK GPUS_PER_NODE
+
+log_exported_vars "Training Cluster Info" \
+    MASTER_ADDR MASTER_PORT NNODES NODE_RANK GPUS_PER_NODE
+
+# ---------------------------------------------------------------------------
+# Python Path Setup
+# ---------------------------------------------------------------------------
+PRIMUS_PATH=$(cd "$SCRIPT_DIR/../../.." && pwd)
+site_packages=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])" 2>/dev/null || echo "")
+if [[ -n "$site_packages" ]]; then
+    export PYTHONPATH="${PRIMUS_PATH}:${site_packages}:${PYTHONPATH:-}"
+else
+    export PYTHONPATH="${PRIMUS_PATH}:${PYTHONPATH:-}"
+fi
+
+log_exported_vars "Python Path" PYTHONPATH
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+
+# =============================================================================
+# NCCL and Network Configuration
+# =============================================================================
+# This file contains all network-related environment variable settings
+# for distributed training with NCCL and communication libraries.
+# =============================================================================
+
+# Dependency check: ensure base_env.sh has been loaded
+if [[ -z "${GPUS_PER_NODE}" ]]; then
+    echo "[ERROR] GPUS_PER_NODE not set. base_env.sh must be loaded first." >&2
+    exit 1
+fi
+
+if ! declare -f log_exported_vars >/dev/null 2>&1; then
+    echo "[ERROR] log_exported_vars function not found. base_env.sh must be loaded first." >&2
+    exit 1
+fi
+
+# Set visible GPUs for the current node (0 to GPUS_PER_NODE-1)
+HIP_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS_PER_NODE - 1)))
+export HIP_VISIBLE_DEVICES
+
+# ----------------- NCCL and Network Settings -----------------
+
+# NCCL logging level: VERSION, WARN, INFO, DEBUG, TRACE
+# Set to empty for default behavior, or specify level for debugging
+export NCCL_DEBUG=${NCCL_DEBUG:-}
+
+# Disable NCCL internal checks to reduce overhead
+export NCCL_CHECKS_DISABLE=1
+
+# Set InfiniBand GID index for NCCL communication
+export NCCL_IB_GID_INDEX=3
+
+# Disable cross NIC communication for NCCL
+export NCCL_CROSS_NIC=0
+
+# Dynamically get InfiniBand Host Channel Adapter index for NCCL if not set.
+# Best effort: when the helper fails, NCCL_IB_HCA is exported empty.
+if [[ -z "${NCCL_IB_HCA:-}" ]]; then
+    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+    NCCL_IB_HCA=$(bash "${SCRIPT_DIR}/get_nccl_ib_hca.sh" 2>/dev/null || echo "")
+fi
+export NCCL_IB_HCA
+
+# Dynamically get the network interface for socket communication if not set.
+# IP_INTERFACE is used below as the default for NCCL_SOCKET_IFNAME and
+# GLOO_SOCKET_IFNAME, which expect interface names (e.g. eth0), not addresses.
+# NOTE(review): get_ip_interface.sh is presumably expected to print such a
+# name -- confirm against the helper.
+if [[ -z "${IP_INTERFACE:-}" ]]; then
+    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+    # Drop any partial output when the helper fails, so it is never mixed
+    # with the fallback result.
+    if ! IP_INTERFACE=$(bash "${SCRIPT_DIR}/get_ip_interface.sh" 2>/dev/null); then
+        IP_INTERFACE=""
+    fi
+
+    # Fallback (also used when the helper succeeds but prints nothing):
+    # take the first address reported by `hostname -I` and map it to the
+    # interface that owns it.
+    if [[ -z "$IP_INTERFACE" ]]; then
+        __primus_fallback_ip=$(hostname -I 2>/dev/null | awk '{print $1}')
+        if [[ -n "$__primus_fallback_ip" ]] && command -v ip >/dev/null 2>&1; then
+            # `ip -o -4 addr show` prints "<idx>: <ifname> inet <addr>/<prefix> ...".
+            IP_INTERFACE=$(ip -o -4 addr show 2>/dev/null |
+                awk -v addr="$__primus_fallback_ip" \
+                    '{ split($4, cidr, "/"); if (cidr[1] == addr) { print $2; exit } }')
+        fi
+        # Last resort: keep the bare address, as the previous fallback did.
+        IP_INTERFACE=${IP_INTERFACE:-$__primus_fallback_ip}
+        unset __primus_fallback_ip
+    fi
+fi
+export IP_INTERFACE
+
+# Set network interfaces for NCCL and Gloo, fallback to detected IP_INTERFACE
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$IP_INTERFACE}
+export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$IP_INTERFACE}
+
+# ----------------- RCCL Settings (AMD ROCm Communication Library) -----------------
+
+# Disable MSCCL (RCCL multi-connection feature) for better stability
+export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
+export RCCL_MSCCLPP_ENABLE=${RCCL_MSCCLPP_ENABLE:-0}
+export RCCL_MSCCLPP_FORCE_ENABLE=${RCCL_MSCCLPP_FORCE_ENABLE:-0}
+export RCCL_MSCCLPP_THRESHOLD=${RCCL_MSCCLPP_THRESHOLD:-$((1*1024*1024*1024))} # default 1GB
+
+# https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/env.hpp#L82-L87
+export MSCCLPP_DISABLE_CHANNEL_CACHE=${MSCCLPP_DISABLE_CHANNEL_CACHE:-FALSE}
+
+# PyTorch needs this env to enable register comm
+export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK=${TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK:-0}
+
+log_exported_vars "NCCL and Network Settings" \
+    HIP_VISIBLE_DEVICES NCCL_DEBUG NCCL_CHECKS_DISABLE NCCL_IB_GID_INDEX \
+    NCCL_CROSS_NIC NCCL_IB_HCA IP_INTERFACE NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME
+
+log_exported_vars "RCCL Settings" \
+    RCCL_MSCCL_ENABLE RCCL_MSCCLPP_ENABLE RCCL_MSCCLPP_FORCE_ENABLE RCCL_MSCCLPP_THRESHOLD \
+    MSCCLPP_DISABLE_CHANNEL_CACHE TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK
@@ -0,0 +1,49 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+#
+# Detect GPU Model Script
+# Uses rocm-smi to detect AMD GPU model (MI300, MI355, etc.)
+#
+
+detect_gpu_model() {
+    local gpu_model
+    gpu_model="unknown"
+
+    # Check if rocm-smi is available
+    if ! command -v rocm-smi &> /dev/null; then
+        echo "Error: rocm-smi not found. Is ROCm installed?" >&2
+        return 1
+    fi
+
+    # Get product name from rocm-smi
+    local product_name
+    product_name=$(rocm-smi --showproductname 2>/dev/null | grep -i "Card series" | head -n1 | awk '{print $NF}')
+
+    # If that doesn't work, try alternative method
+    if [[ -z "$product_name" ]]; then
+        product_name=$(rocm-smi --showproductname 2>/dev/null | grep -oP 'MI\d+[A-Z]*' | head -n1)
+    fi
+
+    # Extract model identifier (MI300, MI355, etc.)
+    if [[ "$product_name" =~ MI([0-9]+)([A-Z]*) ]]; then
+        gpu_model="MI${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
+    fi
+
+    echo "$gpu_model"
+}
+
+# Execute detection
+GPU_MODEL=$(detect_gpu_model)
+
+# Output result
+echo "$GPU_MODEL"
+
+# Exit with error if detection failed
+if [[ "$GPU_MODEL" == "unknown" ]]; then
+    echo "Warning: Unable to detect GPU model. Using default configuration." >&2
+    exit 1
+fi