Skip to content

[rocmlibs][rccl] Support rccl on gfx1150 #1433

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 2 additions & 13 deletions bin/aomp_common_vars
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ unset CMAKE_GENERATOR
AOMP_COMPILER_NAME=${AOMP_COMPILER_NAME:-AOMP}

# ROCM_VERSION may be set by Jenkins, if not guess a good number
ROCM_VERSION=${ROCM_VERSION:-6.0.0}
# it is used by rocm-core and rccl also
ROCM_VERSION=${ROCM_VERSION:-6.4.0}

# Set the AOMP VERSION STRING
AOMP_VERSION=${AOMP_VERSION:-"21.0"}
Expand Down Expand Up @@ -302,18 +303,6 @@ for arch in ${ROCMLIBS_GFXLIST//;/ }; do
fi
done

# rccl currently doesn't support gfx1103 and gfx1150.
RCCL_GFXLIST=${RCCL_GFXLIST:-"gfx90a;gfx942"}

# Do a sanity check for standalone build that RCCL_GFXLIST is a subset of ROCMLIBS_GFXLIST
for arch in ${RCCL_GFXLIST//;/ }; do
if [[ ! $ROCMLIBS_GFXLIST =~ $arch ]]; then
echo "ERROR: RCCL_GFXLIST architecture $arch not in ROCMLIBS_GFXLIST $ROCMLIBS_GFXLIST"
echo "Please update RCCL_GFXLIST to be a subset of ROCMLIBS_GFXLIST, or leave it undefined to use the default architecture list."
exit 1
fi
done

# Calculate the number of threads to use for make
COMP_THREADS=1
if [ ! -z `which "getconf"` ]; then
Expand Down
39 changes: 21 additions & 18 deletions bin/rocmlibs/build_rccl.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/bin/bash
#
# build_rccl.sh: Script to build and install rccl.
# This uses a slightly modified install.sh from rccl.
#
# build_rccl.sh: Script to build and install rccl.
# This uses a slightly modified install.sh from rccl.
# It has a dependency on rocm-core.
BUILD_TYPE=${BUILD_TYPE:-Release}

# --- Start standard header to set AOMP environment variables ----
Expand All @@ -11,6 +11,8 @@ thisdir=`dirname $realpath`
. $thisdir/../aomp_common_vars
# --- end standard header ----

export ROCM_PATH=$AOMP_INSTALL_DIR

_howcalled=${0##*/}
_shname=${_howcalled#build_*} # strip off build_
_libname=${_shname%*.sh} # strip off .sh to get component libname = rccl
Expand All @@ -32,9 +34,9 @@ fi
# rccl needs cmake 3.25, so put prereq cmake first in path
export PATH=$AOMP_SUPP/cmake/bin:$PATH

if [ $AOMP_STANDALONE_BUILD == 1 ] ; then
if [ ! -L $AOMP ] ; then
if [ -d $AOMP ] ; then
if [ $AOMP_STANDALONE_BUILD == 1 ] ; then
if [ ! -L $AOMP ] ; then
if [ -d $AOMP ] ; then
echo "ERROR: Directory $AOMP is a physical directory."
echo " It must be a symbolic link or not exist"
exit 1
Expand All @@ -45,25 +47,25 @@ else
exit 1
fi

if [ "$1" == "nocmake" ] ; then
if [ "$1" == "nocmake" ] ; then
_nocmake_option="--nocmake"
else
_nocmake_option=""
fi

if [ "$BUILD_TYPE" == "Release" ] ; then
if [ "$BUILD_TYPE" == "Release" ] ; then
_buildtype_option=""
_build_dir_option="release"
else
_buildtype_option="--debug"
_build_dir_option="debug"
fi

# Make sure we can update the install directory
# Make sure we can update the install directory
if [ "$1" == "install" ] ; then
$SUDO mkdir -p $AOMP_INSTALL_DIR
$SUDO touch $AOMP_INSTALL_DIR/testfile
if [ $? != 0 ] ; then
if [ $? != 0 ] ; then
echo "ERROR: No update access to $AOMP_INSTALL_DIR"
exit 1
fi
Expand All @@ -78,7 +80,7 @@ if [ "$1" != "nocmake" ] && [ "$1" != "install" ] ; then
rm -rf $BUILD_DIR/build/rocmlibs/$_libname
mkdir -p $BUILD_DIR/build/rocmlibs/$_libname
else
if [ ! -d $BUILD_DIR/build/rocmlibs/$_libname ] ; then
if [ ! -d $BUILD_DIR/build/rocmlibs/$_libname ] ; then
echo "ERROR: The build directory $BUILD_DIR/build/rocmlibs/$_libname does not exist"
echo " run $0 without install and without nocmake option"
exit 1
Expand All @@ -92,11 +94,12 @@ if [ "$1" != "install" ] ; then
echo
echo " -----Running cmake in install.sh ---"
echo cd $AOMP_REPOS/build/rocmlibs/$_libname
cd $AOMP_REPOS/build/rocmlibs/$_libname
cd $AOMP_REPOS/build/rocmlibs/$_libname
# --noinstall must follow --prefix because --prefix sets install_library=true
echo $_source_dir/install.sh $_nocmake_option $_buildtype_option -j $AOMP_JOB_THREADS --prefix $AOMP_INSTALL_DIR $_set_ninja_gen --source_dir $_source_dir --noinstall --amdgpu_targets $RCCL_GFXLIST
$_source_dir/install.sh $_nocmake_option $_buildtype_option -j $AOMP_JOB_THREADS --prefix $AOMP_INSTALL_DIR $_set_ninja_gen --source_dir $_source_dir --noinstall --amdgpu_targets $RCCL_GFXLIST
if [ $? != 0 ] ; then
# add --disable-colltrace because it is not supported on gfx1150
echo $_source_dir/install.sh $_nocmake_option $_buildtype_option -j $AOMP_JOB_THREADS --prefix $AOMP_INSTALL_DIR $_set_ninja_gen --source_dir $_source_dir --noinstall --amdgpu_targets $ROCMLIBS_GFXLIST --disable-colltrace
$_source_dir/install.sh $_nocmake_option $_buildtype_option -j $AOMP_JOB_THREADS --prefix $AOMP_INSTALL_DIR $_set_ninja_gen --source_dir $_source_dir --noinstall --amdgpu_targets $ROCMLIBS_GFXLIST --disable-colltrace
if [ $? != 0 ] ; then
echo "ERROR install failed."
echo " $MYCMAKEOPTS"
cd $_curdir
Expand All @@ -118,9 +121,9 @@ if [ "$1" == "install" ] ; then
echo "SUCCESSFUL INSTALL to $AOMP_INSTALL_DIR"
echo
removepatch $_source_dir
else
echo
else
echo
echo "SUCCESSFUL BUILD, please run: $0 install"
echo " to install into $AOMP_INSTALL_DIR"
echo
echo
fi
130 changes: 130 additions & 0 deletions bin/rocmlibs/build_rocm-core.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/bin/bash
#
#  build_rocm-core.sh:  Script to build and install rocm-core library
#
#  Usage:
#    build_rocm-core.sh            Fresh cmake configure + make (erases any previous build dir)
#    build_rocm-core.sh install    Install a previous build into $AOMP_INSTALL_DIR
#
#  Environment:
#    BUILD_TYPE   cmake build type passed as CMAKE_BUILD_TYPE (default: Release)
#    All other variables and the patchrepo/removepatch helpers are expected to be
#    provided by aomp_common_vars, which is sourced below.
#
BUILD_TYPE=${BUILD_TYPE:-Release}

# --- Start standard header to set AOMP environment variables ----
realpath=$(realpath "$0")
thisdir=$(dirname "$realpath")
. "$thisdir/../aomp_common_vars"
# --- end standard header ----

_repo_dir=$AOMP_REPOS/rocmlibs/rocm-core
# Single definition of the build directory so that the create, configure and
# install steps all agree on the same location.
_build_dir=$BUILD_DIR/build/rocmlibs/rocm-core
patchrepo "$_repo_dir"

export CC=$LLVM_INSTALL_LOC/bin/clang
export CXX=$LLVM_INSTALL_LOC/bin/clang++
export FC=$LLVM_INSTALL_LOC/bin/flang
export ROCM_DIR=$AOMP_INSTALL_DIR
export ROCM_PATH=$AOMP_INSTALL_DIR
# Put the prerequisite cmake and the AOMP install bin directory first in PATH.
export PATH=$AOMP_SUPP/cmake/bin:$AOMP_INSTALL_DIR/bin:$PATH
export HIP_USE_PERL_SCRIPTS=1
export USE_PERL_SCRIPTS=1
export NUM_PROC=$AOMP_JOB_THREADS
export CXXFLAGS="-I$AOMP_INSTALL_DIR/include -D__HIP_PLATFORM_AMD__=1"
export LDFLAGS="-fPIC"
if [ "$AOMP_USE_CCACHE" != 0 ] ; then
   # The compiler launcher export is currently disabled, so _ccache_bin is
   # only recorded here and not used.
   _ccache_bin=$(command -v ccache)
#  export CMAKE_CXX_COMPILER_LAUNCHER=$_ccache_bin
fi

# This script only supports the standalone build.
if [ "$AOMP_STANDALONE_BUILD" != 1 ] ; then
   echo "ERROR: $0 only valid for AOMP_STANDALONE_BUILD=1"
   exit 1
fi
# $AOMP must be a symbolic link or not exist at all.
if [ ! -L "$AOMP" ] && [ -d "$AOMP" ] ; then
   echo "ERROR: Directory $AOMP is a physical directory."
   echo "       It must be a symbolic link or not exist"
   exit 1
fi

if [ "$1" == "nocmake" ] ; then
   echo "ERROR:  nocmake is not an option for $0"
   exit 1
fi

# Make sure we can update the install directory
if [ "$1" == "install" ] ; then
   # $SUDO is intentionally unquoted: it may be empty or a command word.
   $SUDO mkdir -p "$AOMP_INSTALL_DIR"
   if ! $SUDO touch "$AOMP_INSTALL_DIR/testfile" ; then
      echo "ERROR: No update access to $AOMP_INSTALL_DIR"
      exit 1
   fi
   $SUDO rm "$AOMP_INSTALL_DIR/testfile"
fi

if [ "$1" != "install" ] ; then
   echo
   echo "This is a FRESH START. ERASING any previous builds in $BUILD_DIR/build/rocmlibs/rocm-core"
   echo "Use ""$0 install"" to avoid FRESH START."
   echo rm -rf "$_build_dir"
   rm -rf "$_build_dir"
   mkdir -p "$_build_dir"
else
   if [ ! -d "$_build_dir" ] ; then
      echo "ERROR: The build directory $BUILD_DIR/build/rocmlibs/rocm-core does not exist"
      echo "       run $0 without install option. "
      exit 1
   fi
fi

if [ "$1" != "install" ] ; then
   # Remember start directory to return on exit
   _curdir=$PWD
   echo " -----Running cmake ---"
   # Configure in the directory that was just created under $BUILD_DIR; the
   # install step below uses the same location.
   echo cd "$_build_dir"
   if ! cd "$_build_dir" ; then
      echo "ERROR: cannot cd to $_build_dir"
      exit 1
   fi
   pwd
   # Kept as a whitespace-separated string; it is deliberately word-split when
   # passed to cmake and is echoed verbatim on failure.
   MYCMAKEOPTS="
-DCMAKE_CXX_COMPILER=$CXX
-DCMAKE_C_COMPILER=$CC
-DROCM_DIR:PATH=$AOMP_INSTALL_DIR
-DCPACK_PACKAGING_INSTALL_PREFIX=$AOMP_INSTALL_DIR
-DCMAKE_INSTALL_PREFIX=$AOMP_INSTALL_DIR
-DROCM_PATH=$AOMP_INSTALL_DIR
-DCMAKE_PREFIX_PATH:PATH=$AOMP_INSTALL_DIR
-DCPACK_SET_DESTDIR=OFF
-DCMAKE_BUILD_TYPE=$BUILD_TYPE
-DCMAKE_VERBOSE_MAKEFILE=1
-DROCM_VERSION=$ROCM_VERSION
"
   # shellcheck disable=SC2086 -- word splitting of the option list is intended
   echo $AOMP_CMAKE $MYCMAKEOPTS $_repo_dir
   # shellcheck disable=SC2086 -- word splitting of the option list is intended
   if ! $AOMP_CMAKE $MYCMAKEOPTS "$_repo_dir" ; then
      echo "ERROR cmake failed."
      echo "      $MYCMAKEOPTS"
      cd "$_curdir"
      exit 1
   fi

   if ! make -j"$AOMP_JOB_THREADS" ; then
      echo "ERROR make -j $AOMP_JOB_THREADS failed"
      cd "$_curdir"
      exit 1
   fi
fi

if [ "$1" == "install" ] ; then
   echo " -----Installing to $AOMP_INSTALL_DIR ---- "
   if ! cd "$_build_dir" ; then
      echo "ERROR: cannot cd to $_build_dir"
      exit 1
   fi
   if ! make -j"$AOMP_JOB_THREADS" install ; then
      echo "ERROR install to $AOMP_INSTALL_DIR failed "
      exit 1
   fi
   echo
   echo "SUCCESSFUL INSTALL to $AOMP_INSTALL_DIR"
   echo
   removepatch "$_repo_dir"
else
   echo
   echo "SUCCESSFUL BUILD, please run:  $0 install"
   echo "  to install into $AOMP_INSTALL_DIR"
   echo
fi
2 changes: 1 addition & 1 deletion bin/rocmlibs/build_rocmlibs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ components="prereq rocm-cmake"
if [ "$AOMP_STANDALONE_BUILD" == 1 ] ; then
# This ordered build is important when starting from scratch
#components="$components rocBLAS rocPRIM rocSPARSE rocSOLVER hipBLAS-common hipBLAS rocRAND hipRAND rccl half hipSOLVER"
components="$components rocBLAS rocPRIM rocSPARSE rocSOLVER hipBLAS-common hipBLAS rocRAND hipRAND rccl half "
components="$components rocBLAS rocPRIM rocSPARSE rocSOLVER hipBLAS-common hipBLAS rocRAND hipRAND rocm-core rccl half "
else
echo "ERROR: Cannot run $0 with AOMP_STANDALONE_BUILD=$AOMP_STANDALONE_BUILD"
echo " Please set $AOMP_STANDALONE_BUILD=1"
Expand Down
1 change: 1 addition & 0 deletions bin/rocmlibs/rocmlibs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
<project remote="rocm" path="hipRAND" name="hipRAND" revision="release/rocm-rel-6.4" groups="unlocked"/>
<project remote="rocm" path="hipSOLVER" name="hipSOLVER" revision="release/rocm-rel-6.4" groups="unlocked"/>
<project remote="rocm" path="hipSPARSE" name="hipSPARSE" revision="release/rocm-rel-6.4" groups="unlocked"/>
<project remote="rocm" path="rocm-core" name="rocm-core" revision="release/rocm-rel-6.4" groups="unlocked"/>
<project remote="rocm" path="rccl" name="rccl" revision="release/rocm-rel-6.4" groups="unlocked"/>
<project remote="rocm" path="rocBLAS" name="rocBLAS" revision="release/rocm-rel-6.4" groups="unlocked"/>
<project remote="rocm" path="rocFFT" name="rocFFT" revision="release/rocm-rel-6.4" groups="unlocked"/>
Expand Down