Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions scripts/config_cuda10.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Source me
#
# Environment configuration for the CosmoFlow CUDA 10 build on the Cray
# system: sets SCRATCH / INSTALL_DIR and loads the CUDA 10 + Python 3
# programming environment. Must be sourced from bash (uses [[ ]]).

export SCRATCH="/lus/scratch/${USER}"

# Software-owner installs go to the shared software tree; everyone else
# installs into a personal conda-env directory under scratch.
if [[ "${USER}" == "swowner" ]]; then
    umask 002  # group-writable so the whole software group can maintain it
    INSTALL_BASE=/usr/common/software
else
    INSTALL_BASE="${SCRATCH}/condaenv"
fi
#export INSTALL_BASE=$SCRATCH/conda

# Configure the installation
# NOTE(review): this hard-coded path ignores the INSTALL_BASE computed
# above — confirm whether INSTALL_DIR should derive from INSTALL_BASE.
export INSTALL_DIR=/lus/scratch/jbalma/condenv-cuda10-cosmoflow

# Setup programming environment
#module unload PrgEnv-cray
#module load PrgEnv-gnu
#module unload atp
#module unload cray-libsci
#module unload craype-hugepages8M
#module unload craype-broadwell
#module unload gcc
#module load gcc/7.2.0
# Site-specific environment setup (CUDA 10 on Osprey V100 nodes + Python 3).
source /cray/css/users/jbalma/bin/setup_env_cuda10_osprey_V100.sh
source /cray/css/users/jbalma/bin/env_python3.sh
export CRAY_CPU_TARGET=x86-64

# Setup conda
#source /usr/common/software/python/3.6-anaconda-5.2/etc/profile.d/conda.sh
#source /cray/css/users/dctools/anaconda3/etc/profile.d/conda.sh
#conda create $INSTALL_DIR

# Report where we are configuring and for whom.
echo "Configuring on $(hostname) as $USER"
echo " Install directory $INSTALL_DIR"
178 changes: 178 additions & 0 deletions scripts/runit_hvd_osprey.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
#!/bin/bash
#SBATCH -N 1
##SBATCH --ntasks-per-node=8
##SBATCH -n 8
####SBATCH -x "nid[000076-000080,000137-000144]"
####SBATCH --contiguous
#SBATCH -t 2:00:00
#SBATCH --exclusive
#SBATCH -p spider
#SBATCH -C V100
#SBATCH --job-name=cosmoflow-gpu
#####SBATCH --nodelist=nid[000256-000287]
#####SBATCH --nodelist=nid[000256-000383]
###SBATCH --nodelist=nid[000256-000512]

###SBATCH --core-spec=2
# Allow unlimited locked memory (needed by the MPI/IB transports).
ulimit -l unlimited
# Pull in the CUDA 10 / conda environment (sets INSTALL_DIR, SCRATCH, ...).
# Assumes the job was submitted from scripts/ so the relative path resolves.
source ./config_cuda10.sh
unset PYTHONPATH
#module rm PrgEnv-cray
export SCRATCH=/lus/scratch/jbalma
INSTALL_DIR=/lus/scratch/jbalma/condenv-cuda10-cosmoflow
export CRAY_CPU_TARGET=x86-64
#conda create -y --prefix $INSTALL_DIR python=3.6 cudatoolkit=10.0 cudnn
source activate "${INSTALL_DIR}"
#conda activate $INSTALL_DIR
export PATH="${INSTALL_DIR}/bin:${PATH}" #/home/users/${USER}/.local/bin:${PATH}

# Sanity prints: confirm which toolchain the activated env resolved to.
echo "$CUDATOOLKIT_HOME"
which gcc
which python

# Set to a non-zero value to (re)build timemory from source before running.
BUILD_TIMEMORY=0

if [[ "${BUILD_TIMEMORY}" -eq 0 ]]
then
    echo "Running assuming you already have timemory installed"
else

#fi
#pip install tensorflow-gpu==1.15
#Install Horovod on Vanilla Cluster with GPU, OpenMPI support and no NCCL
#CMAKE_CXX_COMPILER=$MPI_CXX CMAKE_CC_COMPILER=$MPI_CC CXX=mpicxx CC=mpicc HOROVOD_CUDA_HOME=${CUDATOOLKIT_HOME} HOROVOD_MPICXX_SHOW="mpicxx -show" HOROVOD_MPI_HOME=${MPI_PATH} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 pip install --global-option=build_ext --global-option="-I ${CUDATOOLKIT_HOME}/include" -v --no-cache-dir --force-reinstall horovod

#Install on XC50
#pip uninstall horovod
#cc=cc CXX=CC HOROVOD_HIERARCHICAL_ALLREDUCE=1 HOROVOD_MPICXX_SHOW="CC --cray-print-opts=all" pip install -v --no-cache-dir horovod
#ulimit -c

#Install Horovod on Cluster without GPU support
#CXX=mpic++ CC=gcc HOROVOD_MPICXX_SHOW="mpic++ -show" HOROVOD_MPI_HOME=${MPI_PATH} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 pip install --global-option=build_ext --global-option="-I ${CUDATOOLKIT_HOME}/include" --no-cache-dir --force-reinstall horovod

# Setup a scratch build directory (wiped on every build).
TIMEMORY_BUILD_DIR="${SCRATCH}/timemory-tf-1.13.1-py36"
rm -rf "${TIMEMORY_BUILD_DIR:?}"
mkdir -p "${TIMEMORY_BUILD_DIR}" && cd "${TIMEMORY_BUILD_DIR}" || exit 1

# Do a clean checkout
[ -d timemory ] && rm -rf timemory
git clone https://github.com/NERSC/timemory.git || exit 1

# Install dependencies missing from the TF installation
python -m pip install scikit-build
python -m pip install pandas
# Configure the build
export BUILD_SHARED_LIBS=1
export CRAYPE_LINK_TYPE=dynamic

# Run the build
cd timemory || exit 1
# -y: pip uninstall would otherwise prompt and hang a non-interactive job.
python -m pip uninstall -y timemory
python -m pip install -r requirements.txt --user
#python setup.py install <ARGS> -- <CMAKE_ARGS>
#python setup.py install --help
python -m pip install scikit-build
# NOTE(review): CMake does not read CMAKE_CXX_COMPILER from the environment;
# the compiler is actually forced by the -D flags passed to setup.py below
# (or via the CXX env var). Kept for reference only.
export CMAKE_CXX_COMPILER=$MPI_CXX
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a side note, this env variable isn't used by CMake. You can use either -DCMAKE_CXX_COMPILER=$MPI_CXX or export CXX=$MPI_CXX. The former will force the compiler to change if there is an existing build and the CXX environment variable is used if there is no existing build and the latter is not specified on the command line.

Also, out of curiosity, is there a particular reason you are using the MPI compiler wrappers here? The find_package for MPI in CMake should take care of that but if you've had build errors, please lmk.

# NOTE(review): CMAKE_CC_COMPILER is not a CMake variable (the C compiler
# variable is CMAKE_C_COMPILER) and CMake does not read it from the
# environment anyway; the compilers are forced via -D flags below.
export CMAKE_CC_COMPILER=$MPI_CC
python setup.py install --enable-gotcha --enable-mpi -- -DTIMEMORY_BUILD_TOOLS=OFF -DCMAKE_CXX_COMPILER=$MPI_CXX -DCMAKE_C_COMPILER=$MPI_CC -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF

fi

cd /cray/css/users/jbalma/Innovation-Proposals/ML-Perf/hpc-wg/Cosmoflow/cosmoflow-benchmark/scripts || exit 1

# (Placeholder; RUN_CMD is overridden with the distributed variant below.)
RUN_CMD="python train.py -d configs/scaling_dummy.yaml"
#
#export CRAY_CUDA_MPS=1
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export CRAY_CUDA_PROXY=1

echo "Running..."
NODES=1 #nodes total
PPN=8 #processes per node
PPS=1 #processes per socket
NP=8 #processes total
NC=9 #job threads per rank
BS=1
#NC=64 #job threads per rank
#IMG_MODE=NCHW
IMG_MODE=NHWC

# Stage a clean copy of the benchmark tree into scratch and run from there.
export RUN_NAME="mlperf-cosmoflow-bs${BS}_np${NP}_PPN${PPN}_PPS${PPS}_NC${NC}_Nodes${NODES}_hvd"
export TEMP_DIR="${SCRATCH}/temp/${RUN_NAME}"
rm -rf "${TEMP_DIR:?}"
mkdir -p "${TEMP_DIR}"
cp -r ../* "${TEMP_DIR}/"
cd "${TEMP_DIR}" || exit 1

export SLURM_WORKING_DIR=${TEMP_DIR}
#export PYTHONPATH="$TEMP_DIR:$PYTHONPATH"
#export PYTHONPATH="$(pwd)/data:$(pwd)/utils:$(pwd)/models:$(pwd)/scripts:$PYTHONPATH"
echo "$PYTHONPATH"

echo "Running ML-Perf HPC WG Cosmoflow Benchmark..."
date

#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export TF_FP16_CONV_USE_FP32_COMPUTE=0
#export TF_FP16_MATMUL_USE_FP32_COMPUTE=0
#export HOROVOD_TIMELINE=${SCRATCH_PAD}/timeline.json
#export HOROVOD_FUSION_THRESHOLD=256000000
#export HOROVOD_MPI_THREADS_DISABLE=1
#export HOROVOD_FUSION_THRESHOLD=0
#export MPICH_MAX_THREAD_SAFETY=multiple
#export MPICH_COLL_SYNC=1
#export MPICH_ENV_DISPLAY=1

#export TF_XLA_FLAGS=--tf_xla_cpu_global_jit
#export TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit"
#export MKLDNN_VERBOSE=1
#export TF_CPP_MIN_LOG_LEVEL=3
#export TF_XLA_FLAGS=--tf_xla_cpu_global_jit
# clean out the checkpoints
rm -rf ./checkpoints
# these are ignored for GPU runs
export INTER=1
export INTRA=${NC}

#export OMP_NUM_THREADS=$INTRA
#export KMP_AFFINITY="granularity=fine,compact,1,0"

export PYTHONPATH="${TEMP_DIR}:${TEMP_DIR}/data:${TEMP_DIR}/models:${TEMP_DIR}/scripts:$PYTHONPATH"
pwd


export HOROVOD_CACHE_CAPACITY=0
#export HOROVOD_CACHE_CAPACITY=16384
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export OMPI_MCA_btl_openib_allow_ib=false
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl=^openib
#export UCX_TLS="cma,dc_mlx5,posix,rc,rc_mlx5,self,sm,sysv,tcp,ud,ud_mlx5"
#export UCX_MEMTYPE_CACHE=n
#export UCX_ACC_DEVICES=""
#export UCX_NET_DEVICES="ib0,eth0,mlx5_0:1" #,ib0,eth0" #mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1
#export DL_COMM_USE_CRCCL=1
#export OMPI_MCA_btl_tcp_if_include=ib0
#-mca btl_tcp_if_include ens4d1


RUN_CMD="python train.py --distributed --rank-gpu -v configs/scaling_dummy_cray.yaml"

export CRAY_OMP_CHECK_AFFINITY=TRUE
#
# "TIMEMORY_MPI_INIT": false,
# "TIMEMORY_MPI_FINALIZE": false,
# "TIMEMORY_MPI_THREAD": false,
# "TIMEMORY_MPI_THREAD_TYPE": "",

export TIMEMORY_UPCXX_INIT=false
export TIMEMORY_UPCXX_FINALIZE=false


#RUN_OPT="-t 1:00:00 -u --cpu_bind=rank_ldom"
#srun -C V100 -p spider -n $NP --ntasks-per-node $PPN -N $NODES -u $RUN_OPT $RUN_CMD 2>&1 |& tee ${TEMP_DIR}/logfile
# RUN_CMD is intentionally unquoted so it word-splits into command + args.
# (|& already implies 2>&1, so the redundant redirection was dropped.)
srun -p spider --accel-bind=g,v --cpu_bind=none -c ${NC} -C V100 -l -N ${NODES} -n ${NP} --ntasks-per-node=${PPN} -u $RUN_CMD |& tee "${TEMP_DIR}/logfile"

# Fix: the original printed the literal word "date" instead of the timestamp.
echo "end time = $(date)"
conda deactivate
82 changes: 70 additions & 12 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
import pandas as pd
import tensorflow as tf
# Suppress TF warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.compat.v1.logging.set_verbosity(logging.ERROR)
import horovod.tensorflow.keras as hvd
import timemory

from timemory.profiler import profile
from timemory.bundle import auto_timer
from timemory.util import marker
# Local imports
from data import get_datasets
from models import get_model
Expand All @@ -33,6 +35,23 @@
logging.root.removeHandler(absl.logging._absl_handler)
absl.logging._warn_preinit_stderr = False

# timemory global settings, applied at import time before any collection.
# Enable verbose output (level 1).
timemory.settings.verbose = 1
# disable timemory debug prints
timemory.settings.debug = False
# Disable JSON output (text reports only).
timemory.settings.json_output = False
# disable mpi_thread mode
timemory.settings.mpi_thread = False
# Enable timemory dart (CDash measurement) output, one entry per metric.
timemory.settings.dart_output = True
timemory.settings.dart_count = 1
# Keep the timemory startup banner enabled.
timemory.settings.banner = True
#timemory.settings.flat_profile = False
#timemory.settings.timeline_profile = False

def parse_args():
"""Parse command line arguments"""
parser = argparse.ArgumentParser('train.py')
Expand Down Expand Up @@ -111,8 +130,8 @@ def reload_last_checkpoint(checkpoint_format, n_epochs, distributed):
return epoch, model
raise Exception('Unable to find a checkpoint file at %s' % checkpoint_format)

@timemory.util.auto_tuple([getattr(timemory.component, c) for c in
['thread_cpu_clock', 'page_rss', 'priority_context_switch', 'read_bytes', 'written_bytes']])
#@timemory.util.auto_tuple([getattr(timemory.component, c) for c in
# ['thread_cpu_clock', 'page_rss', 'priority_context_switch', 'read_bytes', 'written_bytes']])
def main():
"""Main function"""

Expand Down Expand Up @@ -212,14 +231,48 @@ def main():
if rank == 0:
logging.info('Beginning training')
fit_verbose = 1 if (args.verbose and rank==0) else 2
model.fit(datasets['train_dataset'],
steps_per_epoch=datasets['n_train_steps'],
epochs=data_config['n_epochs'],
validation_data=datasets['valid_dataset'],
validation_steps=datasets['n_valid_steps'],
callbacks=callbacks,
initial_epoch=initial_epoch,
verbose=fit_verbose)

#cray added
#timemory.enable_signal_detection()
#timemory.settings.width = 12
#timemory.settings.precision = 6

#with profile(["wall_clock", "user_clock", "system_clock", "cpu_util",
# "peak_rss", "thread_cpu_clock", "thread_cpu_util"]):
#id = timemory.start_mpip()

#with marker(['wall_clock','cpu_util','peak_rss','cpu_roofline_flops','gpu_roofline_flops','user_mpip_bundle','read_bytes', 'written_bytes'], key="marker_ctx_manager"):

#with marker(['wall_clock','cpu_util','peak_rss','cpu_roofline_flops','user_mpip_bundle','read_bytes', 'written_bytes'], key="marker_ctx_manager"):

#with profile(['wall_clock','cpu_util','peak_rss','cpu_roofline_sp_flops','gpu_roofline_flops','user_mpip_bundle',
# 'read_bytes', 'written_bytes'], flat=True, timeline=False):

#components = ['wall_clock', 'cpu_util', 'peak_rss','read_bytes', 'written_bytes','thread_cpu_util','user_mpip_bundle','thread_cpu_clock']
timemory.settings.flat_profile = False
timemory.settings.timeline_profile = False

components = ['wall_clock','peak_rss','read_bytes','written_bytes']
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should add in the 'user_global_bundle' here. It will essentially be zero overhead and it will allow you to add other components on-the-fly without and modifications to the script, e.g. export TIMEMORY_GLOBAL_COMPONENTS="papi_vector, virtual_memory, gperftools_cpu_profiler"

timemory.enable_signal_detection()
timemory.settings.width = 12
timemory.settings.precision = 6
timemory.settings.mpip_components = ','.join(components)
id = timemory.start_mpip()

#with profile(components, flat=True, timeline=False):
with marker(components, key="marker_ctx_manager"):

model.fit(datasets['train_dataset'],
steps_per_epoch=datasets['n_train_steps'],
epochs=data_config['n_epochs'],
validation_data=datasets['valid_dataset'],
validation_steps=datasets['n_valid_steps'],
callbacks=callbacks,
initial_epoch=initial_epoch,
verbose=fit_verbose)

timemory.stop_mpip(id)
timemory.finalize()

# Print training summary
if rank == 0:
Expand All @@ -229,5 +282,10 @@ def main():
if rank == 0:
logging.info('All done!')


if __name__ == '__main__':

#with profile(['wall_clock','cpu_util','peak_rss','cpu_roofline_sp_flops','user_mpip_bundle',
# 'read_bytes', 'written_bytes'], flat=True, timeline=False):
main()