
Commit ffecbef

Merge branch 'release-v0.95'
============================== Release Notes: v0.95 ==============================
Support for new training algorithms:
- Generative Adversarial Networks (GAN)

Support for new network structures:
- Variational Autoencoders
- GAN
- CycleGAN
- Combined Autoencoders with CycleGAN
- Deep Recurrent Attention Model (DRAM), Ba et al. (2015)
- Video Recurrent Attention Model (VRAM)

Support for new layers:
- Optimized Top-K accuracy (CPU, GPU)
- Crop (CPU, GPU)
- Sort (CPU, GPU), both ascending and descending order
- Absolute value (CPU, GPU)
- Mean-squared (CPU, GPU)
- Top-K categorical accuracy (CPU, GPU)
- Cross-entropy (CPU, GPU)
- Stop gradient (CPU, GPU)

Performance optimizations:
- Use pinned memory for CPU activation matrices
- Non-blocking GPU computation of objective functions and metrics
- Refactored weight matrices and weight initialization
- Manage GPU workspace buffers with a memory pool
- Slice and concatenation layers emit matrix views where possible
- Use more fine-grained asynchronous calls when using the Aluminum library
- Minimized GPU stream synchronization events per call
- Improved / minimized synchronization events when using a single GPU
- Fixed GPU workspace size
- GPU implementation of the Adagrad optimizer
- GPU model-parallel softmax
- Optimized local CUDA kernel implementations
- Support for distributed matrices with arbitrary alignment

Model portability & usability:
- Keras to LBANN prototext conversion tool

Internal features:
- Support for multiple objective functions and metrics per network with arbitrary placement
- Objective functions represented as layers
- Metrics represented as layers
- Introduced evaluation layer construct
- Ability to freeze specific layers for pre-training / fine-tuning
- Refactored tensor setup in setup, forward prop, and back prop
- Layers store matrices in private smart pointers
- Model automatically inserts evaluation layers where needed
- Copy layer activations between models
- Annotated GPU profiling output with training phases
- Fixed initialization of the Comm object and Grid objects when using multiple models
- General code cleanup, refactoring, and various bug fixes
- All layers overwrite error signal matrices
- NCCL backend is now implemented via the Aluminum library
- MPI calls are routed through the LBANN Comm object into Hydrogen or Aluminum
- Provide a runtime statistics summary from every rank
- Reworked LBANN to use Hydrogen to manage GPU memory
- GPU allocations now go through a CUB memory pool
- Fixed Spack build interaction with the Hydrogen library

I/O & data readers:
- Support for Conduit objects with HDF5 formatting
- In-memory and locally offloaded data store
- Data store can hold the entire training set in memory (or node-local storage)
- Data store will shuffle data samples between epochs and present samples to the input layer
- Updated synthetic data reader
- Modified data readers to handle bad samples in JAG Conduit data
- Reworked the I/O layers (input and target) so that the input layer produces both the
  sample and label / response if necessary
- Target layer is being deprecated
- Updated image data reader to use cv::imdecode to accelerate image load times
- Allow users to specify an array of data sources for the independent/dependent
  variables via prototext
2 parents ca499c9 + 46cf7ba commit ffecbef

File tree: 559 files changed (+39,363 / -19,206 lines changed)


CMakeLists.txt

Lines changed: 42 additions & 45 deletions
@@ -54,7 +54,7 @@ if (GIT_REPO)
   set(${UPPER_PROJECT_NAME}_VERSION ${GIT_VERSION}
     CACHE STRING "LBANN's version string")
 else ()
-  set(${UPPER_PROJECT_NAME}_VERSION v0.94
+  set(${UPPER_PROJECT_NAME}_VERSION v0.95
     CACHE STRING "LBANN's version string")
 endif (GIT_REPO)
@@ -72,8 +72,6 @@ option(${UPPER_PROJECT_NAME}_WARNINGS_AS_ERRORS
 
 option(${UPPER_PROJECT_NAME}_WITH_CUDA "Include Nvidia CUDA" OFF)
 
-option(${UPPER_PROJECT_NAME}_WITH_NCCL "Include Nvidia NCCL2" OFF)
-
 option(${UPPER_PROJECT_NAME}_WITH_CUDNN "Include Nvidia cuDNN" ON)
 
 option(${UPPER_PROJECT_NAME}_WITH_CNPY "Include cnpy" ON)
@@ -89,7 +87,7 @@ option(${UPPER_PROJECT_NAME}_WITH_NVPROF
 option(${UPPER_PROJECT_NAME}_WITH_TOPO_AWARE
   "Enable topology-aware profiling (HWLOC)" ON)
 
-option(${UPPER_PROJECT_NAME}_WITH_ALUMINUM
+option(${UPPER_PROJECT_NAME}_WITH_ALUMINUM
   "Enable Aluminum all-reduce library" OFF)
 
 option(${UPPER_PROJECT_NAME}_WITH_CONDUIT
@@ -134,14 +132,10 @@ endif ()
 set(LBANN_TOPO_AWARE ${${UPPER_PROJECT_NAME}_WITH_TOPO_AWARE})
 
 # Enable parallel random matrix generation, if possible
-if (${UPPER_PROJECT_NAME}_SEQUENTIAL_INITIALIZATION)
-  set(LBANN_SEQUENTIAL_CONSISTENCY TRUE)
-  set(LBANN_PROCDET_DROPOUT TRUE)
-  set(LBANN_PARALLEL_RANDOM_MATRICES FALSE)
+if (${UPPER_PROJECT_NAME}_DETERMINISTIC)
+  set(LBANN_DETERMINISTIC TRUE)
 else()
-  set(LBANN_SEQUENTIAL_CONSISTENCY FALSE)
-  set(LBANN_PROCDET_DROPOUT FALSE)
-  set(LBANN_PARALLEL_RANDOM_MATRICES TRUE)
+  set(LBANN_DETERMINISTIC FALSE)
 endif ()
 
 #
@@ -170,6 +164,29 @@ include(SetupElemental)
 find_package(OpenCV REQUIRED)
 set(LBANN_HAS_OPENCV ${OpenCV_FOUND})
 
+if (LBANN_WITH_ALUMINUM)
+  find_package(Aluminum)
+  set(LBANN_HAS_ALUMINUM ${Aluminum_FOUND})
+  if (NOT LBANN_HAS_ALUMINUM)
+    message(FATAL_ERROR
+      "Requested LBANN_WITH_ALUMINUM but Aluminum not found. "
+      "Aluminum is now disabled. "
+      "Try specifying ALUMINUM_DIR as the root of an ALUMINUM install. "
+      "Alternatively, build with LBANN_WITH_ALUMINUM=OFF.")
+    set(LBANN_WITH_ALUMINUM OFF)
+  endif(NOT LBANN_HAS_ALUMINUM)
+
+  if (AL_HAS_CUDA AND NOT LBANN_WITH_CUDA)
+    message(WARNING
+      "Aluminum has CUDA but LBANN is configured with LBANN_WITH_CUDA=OFF")
+  endif ()
+
+  option(LBANN_BUILT_WITH_SPECTRUM "LBANN was built with Spectrum MPI" OFF)
+  if (LBANN_BUILT_WITH_SPECTRUM)
+    set(LBANN_ALUMINUM_MPI_PASSTHROUGH ON)
+  endif (LBANN_BUILT_WITH_SPECTRUM)
+endif (LBANN_WITH_ALUMINUM)
+
 # Setup some additional CUDA-y things
 if (LBANN_HAS_CUDA)
   if (NOT LBANN_WITH_CUDNN)
@@ -184,18 +201,12 @@ if (LBANN_HAS_CUDA)
 
   set(LBANN_HAS_CUDNN ${CUDNN_FOUND})
 
-  if (LBANN_WITH_NCCL)
-    find_package(NCCL 2.0.0 REQUIRED)
-    set(LBANN_HAS_NCCL2 ${NCCL_FOUND})
-    if (NOT LBANN_HAS_NCCL2)
-      message(FATAL_ERROR
-        "Requested LBANN_WITH_NCCL but NCCL not found. "
-        "NCCL is now disabled. "
-        "Try specifying NCCL_DIR as the root of a NCCL install. "
-        "Alternatively, build with LBANN_WITH_NCCL=OFF.")
-      set(LBANN_WITH_NCCL OFF)
-    endif (NOT LBANN_HAS_NCCL2)
-  endif (LBANN_WITH_NCCL)
+  if (LBANN_HAS_ALUMINUM AND AL_HAS_NCCL)
+    set(LBANN_HAS_NCCL2 TRUE)
+  else ()
+    set(LBANN_HAS_NCCL2 FALSE)
+  endif ()
+
 endif (LBANN_HAS_CUDA)
 
 # This shouldn't be here, but is ok for now. This will occasionally be
@@ -219,6 +230,10 @@ if (LBANN_WITH_VTUNE)
   include(SetupVTune)
 endif ()
 
+if (LBANN_WITH_NVPROF)
+  set(LBANN_NVPROF TRUE)
+endif ()
+
 if (LBANN_WITH_CNPY)
   find_package(CNPY)
   set(LBANN_HAS_CNPY ${CNPY_FOUND})
@@ -246,23 +261,6 @@ if (LBANN_TOPO_AWARE)
   endif (NOT HWLOC_FOUND)
 endif (LBANN_TOPO_AWARE)
 
-if (LBANN_WITH_ALUMINUM)
-  find_package(ALUMINUM)
-  set(LBANN_HAS_ALUMINUM ${ALUMINUM_FOUND})
-  if (NOT LBANN_HAS_ALUMINUM)
-    message(FATAL_ERROR
-      "Requested LBANN_WITH_ALUMINUM but Aluminum not found. "
-      "Aluminum is now disabled. "
-      "Try specifying ALUMINUM_DIR as the root of an ALUMINUM install. "
-      "Alternatively, build with LBANN_WITH_ALUMINUM=OFF.")
-    set(LBANN_WITH_ALUMINUM OFF)
-  endif(NOT LBANN_HAS_ALUMINUM)
-  option(LBANN_BUILT_WITH_SPECTRUM "LBANN was built with Spectrum MPI" OFF)
-  if (LBANN_BUILT_WITH_SPECTRUM)
-    set(LBANN_ALUMINUM_MPI_PASSTHROUGH ON)
-  endif (LBANN_BUILT_WITH_SPECTRUM)
-endif (LBANN_WITH_ALUMINUM)
-
 if (LBANN_WITH_CONDUIT)
   find_package(CONDUIT)
   set(LBANN_HAS_CONDUIT ${CONDUIT_FOUND})
@@ -276,7 +274,7 @@ if (LBANN_WITH_CONDUIT)
 endif (LBANN_WITH_CONDUIT)
 
 # Handle the documentation
-add_subdirectory(doc)
+add_subdirectory(docs)
 
 ################################################################
 # Build LBANN
@@ -322,7 +320,7 @@ if (LBANN_TOPO_AWARE)
 endif ()
 
 if (LBANN_HAS_ALUMINUM)
-  target_link_libraries(lbann PUBLIC ALUMINUM::ALUMINUM)
+  target_link_libraries(lbann PUBLIC ${Aluminum_LIBRARIES})
 endif ()
 
 if (LBANN_HAS_CONDUIT)
@@ -334,8 +332,7 @@ endif ()
 if (LBANN_HAS_CUDA)
   target_link_libraries(lbann PUBLIC ${CUDA_LIBRARIES})
   target_link_libraries(lbann PUBLIC cuda::toolkit)
-  if (WITH_NVPROF)
-    add_definitions(-DLBANN_NVPROF)
+  if (LBANN_WITH_NVPROF)
     target_link_libraries(lbann PUBLIC ${NVTX_LIBRARIES})
   endif ()
   target_link_libraries(lbann PUBLIC ${cuBLAS_LIBRARIES})
@@ -360,7 +357,6 @@ target_link_libraries(lbann PUBLIC ${DL_LIBRARY})
 # Add the rest of the things
 add_subdirectory(model_zoo)
 add_subdirectory(model_zoo/tests)
-add_subdirectory(model_zoo/historical)
 add_subdirectory(tests)
 
 ################################################################
@@ -429,6 +425,7 @@ message(" LBANN_HAS_PROTOBUF: ${LBANN_HAS_PROTOBUF}")
 message(" LBANN_HAS_CNPY: ${LBANN_HAS_CNPY}")
 message(" LBANN_HAS_TBINF: ${LBANN_HAS_TBINF}")
 message(" LBANN_HAS_VTUNE: ${LBANN_HAS_VTUNE}")
+message(" LBANN_NVPROF: ${LBANN_NVPROF}")
 message(" LBANN_HAS_DOXYGEN: ${LBANN_HAS_DOXYGEN}")
 message(" LBANN_HAS_LBANN_PROTO:${LBANN_HAS_LBANN_PROTO}")
 message(" LBANN_HAS_ALUMINUM: ${LBANN_HAS_ALUMINUM}")

ReleaseNotes.txt

Lines changed: 204 additions & 0 deletions
============================== Release Notes: v0.95 ==============================
Support for new training algorithms:
- Generative Adversarial Networks (GAN)

Support for new network structures:
- Variational Autoencoders
- GAN
- CycleGAN
- Combined Autoencoders with CycleGAN
- Deep Recurrent Attention Model (DRAM), Ba et al. (2015)
- Video Recurrent Attention Model (VRAM)

Support for new layers:
- Optimized Top-K accuracy (CPU, GPU)
- Crop (CPU, GPU)
- Sort (CPU, GPU), both ascending and descending order
- Absolute value (CPU, GPU)
- Mean-squared (CPU, GPU)
- Top-K categorical accuracy (CPU, GPU)
- Cross-entropy (CPU, GPU)
- Stop gradient (CPU, GPU)

Performance optimizations:
- Use pinned memory for CPU activation matrices
- Non-blocking GPU computation of objective functions and metrics
- Refactored weight matrices and weight initialization
- Manage GPU workspace buffers with a memory pool
- Slice and concatenation layers emit matrix views where possible
- Use more fine-grained asynchronous calls when using the Aluminum library
- Minimized GPU stream synchronization events per call
- Improved / minimized synchronization events when using a single GPU
- Fixed GPU workspace size
- GPU implementation of the Adagrad optimizer
- GPU model-parallel softmax
- Optimized local CUDA kernel implementations
- Support for distributed matrices with arbitrary alignment

Model portability & usability:
- Keras to LBANN prototext conversion tool

Internal features:
- Support for multiple objective functions and metrics per network with arbitrary placement
- Objective functions represented as layers
- Metrics represented as layers
- Introduced evaluation layer construct
- Ability to freeze specific layers for pre-training / fine-tuning
- Refactored tensor setup in setup, forward prop, and back prop
- Layers store matrices in private smart pointers
- Model automatically inserts evaluation layers where needed
- Copy layer activations between models
- Annotated GPU profiling output with training phases
- Fixed initialization of the Comm object and Grid objects when using multiple models
- General code cleanup, refactoring, and various bug fixes
- All layers overwrite error signal matrices
- NCCL backend is now implemented via the Aluminum library
- MPI calls are routed through the LBANN Comm object into Hydrogen or Aluminum
- Provide a runtime statistics summary from every rank
- Reworked LBANN to use Hydrogen to manage GPU memory
- GPU allocations now go through a CUB memory pool
- Fixed Spack build interaction with the Hydrogen library

I/O & data readers:
- Support for Conduit objects with HDF5 formatting
- In-memory and locally offloaded data store
- Data store can hold the entire training set in memory (or node-local storage)
- Data store will shuffle data samples between epochs and present samples to the input layer
- Updated synthetic data reader
- Modified data readers to handle bad samples in JAG Conduit data
- Reworked the I/O layers (input and target) so that the input layer produces both the
  sample and label / response if necessary
- Target layer is being deprecated
- Updated image data reader to use cv::imdecode to accelerate image load times
- Allow users to specify an array of data sources for the independent/dependent
  variables via prototext

============================== Release Notes: v0.94 ==============================
Support for new training algorithms:
- Back-Propagation Through Time (BPTT)
-- Recurrent Neural Networks (RNN)
-- Long Short-Term Memories (LSTM)
- Generative Adversarial Networks (GAN)
- Variational autoencoders
- Convolutional autoencoders
- Fine-tuning of pretrained networks
-- Flexible weight freezing
- Context-prediction network (Siamese network)
- Livermore Tournament Fast Batch learning (LTFB)
- Variable mini-batch sizes

Support for new network structures:
- Directed Acyclic Graph (DAG) networks
- Residual networks
- Modular and composable objective functions
- Multiple metrics
- Shared weight matrices
- (BETA) New evaluation layer that can attach to any point of the DAG
- Motifs (compound, reused network patterns)

Support for new layers:
- Learning:
-- Deconvolution
- Metrics:
-- Top-K categorical accuracy, Pearson correlation, Mean absolute deviation
- Loss functions:
-- Cross Entropy with Uncertainty, Geometric negative log likelihood
-- Poisson negative log likelihood, Polya negative log likelihood
- Optimizers:
-- Hypergradient Adam
- Transform layers:
-- Concatenation, Noise, Unpooling, Pooling, Reshape, Slice, Split, Sum
- Regularizers:
-- Batch normalization, SELU dropout, Local Response Normalization (LRN)
- Activations:
-- Leaky ReLU, Smooth ReLU, ELU, Scaled ELU, Softplus, Atan,
-- Bent Identity, Exponential

Performance optimizations:
- GPU acceleration for most layers
- NCCL 2.x
- Optimized communication patterns
- Asynchronous weight updates
- Asynchronous metric and objective function updates
- Batch normalization (global and local)
- L2 normalization
- Adaptive Quantization (inter-model)

Model portability & usability:
- Portable checkpoint / recovery
- Distributed checkpoint / recovery
- Network visualization
- Export LBANN to TensorFlow format

Internal features:
- Gradient checking
- Network representation using tensor dimensions
- Bamboo continuous integration (CI)
- Improved data processing pipeline

New data readers:
- NumPy
- CSV
- Methods for merging multiple features and samples across files
- CANDLE Pilot 2
- CANDLE Pilot 1 Combo
- ICF JAG

Integration with Hydrogen, an optimized, distributed, dense linear algebra
library. Hydrogen is a fork of the Elemental library and optimizes distributed
matrices with elemental and block distributions, BLAS, LAPACK, and distributed
and local matrix management.

Integration with Aluminum, an optimized all-reduce communication library.
Aluminum provides custom reduction patterns, customized CUDA reduction kernels,
and asynchronous communication operators. It uses MPI, MPI with GPUDirect, or
NCCL as back-end libraries. Aluminum enables us to make effective use of
non-blocking all-reduces during backprop / optimization.

Additionally, we have added support for an online, distributed data store. When
enabled, LBANN is able to ingest the entire training data set in a distributed
fashion across all ranks. Each data store is then able to serve its portion of
a mini-batch, dynamically moving data to the necessary ranks in the model (based
on the mini-batch data distribution).

============================== Release Notes: v0.93 ==============================
This release contains a major refactoring / overhaul of the code base.
Key highlights include:
- Moving the layer design toward smaller, simpler layers that have a single
  compute behavior per layer. Specifically, linear combination of the
  inputs, non-linear activations, and regularizers now exist as their
  own layers.
- Layers now have a template parameter that specifies the data layout
  for the distributed matrices.
- The prototext interface for specifying neural network models and data
  readers is nearly fully functional.
- Code now adheres to the internal coding style as outlined in
  README_coding_style.txt.
- Dead code has been eliminated and the layer file hierarchy has been
  cleaned up.

============================== Release Notes: v0.92 ==============================
New features include (but are not limited to):
- Full support for convolutional and pooling layers
- GPU acceleration of local Elemental GEMM operations
- Improved network and data reader support
-- AlexNet
-- VGG
-- CIFAR-10
- Added a suite of regularizers, objective functions, and metrics, including:
-- Batch normalization
-- Dropout
-- L2
- Dramatically improved the performance of inter-model communication
- Added a suite of image preprocessing routines

============================== Release Notes: v0.91 ==============================
Incorporates a number of changes throughout the LBANN code base. In
particular, there is a new build system that has LBANN download all of its
dependencies into its build tree and compile them locally. Additional
improvements include optimizations in the data-parallel, multiple-model
training framework, support for convolutional layers, and general bug fixes.

============================== Release Notes: v0.90 ==============================
Initial release of the LBANN toolkit.
