
Commit 0bb74e1

Merge branch 'develop' of github.com:LLNL/lbann into develop
2 parents: e64dce8 + 88df806

File tree

12 files changed: +264 -45 lines


include/lbann/proto/lbann_proto_common.hpp

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ lbann::optimizer_factory * init_optimizer_factory(
 void init_callbacks(
   lbann::lbann_comm *comm,
   lbann::sequential_model *model,
+  std::map<execution_mode, lbann::DataReader*> &data_readers,
   const lbann_data::LbannPB &p);

 ///
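
For reference, a minimal sketch of a call that satisfies the new declaration; the surrounding variables (comm, model, pb, pb_reader, pb_model) are assumed to be set up as in model_zoo/lbann_proto.cpp later in this commit:

    // Sketch only: mirrors the updated call site in model_zoo/lbann_proto.cpp.
    std::map<execution_mode, lbann::DataReader*> data_readers;
    init_data_readers(comm->am_world_master(), pb_reader, data_readers,
                      pb_model->mini_batch_size());
    init_callbacks(comm, model, data_readers, pb);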

include/lbann/regularization/lbann_batch_normalization.hpp

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ class batch_normalization : public regularizer {
   void setup(Layer* l);
   void update();
  protected:
-  lbann_comm* comm;
+  lbann_comm* m_comm;
   /** For learning gamma and beta. */
   optimizer* m_gamma_optimizer;
   optimizer* m_beta_optimizer;

include/lbann/regularization/lbann_dropout.hpp

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ class dropout : public regularizer {
   /** Adjust gradients for dropout in backprop. */
   void bp_activations();
  protected:
-  lbann_comm* comm;
+  lbann_comm* m_comm;
   /** Probability of keeping each unit. */
   float m_keep_prob;
 #ifdef LBANN_PROCDET_DROPOUT

model_zoo/lbann_proto.cpp

Lines changed: 2 additions & 5 deletions
@@ -120,7 +120,6 @@ int main(int argc, char* argv[])
   //@todo: code not in place for correctly handling image preprocessing
   ///////////////////////////////////////////////////////////////////
   const lbann_data::Model &m2 = pb.model();
-  cerr << "calling init_data_readers: " << pb_model->mini_batch_size() << " " << m2.mini_batch_size() << endl << endl;
   std::map<execution_mode, DataReader*> data_readers;
   init_data_readers(comm->am_world_master(), pb_reader, data_readers, pb_model->mini_batch_size());
   if (comm->am_world_master()) {
@@ -129,7 +128,6 @@ int main(int argc, char* argv[])
       << " num data: " << it.second->getNumData() << endl;
     }
   }
-  cerr << "DONE calling init_data_readers: " << pb_model->mini_batch_size() << " " << m2.mini_batch_size() << endl << endl;

   //user feedback
   if (comm->am_world_master()) {
@@ -164,7 +162,7 @@ int main(int argc, char* argv[])
 #endif
   sequential_model * model = init_model(comm, optimizer_fac, pb);
   add_layers(model, data_readers, cudnn, pb);
-  init_callbacks(comm, model, pb);
+  init_callbacks(comm, model, data_readers, pb);
   model->setup();

   // restart model from checkpoint if we have one
@@ -175,8 +173,7 @@ int main(int argc, char* argv[])
   // main loop for training/testing
   ///////////////////////////////////////////////////////////////////
   while (model->get_cur_epoch() < pb_model->num_epochs()) {
-    model->train(1, true);
-    model->evaluate(execution_mode::testing);
+    model->train(1, pb_model->evaluation_frequency());
   }

   // @todo: figure out and implement coherent strategy
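
The new single-argument loop folds evaluation into train(). A rough sketch of the intended behavior, assuming (per the evaluation_frequency comment in the example prototext below) that the value counts epochs between validation runs and that anything below 1 disables evaluation; this function is an illustration, not the committed implementation:

    // Hypothetical expansion of model->train(1, eval_freq).
    void train_loop(lbann::sequential_model* model, int64_t num_epochs,
                    int64_t eval_freq) {
      while (model->get_cur_epoch() < num_epochs) {
        model->train(1, eval_freq);  // train one epoch
        // conceptually: if eval_freq > 0 and the current epoch is a multiple
        // of eval_freq, evaluate on the validation set; the old loop instead
        // called model->evaluate(execution_mode::testing) after every epoch.
      }
    }
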
Lines changed: 168 additions & 0 deletions
@@ -0,0 +1,168 @@
+/*
+ * This prototext file cannot be used as input to model_zoo/lbann_proto;
+ * instead, it's provided to illustrate all (or most) of the various items
+ * that can appear in a prototext file. I'm pretty sure that comments such
+ * as this one, and those that appear below, will cause an exception
+ * if they're included in an actual prototext file.
+ */
+
+model {
+  name: "dnn" //dnn, stacked_autoencoder, or greedy_layerwise_autoencoder
+  objective_function: "categorical_cross_entropy" //categorical_cross_entropy or mean_squared_error
+  metric: "categorical_accuracy" //categorical_accuracy or mean_squared_error
+  mini_batch_size: 192
+  num_epochs: 10
+  num_parallel_readers: 0
+  procs_per_model: 0
+  use_cudnn: false
+  num_gpus: -1
+  evaluation_frequency: 1 //how often to evaluate the model on the validation set; a value less than 1 disables evaluation
+
+  optimizer {
+    name: "adagrad" //adagrad, rmsprop, adam or sgd
+    learn_rate: 0.01
+    momentum: 0.9
+    decay: 0.5
+    nesterov: true
+  }
+
+  /*
+   * "data_layout" fields should be either data_parallel or model_parallel
+   *
+   * activation_type: sigmoid, tanh, relu, id, leaky_relu, smooth_relu or elu
+   * (see: include/lbann/layers/lbann_layer_activations.hpp; sigmoid = 1)
+   *
+   * weight_initialization: zero, uniform, normal, glorot_normal,
+   * glorot_uniform, he_normal or he_uniform
+   * (see: include/lbann/lbann_base.hpp; zero = 0)
+   *
+   * Each layer has a unique identifying index. These need not be sequential.
+   * For now layers are instantiated wrt the order in which they
+   * appear below; in the future they will be instantiated wrt the parent
+   * and child fields, and ordering within the prototext file will be ignored.
+   * -1 indicates there is no parent or child layer.
+   */
+
+  layer {
+    input_distributed_minibatch_parallel_io {
+      data_layout: "data_parallel"
+    }
+    index: 0
+    parent: -1
+    child: 1
+  }
+
+  layer {
+    fully_connected {
+      data_layout: "data_parallel"
+      num_neurons: 100
+      activation_type: "sigmoid"
+      weight_initialization: "glorot_uniform"
+    }
+    index: 1
+    parent: 0
+    child: 2
+  }
+
+  layer {
+    convolution {
+      num_dims: 2
+      num_input_channels: 32
+      input_dims: "26 26"
+      num_output_channels: 32
+      filter_dims: "3 3"
+      conv_pads: "0 0"
+      conv_strides: "1 1"
+      activation_type: "relu"
+      weight_initialization: "glorot_uniform"
+    }
+    index: 2
+    parent: 1
+    child: 3
+  }
+
+  layer {
+    pooling {
+      num_dims: 2
+      num_channels: 32
+      input_dims: "24 24"
+      pool_dims: "2 2"
+      pool_pads: "0 0"
+      pool_strides: "2 2"
+      pool_mode: "max" //max, average, average_no_pad
+    }
+    index: 3
+    parent: 2
+    child: 4
+  }
+
+  layer {
+    softmax {
+      data_layout: "data_parallel"
+      num_neurons: 10
+      weight_initialization: "glorot_uniform"
+    }
+    index: 4
+    parent: 3
+    child: 5
+  }
+
+  layer {
+    target_distributed_minibatch_parallel_io {
+      data_layout: "data_parallel"
+      shared_data_reader: true
+    }
+    index: 5
+    parent: 4
+    child: -1
+  }
+
+  /*
+   * except where noted, regularizer values below are the current lbann defaults
+   */
+  regularizer {
+    batch_normalization {
+      data_layout: "model_parallel"
+      decay: 0.9
+      gamma: 1.0
+      beta: 0.0
+    }
+  }
+  regularizer {
+    dropout {
+      data_layout: "model_parallel"
+      keep_prob: 0.5
+    }
+  }
+  regularizer {
+    l2_regularization {
+      lambda: 0 //no default in
+    }
+  }
+
+  /*
+   * I'm not listing all the callbacks here ... see src/proto/lbann.proto for
+   * additional info
+   */
+  callback {
+    save_images {
+      image_dir: "images"
+      extension: "png"
+    }
+  }
+  callback {
+    print {
+      interval: 1
+    }
+  }
+  callback {
+    timer {
+      dir: "none"
+    }
+  }
+  callback {
+    summary {
+      dir: "none"
+      interval: 1
+    }
+  }
+}
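
The header comment's warning about comments matches stock protobuf text format, which accepts '#'-style comments but not the C++-style '//' used above. A minimal sketch of loading a prototext file into the generated message; load_prototext is a hypothetical helper, and lbann.pb.h stands for the header generated from src/proto/lbann.proto:

    #include <fstream>
    #include <sstream>
    #include <string>
    #include <google/protobuf/text_format.h>
    #include "lbann.pb.h"  // generated from src/proto/lbann.proto

    // Read a prototext file and parse it into an LbannPB message.
    // Returns false on parse errors (e.g. unsupported '//' comments).
    bool load_prototext(const std::string& path, lbann_data::LbannPB& pb) {
      std::ifstream in(path);
      std::stringstream buffer;
      buffer << in.rdbuf();
      return google::protobuf::TextFormat::ParseFromString(buffer.str(), &pb);
    }
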
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+model {
+  name: "greedy_layerwise_autoencoder"
+  objective_function: "mean_squared_error"
+  mini_batch_size: 192
+  num_epochs: 10
+  num_parallel_readers: 0
+  procs_per_model: 0
+  use_cudnn: false
+  block_size: 256
+  evaluation_frequency: -1
+
+  regularizer {
+    dropout {
+      data_layout: "model_parallel"
+      keep_prob: -1.0
+    }
+  }
+
+  layer {
+    input_distributed_minibatch_parallel_io {
+      data_layout: "model_parallel"
+    }
+  }
+  layer {
+    fully_connected {
+      data_layout: "data_parallel"
+      num_neurons: 32
+      activation_type: "sigmoid"
+      weight_initialization: "glorot_uniform"
+    }
+  }
+  optimizer {
+    name: "rmsprop"
+    learn_rate: 0.01
+    momentum: 0.9
+    decay: 0.5
+    nesterov: true
+  }
+}

scripts/build_lbann_lc_env.sh

Lines changed: 4 additions & 4 deletions
@@ -28,6 +28,9 @@ fi
 # Detect the cuda toolkit version loaded or use default
 if [ "${HasGPU}" == "" ] ; then
     echo "This platform has no GPU"
+elif [ "${ARCH}" == "ppc64le" ]; then
+    CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda
+    CUDATOOLKIT_VERSION=`ls -l ${CUDA_TOOLKIT_ROOT_DIR} | awk '{print $NF}' | cut -d '-' -f 2`
 elif [ "$CUDA_PATH" == "" ] || [ `basename "$CUDA_PATH"` == "" ] ; then
     # use default
     CUDATOOLKIT_VERSION=8.0
@@ -37,9 +40,6 @@ elif [ "$CUDA_PATH" == "" ] || [ `basename "$CUDA_PATH"` == "" ] ; then
     elif [ -d /opt/cudatoolkit-$CUDATOOLKIT_VERSION ] ; then
         CUDA_TOOLKIT_ROOT_DIR=/opt/cudatoolkit-$CUDATOOLKIT_VERSION
     fi
-elif [ "${ARCH}" == "ppc64le" ]; then
-    CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda
-    CUDATOOLKIT_VERSION=`ls -l ${CUDA_TOOLKIT_ROOT_DIR} | awk '{print $NF}' | cut -d '-' -f 2`
 else
     CUDATOOLKIT_VERSION=`basename "$CUDA_PATH" | sed 's/cudatoolkit-//'`
     CUDA_TOOLKIT_ROOT_DIR=$CUDA_PATH
@@ -72,7 +72,7 @@ fi
 if [ "${ARCH}" == "x86_64" ]; then
     cuDNN_DIR=/usr/gapps/brain/installs/cudnn/v5
 elif [ "${ARCH}" == "ppc64le" ]; then
-    cuDNN_DIR=""
+    cuDNN_DIR="/usr/gapps/brain/cuda/targets/ppc64le-linux"
 fi
 ELEMENTAL_MATH_LIBS=
 PATCH_OPENBLAS=ON

src/layers/lbann_layer_fully_connected.cpp

Lines changed: 2 additions & 2 deletions
@@ -393,7 +393,7 @@ DataType lbann::FullyConnectedLayer::checkGradient(Layer& PrevLayer, const DataT

   if(bad_E1) {
     if(Acts_E1.Grid().Rank() == 0) {
-      printf("BAD ENTRY Acts_E1 %d x %d\n", row, col);
+      printf("BAD ENTRY Acts_E1 %lld x %lld\n", row, col);
     }
     cout.precision(20);
     Print(Acts_E1);
@@ -404,7 +404,7 @@ DataType lbann::FullyConnectedLayer::checkGradient(Layer& PrevLayer, const DataT
   }
   if(bad_E2) {
     if(Acts_E2.Grid().Rank() == 0) {
-      printf("BAD ENTRY Acts_E2 %d x %d\n", row, col);
+      printf("BAD ENTRY Acts_E2 %lld x %lld\n", row, col);
     }
     Print(Acts_E2);
   }
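
The %d to %lld change matters because row and col are Elemental matrix indices; in builds where the index type is 64-bit, printing one with %d is undefined behavior. A standalone sketch of the corrected pattern (the PRId64 variant is an alternative noted here, not what the commit uses):

    #include <cstdio>
    #include <cinttypes>

    int main() {
      long long row = 4096, col = 12;  // stand-ins for 64-bit indices
      // matches the commit's fix: the specifier agrees with the arguments
      std::printf("BAD ENTRY Acts_E1 %lld x %lld\n", row, col);
      // equivalent fixed-width form:
      std::printf("BAD ENTRY Acts_E1 %" PRId64 " x %" PRId64 "\n",
                  static_cast<int64_t>(row), static_cast<int64_t>(col));
      return 0;
    }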

src/proto/lbann.proto

Lines changed: 11 additions & 2 deletions
@@ -58,6 +58,8 @@ message Model {
   int64 procs_per_model = 51;
   int64 num_parallel_readers = 52;
   int64 num_gpus = 53;
+  int64 evaluation_frequency = 54;
+  repeated Regularizer regularizer = 55;

   //use cudnn_manager, if use_cudnn=true AND lbann was compiled with cudnn support
   bool use_cudnn = 8;
@@ -99,6 +101,12 @@ message Callback {
   CallbackDumpActivations dump_activations = 5;
   CallbackDumpGradients dump_gradients = 6;
   CallbackImComm imcomm = 7;
+  CallbackSaveImages save_images = 8;
+}
+
+message CallbackSaveImages {
+  string image_dir = 1;
+  string extension = 2;
 }

 message CallbackPrint {
@@ -180,8 +188,9 @@ message L2Regularization {
 //

 message Layer {
-  int64 index = 2; //corresponds to index wrt std::vector<Layer*>
-                   //not currently used
+  int64 index = 2;
+  int64 parent = 3;
+  int64 child = 4;

   // a Layer should contain exactly one of the following
   InputDistributedMiniBatchParallelIO input_distributed_minibatch_parallel_io = 8;
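
A short sketch of how the new fields surface through protobuf's generated C++ accessors; inspect_new_fields is a hypothetical helper, and the accessor names follow protobuf's standard lowering of the field names above:

    #include <cstdint>
    #include <string>
    #include "lbann.pb.h"  // generated from this .proto file

    void inspect_new_fields(const lbann_data::LbannPB& pb,
                            const lbann_data::Callback& cb,
                            const lbann_data::Layer& layer) {
      const lbann_data::Model& m = pb.model();
      int64_t eval_freq = m.evaluation_frequency();  // Model field 54
      for (const lbann_data::Regularizer& reg : m.regularizer()) {  // field 55
        (void)reg;  // dispatch on whichever regularizer sub-message is set
      }
      if (cb.has_save_images()) {                    // Callback field 8
        const std::string& dir = cb.save_images().image_dir();
        const std::string& ext = cb.save_images().extension();
        (void)dir; (void)ext;
      }
      // Layer graph fields; -1 marks "no parent" / "no child" in the
      // example prototext earlier in this commit.
      int64_t parent = layer.parent();
      int64_t child  = layer.child();
      (void)eval_freq; (void)parent; (void)child;
    }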
