
Commit 0bb74e1

Merge branch 'develop' of github.com:LLNL/lbann into develop
2 parents: e64dce8 + 88df806

File tree

12 files changed: +264 -45 lines


include/lbann/proto/lbann_proto_common.hpp

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ lbann::optimizer_factory * init_optimizer_factory(
 void init_callbacks(
   lbann::lbann_comm *comm,
   lbann::sequential_model *model,
+  std::map<execution_mode, lbann::DataReader*> &data_readers,
   const lbann_data::LbannPB &p);

 ///
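
For reference, a minimal sketch of a call that satisfies the new declaration; the surrounding variables (comm, model, pb, pb_reader, pb_model) are assumed to be set up as in model_zoo/lbann_proto.cpp later in this commit:

    // Sketch only: mirrors the updated call site in model_zoo/lbann_proto.cpp.
    std::map<execution_mode, lbann::DataReader*> data_readers;
    init_data_readers(comm->am_world_master(), pb_reader, data_readers,
                      pb_model->mini_batch_size());
    init_callbacks(comm, model, data_readers, pb);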

include/lbann/regularization/lbann_batch_normalization.hpp

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ class batch_normalization : public regularizer {
   void setup(Layer* l);
   void update();
  protected:
-  lbann_comm* comm;
+  lbann_comm* m_comm;
   /** For learning gamma and beta. */
   optimizer* m_gamma_optimizer;
   optimizer* m_beta_optimizer;

include/lbann/regularization/lbann_dropout.hpp

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ class dropout : public regularizer {
   /** Adjust gradients for dropout in backprop. */
   void bp_activations();
  protected:
-  lbann_comm* comm;
+  lbann_comm* m_comm;
   /** Probability of keeping each unit. */
   float m_keep_prob;
 #ifdef LBANN_PROCDET_DROPOUT

model_zoo/lbann_proto.cpp

Lines changed: 2 additions & 5 deletions
@@ -120,7 +120,6 @@ int main(int argc, char* argv[])
   //@todo: code not in place for correctly handling image preprocessing
   ///////////////////////////////////////////////////////////////////
   const lbann_data::Model &m2 = pb.model();
-  cerr << "calling init_data_readers: " << pb_model->mini_batch_size() << " " << m2.mini_batch_size() << endl << endl;
   std::map<execution_mode, DataReader*> data_readers;
   init_data_readers(comm->am_world_master(), pb_reader, data_readers, pb_model->mini_batch_size());
   if (comm->am_world_master()) {
@@ -129,7 +128,6 @@ int main(int argc, char* argv[])
       << " num data: " << it.second->getNumData() << endl;
     }
   }
-  cerr << "DONE calling init_data_readers: " << pb_model->mini_batch_size() << " " << m2.mini_batch_size() << endl << endl;

   //user feedback
   if (comm->am_world_master()) {
@@ -164,7 +162,7 @@ int main(int argc, char* argv[])
 #endif
   sequential_model * model = init_model(comm, optimizer_fac, pb);
   add_layers(model, data_readers, cudnn, pb);
-  init_callbacks(comm, model, pb);
+  init_callbacks(comm, model, data_readers, pb);
   model->setup();

   // restart model from checkpoint if we have one
@@ -175,8 +173,7 @@ int main(int argc, char* argv[])
   // main loop for training/testing
   ///////////////////////////////////////////////////////////////////
   while (model->get_cur_epoch() < pb_model->num_epochs()) {
-    model->train(1, true);
-    model->evaluate(execution_mode::testing);
+    model->train(1, pb_model->evaluation_frequency());
   }

   // @todo: figure out and implement coherent strategy
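
The new single-argument loop folds evaluation into train(). A rough sketch of the intended behavior, assuming (per the evaluation_frequency comment in the example prototext below) that the value counts epochs between validation runs and that anything below 1 disables evaluation; this function is an illustration, not the committed implementation:

    // Hypothetical expansion of model->train(1, eval_freq).
    void train_loop(lbann::sequential_model* model, int64_t num_epochs,
                    int64_t eval_freq) {
      while (model->get_cur_epoch() < num_epochs) {
        model->train(1, eval_freq);  // train one epoch
        // conceptually: if eval_freq > 0 and the current epoch is a multiple
        // of eval_freq, evaluate on the validation set; the old loop instead
        // called model->evaluate(execution_mode::testing) after every epoch.
      }
    }
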
Lines changed: 168 additions & 0 deletions
@@ -0,0 +1,168 @@
+/*
+ * This prototext file cannot be used as input to model_zoo/lbann_proto;
+ * instead, it's provided to illustrate all (or most) of the various items
+ * that can appear in a prototext file. I'm pretty sure that comments such
+ * as this one, and those that appear below, will cause an exception
+ * if they're included in an actual prototext file.
+ */
+
+model {
+  name: "dnn" //dnn, stacked_autoencoder, or greedy_layerwise_autoencoder
+  objective_function: "categorical_cross_entropy" //categorical_cross_entropy or mean_squared_error
+  metric: "categorical_accuracy" //categorical_accuracy or mean_squared_error
+  mini_batch_size: 192
+  num_epochs: 10
+  num_parallel_readers: 0
+  procs_per_model: 0
+  use_cudnn: false
+  num_gpus: -1
+  evaluation_frequency: 1 //how often to evaluate the model on the validation set; a value less than 1 disables evaluation
+
+  optimizer {
+    name: "adagrad" //adagrad, rmsprop, adam or sgd
+    learn_rate: 0.01
+    momentum: 0.9
+    decay: 0.5
+    nesterov: true
+  }
+
+  /*
+   * "data_layout" fields should be either data_parallel or model_parallel
+   *
+   * activation_type: sigmoid, tanh, relu, id, leaky_relu, smooth_relu or elu
+   * (see: include/lbann/layers/lbann_layer_activations.hpp; sigmoid = 1)
+   *
+   * weight_initialization: zero, uniform, normal, glorot_normal,
+   * glorot_uniform, he_normal or he_uniform
+   * (see: include/lbann/lbann_base.hpp; zero = 0)
+   *
+   * Each layer has a unique identifying index. These need not be sequential.
+   * For now layers are instantiated wrt the order in which they
+   * appear below; in the future they will be instantiated wrt the parent
+   * and child fields, and ordering within the prototext file will be ignored.
+   * -1 indicates there is no parent or child layer.
+   */
+
+  layer {
+    input_distributed_minibatch_parallel_io {
+      data_layout: "data_parallel"
+    }
+    index: 0
+    parent: -1
+    child: 1
+  }
+
+  layer {
+    fully_connected {
+      data_layout: "data_parallel"
+      num_neurons: 100
+      activation_type: "sigmoid"
+      weight_initialization: "glorot_uniform"
+    }
+    index: 1
+    parent: 0
+    child: 2
+  }
+
+  layer {
+    convolution {
+      num_dims: 2
+      num_input_channels: 32
+      input_dims: "26 26"
+      num_output_channels: 32
+      filter_dims: "3 3"
+      conv_pads: "0 0"
+      conv_strides: "1 1"
+      activation_type: "relu"
+      weight_initialization: "glorot_uniform"
+    }
+    index: 2
+    parent: 1
+    child: 3
+  }
+
+  layer {
+    pooling {
+      num_dims: 2
+      num_channels: 32
+      input_dims: "24 24"
+      pool_dims: "2 2"
+      pool_pads: "0 0"
+      pool_strides: "2 2"
+      pool_mode: "max" //max, average, average_no_pad
+    }
+    index: 3
+    parent: 2
+    child: 4
+  }
+
+  layer {
+    softmax {
+      data_layout: "data_parallel"
+      num_neurons: 10
+      weight_initialization: "glorot_uniform"
+    }
+    index: 4
+    parent: 3
+    child: 5
+  }
+
+  layer {
+    target_distributed_minibatch_parallel_io {
+      data_layout: "data_parallel"
+      shared_data_reader: true
+    }
+    index: 5
+    parent: 4
+    child: -1
+  }
+
+  /*
+   * except where noted, regularizer values below are the current lbann defaults
+   */
+  regularizer {
+    batch_normalization {
+      data_layout: "model_parallel"
+      decay: 0.9
+      gamma: 1.0
+      beta: 0.0
+    }
+  }
+  regularizer {
+    dropout {
+      data_layout: "model_parallel"
+      keep_prob: 0.5
+    }
+  }
+  regularizer {
+    l2_regularization {
+      lambda: 0 //no default in
+    }
+  }
+
+  /*
+   * I'm not listing all the callbacks here ... see src/proto/lbann.proto for
+   * additional info
+   */
+  callback {
+    save_images {
+      image_dir: "images"
+      extension: "png"
+    }
+  }
+  callback {
+    print {
+      interval: 1
+    }
+  }
+  callback {
+    timer {
+      dir: "none"
+    }
+  }
+  callback {
+    summary {
+      dir: "none"
+      interval: 1
+    }
+  }
+}
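
The header comment's warning about comments matches stock protobuf text format, which accepts '#'-style comments but not the C++-style '//' used above. A minimal sketch of loading a prototext file into the generated message; load_prototext is a hypothetical helper, and lbann.pb.h stands for the header generated from src/proto/lbann.proto:

    #include <fstream>
    #include <sstream>
    #include <string>
    #include <google/protobuf/text_format.h>
    #include "lbann.pb.h"  // generated from src/proto/lbann.proto

    // Read a prototext file and parse it into an LbannPB message.
    // Returns false on parse errors (e.g. unsupported '//' comments).
    bool load_prototext(const std::string& path, lbann_data::LbannPB& pb) {
      std::ifstream in(path);
      std::stringstream buffer;
      buffer << in.rdbuf();
      return google::protobuf::TextFormat::ParseFromString(buffer.str(), &pb);
    }
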
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+model {
+  name: "greedy_layerwise_autoencoder"
+  objective_function: "mean_squared_error"
+  mini_batch_size: 192
+  num_epochs: 10
+  num_parallel_readers: 0
+  procs_per_model: 0
+  use_cudnn: false
+  block_size: 256
+  evaluation_frequency: -1
+
+  regularizer {
+    dropout {
+      data_layout: "model_parallel"
+      keep_prob: -1.0
+    }
+  }
+
+  layer {
+    input_distributed_minibatch_parallel_io {
+      data_layout: "model_parallel"
+    }
+  }
+  layer {
+    fully_connected {
+      data_layout: "data_parallel"
+      num_neurons: 32
+      activation_type: "sigmoid"
+      weight_initialization: "glorot_uniform"
+    }
+  }
+  optimizer {
+    name: "rmsprop"
+    learn_rate: 0.01
+    momentum: 0.9
+    decay: 0.5
+    nesterov: true
+  }
+}

scripts/build_lbann_lc_env.sh

Lines changed: 4 additions & 4 deletions
@@ -28,6 +28,9 @@ fi
 # Detect the cuda toolkit version loaded or use default
 if [ "${HasGPU}" == "" ] ; then
     echo "This platform has no GPU"
+elif [ "${ARCH}" == "ppc64le" ]; then
+    CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda
+    CUDATOOLKIT_VERSION=`ls -l ${CUDA_TOOLKIT_ROOT_DIR} | awk '{print $NF}' | cut -d '-' -f 2`
 elif [ "$CUDA_PATH" == "" ] || [ `basename "$CUDA_PATH"` == "" ] ; then
     # use default
     CUDATOOLKIT_VERSION=8.0
@@ -37,9 +40,6 @@ elif [ "$CUDA_PATH" == "" ] || [ `basename "$CUDA_PATH"` == "" ] ; then
     elif [ -d /opt/cudatoolkit-$CUDATOOLKIT_VERSION ] ; then
         CUDA_TOOLKIT_ROOT_DIR=/opt/cudatoolkit-$CUDATOOLKIT_VERSION
     fi
-elif [ "${ARCH}" == "ppc64le" ]; then
-    CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda
-    CUDATOOLKIT_VERSION=`ls -l ${CUDA_TOOLKIT_ROOT_DIR} | awk '{print $NF}' | cut -d '-' -f 2`
 else
     CUDATOOLKIT_VERSION=`basename "$CUDA_PATH" | sed 's/cudatoolkit-//'`
     CUDA_TOOLKIT_ROOT_DIR=$CUDA_PATH
@@ -72,7 +72,7 @@ fi
 if [ "${ARCH}" == "x86_64" ]; then
     cuDNN_DIR=/usr/gapps/brain/installs/cudnn/v5
 elif [ "${ARCH}" == "ppc64le" ]; then
-    cuDNN_DIR=""
+    cuDNN_DIR="/usr/gapps/brain/cuda/targets/ppc64le-linux"
 fi
 ELEMENTAL_MATH_LIBS=
 PATCH_OPENBLAS=ON

src/layers/lbann_layer_fully_connected.cpp

Lines changed: 2 additions & 2 deletions
@@ -393,7 +393,7 @@ DataType lbann::FullyConnectedLayer::checkGradient(Layer& PrevLayer, const DataT

   if(bad_E1) {
     if(Acts_E1.Grid().Rank() == 0) {
-      printf("BAD ENTRY Acts_E1 %d x %d\n", row, col);
+      printf("BAD ENTRY Acts_E1 %lld x %lld\n", row, col);
     }
     cout.precision(20);
     Print(Acts_E1);
@@ -404,7 +404,7 @@ DataType lbann::FullyConnectedLayer::checkGradient(Layer& PrevLayer, const DataT
   }
   if(bad_E2) {
     if(Acts_E2.Grid().Rank() == 0) {
-      printf("BAD ENTRY Acts_E2 %d x %d\n", row, col);
+      printf("BAD ENTRY Acts_E2 %lld x %lld\n", row, col);
     }
     Print(Acts_E2);
   }
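
The %d to %lld change matters because row and col are Elemental matrix indices; in builds where the index type is 64-bit, printing one with %d is undefined behavior. A standalone sketch of the corrected pattern (the PRId64 variant is an alternative noted here, not what the commit uses):

    #include <cstdio>
    #include <cinttypes>

    int main() {
      long long row = 4096, col = 12;  // stand-ins for 64-bit indices
      // matches the commit's fix: the specifier agrees with the arguments
      std::printf("BAD ENTRY Acts_E1 %lld x %lld\n", row, col);
      // equivalent fixed-width form:
      std::printf("BAD ENTRY Acts_E1 %" PRId64 " x %" PRId64 "\n",
                  static_cast<int64_t>(row), static_cast<int64_t>(col));
      return 0;
    }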

src/proto/lbann.proto

Lines changed: 11 additions & 2 deletions
@@ -58,6 +58,8 @@ message Model {
   int64 procs_per_model = 51;
   int64 num_parallel_readers = 52;
   int64 num_gpus = 53;
+  int64 evaluation_frequency = 54;
+  repeated Regularizer regularizer = 55;

   //use cudnn_manager, if use_cudnn=true AND lbann was compiled with cudnn support
   bool use_cudnn = 8;
@@ -99,6 +101,12 @@ message Callback {
   CallbackDumpActivations dump_activations = 5;
   CallbackDumpGradients dump_gradients = 6;
   CallbackImComm imcomm = 7;
+  CallbackSaveImages save_images = 8;
+}
+
+message CallbackSaveImages {
+  string image_dir = 1;
+  string extension = 2;
 }

 message CallbackPrint {
@@ -180,8 +188,9 @@ message L2Regularization {
 //

 message Layer {
-  int64 index = 2; //corresponds to index wrt std::vector<Layer*>
-                   //not currently used
+  int64 index = 2;
+  int64 parent = 3;
+  int64 child = 4;

   // a Layer should contain exactly one of the following
   InputDistributedMiniBatchParallelIO input_distributed_minibatch_parallel_io = 8;
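
A short sketch of how the new fields surface through protobuf's generated C++ accessors; inspect_new_fields is a hypothetical helper, and the accessor names follow protobuf's standard lowering of the field names above:

    #include <cstdint>
    #include <string>
    #include "lbann.pb.h"  // generated from this .proto file

    void inspect_new_fields(const lbann_data::LbannPB& pb,
                            const lbann_data::Callback& cb,
                            const lbann_data::Layer& layer) {
      const lbann_data::Model& m = pb.model();
      int64_t eval_freq = m.evaluation_frequency();  // Model field 54
      for (const lbann_data::Regularizer& reg : m.regularizer()) {  // field 55
        (void)reg;  // dispatch on whichever regularizer sub-message is set
      }
      if (cb.has_save_images()) {                    // Callback field 8
        const std::string& dir = cb.save_images().image_dir();
        const std::string& ext = cb.save_images().extension();
        (void)dir; (void)ext;
      }
      // Layer graph fields; -1 marks "no parent" / "no child" in the
      // example prototext earlier in this commit.
      int64_t parent = layer.parent();
      int64_t child  = layer.child();
      (void)eval_freq; (void)parent; (void)child;
    }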
