Skip to content

Commit 51d0c7f

Browse files
committed
Merge branch 'release-v0.97'
============================== Release Notes: v0.97 ==============================
Support for new layers:
 - Mean absolute error and L1 norm
 - GPU implementation for activation layers
 - Log sigmoid and softsign
 - Channel-wise mean (temporary kludge)

Model portability & usability:
 - Hints for layer output dimensions
 - Confusion matrix callback
 - Metric checking callback

Internal features:
 - Removed target-layer-based features from model zoo
 - Layer unit tests check for expected output values

Retired features:
 - Smooth ReLU, bent identity, and swish layers
 - Target-layer-based metrics
 - Target-layer-based models (sequential, greedy layer-wise autoencoder, Siamese)
2 parents cd7350e + f019d34 commit 51d0c7f

File tree

288 files changed

+7509
-7772
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

288 files changed

+7509
-7772
lines changed

CMakeLists.txt

Lines changed: 219 additions & 82 deletions
Large diffs are not rendered by default.

ReleaseNotes.txt

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
============================== (Pending) Release Notes: v0.97 ==============================
1+
============================== (Pending) Release Notes: v0.98 ==============================
22
Support for new training algorithms:
33

44
Support for new network structures:
@@ -15,6 +15,29 @@ I/O & data readers:
1515

1616
Build system:
1717

18+
Retired features:
19+
20+
============================== Release Notes: v0.97 ==============================
21+
Support for new layers:
22+
- Mean absolute error and L1 norm
23+
- GPU implementation for activation layers
24+
- Log sigmoid and softsign
25+
- Channel-wise mean (temporary kludge)
26+
27+
Model portability & usability:
28+
- Hints for layer output dimensions
29+
- Confusion matrix callback
30+
- Metric checking callback
31+
32+
Internal features:
33+
- Removed target-layer-based features from model zoo
34+
- Layer unit tests check for expected output values
35+
36+
Retired features:
37+
- Smooth ReLU, bent identity, and swish layers
38+
- Target-layer-based metrics
39+
- Target-layer-based models (sequential, greedy layer-wise autoencoder, Siamese)
40+
1841
============================== Release Notes: v0.96 ==============================
1942
Support for new layers:
2043
- Log softmax

bamboo/common_python/tools.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def get_command(cluster,
7676
command_allocate = ''
7777
# Allocate a node if we don't have one already
7878
# Running the tests manually allows for already having a node allocated
79-
if os.getenv('SLURM_NNODES') == None:
79+
if os.getenv('SLURM_JOB_NUM_NODES') == None:
8080
command_allocate = 'salloc'
8181
option_num_nodes = ''
8282
option_partition = ''

bamboo/integration_tests/common_code.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def get_command(cluster, dir_name, model_folder, model_name, executable,
3737
cluster=cluster, executable=executable, num_nodes=1,
3838
partition=partition, time_limit=time_limit, num_processes=num_processes,
3939
dir_name=dir_name,
40-
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
40+
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
4141
data_reader_name='mnist', model_folder=model_folder,
4242
model_name=model_name, num_epochs=5, optimizer_name='adagrad',
4343
output_file_name=output_file_name, error_file_name=error_file_name)
@@ -103,7 +103,7 @@ def extract_data(output_file_name, data_fields, should_log):
103103
for line in output_file:
104104
if should_log:
105105
print('%s: %s' % (output_file_name, line))
106-
106+
107107
# Check if line is reporting model results
108108
is_model = re.search('^Model ([0-9]+)', line)
109109
if is_model:

bamboo/integration_tests/test_integration_debug.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@ def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly,
1717
command = tools.get_command(
1818
cluster=cluster, executable=executables[compiler_name], num_nodes=1,
1919
partition='pbatch', time_limit=100, dir_name=dir_name,
20-
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
20+
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
2121
data_reader_name='mnist', model_folder='models/' + model_name,
2222
model_name=model_name, num_epochs=5, optimizer_name='adagrad',
2323
output_file_name=output_file_name, error_file_name=error_file_name)
2424
output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name)
2525
assert output_value == 0
2626

2727
def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, debug, should_log=False):
28-
# If weekly or debug are true, then run the test.
28+
# If weekly or debug are true, then run the test.
2929
if (not weekly) and (not debug):
3030
pytest.skip('Not doing weekly or debug testing')
3131
if cluster == 'ray':
@@ -38,8 +38,8 @@ def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly,
3838
command = tools.get_command(
3939
cluster=cluster, executable=executables[compiler_name], num_nodes=1,
4040
partition='pbatch', time_limit=100, dir_name=dir_name,
41-
data_filename_train_default='/p/lscratchf/brainusr/datasets/cifar10-bin/data_all.bin',
42-
data_filename_test_default='/p/lscratchf/brainusr/datasets/cifar10-bin/test_batch.bin',
41+
data_filename_train_default='/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin',
42+
data_filename_test_default='/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin',
4343
data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name,
4444
model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad',
4545
output_file_name=output_file_name, error_file_name=error_file_name)
@@ -51,13 +51,13 @@ def test_integration_mnist_clang4_debug(cluster, dirname, exes, weekly, debug):
5151

5252
def test_integration_cifar_clang4_debug(cluster, dirname, exes, weekly, debug):
5353
skeleton_cifar_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug)
54-
54+
5555
def test_integration_mnist_gcc4_debug(cluster, dirname, exes, weekly, debug):
5656
skeleton_mnist_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug)
57-
57+
5858
def test_integration_cifar_gcc4_debug(cluster, dirname, exes, weekly, debug):
5959
skeleton_cifar_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug)
60-
60+
6161
def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, debug):
6262
skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug)
6363

bamboo/integration_tests/test_integration_io_buffers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def skeleton_io_buffers(cluster, dir_name, executables, compiler_name, weekly):
3535
command = tools.get_command(
3636
cluster=cluster, executable=executables[compiler_name], num_nodes=2,
3737
num_processes=num_ranks, dir_name=dir_name,
38-
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
38+
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
3939
data_reader_name='mnist', mini_batch_size=mini_batch_size,
4040
model_folder='tests', model_name=model_name, num_epochs=5,
4141
optimizer_name='adagrad',

bamboo/unit_tests/prototext/data_reader_mnist.prototext

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ data_reader {
33
name: "mnist"
44
role: "train"
55
shuffle: true
6-
data_filedir: "/p/lscratchf/brainusr/datasets/MNIST"
6+
data_filedir: "/p/lscratchh/brainusr/datasets/MNIST"
77
data_filename: "train-images-idx3-ubyte"
88
label_filename: "train-labels-idx1-ubyte"
99
validation_percent: 0.1
@@ -34,7 +34,7 @@ data_reader {
3434
name: "mnist"
3535
role: "test"
3636
shuffle: true
37-
data_filedir: "/p/lscratchf/brainusr/datasets/MNIST"
37+
data_filedir: "/p/lscratchh/brainusr/datasets/MNIST"
3838
data_filename: "t10k-images-idx3-ubyte"
3939
label_filename: "t10k-labels-idx1-ubyte"
4040
validation_percent: 1.0

bamboo/unit_tests/test_unit_check_proto_models.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@ def skeleton_models(cluster, dir_name, executables, compiler_name):
1313
defective_models = []
1414
working_models = []
1515
for subdir, dirs, files in os.walk(dir_name + '/model_zoo/models/'):
16-
if 'greedy' in subdir:
17-
print('Skipping greedy_layerwise_autoencoder_mnist, kills bamboo agent')
18-
continue
1916
for file_name in files:
2017
if file_name.endswith('.prototext') and "model" in file_name:
2118
model_path = subdir + '/' + file_name
@@ -30,14 +27,14 @@ def skeleton_models(cluster, dir_name, executables, compiler_name):
3027
print('Skipping %s because motifs are deprecated' % model_path)
3128
continue
3229
elif 'mnist' in file_name:
33-
data_filedir_default = '/p/lscratchf/brainusr/datasets/MNIST'
30+
data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
3431
data_reader_name = 'mnist'
3532
elif 'adversarial' in file_name:
36-
data_filedir_default = '/p/lscratchf/brainusr/datasets/MNIST'
33+
data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
3734
data_reader_path = '%s/model_zoo/models/gan/mnist/adversarial_data.prototext' % (dir_name)
3835
data_reader_name = None
3936
elif 'discriminator' in file_name:
40-
data_filedir_default = '/p/lscratchf/brainusr/datasets/MNIST'
37+
data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
4138
data_reader_path = '%s/model_zoo/models/gan/mnist/discriminator_data.prototext' % (dir_name)
4239
data_reader_name = None
4340
elif 'triplet' in file_name:
@@ -66,8 +63,8 @@ def skeleton_models(cluster, dir_name, executables, compiler_name):
6663
if 'resnet50' in file_name:
6764
node_count = 8
6865
elif 'cifar' in file_name:
69-
data_filename_train_default = '/p/lscratchf/brainusr/datasets/cifar10-bin/data_all.bin'
70-
data_filename_test_default = '/p/lscratchf/brainusr/datasets/cifar10-bin/test_batch.bin'
66+
data_filename_train_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin'
67+
data_filename_test_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin'
7168
data_reader_name = 'cifar10'
7269
elif 'char' in file_name:
7370
data_filedir_default = '/p/lscratchh/brainusr/datasets/tinyshakespeare/'

bamboo/unit_tests/test_unit_checkpoint.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_na
1313
command = tools.get_command(
1414
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
1515
dir_name=dir_name,
16-
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
16+
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
1717
data_reader_name='mnist', model_folder='tests',
1818
model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
1919
output_file_name=output_file_name, error_file_name=error_file_name)
@@ -28,7 +28,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_na
2828
command = tools.get_command(
2929
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
3030
dir_name=dir_name,
31-
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
31+
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
3232
data_reader_name='mnist', model_folder='tests',
3333
model_name='lenet_mnist_ckpt', num_epochs=1, optimizer_name='sgd',
3434
output_file_name=output_file_name, error_file_name=error_file_name)
@@ -42,7 +42,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_na
4242
command = tools.get_command(
4343
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
4444
dir_name=dir_name,
45-
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
45+
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
4646
data_reader_name='mnist', model_folder='tests',
4747
model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
4848
output_file_name=output_file_name, error_file_name=error_file_name)
@@ -64,7 +64,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compil
6464
command = tools.get_command(
6565
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
6666
dir_name=dir_name,
67-
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
67+
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
6868
data_reader_name='mnist', model_folder='tests',
6969
model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd',
7070
output_file_name=output_file_name, error_file_name=error_file_name)
@@ -79,7 +79,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compil
7979
command = tools.get_command(
8080
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
8181
dir_name=dir_name,
82-
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
82+
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
8383
data_reader_name='mnist', model_folder='tests',
8484
model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd',
8585
output_file_name=output_file_name, error_file_name=error_file_name)
@@ -93,7 +93,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compil
9393
command = tools.get_command(
9494
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
9595
dir_name=dir_name,
96-
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
96+
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
9797
data_reader_name='mnist', model_folder='tests',
9898
model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd',
9999
output_file_name=output_file_name, error_file_name=error_file_name)

bamboo/unit_tests/test_unit_layer_elu.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import sys
2+
sys.path.insert(0, '../common_python')
3+
import tools
4+
import pytest
5+
import os
6+
7+
def skeleton_layer_elu(cluster, executables, dir_name, compiler_name):
8+
if compiler_name not in executables:
9+
pytest.skip('default_exes[%s] does not exist' % compiler_name)
10+
output_file_name = '%s/bamboo/unit_tests/output/layer_elu_%s_output.txt' % (dir_name, compiler_name)
11+
error_file_name = '%s/bamboo/unit_tests/error/layer_elu_%s_error.txt' % (dir_name, compiler_name)
12+
command = tools.get_command(
13+
cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name,
14+
data_filedir_default='', data_reader_name='synthetic',
15+
model_folder='tests/layer_tests', model_name='elu', optimizer_name='sgd',
16+
output_file_name=output_file_name, error_file_name=error_file_name)
17+
return_code = os.system(command)
18+
assert return_code == 0
19+
20+
def test_unit_layer_elu_clang4(cluster, exes, dirname):
21+
skeleton_layer_elu(cluster, exes, dirname, 'clang4')
22+
23+
def test_unit_layer_elu_gcc4_check(cluster, exes, dirname):
24+
if cluster in ['surface']:
25+
pytest.skip('FIXME')
26+
# Surface Errors:
27+
# assert 34304 == 0
28+
skeleton_layer_elu(cluster, exes, dirname, 'gcc4')
29+
30+
def test_unit_layer_elu_gcc7(cluster, exes, dirname):
31+
skeleton_layer_elu(cluster, exes, dirname, 'gcc7')
32+
33+
def test_unit_layer_elu_intel18(cluster, exes, dirname):
34+
skeleton_layer_elu(cluster, exes, dirname, 'intel18')
35+
36+
# Run with python -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe=<executable>
37+
def test_unit_layer_elu_exe(cluster, dirname, exe):
38+
if exe == None:
39+
pytest.skip('Non-local testing')
40+
exes = {'exe' : exe}
41+
skeleton_layer_elu(cluster, exes, dirname, 'exe')

0 commit comments

Comments
 (0)