Commit dd8b60d

CosmoFlow Updates (#2449)
* Cosmoflow model updates
* Update hdf5 data reader setup
1 parent 3127424 commit dd8b60d

File tree: 3 files changed (+32, -13 lines)

applications/physics/cosmology/cosmoflow/cosmoflow_model.py

Lines changed: 5 additions & 3 deletions
@@ -11,7 +11,8 @@ def construct_cosmoflow_model(parallel_strategy,
                               learning_rate,
                               min_distconv_width,
                               mlperf,
-                              transform_input):
+                              transform_input,
+                              dropout_keep_prob=0.5):
 
     # Construct layer graph
     universes = lbann.Input(data_field='samples')
@@ -23,7 +24,8 @@ def construct_cosmoflow_model(parallel_strategy,
         use_bn=use_batchnorm,
         bn_statistics_group_size=statistics_group_size,
         mlperf=mlperf,
-        transform_input=transform_input)(universes)
+        transform_input=transform_input,
+        dropout_keep_prob=dropout_keep_prob)(universes)
     mse = lbann.MeanSquaredError([preds, secrets])
     mae = lbann.MeanAbsoluteError([preds, secrets])
     obj = lbann.ObjectiveFunction([mse])
@@ -71,7 +73,7 @@ def construct_cosmoflow_model(parallel_strategy,
         # initial_warmup_learning_rate=0,
         # warmup_steps=100
         # ),
-        lbann.CallbackProgressBar(newline_interval=1)
+        lbann.CallbackProgressBar(newline_interval=1, print_mem_usage=True)
     ]
     return lbann.Model(
         epochs=num_epochs,
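
Note: the new dropout_keep_prob argument follows the usual keep-probability convention, where each activation survives with probability p and is zeroed otherwise, and implementations typically rescale by 1/p during training so the expected value is unchanged. The NumPy sketch below only illustrates that convention; it is not how LBANN's Dropout layer is implemented.

# Illustration of the keep-probability convention only; not LBANN code.
import numpy as np

def dropout(x, keep_prob, rng):
    if keep_prob == 1.0:          # keep everything: dropout disabled
        return x
    mask = rng.random(x.shape) < keep_prob
    return x * mask / keep_prob   # rescale so the expected value is preserved

rng = np.random.default_rng(0)
x = np.ones(100000)
y = dropout(x, keep_prob=0.5, rng=rng)
print((y == 0).mean())   # roughly 0.5 of activations are zeroed
print(y.mean())          # roughly 1.0 on average after rescaling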

applications/physics/cosmology/cosmoflow/cosmoflow_network_architectures.py

Lines changed: 9 additions & 2 deletions
@@ -28,7 +28,8 @@ def __init__(self,
                  use_bn=False,
                  bn_statistics_group_size=None,
                  mlperf=False,
-                 transform_input=False):
+                 transform_input=False,
+                 dropout_keep_prob=0.5):
         """Initialize CosmFlow.
 
         Args:
@@ -43,6 +44,8 @@ def __init__(self,
                 model.
             transform_input (bool): Whether or not to apply log1p
                 transformation to model inputs.
+            dropout_keep_prob (float): Probability of not zeroing out
+                activations in dropout layers. Setting to 1 disables dropout.
         """
 
         CosmoFlow.global_count += 1
@@ -53,6 +56,7 @@ def __init__(self,
         self.use_bn = use_bn
         self.mlperf = mlperf
         self.transform_input = transform_input
+        self.dropout_keep_prob = dropout_keep_prob
 
         if self.mlperf:
             base_channels = 32
@@ -144,8 +148,11 @@ def create_act(x, i):
                     self.name, i, self.instance))
 
         def create_dropout(x, i):
+            if self.dropout_keep_prob == 1:
+                return x
+
             return lbann.Dropout(
-                x, keep_prob=0.5,
+                x, keep_prob=self.dropout_keep_prob,
                 name='{0}_fc_drop{1}_instance{2}'.format(
                     self.name, i, self.instance))
 
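The early return in create_dropout is what makes the docstring's "Setting to 1 disables dropout" literally true: with a keep probability of 1, no Dropout node is added to the layer graph and the input is returned untouched. A runnable mock of that behavior, with DropoutStub standing in for lbann.Dropout, might look like this:

# Mock of the bypass logic above; DropoutStub is a hypothetical stand-in,
# not LBANN's Dropout layer.
class DropoutStub:
    def __init__(self, parent, keep_prob, name=None):
        self.parent = parent
        self.keep_prob = keep_prob
        self.name = name

def create_dropout(x, keep_prob, name):
    # keep_prob == 1 keeps every activation, so no layer is created at all.
    if keep_prob == 1:
        return x
    return DropoutStub(x, keep_prob=keep_prob, name=name)

layer_in = object()
assert create_dropout(layer_in, 1, 'fc_drop1') is layer_in          # bypassed
assert create_dropout(layer_in, 0.5, 'fc_drop1').keep_prob == 0.5   # real layer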

applications/physics/cosmology/cosmoflow/train_cosmoflow.py

Lines changed: 18 additions & 8 deletions
@@ -20,6 +20,10 @@ def create_python_dataset_reader(args):
     readers = []
     for role in ['train', 'val', 'test']:
         role_dir = getattr(args, f'{role}_dir')
+        if not role_dir:
+            continue
+        if role == 'val':
+            role = 'validate'
         dataset = CosmoFlowDataset(role_dir, args.input_width, args.num_secrets)
         reader = lbann.util.data.construct_python_dataset_reader(dataset, role=role)
         readers.append(reader)
@@ -35,11 +39,13 @@ def create_cosmoflow_data_reader(
         num_responses (int): The number of parameters to predict.
     """
 
-    reader_args = [
-        {"role": "train", "data_filename": train_path},
-        {"role": "validate", "data_filename": val_path},
-        # {"role": "test", "data_filename": test_path},
-    ]
+    reader_args = []
+    if train_path:
+        reader_args.append({"role": "train", "data_filename": train_path})
+    if val_path:
+        reader_args.append({"role": "validate", "data_filename": val_path})
+    if test_path:
+        reader_args.append({"role": "test", "data_filename": test_path})
 
     for reader_arg in reader_args:
         reader_arg["data_file_pattern"] = "{}/*.hdf5".format(
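
The two hunks above make every data-reader role optional: a reader is built only for roles whose directory or path was actually supplied, and the command-line role name 'val' is mapped to the 'validate' role the reader expects. A standalone sketch of that selection logic, with no LBANN dependency (select_reader_args is a hypothetical helper, not part of train_cosmoflow.py):

def select_reader_args(train_path=None, val_path=None, test_path=None):
    # Only roles with a configured location produce a reader entry.
    paths = {'train': train_path, 'validate': val_path, 'test': test_path}
    return [{'role': role, 'data_filename': path}
            for role, path in paths.items() if path]

print(select_reader_args(train_path='/data/cosmoflow/train'))
# [{'role': 'train', 'data_filename': '/data/cosmoflow/train'}]
print(select_reader_args(val_path='/data/cosmoflow/val',
                         test_path='/data/cosmoflow/test'))
# [{'role': 'validate', ...}, {'role': 'test', ...}]
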
@@ -142,7 +148,7 @@ def create_synthetic_data_reader(input_width: int, num_responses: int) -> Any:
     default_dir = '{}/{}'.format(default_lc_dataset, role)
     parser.add_argument(
         '--{}-dir'.format(role), action='store', type=str,
-        default=default_dir,
+        default=default_dir if role == 'train' else None,
         help='the directory of the {} dataset'.format(role))
 parser.add_argument(
     '--synthetic', action='store_true',
@@ -156,6 +162,9 @@ def create_synthetic_data_reader(input_width: int, num_responses: int) -> Any:
 parser.add_argument(
     '--transform-input', action='store_true',
     help='Apply log1p transformation to model inputs')
+parser.add_argument(
+    '--dropout-keep-prob', action='store', type=float, default=0.5,
+    help='Probability of keeping activations in dropout layers (default: 0.5). Set to 1 to disable dropout')
 
 # Parallelism arguments
 parser.add_argument(
@@ -227,7 +236,8 @@ def create_synthetic_data_reader(input_width: int, num_responses: int) -> Any:
     learning_rate=args.optimizer_learning_rate,
     min_distconv_width=args.min_distconv_width,
     mlperf=args.mlperf,
-    transform_input=args.transform_input)
+    transform_input=args.transform_input,
+    dropout_keep_prob=args.dropout_keep_prob)
 
 # Add profiling callbacks if needed.
 model.callbacks.extend(lbann.contrib.args.create_profile_callbacks(args))
@@ -274,7 +284,7 @@ def create_synthetic_data_reader(input_width: int, num_responses: int) -> Any:
     environment['DISTCONV_JIT_CACHEPATH'] = f'{application_path}/DaCe_kernels/.dacecache'
 
 if args.synthetic or args.no_datastore:
-    lbann_args = []
+    lbann_args = ['--num_io_threads=8']
 else:
     lbann_args = ['--use_data_store']
 lbann_args += lbann.contrib.args.get_profile_args(args)
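
On the CLI side, only --train-dir keeps a default directory; --val-dir and --test-dir now default to None, which is what makes the corresponding readers optional, and the new --dropout-keep-prob flag feeds straight into model construction. A reduced argparse sketch of just those options (the default train directory below is a placeholder, and the real parser in train_cosmoflow.py defines many more arguments):

# Reduced sketch of the relevant CLI options; default_lc_dataset is a
# placeholder path, not the real default.
import argparse

parser = argparse.ArgumentParser()
default_lc_dataset = '/path/to/cosmoflow/dataset'
for role in ['train', 'val', 'test']:
    default_dir = '{}/{}'.format(default_lc_dataset, role)
    parser.add_argument(
        '--{}-dir'.format(role), action='store', type=str,
        default=default_dir if role == 'train' else None,
        help='the directory of the {} dataset'.format(role))
parser.add_argument(
    '--dropout-keep-prob', action='store', type=float, default=0.5,
    help='Probability of keeping activations in dropout layers. '
         'Set to 1 to disable dropout')

args = parser.parse_args(['--dropout-keep-prob', '1'])
print(args.train_dir)               # only train keeps a default directory
print(args.val_dir, args.test_dir)  # None None -> those readers are skipped
print(args.dropout_keep_prob)       # 1.0 -> dropout layers are bypassed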
