# config.yaml
defaults:
  - _self_
  - model: blank_model
  - sched: WarmupHoldPolicy
  - dataset: c4_mlperf
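# Hydra composes the config groups listed above in order; each group can be swapped
# from the command line with standard Hydra overrides, e.g. model=<other_model_config>
# (the available options depend on this repo's config group directories)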
# name for this experiment; used to build the local run directory
exp_name: moe_trial
# random seed for batch sampling
seed: 0
# the batch size for each accelerator/device
# global_train_batch_size = per_device_train_batch_size * num_devices
per_device_train_batch_size: 1
global_train_batch_size: ${get_global_batch_size:${per_device_train_batch_size}}
# the batch size during evaluation and sampling, if enabled
per_device_eval_batch_size: ${per_device_train_batch_size}
global_eval_batch_size: ${get_global_batch_size:${per_device_eval_batch_size}}
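# worked example, assuming the get_global_batch_size resolver applies the formula
# above (multiply by the number of devices): per_device_train_batch_size=1 on 8
# devices yields global_train_batch_size=8, and global_eval_batch_size likewise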
max_grad_norm: 1.0
max_steps: 10
pad_token_id: -100
output_dir: /tmp
# stop training early once the target eval_loss is reached
target_eval_loss: 0
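# note: with target_eval_loss at 0 this effectively never triggers, assuming the
# condition is eval_loss <= target_eval_loss (cross-entropy loss stays positive)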
# whether to eval at the very beginning of training
do_first_eval: false
# the local run directory, built by the path_join OmegaConf resolver defined in utils.py
run_dir: ${path_join:${output_dir},${exp_name}}
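# e.g. with the defaults above (output_dir=/tmp, exp_name=moe_trial), and assuming
# path_join behaves like os.path.join, run_dir resolves to /tmp/moe_trial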
# the learning rate
lr: 2e-5
# number of steps to accumulate over for each batch
# (e.g. if global_train_batch_size=4 and gradient_accumulation_steps=2, then we will
# accumulate gradients over equivalent batch size of 8 i.e. 2 microbatches of size 4)
gradient_accumulation_steps: 1
# the maximum allowed length for an input
max_length: 512
# the max number of examples to evaluate on
n_eval_examples: null
# the optimizer to use; ADAMW_TORCH_XLA selects an AdamW implementation for torch_xla
optimizer: ADAMW_TORCH_XLA
weight_decay: 0.1
# evaluate and save the model every eval_frequency steps
eval_frequency: -1
# path to load checkpoint
checkpoint_manager_path: null
# shuffle the training dataset
shuffle: True
# use float32 for matmuls in torch_xla
full_precision: False
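# note: with full_precision False, matmuls are assumed to run at torch_xla's default
# (reduced) precision, e.g. bfloat16-based on TPU; set True to force float32 matmuls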
# path to save the torch_xla compilation cache
local_compile_cache_dir: ${run_dir}
# degree of tensor parallelism; FSDP parallelism is then num_devices / tensor_parallelism
# (see the worked example after this block)
tensor_parallelism: 1
context_parallelism: 1
pipeline_parallelism: 1
virtual_pipeline_parallelism: 1
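# worked example for the note above: with 16 devices and tensor_parallelism=2 (the
# other parallelism axes left at 1), FSDP shards across 16 / 2 = 8 device groups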
# local directory used as a model cache
cache_local_dir: null
# step at which to capture a torch_xla profile (-1 is assumed to disable profiling)
xla_profile_step: -1
# log training metrics every log_frequency steps
log_frequency: 1
hydra:
  run:
    dir: ${run_dir}
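# example launch, overriding keys with Hydra's key=value syntax (the entrypoint name
# below is hypothetical; substitute this repo's actual training script):
#   python run.py exp_name=moe_trial_bs2 per_device_train_batch_size=2 max_steps=100
# hydra.run.dir above points Hydra's own run outputs at ${run_dir} as well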