# config.yaml
defaults:
  - _self_
  - model: blank_model
  - sched: WarmupHoldPolicy
  - dataset: c4_mlperf
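# Hydra composes the config groups listed above in order; each group can be swapped
# from the command line with standard Hydra overrides, e.g. model=<other_model_config>
# (the available options depend on this repo's config group directories)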
# name for this experiment; used to build the local run directory
exp_name: moe_trial
# random seed for batch sampling
seed: 0
# the batch size for each accelerator/device
# global_train_batch_size = per_device_train_batch_size * num_devices
per_device_train_batch_size: 1
global_train_batch_size: ${get_global_batch_size:${per_device_train_batch_size}}
# the batch size during evaluation and sampling, if enabled
per_device_eval_batch_size: ${per_device_train_batch_size}
global_eval_batch_size: ${get_global_batch_size:${per_device_eval_batch_size}}
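# worked example, assuming the get_global_batch_size resolver applies the formula
# above (multiply by the number of devices): per_device_train_batch_size=1 on 8
# devices yields global_train_batch_size=8, and global_eval_batch_size likewise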
max_grad_norm: 1.0
max_steps: 10
pad_token_id: -100
output_dir: /tmp
# stop training early once the target eval_loss is reached
target_eval_loss: 0
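# note: with target_eval_loss at 0 this effectively never triggers, assuming the
# condition is eval_loss <= target_eval_loss (cross-entropy loss stays positive)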
# whether to eval at the very beginning of training
do_first_eval: false
# the local run directory, built by the path_join OmegaConf resolver defined in utils.py
run_dir: ${path_join:${output_dir},${exp_name}}
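# e.g. with the defaults above (output_dir=/tmp, exp_name=moe_trial), and assuming
# path_join behaves like os.path.join, run_dir resolves to /tmp/moe_trial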
# the learning rate
lr: 2e-5
# number of steps to accumulate over for each batch
# (e.g. if global_train_batch_size=4 and gradient_accumulation_steps=2, then we will
# accumulate gradients over equivalent batch size of 8 i.e. 2 microbatches of size 4)
gradient_accumulation_steps: 1
# the maximum allowed length for an input
max_length: 512
# the max number of examples to evaluate on
n_eval_examples: null
# the optimizer to use; ADAMW_TORCH_XLA selects an AdamW implementation for torch_xla
optimizer: ADAMW_TORCH_XLA
weight_decay: 0.1
# evaluate and save the model every eval_frequency steps
eval_frequency: -1
# path to load checkpoint
checkpoint_manager_path: null
# shuffle the training dataset
shuffle: True
# use float32 for matmuls in torch_xla
full_precision: False
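# note: with full_precision False, matmuls are assumed to run at torch_xla's default
# (reduced) precision, e.g. bfloat16-based on TPU; set True to force float32 matmuls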
# path to save the torch_xla compilation cache
local_compile_cache_dir: ${run_dir}
# degree of tensor parallelism; FSDP parallelism is then num_devices / tensor_parallelism
# (see the worked example after this block)
tensor_parallelism: 1
context_parallelism: 1
pipeline_parallelism: 1
virtual_pipeline_parallelism: 1
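# worked example for the note above: with 16 devices and tensor_parallelism=2 (the
# other parallelism axes left at 1), FSDP shards across 16 / 2 = 8 device groups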
# local directory used as a model cache
cache_local_dir: null
# step at which to capture a torch_xla profile (-1 is assumed to disable profiling)
xla_profile_step: -1
# log training metrics every log_frequency steps
log_frequency: 1
hydra:
  run:
    dir: ${run_dir}
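# example launch, overriding keys with Hydra's key=value syntax (the entrypoint name
# below is hypothetical; substitute this repo's actual training script):
#   python run.py exp_name=moe_trial_bs2 per_device_train_batch_size=2 max_steps=100
# hydra.run.dir above points Hydra's own run outputs at ${run_dir} as well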