forked from mlcommons/training
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgbs256_tpu.yaml
More file actions
52 lines (52 loc) · 1.36 KB
/
gbs256_tpu.yaml
File metadata and controls
52 lines (52 loc) · 1.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# @package _global_
# NOTE(review): the line above looks like a Hydra package directive; keep it
# ahead of every other comment and key so header parsing still finds it.
#
# Training run configuration (global batch size 256, per the settings below).
#
# NOTE(review): this file previously had every key at column 0. In that form
# `config_path` and `max_steps` each appeared twice at the top level (duplicate
# keys, last one silently wins in most parsers) and `model`, `fsdp_config`,
# `sched` and `dataset` were empty. The nesting below is reconstructed from
# those collisions and from the key names; all values are unchanged. Confirm
# the placement of `fsdp_config` (under `model` vs. top level) against the
# code that reads this config.

# --- Experiment identity and output locations ---
exp_name: mixtral8x22-dropped-241125-1659
config_path: null
seed: 4321
output_dir: /app/output
run_dir: /app/output/mixtral8x22-dropped-241125-1659
local_compile_cache_dir: /app/output/mixtral8x22-dropped-241125-1659
cache_local_dir: null
checkpoint_manager_path: gs://lizhiyu-multipods-eu-west/moe/checkpoints-20240803/mixtral822/

# --- Batch sizing ---
per_device_train_batch_size: 1
global_train_batch_size: 256
per_device_eval_batch_size: 1
global_eval_batch_size: 256
gradient_accumulation_steps: 1
tensor_parallelism: 1

# --- Optimisation ---
optimizer: ADAMW_TORCH_XLA
lr: 2.0e-05
weight_decay: 0.1
max_grad_norm: 1.0
# Same value as sched.max_steps below; keep the two in sync.
max_steps: 250
full_precision: false

# --- Data handling and evaluation ---
max_length: 32768
pad_token_id: -100
shuffle: true
do_first_eval: false
# presumably counted in training steps -- TODO confirm the unit
eval_frequency: 6
n_eval_examples: null
dry_run: false

# --- Model ---
model:
  # Distinct from the top-level `config_path` (which is null).
  config_path: mixtral822.json
  name_or_path: mistralai/Mixtral-8x22B-v0.1
  dtype: bfloat16
  flash_attention: true
  capacity_factor: 1.25
  fsdp_config:
    fsdp_transformer_layer_cls_to_wrap:
      - MixtralDecoderLayer
    min_num_params: 0
    xla_fsdp_grad_ckpt: true

# --- Learning-rate schedule ---
sched:
  name: WarmupHoldPolicy
  warmup_ratio: 0.25
  # Many orders of magnitude larger than max_steps (250).
  hold_steps: 10000000000000
  max_steps: 250

# --- Dataset ---
dataset:
  dataset_name: c4_mlperf
  train_dataset_path: gs://mlperf-llm-public2/c4/en_json/3.0.1
  eval_dataset_path: gs://mlperf-llm-public2/c4/en_val_subset_json
  streaming: true
  num_proc: 1
  load_from_cache_file: true
  shuffle_buffer_size: 256