training/retired_benchmarks/mixtral8x22b/config/experiment/gbs256_tpu.yaml at 76ce0e412e58bf7aa29a6b64bd0dd7f27efaef60 · ShriyaRishab/training · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# @package _global_
# NOTE(review): the line above looks like a Hydra package directive. Keep it
# as the first line of the file, ahead of any other comment — confirm with the
# config loader before moving it.
#
# Top-level settings for a single experiment run. The consuming code is not
# visible here, so the notes below are read off key names and values and
# should be verified against the trainer.
#
# Run identity. The exp_name string is repeated verbatim inside run_dir and
# local_compile_cache_dir below. NOTE(review): presumably all three should be
# changed together.
exp_name: mixtral8x22-dropped-241125-1659
config_path: null
seed: 4321
# Batch sizing. NOTE(review): presumably global = per-device x device count x
# gradient_accumulation_steps (set to 1 below) — confirm against the trainer.
per_device_train_batch_size: 1
global_train_batch_size: 256
per_device_eval_batch_size: 1
global_eval_batch_size: 256
max_grad_norm: 1.0
# The same value (250) is also written under sched.max_steps.
max_steps: 250
# NOTE(review): -100 presumably marks label positions that the loss should
# ignore — confirm.
pad_token_id: -100
output_dir: /app/output
do_first_eval: false
# Hard-coded as output_dir + "/" + exp_name.
run_dir: /app/output/mixtral8x22-dropped-241125-1659
# Keep the decimal point and the signed exponent: some YAML 1.1 loaders only
# resolve this spelling as a float.
lr: 2.0e-05
gradient_accumulation_steps: 1
# NOTE(review): presumably the maximum sequence length in tokens — confirm.
max_length: 32768
n_eval_examples: null
optimizer: ADAMW_TORCH_XLA
weight_decay: 0.1
# NOTE(review): the unit is not stated here; presumably training steps —
# confirm.
eval_frequency: 6
# NOTE(review): environment-specific GCS location; anyone reproducing this run
# elsewhere will presumably need to point it at their own bucket.
checkpoint_manager_path: gs://lizhiyu-multipods-eu-west/moe/checkpoints-20240803/mixtral822/
dry_run: false
shuffle: true
full_precision: false
# Same path as run_dir above.
local_compile_cache_dir: /app/output/mixtral8x22-dropped-241125-1659
tensor_parallelism: 1
cache_local_dir: null
# Model selection and wrapping options. The consuming code is not visible
# here; notes are read off key names and values only.
model:
  # Relative file name. NOTE(review): the directory it is resolved against is
  # not visible here — confirm.
  config_path: mixtral822.json
  # NOTE(review): looks like a Hugging Face Hub model id — confirm.
  name_or_path: mistralai/Mixtral-8x22B-v0.1
  dtype: bfloat16
  flash_attention: true
  # NOTE(review): presumably the mixture-of-experts capacity factor (the
  # exp_name contains "dropped") — confirm against the model code.
  capacity_factor: 1.25
  fsdp_config:
    # One-element list of layer class names. The sequence item is written
    # flush with its parent key, which is valid YAML.
    fsdp_transformer_layer_cls_to_wrap:
    - MixtralDecoderLayer
    min_num_params: 0
    # NOTE(review): presumably enables gradient checkpointing in the XLA FSDP
    # wrapper — confirm.
    xla_fsdp_grad_ckpt: true
# Learning-rate schedule settings. The scheduler implementation is not visible
# here.
sched:
  name: WarmupHoldPolicy
  # 0.25 x max_steps (250) = 62.5. NOTE(review): how the scheduler rounds this
  # to a whole number of warmup steps is not visible here — confirm.
  warmup_ratio: 0.25
  # 10^13, far larger than max_steps (250). NOTE(review): presumably chosen so
  # the hold phase never ends within the run — confirm.
  hold_steps: 10000000000000
  # Repeats the top-level max_steps value (250). NOTE(review): presumably the
  # two should be kept in sync.
  max_steps: 250
# Training and evaluation data sources. The data-loading code is not visible
# here; notes are read off key names and values only.
dataset:
  dataset_name: c4_mlperf
  # Both paths are GCS (gs://) URIs.
  train_dataset_path: gs://mlperf-llm-public2/c4/en_json/3.0.1
  eval_dataset_path: gs://mlperf-llm-public2/c4/en_val_subset_json
  # NOTE(review): the keys below resemble Hugging Face `datasets` options
  # (streaming, num_proc, load_from_cache_file, shuffle buffer) — confirm how
  # the loader actually uses them.
  streaming: true
  num_proc: 1
  load_from_cache_file: true
  # Same number as global_train_batch_size (256). NOTE(review): whether the
  # two are meant to be linked is not visible here.
  shuffle_buffer_size: 256