together-VeOmni/configs/text/llama3.yaml at main · togethercomputer/together-VeOmni · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
model:
  model_path: meta-llama/Meta-Llama-3-8B

data:
  train_path: fineweb_100BT
  train_size: 1000000000000
  dataloader:
    type: native
    drop_last: true
  datasets_type: iterable
  data_type: plaintext
  max_seq_len: 8192
  text_keys: text

train:
  accelerator:
    ulysses_size: 1
    fsdp_config:
      fsdp_mode: fsdp1
      full_shard: true
      offload: false
    offload:
      enable_activation: false
  gradient_checkpointing:
    enable: true
  global_batch_size: 512
  micro_batch_size: 1
  bsz_warmup_ratio: 0.007
  optimizer:
    type: adamw
    lr: 3.0e-4
    lr_warmup_ratio: 0.007
    lr_decay_style: constant
    lr_decay_ratio: 1.0
    weight_decay: 0.01
    max_grad_norm: 1.0
  enable_mixed_precision: false
  init_device: meta
  enable_full_determinism: false
  empty_cache_steps: 500
  checkpoint:
    output_dir: llama3-8b
    manager: dcp
    save_steps: 100
    save_hf_weights: true
  wandb:
    project: Meta-Llama-3-8B
    name: Meta-Llama-3-8B-CT