# Source: together-VeOmni/configs/text/qwen2_5.yaml (togethercomputer/together-VeOmni, branch main)

# Model section: which checkpoint to train and which operator implementations to use.
model:
  # Identifier in "org/name" form.
  # NOTE(review): presumably a Hugging Face Hub repo id (a local path may also be accepted) - confirm against the loader.
  model_path: Qwen/Qwen2.5-7B-Instruct
  ops_implementation:
    # Selects the attention implementation by name; the value refers to FlashAttention-2.
    # NOTE(review): presumably requires the flash-attn package on the training hosts - verify.
    attn_implementation: flash_attention_2

# Data section: where the training corpus comes from and how samples are read.
data:
  # Bare dataset name rather than an explicit path.
  # NOTE(review): confirm how the loader resolves "fineweb" (local directory vs. hub dataset).
  train_path: fineweb
  # 1e12 (one trillion). NOTE(review): the unit is not stated in this file - presumably tokens; confirm.
  train_size: 1000000000000
  dataloader:
    type: native
    # NOTE(review): presumably discards the final incomplete batch - confirm against the dataloader.
    drop_last: true
  # Iterable (streamed) dataset rather than a map-style one, going by the value name - TODO confirm.
  datasets_type: iterable
  data_type: plaintext
  # NOTE(review): presumably the maximum sequence length in tokens per sample - confirm.
  max_seq_len: 8192
  # Name of the record field that holds the raw text (here: "text").
  # NOTE(review): the plural key name suggests a list may also be accepted - verify.
  text_keys: text

# Train section: parallelism, batching, optimizer, runtime flags, checkpointing and logging.
train:
  accelerator:
    # NOTE(review): presumably the Ulysses sequence-parallel group size; 1 would mean it is disabled - confirm.
    ulysses_size: 1
    fsdp_config:
      fsdp_mode: fsdp1
      full_shard: true
      # This `offload` is a boolean inside fsdp_config; it is a different key from
      # the `accelerator.offload` mapping below.
      # NOTE(review): presumably CPU offload of sharded parameters - confirm.
      offload: false
    # Sibling mapping of fsdp_config (not a duplicate key): holds the activation-offload switch.
    offload:
      enable_activation: false
  gradient_checkpointing:
    enable: true
  # NOTE(review): global_batch_size / micro_batch_size = 32; presumably covered by
  # data-parallel ranks x gradient-accumulation steps - confirm how the trainer derives it.
  global_batch_size: 32
  micro_batch_size: 1
  # Same ratio (0.007) as optimizer.lr_warmup_ratio below.
  # NOTE(review): presumably a fraction of total training steps - confirm.
  bsz_warmup_ratio: 0.007
  dyn_bsz_buffer_size: 200
  optimizer:
    type: adamw
    # Written with a decimal point and a signed exponent so YAML 1.1 loaders also parse it as a float.
    lr: 3.0e-4
    lr_warmup_ratio: 0.007
    # Constant schedule, with lr_decay_ratio set to 1.0 on the next key.
    # NOTE(review): presumably the lr is held flat after warmup - confirm in the scheduler.
    lr_decay_style: constant
    lr_decay_ratio: 1.0
    weight_decay: 0.01
    # NOTE(review): presumably the gradient-clipping threshold (global norm) - confirm.
    max_grad_norm: 1.0
  enable_mixed_precision: true
  # NOTE(review): presumably builds the model on the "meta" device and materializes weights later - confirm.
  init_device: meta
  enable_full_determinism: false
  # NOTE(review): presumably the interval, in steps, between device-cache clears - confirm.
  empty_cache_steps: 500
  checkpoint:
    # Relative path. NOTE(review): presumably resolved against the launch working directory - confirm.
    output_dir: Qwen2.5-7B-Instruct_CT
    # NOTE(review): "dcp" presumably refers to PyTorch Distributed Checkpoint - confirm.
    manager: dcp
    save_steps: 100
    # NOTE(review): presumably also exports Hugging Face-format weights when saving - confirm.
    save_hf_weights: true
  # Weights & Biases run metadata: project and run name.
  wandb:
    project: Qwen2.5-7B-Instruct
    name: Qwen2.5-7B-Instruct-CT