# torchtitan.yaml — default configuration for verl's Torchtitan engine
---
# Target class for this configuration (Hydra-style instantiation target)
_target_: verl.workers.config.TorchtitanEngineConfig
# Policy for wrapping the model with FSDP
wrap_policy:
  # Minimum number of parameters to trigger wrapping a layer with FSDP
  min_num_params: 0
# The policy for applying `reshard_after_forward` within an FSDP setup
# Options: "default", "always", "never"
reshard_after_forward: default
# Prefetch the next forward-pass all-gather before the current forward computation.
forward_prefetch: false
# Whether to expose original (unflattened) parameters through FSDP
use_orig_params: false
# Mixed precision configuration for FSDP (false disables mixed precision)
mixed_precision: false
# Whether to use torch compile
use_torch_compile: true
# Whether to compute entropy from logits in chunks (trades speed for peak memory)
entropy_from_logits_with_chunking: false
# Whether to recompute entropy during backward (activation checkpointing for the
# entropy term) instead of storing it
entropy_checkpointing: false
# Data parallel size (FSDP group size)
# NOTE(review): presumably data_parallel_size == replicate_size * shard_size —
# confirm against TorchtitanEngineConfig before changing these independently
data_parallel_size: 1
# Data parallel replicate size (replication dimension, as in HSDP)
data_parallel_replicate_size: 1
# Data parallel shard size (sharding dimension, as in HSDP)
data_parallel_shard_size: 1
# Tensor parallel size
tensor_parallel_size: 1
# Expert parallel size (for MoE models)
expert_parallel_size: 1
# Pipeline parallel size
pipeline_parallel_size: 1
# Context parallel size (sequence-dimension parallelism)
context_parallel_size: 1
# Attention type for torchtitan's model (e.g., "sdpa", "flex", "varlen")
attn_type: flex
# Engine strategy identifier consumed by verl's worker dispatch
strategy: torchtitan
# Random seed for reproducibility
seed: 42
# Whether to enable full determinism for distributed training, only for debugging
full_determinism: false
# Whether the engine runs forward passes only (no gradient/backward step)
forward_only: false
# Mixed precision training param dtype
dtype: bfloat16