
Commit fe44824

feat(torchtitan): add DeepSeek-V3 model configs for MI300X and MI355X
- add DeepSeek-V3 16B and 671B pretrain configs under examples/torchtitan/configs/MI300X
- add same model configs for MI355X to support newer platform
- add corresponding model YAMLs under primus/configs/models/torchtitan
- align format and parameters with TorchTitan standard templates
- verified YAML integrity and Primus CLI compatibility
1 parent 90eeb3c commit fe44824

6 files changed: +349 lines, -0 lines

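The commit message states that the added YAML was verified for integrity and Primus CLI compatibility. Below is a minimal, illustrative sketch of such a check, not the project's actual test; the file paths are assumptions based on the directories named in the commit message, since the diff view here does not show the file names.

# Sketch only (an assumption, not Primus CI): parse each added config with PyYAML
# and confirm it yields a non-empty mapping.
from pathlib import Path

import yaml  # requires PyYAML

assumed_configs = [
    "examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml",  # assumed name
    "primus/configs/models/torchtitan/deepseek_v3_16b.yaml",             # assumed name
]

for path in assumed_configs:
    cfg = yaml.safe_load(Path(path).read_text())
    assert isinstance(cfg, dict) and cfg, f"{path}: expected a non-empty mapping"
    print(f"{path}: OK, top-level keys = {sorted(cfg)}")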
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:deepseek_v3_16b-pretrain}
workspace: ./output

modules:
  pre_trainer:
    framework: torchtitan
    config: pre_trainer.yaml

    # model to run
    model: deepseek_v3_16b.yaml
    overrides:
      profiling:
        enable_profiling: false
        save_traces_folder: "profile_trace"
        profile_freq: 10
        enable_memory_snapshot: false
        save_memory_snapshot_folder: "memory_snapshot"

      metrics:
        log_freq: 10
        disable_color_printing: false
        enable_tensorboard: false
        save_tb_folder: "tb"
        enable_wandb: false

      optimizer:
        name: "AdamW"
        lr: 2.2e-4
        eps: 1.0e-8

      lr_scheduler:
        warmup_steps: 200  # lr scheduler warm up, normally 20% of the train steps
        decay_ratio: 0.8  # lr scheduler decay ratio, 80% of the train steps
        decay_type: "cosine"
        min_lr_factor: 0.1

      training:
        local_batch_size: 4
        seq_len: 4096
        max_norm: 1.0  # grad norm clipping
        steps: 1000
        dataset: "c4"  # supported datasets: c4_test (2K), c4 (177M)

      parallelism:
        data_parallel_replicate_degree: 1
        data_parallel_shard_degree: -1
        fsdp_reshard_after_forward: "default"  # default / never / always
        tensor_parallel_degree: 1
        enable_async_tensor_parallel: false
        pipeline_parallel_degree: 1
        pipeline_parallel_schedule: "Interleaved1F1B"
        expert_parallel_degree: 8
        expert_tensor_parallel_degree: 1

      checkpoint:
        enable: false
        folder: "checkpoint"
        interval: 10
        last_save_model_only: true
        export_dtype: "float32"
        async_mode: "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

      activation_checkpoint:
        mode: "none"  # ["none", "selective", "full"]
        selective_ac_option: "op"  # 'int' = ac every positive int layer or 'op', ac based on ops policy

      compile:
        enable: true
        components: ["loss"]  # ["model", "loss"]

      # quantize:
      #   linear:
      #     float8:
      #       enable_fsdp_float8_all_gather: false
      #       precompute_float8_dynamic_scale_for_fsdp: false
      #       filter_fqns: ["output", "router.gate"]
      #   grouped_mm:
      #     float8:
      #       fqns: ["experts"]
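The parallelism degrees in the 16B config above must multiply out to the number of ranks. The sketch below is an assumption about how the layout composes, not Primus or TorchTitan code: it reads data_parallel_shard_degree: -1 as "use all remaining ranks" (the usual TorchTitan convention) and leaves expert parallelism out, since EP ranks are typically carved out of the FSDP dimension rather than multiplied on top.

# Rough sketch (assumption) of how the configured degrees resolve on one node.
def resolve_layout(world_size: int,
                   dp_replicate: int = 1,
                   dp_shard: int = -1,
                   tp: int = 1,
                   pp: int = 1) -> dict:
    non_dp = dp_replicate * tp * pp
    if dp_shard == -1:
        dp_shard = world_size // non_dp  # fill the remaining ranks with FSDP sharding
    assert dp_replicate * dp_shard * tp * pp == world_size, "degrees must multiply to world size"
    return {"dp_replicate": dp_replicate, "dp_shard": dp_shard, "tp": tp, "pp": pp}

# Example: a single 8-GPU MI300X node with the 16B config values above.
layout = resolve_layout(world_size=8)
local_batch_size, seq_len = 4, 4096
global_batch = local_batch_size * layout["dp_replicate"] * layout["dp_shard"]
print(layout)                                       # {'dp_replicate': 1, 'dp_shard': 8, 'tp': 1, 'pp': 1}
print("tokens per step:", global_batch * seq_len)   # 32 * 4096 = 131072

On a single 8-GPU node this gives an FSDP shard degree of 8 and roughly 131K tokens per optimizer step; the actual launch topology is determined by Primus at runtime.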
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:deepseek_v3_671b-pretrain}
workspace: ./output

modules:
  pre_trainer:
    framework: torchtitan
    config: pre_trainer.yaml

    # model to run
    model: deepseek_v3_671b.yaml
    overrides:
      profiling:
        enable_profiling: false
        save_traces_folder: "profile_trace"
        profile_freq: 10
        enable_memory_snapshot: false
        save_memory_snapshot_folder: "memory_snapshot"

      metrics:
        log_freq: 10
        disable_color_printing: false
        enable_tensorboard: false
        save_tb_folder: "tb"
        enable_wandb: false

      optimizer:
        name: "AdamW"
        lr: 2.2e-4
        eps: 1.0e-8

      lr_scheduler:
        warmup_steps: 200  # lr scheduler warm up, normally 20% of the train steps
        decay_ratio: 0.8  # lr scheduler decay ratio, 80% of the train steps
        decay_type: "cosine"
        min_lr_factor: 0.1

      training:
        local_batch_size: 4
        seq_len: 4096
        max_norm: 1.0  # grad norm clipping
        steps: 1000
        dataset: "c4"  # supported datasets: c4_test (2K), c4 (177M)

      parallelism:
        data_parallel_replicate_degree: 1
        data_parallel_shard_degree: -1
        fsdp_reshard_after_forward: "default"  # default / never / always
        tensor_parallel_degree: 1
        enable_async_tensor_parallel: false
        pipeline_parallel_degree: 1
        pipeline_parallel_schedule: "Interleaved1F1B"
        expert_parallel_degree: 1
        expert_tensor_parallel_degree: 1

      checkpoint:
        enable: false
        folder: "checkpoint"
        interval: 10
        last_save_model_only: true
        export_dtype: "float32"
        async_mode: "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

      activation_checkpoint:
        mode: "full"  # ["none", "selective", "full"]
        selective_ac_option: "op"  # 'int' = ac every positive int layer or 'op', ac based on ops policy

      compile:
        enable: true
        components: ["loss"]  # ["model", "loss"]

      # quantize:
      #   linear:
      #     float8:
      #       enable_fsdp_float8_all_gather: false
      #       precompute_float8_dynamic_scale_for_fsdp: false
      #       filter_fqns: ["output", "router.gate"]
      #   grouped_mm:
      #     float8:
      #       fqns: ["experts"]
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:deepseek_v3_16b-pretrain}
workspace: ./output

modules:
  pre_trainer:
    framework: torchtitan
    config: pre_trainer.yaml

    # model to run
    model: deepseek_v3_16b.yaml
    overrides:
      profiling:
        enable_profiling: false
        save_traces_folder: "profile_trace"
        profile_freq: 10
        enable_memory_snapshot: false
        save_memory_snapshot_folder: "memory_snapshot"

      metrics:
        log_freq: 10
        disable_color_printing: false
        enable_tensorboard: false
        save_tb_folder: "tb"
        enable_wandb: false

      optimizer:
        name: "AdamW"
        lr: 2.2e-4
        eps: 1.0e-8

      lr_scheduler:
        warmup_steps: 200  # lr scheduler warm up, normally 20% of the train steps
        decay_ratio: 0.8  # lr scheduler decay ratio, 80% of the train steps
        decay_type: "cosine"
        min_lr_factor: 0.1

      training:
        local_batch_size: 4
        seq_len: 4096
        max_norm: 1.0  # grad norm clipping
        steps: 1000
        dataset: "c4"  # supported datasets: c4_test (2K), c4 (177M)

      parallelism:
        data_parallel_replicate_degree: 1
        data_parallel_shard_degree: -1
        fsdp_reshard_after_forward: "default"  # default / never / always
        tensor_parallel_degree: 1
        enable_async_tensor_parallel: false
        pipeline_parallel_degree: 1
        pipeline_parallel_schedule: "Interleaved1F1B"
        expert_parallel_degree: 8
        expert_tensor_parallel_degree: 1

      checkpoint:
        enable: false
        folder: "checkpoint"
        interval: 10
        last_save_model_only: true
        export_dtype: "float32"
        async_mode: "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

      activation_checkpoint:
        mode: "none"  # ["none", "selective", "full"]
        selective_ac_option: "op"  # 'int' = ac every positive int layer or 'op', ac based on ops policy

      compile:
        enable: true
        components: ["loss"]  # ["model", "loss"]

      # quantize:
      #   linear:
      #     float8:
      #       enable_fsdp_float8_all_gather: false
      #       precompute_float8_dynamic_scale_for_fsdp: false
      #       filter_fqns: ["output", "router.gate"]
      #   grouped_mm:
      #     float8:
      #       fqns: ["experts"]
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:deepseek_v3_671b-pretrain}
workspace: ./output

modules:
  pre_trainer:
    framework: torchtitan
    config: pre_trainer.yaml

    # model to run
    model: deepseek_v3_671b.yaml
    overrides:
      profiling:
        enable_profiling: false
        save_traces_folder: "profile_trace"
        profile_freq: 10
        enable_memory_snapshot: false
        save_memory_snapshot_folder: "memory_snapshot"

      metrics:
        log_freq: 10
        disable_color_printing: false
        enable_tensorboard: false
        save_tb_folder: "tb"
        enable_wandb: false

      optimizer:
        name: "AdamW"
        lr: 2.2e-4
        eps: 1.0e-8

      lr_scheduler:
        warmup_steps: 200  # lr scheduler warm up, normally 20% of the train steps
        decay_ratio: 0.8  # lr scheduler decay ratio, 80% of the train steps
        decay_type: "cosine"
        min_lr_factor: 0.1

      training:
        local_batch_size: 4
        seq_len: 4096
        max_norm: 1.0  # grad norm clipping
        steps: 1000
        dataset: "c4"  # supported datasets: c4_test (2K), c4 (177M)

      parallelism:
        data_parallel_replicate_degree: 1
        data_parallel_shard_degree: -1
        fsdp_reshard_after_forward: "default"  # default / never / always
        tensor_parallel_degree: 1
        enable_async_tensor_parallel: false
        pipeline_parallel_degree: 1
        pipeline_parallel_schedule: "Interleaved1F1B"
        expert_parallel_degree: 1
        expert_tensor_parallel_degree: 1

      checkpoint:
        enable: false
        folder: "checkpoint"
        interval: 10
        last_save_model_only: true
        export_dtype: "float32"
        async_mode: "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

      activation_checkpoint:
        mode: "full"  # ["none", "selective", "full"]
        selective_ac_option: "op"  # 'int' = ac every positive int layer or 'op', ac based on ops policy

      compile:
        enable: true
        components: ["loss"]  # ["model", "loss"]

      # quantize:
      #   linear:
      #     float8:
      #       enable_fsdp_float8_all_gather: false
      #       precompute_float8_dynamic_scale_for_fsdp: false
      #       filter_fqns: ["output", "router.gate"]
      #   grouped_mm:
      #     float8:
      #       fqns: ["experts"]
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
job:
  dump_folder: "./outputs"
  description: "DeepSeek-V3 16B model training"
  print_config: false

model:
  name: "deepseek_v3"
  flavor: "16B"
  hf_assets_path: "deepseek-ai/deepseek-moe-16b-base"
  # converters: ["float8"]
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
job:
  dump_folder: "./outputs"
  description: "DeepSeek-V3 671B model training"
  print_config: false

model:
  name: "deepseek_v3"
  flavor: "671B"
  hf_assets_path: "deepseek-ai/DeepSeek-V3.1-Base"
  # converters: ["float8"]
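All six added configs use ${VAR:default} placeholders such as ${PRIMUS_TEAM:amd} and ${PRIMUS_EXP_NAME:deepseek_v3_16b-pretrain}. The resolver Primus actually applies is not part of this diff; the snippet below is only a sketch of the conventional reading, i.e. take the environment variable if it is set, otherwise fall back to the default after the colon.

# Illustrative sketch (assumption) of ${VAR:default} resolution; not Primus code.
import os
import re

_PLACEHOLDER = re.compile(r"\$\{([A-Z0-9_]+):([^}]*)\}")

def resolve(value: str) -> str:
    # Replace each ${VAR:default} with os.environ[VAR] when set, else the default.
    return _PLACEHOLDER.sub(lambda m: os.environ.get(m.group(1), m.group(2)), value)

print(resolve("${PRIMUS_EXP_NAME:deepseek_v3_16b-pretrain}"))
# -> "deepseek_v3_16b-pretrain" unless PRIMUS_EXP_NAME is exported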
