Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions erniekit/train/dpo/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,15 @@ def run_dpo(

set_seed(finetuning_args.seed)

if finetuning_args.pre_alloc_memory > 0:
logger.warning(
f"pre-allocating a tensor whose memory capacity is {finetuning_args.pre_alloc_memory} GB "
"and then release it."
)
memory_size = int(finetuning_args.pre_alloc_memory * 1024 * 1024 * 1024)
x = paddle.empty([memory_size], dtype=paddle.uint8)
del x

logger.warning(
f"Process rank: {finetuning_args.local_rank}, device: {finetuning_args.device}, world_size: "
f"{finetuning_args.world_size}, distributed training: {bool(finetuning_args.local_rank != -1)}, "
Expand Down
101 changes: 101 additions & 0 deletions examples/configs/iluvatar_gpu/ERNIE-4.5-21B-A3B/dpo/run_dpo_8k.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
### data — DPO preference-pair datasets (erniekit jsonl format)
train_dataset_type: "erniekit"
eval_dataset_type: "erniekit"
train_dataset_path: "./examples/data/dpo-train.jsonl"
train_dataset_prob: "1.0"  # sampling probability for this (single) dataset
eval_dataset_path: "./examples/data/dpo-eval.jsonl"
eval_dataset_prob: "1.0"
max_seq_len: 8192  # 8k context window — matches the "8k" in the config name
num_samples_each_epoch: 6000000

### model
model_name_or_path: baidu/ERNIE-4.5-21B-A3B-Paddle
moe_group: mp  # MoE expert communication runs over the model-parallel group
fine_tuning: Full  # full-parameter fine-tuning (a LoRA variant of this config also exists)
fuse_rope: True
fuse_linear: False

### finetuning
# base
stage: DPO
seed: 42
do_train: True
do_eval: True
distributed_dataloader: False
dataloader_num_workers: 1
batch_size: 1  # per-device micro-batch
num_train_epochs: 1
max_steps: 800
max_evaluate_steps: 10000
eval_steps: 20000
# NOTE(review): with evaluation_strategy/save_strategy set to "epoch",
# eval_steps/save_steps above are presumably ignored — confirm trainer semantics.
evaluation_strategy: epoch
save_steps: 100
save_total_limit: 5
save_strategy: epoch
logging_steps: 1
release_grads: True
gradient_accumulation_steps: 1
logging_dir: ./vdl_log
output_dir: ./output
disable_tqdm: True

# train
warmup_steps: 50
learning_rate: 5.0e-7
lr_scheduler_type: cosine
# NOTE(review): min_lr equals learning_rate, so the cosine schedule is
# effectively a constant LR after warmup — confirm this is intended.
min_lr: 5.0e-7
layerwise_lr_decay_bound: 1.0
attention_probs_dropout_prob: 0.0
dropout_warmup_steps: 100

# loss
offset_alpha: 0.0
scale_loss: 8192

# optimizer
weight_decay: 0.1
adam_epsilon: 1.0e-8
adam_beta1: 0.9
adam_beta2: 0.95
offload_optim: False

# performance
use_sp_callback: True
tensor_parallel_degree: 4
tensor_parallel_config: "sync_param sync_grad sync_moment"
pipeline_parallel_degree: 8
# NOTE(review): TP4 x PP8 => 32-way model parallelism; requires a multiple
# of 32 devices — verify against the launch script.
sharding_parallel_degree: 1
sharding: stage1
sequence_parallel: True
pipeline_parallel_config: disable_partial_send_recv enable_clear_every_step_cache
recompute: True
recompute_use_reentrant: True
compute_type: bf16
fp16_opt_level: O2
disable_ckpt_quant: True
amp_master_grad: True
# ops kept in low precision under AMP
amp_custom_white_list:
- "lookup_table"
- "lookup_table_v2"
- "flash_attn"
- "matmul"
- "matmul_v2"
- "fused_gemm_epilogue"
# ops forced to fp32 under AMP (numerically sensitive)
amp_custom_black_list:
- "reduce_sum"
- "softmax_with_cross_entropy"
- "c_softmax_with_cross_entropy"
- "elementwise_div"
- "sin"
- "cos"
unified_checkpoint: True
unified_checkpoint_config: async_save

use_flash_attention: True
use_sparse_head_and_loss_fn: False
use_attn_mask_start_row_indices: False
use_sparse_flash_attn: False
moe_multimodal_dispatch_use_allgather: "v2-alltoall"
device: iluvatar_gpu  # Iluvatar GPU backend (this config lives under examples/configs/iluvatar_gpu)
fuse_rms_norm: False
# GB. run_dpo allocates a uint8 tensor of this many GiB at startup and
# immediately frees it — presumably to grow the device memory pool up
# front and avoid fragmentation; confirm against workflow.py.
pre_alloc_memory: 20
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
### data — DPO preference-pair datasets (erniekit jsonl format)
train_dataset_type: "erniekit"
eval_dataset_type: "erniekit"
train_dataset_path: "./examples/data/dpo-train.jsonl"
train_dataset_prob: "1.0"  # sampling probability for this (single) dataset
eval_dataset_path: "./examples/data/dpo-eval.jsonl"
eval_dataset_prob: "1.0"
max_seq_len: 8192  # 8k context window
num_samples_each_epoch: 6000000

### model
model_name_or_path: baidu/ERNIE-4.5-21B-A3B-Paddle
moe_group: mp  # MoE expert communication runs over the model-parallel group
fine_tuning: LoRA  # parameter-efficient variant (cf. the Full config)
lora_rank: 32
lora_alpha: 128  # LoRA scaling numerator; with rslora, effective scale is presumably alpha/sqrt(rank) — confirm
lora_plus_scale: 12  # NOTE(review): looks like a LoRA+ B-matrix LR multiplier — verify against the trainer
rslora: True
fuse_rope: True
fuse_linear: True

### finetuning
# base
stage: DPO
seed: 42
do_train: True
do_eval: True
distributed_dataloader: True
dataloader_num_workers: 4
batch_size: 1  # per-device micro-batch
num_train_epochs: 1
max_steps: 800
max_evaluate_steps: 10000
eval_steps: 20000
# NOTE(review): with evaluation_strategy/save_strategy set to "epoch",
# eval_steps/save_steps above are presumably ignored — confirm trainer semantics.
evaluation_strategy: epoch
save_steps: 100
save_total_limit: 5
save_strategy: epoch
logging_steps: 1
release_grads: True
gradient_accumulation_steps: 8  # effective batch = 8 x batch_size per device
logging_dir: ./vdl_log
output_dir: ./output
disable_tqdm: True

# train
warmup_steps: 50
learning_rate: 5.0e-7
lr_scheduler_type: cosine
# NOTE(review): min_lr equals learning_rate, so the cosine schedule is
# effectively a constant LR after warmup — confirm this is intended.
min_lr: 5.0e-7
layerwise_lr_decay_bound: 1.0
attention_probs_dropout_prob: 0.1
dropout_warmup_steps: 100

# loss
offset_alpha: 0.0
scale_loss: 8192

# optimizer
weight_decay: 0.1
adam_epsilon: 1.0e-8
adam_beta1: 0.9
adam_beta2: 0.95
offload_optim: True  # offload optimizer state to host to save device memory

# performance
use_sp_callback: True
tensor_parallel_degree: 4
tensor_parallel_config: "sync_param sync_grad sync_moment"
pipeline_parallel_degree: 1  # no pipeline parallelism in the LoRA setup (TP4 only)
sharding_parallel_degree: 1
sharding: stage1
sequence_parallel: True
pipeline_parallel_config: disable_partial_send_recv enable_clear_every_step_cache disable_batch_p2p_comm
recompute: True
recompute_use_reentrant: True
compute_type: bf16
fp16_opt_level: O2
amp_master_grad: True
# ops kept in low precision under AMP
amp_custom_white_list:
- "lookup_table"
- "lookup_table_v2"
- "flash_attn"
- "matmul"
- "matmul_v2"
- "fused_gemm_epilogue"
# ops forced to fp32 under AMP (numerically sensitive)
amp_custom_black_list:
- "reduce_sum"
- "softmax_with_cross_entropy"
- "c_softmax_with_cross_entropy"
- "elementwise_div"
- "sin"
- "cos"
unified_checkpoint: True
unified_checkpoint_config: async_save

use_flash_attention: True
use_sparse_head_and_loss_fn: False
use_attn_mask_start_row_indices: False
use_sparse_flash_attn: False
moe_multimodal_dispatch_use_allgather: "v2-alltoall"
device: iluvatar_gpu  # Iluvatar GPU backend
fuse_rms_norm: False
# GB. run_dpo allocates a uint8 tensor of this many GiB at startup and
# immediately frees it — presumably to grow the device memory pool up
# front and avoid fragmentation; confirm against workflow.py.
pre_alloc_memory: 20
Loading