
Commit a66b4e9

Merge branch 'PaddlePaddle:develop' into feat/qwen2_5_vl_add_20251118
2 parents 8f618c0 + 85529fe commit a66b4e9


5 files changed: +71 -54 lines changed

examples/experiments/paddlefleet/glm45.json

Lines changed: 9 additions & 7 deletions
@@ -7,32 +7,34 @@
     "do_train": true,
     "do_eval": true,
     "per_device_eval_batch_size": 1,
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 2,
     "num_train_epochs": 1,
     "max_steps": 50,
     "eval_steps": 100,
     "evaluation_strategy": "steps",
     "save_steps": 10,
     "save_strategy": "steps",
     "logging_steps": 1,
-    "gradient_accumulation_steps": 8,
+    "gradient_accumulation_steps": 1,
     "logging_dir": "./24_layer_tp4_ep8_pp4_sp2_glm_vdl_log",
     "output_dir": "./checkpoints/24layer_tp4_ep8_pp4_sp2_glm_pretrain_ckpts",
     "disable_tqdm": true,
     "eval_accumulation_steps": 16,
     "warmup_steps": 20,
     "learning_rate": 0.00001,
-    "tensor_parallel_degree": 1,
+    "tensor_parallel_degree": 4,
     "pipeline_parallel_degree": 1,
-    "use_expert_parallel": false,
-    "expert_parallel_degree": 8,
-    "sequence_parallel": false,
+    "use_expert_parallel": true,
+    "expert_parallel_degree": 16,
+    "sequence_parallel": true,
     "sharding_parallel_config": "split_param",
     "amp_master_grad": true,
     "sharding": "stage1",
     "recompute": false,
     "bf16": true,
     "fp16_opt_level": "O2",
     "load_via_cpu": true,
-    "save_to_hf": false
+    "save_to_hf": false,
+    "save_checkpoint_format": "flex_checkpoint",
+    "load_checkpoint_format": "flex_checkpoint"
 }
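Note: the batch-size changes above shift each data-parallel rank's contribution per optimizer step from 1 × 8 = 8 samples to 2 × 1 = 2. A minimal arithmetic sketch of the interaction, assuming the usual convention that the global batch is per-device batch × gradient accumulation × data-parallel degree (the data-parallel degree comes from the launch topology, not from this file, so the value below is purely illustrative):

per_device_train_batch_size = 2      # new value in glm45.json above
gradient_accumulation_steps = 1      # new value in glm45.json above
data_parallel_degree = 4             # illustrative only; set by the launch, not by this file

# Samples consumed per optimizer step under the usual global-batch convention.
global_batch_size = (
    per_device_train_batch_size * gradient_accumulation_steps * data_parallel_degree
)
print(global_batch_size)  # 8 with these illustrative numbers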

examples/experiments/paddlefleet/glm45_provider.py

Lines changed: 8 additions & 12 deletions
@@ -130,20 +130,16 @@ class GLM45AirModelProvider106B(GLMMoEModelProvider):


 @dataclass
-class GLM45AirModelDebugProvider(GLMMoEModelProvider):
+class GLM45AirModelDebugProvider(GLM45AirModelProvider106B):
     """
     Provider for GLM 4.5 Air 106B-A12B: https://huggingface.co/zai-org/GLM-4.5-Air
     """

-    num_layers: int = 1
-    # num_moe_experts: int = 128
-    hidden_size: int = 512
-    ffn_hidden_size: int = 512
-    moe_layer_freq: Union[int, List[int]] = field(
-        default_factory=lambda: [0] * 1 + [1] * 45
-    )  # first one layer is dense
-    moe_ffn_hidden_size: int = 1408
-    moe_shared_expert_intermediate_size: int = 1408
-    qk_layernorm: bool = False
-    moe_router_topk_scaling_factor: float = 1.0
+    num_layers: int = 10
+    moe_num_shared_experts: int = 1
+    hidden_size: int = 128
+    ffn_hidden_size: int = 128
+    moe_intermediate_size: int = 1408
     mtp_num_layers: Optional[int] = 0
+    use_bias: bool = False
+    vocab_size: int = 37888
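Because the debug provider now subclasses GLM45AirModelProvider106B instead of the generic GLMMoEModelProvider, it only has to override the handful of fields that shrink the model for debugging; every other setting defined on the full-size provider is inherited. A minimal sketch of that dataclass-inheritance pattern, using stand-in classes and illustrative values rather than the real providers:

from dataclasses import dataclass


@dataclass
class FullSizeProvider:
    # Stand-in for GLM45AirModelProvider106B; field values are illustrative.
    num_layers: int = 46
    hidden_size: int = 4096
    vocab_size: int = 151552


@dataclass
class DebugProvider(FullSizeProvider):
    # Only the debug-sized fields are overridden; the rest is inherited.
    num_layers: int = 10
    hidden_size: int = 128


print(DebugProvider())
# DebugProvider(num_layers=10, hidden_size=128, vocab_size=151552)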

examples/experiments/paddlefleet/run_glm45.sh

Lines changed: 20 additions & 33 deletions
@@ -12,42 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-#START_RANK=0 # change to the machine rank actually in use
-#END_RANK=4 # change to the machine rank actually in use
+START_RANK=0 # change to the machine rank actually in use
+END_RANK=2 # change to the machine rank actually in use

-#if [[ $rank -lt $START_RANK ]]; then
-# exit 0
-#fi
+if [[ $rank -lt $START_RANK ]]; then
+    exit 0
+fi

-#if [[ $rank -ge $END_RANK ]]; then
-# exit 0
-#fi
-#nnodes=$(($END_RANK-$START_RANK))
-#master=`cat /root/paddlejob/workspace/hostfile | head -n $(($START_RANK+1)) | tail -n 1 | awk '{print $1}'`
-# master=10.54.107.148
-#port=36677
+if [[ $rank -ge $END_RANK ]]; then
+    exit 0
+fi
+nnodes=$(($END_RANK-$START_RANK))
+master=`cat /root/paddlejob/workspace/hostfile | head -n $(($START_RANK+1)) | tail -n 1 | awk '{print $1}'`
+port=36677

-#rank=$(($rank-$START_RANK))
+rank=$(($rank-$START_RANK))
 #bash script/kill_process.sh
 #sleep 5

-#rm core.* -rf
-# rank_id=$(echo "$LAUNCH_CMD" | sed -n 's/.*--rank \([0-9]*\).*/\1/p')
-#rm -rf /root/paddlejob/share-storage/gpfs/system-public/path/to/your/outputs # change to your own output directory
-
-# ls /root/paddlejob/share-storage/gpfs/system-public/huggingface_model/GLM-4.5-Air
-
-export PYTHONPATH=/workspace/PaddleFleet:/workspace/PaddleFleet/examples/experiments/paddlefleet # change to your own PaddleFleet path
-export CUDA_VISIBLE_DEVICES=0
-
-python run_pretrain.py glm45.json \
-    --output_dir /workspace/PaddleFormers/examples/experiments/paddlefleet/outputs # change to your own model output directory
-
-#python3.10 -m paddle.distributed.launch \
-#    --log_dir /root/paddlejob/share-storage/gpfs/system-public/zhangyichen/outputs/output_$rank/paddle_distributed_logs \ # change to your own log directory
-#    --master $master:$port \
-#    --nnodes $nnodes \
-#    --rank $rank \
-#    --run_mode=collective \
-#    ${script:-run_finetune.py} \
-#    $@
+python -m paddle.distributed.launch \
+    --log_dir ./outputs/output_$rank/paddle_distributed_logs \
+    --master $master:$port \
+    --nnodes $nnodes \
+    --rank $rank \
+    --run_mode=collective \
+    run_pretrain.py glm45.json \
+    --output_dir . # change to your own model output directory
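The re-enabled rank gating works as follows: nodes whose rank falls outside [START_RANK, END_RANK) exit immediately, nnodes is the size of that window, and the surviving nodes re-base their rank so the launcher sees ranks 0..nnodes-1. A small sketch of the same arithmetic (START_RANK/END_RANK taken from the script above; the node ranks are illustrative):

START_RANK, END_RANK = 0, 2          # values from run_glm45.sh
nnodes = END_RANK - START_RANK       # number of participating nodes

for rank in range(4):                # illustrative cluster of 4 node ranks
    if rank < START_RANK or rank >= END_RANK:
        print(f"node {rank}: exits before launching")
    else:
        print(f"node {rank}: launches with --rank {rank - START_RANK} of --nnodes {nnodes}")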

examples/experiments/paddlefleet/run_pretrain.py

Lines changed: 29 additions & 2 deletions
@@ -14,12 +14,15 @@
 import copy
 import math
 import os
+import random
 import sys
 import time
 from dataclasses import dataclass, field
 from typing import Optional

+import numpy as np
 import paddle
+import paddlefleet

 from paddleformers.data.causal_dataset import (
     build_train_valid_test_datasets,
@@ -34,7 +37,6 @@
     StepFlexToken,
     TrainingArguments,
     get_last_checkpoint,
-    set_seed,
     speed_metrics,
 )
 from paddleformers.trainer.trainer import Trainer
@@ -350,6 +352,31 @@ def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
         )


+def _set_random_seed(
+    seed_: int,
+    data_parallel_random_init: bool = False,
+    te_rng_tracker: bool = False,
+    inference_rng_tracker: bool = False,
+    use_cudagraphable_rng: bool = False,
+):
+    """Set random seed for reproducability."""
+    if seed_ is not None and seed_ > 0:
+        # Ensure that different pipeline MP stages get different seeds.
+        seed = seed_ + (100 * paddlefleet.parallel_state.get_pipeline_model_parallel_rank())
+        # Ensure different data parallel ranks get different seeds
+        if data_parallel_random_init:
+            seed = seed + (10 * paddlefleet.parallel_state.get_data_parallel_rank())
+        random.seed(seed)
+        np.random.seed(seed)
+        paddle.manual_seed(seed)
+        if paddle.cuda.device_count() > 0:
+            paddlefleet.tensor_parallel.model_parallel_cuda_manual_seed(
+                seed, te_rng_tracker, inference_rng_tracker, use_cudagraphable_rng
+            )
+    else:
+        raise ValueError("Seed ({}) should be a positive integer.".format(seed_))
+
+
 def main():
     parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments))
     # Support format as "args.json --arg1 value1 --arg2 value2."
@@ -374,7 +401,7 @@ def main():
     os.makedirs(data_args.data_cache, exist_ok=True)

     paddle.set_device(training_args.device)
-    set_seed(seed=training_args.seed)
+    _set_random_seed(seed_=training_args.seed)

     training_args.eval_iters = 10
     training_args.test_iters = training_args.eval_iters * 10
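The new _set_random_seed helper replaces PaddleFormers' set_seed and derives a distinct seed per parallel rank: an offset of 100 per pipeline-parallel rank, plus 10 per data-parallel rank when data_parallel_random_init is enabled. A quick sketch of the resulting seed values (the rank counts below are illustrative):

base_seed = 42                        # stand-in for training_args.seed

# Same offsets as _set_random_seed above: +100 per pipeline-parallel rank,
# +10 per data-parallel rank when data_parallel_random_init is enabled.
for pp_rank in range(2):              # illustrative pipeline-parallel ranks
    for dp_rank in range(2):          # illustrative data-parallel ranks
        seed = base_seed + 100 * pp_rank + 10 * dp_rank
        print(f"pp={pp_rank} dp={dp_rank} -> seed {seed}")
# pp=0 dp=0 -> 42, pp=0 dp=1 -> 52, pp=1 dp=0 -> 142, pp=1 dp=1 -> 152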

paddleformers/trainer/training_args.py

Lines changed: 5 additions & 0 deletions
@@ -1750,6 +1750,11 @@ def is_context_parallel_supported():
         strategy = init_nccl_config(self.nccl_comm_group_config, strategy)

         fleet.init(is_collective=True, strategy=strategy)
+
+        # In PaddleFleet, we should use the following code to initialize.
+
+        # from paddlefleet.training.initialize import initialize_fleet
+        # initialize_fleet(strategy)
         logger.info(strategy)

         if self.reorder_pipeline_priority:
