-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Open
Labels
bug: Something isn't working
Description
Describe the bug
Traceback (most recent call last):
File "/opt/tiger/ms-swift/swift/cli/_megatron/sft.py", line 7, in <module>
megatron_sft_main()
File "/opt/tiger/ms-swift/swift/megatron/train/sft.py", line 87, in megatron_sft_main
return MegatronSft(args).main()
^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/tiger/ms-swift/swift/llm/base.py", line 49, in main
result = self.run()
^^^^^^^^^^
File "/opt/tiger/ms-swift/swift/megatron/train/sft.py", line 77, in run
self.trainer.train(train_dataset, val_dataset, data_collator)
File "/opt/tiger/ms-swift/swift/megatron/trainers/base.py", line 1098, in train
pretrain(
File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 737, in pretrain
iteration, num_floating_point_operations_so_far = train(
^^^^^^
File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 2451, in train
should_exit = checkpoint_and_decide_exit(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 1870, in checkpoint_and_decide_exit
save_checkpoint_and_time(
File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 1741, in save_checkpoint_and_time
save_checkpoint(
File "/opt/tiger/ms-swift/swift/megatron/trainers/base.py", line 1038, in save_checkpoint
self._origin_save_checkpoint(iteration, model, *_args, **kwargs)
File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/checkpointing.py", line 691, in save_checkpoint
wandb_finalize_fn()
File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/checkpointing.py", line 686, in wandb_finalize_fn
wandb_utils.on_save_checkpoint_success(checkpoint_name, get_checkpoint_tracker_filename(save_dir), save_dir, iteration)
File "/opt/tiger/ms-swift/swift/megatron/__init__.py", line 860, in on_save_checkpoint_success
origin_on_save_checkpoint_success(*_args, **kwargs)
File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/wandb_utils.py", line 38, in on_save_checkpoint_success
artifact.add_file(tracker_filename)
File "/home/tiger/.local/lib/python3.11/site-packages/wandb/sdk/wandb_artifacts.py", line 406, in add_file
raise ValueError("Path is not a file: %s" % local_path)
ValueError: Path is not a file: /mnt/hdfs/lhr.217_syd/experiments/sft/I-60-E-40-50B-Qwen3-8B-LR_2e-5-Dolci-Think-SFT-10w-cosine-1e-5-bs128/v2-20260117-102329/checkpoint-100/latest_checkpointed_iteration.txt
[rank15]: Traceback (most recent call last):
[rank15]: File "/opt/tiger/ms-swift/swift/cli/_megatron/sft.py", line 7, in <module>
[rank15]: megatron_sft_main()
[rank15]: File "/opt/tiger/ms-swift/swift/megatron/train/sft.py", line 87, in megatron_sft_main
[rank15]: return MegatronSft(args).main()
[rank15]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank15]: File "/opt/tiger/ms-swift/swift/llm/base.py", line 49, in main
[rank15]: result = self.run()
[rank15]: ^^^^^^^^^^
[rank15]: File "/opt/tiger/ms-swift/swift/megatron/train/sft.py", line 77, in run
[rank15]: self.trainer.train(train_dataset, val_dataset, data_collator)
[rank15]: File "/opt/tiger/ms-swift/swift/megatron/trainers/base.py", line 1098, in train
[rank15]: pretrain(
[rank15]: File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 737, in pretrain
[rank15]: iteration, num_floating_point_operations_so_far = train(
[rank15]: ^^^^^^
[rank15]: File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 2451, in train
[rank15]: should_exit = checkpoint_and_decide_exit(
[rank15]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank15]: File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 1870, in checkpoint_and_decide_exit
[rank15]: save_checkpoint_and_time(
[rank15]: File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 1741, in save_checkpoint_and_time
[rank15]: save_checkpoint(
[rank15]: File "/opt/tiger/ms-swift/swift/megatron/trainers/base.py", line 1038, in save_checkpoint
[rank15]: self._origin_save_checkpoint(iteration, model, *_args, **kwargs)
[rank15]: File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/checkpointing.py", line 691, in save_checkpoint
[rank15]: wandb_finalize_fn()
[rank15]: File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/checkpointing.py", line 686, in wandb_finalize_fn
[rank15]: wandb_utils.on_save_checkpoint_success(checkpoint_name, get_checkpoint_tracker_filename(save_dir), save_dir, iteration)
[rank15]: File "/opt/tiger/ms-swift/swift/megatron/__init__.py", line 860, in on_save_checkpoint_success
[rank15]: origin_on_save_checkpoint_success(*_args, **kwargs)
[rank15]: File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/wandb_utils.py", line 38, in on_save_checkpoint_success
[rank15]: artifact.add_file(tracker_filename)
[rank15]: File "/home/tiger/.local/lib/python3.11/site-packages/wandb/sdk/wandb_artifacts.py", line 406, in add_file
[rank15]: raise ValueError("Path is not a file: %s" % local_path)
[rank15]: ValueError: Path is not a file: /mnt/hdfs/lhr.217_syd/experiments/sft/I-60-E-40-50B-Qwen3-8B-LR_2e-5-Dolci-Think-SFT-10w-cosine-1e-5-bs128/v2-20260117-102329/checkpoint-100/latest_checkpointed_iteration.txt
Your hardware and system info
ms-swift 3.13.0.dev0
``
Additional context
运行命令:
pip3 install -e .
pip3 install -e /opt/tiger/Megatron-LM-core_r0.15.0
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=8 \
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NNODES=2 \
NODE_RANK=xxx \
MASTER_ADDR=xx \
MASTER_PORT=xxx \
megatron sft \
--model ${MODEL_DIR} \
--model_type qwen3 \
--load_safetensors true \
--save_safetensors true \
--dataset '/mnt/hdfs/lhr.217_syd/data/sft/Dolci-Think-SFT/Dolci-Think-SFT-10w.jsonl' \
\
--pipeline_model_parallel_size 1 \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--attention_backend flash \
--cross_entropy_loss_fusion true \
\
--micro_batch_size 1 \
--global_batch_size 128 \
--packing false \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--max_epochs 1 \
--finetune true \
\
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--lr_decay_style cosine \
--min_lr 1e-6 \
\
--save ${MYSAVE_DIR} \
--eval_interval 20 \
--save_interval 100 \
--max_length 16384 \
--num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--wandb_project ${WANDB_PROJECT} \
--wandb_exp_name ${WANDB_EXP_NAME} \
--report_to wandb \
--tensorboard_dir ./local_tensorboard \
--log_interval 1
取消 `--report_to wandb` 后不会报错。
Metadata
Assignees
Labels
bug: Something isn't working