Skip to content

mcore cannot find latest_checkpointed_iteration.txt #7628

@haorannlp

Description

@haorannlp

Describe the bug

Traceback (most recent call last):
  File "/opt/tiger/ms-swift/swift/cli/_megatron/sft.py", line 7, in <module>
    megatron_sft_main()
  File "/opt/tiger/ms-swift/swift/megatron/train/sft.py", line 87, in megatron_sft_main
    return MegatronSft(args).main()
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/tiger/ms-swift/swift/llm/base.py", line 49, in main
    result = self.run()
             ^^^^^^^^^^
  File "/opt/tiger/ms-swift/swift/megatron/train/sft.py", line 77, in run
    self.trainer.train(train_dataset, val_dataset, data_collator)
  File "/opt/tiger/ms-swift/swift/megatron/trainers/base.py", line 1098, in train
    pretrain(
  File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 737, in pretrain
    iteration, num_floating_point_operations_so_far = train(
                                                      ^^^^^^
  File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 2451, in train
    should_exit = checkpoint_and_decide_exit(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 1870, in checkpoint_and_decide_exit
    save_checkpoint_and_time(
  File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 1741, in save_checkpoint_and_time
    save_checkpoint(
  File "/opt/tiger/ms-swift/swift/megatron/trainers/base.py", line 1038, in save_checkpoint
    self._origin_save_checkpoint(iteration, model, *_args, **kwargs)
  File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/checkpointing.py", line 691, in save_checkpoint
    wandb_finalize_fn()
  File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/checkpointing.py", line 686, in wandb_finalize_fn
    wandb_utils.on_save_checkpoint_success(checkpoint_name, get_checkpoint_tracker_filename(save_dir), save_dir, iteration)
  File "/opt/tiger/ms-swift/swift/megatron/init.py", line 860, in on_save_checkpoint_success
    origin_on_save_checkpoint_success(*_args, **kwargs)
  File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/wandb_utils.py", line 38, in on_save_checkpoint_success
    artifact.add_file(tracker_filename)
  File "/home/tiger/.local/lib/python3.11/site-packages/wandb/sdk/wandb_artifacts.py", line 406, in add_file
    raise ValueError("Path is not a file: %s" % local_path)
ValueError: Path is not a file: /mnt/hdfs/lhr.217_syd/experiments/sft/I-60-E-40-50B-Qwen3-8B-LR_2e-5-Dolci-Think-SFT-10w-cosine-1e-5-bs128/v2-20260117-102329/checkpoint-100/latest_checkpointed_iteration.txt
[rank15]: Traceback (most recent call last):
[rank15]:   File "/opt/tiger/ms-swift/swift/cli/_megatron/sft.py", line 7, in <module>
[rank15]:     megatron_sft_main()
[rank15]:   File "/opt/tiger/ms-swift/swift/megatron/train/sft.py", line 87, in megatron_sft_main
[rank15]:     return MegatronSft(args).main()
[rank15]:            ^^^^^^^^^^^^^^^^^^^^^^^^
[rank15]:   File "/opt/tiger/ms-swift/swift/llm/base.py", line 49, in main
[rank15]:     result = self.run()
[rank15]:              ^^^^^^^^^^
[rank15]:   File "/opt/tiger/ms-swift/swift/megatron/train/sft.py", line 77, in run
[rank15]:     self.trainer.train(train_dataset, val_dataset, data_collator)
[rank15]:   File "/opt/tiger/ms-swift/swift/megatron/trainers/base.py", line 1098, in train
[rank15]:     pretrain(
[rank15]:   File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 737, in pretrain
[rank15]:     iteration, num_floating_point_operations_so_far = train(
[rank15]:                                                       ^^^^^^
[rank15]:   File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 2451, in train
[rank15]:     should_exit = checkpoint_and_decide_exit(
[rank15]:                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank15]:   File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 1870, in checkpoint_and_decide_exit
[rank15]:     save_checkpoint_and_time(
[rank15]:   File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/training.py", line 1741, in save_checkpoint_and_time
[rank15]:     save_checkpoint(
[rank15]:   File "/opt/tiger/ms-swift/swift/megatron/trainers/base.py", line 1038, in save_checkpoint
[rank15]:     self._origin_save_checkpoint(iteration, model, *_args, **kwargs)
[rank15]:   File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/checkpointing.py", line 691, in save_checkpoint
[rank15]:     wandb_finalize_fn()
[rank15]:   File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/checkpointing.py", line 686, in wandb_finalize_fn
[rank15]:     wandb_utils.on_save_checkpoint_success(checkpoint_name, get_checkpoint_tracker_filename(save_dir), save_dir, iteration)
[rank15]:   File "/opt/tiger/ms-swift/swift/megatron/init.py", line 860, in on_save_checkpoint_success
[rank15]:     origin_on_save_checkpoint_success(*_args, **kwargs)
[rank15]:   File "/opt/tiger/Megatron-LM-core_r0.15.0/megatron/training/wandb_utils.py", line 38, in on_save_checkpoint_success
[rank15]:     artifact.add_file(tracker_filename)
[rank15]:   File "/home/tiger/.local/lib/python3.11/site-packages/wandb/sdk/wandb_artifacts.py", line 406, in add_file
[rank15]:     raise ValueError("Path is not a file: %s" % local_path)
[rank15]: ValueError: Path is not a file: /mnt/hdfs/lhr.217_syd/experiments/sft/I-60-E-40-50B-Qwen3-8B-LR_2e-5-Dolci-Think-SFT-10w-cosine-1e-5-bs128/v2-20260117-102329/checkpoint-100/latest_checkpointed_iteration.txt

Your hardware and system info
`ms-swift 3.13.0.dev0`

Additional context
运行命令:

pip3 install -e .
pip3 install -e /opt/tiger/Megatron-LM-core_r0.15.0


CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=8 \
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NNODES=2 \
NODE_RANK=xxx \
MASTER_ADDR=xx \
MASTER_PORT=xxx \
megatron sft \
    --model ${MODEL_DIR} \
    --model_type qwen3 \
    --load_safetensors true \
    --save_safetensors true \
    --dataset '/mnt/hdfs/lhr.217_syd/data/sft/Dolci-Think-SFT/Dolci-Think-SFT-10w.jsonl' \
    \
    --pipeline_model_parallel_size 1 \
    --tensor_model_parallel_size 2 \
    --sequence_parallel true \
    --attention_backend flash \
    --cross_entropy_loss_fusion true \
    \
    --micro_batch_size 1 \
    --global_batch_size 128 \
    --packing false \
    --recompute_granularity full \
    --recompute_method uniform \
    --recompute_num_layers 1 \
    --max_epochs 1 \
    --finetune true \
    \
    --lr 1e-5 \
    --lr_warmup_fraction 0.05 \
    --lr_decay_style cosine \
    --min_lr 1e-6 \
    \
    --save ${MYSAVE_DIR} \
    --eval_interval 20 \
    --save_interval 100 \
    --max_length 16384 \
    --num_workers 8 \
    --dataset_num_proc 8 \
    --no_save_optim true \
    --no_save_rng true \
    --wandb_project ${WANDB_PROJECT} \
    --wandb_exp_name ${WANDB_EXP_NAME} \
    --report_to wandb \
    --tensorboard_dir ./local_tensorboard \
    --log_interval 1

取消 `--report_to wandb` 后不会报错。

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions