Skip to content

[BUG] verl deepseek save error #68

@wuyaoxuehun

Description

@wuyaoxuehun

Using the latest verl code to train DeepSeek-V3.1, saving a checkpoint raises the following error, but this error
is not observed when switching to the commit at https://github.com/ISEEKYAN/mbridge/pull/58/commits. Was a bug introduced after this commit?

Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/main_ppo.py", line 448, in <module>
    main()
  File "/usr/local/lib/python3.10/dist-packages/hydra/main.py", line 94, in decorated_main
    _run_hydra(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 394, in _run_hydra
    _run_app(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 457, in _run_app
    run_and_report(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 223, in run_and_report
    raise ex
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 220, in run_and_report
    return func()
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 458, in <lambda>
    lambda: hydra.run(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py", line 132, in run
    _ = ret.return_value
  File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 260, in return_value
    raise self._return_value
  File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 186, in run_job
    ret.return_value = task_function(task_cfg)
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/main_ppo.py", line 45, in main
    run_ppo(config)
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/main_ppo.py", line 99, in run_ppo
    ray.get(runner.run.remote(config))
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2882, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 968, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TypeError): ray::TaskRunner.run() (pid=94773, ip=10.146.228.160, actor_id=200d59195885ddfe5f14e8ff0f000000, repr=<main_ppo.TaskRunner object at 0x7f66ec081420>)
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/main_ppo.py", line 367, in run
    trainer.fit()
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/ppo/ray_trainer.py", line 1674, in fit
    self._save_checkpoint()
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/ppo/ray_trainer.py", line 1006, in _save_checkpoint
    self.actor_rollout_wg.save_checkpoint(
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/single_controller/ray/base.py", line 54, in __call__
    output = ray.get(output)
ray.exceptions.RayTaskError(TypeError): ray::WorkerDict.actor_rollout_save_checkpoint() (pid=47774, ip=10.146.225.239, actor_id=b7fc25b8c3492b9329491d9f0f000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7ed9926fda80>)
  File "/usr/lib/python3.10/concurrent/futures/_base.py", line 458, in result
    return self.__get_result()
  File "/usr/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
    raise self._exception
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/single_controller/ray/base.py", line 852, in func
    return getattr(self.worker_dict[key], name)(*args, **kwargs)
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/single_controller/base/decorator.py", line 462, in inner
    return func(*args, **kwargs)
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/utils/transferqueue_utils.py", line 314, in dummy_inner
    output = func(*args, **kwargs)
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/workers/megatron_workers.py", line 940, in save_checkpoint
    self.checkpoint_mananager.save_checkpoint(
  File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/utils/checkpoint/megatron_checkpoint_manager.py", line 496, in save_checkpoint
    self.bridge.save_weights(
  File "/usr/local/lib/python3.10/dist-packages/mbridge/core/bridge.py", line 428, in save_weights
    return self._save_weights_fast(models, weights_path)
  File "/usr/local/lib/python3.10/dist-packages/mbridge/core/bridge.py", line 383, in _save_weights_fast
    if w_files[0][4] > 0:
TypeError: '>' not supported between instances of 'NoneType' and 'int'

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions