-
Notifications
You must be signed in to change notification settings - Fork 64
[BUG] verl deepseek save error #68
Copy link
Copy link
Open
Description
When training DeepSeek-V3.1 with the latest verl code, saving a checkpoint raises the following error, but this error
is not observed after switching to commit https://github.com/ISEEKYAN/mbridge/pull/58/commits. Was a bug introduced after this commit?
Traceback (most recent call last):
File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/main_ppo.py", line 448, in <module>
main()
File "/usr/local/lib/python3.10/dist-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 458, in <lambda>
lambda: hydra.run(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/main_ppo.py", line 45, in main
run_ppo(config)
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/main_ppo.py", line 99, in run_ppo
ray.get(runner.run.remote(config))
File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2882, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 968, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TypeError): ray::TaskRunner.run() (pid=94773, ip=10.146.228.160, actor_id=200d59195885ddfe5f14e8ff0f000000, repr=<main_ppo.TaskRunner object at 0x7f66ec081420>)
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/main_ppo.py", line 367, in run
trainer.fit()
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/ppo/ray_trainer.py", line 1674, in fit
self._save_checkpoint()
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/trainer/ppo/ray_trainer.py", line 1006, in _save_checkpoint
self.actor_rollout_wg.save_checkpoint(
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/single_controller/ray/base.py", line 54, in __call__
output = ray.get(output)
ray.exceptions.RayTaskError(TypeError): ray::WorkerDict.actor_rollout_save_checkpoint() (pid=47774, ip=10.146.225.239, actor_id=b7fc25b8c3492b9329491d9f0f000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7ed9926fda80>)
File "/usr/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/usr/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/single_controller/ray/base.py", line 852, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/single_controller/base/decorator.py", line 462, in inner
return func(*args, **kwargs)
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/utils/transferqueue_utils.py", line 314, in dummy_inner
output = func(*args, **kwargs)
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/workers/megatron_workers.py", line 940, in save_checkpoint
self.checkpoint_mananager.save_checkpoint(
File "/tmp/ray/session_2026-01-20_07-51-48_908055_65/runtime_resources/working_dir_files/_ray_pkg_ba95acd4e5aae7fb/verl/utils/checkpoint/megatron_checkpoint_manager.py", line 496, in save_checkpoint
self.bridge.save_weights(
File "/usr/local/lib/python3.10/dist-packages/mbridge/core/bridge.py", line 428, in save_weights
return self._save_weights_fast(models, weights_path)
File "/usr/local/lib/python3.10/dist-packages/mbridge/core/bridge.py", line 383, in _save_weights_fast
if w_files[0][4] > 0:
TypeError: '>' not supported between instances of 'NoneType' and 'int'
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels