
Runtime error when running the PPO training example on the GSM8K dataset: "RuntimeError: setStorage:" #1565

Closed

Description

@ambarion

Training Progress: 0%| | 0/290 [00:09<?, ?it/s]
Error executing job with overrides: ['data.train_files=./data/gsm8k/train.parquet', 'data.val_files=./data/gsm8k/test.parquet', 'data.train_batch_size=256', 'data.max_prompt_length=512', 'data.max_response_length=256', 'actor_rollout_ref.model.path=/data/Qwen/Qwen2.5-0.5B-Instruct', 'actor_rollout_ref.actor.optim.lr=1e-6', 'actor_rollout_ref.actor.ppo_mini_batch_size=64', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4', 'actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8', 'actor_rollout_ref.rollout.tensor_model_parallel_size=1', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.4', 'actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4', 'critic.optim.lr=1e-5', 'critic.model.path=/data/Qwen/Qwen2.5-0.5B-Instruct', 'critic.ppo_micro_batch_size_per_gpu=4', 'algorithm.kl_ctrl.kl_coef=0.001', 'trainer.logger=[console,wandb]', 'trainer.val_before_train=False', 'trainer.default_hdfs_dir=null', 'trainer.n_gpus_per_node=1', 'trainer.nnodes=1', 'trainer.save_freq=100', 'trainer.test_freq=10', 'trainer.total_epochs=10', 'trainer.project_name=verl', 'trainer.experiment_name=qwen2_5-0_5b_math_gsm8k']
Traceback (most recent call last):
File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/workspace/verl/verl/verl/trainer/main_ppo.py", line 246, in
main()
File "/usr/local/lib/python3.10/dist-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 458, in
lambda: hydra.run(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/workspace/verl/verl/verl/trainer/main_ppo.py", line 64, in main
run_ppo(config)
File "/workspace/verl/verl/verl/trainer/main_ppo.py", line 76, in run_ppo
ray.get(runner.run.remote(config))
File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2771, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 919, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::TaskRunner.run() (pid=39668, ip=10.249.46.87, actor_id=20407956d3cbc47bae10f7b301000000, repr=<main_ppo.TaskRunner object at 0x7ec7a380d5a0>)
File "/workspace/verl/verl/verl/trainer/main_ppo.py", line 183, in run
trainer.fit()
File "/workspace/verl/verl/verl/trainer/ppo/ray_trainer.py", line 960, in fit
old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
File "/workspace/verl/verl/verl/single_controller/ray/base.py", line 49, in func
output = ray.get(output)
ray.exceptions.RayTaskError(RuntimeError): ray::WorkerDict.actor_rollout_compute_log_prob() (pid=40136, ip=10.249.46.87, actor_id=45db6b3ff11f5e2edafcfb2601000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7333047b96c0>)
File "/workspace/verl/verl/verl/single_controller/ray/base.py", line 466, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
File "/workspace/verl/verl/verl/single_controller/base/decorator.py", line 501, in inner
return func(*args, **kwargs)
File "/workspace/verl/verl/verl/workers/fsdp_workers.py", line 658, in compute_log_prob
output, entropys = self.actor.compute_log_prob(data=data, calculate_entropy=True)
File "/workspace/verl/verl/verl/utils/debug/performance.py", line 78, in f
return self.log(decorated_function, *args, **kwargs)
File "/workspace/verl/verl/verl/utils/debug/performance.py", line 88, in log
output = func(*args, **kwargs)
File "/workspace/verl/verl/verl/workers/actor/dp_actor.py", line 298, in compute_log_prob
entropy, log_probs = self._forward_micro_batch(micro_batch, temperature=temperature, calculate_entropy=calculate_entropy)
File "/workspace/verl/verl/verl/workers/actor/dp_actor.py", line 215, in _forward_micro_batch
log_probs, entropy = self.fused_linear_for_ppo(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/verl/verl/verl/utils/experimental/torch_functional.py", line 211, in forward
return FusedLinearForPPOFunction.apply(
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 575, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/workspace/verl/verl/verl/utils/experimental/torch_functional.py", line 109, in forward
chunk_log_probs, chunk_entropy = _fused_linear_for_ppo_fwd(
File "/workspace/verl/verl/verl/utils/experimental/torch_functional.py", line 25, in _fused_linear_for_ppo_fwd
logits = (hidden_states @ vocab_weights.t()) / temperature
RuntimeError: setStorage: sizes [896, 151936], strides [1, 896], storage offset 0, and itemsize 2 requiring a storage size of 272269312 are out of bounds for storage of size 0
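
For what it's worth, my reading of the numbers in that message (my own arithmetic, not something verl prints): 896 and 151936 match the hidden size and vocabulary size of Qwen2.5-0.5B, strides [1, 896] are what vocab_weights.t() produces, and 272269312 is simply the byte extent such a view needs. The view itself is consistent; the real problem is the "storage of size 0", i.e. the vocab_weights tensor handed to _fused_linear_for_ppo_fwd has no backing data at the moment it is used. A small sketch of the arithmetic:

# Sanity check of the sizes in the setStorage error (my own arithmetic, not verl output).
# A strided view needs storage up to its largest reachable element, times the item size.
sizes = (896, 151936)   # [hidden_size, vocab_size] of Qwen2.5-0.5B-Instruct
strides = (1, 896)      # layout produced by vocab_weights.t()
itemsize = 2            # bf16 / fp16

largest_offset = sum((n - 1) * s for n, s in zip(sizes, strides))
required_bytes = (largest_offset + 1) * itemsize
print(required_bytes)   # 272269312, the "required" storage size in the error message

# The reported storage size is 0, i.e. the weight tensor the fused kernel sees
# is an empty shell with no allocated data at that point in compute_log_prob.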

I use this script:
set -x
export CUDA_VISIBLE_DEVICES=7
export HYDRA_FULL_ERROR=1

PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
    data.train_files=./data/gsm8k/train.parquet \
    data.val_files=./data/gsm8k/test.parquet \
    data.train_batch_size=256 \
    data.max_prompt_length=512 \
    data.max_response_length=256 \
    actor_rollout_ref.model.path=/data/Qwen/Qwen2.5-0.5B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    critic.optim.lr=1e-5 \
    critic.model.path=/data/Qwen/Qwen2.5-0.5B-Instruct \
    critic.ppo_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.logger=['console','wandb'] \
    trainer.val_before_train=False \
    trainer.default_hdfs_dir=null \
    trainer.n_gpus_per_node=1 \
    trainer.nnodes=1 \
    trainer.save_freq=100 \
    trainer.test_freq=10 \
    trainer.total_epochs=10 \
    trainer.project_name=verl \
    trainer.experiment_name=qwen2_5-0_5b_math_gsm8k

I run the demo on a single A800 80GB GPU. I had trained this demo successfully before, but for the past few days it has consistently failed with this error, even after reinstalling the Docker image. How can I solve this problem?
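
To compare the rebuilt image with the one that used to work, I am collecting package versions in both with the snippet below (the package list is just my guess at what matters; verl itself runs from the /workspace/verl/verl checkout shown in the traceback, so I also print its git commit):

# Snapshot the environment so the old and new Docker images can be diffed.
# The package list is an assumption; extend it as needed.
import subprocess
from importlib.metadata import version, PackageNotFoundError

for pkg in ["verl", "torch", "vllm", "transformers", "flash-attn", "ray"]:
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed as a package")

# Commit of the verl source checkout (path taken from the traceback).
commit = subprocess.run(["git", "-C", "/workspace/verl/verl", "rev-parse", "HEAD"],
                        capture_output=True, text=True).stdout.strip()
print(f"verl checkout: {commit or 'not a git checkout'}")

Since the failing frame is in verl/utils/experimental/torch_functional.py, I also plan to try turning the experimental fused kernel off (something like actor_rollout_ref.model.use_fused_kernels=False; I am not certain of the exact key in this version, see verl/trainer/config/ppo_trainer.yaml) to check whether only that code path is affected.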
