Description
Training Progress: 0%| | 0/290 [00:09<?, ?it/s]
Error executing job with overrides: ['data.train_files=./data/gsm8k/train.parquet', 'data.val_files=./data/gsm8k/test.parquet', 'data.train_batch_size=256', 'data.max_prompt_length=512', 'data.max_response_length=256', 'actor_rollout_ref.model.path=/data/Qwen/Qwen2.5-0.5B-Instruct', 'actor_rollout_ref.actor.optim.lr=1e-6', 'actor_rollout_ref.actor.ppo_mini_batch_size=64', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4', 'actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8', 'actor_rollout_ref.rollout.tensor_model_parallel_size=1', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.4', 'actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4', 'critic.optim.lr=1e-5', 'critic.model.path=/data/Qwen/Qwen2.5-0.5B-Instruct', 'critic.ppo_micro_batch_size_per_gpu=4', 'algorithm.kl_ctrl.kl_coef=0.001', 'trainer.logger=[console,wandb]', 'trainer.val_before_train=False', 'trainer.default_hdfs_dir=null', 'trainer.n_gpus_per_node=1', 'trainer.nnodes=1', 'trainer.save_freq=100', 'trainer.test_freq=10', 'trainer.total_epochs=10', 'trainer.project_name=verl', 'trainer.experiment_name=qwen2_5-0_5b_math_gsm8k']
Traceback (most recent call last):
File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/workspace/verl/verl/verl/trainer/main_ppo.py", line 246, in
main()
File "/usr/local/lib/python3.10/dist-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 458, in
lambda: hydra.run(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/workspace/verl/verl/verl/trainer/main_ppo.py", line 64, in main
run_ppo(config)
File "/workspace/verl/verl/verl/trainer/main_ppo.py", line 76, in run_ppo
ray.get(runner.run.remote(config))
File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2771, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 919, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::TaskRunner.run() (pid=39668, ip=10.249.46.87, actor_id=20407956d3cbc47bae10f7b301000000, repr=<main_ppo.TaskRunner object at 0x7ec7a380d5a0>)
File "/workspace/verl/verl/verl/trainer/main_ppo.py", line 183, in run
trainer.fit()
File "/workspace/verl/verl/verl/trainer/ppo/ray_trainer.py", line 960, in fit
old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
File "/workspace/verl/verl/verl/single_controller/ray/base.py", line 49, in func
output = ray.get(output)
ray.exceptions.RayTaskError(RuntimeError): ray::WorkerDict.actor_rollout_compute_log_prob() (pid=40136, ip=10.249.46.87, actor_id=45db6b3ff11f5e2edafcfb2601000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7333047b96c0>)
File "/workspace/verl/verl/verl/single_controller/ray/base.py", line 466, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
File "/workspace/verl/verl/verl/single_controller/base/decorator.py", line 501, in inner
return func(*args, **kwargs)
File "/workspace/verl/verl/verl/workers/fsdp_workers.py", line 658, in compute_log_prob
output, entropys = self.actor.compute_log_prob(data=data, calculate_entropy=True)
File "/workspace/verl/verl/verl/utils/debug/performance.py", line 78, in f
return self.log(decorated_function, *args, **kwargs)
File "/workspace/verl/verl/verl/utils/debug/performance.py", line 88, in log
output = func(*args, **kwargs)
File "/workspace/verl/verl/verl/workers/actor/dp_actor.py", line 298, in compute_log_prob
entropy, log_probs = self._forward_micro_batch(micro_batch, temperature=temperature, calculate_entropy=calculate_entropy)
File "/workspace/verl/verl/verl/workers/actor/dp_actor.py", line 215, in _forward_micro_batch
log_probs, entropy = self.fused_linear_for_ppo(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/verl/verl/verl/utils/experimental/torch_functional.py", line 211, in forward
return FusedLinearForPPOFunction.apply(
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 575, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/workspace/verl/verl/verl/utils/experimental/torch_functional.py", line 109, in forward
chunk_log_probs, chunk_entropy = _fused_linear_for_ppo_fwd(
File "/workspace/verl/verl/verl/utils/experimental/torch_functional.py", line 25, in _fused_linear_for_ppo_fwd
logits = (hidden_states @ vocab_weights.t()) / temperature
RuntimeError: setStorage: sizes [896, 151936], strides [1, 896], storage offset 0, and itemsize 2 requiring a storage size of 272269312 are out of bounds for storage of size 0
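For context, this "setStorage: ... out of bounds for storage of size 0" message is the generic PyTorch bounds check that fires when a view with the given sizes/strides is laid over a tensor whose underlying storage has been freed to 0 bytes, which is what FSDP does to a parameter's unsharded storage after resharding. Below is a minimal sketch of my own (not the verl code path) that reproduces the same class of error with the lm_head dimensions from the message:

```python
import torch

# Hypothetical repro, not the verl code path: a buffer whose storage has been
# freed (as FSDP does to the unsharded flat param after resharding), then
# viewed with the transposed lm_head sizes/strides from the error above.
flat = torch.empty(151936 * 896, dtype=torch.bfloat16)
flat.untyped_storage().resize_(0)  # storage is now 0 bytes

# Raises: RuntimeError: setStorage: sizes [896, 151936], strides [1, 896], ...
# requiring a storage size of 272269312 are out of bounds for storage of size 0
w_t = flat.as_strided((896, 151936), (1, 896))
```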
I use this script:
set -x
export CUDA_VISIBLE_DEVICES=7
export HYDRA_FULL_ERROR=1
PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
    data.train_files=./data/gsm8k/train.parquet \
    data.val_files=./data/gsm8k/test.parquet \
    data.train_batch_size=256 \
    data.max_prompt_length=512 \
    data.max_response_length=256 \
    actor_rollout_ref.model.path=/data/Qwen/Qwen2.5-0.5B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    critic.optim.lr=1e-5 \
    critic.model.path=/data/Qwen/Qwen2.5-0.5B-Instruct \
    critic.ppo_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.logger=['console','wandb'] \
    trainer.val_before_train=False \
    trainer.default_hdfs_dir=null \
    trainer.n_gpus_per_node=1 \
    trainer.nnodes=1 \
    trainer.save_freq=100 \
    trainer.test_freq=10 \
    trainer.total_epochs=10 \
    trainer.project_name=verl \
    trainer.experiment_name=qwen2_5-0_5b_math_gsm8k
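Independent of verl, here is a quick sanity check I can run on the same machine (a minimal sketch, assuming the local checkpoint path is still intact) to confirm that the base checkpoint loads and that its dimensions match the [896, 151936] view in the error:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "/data/Qwen/Qwen2.5-0.5B-Instruct"
tok = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16).cuda()

print(model.config.hidden_size, model.config.vocab_size)  # expect 896, 151936
ids = tok("1 + 1 =", return_tensors="pt").to("cuda")
with torch.no_grad():
    out = model(**ids)
print(out.logits.shape)  # (1, seq_len, 151936) if the checkpoint is intact
```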
I run the demo on a single A800-80GB GPU. I had trained this demo successfully before, but over the last few days it suddenly stopped training. I have already reinstalled the Docker image. How can I solve this problem?