Description
The settings are as follows:
```bash
deepspeed --master_port 12346 main.py \
   --data_path yitingxie/rlhf-reward-datasets \
   --data_split 2,4,4 \
   --actor_model_name_or_path $ACTOR_MODEL_PATH \
   --critic_model_name_or_path $CRITIC_MODEL_PATH \
   --num_padding_at_beginning 1 \
   --per_device_train_batch_size 4 \
   --per_device_mini_train_batch_size 4 \
   --generation_batch_numbers 1 \
   --ppo_epochs 1 \
   --max_answer_seq_len 128 \
   --max_prompt_seq_len 128 \
   --actor_learning_rate ${Actor_Lr} \
   --critic_learning_rate ${Critic_Lr} \
   --actor_weight_decay 0.1 \
   --critic_weight_decay 0.1 \
   --num_train_epochs 1 \
   --lr_scheduler_type cosine \
   --gradient_accumulation_steps 1 \
   --num_warmup_steps 100 \
   --deepspeed --seed 1234 \
   --actor_zero_stage $ACTOR_ZERO_STAGE \
   --critic_zero_stage $CRITIC_ZERO_STAGE \
   --output_dir $OUTPUT \
   &> $OUTPUT/training.log
```
Running log (output from multiple ranks is interleaved):
`|E2E latency=6.48s |Gather latency=0.19s (2.87%) |Generate time=2.96s (45.67%) |Training time=1.90s (29.33%) |Others=1.62 (25.00%)|CurSamplesPerSec=2.47 |AvgSamplesPerSec=2.35
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "main.py", line 526, in
File "main.py", line 526, in
File "main.py", line 526, in
Traceback (most recent call last):
File "main.py", line 526, in
main()main()main()
File "main.py", line 449, in main
File "main.py", line 449, in main
File "main.py", line 449, in main
main()
File "main.py", line 449, in main
actor_loss, critic_loss = trainer.train_rlhf(exp_data)actor_loss, critic_loss = trainer.train_rlhf(exp_data)
File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf
actor_loss, critic_loss = trainer.train_rlhf(exp_data) File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf
File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf
self.actor_model.step()self.actor_model.step()
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step
self.actor_model.step()
actor_loss, critic_loss = trainer.train_rlhf(exp_data)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step
File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf
self.actor_model.step()super().step(lr_kwargs=lr_kwargs)super().step(lr_kwargs=lr_kwargs)
super().step(lr_kwargs=lr_kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step
super().step(lr_kwargs=lr_kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step
self._take_model_step(lr_kwargs)self._take_model_step(lr_kwargs)self._take_model_step(lr_kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step
self._take_model_step(lr_kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step
self.optimizer.step()
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.step()
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.step()ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step
self.optimizer.step()
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step
if self._overflow_check_and_loss_scale_update():
if self._overflow_check_and_loss_scale_update(): File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
if self._overflow_check_and_loss_scale_update():
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
ret_val = func(*args, **kwargs)ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
if self._overflow_check_and_loss_scale_update():
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
self._update_scale(self.overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale
self._update_scale(self.overflow)
self._update_scale(self.overflow) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale
self._update_scale(self.overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale
self.loss_scaler.update_scale(has_overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale
self.loss_scaler.update_scale(has_overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale
self.loss_scaler.update_scale(has_overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale
raise Exception(
Exception : raise Exception(Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.
self.loss_scaler.update_scale(has_overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale
Exception : raise Exception(Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.
Exception: Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.
raise Exception(
Exception: Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.`
How can I solve this problem?