Skip to content

step3 failed actor opt_1.3b critic opt_350m Exception: Current loss scale already at minimum - cannot decrease scale anymore. Exiting run #419

Open
@BaiStone2017

Description

@BaiStone2017

The settings are as follows:
```bash
deepspeed --master_port 12346 main.py \
   --data_path yitingxie/rlhf-reward-datasets \
   --data_split 2,4,4 \
   --actor_model_name_or_path $ACTOR_MODEL_PATH \
   --critic_model_name_or_path $CRITIC_MODEL_PATH \
   --num_padding_at_beginning 1 \
   --per_device_train_batch_size 4 \
   --per_device_mini_train_batch_size 4 \
   --generation_batch_numbers 1 \
   --ppo_epochs 1 \
   --max_answer_seq_len 128 \
   --max_prompt_seq_len 128 \
   --actor_learning_rate ${Actor_Lr} \
   --critic_learning_rate ${Critic_Lr} \
   --actor_weight_decay 0.1 \
   --critic_weight_decay 0.1 \
   --num_train_epochs 1 \
   --lr_scheduler_type cosine \
   --gradient_accumulation_steps 1 \
   --num_warmup_steps 100 \
   --deepspeed --seed 1234 \
   --actor_zero_stage $ACTOR_ZERO_STAGE \
   --critic_zero_stage $CRITIC_ZERO_STAGE \
   --output_dir $OUTPUT \
   &> $OUTPUT/training.log
```

running log:
`|E2E latency=6.48s |Gather latency=0.19s (2.87%) |Generate time=2.96s (45.67%) |Training time=1.90s (29.33%) |Others=1.62 (25.00%)|CurSamplesPerSec=2.47 |AvgSamplesPerSec=2.35
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "main.py", line 526, in
File "main.py", line 526, in
File "main.py", line 526, in
Traceback (most recent call last):
File "main.py", line 526, in
main()main()main()

File "main.py", line 449, in main
File "main.py", line 449, in main
File "main.py", line 449, in main
main()
File "main.py", line 449, in main
actor_loss, critic_loss = trainer.train_rlhf(exp_data)actor_loss, critic_loss = trainer.train_rlhf(exp_data)

  File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf

actor_loss, critic_loss = trainer.train_rlhf(exp_data) File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf

File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf
self.actor_model.step()self.actor_model.step()

  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step
self.actor_model.step()
actor_loss, critic_loss = trainer.train_rlhf(exp_data)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step
File "/home/vca1/bl/projects/chatGPT/tmp/DeepSpeedExamples-master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py", line 173, in train_rlhf
self.actor_model.step()super().step(lr_kwargs=lr_kwargs)super().step(lr_kwargs=lr_kwargs)

super().step(lr_kwargs=lr_kwargs) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py", line 397, in step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step
super().step(lr_kwargs=lr_kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1988, in step
self._take_model_step(lr_kwargs)self._take_model_step(lr_kwargs)self._take_model_step(lr_kwargs)

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step
self._take_model_step(lr_kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1895, in _take_model_step
self.optimizer.step()
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.step()
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.step()ret_val = func(*args, **kwargs)

File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step
self.optimizer.step()
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1744, in step
if self._overflow_check_and_loss_scale_update():
if self._overflow_check_and_loss_scale_update(): File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn

  File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn

if self._overflow_check_and_loss_scale_update():
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
ret_val = func(*args, **kwargs)ret_val = func(*args, **kwargs)

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
if self._overflow_check_and_loss_scale_update():
File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1694, in _overflow_check_and_loss_scale_update
self._update_scale(self.overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale
self._update_scale(self.overflow)
self._update_scale(self.overflow) File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale
self._update_scale(self.overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 2009, in _update_scale
self.loss_scaler.update_scale(has_overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale
self.loss_scaler.update_scale(has_overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale
self.loss_scaler.update_scale(has_overflow)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale
raise Exception(
Exception : raise Exception(Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.

self.loss_scaler.update_scale(has_overflow)

File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 173, in update_scale
Exception : raise Exception(Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.

Exception: Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.
raise Exception(
Exception: Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.`

How can I solve this problem?

Metadata

Metadata

Labels

deespeed chat (DeepSpeed Chat) · modeling (Related to modeling questions.)

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions