Description
I want to save intermediate checkpoints during training after a specific number of steps, but I keep running into a job hang when the save happens. How can I fix it?
Torch 1.14 + CUDA 12.0, Transformer Engine 0.6
Code
for step, batch in enumerate(train_dataloader):
    batch = to_device(batch, device)
    if first_step:
        input_ids = batch["input_ids"][0]
        input_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        input_rawtext = tokenizer.convert_tokens_to_string(input_tokens)
        attention_mask = batch["attention_mask"][0]
        labels = batch["labels"][0]
        print_rank_0(f"input_raw_text: {input_rawtext}", args.global_rank)
        print_rank_0(f"input_ids: {input_ids}", args.global_rank)
        print_rank_0(f"attention_mask: {attention_mask}", args.global_rank)
        print_rank_0(f"labels: {labels}", args.global_rank)
        first_step = False
    if (step + 1) % args.num_print_steps == 0 and args.global_rank == 0:
        prof.start_profile()
        outputs = model(**batch, use_cache=False)
        prof.stop_profile()
        loss = outputs.loss
        flops = prof.get_total_flops()
        print_rank_0(f"step: {step}, loss: {loss}, TFLOPs: {flops / 10**12}")
        model.backward(loss)
        model.step()
    else:
        outputs = model(**batch, use_cache=False)
        loss = outputs.loss
        model.backward(loss)
        model.step()
    if (step + 1) % args.num_save_steps == 0:
        if args.global_rank == 0:
            if args.zero_stage == 3:
                # For zero stage 3, each gpu only has a part of the model, so we need a special save function
                save_zero_three_model(model,
                                      args.global_rank,
                                      os.path.join(args.output_dir, f"step_{step}"),
                                      zero_stage=args.zero_stage)
            else:
                save_hf_format(model, tokenizer, args, sub_folder=f"step_{step}")
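My current guess is that the hang comes from the save path: with ZeRO stage 3 the parameters are partitioned across all GPUs, so gathering them for the save is a collective operation that every rank has to join, yet the block above only lets rank 0 call save_zero_three_model. Below is a minimal sketch of how I think the save block would need to look, assuming save_zero_three_model gathers the partitioned parameters internally (e.g. via deepspeed.zero.GatheredParameters) and already restricts the actual file write to rank 0:

    if (step + 1) % args.num_save_steps == 0:
        if args.zero_stage == 3:
            # Assumption: the gather inside save_zero_three_model is collective,
            # so every rank must enter this call even though only rank 0
            # ends up writing the checkpoint to disk.
            save_zero_three_model(model,
                                  args.global_rank,
                                  os.path.join(args.output_dir, f"step_{step}"),
                                  zero_stage=args.zero_stage)
        elif args.global_rank == 0:
            # Non-ZeRO-3 case: rank 0 holds a full copy of the weights,
            # so saving only on rank 0 should be safe here.
            save_hf_format(model, tokenizer, args, sub_folder=f"step_{step}")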
Log
worker-0: [2023-04-18 20:30:48,606] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024
worker-0: [2023-04-18 20:30:52,307] [WARNING] [stage3.py:1787:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
worker-0: [2023-04-18 20:30:55,310] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1024, reducing to 512
worker-0: [2023-04-18 20:30:57,921] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=8, lr=[9.999246866958693e-06], mom=[(0.9, 0.95)]
worker-0: [2023-04-18 20:30:57,922] [INFO] [timer.py:199:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=23.706460757974455, CurrSamplesPerSec=24.52642844149013, MemAllocated=9.86GB, MaxMemAllocated=17.17GB
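Alternatively, would it be safer to let the engine write a native checkpoint? The DeepSpeed documentation says save_checkpoint must be called by every rank, not just rank 0, because each rank saves its own optimizer and parameter shard. A rough sketch of that variant (the tag name is just a placeholder):

    if (step + 1) % args.num_save_steps == 0:
        # Every rank calls save_checkpoint; DeepSpeed coordinates the shards.
        model.save_checkpoint(args.output_dir, tag=f"step_{step}")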