-
Notifications
You must be signed in to change notification settings - Fork 60
Open
Description
Hi, I got the following error when fine-tuning an abstractive BART model:
| Name | Type | Params
------------------------------------------------------------
0 | model | MBartForConditionalGeneration | 420 M
1 | loss_func | LabelSmoothingLoss | 0
------------------------------------------------------------
420 M Trainable params
0 Non-trainable params
420 M Total params
1,681.445 Total estimated model params size (MB)
Validation sanity check: 0%| | 0/2 [00:00<?, ?it/s]Traceback (most recent call last):
File "main.py", line 490, in <module>
main(main_args)
File "main.py", line 125, in main
trainer.fit(model)
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 552, in fit
self._run(model)
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 922, in _run
self._dispatch()
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 990, in _dispatch
self.accelerator.start_training(self)
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
self.training_type_plugin.start_training(trainer)
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 161, in start_training
self._results = trainer.run_stage()
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1000, in run_stage
return self._run_train()
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run_train
self._run_sanity_check(self.lightning_module)
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1122, in _run_sanity_check
self._evaluation_loop.run()
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 111, in advance
dataloader_iter, self.current_dataloader_idx, dl_max_batches, self.num_dataloaders
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 111, in advance
output = self.evaluation_step(batch, batch_idx, dataloader_idx)
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 158, in evaluation_step
output = self.trainer.accelerator.validation_step(step_kwargs)
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 211, in validation_step
return self.training_type_plugin.validation_step(*step_kwargs.values())
File "/data/env/train_env/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 178, in validation_step
return self.model.validation_step(*args, **kwargs)
File "/data/summary_to_title/transformersum/src/abstractive.py", line 709, in validation_step
cross_entropy_loss = self._step(batch)
File "/data/summary_to_title/transformersum/src/abstractive.py", line 694, in _step
outputs = self.forward(source, target, source_mask, target_mask, labels=labels)
File "/data/summary_to_title/transformersum/src/abstractive.py", line 256, in forward
loss = self.calculate_loss(prediction_scores, labels)
File "/data/summary_to_title/transformersum/src/abstractive.py", line 674, in calculate_loss
prediction_scores.view(-1, self.model.config.vocab_size), labels.view(-1)
File "/data/env/train_env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/data/summary_to_title/transformersum/src/helpers.py", line 282, in forward
return F.kl_div(output, model_prob, reduction="batchmean")
File "/data/env/train_env/lib/python3.7/site-packages/torch/nn/functional.py", line 2753, in kl_div
reduced = torch.kl_div(input, target, reduction_enum, log_target=log_target)
RuntimeError: The size of tensor a (64000) must match the size of tensor b (64001) at non-singleton dimension 1
These are the parameters I used to launch the run:
python main.py \
--mode abstractive \
--model_name_or_path vinai/bartpho-word \
--max_epochs 50 \
--model_max_length 100 \
--dataset /data/summary_to_title/transformersum/data/train/train.arrow /data/summary_to_title/transformersum/data/val/val.arrow /data/summary_to_title/transformersum/data/test/test.arrow \
--data_example_column content \
--data_summarized_column title \
--cache_file_path /data/summary_to_title/transformersum/data \
--do_train \
--do_test \
--batch_size 4 \
--val_batch_size 8 \
--weights_save_path model_weights \
--use_logger wandb \
--wandb_project bartpho_word_sum \
--no_wandb_logger_log_model \
--accumulate_grad_batches 5 \
--learning_rate 3e-4 \
--use_scheduler linear \
--warmup_steps 8000 \
--gradient_clip_val 1.0 \
--split_char ^
Metadata
Metadata
Assignees
Labels
No labels