Open
Description
Bug description
I am trying to use https://github.com/ashleve/lightning-hydra-template with the DeepSpeed strategy.
Here is my fork https://github.com/dmitrymailk/ru_lm/tree/61ab735110b3c80a3cb3d58b3d7c5c05d4cf56af
When a checkpoint is saved, I get this error: TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object
I don't think it is a pytorch-lightning problem itself, because
the error is raised in deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:
'''Copyright The Microsoft DeepSpeed Team'''
import torch
from deepspeed.utils import logger, log_dist
from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \
CheckpointEngine
class TorchCheckpointEngine(CheckpointEngine):
def __init__(self, config_params=None):
super().__init__(config_params)
def create(self, tag):
log_dist(f"[Torch] Checkpoint {tag} is about to be saved!", ranks=[0])
def save(self, state_dict, path: str):
logger.info(f"[Torch] Saving {path}...")
torch.save(state_dict, path) # <-------- THIS IS THE LINE THAT RAISES THE ERROR
logger.info(f"[Torch] Saved {path}.")
return None
def load(self, path: str, map_location=None):
logger.info(f"[Torch] Loading checkpoint from {path}...")
partition = torch.load(path, map_location=map_location)
logger.info(f"[Torch] Loaded checkpoint from {path}.")
return partition
def commit(self, tag):
logger.info(f"[Torch] Checkpoint {tag} is ready now!")
return True
The state_dict passed to torch.save is:
{'module': OrderedDict([('_forward_module.net.model.0.weight', tensor([[-0.0206, -0.0164, -0.0273, ..., -0.0302, 0.0299, -0.0047],
[-0.0128, 0.0170, -0.0113, ..., 0.0350, 0.0331, -0.0160],
[-0.0067, 0.0077, 0.0225, ..., 0.0044, -0.0277, 0.0051],
...,
[-0.0353, -0.0296, 0.0091, ..., 0.0305, -0.0232, -0.0312],
[ 0.0211, -0.0110, 0.0163, ..., 0.0069, 0.0326, 0.0053],
[-0.0086, -0.0325, 0.0274, ..., -0.0282, 0.0301, 0.0071]],
device='cuda:2')), ('_forward_module.net.model.0.bias', tensor([ 1.6634e-02, -2.3079e-03, -3.2068e-02, 2.1141e-02, -2.0570e-02,
-1.6523e-02, 3.0869e-02, 3.1002e-02, 8.9699e-03, 8.0999e-03,
-5.3077e-03, -1.5247e-03, 1.1600e-02, -2.3882e-02, 2.0400e-02,
6.6980e-03, -1.5726e-02, 3.0201e-02, -3.0459e-02, -5.5736e-03,
-3.2227e-02, 1.6418e-02, -3.3164e-03, -8.9103e-03, -2.0807e-02,
-6.2269e-03, 2.7469e-03, 5.5335e-03, -6.0272e-03, -2.5161e-02,
1.6865e-02, -2.6231e-02, 1.9226e-02, -1.2376e-02, 2.2611e-02,
2.1642e-02, -2.1752e-02, 1.3505e-05, 7.2918e-03, 1.6172e-02,
-2.2469e-02, 2.7463e-02, 2.1199e-02, 3.4017e-02, -2.8088e-02,
-9.1580e-04, -9.2622e-03, -1.0225e-02, -1.9733e-02, -1.5048e-02,
1.3339e-02, -1.0597e-02, -5.1447e-03, 6.7623e-03, 7.7667e-03,
1.2265e-02, 3.9282e-03, -1.8352e-02, 3.9440e-03, -2.2286e-02,
-3.5115e-02, 1.9813e-02, 3.2887e-02, -8.3252e-04], device='cuda:2')), ('_forward_module.net.model.1.weight', tensor([1.0168, 1.0103, 1.0170, 0.9606, 0.9720, 1.0054, 1.0024, 0.9416, 0.9631,
1.0735, 0.9495, 0.9839, 1.0446, 0.9490, 0.9425, 0.9924, 0.9962, 1.0726,
1.0224, 1.0065, 1.0098, 1.0007, 0.9632, 0.9839, 1.0079, 0.9911, 0.9914,
1.0538, 1.0098, 0.9762, 1.0098, 0.9808, 1.0373, 0.9647, 0.9809, 0.9613,
0.9817, 0.9978, 1.0200, 0.9712, 1.0052, 0.9922, 0.9766, 1.0005, 1.0585,
1.0503, 1.0387, 1.0138, 0.9679, 0.9696, 0.9906, 1.0347, 1.0640, 1.0130,
0.9582, 1.0201, 0.9812, 1.0380, 0.9930, 0.9797, 0.9500, 1.0297, 0.9632,
0.9230], device='cuda:2')), ('_forward_module.net.model.1.bias', tensor([-0.1002, 0.0113, 0.0383, -0.0181, -0.0401, -0.0249, -0.0108, -0.0561,
-0.0671, -0.0106, 0.0089, -0.0569, 0.0076, -0.0571, -0.0250, -0.0345,
-0.0179, -0.0020, -0.0343, -0.0905, -0.0323, -0.0654, -0.0434, -0.0152,
-0.0238, -0.0347, -0.0396, -0.0114, -0.0186, -0.0629, -0.0012, -0.0116,
0.0067, -0.0646, -0.0147, -0.0136, -0.0839, -0.0536, 0.0047, 0.0185,
-0.0690, -0.0223, 0.0031, -0.0109, 0.0104, -0.0046, -0.0495, -0.0645,
-0.0083, -0.0269, 0.0071, -0.0411, 0.0381, -0.0577, -0.0245, -0.0126,
-0.0421, 0.0048, -0.0383, -0.0071, -0.0592, 0.0024, -0.0674, -0.0339],
device='cuda:2')), ('_forward_module.net.model.1.running_mean', tensor([-0.2770, 0.7688, 0.9060, 0.1643, 1.0290, -0.1429, -0.4767, 1.2568,
-0.4853, -2.1052, 1.8014, 0.6222, 1.4748, -0.5060, 0.0468, 0.7766,
-0.3606, 0.4830, 1.1301, -0.8858, 0.5095, 1.7616, 1.2985, -0.7447,
1.2914, -0.5072, 1.1782, -0.6020, 0.6702, 0.0687, 0.3902, -0.0578,
0.8498, 0.5335, 0.6371, -1.8661, -0.2625, -1.0885, -0.3993, 0.1109,
0.0751, -0.1520, 1.2330, -0.5041, -0.8100, -0.6048, -0.3007, -1.1550,
-0.5078, 0.9425, 0.4095, 0.6797, -1.1699, -1.1110, 1.6058, -0.5655,
0.3667, -0.0421, -0.1447, 0.1583, -0.9060, 1.3481, -0.1281, 0.7023],
device='cuda:2')), ('_forward_module.net.model.1.running_var', tensor([2.4604, 3.6512, 2.2266, 2.6904, 1.9039, 2.1977, 2.3890, 2.4403, 2.5475,
2.8812, 2.1730, 2.3935, 4.2853, 1.6594, 2.6012, 1.7956, 2.5534, 2.8410,
3.0139, 2.4982, 2.1857, 3.6985, 1.8812, 2.1615, 3.1596, 2.6286, 1.7740,
3.0028, 3.2665, 2.3736, 3.8532, 2.2592, 1.9263, 2.4511, 3.1454, 2.2164,
2.1486, 3.0658, 4.0342, 2.3159, 3.2614, 1.7401, 2.6082, 2.1396, 2.1671,
4.3761, 2.3555, 3.5225, 2.3165, 2.7841, 2.1779, 4.6677, 2.8057, 2.6997,
1.7716, 1.9607, 2.5103, 3.0575, 2.2511, 2.5046, 1.3976, 2.5423, 2.1747,
2.3439], device='cuda:2')), ('_forward_module.net.model.1.num_batches_tracked', tensor(430, device='cuda:2')), ('_forward_module.net.model.3.weight', tensor([[ 0.0666, 0.0439, -0.0677, ..., -0.1074, 0.0113, -0.0350],
[ 0.0209, -0.0946, -0.0402, ..., -0.0770, -0.0062, 0.0970],
[-0.0347, 0.0199, -0.0589, ..., -0.0599, -0.0072, -0.0322],
...,
[-0.0400, 0.1107, 0.1408, ..., 0.0782, -0.0053, 0.0371],
[-0.0354, -0.0415, -0.0526, ..., 0.1302, -0.0607, -0.0982],
[-0.0489, -0.0171, -0.0607, ..., 0.0260, 0.0699, -0.0023]],
device='cuda:2')), ('_forward_module.net.model.3.bias', tensor([ 0.0371, -0.1201, 0.1242, -0.0917, 0.1201, 0.1125, 0.1002, -0.0976,
0.0511, -0.0445, -0.1236, 0.1131, -0.0694, -0.0412, 0.0963, 0.1085,
0.0458, -0.1029, -0.0773, -0.1115, -0.1080, -0.0374, 0.0575, 0.1134,
0.1030, -0.1213, 0.0181, -0.0951, -0.0121, 0.0057, -0.0873, -0.0170,
-0.0819, 0.0476, 0.0057, -0.0765, 0.1016, 0.0075, 0.0647, 0.0446,
-0.0293, 0.0005, -0.0510, 0.1022, -0.0609, 0.0789, 0.0106, 0.0396,
0.0795, 0.0585, 0.0888, 0.0950, 0.0060, -0.0476, -0.0320, -0.0433,
0.0468, -0.0734, -0.0697, -0.1227, 0.0337, -0.0054, -0.0873, 0.0732,
0.0415, -0.1204, -0.0345, 0.0072, 0.0551, 0.0011, -0.0251, -0.1187,
0.0916, 0.0826, 0.0491, 0.0969, 0.0570, 0.0681, -0.1145, -0.0844,
-0.0672, -0.1078, -0.1102, 0.1186, 0.0466, -0.0408, -0.0104, 0.0077,
-0.1125, 0.0287, -0.1237, -0.0869, -0.0023, 0.0279, 0.1238, -0.0968,
-0.1007, 0.0801, -0.0582, 0.0211, -0.0789, 0.0735, -0.1026, 0.0292,
0.0482, 0.0025, 0.1231, -0.1071, -0.1202, -0.0087, -0.0777, 0.0662,
0.0407, 0.1197, 0.0484, 0.1013, -0.0494, -0.1110, 0.1213, 0.0530,
0.0355, 0.0203, -0.0327, -0.1022, 0.0537, 0.0855, -0.1248, 0.1174],
device='cuda:2')), ...]), 'buffer_names': ['_forward_module.net.model.1.running_mean', '_forward_module.net.model.1.running_var', '_forward_module.net.model.1.num_batches_tracked', '_forward_module.net.model.4.running_mean', '_forward_module.net.model.4.running_var', '_forward_module.net.model.4.num_batches_tracked', '_forward_module.net.model.7.running_mean', '_forward_module.net.model.7.running_var', '_forward_module.net.model.7.num_batches_tracked'], 'optimizer': None, 'param_shapes': [OrderedDict([...])], 'lr_scheduler': None, 'data_sampler': None, 'random_ltd': None, 'sparse_tensor_module_names': {}, 'skipped_steps': 0, 'global_steps': 430, 'global_samples': 430, 'dp_world_size': 1, 'mp_world_size': 1, 'ds_config': {'zero_allow_untested_optimizer': True, 'zero_optimization': {...}, 'activation_checkpointing': {...}, 'aio': {...}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 1, 'gradient_clipping': 0.0}, ...}
What version are you seeing the problem on?
2.0+
How to reproduce the bug
git clone https://github.com/dmitrymailk/ru_lm
cd ru_lm
git checkout 61ab735110b3c80a3cb3d58b3d7c5c05d4cf56af
pip install -r requirements.txt
python src/train.py
Note: before running, you must change the devices setting in configs/trainer/deepspeed.yaml to match the GPUs on your machine.
Error messages and logs
[2023-04-14 00:57:35,767][src.utils.utils][INFO] - Enforcing tags! <cfg.extras.enforce_tags=True>
[2023-04-14 00:57:35,773][src.utils.utils][INFO] - Printing config tree with Rich! <cfg.extras.print_config=True>
[2023-04-14 00:57:35,773][src.utils.rich_utils][WARNING] - Field 'logger' not found in config. Skipping 'logger' config printing...
CONFIG
├── data
│ └── _target_: src.data.mnist_datamodule.MNISTDataModule
│ data_dir: /cephfs/home/kosenko/deepspeed/ru_lm/data/
│ batch_size: 128
│ train_val_test_split:
│ - 55000
│ - 5000
│ - 10000
│ num_workers: 0
│ pin_memory: false
│
├── model
│ └── _target_: src.models.mnist_module.MNISTLitModule
│ optimizer:
│ _target_: torch.optim.Adam
│ _partial_: true
│ lr: 0.001
│ weight_decay: 0.0
│ scheduler:
│ _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
│ _partial_: true
│ mode: min
│ factor: 0.1
│ patience: 10
│ net:
│ _target_: src.models.components.simple_dense_net.SimpleDenseNet
│ input_size: 784
│ lin1_size: 64
│ lin2_size: 128
│ lin3_size: 64
│ output_size: 10
│
├── callbacks
│ └── model_checkpoint:
│ _target_: lightning.pytorch.callbacks.ModelCheckpoint
│ dirpath: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35/checkpoints
│ filename: epoch_{epoch:03d}
│ monitor: val/acc
│ verbose: false
│ save_last: true
│ save_top_k: 1
│ mode: max
│ auto_insert_metric_name: false
│ save_weights_only: false
│ every_n_train_steps: null
│ train_time_interval: null
│ every_n_epochs: null
│ save_on_train_epoch_end: null
│ early_stopping:
│ _target_: lightning.pytorch.callbacks.EarlyStopping
│ monitor: val/acc
│ min_delta: 0.0
│ patience: 100
│ verbose: false
│ mode: max
│ strict: true
│ check_finite: true
│ stopping_threshold: null
│ divergence_threshold: null
│ check_on_train_epoch_end: null
│ model_summary:
│ _target_: lightning.pytorch.callbacks.RichModelSummary
│ max_depth: -1
│ rich_progress_bar:
│ _target_: lightning.pytorch.callbacks.RichProgressBar
│
├── trainer
│ └── _target_: lightning.pytorch.trainer.Trainer
│ default_root_dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35
│ min_epochs: 1
│ max_epochs: 2
│ check_val_every_n_epoch: 1
│ deterministic: false
│ accelerator: gpu
│ devices:
│ - 2
│ strategy: deepspeed
│
├── paths
│ └── root_dir: /cephfs/home/kosenko/deepspeed/ru_lm
│ data_dir: /cephfs/home/kosenko/deepspeed/ru_lm/data/
│ log_dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/
│ output_dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35
│ work_dir: /cephfs/home/kosenko/deepspeed/ru_lm
│
├── extras
│ └── ignore_warnings: false
│ enforce_tags: true
│ print_config: true
│
├── task_name
│ └── train
├── tags
│ └── ['dev']
├── train
│ └── True
├── test
│ └── False
├── compile
│ └── False
├── ckpt_path
│ └── None
└── seed
└── None
[2023-04-14 00:57:35,822][__main__][INFO] - Instantiating datamodule <src.data.mnist_datamodule.MNISTDataModule>
[2023-04-14 00:57:35,825][__main__][INFO] - Instantiating model <src.models.mnist_module.MNISTLitModule>
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:197: UserWarning: Attribute 'net' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['net'])`.
rank_zero_warn(
[2023-04-14 00:57:35,886][__main__][INFO] - Instantiating callbacks...
[2023-04-14 00:57:35,886][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.ModelCheckpoint>
[2023-04-14 00:57:35,890][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.EarlyStopping>
[2023-04-14 00:57:35,891][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.RichModelSummary>
[2023-04-14 00:57:35,891][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.RichProgressBar>
[2023-04-14 00:57:35,892][__main__][INFO] - Instantiating loggers...
[2023-04-14 00:57:35,892][src.utils.instantiators][WARNING] - No logger configs found! Skipping...
[2023-04-14 00:57:35,893][__main__][INFO] - Instantiating trainer <lightning.pytorch.trainer.Trainer>
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[2023-04-14 00:57:36,181][__main__][INFO] - Starting training!
initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1
[2023-04-14 00:57:36,548][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0
[2023-04-14 00:57:36,548][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.
[2023-04-14 00:57:36,548] [WARNING] [deepspeed.py:637:_auto_select_batch_size] Tried to infer the batch size for internal deepspeed logging from the `train_dataloader()`. To ensure DeepSpeed logging remains correct, please manually pass the plugin with the batch size, `Trainer(strategy=DeepSpeedStrategy(logging_batch_size_per_gpu=batch_size))`.
You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
[2023-04-14 00:57:38,095][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:2 to store for rank: 0
[2023-04-14 00:57:38,095][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:2 with 1 nodes.
Using /cephfs/home/kosenko/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Emitting ninja build file /cephfs/home/kosenko/.cache/torch_extensions/py310_cu118/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.10430693626403809 seconds
Rank: 0 partition count [1] and sizes[(67978, False)]
Using /cephfs/home/kosenko/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0004889965057373047 seconds
┏━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
┃ ┃ Name ┃ Type ┃ Params ┃
┡━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
│ 0 │ net │ SimpleDenseNet │ 68.0 K │
│ 1 │ net.model │ Sequential │ 68.0 K │
│ 2 │ net.model.0 │ Linear │ 50.2 K │
│ 3 │ net.model.1 │ BatchNorm1d │ 128 │
│ 4 │ net.model.2 │ ReLU │ 0 │
│ 5 │ net.model.3 │ Linear │ 8.3 K │
│ 6 │ net.model.4 │ BatchNorm1d │ 256 │
│ 7 │ net.model.5 │ ReLU │ 0 │
│ 8 │ net.model.6 │ Linear │ 8.3 K │
│ 9 │ net.model.7 │ BatchNorm1d │ 128 │
│ 10 │ net.model.8 │ ReLU │ 0 │
│ 11 │ net.model.9 │ Linear │ 650 │
│ 12 │ criterion │ CrossEntropyLoss │ 0 │
│ 13 │ train_acc │ MulticlassAccuracy │ 0 │
│ 14 │ val_acc │ MulticlassAccuracy │ 0 │
│ 15 │ test_acc │ MulticlassAccuracy │ 0 │
│ 16 │ train_loss │ MeanMetric │ 0 │
│ 17 │ val_loss │ MeanMetric │ 0 │
│ 18 │ test_loss │ MeanMetric │ 0 │
│ 19 │ val_acc_best │ MaxMetric │ 0 │
└────┴──────────────┴────────────────────┴────────┘
Trainable params: 68.0 K
Non-trainable params: 0
Total params: 68.0 K
Total estimated model params size (MB): 0
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:430:
PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing
the value of the `num_workers` argument` (try 96 which is the number of cpus on this machine) in the `DataLoader` init to
improve performance.
rank_zero_warn(
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:432:
PossibleUserWarning: It is recommended to use `self.log('val/acc_best', ..., sync_dist=True)` when logging on epoch level in
distributed setting to accumulate the metric across devices.
warning_cache.warn(
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:430:
PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider
increasing the value of the `num_workers` argument` (try 96 which is the number of cpus on this machine) in the `DataLoader`
init to improve performance.
rank_zero_warn(
/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being
deprecated, use kwargs instead. Refer to
https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
warnings.warn(
Epoch 0/1 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 430/430 0:00:13 • 0:00:00 32.62it/s val/loss: 0.114 val/acc: 0.967
val/acc_best: 0.967 train/loss: 0.327
train/acc: 0.92
[2023-04-14 00:57:54,771][src.utils.utils][ERROR] -
Traceback (most recent call last):
File "/cephfs/home/kosenko/deepspeed/ru_lm/src/utils/utils.py", line 65, in wrap
metric_dict, object_dict = task_func(cfg=cfg)
File "/cephfs/home/kosenko/deepspeed/ru_lm/src/train.py", line 89, in train
trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 520, in fit
call._call_and_handle_interrupt(
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 42, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 92, in launch
return function(*args, **kwargs)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 559, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 935, in _run
results = self._run_stage()
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 978, in _run_stage
self.fit_loop.run()
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
self.on_advance_end()
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 369, in on_advance_end
call._call_callback_hooks(trainer, "on_train_epoch_end", monitoring_callbacks=True)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 190, in _call_callback_hooks
fn(trainer, trainer.lightning_module, *args, **kwargs)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 303, in on_train_epoch_end
self._save_topk_checkpoint(trainer, monitor_candidates)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 360, in _save_topk_checkpoint
self._save_monitor_checkpoint(trainer, monitor_candidates)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 658, in _save_monitor_checkpoint
self._update_best_and_save(current, trainer, monitor_candidates)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 709, in _update_best_and_save
self._save_checkpoint(trainer, filepath)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 365, in _save_checkpoint
trainer.save_checkpoint(filepath, self.save_weights_only)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1262, in save_checkpoint
self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 500, in save_checkpoint
self.trainer.strategy.save_checkpoint(_checkpoint, filepath, storage_options=storage_options)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/deepspeed.py", line 772, in save_checkpoint
self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3133, in save_checkpoint
self._save_checkpoint(save_dir, tag, client_state=client_state)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3345, in _save_checkpoint
self.checkpoint_engine.save(state, save_path)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save
torch.save(state_dict, path)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 441, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 653, in _save
pickler.dump(obj)
TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object
[2023-04-14 00:57:54,775][src.utils.utils][INFO] - Output dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35
Error executing job with overrides: []
Traceback (most recent call last):
File "/cephfs/home/kosenko/deepspeed/ru_lm/src/train.py", line 117, in main
metric_dict, _ = train(cfg)
File "/cephfs/home/kosenko/deepspeed/ru_lm/src/utils/utils.py", line 75, in wrap
raise ex
File "/cephfs/home/kosenko/deepspeed/ru_lm/src/utils/utils.py", line 65, in wrap
metric_dict, object_dict = task_func(cfg=cfg)
File "/cephfs/home/kosenko/deepspeed/ru_lm/src/train.py", line 89, in train
trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 520, in fit
call._call_and_handle_interrupt(
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 42, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 92, in launch
return function(*args, **kwargs)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 559, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 935, in _run
results = self._run_stage()
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 978, in _run_stage
self.fit_loop.run()
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
self.on_advance_end()
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 369, in on_advance_end
call._call_callback_hooks(trainer, "on_train_epoch_end", monitoring_callbacks=True)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 190, in _call_callback_hooks
fn(trainer, trainer.lightning_module, *args, **kwargs)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 303, in on_train_epoch_end
self._save_topk_checkpoint(trainer, monitor_candidates)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 360, in _save_topk_checkpoint
self._save_monitor_checkpoint(trainer, monitor_candidates)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 658, in _save_monitor_checkpoint
self._update_best_and_save(current, trainer, monitor_candidates)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 709, in _update_best_and_save
self._save_checkpoint(trainer, filepath)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 365, in _save_checkpoint
trainer.save_checkpoint(filepath, self.save_weights_only)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1262, in save_checkpoint
self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 500, in save_checkpoint
self.trainer.strategy.save_checkpoint(_checkpoint, filepath, storage_options=storage_options)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/deepspeed.py", line 772, in save_checkpoint
self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3133, in save_checkpoint
self._save_checkpoint(save_dir, tag, client_state=client_state)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3345, in _save_checkpoint
self.checkpoint_engine.save(state, save_path)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save
torch.save(state_dict, path)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 441, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 653, in _save
pickler.dump(obj)
TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
Environment
Current environment
- CUDA:
- GPU:
- NVIDIA A100-SXM4-40GB
- NVIDIA A100-SXM4-40GB
- NVIDIA A100-SXM4-40GB
- NVIDIA A100-SXM4-40GB
- available: True
- version: 11.8
- Lightning:
- lightning: 2.0.1.post0
- lightning-cloud: 0.5.33
- lightning-colossalai: 0.1.0
- lightning-utilities: 0.8.0
- pytorch-lightning: 2.0.1.post0
- torch: 2.0.0+cu118
- torchaudio: 2.0.1+cu118
- torchmetrics: 0.11.4
- torchvision: 0.15.1+cu118
- Packages:
- absl-py: 1.4.0
- accelerate: 0.18.0
- aiofiles: 23.1.0
- aiohttp: 3.8.4
- aiosignal: 1.3.1
- alembic: 1.10.3
- altair: 4.2.2
- antlr4-python3-runtime: 4.9.3
- anyio: 3.6.2
- apex: 0.1
- appdirs: 1.4.4
- arrow: 1.2.3
- asttokens: 2.2.1
- async-timeout: 4.0.2
- attrs: 22.2.0
- autopage: 0.5.1
- backcall: 0.2.0
- backports.functools-lru-cache: 1.6.4
- bcrypt: 4.0.1
- beautifulsoup4: 4.12.2
- bitsandbytes: 0.37.2
- black: 23.3.0
- blessed: 1.20.0
- boltons: 23.0.0
- brotlipy: 0.7.0
- cachetools: 5.3.0
- certifi: 2022.12.7
- cffi: 1.15.1
- cfgv: 3.3.1
- charset-normalizer: 2.0.4
- click: 8.1.3
- cliff: 4.2.0
- cmaes: 0.9.1
- cmake: 3.25.0
- cmd2: 2.4.3
- colorlog: 6.7.0
- colossalai: 0.2.8
- conda: 23.3.1
- conda-content-trust: 0.1.3
- conda-package-handling: 2.0.2
- conda-package-streaming: 0.7.0
- contexttimer: 0.3.3
- contourpy: 1.0.7
- croniter: 1.3.14
- cryptography: 38.0.4
- cycler: 0.11.0
- datasets: 2.11.0
- dateutils: 0.6.12
- debugpy: 1.5.1
- decorator: 5.1.1
- deepdiff: 6.3.0
- deepspeed: 0.8.3
- dill: 0.3.6
- distlib: 0.3.6
- docker-pycreds: 0.4.0
- einops: 0.6.0
- entrypoints: 0.4
- evaluate: 0.4.0
- exceptiongroup: 1.1.1
- executing: 1.2.0
- fabric: 3.0.0
- fastapi: 0.88.0
- ffmpy: 0.3.0
- filelock: 3.9.0
- fire: 0.5.0
- flash-attn: 0.2.8
- flit-core: 3.8.0
- fonttools: 4.39.3
- frozenlist: 1.3.3
- fschat: 0.1.10
- fsspec: 2023.4.0
- gitdb: 4.0.10
- gitpython: 3.1.31
- gmpy2: 2.1.2
- google-auth: 2.17.3
- google-auth-oauthlib: 1.0.0
- gradio: 3.23.0
- gradio-client: 0.0.8
- greenlet: 2.0.2
- grpcio: 1.53.0
- h11: 0.14.0
- hjson: 3.1.0
- html2text: 2020.1.16
- httpcore: 0.16.3
- httpx: 0.23.3
- huggingface-hub: 0.13.4
- hydra-colorlog: 1.2.0
- hydra-core: 1.3.2
- hydra-optuna-sweeper: 1.2.0
- identify: 2.5.22
- idna: 3.4
- importlib-metadata: 6.3.0
- iniconfig: 2.0.0
- inquirer: 3.1.3
- invoke: 2.0.0
- ipykernel: 6.15.0
- ipython: 8.12.0
- itsdangerous: 2.1.2
- jedi: 0.18.2
- jinja2: 3.1.2
- joblib: 1.2.0
- jsonlines: 3.1.0
- jsonpatch: 1.32
- jsonpointer: 2.1
- jsonschema: 4.17.3
- jupyter-client: 7.3.4
- jupyter-core: 4.12.0
- kiwisolver: 1.4.4
- lightning: 2.0.1.post0
- lightning-cloud: 0.5.33
- lightning-colossalai: 0.1.0
- lightning-utilities: 0.8.0
- linkify-it-py: 2.0.0
- lit: 15.0.7
- loralib: 0.1.1
- mako: 1.2.4
- markdown: 3.4.3
- markdown-it-py: 2.2.0
- markdown2: 2.4.8
- markupsafe: 2.1.1
- matplotlib: 3.7.1
- matplotlib-inline: 0.1.6
- mdit-py-plugins: 0.3.3
- mdurl: 0.1.2
- mkl-fft: 1.3.1
- mkl-random: 1.2.2
- mkl-service: 2.4.0
- mpmath: 1.2.1
- multidict: 6.0.4
- multiprocess: 0.70.14
- mypy-extensions: 1.0.0
- nest-asyncio: 1.5.6
- networkx: 2.8.4
- ninja: 1.11.1
- nodeenv: 1.7.0
- numpy: 1.23.5
- nvidia-cublas-cu11: 11.10.3.66
- nvidia-cuda-nvrtc-cu11: 11.7.99
- nvidia-cuda-runtime-cu11: 11.7.99
- nvidia-cudnn-cu11: 8.5.0.96
- oauthlib: 3.2.2
- omegaconf: 2.3.0
- optuna: 2.10.1
- ordered-set: 4.1.0
- orjson: 3.8.10
- packaging: 23.0
- pandas: 2.0.0
- paramiko: 3.1.0
- parso: 0.8.3
- pathspec: 0.11.1
- pathtools: 0.1.2
- pbr: 5.11.1
- peft: 0.3.0.dev0
- pexpect: 4.8.0
- pickleshare: 0.7.5
- pillow: 9.4.0
- pip: 22.3.1
- platformdirs: 3.2.0
- pluggy: 1.0.0
- pre-commit: 3.2.2
- prettytable: 3.7.0
- prompt-toolkit: 3.0.38
- protobuf: 3.20.3
- psutil: 5.9.4
- ptyprocess: 0.7.0
- pure-eval: 0.2.2
- py-cpuinfo: 9.0.0
- pyarrow: 11.0.0
- pyasn1: 0.4.8
- pyasn1-modules: 0.2.8
- pycosat: 0.6.4
- pycparser: 2.21
- pydantic: 1.10.7
- pydeprecate: 0.3.2
- pydub: 0.25.1
- pygments: 2.14.0
- pyjwt: 2.6.0
- pynacl: 1.5.0
- pyopenssl: 22.0.0
- pyparsing: 3.0.9
- pyperclip: 1.8.2
- pyrootutils: 1.0.4
- pyrsistent: 0.19.3
- pysocks: 1.7.1
- pytest: 7.3.0
- python-dateutil: 2.8.2
- python-dotenv: 1.0.0
- python-editor: 1.0.4
- python-multipart: 0.0.6
- pytorch-lightning: 2.0.1.post0
- pytz: 2023.3
- pyyaml: 6.0
- pyzmq: 23.2.0
- readchar: 4.0.5
- regex: 2023.3.23
- requests: 2.28.1
- requests-oauthlib: 1.3.1
- responses: 0.18.0
- rfc3986: 1.5.0
- rich: 13.3.3
- rsa: 4.9
- ruamel.yaml: 0.17.21
- ruamel.yaml.clib: 0.2.6
- safetensors: 0.3.0
- scikit-learn: 1.2.2
- scipy: 1.10.1
- semantic-version: 2.10.0
- sentencepiece: 0.1.97
- sentry-sdk: 1.19.1
- setproctitle: 1.3.2
- setuptools: 65.6.3
- six: 1.16.0
- smmap: 5.0.0
- sniffio: 1.3.0
- soupsieve: 2.4
- sqlalchemy: 2.0.9
- stack-data: 0.6.2
- starlette: 0.22.0
- starsessions: 1.3.0
- stevedore: 5.0.0
- svgwrite: 1.4.3
- sympy: 1.11.1
- tensorboard: 2.12.2
- tensorboard-data-server: 0.7.0
- tensorboard-plugin-wit: 1.8.1
- termcolor: 2.2.0
- threadpoolctl: 3.1.0
- tokenize-rt: 5.0.0
- tokenizers: 0.13.3
- tomli: 2.0.1
- toolz: 0.12.0
- torch: 2.0.0+cu118
- torchaudio: 2.0.1+cu118
- torchmetrics: 0.11.4
- torchvision: 0.15.1+cu118
- tornado: 6.1
- tqdm: 4.64.1
- traitlets: 5.9.0
- transformers: 4.28.0.dev0
- triton: 2.0.0
- typing-extensions: 4.4.0
- tzdata: 2023.3
- uc-micro-py: 1.0.1
- urllib3: 1.26.14
- uvicorn: 0.21.1
- virtualenv: 20.21.0
- wandb: 0.14.2
- wavedrom: 2.0.3.post3
- wcwidth: 0.2.6
- websocket-client: 1.5.1
- websockets: 11.0.1
- werkzeug: 2.2.3
- wheel: 0.37.1
- xxhash: 3.2.0
- yarl: 1.8.2
- zipp: 3.15.0
- zstandard: 0.18.0
- System:
- OS: Linux
- architecture:
- 64bit
- ELF
- processor: x86_64
- python: 3.10.9
- version: #76~20.04.1-Ubuntu SMP Mon Mar 20 15:54:19 UTC 2023
More info
No response
cc @awaelchli