Skip to content

deepspeed strategy can't save checkpoint, TypeError: cannot pickle torch._C._distributed_c10d.ProcessGroup object  #17369

Open
@dmitrymailk

Description

@dmitrymailk

Bug description

I am trying to use https://github.com/ashleve/lightning-hydra-template with the deepspeed strategy.
Here is my fork https://github.com/dmitrymailk/ru_lm/tree/61ab735110b3c80a3cb3d58b3d7c5c05d4cf56af

And I got this error: TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object

I don't think that it's a pytorch-lightning problem itself, because
the error is raised in deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py

'''Copyright The Microsoft DeepSpeed Team'''

import torch
from deepspeed.utils import logger, log_dist
from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \
    CheckpointEngine


class TorchCheckpointEngine(CheckpointEngine):
    """Checkpoint engine that writes and reads checkpoints with
    ``torch.save`` and ``torch.load``, logging each step."""

    def __init__(self, config_params=None):
        # No engine-specific state; initialization is delegated to the base.
        super().__init__(config_params)

    def create(self, tag):
        """Log (via ``log_dist`` with ``ranks=[0]``) that checkpoint ``tag``
        is about to be saved."""
        message = f"[Torch] Checkpoint {tag} is about to be saved!"
        log_dist(message, ranks=[0])

    def save(self, state_dict, path: str):
        """Write ``state_dict`` to ``path`` with ``torch.save``.

        Everything reachable from ``state_dict`` is handed to ``torch.save``
        as-is. Returns ``None``.
        """
        logger.info(f"[Torch] Saving {path}...")
        torch.save(state_dict, path)  # <--------  THIS IS  LINE WITH ERROR
        logger.info(f"[Torch] Saved {path}.")

    def load(self, path: str, map_location=None):
        """Read the object stored at ``path`` with ``torch.load``, forwarding
        ``map_location``, and return it."""
        logger.info(f"[Torch] Loading checkpoint from {path}...")
        loaded = torch.load(path, map_location=map_location)
        logger.info(f"[Torch] Loaded checkpoint from {path}.")
        return loaded

    def commit(self, tag):
        """Log that checkpoint ``tag`` is ready and return ``True``."""
        logger.info(f"[Torch] Checkpoint {tag} is ready now!")
        return True

state_dict is

{'module': OrderedDict([('_forward_module.net.model.0.weight', tensor([[-0.0206, -0.0164, -0.0273,  ..., -0.0302,  0.0299, -0.0047],
        [-0.0128,  0.0170, -0.0113,  ...,  0.0350,  0.0331, -0.0160],
        [-0.0067,  0.0077,  0.0225,  ...,  0.0044, -0.0277,  0.0051],
        ...,
        [-0.0353, -0.0296,  0.0091,  ...,  0.0305, -0.0232, -0.0312],
        [ 0.0211, -0.0110,  0.0163,  ...,  0.0069,  0.0326,  0.0053],
        [-0.0086, -0.0325,  0.0274,  ..., -0.0282,  0.0301,  0.0071]],
       device='cuda:2')), ('_forward_module.net.model.0.bias', tensor([ 1.6634e-02, -2.3079e-03, -3.2068e-02,  2.1141e-02, -2.0570e-02,
        -1.6523e-02,  3.0869e-02,  3.1002e-02,  8.9699e-03,  8.0999e-03,
        -5.3077e-03, -1.5247e-03,  1.1600e-02, -2.3882e-02,  2.0400e-02,
         6.6980e-03, -1.5726e-02,  3.0201e-02, -3.0459e-02, -5.5736e-03,
        -3.2227e-02,  1.6418e-02, -3.3164e-03, -8.9103e-03, -2.0807e-02,
        -6.2269e-03,  2.7469e-03,  5.5335e-03, -6.0272e-03, -2.5161e-02,
         1.6865e-02, -2.6231e-02,  1.9226e-02, -1.2376e-02,  2.2611e-02,
         2.1642e-02, -2.1752e-02,  1.3505e-05,  7.2918e-03,  1.6172e-02,
        -2.2469e-02,  2.7463e-02,  2.1199e-02,  3.4017e-02, -2.8088e-02,
        -9.1580e-04, -9.2622e-03, -1.0225e-02, -1.9733e-02, -1.5048e-02,
         1.3339e-02, -1.0597e-02, -5.1447e-03,  6.7623e-03,  7.7667e-03,
         1.2265e-02,  3.9282e-03, -1.8352e-02,  3.9440e-03, -2.2286e-02,
        -3.5115e-02,  1.9813e-02,  3.2887e-02, -8.3252e-04], device='cuda:2')), ('_forward_module.net.model.1.weight', tensor([1.0168, 1.0103, 1.0170, 0.9606, 0.9720, 1.0054, 1.0024, 0.9416, 0.9631,
        1.0735, 0.9495, 0.9839, 1.0446, 0.9490, 0.9425, 0.9924, 0.9962, 1.0726,
        1.0224, 1.0065, 1.0098, 1.0007, 0.9632, 0.9839, 1.0079, 0.9911, 0.9914,
        1.0538, 1.0098, 0.9762, 1.0098, 0.9808, 1.0373, 0.9647, 0.9809, 0.9613,
        0.9817, 0.9978, 1.0200, 0.9712, 1.0052, 0.9922, 0.9766, 1.0005, 1.0585,
        1.0503, 1.0387, 1.0138, 0.9679, 0.9696, 0.9906, 1.0347, 1.0640, 1.0130,
        0.9582, 1.0201, 0.9812, 1.0380, 0.9930, 0.9797, 0.9500, 1.0297, 0.9632,
        0.9230], device='cuda:2')), ('_forward_module.net.model.1.bias', tensor([-0.1002,  0.0113,  0.0383, -0.0181, -0.0401, -0.0249, -0.0108, -0.0561,
        -0.0671, -0.0106,  0.0089, -0.0569,  0.0076, -0.0571, -0.0250, -0.0345,
        -0.0179, -0.0020, -0.0343, -0.0905, -0.0323, -0.0654, -0.0434, -0.0152,
        -0.0238, -0.0347, -0.0396, -0.0114, -0.0186, -0.0629, -0.0012, -0.0116,
         0.0067, -0.0646, -0.0147, -0.0136, -0.0839, -0.0536,  0.0047,  0.0185,
        -0.0690, -0.0223,  0.0031, -0.0109,  0.0104, -0.0046, -0.0495, -0.0645,
        -0.0083, -0.0269,  0.0071, -0.0411,  0.0381, -0.0577, -0.0245, -0.0126,
        -0.0421,  0.0048, -0.0383, -0.0071, -0.0592,  0.0024, -0.0674, -0.0339],
       device='cuda:2')), ('_forward_module.net.model.1.running_mean', tensor([-0.2770,  0.7688,  0.9060,  0.1643,  1.0290, -0.1429, -0.4767,  1.2568,
        -0.4853, -2.1052,  1.8014,  0.6222,  1.4748, -0.5060,  0.0468,  0.7766,
        -0.3606,  0.4830,  1.1301, -0.8858,  0.5095,  1.7616,  1.2985, -0.7447,
         1.2914, -0.5072,  1.1782, -0.6020,  0.6702,  0.0687,  0.3902, -0.0578,
         0.8498,  0.5335,  0.6371, -1.8661, -0.2625, -1.0885, -0.3993,  0.1109,
         0.0751, -0.1520,  1.2330, -0.5041, -0.8100, -0.6048, -0.3007, -1.1550,
        -0.5078,  0.9425,  0.4095,  0.6797, -1.1699, -1.1110,  1.6058, -0.5655,
         0.3667, -0.0421, -0.1447,  0.1583, -0.9060,  1.3481, -0.1281,  0.7023],
       device='cuda:2')), ('_forward_module.net.model.1.running_var', tensor([2.4604, 3.6512, 2.2266, 2.6904, 1.9039, 2.1977, 2.3890, 2.4403, 2.5475,
        2.8812, 2.1730, 2.3935, 4.2853, 1.6594, 2.6012, 1.7956, 2.5534, 2.8410,
        3.0139, 2.4982, 2.1857, 3.6985, 1.8812, 2.1615, 3.1596, 2.6286, 1.7740,
        3.0028, 3.2665, 2.3736, 3.8532, 2.2592, 1.9263, 2.4511, 3.1454, 2.2164,
        2.1486, 3.0658, 4.0342, 2.3159, 3.2614, 1.7401, 2.6082, 2.1396, 2.1671,
        4.3761, 2.3555, 3.5225, 2.3165, 2.7841, 2.1779, 4.6677, 2.8057, 2.6997,
        1.7716, 1.9607, 2.5103, 3.0575, 2.2511, 2.5046, 1.3976, 2.5423, 2.1747,
        2.3439], device='cuda:2')), ('_forward_module.net.model.1.num_batches_tracked', tensor(430, device='cuda:2')), ('_forward_module.net.model.3.weight', tensor([[ 0.0666,  0.0439, -0.0677,  ..., -0.1074,  0.0113, -0.0350],
        [ 0.0209, -0.0946, -0.0402,  ..., -0.0770, -0.0062,  0.0970],
        [-0.0347,  0.0199, -0.0589,  ..., -0.0599, -0.0072, -0.0322],
        ...,
        [-0.0400,  0.1107,  0.1408,  ...,  0.0782, -0.0053,  0.0371],
        [-0.0354, -0.0415, -0.0526,  ...,  0.1302, -0.0607, -0.0982],
        [-0.0489, -0.0171, -0.0607,  ...,  0.0260,  0.0699, -0.0023]],
       device='cuda:2')), ('_forward_module.net.model.3.bias', tensor([ 0.0371, -0.1201,  0.1242, -0.0917,  0.1201,  0.1125,  0.1002, -0.0976,
         0.0511, -0.0445, -0.1236,  0.1131, -0.0694, -0.0412,  0.0963,  0.1085,
         0.0458, -0.1029, -0.0773, -0.1115, -0.1080, -0.0374,  0.0575,  0.1134,
         0.1030, -0.1213,  0.0181, -0.0951, -0.0121,  0.0057, -0.0873, -0.0170,
        -0.0819,  0.0476,  0.0057, -0.0765,  0.1016,  0.0075,  0.0647,  0.0446,
        -0.0293,  0.0005, -0.0510,  0.1022, -0.0609,  0.0789,  0.0106,  0.0396,
         0.0795,  0.0585,  0.0888,  0.0950,  0.0060, -0.0476, -0.0320, -0.0433,
         0.0468, -0.0734, -0.0697, -0.1227,  0.0337, -0.0054, -0.0873,  0.0732,
         0.0415, -0.1204, -0.0345,  0.0072,  0.0551,  0.0011, -0.0251, -0.1187,
         0.0916,  0.0826,  0.0491,  0.0969,  0.0570,  0.0681, -0.1145, -0.0844,
        -0.0672, -0.1078, -0.1102,  0.1186,  0.0466, -0.0408, -0.0104,  0.0077,
        -0.1125,  0.0287, -0.1237, -0.0869, -0.0023,  0.0279,  0.1238, -0.0968,
        -0.1007,  0.0801, -0.0582,  0.0211, -0.0789,  0.0735, -0.1026,  0.0292,
         0.0482,  0.0025,  0.1231, -0.1071, -0.1202, -0.0087, -0.0777,  0.0662,
         0.0407,  0.1197,  0.0484,  0.1013, -0.0494, -0.1110,  0.1213,  0.0530,
         0.0355,  0.0203, -0.0327, -0.1022,  0.0537,  0.0855, -0.1248,  0.1174],
       device='cuda:2')), ...]), 'buffer_names': ['_forward_module.net.model.1.running_mean', '_forward_module.net.model.1.running_var', '_forward_module.net.model.1.num_batches_tracked', '_forward_module.net.model.4.running_mean', '_forward_module.net.model.4.running_var', '_forward_module.net.model.4.num_batches_tracked', '_forward_module.net.model.7.running_mean', '_forward_module.net.model.7.running_var', '_forward_module.net.model.7.num_batches_tracked'], 'optimizer': None, 'param_shapes': [OrderedDict([...])], 'lr_scheduler': None, 'data_sampler': None, 'random_ltd': None, 'sparse_tensor_module_names': {}, 'skipped_steps': 0, 'global_steps': 430, 'global_samples': 430, 'dp_world_size': 1, 'mp_world_size': 1, 'ds_config': {'zero_allow_untested_optimizer': True, 'zero_optimization': {...}, 'activation_checkpointing': {...}, 'aio': {...}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 1, 'gradient_clipping': 0.0}, ...}

What version are you seeing the problem on?

2.0+

How to reproduce the bug

git clone https://github.com/dmitrymailk/ru_lm
cd ru_lm
git checkout 61ab735110b3c80a3cb3d58b3d7c5c05d4cf56af
pip install -r requirements.txt
python src/train.py

Note: you must change the `devices` setting in configs/trainer/deepspeed.yaml before running.

Error messages and logs


[2023-04-14 00:57:35,767][src.utils.utils][INFO] - Enforcing tags! <cfg.extras.enforce_tags=True>
[2023-04-14 00:57:35,773][src.utils.utils][INFO] - Printing config tree with Rich! <cfg.extras.print_config=True>
[2023-04-14 00:57:35,773][src.utils.rich_utils][WARNING] - Field 'logger' not found in config. Skipping 'logger' config printing...
CONFIG
├── data
│   └── _target_: src.data.mnist_datamodule.MNISTDataModule                                                                    
│       data_dir: /cephfs/home/kosenko/deepspeed/ru_lm/data/                                                                   
│       batch_size: 128                                                                                                        
│       train_val_test_split:                                                                                                  
│       - 55000                                                                                                                
│       - 5000                                                                                                                 
│       - 10000                                                                                                                
│       num_workers: 0                                                                                                         
│       pin_memory: false                                                                                                      
│                                                                                                                              
├── model
│   └── _target_: src.models.mnist_module.MNISTLitModule                                                                       
│       optimizer:                                                                                                             
│         _target_: torch.optim.Adam                                                                                           
│         _partial_: true                                                                                                      
│         lr: 0.001                                                                                                            
│         weight_decay: 0.0                                                                                                    
│       scheduler:                                                                                                             
│         _target_: torch.optim.lr_scheduler.ReduceLROnPlateau                                                                 
│         _partial_: true                                                                                                      
│         mode: min                                                                                                            
│         factor: 0.1                                                                                                          
│         patience: 10                                                                                                         
│       net:                                                                                                                   
│         _target_: src.models.components.simple_dense_net.SimpleDenseNet                                                      
│         input_size: 784                                                                                                      
│         lin1_size: 64                                                                                                        
│         lin2_size: 128                                                                                                       
│         lin3_size: 64                                                                                                        
│         output_size: 10                                                                                                      
│                                                                                                                              
├── callbacks
│   └── model_checkpoint:                                                                                                      
│         _target_: lightning.pytorch.callbacks.ModelCheckpoint                                                                
│         dirpath: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35/checkpoints                        
│         filename: epoch_{epoch:03d}                                                                                          
│         monitor: val/acc                                                                                                     
│         verbose: false                                                                                                       
│         save_last: true                                                                                                      
│         save_top_k: 1                                                                                                        
│         mode: max                                                                                                            
│         auto_insert_metric_name: false                                                                                       
│         save_weights_only: false                                                                                             
│         every_n_train_steps: null                                                                                            
│         train_time_interval: null                                                                                            
│         every_n_epochs: null                                                                                                 
│         save_on_train_epoch_end: null                                                                                        
│       early_stopping:                                                                                                        
│         _target_: lightning.pytorch.callbacks.EarlyStopping                                                                  
│         monitor: val/acc                                                                                                     
│         min_delta: 0.0                                                                                                       
│         patience: 100                                                                                                        
│         verbose: false                                                                                                       
│         mode: max                                                                                                            
│         strict: true                                                                                                         
│         check_finite: true                                                                                                   
│         stopping_threshold: null                                                                                             
│         divergence_threshold: null                                                                                           
│         check_on_train_epoch_end: null                                                                                       
│       model_summary:                                                                                                         
│         _target_: lightning.pytorch.callbacks.RichModelSummary                                                               
│         max_depth: -1                                                                                                        
│       rich_progress_bar:                                                                                                     
│         _target_: lightning.pytorch.callbacks.RichProgressBar                                                                
│                                                                                                                              
├── trainer
│   └── _target_: lightning.pytorch.trainer.Trainer                                                                            
│       default_root_dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35                             
│       min_epochs: 1                                                                                                          
│       max_epochs: 2                                                                                                          
│       check_val_every_n_epoch: 1                                                                                             
│       deterministic: false                                                                                                   
│       accelerator: gpu                                                                                                       
│       devices:                                                                                                               
│       - 2                                                                                                                    
│       strategy: deepspeed                                                                                                    
│                                                                                                                              
├── paths
│   └── root_dir: /cephfs/home/kosenko/deepspeed/ru_lm                                                                         
│       data_dir: /cephfs/home/kosenko/deepspeed/ru_lm/data/                                                                   
│       log_dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/                                                                    
│       output_dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35                                   
│       work_dir: /cephfs/home/kosenko/deepspeed/ru_lm                                                                         
│                                                                                                                              
├── extras
│   └── ignore_warnings: false                                                                                                 
│       enforce_tags: true                                                                                                     
│       print_config: true                                                                                                     
│                                                                                                                              
├── task_name
│   └── train                                                                                                                  
├── tags
│   └── ['dev']                                                                                                                
├── train
│   └── True                                                                                                                   
├── test
│   └── False                                                                                                                  
├── compile
│   └── False                                                                                                                  
├── ckpt_path
│   └── None                                                                                                                   
└── seed
    └── None                                                                                                                   
[2023-04-14 00:57:35,822][__main__][INFO] - Instantiating datamodule <src.data.mnist_datamodule.MNISTDataModule>
[2023-04-14 00:57:35,825][__main__][INFO] - Instantiating model <src.models.mnist_module.MNISTLitModule>
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:197: UserWarning: Attribute 'net' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['net'])`.
  rank_zero_warn(
[2023-04-14 00:57:35,886][__main__][INFO] - Instantiating callbacks...
[2023-04-14 00:57:35,886][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.ModelCheckpoint>
[2023-04-14 00:57:35,890][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.EarlyStopping>
[2023-04-14 00:57:35,891][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.RichModelSummary>
[2023-04-14 00:57:35,891][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.RichProgressBar>
[2023-04-14 00:57:35,892][__main__][INFO] - Instantiating loggers...
[2023-04-14 00:57:35,892][src.utils.instantiators][WARNING] - No logger configs found! Skipping...
[2023-04-14 00:57:35,893][__main__][INFO] - Instantiating trainer <lightning.pytorch.trainer.Trainer>
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[2023-04-14 00:57:36,181][__main__][INFO] - Starting training!
initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1
[2023-04-14 00:57:36,548][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0
[2023-04-14 00:57:36,548][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.
[2023-04-14 00:57:36,548] [WARNING] [deepspeed.py:637:_auto_select_batch_size] Tried to infer the batch size for internal deepspeed logging from the `train_dataloader()`. To ensure DeepSpeed logging remains correct, please manually pass the plugin with the batch size, `Trainer(strategy=DeepSpeedStrategy(logging_batch_size_per_gpu=batch_size))`.
You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
[2023-04-14 00:57:38,095][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:2 to store for rank: 0
[2023-04-14 00:57:38,095][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:2 with 1 nodes.
Using /cephfs/home/kosenko/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Emitting ninja build file /cephfs/home/kosenko/.cache/torch_extensions/py310_cu118/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.10430693626403809 seconds
Rank: 0 partition count [1] and sizes[(67978, False)] 
Using /cephfs/home/kosenko/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0004889965057373047 seconds
┏━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
┃    ┃ Name         ┃ Type               ┃ Params ┃
┡━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
│ 0  │ net          │ SimpleDenseNet     │ 68.0 K │
│ 1  │ net.model    │ Sequential         │ 68.0 K │
│ 2  │ net.model.0  │ Linear             │ 50.2 K │
│ 3  │ net.model.1  │ BatchNorm1d        │    128 │
│ 4  │ net.model.2  │ ReLU               │      0 │
│ 5  │ net.model.3  │ Linear             │  8.3 K │
│ 6  │ net.model.4  │ BatchNorm1d        │    256 │
│ 7  │ net.model.5  │ ReLU               │      0 │
│ 8  │ net.model.6  │ Linear             │  8.3 K │
│ 9  │ net.model.7  │ BatchNorm1d        │    128 │
│ 10 │ net.model.8  │ ReLU               │      0 │
│ 11 │ net.model.9  │ Linear             │    650 │
│ 12 │ criterion    │ CrossEntropyLoss   │      0 │
│ 13 │ train_acc    │ MulticlassAccuracy │      0 │
│ 14 │ val_acc      │ MulticlassAccuracy │      0 │
│ 15 │ test_acc     │ MulticlassAccuracy │      0 │
│ 16 │ train_loss   │ MeanMetric         │      0 │
│ 17 │ val_loss     │ MeanMetric         │      0 │
│ 18 │ test_loss    │ MeanMetric         │      0 │
│ 19 │ val_acc_best │ MaxMetric          │      0 │
└────┴──────────────┴────────────────────┴────────┘
Trainable params: 68.0 K                                                                                                       
Non-trainable params: 0                                                                                                        
Total params: 68.0 K                                                                                                           
Total estimated model params size (MB): 0                                                                                      
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:430: 
PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing 
the value of the `num_workers` argument` (try 96 which is the number of cpus on this machine) in the `DataLoader` init to 
improve performance.
  rank_zero_warn(
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:432: 
PossibleUserWarning: It is recommended to use `self.log('val/acc_best', ..., sync_dist=True)` when logging on epoch level in 
distributed setting to accumulate the metric across devices.
  warning_cache.warn(
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:430: 
PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider 
increasing the value of the `num_workers` argument` (try 96 which is the number of cpus on this machine) in the `DataLoader` 
init to improve performance.
  rank_zero_warn(
/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being 
deprecated, use kwargs instead. Refer to 
https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
  warnings.warn(
Epoch 0/1  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 430/430 0:00:13 • 0:00:00 32.62it/s val/loss: 0.114 val/acc: 0.967          
                                                                                       val/acc_best: 0.967 train/loss: 0.327   
                                                                                       train/acc: 0.92                         
[2023-04-14 00:57:54,771][src.utils.utils][ERROR] - 
Traceback (most recent call last):
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/utils/utils.py", line 65, in wrap
    metric_dict, object_dict = task_func(cfg=cfg)
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/train.py", line 89, in train
    trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 520, in fit
    call._call_and_handle_interrupt(
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 42, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 92, in launch
    return function(*args, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 559, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 935, in _run
    results = self._run_stage()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 978, in _run_stage
    self.fit_loop.run()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
    self.on_advance_end()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 369, in on_advance_end
    call._call_callback_hooks(trainer, "on_train_epoch_end", monitoring_callbacks=True)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 190, in _call_callback_hooks
    fn(trainer, trainer.lightning_module, *args, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 303, in on_train_epoch_end
    self._save_topk_checkpoint(trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 360, in _save_topk_checkpoint
    self._save_monitor_checkpoint(trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 658, in _save_monitor_checkpoint
    self._update_best_and_save(current, trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 709, in _update_best_and_save
    self._save_checkpoint(trainer, filepath)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 365, in _save_checkpoint
    trainer.save_checkpoint(filepath, self.save_weights_only)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1262, in save_checkpoint
    self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 500, in save_checkpoint
    self.trainer.strategy.save_checkpoint(_checkpoint, filepath, storage_options=storage_options)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/deepspeed.py", line 772, in save_checkpoint
    self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3133, in save_checkpoint
    self._save_checkpoint(save_dir, tag, client_state=client_state)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3345, in _save_checkpoint
    self.checkpoint_engine.save(state, save_path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save
    torch.save(state_dict, path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 441, in save
    _save(obj, opened_zipfile, pickle_module, pickle_protocol)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 653, in _save
    pickler.dump(obj)
TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object
[2023-04-14 00:57:54,775][src.utils.utils][INFO] - Output dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35
Error executing job with overrides: []
Traceback (most recent call last):
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/train.py", line 117, in main
    metric_dict, _ = train(cfg)
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/utils/utils.py", line 75, in wrap
    raise ex
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/utils/utils.py", line 65, in wrap
    metric_dict, object_dict = task_func(cfg=cfg)
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/train.py", line 89, in train
    trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 520, in fit
    call._call_and_handle_interrupt(
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 42, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 92, in launch
    return function(*args, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 559, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 935, in _run
    results = self._run_stage()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 978, in _run_stage
    self.fit_loop.run()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
    self.on_advance_end()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 369, in on_advance_end
    call._call_callback_hooks(trainer, "on_train_epoch_end", monitoring_callbacks=True)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 190, in _call_callback_hooks
    fn(trainer, trainer.lightning_module, *args, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 303, in on_train_epoch_end
    self._save_topk_checkpoint(trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 360, in _save_topk_checkpoint
    self._save_monitor_checkpoint(trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 658, in _save_monitor_checkpoint
    self._update_best_and_save(current, trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 709, in _update_best_and_save
    self._save_checkpoint(trainer, filepath)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 365, in _save_checkpoint
    trainer.save_checkpoint(filepath, self.save_weights_only)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1262, in save_checkpoint
    self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 500, in save_checkpoint
    self.trainer.strategy.save_checkpoint(_checkpoint, filepath, storage_options=storage_options)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/deepspeed.py", line 772, in save_checkpoint
    self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3133, in save_checkpoint
    self._save_checkpoint(save_dir, tag, client_state=client_state)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3345, in _save_checkpoint
    self.checkpoint_engine.save(state, save_path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save
    torch.save(state_dict, path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 441, in save
    _save(obj, opened_zipfile, pickle_module, pickle_protocol)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 653, in _save
    pickler.dump(obj)
TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object

Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.

Environment

Current environment
  • CUDA:
    - GPU:
    - NVIDIA A100-SXM4-40GB
    - NVIDIA A100-SXM4-40GB
    - NVIDIA A100-SXM4-40GB
    - NVIDIA A100-SXM4-40GB
    - available: True
    - version: 11.8
  • Lightning:
    - lightning: 2.0.1.post0
    - lightning-cloud: 0.5.33
    - lightning-colossalai: 0.1.0
    - lightning-utilities: 0.8.0
    - pytorch-lightning: 2.0.1.post0
    - torch: 2.0.0+cu118
    - torchaudio: 2.0.1+cu118
    - torchmetrics: 0.11.4
    - torchvision: 0.15.1+cu118
  • Packages:
    - absl-py: 1.4.0
    - accelerate: 0.18.0
    - aiofiles: 23.1.0
    - aiohttp: 3.8.4
    - aiosignal: 1.3.1
    - alembic: 1.10.3
    - altair: 4.2.2
    - antlr4-python3-runtime: 4.9.3
    - anyio: 3.6.2
    - apex: 0.1
    - appdirs: 1.4.4
    - arrow: 1.2.3
    - asttokens: 2.2.1
    - async-timeout: 4.0.2
    - attrs: 22.2.0
    - autopage: 0.5.1
    - backcall: 0.2.0
    - backports.functools-lru-cache: 1.6.4
    - bcrypt: 4.0.1
    - beautifulsoup4: 4.12.2
    - bitsandbytes: 0.37.2
    - black: 23.3.0
    - blessed: 1.20.0
    - boltons: 23.0.0
    - brotlipy: 0.7.0
    - cachetools: 5.3.0
    - certifi: 2022.12.7
    - cffi: 1.15.1
    - cfgv: 3.3.1
    - charset-normalizer: 2.0.4
    - click: 8.1.3
    - cliff: 4.2.0
    - cmaes: 0.9.1
    - cmake: 3.25.0
    - cmd2: 2.4.3
    - colorlog: 6.7.0
    - colossalai: 0.2.8
    - conda: 23.3.1
    - conda-content-trust: 0.1.3
    - conda-package-handling: 2.0.2
    - conda-package-streaming: 0.7.0
    - contexttimer: 0.3.3
    - contourpy: 1.0.7
    - croniter: 1.3.14
    - cryptography: 38.0.4
    - cycler: 0.11.0
    - datasets: 2.11.0
    - dateutils: 0.6.12
    - debugpy: 1.5.1
    - decorator: 5.1.1
    - deepdiff: 6.3.0
    - deepspeed: 0.8.3
    - dill: 0.3.6
    - distlib: 0.3.6
    - docker-pycreds: 0.4.0
    - einops: 0.6.0
    - entrypoints: 0.4
    - evaluate: 0.4.0
    - exceptiongroup: 1.1.1
    - executing: 1.2.0
    - fabric: 3.0.0
    - fastapi: 0.88.0
    - ffmpy: 0.3.0
    - filelock: 3.9.0
    - fire: 0.5.0
    - flash-attn: 0.2.8
    - flit-core: 3.8.0
    - fonttools: 4.39.3
    - frozenlist: 1.3.3
    - fschat: 0.1.10
    - fsspec: 2023.4.0
    - gitdb: 4.0.10
    - gitpython: 3.1.31
    - gmpy2: 2.1.2
    - google-auth: 2.17.3
    - google-auth-oauthlib: 1.0.0
    - gradio: 3.23.0
    - gradio-client: 0.0.8
    - greenlet: 2.0.2
    - grpcio: 1.53.0
    - h11: 0.14.0
    - hjson: 3.1.0
    - html2text: 2020.1.16
    - httpcore: 0.16.3
    - httpx: 0.23.3
    - huggingface-hub: 0.13.4
    - hydra-colorlog: 1.2.0
    - hydra-core: 1.3.2
    - hydra-optuna-sweeper: 1.2.0
    - identify: 2.5.22
    - idna: 3.4
    - importlib-metadata: 6.3.0
    - iniconfig: 2.0.0
    - inquirer: 3.1.3
    - invoke: 2.0.0
    - ipykernel: 6.15.0
    - ipython: 8.12.0
    - itsdangerous: 2.1.2
    - jedi: 0.18.2
    - jinja2: 3.1.2
    - joblib: 1.2.0
    - jsonlines: 3.1.0
    - jsonpatch: 1.32
    - jsonpointer: 2.1
    - jsonschema: 4.17.3
    - jupyter-client: 7.3.4
    - jupyter-core: 4.12.0
    - kiwisolver: 1.4.4
    - lightning: 2.0.1.post0
    - lightning-cloud: 0.5.33
    - lightning-colossalai: 0.1.0
    - lightning-utilities: 0.8.0
    - linkify-it-py: 2.0.0
    - lit: 15.0.7
    - loralib: 0.1.1
    - mako: 1.2.4
    - markdown: 3.4.3
    - markdown-it-py: 2.2.0
    - markdown2: 2.4.8
    - markupsafe: 2.1.1
    - matplotlib: 3.7.1
    - matplotlib-inline: 0.1.6
    - mdit-py-plugins: 0.3.3
    - mdurl: 0.1.2
    - mkl-fft: 1.3.1
    - mkl-random: 1.2.2
    - mkl-service: 2.4.0
    - mpmath: 1.2.1
    - multidict: 6.0.4
    - multiprocess: 0.70.14
    - mypy-extensions: 1.0.0
    - nest-asyncio: 1.5.6
    - networkx: 2.8.4
    - ninja: 1.11.1
    - nodeenv: 1.7.0
    - numpy: 1.23.5
    - nvidia-cublas-cu11: 11.10.3.66
    - nvidia-cuda-nvrtc-cu11: 11.7.99
    - nvidia-cuda-runtime-cu11: 11.7.99
    - nvidia-cudnn-cu11: 8.5.0.96
    - oauthlib: 3.2.2
    - omegaconf: 2.3.0
    - optuna: 2.10.1
    - ordered-set: 4.1.0
    - orjson: 3.8.10
    - packaging: 23.0
    - pandas: 2.0.0
    - paramiko: 3.1.0
    - parso: 0.8.3
    - pathspec: 0.11.1
    - pathtools: 0.1.2
    - pbr: 5.11.1
    - peft: 0.3.0.dev0
    - pexpect: 4.8.0
    - pickleshare: 0.7.5
    - pillow: 9.4.0
    - pip: 22.3.1
    - platformdirs: 3.2.0
    - pluggy: 1.0.0
    - pre-commit: 3.2.2
    - prettytable: 3.7.0
    - prompt-toolkit: 3.0.38
    - protobuf: 3.20.3
    - psutil: 5.9.4
    - ptyprocess: 0.7.0
    - pure-eval: 0.2.2
    - py-cpuinfo: 9.0.0
    - pyarrow: 11.0.0
    - pyasn1: 0.4.8
    - pyasn1-modules: 0.2.8
    - pycosat: 0.6.4
    - pycparser: 2.21
    - pydantic: 1.10.7
    - pydeprecate: 0.3.2
    - pydub: 0.25.1
    - pygments: 2.14.0
    - pyjwt: 2.6.0
    - pynacl: 1.5.0
    - pyopenssl: 22.0.0
    - pyparsing: 3.0.9
    - pyperclip: 1.8.2
    - pyrootutils: 1.0.4
    - pyrsistent: 0.19.3
    - pysocks: 1.7.1
    - pytest: 7.3.0
    - python-dateutil: 2.8.2
    - python-dotenv: 1.0.0
    - python-editor: 1.0.4
    - python-multipart: 0.0.6
    - pytorch-lightning: 2.0.1.post0
    - pytz: 2023.3
    - pyyaml: 6.0
    - pyzmq: 23.2.0
    - readchar: 4.0.5
    - regex: 2023.3.23
    - requests: 2.28.1
    - requests-oauthlib: 1.3.1
    - responses: 0.18.0
    - rfc3986: 1.5.0
    - rich: 13.3.3
    - rsa: 4.9
    - ruamel.yaml: 0.17.21
    - ruamel.yaml.clib: 0.2.6
    - safetensors: 0.3.0
    - scikit-learn: 1.2.2
    - scipy: 1.10.1
    - semantic-version: 2.10.0
    - sentencepiece: 0.1.97
    - sentry-sdk: 1.19.1
    - setproctitle: 1.3.2
    - setuptools: 65.6.3
    - six: 1.16.0
    - smmap: 5.0.0
    - sniffio: 1.3.0
    - soupsieve: 2.4
    - sqlalchemy: 2.0.9
    - stack-data: 0.6.2
    - starlette: 0.22.0
    - starsessions: 1.3.0
    - stevedore: 5.0.0
    - svgwrite: 1.4.3
    - sympy: 1.11.1
    - tensorboard: 2.12.2
    - tensorboard-data-server: 0.7.0
    - tensorboard-plugin-wit: 1.8.1
    - termcolor: 2.2.0
    - threadpoolctl: 3.1.0
    - tokenize-rt: 5.0.0
    - tokenizers: 0.13.3
    - tomli: 2.0.1
    - toolz: 0.12.0
    - torch: 2.0.0+cu118
    - torchaudio: 2.0.1+cu118
    - torchmetrics: 0.11.4
    - torchvision: 0.15.1+cu118
    - tornado: 6.1
    - tqdm: 4.64.1
    - traitlets: 5.9.0
    - transformers: 4.28.0.dev0
    - triton: 2.0.0
    - typing-extensions: 4.4.0
    - tzdata: 2023.3
    - uc-micro-py: 1.0.1
    - urllib3: 1.26.14
    - uvicorn: 0.21.1
    - virtualenv: 20.21.0
    - wandb: 0.14.2
    - wavedrom: 2.0.3.post3
    - wcwidth: 0.2.6
    - websocket-client: 1.5.1
    - websockets: 11.0.1
    - werkzeug: 2.2.3
    - wheel: 0.37.1
    - xxhash: 3.2.0
    - yarl: 1.8.2
    - zipp: 3.15.0
    - zstandard: 0.18.0
  • System:
    - OS: Linux
    - architecture:
    - 64bit
    - ELF
    - processor: x86_64
    - python: 3.10.9
    - version: #76~20.04.1-Ubuntu SMP Mon Mar 20 15:54:19 UTC 2023

More info

No response

cc @awaelchli

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions