Skip to content

RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method #14

@zhang0730

Description

@zhang0730

I am following the tutorial in the quickstart. When fitting the model, I encountered the error below:


scDiffEq [INFO]: Detected environment: jupyter
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
💡 Tip: For seamless cloud logging and experiment tracking, try installing [litlogger](https://pypi.org/project/litlogger/) to enable LitLogger, which logs metrics and artifacts automatically to the Lightning Experiments platform.
W0409 21:38:15.270000 3716831 site-packages/torch/multiprocessing/spawn.py:165] Terminating process 51059 via signal SIGTERM
---------------------------------------------------------------------------
ProcessRaisedException                    Traceback (most recent call last)
Cell In[14], line 1
----> 1 model.fit(train_epochs=1500)

File ~/scDiffEq/scDiffEq-main/src/scdiffeq/core/_scdiffeq.py:459, in scDiffEq.fit(self, train_epochs, pretrain_epochs, train_lr, pretrain_callbacks, train_callbacks, ckpt_frequency, save_last_ckpt, keep_ckpts, monitor, accelerator, log_every_n_steps, reload_dataloaders_every_n_epochs, devices, deterministic, print_every, **kwargs)
    404 def fit(
    405     self,
    406     train_epochs: int = 2500,
   (...)    421     **kwargs: dict,
    422 ) -> None:
    423     """Fit the scDiffEq model to some data.
    424 
    425     Parameters
   (...)    457     Returns: None
    458     """
--> 459     self.train(**ABCParse.function_kwargs(self.train, locals()))

File ~/scDiffEq/scDiffEq-main/src/scdiffeq/core/_mix_ins/_training_routine_mix_ins.py:185, in TrainMixIn.train(self, train_epochs, train_callbacks, ckpt_frequency, save_last_ckpt, keep_ckpts, monitor, accelerator, log_every_n_steps, reload_dataloaders_every_n_epochs, devices, deterministic, **kwargs)
    183 if self._train_epochs > 0:
    184     self._configure_train_step(locals())
--> 185     self.trainer.fit(self.DiffEq, self.LitDataModule)

File ~/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:584, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path, weights_only)
    582 self.training = True
    583 self.should_stop = False
--> 584 call._call_and_handle_interrupt(
    585     self,
    586     self._fit_impl,
    587     model,
    588     train_dataloaders,
    589     val_dataloaders,
    590     datamodule,
    591     ckpt_path,
    592     weights_only,
    593 )

File ~/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:48, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     46 try:
     47     if trainer.strategy.launcher is not None:
---> 48         return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
     49     return trainer_fn(*args, **kwargs)
     51 except _TunerExitException:

File ~/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py:144, in _MultiProcessingLauncher.launch(self, function, trainer, *args, **kwargs)
    136 process_context = mp.start_processes(
    137     self._wrapping_function,
    138     args=process_args,
   (...)    141     join=False,  # we will join ourselves to get the process references
    142 )
    143 self.procs = process_context.processes
--> 144 while not process_context.join():
    145     pass
    147 worker_output = return_queue.get()

File ~/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/multiprocessing/spawn.py:211, in ProcessContext.join(self, timeout, grace_period)
    209 msg = f"\n\n-- Process {error_index:d} terminated with the following error:\n"
    210 msg += original_trace
--> 211 raise ProcessRaisedException(msg, error_index, failed_process.pid)

ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 87, in _wrap
    fn(i, *args)
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py", line 173, in _wrapping_function
    results = function(*args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 630, in _fit_impl
    self._run(model, ckpt_path=ckpt_path, weights_only=weights_only)
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 1033, in _run
    self.strategy.setup_environment()
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py", line 154, in setup_environment
    super().setup_environment()
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py", line 129, in setup_environment
    self.accelerator.setup_device(self.root_device)
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/accelerators/cuda.py", line 46, in setup_device
    _check_cuda_matmul_precision(device)
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/fabric/accelerators/cuda.py", line 166, in _check_cuda_matmul_precision
    if not torch.cuda.is_available() or not _is_ampere_or_later(device):
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/fabric/accelerators/cuda.py", line 160, in _is_ampere_or_later
    major, _ = torch.cuda.get_device_capability(device)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/cuda/__init__.py", line 669, in get_device_capability
    prop = get_device_properties(device)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/cuda/__init__.py", line 686, in get_device_properties
    _lazy_init()  # will define _get_device_properties
    ^^^^^^^^^^^^
  File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/cuda/__init__.py", line 466, in _lazy_init
    raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions