I am following the quickstart tutorial. When fitting the model, I encountered the error below:
scDiffEq [INFO]: Detected environment: jupyter
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
💡 Tip: For seamless cloud logging and experiment tracking, try installing [litlogger](https://pypi.org/project/litlogger/) to enable LitLogger, which logs metrics and artifacts automatically to the Lightning Experiments platform.
W0409 21:38:15.270000 3716831 site-packages/torch/multiprocessing/spawn.py:165] Terminating process 51059 via signal SIGTERM
---------------------------------------------------------------------------
ProcessRaisedException Traceback (most recent call last)
Cell In[14], line 1
----> 1 model.fit(train_epochs=1500)
File ~/scDiffEq/scDiffEq-main/src/scdiffeq/core/_scdiffeq.py:459, in scDiffEq.fit(self, train_epochs, pretrain_epochs, train_lr, pretrain_callbacks, train_callbacks, ckpt_frequency, save_last_ckpt, keep_ckpts, monitor, accelerator, log_every_n_steps, reload_dataloaders_every_n_epochs, devices, deterministic, print_every, **kwargs)
404 def fit(
405 self,
406 train_epochs: int = 2500,
(...) 421 **kwargs: dict,
422 ) -> None:
423 """Fit the scDiffEq model to some data.
424
425 Parameters
(...) 457 Returns: None
458 """
--> 459 self.train(**ABCParse.function_kwargs(self.train, locals()))
File ~/scDiffEq/scDiffEq-main/src/scdiffeq/core/_mix_ins/_training_routine_mix_ins.py:185, in TrainMixIn.train(self, train_epochs, train_callbacks, ckpt_frequency, save_last_ckpt, keep_ckpts, monitor, accelerator, log_every_n_steps, reload_dataloaders_every_n_epochs, devices, deterministic, **kwargs)
183 if self._train_epochs > 0:
184 self._configure_train_step(locals())
--> 185 self.trainer.fit(self.DiffEq, self.LitDataModule)
File ~/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:584, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path, weights_only)
582 self.training = True
583 self.should_stop = False
--> 584 call._call_and_handle_interrupt(
585 self,
586 self._fit_impl,
587 model,
588 train_dataloaders,
589 val_dataloaders,
590 datamodule,
591 ckpt_path,
592 weights_only,
593 )
File ~/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:48, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
46 try:
47 if trainer.strategy.launcher is not None:
---> 48 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
49 return trainer_fn(*args, **kwargs)
51 except _TunerExitException:
File ~/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py:144, in _MultiProcessingLauncher.launch(self, function, trainer, *args, **kwargs)
136 process_context = mp.start_processes(
137 self._wrapping_function,
138 args=process_args,
(...) 141 join=False, # we will join ourselves to get the process references
142 )
143 self.procs = process_context.processes
--> 144 while not process_context.join():
145 pass
147 worker_output = return_queue.get()
File ~/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/multiprocessing/spawn.py:211, in ProcessContext.join(self, timeout, grace_period)
209 msg = f"\n\n-- Process {error_index:d} terminated with the following error:\n"
210 msg += original_trace
--> 211 raise ProcessRaisedException(msg, error_index, failed_process.pid)
ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 87, in _wrap
fn(i, *args)
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py", line 173, in _wrapping_function
results = function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 630, in _fit_impl
self._run(model, ckpt_path=ckpt_path, weights_only=weights_only)
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 1033, in _run
self.strategy.setup_environment()
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py", line 154, in setup_environment
super().setup_environment()
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py", line 129, in setup_environment
self.accelerator.setup_device(self.root_device)
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/pytorch/accelerators/cuda.py", line 46, in setup_device
_check_cuda_matmul_precision(device)
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/fabric/accelerators/cuda.py", line 166, in _check_cuda_matmul_precision
if not torch.cuda.is_available() or not _is_ampere_or_later(device):
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/lightning/fabric/accelerators/cuda.py", line 160, in _is_ampere_or_later
major, _ = torch.cuda.get_device_capability(device)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/cuda/__init__.py", line 669, in get_device_capability
prop = get_device_properties(device)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/cuda/__init__.py", line 686, in get_device_properties
_lazy_init() # will define _get_device_properties
^^^^^^^^^^^^
File "/data/cai803/miniforge3/envs/scdiffeq/lib/python3.11/site-packages/torch/cuda/__init__.py", line 466, in _lazy_init
raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
As described above, this error occurred while fitting the model exactly as shown in the quickstart tutorial (the final `RuntimeError` suggests CUDA was initialized in the parent process before the DDP workers were forked). How can I resolve it?