Open
Description
Hi! I really appreciate your work! When I run your multi-GPU code, I encounter the following problem. It looks like some layers are on different devices. Could you please help me with that?
Traceback (most recent call last):
File "/CVPR23_LFDM/DM/train_video_flow_diffusion_mhad_multiGPU.py", line 465, in <module>
main()
File "/CVPR23_LFDM/DM/train_video_flow_diffusion_mhad_multiGPU.py", line 253, in main
train_output_dict = model.forward(real_vid=real_vids, ref_img=ref_imgs, ref_text=cond)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/parallel/data_parallel.py", line 185, in forward
outputs = self.parallel_apply(replicas, inputs, module_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/parallel/data_parallel.py", line 200, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 110, in parallel_apply
output.reraise()
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/_utils.py", line 694, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
output = module(*input, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/CVPR23_LFDM/DM/modules/video_flow_diffusion_model_multiGPU.py", line 103, in forward
generated = self.generator(ref_img, source_region_params=source_region_params,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/CVPR23_LFDM/LFAE/modules/generator.py", line 100, in forward
motion_params = self.pixelwise_flow_predictor(source_image=source_image,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/anaconda3/envs/pytorch/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/CVPR23_LFDM/LFAE/modules/pixelwise_flow_predictor.py", line 111, in forward
heatmap_representation = self.create_heatmap_representations(source_image, driving_region_params,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/CVPR23_LFDM/LFAE/modules/pixelwise_flow_predictor.py", line 54, in create_heatmap_representations
gaussian_driving = region2gaussian(driving_region_params['shift'], covar=covar, spatial_size=spatial_size)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/CVPR23_LFDM/LFAE/modules/util.py", line 44, in region2gaussian
covar_inverse = torch.inverse(covar).view(*shape)
^^^^^^^^^^^^^^^^^^^^
RuntimeError: lazy wrapper should be called at most once
Metadata
Metadata
Assignees
Labels
No labels