File tree Expand file tree Collapse file tree 2 files changed +10
-2
lines changed
Expand file tree Collapse file tree 2 files changed +10
-2
lines changed Original file line number Diff line number Diff line change @@ -209,7 +209,11 @@ def _pin(t: torch.Tensor):
209209 torch .cuda .set_device (device_index )
210210 cudart = torch .cuda .cudart ()
211211 r = cudart .cudaHostRegister (t .data_ptr (), t .numel () * t .element_size (), 0 )
212- assert r == 0 , f"pin memory error, error code: { r } "
212+ if r != 0 :
213+ error_msg = cudart .cudaGetErrorString (r )
214+ raise RuntimeError (
215+ f"pin memory error, error code: { r } , error message: { error_msg } "
216+ )
213217
214218 # TODO: should we only support /dev/shm? We found that files on disk also work.
215219 size = os .stat (file_path ).st_size
Original file line number Diff line number Diff line change @@ -391,7 +391,11 @@ def _unpin(t: torch.Tensor):
391391 )
392392 cudart = torch .cuda .cudart ()
393393 r = cudart .cudaHostUnregister (t .data_ptr ())
394- assert r == 0 , f"unpin memory error, error code: { r } "
394+ if r != 0 :
395+ error_msg = cudart .cudaGetErrorString (r )
396+ raise RuntimeError (
397+ f"unpin memory error, error code: { r } , error message: { error_msg } "
398+ )
395399
396400 # if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
397401 try :
You can’t perform that action at this time.
0 commit comments