Skip to content

Commit ba99fc6

Browse files
committed
misc: translate CUDA error codes to strings when pinning and unpinning memory
1 parent 02a68dd commit ba99fc6

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

checkpoint_engine/pin_memory.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,11 @@ def _pin(t: torch.Tensor):
209209
torch.cuda.set_device(device_index)
210210
cudart = torch.cuda.cudart()
211211
r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
212-
assert r == 0, f"pin memory error, error code: {r}"
212+
if r != 0:
213+
error_msg = cudart.cudaGetErrorString(r)
214+
raise RuntimeError(
215+
f"pin memory error, error code: {r}, error message: {error_msg}"
216+
)
213217

214218
# TODO: should we only support /dev/shm? We found that files on disk also seem to work.
215219
size = os.stat(file_path).st_size

checkpoint_engine/ps.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,11 @@ def _unpin(t: torch.Tensor):
391391
)
392392
cudart = torch.cuda.cudart()
393393
r = cudart.cudaHostUnregister(t.data_ptr())
394-
assert r == 0, f"unpin memory error, error code: {r}"
394+
if r != 0:
395+
error_msg = cudart.cudaGetErrorString(r)
396+
raise RuntimeError(
397+
f"unpin memory error, error code: {r}, error message: {error_msg}"
398+
)
395399

396400
# if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
397401
try:

0 commit comments

Comments
 (0)