Skip to content

Commit a9ad301

Browse files
committed
bugfix: skip empty safetensors files when pinning memory in place
1 parent 0568670 commit a9ad301

File tree

1 file changed

+6
-0
lines changed

1 file changed

+6
-0
lines changed

checkpoint_engine/pin_memory.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -258,6 +258,12 @@ def _pin(t: torch.Tensor):
258 258   # Remove the file after successfully loading. This will avoid doubling the memory usage.
259 259   # We assume files in /dev/shm/ are temporary files. So it's safe to remove them after loading.
260 260   os.remove(file_path)
    261 + if not metas:
    262 + # TODO: should we still return this buffer?
    263 + assert buffer.nbytes == 0, f"buffer nbytes {buffer.nbytes} should be 0"
    264 + logger.warning(f"[rank{rank}] no metas found in {file_path}, skip pin memory")
    265 + return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=[], manually_pinned=False)
    266 +
261 267   _pin(buffer)
262 268   logger.info(
263 269   f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"

0 commit comments

Comments
 (0)