From 4b6d203adcb1739491f835fd52f087273d755000 Mon Sep 17 00:00:00 2001 From: kip-cxj Date: Thu, 15 Jan 2026 20:32:38 +0800 Subject: [PATCH 1/3] fix: npu free host cache --- checkpoint_engine/ps.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/checkpoint_engine/ps.py b/checkpoint_engine/ps.py index 4990575..26e9899 100644 --- a/checkpoint_engine/ps.py +++ b/checkpoint_engine/ps.py @@ -407,7 +407,11 @@ def _unpin(t: torch.Tensor): del self._memory_pool[checkpoint_name] # see https://github.com/pytorch/pytorch/blob/31d5c675394705f8a6bc767f80ae14bf4f01246b/torch/csrc/cuda/Module.cpp#L2018 # this works by using torch>=2.5.0 - torch._C._host_emptyCache() + if self.device_manager.device_type == "cuda": + torch._C._host_emptyCache() + else: + import gc + gc.collect() def gather_metas(self, checkpoint_name: str): """ From e767a80a014864031fb50087a7aebe9a60db43f2 Mon Sep 17 00:00:00 2001 From: kip-cxj Date: Thu, 15 Jan 2026 20:37:46 +0800 Subject: [PATCH 2/3] fix pre-commit --- checkpoint_engine/ps.py | 1 + 1 file changed, 1 insertion(+) diff --git a/checkpoint_engine/ps.py b/checkpoint_engine/ps.py index 26e9899..8986a00 100644 --- a/checkpoint_engine/ps.py +++ b/checkpoint_engine/ps.py @@ -411,6 +411,7 @@ def _unpin(t: torch.Tensor): torch._C._host_emptyCache() else: import gc + gc.collect() def gather_metas(self, checkpoint_name: str): From ca65a2235f7e8b697dc5e7a00b6a2f963205b749 Mon Sep 17 00:00:00 2001 From: kip-cxj Date: Mon, 19 Jan 2026 15:33:10 +0800 Subject: [PATCH 3/3] add comments --- checkpoint_engine/ps.py | 1 + 1 file changed, 1 insertion(+) diff --git a/checkpoint_engine/ps.py b/checkpoint_engine/ps.py index 8986a00..20f5be6 100644 --- a/checkpoint_engine/ps.py +++ b/checkpoint_engine/ps.py @@ -410,6 +410,7 @@ def _unpin(t: torch.Tensor): if self.device_manager.device_type == "cuda": torch._C._host_emptyCache() else: + # torch._C._host_emptyCache() is not supported on NPU, so we call gc.collect() to empty host cache. import gc gc.collect()