|
12 | 12 | # See the License for the specific language governing permissions and |
13 | 13 | # limitations under the License. |
14 | 14 |
|
15 | | -"""Composite Checkpoint Manager handling P2P syncing with optional Persistent Fallback.""" |
| 15 | +"""Composite Checkpoint Manager with P2P syncing and Persistent Fallback.""" |
16 | 16 |
|
17 | 17 | import shutil |
18 | 18 | import threading |
@@ -178,7 +178,7 @@ def get_all_steps_from_peers(self) -> list[int]: |
178 | 178 | return self._peer_selector.get_all_steps() |
179 | 179 |
|
180 | 180 | def has_shard_for_step(self, step: int) -> bool: |
181 | | - """Checks if this process's shard for a given step exists in the P2P network.""" |
| 181 | + """Checks if this process's shard for a step exists in P2P.""" |
182 | 182 | assert self._peer_selector is not None |
183 | 183 | return ( |
184 | 184 | self._peer_selector.get_source_peer(step, self._process_index) |
@@ -212,7 +212,7 @@ def close(self): |
212 | 212 | class CheckpointManager( |
213 | 213 | abstract_checkpoint_manager.AbstractCheckpointManager, epy.ContextManager |
214 | 214 | ): |
215 | | - """Orchestrates P2P local checkpointing with optional persistent storage failover. |
| 215 | + """P2P local checkpointing with persistent storage failover. |
216 | 216 |
|
217 | 217 | Restoration Strategy: |
218 | 218 | 1. Check Local Disk (Fastest) |
@@ -333,7 +333,15 @@ def best_step(self) -> int | None: |
333 | 333 |
|
334 | 334 | @override |
335 | 335 | def reload(self): |
| 336 | + """Reloads the checkpoint manager and its components. |
| 337 | +
|
| 338 | + This method refreshes the local and persistent managers and marks the P2P |
| 339 | + registry as stale, forcing a re-sync on the next access. |
| 340 | + """ |
| 341 | + self._p2p.mark_registry_stale() |
336 | 342 | self._local_manager.reload() |
| 343 | + if self._persistent_manager: |
| 344 | + self._persistent_manager.reload() |
337 | 345 |
|
338 | 346 | @override |
339 | 347 | def reached_preemption(self, step: int) -> bool: |
|
0 commit comments