Skip to content

Commit 4ea6032

Browse files
committed
feat(checkpoint): implement background FULL checkpoint upload and enhance retention strategy

- **Background FULL Upload**: Introduced a non-blocking method to upload FULL checkpoints during anchor windows, improving efficiency for new miners.
- **Retention Policy Enhancement**: Updated retention logic to combine chain-based and dependency-based strategies for better checkpoint management.
- **Dynamic Dtype Handling**: Modified delta application to infer data types from current states, enhancing flexibility.
- **Logging Improvements**: Enhanced logging for recovery mode and upload processes to provide clearer insights into operations.
- **Code Cleanup**: Removed unused imports and streamlined retention window calculations for improved readability.
1 parent 809afe2 commit 4ea6032

File tree

4 files changed

+379
-83
lines changed

4 files changed

+379
-83
lines changed

grail/infrastructure/checkpoint_consumer.py

Lines changed: 30 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,7 @@
4747

4848
from ..shared.constants import (
4949
BASE_CHECKPOINT_RETENTION_LIMIT,
50-
CHECKPOINT_MILESTONE_INTERVAL,
51-
DELTA_BASE_INTERVAL,
5250
GRAIL_CHECKPOINT_MOD10,
53-
WINDOW_LENGTH,
5451
)
5552
from . import comms
5653
from .delta_checkpoint import apply_sparse_delta, compute_weights_hash
@@ -304,8 +301,6 @@ async def apply_delta_in_place(
304301
Returns:
305302
True if delta was applied successfully, False if fallback to full load needed
306303
"""
307-
import torch
308-
309304
# Validate inputs
310305
if target_window <= current_window:
311306
logger.debug(
@@ -358,7 +353,7 @@ async def apply_delta_in_place(
358353
# Get model's current state dict (on device)
359354
current_state = model.state_dict()
360355

361-
# Apply delta in float32, cast back to bf16
356+
# Apply delta - dtype is inferred from current_state
362357
logger.debug(
363358
"Applying delta: %.2f%% sparse, %d params changed",
364359
delta_info.get("sparsity_ratio", 0) * 100,
@@ -369,7 +364,7 @@ async def apply_delta_in_place(
369364
current_state,
370365
sparse_tensors,
371366
shapes,
372-
target_dtype=torch.bfloat16,
367+
target_dtype=None, # Infer from current_state
373368
)
374369

375370
# Verify hash if available
@@ -851,13 +846,24 @@ async def _handle_delta_checkpoint(
851846

852847
base_path, delta_chain = chain
853848

854-
logger.info(
855-
"Built delta chain: base=%s (cached=%s), chain_length=%d, target=%s",
856-
delta_chain[0].prev_window if delta_chain else "N/A",
857-
base_path is not None,
858-
len(delta_chain),
859-
metadata.window,
860-
)
849+
# Log recovery mode when catching up multiple missed windows
850+
if len(delta_chain) > 1:
851+
first_delta = delta_chain[0]
852+
last_delta = delta_chain[-1]
853+
logger.info(
854+
"🔄 Recovery mode: catching up %d missed windows (%s -> %s)",
855+
len(delta_chain),
856+
first_delta.prev_window,
857+
last_delta.window,
858+
)
859+
else:
860+
logger.info(
861+
"Built delta chain: base=%s (cached=%s), chain_length=%d, target=%s",
862+
delta_chain[0].prev_window if delta_chain else "N/A",
863+
base_path is not None,
864+
len(delta_chain),
865+
metadata.window,
866+
)
861867

862868
if base_path is None:
863869
logger.error("Cannot find base checkpoint for chain reconstruction")
@@ -882,8 +888,6 @@ async def _apply_single_delta(
882888
Returns:
883889
Path to reconstructed checkpoint, or None on failure
884890
"""
885-
import torch
886-
887891
try:
888892
# Download and load delta
889893
delta_data = await self._download_and_load_delta(delta_metadata)
@@ -906,12 +910,12 @@ async def _apply_single_delta(
906910
delta_info.get("sparsity_ratio", 0) * 100,
907911
)
908912

909-
# Apply delta (float32 computation, bf16 output)
913+
# Apply delta - dtype is inferred from prev_state
910914
reconstructed = apply_sparse_delta(
911915
prev_state,
912916
sparse_tensors,
913917
shapes,
914-
target_dtype=torch.bfloat16,
918+
target_dtype=None, # Infer from prev_state
915919
)
916920

917921
# Verify hash
@@ -1014,8 +1018,6 @@ async def _apply_delta_chain(
10141018
Returns:
10151019
Path to reconstructed checkpoint directory, or None on failure
10161020
"""
1017-
import torch
1018-
10191021
try:
10201022
# Load anchor weights
10211023
current_state = load_model_state_dict(anchor_path)
@@ -1040,12 +1042,12 @@ async def _apply_delta_chain(
10401042

10411043
sparse_tensors, shapes, delta_info = delta_data
10421044

1043-
# Apply sparse delta and cast to bf16 (bit-exact as analyzed)
1045+
# Apply sparse delta - dtype is inferred from current_state
10441046
current_state = apply_sparse_delta(
10451047
current_state,
10461048
sparse_tensors,
10471049
shapes,
1048-
target_dtype=torch.bfloat16,
1050+
target_dtype=None, # Infer from current_state
10491051
)
10501052

10511053
logger.debug(
@@ -1229,6 +1231,9 @@ async def _write_reconstructed_checkpoint(
12291231
def _compute_keep_windows(self, current_window: int) -> set[int]:
12301232
"""Calculate which checkpoint windows should be retained.
12311233
1234+
Delegates to the shared retention utility for consistent behavior
1235+
between publisher (remote cleanup) and consumer (local cache cleanup).
1236+
12321237
For chained deltas, we must keep the entire chain from the current
12331238
anchor (FULL) to now, plus the previous anchor for miners catching up.
12341239
@@ -1243,44 +1248,9 @@ def _compute_keep_windows(self, current_window: int) -> set[int]:
12431248
Returns:
12441249
Set of window numbers to retain
12451250
"""
1246-
keep: set[int] = set()
1247-
if current_window < 0:
1248-
return keep
1249-
1250-
# Always keep windows 0-9 (bootstrap)
1251-
keep.update(range(10))
1252-
1253-
# Calculate current anchor (last FULL boundary)
1254-
delta_base_interval_windows = max(1, int(DELTA_BASE_INTERVAL))
1255-
anchor_stride = delta_base_interval_windows * int(WINDOW_LENGTH)
1256-
current_anchor = (current_window // anchor_stride) * anchor_stride
1257-
1258-
# Keep all windows from current anchor to now (the active chain)
1259-
w = current_anchor
1260-
while w <= current_window:
1261-
keep.add(w)
1262-
w += WINDOW_LENGTH
1263-
1264-
# Keep previous anchor for miners still catching up
1265-
prev_anchor = current_anchor - anchor_stride
1266-
if prev_anchor >= 0:
1267-
keep.add(prev_anchor)
1268-
# Also keep the chain from previous anchor to current anchor
1269-
# This allows miners who were on the old chain to transition
1270-
w = prev_anchor
1271-
while w < current_anchor:
1272-
keep.add(w)
1273-
w += WINDOW_LENGTH
1274-
1275-
# Keep milestones (every CHECKPOINT_MILESTONE_INTERVAL windows)
1276-
interval_blocks = CHECKPOINT_MILESTONE_INTERVAL * WINDOW_LENGTH
1277-
if interval_blocks > 0:
1278-
milestone = (current_window // interval_blocks) * interval_blocks
1279-
while milestone >= 0:
1280-
keep.add(milestone)
1281-
milestone -= interval_blocks
1282-
1283-
return keep
1251+
from grail.shared.retention_utils import compute_retention_windows
1252+
1253+
return compute_retention_windows(current_window)
12841254

12851255
async def get_latest_ready_checkpoint(self, before_window: int) -> int | None:
12861256
"""Find the latest checkpoint that became READY before the given window.

grail/shared/retention_utils.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""Shared checkpoint retention policy utilities.
2+
3+
This module provides a unified retention policy for determining which checkpoint
4+
windows should be kept in both remote storage (publisher) and local cache (consumer).
5+
6+
For chained deltas, retention must keep entire chains from anchor (FULL) to tip.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
from grail.shared.constants import (
12+
CHECKPOINT_MILESTONE_INTERVAL,
13+
DELTA_BASE_INTERVAL,
14+
WINDOW_LENGTH,
15+
)
16+
17+
18+
def compute_retention_windows(
    current_window: int,
    bootstrap_windows: int = 10,
) -> set[int]:
    """Return the set of checkpoint windows that must be kept.

    Chained deltas require whole chains to survive pruning: a miner
    reconstructs state by loading an anchor (FULL checkpoint) and then
    replaying every sequential delta up to the tip, so no link between
    an anchor and the current window may be dropped.

    Retained windows:
    - the active chain (current anchor through ``current_window``)
    - the previous anchor plus its entire chain, so lagging miners can
      still transition off the old chain
    - milestone checkpoints (every ``CHECKPOINT_MILESTONE_INTERVAL``)
    - the first ``bootstrap_windows`` window boundaries (initial state)

    Args:
        current_window: Current window number
        bootstrap_windows: Number of initial windows to always keep (default 10)

    Returns:
        Set of window numbers to retain
    """
    if current_window < 0:
        return set()

    # Bootstrap boundaries are kept unconditionally.
    keep: set[int] = {i * WINDOW_LENGTH for i in range(bootstrap_windows)}

    # Blocks between consecutive FULL (anchor) checkpoints.
    stride = max(1, int(DELTA_BASE_INTERVAL)) * int(WINDOW_LENGTH)

    # Most recent FULL boundary at or before the current window.
    anchor = (current_window // stride) * stride

    # Active chain: every window from the current anchor up to now.
    keep.update(range(anchor, current_window + 1, WINDOW_LENGTH))

    # Previous anchor and its chain, so catching-up miners can transition.
    prev_anchor = anchor - stride
    if prev_anchor >= 0:
        keep.update(range(prev_anchor, anchor, WINDOW_LENGTH))

    # Milestones for long-term preservation, walking back to window 0.
    if CHECKPOINT_MILESTONE_INTERVAL > 0:
        interval = CHECKPOINT_MILESTONE_INTERVAL * WINDOW_LENGTH
        if interval > 0:
            latest = (current_window // interval) * interval
            keep.update(range(0, latest + 1, interval))

    return keep
83+
84+
85+
def get_anchor_window(target_window: int) -> int:
    """Return the nearest preceding FULL-checkpoint (anchor) window.

    Args:
        target_window: The window to find the anchor for

    Returns:
        The anchor window number
    """
    # Anchor stride in blocks; floor target_window to that boundary.
    stride = max(1, int(DELTA_BASE_INTERVAL)) * int(WINDOW_LENGTH)
    return target_window - (target_window % stride)
97+
98+
99+
def is_anchor_window(window: int) -> bool:
    """Tell whether ``window`` falls exactly on a FULL-checkpoint boundary.

    Args:
        window: The window number to check

    Returns:
        True if this window is an anchor window
    """
    # A window is an anchor when it is a multiple of the anchor stride.
    stride = max(1, int(DELTA_BASE_INTERVAL)) * int(WINDOW_LENGTH)
    return not window % stride

0 commit comments

Comments
 (0)