Fresh Multi-GPU device consistency fixes for fused kernels

Vinayyyy7 · Vinayyyy7 · commit d9c1b3e4bf47 · 2026-02-18T18:28:52.000+05:30
diff --git a/unsloth_zoo/fused_losses/cross_entropy_loss.py b/unsloth_zoo/fused_losses/cross_entropy_loss.py
@@ -172,6 +172,15 @@ def forward(
         device = lm_head_weight.device
         if extra_kwargs is None: extra_kwargs = {}
 
+        # Fix for multi-GPU: ensure all tensors are on the same device for computation
+        # torch.func.grad_and_value fails when tensors are on different devices
+        # BUT we must return gradients on the ORIGINAL device of hidden_states
+        original_hidden_states_device = hidden_states.device
+        if hidden_states.device != device:
+            hidden_states = hidden_states.to(device)
+        if labels.device != device:
+            labels = labels.to(device)
+
         # Get shifted labels first
         if shift_labels:
             _labels = torch.empty_like(labels, device = device)
@@ -328,6 +337,7 @@ def accumulate_chunk(
         pass
         ctx.save_for_backward(grad_inputs, grad_lm_head, grad_lm_head_bias)
         ctx.scaling = scaling
+        ctx.original_hidden_states_device = original_hidden_states_device
         return accumulated_loss
     pass
 
@@ -338,6 +348,10 @@ def backward(ctx, grad_output,):
             scaling = ctx.scaling if ctx.scaling is not None else 1.0
             torch._assert(torch.all(grad_output == scaling), f"Fused losses expect grad_output to be all {scaling}, but got {grad_output.ravel()[:10]}")
         (grad_inputs, grad_lm_head, grad_lm_head_bias, ) = ctx.saved_tensors
+        # Fix for multi-GPU: return gradients on the ORIGINAL device of hidden_states
+        original_device = ctx.original_hidden_states_device
+        if grad_inputs.device != original_device:
+            grad_inputs = grad_inputs.to(original_device)
         return (None, grad_inputs, grad_lm_head, grad_lm_head_bias, None, None, None, None, None, None, None, None, None,)
     pass
 pass
diff --git a/unsloth_zoo/rl_replacements.py b/unsloth_zoo/rl_replacements.py
@@ -83,7 +83,8 @@ def chunked_hidden_states_selective_log_softmax(
     all_per_token_logps = []
 
     for chunk_hidden_states, chunk_index in zip(chunked_hidden_states, chunked_index):
-        chunk_logits = chunk_hidden_states.to(lm_head.dtype) @ lm_head.t()
+        # Fix for multi-GPU: move the hidden-state chunk onto lm_head's device before the matmul
+        chunk_logits = chunk_hidden_states.to(lm_head.device).to(lm_head.dtype) @ lm_head.t()
 
         if logit_scale_multiply != 0.0:
             chunk_logits = chunk_logits * logit_scale_multiply