@@ -173,91 +173,6 @@ def fused_linear_cross_entropy_forward_megatron_chunked(
 
     return loss, None, grad_input, grad_weight, grad_bias
 
-def fused_linear_cross_entropy_forward_megatron(
-    _input,
-    weight,
-    target,
-    bias=None,
-    reduction="none",
-):
-    device = _input.device
-    BT, H = _input.shape
-    V = weight.shape[0]
-
-    grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
-    grad_input = torch.zeros_like(_input, device=device)
-    grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
-    # we use fp32 for loss accumulator
-    loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
-
-    # TODO: evaluate how CUDA synchronization caused by .item() affects the speed
-    rank = get_tensor_model_parallel_rank()
-    world_size = get_tensor_model_parallel_world_size()
-    vocab_start, vocab_end = VocabUtility.vocab_range_from_per_partition_vocab_size(V, rank, world_size)
-
-    target_mask = (target < vocab_start) | (target >= vocab_end)
-    adjusted_target = target.clone() - vocab_start  # relative id
-    adjusted_target[target_mask] = 0
-    adjusted_target_1d = adjusted_target.view(-1)
-
-    # input
-    # when doing matmul, use the original precision
-    logits = (_input @ weight.t()).float()  # chunk_size x V
-    if bias is not None:
-        logits = logits + bias
-
-    # # ensure _input and target are contiguous
-    # logits_chunk = logits_chunk.contiguous()  # [chunk_size, vocab_size]
-    # target_chunk = target_chunk.contiguous()  # [chunk_size]
-
-    max_logits = torch.max(logits, dim=-1)[0]
-    torch.distributed.all_reduce(max_logits, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group(), async_op=False)
-    logits = logits - max_logits.unsqueeze(-1)
-
-    sum_exp_logits = torch.sum(torch.exp(logits), dim=-1)
-    torch.distributed.all_reduce(sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=get_tensor_model_parallel_group(), async_op=False)
-
-
-    predicted_logits = logits[torch.arange(BT, device=logits.device), adjusted_target_1d]
-    predicted_logits[target_mask] = 0.0
-    handle_predicted_logits = torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=get_tensor_model_parallel_group(), async_op=True)
-
-    # Compute gradient
-    grad_logits = torch.exp(logits).div_(sum_exp_logits.unsqueeze(-1))
-    grad_logits[torch.arange(BT, device=grad_logits.device), adjusted_target_1d] -= 1.0 - target_mask.float()  # chunk_size x V
-    grad_input = grad_logits.to(dtype=torch.half) @ weight
-    torch.distributed.all_reduce(grad_input, group=get_tensor_model_parallel_group(), async_op=False)
-
-    if grad_weight is not None:
-        torch.addmm(
-            input=grad_weight,
-            mat1=grad_logits.t().to(
-                _input.dtype
-            ),  # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
-            mat2=_input,
-            out=grad_weight,
-            alpha=1.0,
-            beta=1.0,
-        )
-    if bias is not None:
-        torch.add(
-            input=grad_bias,
-            other=grad_logits.sum(dim=0),
-            out=grad_bias,
-            alpha=1.0,
-        )
-    handle_predicted_logits.wait()
-    loss_chunk = torch.log(sum_exp_logits) - predicted_logits
-    loss_1d = loss_chunk
-
-    if reduction == "none":
-        loss = loss_1d
-    else:
-        loss = torch.sum(loss_1d)
-
-    return loss, None, grad_input, grad_weight, grad_bias
-
-
 def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
     # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
     if not torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
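For reference, the removed forward pass assembles the loss from vocabulary-sharded logits with three reductions over the tensor-parallel group: a MAX over each rank's max logits, a SUM over each rank's exp-sums, and a SUM over the masked target logits. The single-process sketch below is not from this PR; it simulates the shards as plain tensor slices (the name vocab_parallel_ce_reference and the two-shard split are illustrative) so the same math can be checked against torch.nn.functional.cross_entropy without torch.distributed.

import torch

def vocab_parallel_ce_reference(logit_shards, target):
    # logit_shards: list of [BT, V_i] tensors whose concatenation along the last
    # dim is the full logits matrix; each entry plays the role of one rank's shard.
    BT = target.shape[0]
    starts, total = [], 0
    for s in logit_shards:
        starts.append(total)
        total += s.shape[1]

    # stands in for all_reduce(MAX): global max over shards, for numerical stability
    max_logits = torch.stack([s.max(dim=-1).values for s in logit_shards]).max(dim=0).values

    sum_exp = torch.zeros(BT)
    predicted = torch.zeros(BT)
    for s, start in zip(logit_shards, starts):
        shifted = s - max_logits.unsqueeze(-1)
        sum_exp += torch.exp(shifted).sum(dim=-1)        # stands in for all_reduce(SUM)
        # a shard contributes the target logit only when the target id falls in
        # its vocab partition; mask-and-accumulate mirrors the masked all_reduce(SUM)
        mask = (target >= start) & (target < start + s.shape[1])
        idx = (target - start).clamp(0, s.shape[1] - 1)  # masked-out rows use a dummy index
        local = shifted[torch.arange(BT), idx]
        predicted += torch.where(mask, local, torch.zeros_like(local))

    return torch.log(sum_exp) - predicted  # per-token loss, reduction="none"

torch.manual_seed(0)
full = torch.randn(4, 10)
target = torch.randint(0, 10, (4,))
loss = vocab_parallel_ce_reference([full[:, :6], full[:, 6:]], target)
ref = torch.nn.functional.cross_entropy(full, target, reduction="none")
assert torch.allclose(loss, ref, atol=1e-5)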
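The removed code also materializes the logits gradient inside the forward pass: grad_logits is softmax(logits) with 1.0 subtracted at the in-partition target index, which is the exact cross-entropy gradient under sum reduction. A minimal autograd check of that identity (again illustrative, not taken from the PR):

import torch

logits = torch.randn(4, 10, requires_grad=True)
target = torch.randint(0, 10, (4,))
torch.nn.functional.cross_entropy(logits, target, reduction="sum").backward()

manual = torch.softmax(logits.detach(), dim=-1)
manual[torch.arange(4), target] -= 1.0  # subtract the one-hot at the target id
assert torch.allclose(logits.grad, manual, atol=1e-6)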