
Commit fc4cfa6

Convert aten.embedding_dense_backward to ttnn.embedding_bw
1 parent e0abc51 commit fc4cfa6

File tree: 3 files changed, +60 -0 lines changed

tests/lowering/embedding/test_embedding.py

Lines changed: 35 additions & 0 deletions

@@ -70,3 +70,38 @@ def test_embedding_tile_layout(device, batch_size, sentence_size, vocabulary_siz
     assert [node.target for node in nodes].count(ttnn.embedding) == 1
     # Check inference result
     assert torch.allclose(result_before, result_after)
+
+
+@pytest.mark.parametrize(
+    "batch, sentence_size, vocabulary_size, hidden_embedding_dim, converted",
+    [
+        (1, 384, 160, 1024, True),
+        (8, 384, 256, 512, True),
+        # TODO(TODO): Vocabulary sizes > 256 are not supported
+        (8, 384, 512, 1024, False),
+    ],
+)
+def test_embedding_backward_tile_layout(device, batch, sentence_size, vocabulary_size, hidden_embedding_dim, converted):
+    m = EmbeddingTileLayoutModule()
+    input = torch.randint(0, vocabulary_size, (batch, sentence_size), dtype=torch.int64)
+    weights = torch.rand((vocabulary_size, hidden_embedding_dim), dtype=torch.bfloat16)
+    grad_data = torch.rand((batch, sentence_size, hidden_embedding_dim))
+
+    weights_before = weights.clone().detach().requires_grad_(True)
+    forward_output = m.forward(input, weights_before)
+    forward_output.backward(gradient=grad_data)
+
+    option = torch_ttnn.TorchTtnnOption(device=device, gen_graphviz=True)
+    # The compilation is lazy, so we need to run forward once to trigger the compilation
+    m = torch.compile(m, backend=torch_ttnn.backend, options=option)
+    weights_after = weights.clone().detach().requires_grad_(True)
+    forward_output = m.forward(input, weights_after)
+    forward_output.backward(gradient=grad_data)
+
+    # Check that the graph has been rewritten
+    nodes = list(option._out_fx_graphs[-1].nodes)
+    assert [node.target for node in nodes].count(ttnn.embedding_bw) == (1 if converted else 0)
+    # Check inference result
+    assert weights_before.grad.shape == weights_after.grad.shape
+    # Multiple float multiplications need a higher tolerance
+    assert torch.allclose(weights_before.grad, weights_after.grad, rtol=0.1)
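Note: EmbeddingTileLayoutModule is defined earlier in this test file, outside the hunk shown above. As a rough orientation only (a hypothetical sketch, not the committed definition), a module of this kind is typically a thin wrapper around torch.nn.functional.embedding, so that tracing its forward produces an aten.embedding node and differentiating it produces the aten.embedding_dense_backward node this commit lowers:

import torch
import torch.nn.functional as F


class EmbeddingTileLayoutModule(torch.nn.Module):
    # Hypothetical sketch for illustration; the committed module lives above the diffed lines.
    def forward(self, input, weights):
        # F.embedding(indices, weight) traces to aten.embedding; the gradient with
        # respect to `weights` is computed by aten.embedding_dense_backward.
        return F.embedding(input, weights)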

torch_ttnn/passes/lowering/add_data_move_pass.py

Lines changed: 2 additions & 0 deletions

@@ -153,9 +153,11 @@ def is_tt_compute(node) -> bool:
         + TTNN_NORM_OPS
         + [
             ttnn.embedding,
+            ttnn.embedding_bw,
             ttnn.ones,
             ttnn.tril,
             ttnn.arange,
+            ttnn.zeros,
             ttnn.zeros_like,
             ttnn.mean,
             ttnn.global_avg_pool2d,

torch_ttnn/passes/lowering/to_tt_pass.py

Lines changed: 23 additions & 0 deletions

@@ -11,6 +11,7 @@
 import numpy as np
 from typing import Tuple
 import torch_ttnn.metrics as metrics
+import math

 from torch.fx.passes.infra.pass_base import PassBase, PassResult
 import torch.fx.traceback as fx_traceback

@@ -623,6 +624,28 @@ def rewrite_node(node):
             input = g.call_function(ttnn.to_layout, args=(input, TtnnRowMajorLayout()))
             return g.call_function(ttnn.pad, args=(input, full_pad, value))

+        if node.target == torch.ops.aten.embedding_dense_backward.default:
+            grad_output, indices, num_weights, padding_idx, scale_grad_by_freq = args
+            # TODO(TODO): padding_idx and scale_grad_by_freq are not supported
+            if padding_idx != -1 or scale_grad_by_freq:
+                return None
+            if num_weights > 256:
+                return None
+            # Change indices to row-major layout to support non-tile-aligned shapes
+            indices = g.call_function(ttnn.to_layout, args=(indices, TtnnRowMajorLayout()))
+            # Reconstruct the weight tensor solely for the vocabulary size
+            grad_shape = grad_output.meta["val"].size()
+            embedding_dim = grad_shape[-1]
+            weights = g.call_function(
+                ttnn.zeros, args=((num_weights, embedding_dim),), kwargs={"device": TtnnDevice()}
+            )
+            # Pack grad_output into (1, 1, x, embedding_dim)
+            new_grad_shape = (1, 1, math.prod(grad_shape[:-1]), embedding_dim)
+            grad_output = g.call_function(ttnn.reshape, args=(grad_output, new_grad_shape))
+
+            result = g.call_function(ttnn.embedding_bw, args=(indices, weights, grad_output))
+            return g.call_function(ttnn.reshape, args=(result, node.meta["val"].size()))
+
     with g.inserting_before(node):
         new_node = rewrite_node(node)
         if new_node is not None: