
Commit bbc17ca

Support (un)squeeze in XNN delegate via conversion to view
1 parent 36bdc16 commit bbc17ca

7 files changed: +417 -0 lines
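The core observation behind the change: squeeze and unsqueeze only drop or insert size-1 dimensions and never move data, so with static shapes each one is exactly expressible as a reshape/view. A quick plain-PyTorch illustration (mine, not part of the diff):

import torch

x = torch.randn(1, 2, 1, 4, 1)

# squeeze(x, 0) drops the leading size-1 dim; a view with that dim removed
# yields an identical tensor.
assert torch.equal(torch.squeeze(x, 0), x.view(2, 1, 4, 1))

# unsqueeze inserts a size-1 dim; again just a view of the same data.
y = torch.randn(2, 4)
assert torch.equal(torch.unsqueeze(y, 1), y.view(2, 1, 4))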

backends/xnnpack/_passes/__init__.py

+4 lines

@@ -12,6 +12,9 @@
 from executorch.backends.xnnpack._passes.conv1d_unsqueeze_pass import (
     Conv1dUnsqueezePass,
 )
+from executorch.backends.xnnpack._passes.convert_squeeze_to_view_pass import (
+    ConvertSqueezeToViewPass,
+)
 from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass
 from executorch.backends.xnnpack._passes.convert_to_sdpa import ConvertToSDPAPass
 from executorch.backends.xnnpack._passes.convert_to_upsample_bilinear2d import (
@@ -67,6 +70,7 @@ def __init__(
             DecomposeConcatenate,
             RemoveGetItemPass,
             Conv1dUnsqueezePass,
+            ConvertSqueezeToViewPass,
             PReLUReshapePass,
             ChannelsLastTaggedReshapePass,
             TagImplicitQDqPass,
backends/xnnpack/_passes/convert_squeeze_to_view_pass.py (new file)

+75 lines

@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+from typing import Optional
+
+import torch
+from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
+from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant
+from executorch.backends.xnnpack.utils.utils import check_or_raise, get_param_tensor, is_param_node
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import PassResult
+from torch._ops import OpOverload
+from torch.fx.experimental.symbolic_shapes import has_free_symbols
+
+
+class ConvertSqueezeToViewPass(XNNPACKPass):
+    """
+    This pass is used to convert squeeze and unsqueeze nodes into view_copy.
+    This allows them to be subsequently lowered as static_reshape ops.
+    """
+
+    SUPPORTED_OPS = [
+        exir_ops.edge.aten.squeeze_copy.dim,
+        exir_ops.edge.aten.squeeze_copy.dims,
+        exir_ops.edge.aten.unsqueeze_copy.default,
+    ]
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        node_list = list(graph.nodes)
+        for node in node_list:
+            if node.op == "call_function":
+                if node.target in self.SUPPORTED_OPS:
+                    out_shape = node.meta["val"].shape
+
+                    # Replace up to one dynamic dimension with -1 (inferred dim).
+                    new_shape = []
+                    dynamic_dim_count = 0
+                    for d in out_shape:
+                        if has_free_symbols(d):
+                            new_shape.append(-1)
+                            dynamic_dim_count += 1
+                        else:
+                            new_shape.append(d)
+
+                    # This constraint should be enforced by the partitioner.
+                    check_or_raise(
+                        dynamic_dim_count <= 1,
+                        "XNN supports only one dynamic dimension",
+                    )
+
+                    with graph_module.graph.inserting_after(node):
+                        view_node = graph_module.graph.create_node(
+                            "call_function",
+                            target=exir_ops.edge.aten.view_copy.default,
+                            args=(node.args[0], new_shape),
+                            kwargs=node.kwargs,
+                        )
+
+                        node.replace_all_uses_with(view_node)
+                        graph_module.graph.erase_node(node)
+
+        graph_module.recompile()
+        # Since we are overriding "call", we need to call the parent's "call"
+        # to retrace the graph and regenerate metadata
+        graph_module = super().call(graph_module).graph_module
+
+        return PassResult(graph_module, True)
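Why the pass writes -1 for a dynamic dimension, and why at most one is allowed: view/reshape can infer a single unknown dimension from the total element count, but two unknowns would be ambiguous. A minimal sketch (plain PyTorch, illustrative only):

import torch

x = torch.randn(1, 5, 1, 4)  # pretend dim 1 is dynamic at runtime

# Squeezing dims 0 and 2 gives shape (5, 4). With dim 1 dynamic, the pass
# emits view_copy with target shape [-1, 4]; the -1 is inferred from numel.
assert x.view(-1, 4).shape == (5, 4)

# view() rejects more than one -1, which mirrors the check_or_raise above
# and XNNPACK's static_reshape limit of one dynamic dimension.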

backends/xnnpack/partition/config/__init__.py

+4 lines

@@ -45,7 +45,9 @@
     SliceCopyConfig,
     SoftmaxConfig,
     SquareRootConfig,
+    SqueezeCopyConfig,
     SubConfig,
+    UnsqueezeCopyConfig,
     UpsampleBilinear2dConfig,
     ViewCopyConfig,
 )
@@ -99,7 +101,9 @@
     SliceCopyConfig,
     SoftmaxConfig,
     SquareRootConfig,
+    SqueezeCopyConfig,
     SubConfig,
+    UnsqueezeCopyConfig,
     UpsampleBilinear2dConfig,
     ViewCopyConfig,
     # Quant/Dequant Op Configs

backends/xnnpack/partition/config/generic_node_configs.py

+53 lines

@@ -21,6 +21,7 @@
 )
 from executorch.exir.backend.utils import is_shape_dynamic, WhyNoPartition
 from torch.export import ExportedProgram
+from torch.fx.experimental.symbolic_shapes import has_free_symbols

 logger = logging.getLogger(__name__)
 why = WhyNoPartition(logger=logger)

@@ -314,6 +315,31 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
         return torch.ops.aten.max_pool2d.default


+class SqueezeCopyConfig(GenericNodePartitionerConfig):
+    target_name = "squeeze_copy.dims"
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
+
+    def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
+        return torch.ops.aten.squeeze_copy.default
+
+    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
+        """
+        XNNPACK's static_reshape only supports 1 dynamic dimension
+        """
+        if not self.check_common_constraints(node, ep):
+            return False
+
+        new_shape = node.meta["val"].shape
+        dynamic_dim_count = sum(1 for d in new_shape if has_free_symbols(d))
+        if dynamic_dim_count > 1:
+            why(node, reason="only a single dynamic dimension is supported")
+            return False
+
+        return True
+
+
 class UpsampleBilinear2dConfig(GenericNodePartitionerConfig):
     target_name = "upsample_bilinear2d.vec"

@@ -336,6 +362,33 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
         return torch.ops.aten.upsample_bilinear2d.vec


+class UnsqueezeCopyConfig(GenericNodePartitionerConfig):
+    target_name = "unsqueeze_copy.default"
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
+
+    def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
+        return torch.ops.aten.unsqueeze_copy.default
+
+    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
+        """
+        XNNPACK's static_reshape only supports 1 dynamic dimension
+        """
+        if not self.check_common_constraints(node, ep):
+            return False
+
+        new_shape = node.meta["val"].shape
+        dynamic_dim_count = sum(
+            1 for d in new_shape if not isinstance(d, int) and has_free_symbols(d)
+        )
+        if dynamic_dim_count > 1:
+            why(node, reason="only a single dynamic dimension is supported")
+            return False
+
+        return True
+
+
 class ViewCopyConfig(GenericNodePartitionerConfig):
     target_name = "view_copy.default"
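The constraint check counts symbolic dimensions on the node's fake-tensor output. A rough standalone sketch of the same counting, assuming the standard torch.export API (the module M and dim name "b" are mine, not from the commit):

import torch
from torch.export import Dim, export
from torch.fx.experimental.symbolic_shapes import has_free_symbols

class M(torch.nn.Module):
    def forward(self, x):
        return torch.unsqueeze(x, 0)

ep = export(M(), (torch.randn(2, 4),), dynamic_shapes={"x": {0: Dim("b", min=2, max=8)}})

for node in ep.graph.nodes:
    if node.op == "call_function":
        shape = node.meta["val"].shape
        # Ints are static; SymInts carrying free symbols are dynamic.
        dynamic = sum(1 for d in shape if not isinstance(d, int) and has_free_symbols(d))
        print(node.target, tuple(shape), "dynamic dims:", dynamic)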

backends/xnnpack/test/ops/test_squeeze.py (new file)

+107 lines

@@ -0,0 +1,107 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.xnnpack.test.tester import Export, Tester
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export import Dim
+
+
+class TestSqueeze(unittest.TestCase):
+    class Squeeze(torch.nn.Module):
+        def __init__(self, dims):
+            super().__init__()
+            self.dims = dims
+
+        def forward(self, x):
+            return torch.squeeze(x, self.dims)
+
+    def test_fp32_squeeze(self):
+        inputs = (torch.randn(1, 2, 1, 4, 1),)
+        squeeze_dims = (0, 2, 4)
+
+        for dims in squeeze_dims:
+            (
+                Tester(self.Squeeze(dims), inputs)
+                .export()
+                .check_node_count({
+                    torch.ops.aten.squeeze.dim: 1,
+                })
+                .to_edge_transform_and_lower()
+                .check_node_count({
+                    exir_ops.edge.aten.squeeze_copy.dim: 0,
+                    exir_ops.edge.aten.view_copy.default: 0,
+                    torch.ops.higher_order.executorch_call_delegate: 1,
+                })
+                .run_method_and_compare_outputs()
+            )
+
+    def test_fp16_squeeze(self):
+        inputs = (torch.randn(1, 2, 1, 4, 1).to(torch.float16),)
+        squeeze_dims = (0, 2, 4)
+
+        for dims in squeeze_dims:
+            (
+                Tester(self.Squeeze(dims), inputs)
+                .export()
+                .check_node_count({
+                    torch.ops.aten.squeeze.dim: 1,
+                })
+                .to_edge_transform_and_lower()
+                .check_node_count({
+                    exir_ops.edge.aten.squeeze_copy.dim: 0,
+                    exir_ops.edge.aten.view_copy.default: 0,
+                    torch.ops.higher_order.executorch_call_delegate: 1,
+                })
+                .run_method_and_compare_outputs()
+            )
+
+    def test_fp32_squeeze_dynamic(self):
+        inputs = (torch.randn(1, 2, 1, 4, 1),)
+        squeeze_dims = (0, 2, 4)
+        dynamic_shapes = {"x": {1: Dim("x_1", min=1, max=10)}}
+
+        for dims in squeeze_dims:
+            (
+                Tester(self.Squeeze(dims), inputs)
+                .export(Export(dynamic_shapes=dynamic_shapes))
+                .check_node_count({
+                    torch.ops.aten.squeeze.dim: 1,
+                })
+                .to_edge_transform_and_lower()
+                .check_node_count({
+                    exir_ops.edge.aten.squeeze_copy.dim: 0,
+                    exir_ops.edge.aten.view_copy.default: 0,
+                    torch.ops.higher_order.executorch_call_delegate: 1,
+                })
+                .run_method_and_compare_outputs()
+            )
+
+    def test_fp32_squeeze_unsupported_dynamism(self):
+        inputs = (torch.randn(1, 2, 1, 4, 1),)
+        squeeze_dims = (0, 2, 4)
+        # Only one dynamic dimension is supported.
+        dynamic_shapes = {
+            "x": {
+                1: Dim("x_1", min=1, max=10),
+                3: Dim("x_3", min=1, max=10),
+            }
+        }
+
+        for dims in squeeze_dims:
+            (
+                Tester(self.Squeeze(dims), inputs)
+                .export(Export(dynamic_shapes=dynamic_shapes))
+                .check_node_count({
+                    torch.ops.aten.squeeze.dim: 1,
+                })
+                .to_edge_transform_and_lower()
+                .check_node_count({
+                    exir_ops.edge.aten.squeeze_copy.dims: 1,
+                    torch.ops.higher_order.executorch_call_delegate: 0,
+                })
+                .run_method_and_compare_outputs()
+            )
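For reference, a sketch of the flow these tests drive, written against the standard ExecuTorch entry points rather than the Tester harness (the exact calls are my assumption, not shown in this diff):

import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower

class Squeeze(torch.nn.Module):
    def forward(self, x):
        return torch.squeeze(x, 0)

ep = torch.export.export(Squeeze(), (torch.randn(1, 2, 4),))
edge = to_edge_transform_and_lower(ep, partitioner=[XnnpackPartitioner()])

# After lowering, the squeeze should be absorbed into the delegate payload:
# expect an executorch_call_delegate node and no squeeze_copy/view_copy.
print(edge.exported_program().graph_module.code)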
backends/xnnpack/test/ops/test_unsqueeze.py (new file)

+101 lines

@@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.xnnpack.test.tester import Export, Tester
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export import Dim
+
+
+class TestUnsqueeze(unittest.TestCase):
+    class Unsqueeze(torch.nn.Module):
+        def __init__(self, dim):
+            super().__init__()
+            self.dim = dim
+
+        def forward(self, x):
+            return torch.unsqueeze(x, self.dim)
+
+    def test_fp32_unsqueeze(self):
+        inputs = (torch.randn(1, 2, 4),)
+        for dim in range(len(inputs[0].shape)):
+            (
+                Tester(self.Unsqueeze(dim), inputs)
+                .export()
+                .check_node_count({
+                    torch.ops.aten.unsqueeze.default: 1,
+                })
+                .to_edge_transform_and_lower()
+                .check_node_count({
+                    exir_ops.edge.aten.unsqueeze_copy.default: 0,
+                    exir_ops.edge.aten.view_copy.default: 0,
+                    torch.ops.higher_order.executorch_call_delegate: 1,
+                })
+                .run_method_and_compare_outputs()
+            )
+
+    def test_fp16_unsqueeze(self):
+        inputs = (torch.randn(1, 2, 4).to(torch.float16),)
+        for dim in range(len(inputs[0].shape)):
+            (
+                Tester(self.Unsqueeze(dim), inputs)
+                .export()
+                .check_node_count({
+                    torch.ops.aten.unsqueeze.default: 1,
+                })
+                .to_edge_transform_and_lower()
+                .check_node_count({
+                    exir_ops.edge.aten.unsqueeze_copy.default: 0,
+                    exir_ops.edge.aten.view_copy.default: 0,
+                    torch.ops.higher_order.executorch_call_delegate: 1,
+                })
+                .run_method_and_compare_outputs()
+            )
+
+    def test_fp32_unsqueeze_dynamic(self):
+        inputs = (torch.randn(1, 2, 4),)
+        dynamic_shapes = {"x": {1: Dim("x_1", min=1, max=10)}}
+
+        for dim in range(len(inputs[0].shape)):
+            (
+                Tester(self.Unsqueeze(dim), inputs)
+                .export(Export(dynamic_shapes=dynamic_shapes))
+                .check_node_count({
+                    torch.ops.aten.unsqueeze.default: 1,
+                })
+                .to_edge_transform_and_lower()
+                .check_node_count({
+                    exir_ops.edge.aten.unsqueeze_copy.default: 0,
+                    exir_ops.edge.aten.view_copy.default: 0,
+                    torch.ops.higher_order.executorch_call_delegate: 1,
+                })
+                .run_method_and_compare_outputs()
+            )
+
+    def test_fp32_unsqueeze_unsupported_dynamism(self):
+        inputs = (torch.randn(1, 2, 4),)
+        # Only one dynamic dimension is supported.
+        dynamic_shapes = {
+            "x": {
+                1: Dim("x_1", min=1, max=10),
+                2: Dim("x_2", min=1, max=10),
+            }
+        }
+
+        for dim in range(len(inputs[0].shape)):
+            (
+                Tester(self.Unsqueeze(dim), inputs)
+                .export(Export(dynamic_shapes=dynamic_shapes))
+                .check_node_count({
+                    torch.ops.aten.unsqueeze.default: 1,
+                })
+                .to_edge_transform_and_lower()
+                .check_node_count({
+                    exir_ops.edge.aten.unsqueeze_copy.default: 1,
+                    torch.ops.higher_order.executorch_call_delegate: 0,
+                })
+                .run_method_and_compare_outputs()
+            )
