Commit 5ecdc42

ydwu4 authored and pytorchmergebot committed
[while_loop][inductor] support sym expression as cond_fn output (pytorch#146222)
As titled. Previously we only supported a tensor output from cond_fn; this PR also allows a shape expression to be returned from cond_fn. The AOTI-generated output code looks like:

```
V0203 11:28:05.750000 2611693 torch/_inductor/compile_fx.py:1091] [1/0] [__output_code] bool buf7_cond_result;
.... (while_loop_cond_graph_0_arg2_1_handle);
V0203 11:27:59.336000 2611693 torch/_inductor/compile_fx.py:1091] [1/0] [__output_code] buf7_cond_result = u0 + u1 < 10L;
V0203 11:27:59.336000 2611693 torch/_inductor/compile_fx.py:1091] [1/0] [__output_code] if (!buf7_cond_result) break;
```

Pull Request resolved: pytorch#146222
Approved by: https://github.com/desertfire
ghstack dependencies: pytorch#146194, pytorch#146195
1 parent 1b879fd commit 5ecdc42
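
For readers who want to try the new behavior, below is a minimal usage sketch adapted from the SymExprCond test model added in test/inductor/test_control_flow.py in this commit. The torch.compile entry point, the dynamo config flags, and the zero-dim counter tensor `c` are assumptions drawn from the test decorators and the prepend_counters helper used by the test suite, not something this commit prescribes; the point is only that cond_fn returns a sym expression rather than a boolean scalar tensor.

```python
import torch

# Adapted from the SymExprCond model added in test_control_flow.py below.
class SymExprCond(torch.nn.Module):
    def forward(self, c, a, b):
        d = a.sum().to(torch.int64).item()  # unbacked symint from .item()
        e = torch.nonzero(b).size(0)        # unbacked symint from a data-dependent shape

        def cond_fn(c, a, b):
            # The predicate is a sym expression, not a boolean scalar tensor.
            return d + e + a.shape[0] - b.shape[0] < 10

        def body_fn(c, a, b):
            return c + 1, a + e, b + d

        return torch._higher_order_ops.while_loop(cond_fn, body_fn, [c, a, b])

# The config flags mirror the decorators on the new test; the zero-dim counter `c`
# is an assumption standing in for what prepend_counters builds in the test suite.
with torch._dynamo.config.patch(
    {"capture_scalar_outputs": True, "capture_dynamic_output_shape_ops": True}
):
    compiled = torch.compile(SymExprCond())
    c = torch.zeros((), dtype=torch.int64)
    out = compiled(c, torch.randn(10, 20), torch.randn(10, 20))
```

With the cpp wrapper (AOTI), the predicate then lowers to a plain bool assignment like the buf7_cond_result lines in the log above, instead of extracting a scalar from a boolean tensor.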

File tree

5 files changed (+79, -13 lines)

test/inductor/test_aot_inductor.py (+20)

@@ -1511,6 +1511,26 @@ def test_while_loop_with_unbacked_symint_closure(self, dynamic):
             dynamic_shapes=dynamic_shapes,
         )
 
+    @common_utils.parametrize("dynamic", [False, True])
+    def test_while_loop_with_sym_expr_cond(self, dynamic):
+        inputs = (
+            torch.randn(10, 20, device=self.device),
+            torch.randn(10, 20, device=self.device),
+        )
+        dim0_ab = Dim("s0", min=2, max=1024)
+        dynamic_shapes = None
+        if dynamic:
+            dynamic_shapes = {
+                "c": {},
+                "a": {0: dim0_ab, 1: None},
+                "b": {0: dim0_ab, 1: None},
+            }
+        self.check_model_with_multiple_inputs(
+            WhileLoopModels.SymExprCond(),
+            prepend_counters(inputs),
+            dynamic_shapes=dynamic_shapes,
+        )
+
     @config.patch({"is_predispatch": True})
     def test_constant(self):
         class M(torch.nn.Module):

test/inductor/test_control_flow.py (+34)

@@ -876,6 +876,23 @@ def body_fn(c, a, b):
                 [c, a, b],
             )
 
+    class SymExprCond(torch.nn.Module):
+        def forward(self, c, a, b):
+            d = a.sum().to(torch.int64).item()
+            e = torch.nonzero(b).size(0)
+
+            def cond_fn(c, a, b):
+                return d + e + a.shape[0] - b.shape[0] < 10
+
+            def body_fn(c, a, b):
+                return c + 1, a + e, b + d
+
+            return torch._higher_order_ops.while_loop(
+                cond_fn,
+                body_fn,
+                [c, a, b],
+            )
+
 
 class WhileLoopTests(TestCase):
     def _run_test(

@@ -1139,6 +1156,23 @@ def test_while_loop_with_unbacked_symint_closure(self, device, dynamic):
             dynamic=dynamic,
         )
 
+    @requires_gpu
+    @parametrize("device", ["cpu", GPU_TYPE])
+    @parametrize("dynamic", [True, False])
+    @torch._dynamo.config.patch(
+        {"capture_scalar_outputs": True, "capture_dynamic_output_shape_ops": True}
+    )
+    def test_while_loop_with_sym_expr_cond(self, device, dynamic):
+        self._run_test(
+            model=WhileLoopModels.SymExprCond(),
+            inputs=(
+                torch.randn(10, 20),
+                torch.randn(10, 20),
+            ),
+            device=device,
+            dynamic=dynamic,
+        )
+
 
 class AssociativeScanTests(TestCase):
     @requires_gpu

torch/_inductor/codegen/cpp_wrapper_cpu.py (+20, -10)

@@ -1594,13 +1594,14 @@ def codegen_subgraph_suffix(self, subgraph, outer_inputs, outer_outputs):
             subgraph.graph.graph_outputs, outer_outputs
         ):
             src = inner_output.codegen_reference()
-            # in ABI-compatible mode, we need to std::move subgraph output (inner_output)
-            # to the conditional output (outer_output), as RAIIAtenTensorHandle's copy
-            # constructor is deleted.
-            src = f"std::move({src})"
-            # in case the outer_output carried a value
-            # before (e.g., in the while_loop codegen)
-            self.writeline(f"{outer_output}.reset();")
+            if not isinstance(inner_output, ir.ShapeAsConstantBuffer):
+                # in ABI-compatible mode, we need to std::move subgraph output (inner_output)
+                # to the conditional output (outer_output), as RAIIAtenTensorHandle's copy
+                # constructor is deleted.
+                src = f"std::move({src})"
+                # in case the outer_output carried a value
+                # before (e.g., in the while_loop codegen)
+                self.writeline(f"{outer_output}.reset();")
             self.writeline(f"{outer_output} = {src};")
 
     def codegen_invoke_subgraph(self, invoke_subgraph):

@@ -1662,6 +1663,9 @@ def codegen_subgraph(self, subgraph, outer_inputs, outer_outputs):
         self.pop_codegened_graph()
 
     def codegen_while_loop(self, while_loop):
+        is_bool_pred = isinstance(
+            while_loop.cond_subgraph.graph.graph_outputs[0], ir.ShapeAsConstantBuffer
+        )
         name = while_loop.get_name()
         outer_carried_inputs = [
             buf.codegen_reference() for buf in while_loop.carried_inputs

@@ -1670,7 +1674,10 @@ def codegen_while_loop(self, while_loop):
             buf.codegen_reference() for buf in while_loop.additional_inputs
         ]
         cond_result_name = f"{name}_cond_result"
-        self.writeline(f"RAIIAtenTensorHandle {cond_result_name};")
+        if is_bool_pred:
+            self.writeline(f"bool {cond_result_name};")
+        else:
+            self.writeline(f"RAIIAtenTensorHandle {cond_result_name};")
 
         cond_outer_inputs = []
         for inp, out in zip(outer_carried_inputs, while_loop.outputs):

@@ -1700,8 +1707,11 @@ def codegen_while_loop(self, while_loop):
             while_loop.cond_subgraph, cond_outer_inputs, cond_outer_outputs
         )
 
-        cond_result = f"{cond_result_name}_scalar"
-        self.codegen_tensor_item(torch.bool, cond_result_name, cond_result)
+        if is_bool_pred:
+            cond_result = f"{cond_result_name}"
+        else:
+            cond_result = f"{cond_result_name}_scalar"
+            self.codegen_tensor_item(torch.bool, cond_result_name, cond_result)
         self.writeline(f"if (!{cond_result}) break;")
 
         self.writeline(ExitSubgraphLine(self))

torch/_inductor/codegen/wrapper.py (+1, -1)

@@ -2551,7 +2551,7 @@ def codegen_while_loop(self, while_loop):
             while_loop.cond_subgraph, cond_outer_inputs, cond_outer_outputs
         )
         self.writeline(
-            f"if not {cond_outer_outputs[0]}.item(): break"
+            f"if not {cond_outer_outputs[0]}: break"
        )  # condition doesn't hold
         self.writeline(ExitSubgraphLine(self))
         self.writeline(EnterSubgraphLine(self, while_loop.body_subgraph.graph))

torch/_inductor/ir.py (+4, -2)

@@ -7404,8 +7404,10 @@ def create( # type: ignore[no-untyped-def]
 
         # make sure cond_fn returns a boolean scalar Tensor
         assert len(cond_outputs) == 1, cond_outputs
-        assert cond_outputs[0].get_dtype() == torch.bool, cond_outputs
-        assert len(cond_outputs[0].get_size()) == 0, cond_outputs
+        p = cond_outputs[0]
+        if not isinstance(p, ShapeAsConstantBuffer):
+            assert p.get_dtype() == torch.bool, p
+            assert len(p.get_size()) == 0, p
 
         assert (
             len(all_inputs) > 0
