Skip to content

Commit 8266475

Browse files
authored
Fix view and reshape ops when shape is passed as a kwarg. This essentially (#1426)
consolidates the argument parsing for all the operations.
1 parent 0c83135 commit 8266475

File tree

2 files changed

+176
-29
lines changed

2 files changed

+176
-29
lines changed

physicsnemo/domain_parallel/shard_utils/view_ops.py

Lines changed: 65 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -668,32 +668,51 @@ def sharded_view(tensor: ShardTensor, target_shape: Sequence[int]) -> ShardTenso
668668

669669

670670
# ---------------------------------------------------------------------------
671-
# __torch_function__ handlers
671+
# __torch_function__ handlers: argument repackaging
672672
# ---------------------------------------------------------------------------
673673

674674

675-
def _extract_view_shape(args: tuple[Any, ...]) -> tuple[ShardTensor, tuple[int, ...]]:
676-
r"""Extract tensor and target shape from ``__torch_function__`` args.
677-
678-
Handles both ``x.view(a, b, c)`` and ``x.view((a, b, c))`` calling
679-
conventions.
675+
def _reshape_args(*shape_args: Any) -> tuple[int, ...]:
676+
r"""Normalize shape arguments to a single tuple of ints.
680677
681-
Parameters
682-
----------
683-
args : tuple
684-
Positional arguments from ``__torch_function__``.
685-
686-
Returns
687-
-------
688-
tuple[ShardTensor, tuple[int, ...]]
689-
The input tensor and the target shape.
678+
Handles both a single sequence (e.g. ``(2, 3, 4)``) and variadic ints
679+
(e.g. ``2, 3, 4``) as used by ``Tensor.view`` and ``Tensor.reshape``.
680+
"""
681+
if len(shape_args) == 1 and isinstance(shape_args[0], (tuple, list, torch.Size)):
682+
return tuple(shape_args[0])
683+
return tuple(shape_args)
684+
685+
686+
def extract_view_and_reshape_arguments(
687+
*args: Any, **kwargs: Any
688+
) -> tuple[
689+
ShardTensor,
690+
tuple[int, ...] | None,
691+
torch.dtype | None,
692+
]:
693+
r"""Extract (tensor, shape, dtype) from view/reshape __torch_function__ args.
694+
695+
Used by Tensor.view, Tensor.reshape, torch.reshape, and aten.view.default.
696+
For view(dtype), returns (tensor, None, dtype). Otherwise returns
697+
(tensor, shape, None) with shape normalized to tuple[int, ...].
690698
"""
691699
tensor = args[0]
692-
if len(args) == 2 and isinstance(args[1], (tuple, list, torch.Size)):
693-
shape = tuple(args[1])
694-
else:
695-
shape = tuple(args[1:])
696-
return tensor, shape
700+
# If there is a dtype, catch and exit early:
701+
if len(args) == 2 and isinstance(args[1], torch.dtype):
702+
# Honestly this execution path makes no sense to me ...
703+
return (tensor, None, args[1])
704+
# If it's in kwargs, use that:
705+
shape = kwargs.get("shape", None)
706+
if shape is not None:
707+
return (tensor, shape, None)
708+
# Otherwise, all remaning args get massaged into a tuple:
709+
shape = _reshape_args(*args[1:])
710+
return (tensor, shape, None)
711+
712+
713+
# ---------------------------------------------------------------------------
714+
# __torch_function__ handlers
715+
# ---------------------------------------------------------------------------
697716

698717

699718
def view_wrapper(
@@ -703,9 +722,13 @@ def view_wrapper(
703722
kwargs: dict[str, Any],
704723
) -> ShardTensor:
705724
r"""``__torch_function__`` handler for ``torch.Tensor.view``."""
706-
if len(args) == 2 and isinstance(args[1], torch.dtype):
707-
return _sharded_view_dtype(args[0], args[1])
708-
tensor, shape = _extract_view_shape(args)
725+
tensor, shape, dtype = extract_view_and_reshape_arguments(*args, **kwargs)
726+
if dtype is not None:
727+
return _sharded_view_dtype(tensor, dtype)
728+
if shape is None:
729+
raise ValueError(
730+
"ShardTensor.view_wrapper: Shape is required for view operation"
731+
)
709732
return sharded_view(tensor, shape)
710733

711734

@@ -716,7 +739,15 @@ def reshape_wrapper(
716739
kwargs: dict[str, Any],
717740
) -> ShardTensor:
718741
r"""``__torch_function__`` handler for ``torch.Tensor.reshape``."""
719-
tensor, shape = _extract_view_shape(args)
742+
tensor, shape, dtype = extract_view_and_reshape_arguments(*args, **kwargs)
743+
if dtype is not None:
744+
raise ValueError(
745+
"ShardTensor.reshape_wrapper: Dtype is not supported for reshape operation"
746+
)
747+
if shape is None:
748+
raise ValueError(
749+
"ShardTensor.reshape_wrapper: Shape is required for reshape operation"
750+
)
720751
return sharded_view(tensor, shape)
721752

722753

@@ -727,7 +758,11 @@ def torch_reshape_wrapper(
727758
kwargs: dict[str, Any],
728759
) -> ShardTensor:
729760
r"""``__torch_function__`` handler for ``torch.reshape``."""
730-
tensor, shape = _extract_view_shape(args)
761+
tensor, shape, _ = extract_view_and_reshape_arguments(*args, **kwargs)
762+
if shape is None:
763+
raise ValueError(
764+
"ShardTensor.torch_reshape_wrapper: Shape is required for reshape operation"
765+
)
731766
return sharded_view(tensor, shape)
732767

733768

@@ -789,8 +824,11 @@ def aten_view_wrapper(
789824
ShardTensor
790825
Viewed ShardTensor.
791826
"""
792-
tensor = args[0]
793-
shape = args[1]
827+
tensor, shape, _ = extract_view_and_reshape_arguments(*args, **kwargs)
828+
if shape is None:
829+
raise ValueError(
830+
"ShardTensor.aten_view_wrapper: Shape is required for view operation"
831+
)
794832
return sharded_view(tensor, shape)
795833

796834

test/domain_parallel/ops/test_view_ops.py

Lines changed: 111 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,19 @@ def forward(self, tensor: torch.Tensor):
7676
return tensor.view(self.target_shape)
7777

7878

79+
class ViewVariadicWrapper(torch.nn.Module):
    """Module exercising ``tensor.view(*shape)`` — the variadic-int form."""

    def __init__(self, target_shape: tuple[int, ...]):
        super().__init__()
        # Stored as given; unpacked into individual ints at call time.
        self.target_shape = target_shape

    def forward(self, tensor: torch.Tensor):
        dims = self.target_shape
        return tensor.view(*dims)
88+
89+
7990
class ReshapeWrapper(torch.nn.Module):
80-
"""Wrapper class for testing tensor.reshape operation."""
91+
"""Wrapper class for testing tensor.reshape(shape) with shape as a single tuple."""
8192

8293
def __init__(self, target_shape: tuple[int, ...]):
8394
super().__init__()
@@ -87,8 +98,19 @@ def forward(self, tensor: torch.Tensor):
8798
return tensor.reshape(self.target_shape)
8899

89100

101+
class ReshapeVariadicWrapper(torch.nn.Module):
    """Module exercising ``tensor.reshape(*shape)`` — the variadic-int form."""

    def __init__(self, target_shape: tuple[int, ...]):
        super().__init__()
        # Stored as given; unpacked into individual ints at call time.
        self.target_shape = target_shape

    def forward(self, tensor: torch.Tensor):
        dims = self.target_shape
        return tensor.reshape(*dims)
110+
111+
90112
class TorchReshapeWrapper(torch.nn.Module):
91-
"""Wrapper class for testing torch.reshape operation."""
113+
"""Wrapper class for testing torch.reshape(tensor, shape) with shape as tuple."""
92114

93115
def __init__(self, target_shape: tuple[int, ...]):
94116
super().__init__()
@@ -98,6 +120,28 @@ def forward(self, tensor: torch.Tensor):
98120
return torch.reshape(tensor, self.target_shape)
99121

100122

123+
class TorchReshapeListWrapper(torch.nn.Module):
    """Module exercising ``torch.reshape(tensor, shape)`` with a list shape."""

    def __init__(self, target_shape: tuple[int, ...]):
        super().__init__()
        self.target_shape = target_shape

    def forward(self, tensor: torch.Tensor):
        # Convert to a list at call time to exercise the list-shape path.
        as_list = list(self.target_shape)
        return torch.reshape(tensor, as_list)
132+
133+
134+
class TorchReshapeKwargWrapper(torch.nn.Module):
    """Module exercising ``torch.reshape(tensor, shape=...)`` — shape as kwarg."""

    def __init__(self, target_shape: tuple[int, ...]):
        super().__init__()
        self.target_shape = target_shape

    def forward(self, tensor: torch.Tensor):
        # Keyword form exercises the kwargs path of the handler.
        new_shape = self.target_shape
        return torch.reshape(tensor, shape=new_shape)
143+
144+
101145
class ViewRoundTrip(torch.nn.Module):
102146
"""View to merge last two dims, then view back to the original shape.
103147
@@ -343,6 +387,71 @@ def test_torch_reshape_operation(
343387
)
344388

345389

390+
@pytest.mark.multigpu_static
@pytest.mark.parametrize(
    "wrapper_cls,arg_style",
    [
        (ViewWrapper, "tuple"),
        (ViewVariadicWrapper, "variadic"),
        (ReshapeWrapper, "tuple"),
        (ReshapeVariadicWrapper, "variadic"),
        (TorchReshapeWrapper, "tuple"),
        (TorchReshapeListWrapper, "list"),
        (TorchReshapeKwargWrapper, "kwarg"),
    ],
    ids=[
        "view_tuple",
        "view_variadic",
        "reshape_tuple",
        "reshape_variadic",
        "torch_reshape_tuple",
        "torch_reshape_list",
        "torch_reshape_kwarg",
    ],
)
@pytest.mark.parametrize("backward", [False, True])
def test_view_reshape_argument_permutations(
    distributed_mesh,
    wrapper_cls,
    arg_style,
    backward,
):
    """Exercise every argument permutation of view/reshape on a ShardTensor.

    Covers ``tensor.view(shape)``, ``tensor.view(*shape)``,
    ``tensor.reshape(shape)``, ``tensor.reshape(*shape)``,
    ``torch.reshape(tensor, shape)``, ``torch.reshape(tensor, list(shape))``,
    and ``torch.reshape(tensor, shape=...)``.
    """
    # NOTE: ``arg_style`` is informational only — it mirrors the test ids;
    # the wrapper class itself determines the calling convention.
    if not torch.cuda.is_available():
        pytest.skip("CUDA is not available")

    dm = DistributedManager()

    input_shape = (4, 128, 8, 4)
    target_shape = (4, 128, 32)
    placements = (Shard(1),)

    full_tensor = torch.rand(
        input_shape, device=dm.device, requires_grad=backward
    )

    # Shard the full tensor along dim 1 across the mesh.
    sharded = scatter_tensor(
        full_tensor,
        global_src=0,
        mesh=distributed_mesh,
        placements=placements,
        requires_grad=backward,
    )

    # Compare sharded vs. single-device results (and grads when requested).
    numerical_shard_tensor_check(
        distributed_mesh,
        wrapper_cls(target_shape=target_shape),
        [sharded],
        {},
        check_grads=backward,
    )
453+
454+
346455
@pytest.mark.multigpu_static
347456
@pytest.mark.parametrize("backward", [False, True])
348457
def test_view_shard_on_non_viewed_dim(

0 commit comments

Comments
 (0)