modular/max/nn/conv_transpose.py at main · avioligo/modular · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
# ===----------------------------------------------------------------------=== #
# Copyright (c) 2025, Modular Inc. All rights reserved.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions:
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #

from dataclasses import dataclass
from typing import Union

from max.dtype import DType
from max.graph import DeviceRef, TensorValue, Weight, ops
from max.graph.type import ConvInputLayout, FilterLayout

from .layer import Module


@dataclass
class ConvTranspose1d(Module):
    """1D transposed convolution layer built on ``ops.conv2d_transpose``.

    The layer inserts a singleton "height" axis into both the input and the
    filter, runs the 2D transposed convolution, and squeezes that axis out of
    the result again.

    Example:
        .. code-block:: python

            conv = nn.ConvTranspose1d(
                length=kernel_size,
                in_channels=in_channels,
                out_channels=out_channels,
                dtype=dtype,
                stride=stride,
                padding=padding,
                output_padding=output_padding,
                has_bias=False,
                name="conv_transpose1d",
                device=DeviceRef.GPU(),
            )
    """

    device: Union[DeviceRef, None]
    """Device the weight and bias are declared on; CPU is used when this is ``None``."""

    weight: Weight
    """The filter weight.
    Shape is (kernel_length, out_channels, in_channels), or
    (in_channels, out_channels, kernel_length) when :obj:`permute` is set."""

    stride: tuple[int, int]
    """Stride pair handed to ``ops.conv2d_transpose``. An int ``s`` is stored as ``(1, s)``."""

    padding: tuple[int, int, int, int]
    """Four-element padding handed to ``ops.conv2d_transpose``.
    An int ``p`` is stored as ``(0, 0, p, p)``."""

    dilation: tuple[int, int]
    """Dilation pair handed to ``ops.conv2d_transpose``. An int ``d`` is stored as ``(1, d)``.
    NOTE: the original author marked dilation as not implemented yet (dilation = 1 assumed)."""

    output_padding: tuple[int, int]
    """Output padding pair handed to ``ops.conv2d_transpose``.
    An int ``o`` is stored as ``(0, o)``. Default: 0"""

    permute: bool
    """Selects the PyTorch-style layouts for input, weight and output.
    PyTorch weight order is: (in_channels, out_channels, kernel_length)
    Max API weight order is: (kernel_length, out_channels, in_channels)."""

    bias: Union[Weight, None] = None
    """The optional bias vector with shape (out_channels,).
    Only created when the layer is constructed with ``has_bias=True``."""

    def __init__(
        self,
        length: int,
        in_channels: int,
        out_channels: int,
        dtype: DType,
        stride: Union[int, tuple[int, int]] = 1,
        padding: Union[int, tuple[int, int, int, int]] = 0,
        dilation: Union[int, tuple[int, int]] = 1,
        output_padding: Union[int, tuple[int, int]] = 0,
        device: Union[DeviceRef, None] = None,
        has_bias: bool = False,
        permute: bool = False,
        name: Union[str, None] = None,
    ) -> None:
        """Initializes the ConvTranspose1d layer with its weight and optional bias.

        Args:
            length: Length of the convolution kernel.
            in_channels: Number of channels in the input.
            out_channels: Number of channels in the output.
            dtype: The data type for both weight and bias.
            stride: Stride of the convolution. An int ``s`` becomes
                ``(1, s)``. Default: 1
            padding: Padding of the input. An int ``p`` becomes
                ``(0, 0, p, p)``. Default: 0
            dilation: Spacing between kernel elements. An int ``d`` becomes
                ``(1, d)``. Default: 1
            output_padding: Additional size added to the output. An int ``o``
                becomes ``(0, o)``. Default: 0
            device: The device the weight and bias are declared on. CPU is
                used when this is :obj:`None`.
            has_bias: When :obj:`True`, adds a bias vector to the layer.
                Defaults to :obj:`False`.
            permute: When :obj:`True`, the weight is declared in PyTorch order
                (in_channels, out_channels, length) instead of
                (length, out_channels, in_channels).
            name: Base name for weights (appended with ``.weight`` and
                ``.bias`` if applicable).

        Raises:
            ValueError: If the created weight carries a quantization encoding.
        """
        super().__init__()

        self.device = device
        self.permute = permute

        # Only the axis order of the filter depends on the layout flag.
        if self.permute:
            weight_shape = [in_channels, out_channels, length]
        else:
            weight_shape = [length, out_channels, in_channels]

        self.weight = Weight(
            name=f"{name}.weight" if name else "weight",
            dtype=dtype,
            shape=weight_shape,
            device=self.device or DeviceRef.CPU(),
        )

        if has_bias:
            self.bias = Weight(
                name=f"{name}.bias" if name else "bias",
                dtype=dtype,
                shape=(out_channels,),
                device=self.device or DeviceRef.CPU(),
            )

        # ops.conv2d_transpose is called with tuples, so scalar settings are
        # widened here; the extra leading entries cover the singleton height
        # axis that __call__ inserts.
        self.stride = (1, stride) if isinstance(stride, int) else stride
        self.output_padding = (
            (0, output_padding)
            if isinstance(output_padding, int)
            else output_padding
        )
        self.padding = (
            (0, 0, padding, padding) if isinstance(padding, int) else padding
        )
        self.dilation = (1, dilation) if isinstance(dilation, int) else dilation

        is_quantized = (
            isinstance(self.weight, Weight)
            and self.weight.quantization_encoding is not None
        )
        if is_quantized:
            raise ValueError(
                "ConvTranspose1d not implemented with weight quantization."
            )

    def __call__(self, x: TensorValue) -> TensorValue:
        """Applies the 1D transposed convolution to ``x``.

        Args:
            x: The input tensor. With ``permute=False`` the expected shape is
                (batch_size, length, in_channels); with ``permute=True`` it is
                (batch_size, in_channels, length).

        Returns:
            A tensor of shape (batch_size, new_length, out_channels), or
            (batch_size, out_channels, new_length) when ``permute=True``.
        """
        if self.permute:
            # PyTorch / cuDNN layout:
            #   input  (batch, in_channels, length)
            #       -> (batch, in_channels, 1, length)
            #   filter (in_channels, out_channels, kernel_length)
            #       -> (in_channels, out_channels, 1, kernel_length)
            x = ops.unsqueeze(x, 2)
            kernel = ops.unsqueeze(self.weight, 2)
            input_layout = ConvInputLayout.NCHW
            filter_layout = FilterLayout.CFRS
            height_axis = 2
        else:
            # MAX layout:
            #   input  (batch, length, in_channels)
            #       -> (batch, 1, length, in_channels)
            #   filter (kernel_length, out_channels, in_channels)
            #       -> (1, kernel_length, out_channels, in_channels)
            x = ops.unsqueeze(x, 1)
            kernel = ops.unsqueeze(self.weight, 0)
            input_layout = ConvInputLayout.NHWC
            filter_layout = FilterLayout.RSCF
            height_axis = 1

        out = ops.conv2d_transpose(
            x=x,
            filter=kernel,
            stride=self.stride,
            dilation=self.dilation,
            padding=self.padding,
            output_paddings=self.output_padding,
            bias=self.bias,
            input_layout=input_layout,
            filter_layout=filter_layout,
        )

        # Remove the singleton height axis that was inserted above.
        return ops.squeeze(out, height_axis)


class WeightNormConvTranspose1d(Module):
    """A 1D transposed convolution whose filter is weight-normalized.

    Weight normalization (https://arxiv.org/abs/1602.07868) reparameterizes
    the filter as ``w = g * v / ||v||``, with a direction tensor ``v`` and a
    per-input-channel magnitude ``g``. This can help optimization by
    decoupling the length and the direction of the weight vectors.

    The layer owns ``weight_g``, ``weight_v`` and the optional ``bias``; the
    wrapped :obj:`ConvTranspose1d` only receives the normalized filter the
    first time the layer is called.

    Example:
        .. code-block:: python

            conv = WeightNormConvTranspose1d(
                length=kernel_size,
                in_channels=in_channels,
                out_channels=out_channels,
                dtype=dtype,
                stride=stride,
                padding=padding,
                output_padding=output_padding,
                has_bias=False,
                device=DeviceRef.GPU(),
            )
    """

    device: Union[DeviceRef, None]
    """The device the weights are declared on; CPU is used when ``None``."""

    conv: ConvTranspose1d
    """The underlying ConvTranspose1d layer"""

    weight_g: Weight
    """The magnitude parameter g, with shape (in_channels, 1, 1)."""

    weight_v: Weight
    """The direction parameter v, with the same layout as the conv filter."""

    def __init__(
        self,
        length: int,
        in_channels: int,
        out_channels: int,
        dtype: DType,
        stride: Union[int, tuple[int, int]] = 1,
        padding: Union[int, tuple[int, int, int, int]] = 0,
        dilation: Union[int, tuple[int, int]] = 1,
        output_padding: Union[int, tuple[int, int]] = 0,
        device: Union[DeviceRef, None] = None,
        has_bias: bool = False,
        permute: bool = False,
        name: Union[str, None] = None,
    ) -> None:
        """Initializes the WeightNormConvTranspose1d layer.

        Args:
            length: The length of the convolution kernel
            in_channels: Number of channels in the input
            out_channels: Number of channels produced by the convolution
            dtype: The data type for weights and bias
            stride: Stride of the convolution. Default: 1
            padding: Padding added to input. Default: 0
            dilation: Spacing between kernel elements. Default: 1
            output_padding: Additional size added to output shape. Default: 0
            device: The device the weights are declared on. CPU is used when
                this is None
            has_bias: When True, adds a bias vector. Default: False
            permute: When True, ``weight_v`` uses the PyTorch order
                (in_channels, out_channels, length) instead of
                (length, out_channels, in_channels)
            name: Base name for weights (appended with ``.weight_g``,
                ``.weight_v`` and, through the wrapped layer, ``.bias``)
        """
        super().__init__()

        # The class declares ``device``; store it so the attribute exists.
        self.device = device

        # Build the wrapped layer; its own weight is discarded below.
        self.conv = ConvTranspose1d(
            length=length,
            in_channels=in_channels,
            out_channels=out_channels,
            dtype=dtype,
            stride=stride,
            padding=padding,
            dilation=dilation,
            output_padding=output_padding,
            device=device,
            has_bias=has_bias,
            permute=permute,
            name=name,
        )

        # Magnitude g - the shape matches PyTorch's weight_g.
        self.weight_g = Weight(
            name=f"{name}.weight_g" if name else "weight_g",
            dtype=dtype,
            shape=(in_channels, 1, 1),
            device=device or DeviceRef.CPU(),
        )

        # Direction v - same layout as the filter of the wrapped layer.
        if permute:
            v_shape = (in_channels, out_channels, length)
        else:
            v_shape = (length, out_channels, in_channels)

        self.weight_v = Weight(
            name=f"{name}.weight_v" if name else "weight_v",
            dtype=dtype,
            shape=v_shape,
            device=device or DeviceRef.CPU(),
        )

        # Take over the bias (None when has_bias is False).
        self.bias = self.conv.bias

        # Drop the wrapped layer's own parameters; __call__ re-attaches the
        # normalized filter and the bias.
        del self.conv.weight
        if has_bias:
            # Without a bias the wrapped layer never set an instance
            # attribute, so there is nothing to delete; an unconditional
            # ``del`` would raise AttributeError in that case.
            del self.conv.bias

    def __call__(self, x: TensorValue) -> TensorValue:
        """Apply the weight normalized convolution to input x.

        On the first call the normalized filter ``g * v / ||v||`` is built and
        stored on the wrapped layer together with the bias; later calls reuse
        what is stored there.

        Args:
            x: Input tensor, in the layout expected by the wrapped
                :obj:`ConvTranspose1d`.

        Returns:
            Output tensor after applying convolution with normalized weights
        """
        if not hasattr(self.conv, "weight"):
            # The norm sqrt(sum(v**2)) is taken per input channel, over all
            # remaining axes of v.
            if self.conv.permute:
                # v is (in_channels, out_channels, kernel_length): the
                # channel axis comes first and already lines up with g.
                in_channels = self.weight_v.shape[0]
                v_norm = ops.sqrt(
                    ops.sum(
                        (self.weight_v**2).reshape([in_channels, -1]), axis=1
                    )
                ).reshape([in_channels, 1, 1])
                g = self.weight_g
            else:
                # v is (kernel_length, out_channels, in_channels): the
                # channel axis comes last, so reduce over the leading axes
                # and move g from (in_channels, 1, 1) to (1, 1, in_channels)
                # so that it broadcasts along the channel axis.
                in_channels = self.weight_v.shape[2]
                v_norm = ops.sqrt(
                    ops.sum(
                        (self.weight_v**2).reshape([-1, in_channels]), axis=0
                    )
                ).reshape([1, 1, in_channels])
                g = self.weight_g.reshape([1, 1, in_channels])

            w = self.weight_v / v_norm
            w = w * g

            # Hand the normalized filter and the bias to the wrapped layer.
            self.conv.weight = w
            self.conv.bias = self.bias

        return self.conv(x)