|
| 1 | +# This file is part of sbi, a toolkit for simulation-based inference. sbi is licensed |
| 2 | +# under the Apache License Version 2.0, see <https://www.apache.org/licenses/> |
| 3 | + |
| 4 | +from typing import List, Optional, Tuple, Union |
| 5 | + |
| 6 | +from torch import Tensor, nn |
| 7 | + |
| 8 | +from sbi.neural_nets.embedding_nets.cnn import calculate_filter_output_size |
| 9 | + |
| 10 | + |
| 11 | +def causalConv1d( |
| 12 | + in_channels: int, |
| 13 | + out_channels: int, |
| 14 | + kernel_size: int, |
| 15 | + dilation: int = 1, |
| 16 | + stride: int = 1, |
| 17 | +) -> nn.Module: |
| 18 | + """Returns a causal convolution by left padding the input |
| 19 | +
|
| 20 | + Args: |
| 21 | + in_channels: number of input channels |
| 22 | + out_channels: number of output channels wanted |
| 23 | + kernel_size: wanted kernel size |
| 24 | + dilation: dilation to use in the convolution. |
| 25 | + stride: stride to use in the convolution. |
| 26 | + Stride and dilation cannot both be > 1. |
| 27 | +
|
| 28 | + Returns: |
| 29 | + An nn.Sequential object that represents a 1D causal convolution. |
| 30 | + """ |
| 31 | + assert not (dilation > 1 and stride > 1), ( |
| 32 | + "we don't allow combining stride with dilation." |
| 33 | + ) |
| 34 | + padding_size = dilation * (kernel_size - 1) |
| 35 | + padding = nn.ZeroPad1d(padding=(padding_size, 0)) |
| 36 | + conv_layer = nn.Conv1d( |
| 37 | + in_channels=in_channels, |
| 38 | + out_channels=out_channels, |
| 39 | + kernel_size=kernel_size, |
| 40 | + dilation=dilation, |
| 41 | + stride=stride, |
| 42 | + padding=0, |
| 43 | + ) |
| 44 | + return nn.Sequential(padding, conv_layer) |
| 45 | + |
| 46 | + |
def WaveNetSRLikeAggregator(
    in_channels: int,
    num_timepoints: int,
    output_dim: int,
    activation: nn.Module = nn.LeakyReLU(inplace=True),
    kernel_sizes: Optional[List] = None,
    intermediate_channel_sizes: Optional[List] = None,
    stride_sizes: Union[int, List] = 1,
) -> nn.Module:
    """
    Creates a non-causal 1D CNN aggregator based on the WaveNet speech recognition
    task.

    By default this function creates an aggregator with two CNN layers,
    after every convolution a maxpooling operation halves the number of timepoints.
    The final CNN will have as many channels as the desired output dimension.
    A global average pooling operation is applied in the end. The dimension of the
    output will thus be (batch_size, output_dim, 1) regardless of the input size.

    Args:
        in_channels: number of channels at input.
        num_timepoints: length of the input.
        output_dim: wanted number of features as output.
        activation: activation to apply after the convolution.
        kernel_sizes: (optional) alter the kernel size used and the number of CNN
            layers (through the length of the kernel size vector).
        intermediate_channel_sizes: (optional) alter the intermediate channel sizes
            used, should have length = len(kernel_sizes) - 1. The provided list is
            not modified.
        stride_sizes: (optional) alter the stride used, either a vector of
            len = len(kernel_sizes) or a single integer, in which case the same
            stride is used in every convolution.

    Returns:
        nn.Module object that contains a sequence of CNN and max_pool layer
        and finally a global average pooling layer.
    """
    # Track the (channels, timepoints) shape as layers are stacked.
    aggregator_out_shape = (in_channels, num_timepoints)
    if kernel_sizes is None:
        kernel_sizes = [
            min(9, aggregator_out_shape[-1]),
            min(5, int(aggregator_out_shape[-1] / 2)),
        ]
    if intermediate_channel_sizes is None:
        intermediate_channel_sizes = [64]
    assert len(intermediate_channel_sizes) == len(kernel_sizes) - 1, (
        "Provided kernel size list should be exactly one element longer "
        "than channel size list."
    )
    # Copy before appending so a caller-provided list is never mutated.
    channel_sizes = list(intermediate_channel_sizes) + [output_dim]
    if isinstance(stride_sizes, list):
        assert len(stride_sizes) == len(kernel_sizes), (
            "Provided stride size list should have the same size as "
            "the kernel size list."
        )
    else:
        stride_sizes = [stride_sizes] * len(kernel_sizes)

    non_causal_layers = []
    for ll in range(len(kernel_sizes)):
        conv_layer = nn.Conv1d(
            in_channels=in_channels if ll == 0 else channel_sizes[ll - 1],
            out_channels=channel_sizes[ll],
            kernel_size=kernel_sizes[ll],
            stride=stride_sizes[ll],
            # NOTE(review): 'same' padding is only supported by nn.Conv1d for
            # stride 1; a stride > 1 here will raise at construction time.
            padding='same',
        )
        # Only halve the sequence while there is something left to halve.
        maxpool = nn.MaxPool1d(kernel_size=2 if aggregator_out_shape[-1] > 2 else 1)
        non_causal_layers += [conv_layer, activation, maxpool]
        aggregator_out_shape = (
            channel_sizes[ll],
            int(
                calculate_filter_output_size(
                    aggregator_out_shape[-1],
                    (kernel_sizes[ll] - 1) / 2,
                    1,
                    kernel_sizes[ll],
                    stride_sizes[ll],
                )
                / 2  # account for the maxpool halving
            ),
        )
    # Global average pooling yields a fixed (batch, output_dim, 1) output.
    return nn.Sequential(*non_causal_layers, nn.AdaptiveAvgPool1d(1))
| 134 | + |
| 135 | + |
| 136 | +class CausalCNNEmbedding(nn.Module): |
| 137 | + def __init__( |
| 138 | + self, |
| 139 | + input_shape: Tuple, |
| 140 | + in_channels: int = 1, |
| 141 | + out_channels_per_layer: Optional[List] = None, |
| 142 | + dilation: Union[str, List] = "exponential_cyclic", |
| 143 | + num_conv_layers: int = 5, |
| 144 | + activation: nn.Module = nn.LeakyReLU(inplace=True), |
| 145 | + pool_kernel_size: int = 160, |
| 146 | + kernel_size: int = 2, |
| 147 | + aggregator: Optional[nn.Module] = None, |
| 148 | + output_dim: int = 20, |
| 149 | + ): |
| 150 | + """Embedding network that uses 1D causal convolutions |
| 151 | +
|
| 152 | + This is a simplified version of the architecture introduced for |
| 153 | + the speech recognition task in the WaveNet paper (van den Oord, et al. (2016)) |
| 154 | +
|
| 155 | + After several dilated causal convolutions (that maintain the dimensionality |
| 156 | + of the input), an aggregator network is used to bring down the dimensionality. |
| 157 | + You can provide an aggregator network that you deem reasonable for your data. |
| 158 | + If you do not provide an aggregator network yourself, a default aggregator |
| 159 | + is used. This default aggregator is based on the WaveNet paper's description |
| 160 | + of their Speech Recognition Task, and uses non-causal convolutions and pooling |
| 161 | + layers, and global average poolingg to obtain a final low dimensional embedding. |
| 162 | +
|
| 163 | + Args: |
| 164 | + input_shape: Dimensionality of the input e.g. (num_timepoints,), |
| 165 | + currently only 1D is supported. |
| 166 | + in_channels: Number of input channels, default = 1. |
| 167 | + out_channels_per_layer: number of out_channels for each layer, number |
| 168 | + of entries should correspond with num_conv_layers passed below. |
| 169 | + Default = 16 in every convolutional layer. |
| 170 | + dilation: type of dilation to use either one of "none" (dilation = 1 |
| 171 | + in every layer), "exponential" (increase dilation by a factor of 2 |
| 172 | + every layer), "exponential_cyclic" (as exponential, but reset to 1 |
| 173 | + after dilation = 2**9) or pass a list with dilation size per layer. |
| 174 | + By default the cyclic, exponential scheme from WaveNet is used. |
| 175 | + num_conv_layers: the number of causal convolutional layers |
| 176 | + kernel_size: size of the kernels in the causal convolutional layers. |
| 177 | + activation: activation function to use between convolutions, |
| 178 | + default = LeakyReLU. |
| 179 | + pool_kernel_size: pool size to use for the AvgPool1d operation after |
| 180 | + the causal convolutional layers. |
| 181 | + aggregator: aggregation net that reduces the dimensionality of the data |
| 182 | + to a low-dimensional embedding. |
| 183 | + output_dim: number of output units in the final layer when using |
| 184 | + the default aggregation |
| 185 | + """ |
| 186 | + |
| 187 | + super(CausalCNNEmbedding, self).__init__() |
| 188 | + assert isinstance(input_shape, Tuple), ( |
| 189 | + "input_shape must be a Tuple of size 1, e.g. (timepoints,)." |
| 190 | + ) |
| 191 | + assert len(input_shape) == 1, "Currently only 1D causal CNNs are supported." |
| 192 | + self.input_shape = (in_channels, *input_shape) |
| 193 | + |
| 194 | + total_timepoints = input_shape[0] |
| 195 | + assert total_timepoints >= pool_kernel_size, ( |
| 196 | + "Please ensure that the pool kernel size is not " |
| 197 | + "larger than the number of observed timepoints." |
| 198 | + ) |
| 199 | + if isinstance(dilation, str): |
| 200 | + match dilation.lower(): |
| 201 | + case "exponential_cyclic": |
| 202 | + max_dil_exp = 10 |
| 203 | + ## Use dilation scheme as in WaveNet paper |
| 204 | + dilation_per_layer = [ |
| 205 | + 2 ** (i % max_dil_exp) for i in range(num_conv_layers) |
| 206 | + ] |
| 207 | + case "exponential": |
| 208 | + dilation_per_layer = [2**i for i in range(num_conv_layers)] |
| 209 | + case "none": |
| 210 | + dilation_per_layer = [1] * num_conv_layers |
| 211 | + case _: |
| 212 | + raise ValueError( |
| 213 | + f"{dilation} is not a valid option, please use \"none\"," |
| 214 | + "\"exponential\",or \"exponential_cyclic\", or pass a list " |
| 215 | + "of dilation sizes." |
| 216 | + ) |
| 217 | + else: |
| 218 | + assert isinstance(dilation, List), ( |
| 219 | + "Please pass dilation size as list or a string option." |
| 220 | + ) |
| 221 | + dilation_per_layer = dilation |
| 222 | + |
| 223 | + assert max(dilation_per_layer) < total_timepoints, ( |
| 224 | + "Your maximal dilations size used is larger than the number of " |
| 225 | + "timepoints in your input, please provide a list with smaller dilations." |
| 226 | + ) |
| 227 | + if out_channels_per_layer is None: |
| 228 | + out_channels_per_layer = [16] * num_conv_layers |
| 229 | + |
| 230 | + causal_layers = [] |
| 231 | + for ll in range(num_conv_layers): |
| 232 | + causal_layers += [ |
| 233 | + causalConv1d( |
| 234 | + in_channels if ll == 0 else out_channels_per_layer[ll - 1], |
| 235 | + out_channels_per_layer[ll], |
| 236 | + kernel_size, |
| 237 | + dilation_per_layer[ll], |
| 238 | + 1, |
| 239 | + ), |
| 240 | + activation, |
| 241 | + ] |
| 242 | + |
| 243 | + self.causal_cnns = nn.Sequential(*causal_layers) |
| 244 | + |
| 245 | + self.pooling_layer = nn.AvgPool1d(kernel_size=pool_kernel_size) |
| 246 | + |
| 247 | + if aggregator is None: |
| 248 | + aggregator_out_shape = ( |
| 249 | + out_channels_per_layer[-1], |
| 250 | + int(total_timepoints / pool_kernel_size), |
| 251 | + ) |
| 252 | + assert aggregator_out_shape[-1] > 1, ( |
| 253 | + "Your dimensionality is already small," |
| 254 | + "Please ensure a larger input size or use a custom aggregator." |
| 255 | + ) |
| 256 | + aggregator = WaveNetSRLikeAggregator( |
| 257 | + aggregator_out_shape[0], |
| 258 | + aggregator_out_shape[-1], |
| 259 | + output_dim=output_dim, |
| 260 | + ) |
| 261 | + self.aggregation = aggregator |
| 262 | + |
| 263 | + def forward(self, x: Tensor) -> Tensor: |
| 264 | + batch_size = x.size(0) |
| 265 | + x = x.view(batch_size, *self.input_shape) |
| 266 | + x = self.causal_cnns(x) |
| 267 | + x = self.pooling_layer(x) |
| 268 | + x = self.aggregation(x) |
| 269 | + # ensure flattening when aggregator uses global average pooling |
| 270 | + x = x.view(batch_size, -1) |
| 271 | + return x |
0 commit comments