better eps handling in torch

jjcmoon · jjcmoon · commit a90fb57d4f88 · 2025-11-19T10:33:49.000+01:00
diff --git a/src/klay/__init__.py b/src/klay/__init__.py
@@ -5,7 +5,7 @@
 from collections.abc import Sequence
 
 
-def to_torch_module(self: Circuit, semiring: str = "log", probabilistic: bool = False):
+def to_torch_module(self: Circuit, semiring: str = "log", probabilistic: bool = False, eps: float = 0):
     """
     Convert the circuit into a PyTorch module.
 
@@ -15,12 +15,14 @@ def to_torch_module(self: Circuit, semiring: str = "log", probabilistic: bool =
         If enabled, construct a probabilistic circuit instead of an arithmetic circuit.
         This means the inputs to a sum node are multiplied by a probability, and
         we can interpret sum nodes as latent Categorical variables.
+    :param eps:
+        Epsilon used by log semiring for numerical stability.
     """
     from .torch import CircuitModule, ProbabilisticCircuitModule
     indices = self._get_indices()
     if probabilistic:
-        return ProbabilisticCircuitModule(*indices, semiring=semiring)
-    return CircuitModule(*indices, semiring=semiring)
+        return ProbabilisticCircuitModule(*indices, semiring=semiring, eps=eps)
+    return CircuitModule(*indices, semiring=semiring, eps=eps)
 
 
 def to_jax_function(self: Circuit, semiring: str = "log"):
diff --git a/src/klay/torch/__init__.py b/src/klay/torch/__init__.py
@@ -5,53 +5,54 @@
 from .utils import unroll_ixs
 
 
-def _create_layers(sum_layer, prod_layer, ixs_in, ixs_out):
+def _create_layers(sum_layer, prod_layer, ixs_in, ixs_out, eps):
     layers = []
     for i, (ix_in, ix_out) in enumerate(zip(ixs_in, ixs_out)):
         ix_in = torch.as_tensor(ix_in, dtype=torch.long)
         ix_out = torch.as_tensor(ix_out, dtype=torch.long)
         ix_out = unroll_ixs(ix_out)
         layer = prod_layer if i % 2 == 0 else sum_layer
-        layers.append(layer(ix_in, ix_out))
+        layers.append(layer(ix_in, ix_out, eps))
     return nn.Sequential(*layers)
 
 
 class CircuitModule(nn.Module):
-    def __init__(self, ixs_in, ixs_out, semiring='real'):
+    def __init__(self, ixs_in, ixs_out, semiring: str = 'real', eps: float = 0):
         super(CircuitModule, self).__init__()
         self.semiring = semiring
+        self._eps = eps  # keep the caller's epsilon; encode_input passes it to self.negate
+
         self.sum_layer, self.prod_layer, self.zero, self.one, self.negate = \
             get_semiring(semiring, self.is_probabilistic())
-        self.layers = _create_layers(self.sum_layer, self.prod_layer, ixs_in, ixs_out)
+        self.layers = _create_layers(self.sum_layer, self.prod_layer, ixs_in, ixs_out, eps)
 
-    def forward(self, x_pos, x_neg=None, eps=0):
-        x = self.encode_input(x_pos, x_neg, eps)
+    def forward(self, x_pos, x_neg=None):
+        x = self.encode_input(x_pos, x_neg)
         return self.layers(x)
 
-    def encode_input(self, pos, neg, eps):
+    def encode_input(self, pos, neg):
         if neg is None:
-            neg = self.negate(pos, eps)
+            neg = self.negate(pos, self._eps)
         x = torch.stack([pos, neg], dim=1).flatten()
-        units = torch.tensor([self.zero, self.one], dtype=torch.float32, device=pos.device)
+        units = torch.tensor([self.zero, self.one], dtype=pos.dtype, device=pos.device)
         return torch.cat([units, x])
 
     def sparsity(self, nb_vars: int) -> float:
-        sparse_params = sum(len(l.ix_out) for l in self.layers)
-        layer_widths = [nb_vars] + [l.out_shape[0] for l in self.layers]
+        sparse_params = sum(len(layer.ix_out) for layer in self.layers)
+        layer_widths = [nb_vars] + [layer.out_shape[0] for layer in self.layers]
         dense_params = sum(layer_widths[i] * layer_widths[i + 1] for i in range(len(layer_widths) - 1))
         return sparse_params / dense_params
 
-    def to_pc(self, x_pos, x_neg=None, eps=0):
+    def to_pc(self, x_pos, x_neg=None):
         """ Converts the circuit into a probabilistic circuit."""
         assert self.semiring == "log" or self.semiring == "real"
         pc = ProbabilisticCircuitModule([], [], self.semiring)
-        print("Making PC", pc.sum_layer, pc.sum_layer)
         layers = []
 
-        x = self.encode_input(x_pos, x_neg, eps)
+        x = self.encode_input(x_pos, x_neg)
         for i, layer in enumerate(self.layers):
             if isinstance(layer, self.sum_layer):
-                new_layer = pc.sum_layer(layer.ix_in, layer.ix_out)
+                new_layer = pc.sum_layer(layer.ix_in, layer.ix_out, layer._eps)
                 weights = x.log() if self.semiring == "real" else x
                 new_layer.weights.data = weights[new_layer.ix_in]
             else:
@@ -76,7 +77,7 @@ def sample(self):
         return y[2::2]
 
     def condition(self, x_pos, x_neg):
-        x = self.encode_input(x_pos, x_neg, None)
+        x = self.encode_input(x_pos, x_neg)
         for layer in self.layers:
             x = layer.condition(x) \
                 if isinstance(layer, ProbabilisticCircuitLayer) \
diff --git a/src/klay/torch/layers.py b/src/klay/torch/layers.py
@@ -5,16 +5,17 @@
 
 
 class CircuitLayer(nn.Module):
-    def __init__(self, ix_in, ix_out):
+    def __init__(self, ix_in, ix_out, eps):
         super().__init__()
         self.register_buffer('ix_in', ix_in)
         self.register_buffer('ix_out', ix_out)
         self.out_shape = (self.ix_out[-1].item() + 1,)
         self.in_shape = (self.ix_in.max().item() + 1,)
+        self._eps = eps
 
-    def _scatter_forward(self, x: torch.Tensor, reduce: str, **kwargs):
+    def _scatter_forward(self, x: torch.Tensor, reduce: str):
         if reduce == "logsumexp":
-            return self._scatter_logsumexp_forward(x, **kwargs)
+            return self._scatter_logsumexp_forward(x)
         output = torch.empty(self.out_shape, dtype=x.dtype, device=x.device)
         output = torch.scatter_reduce(output, 0, index=self.ix_out, src=x, reduce=reduce, include_self=False)
         return output
@@ -31,9 +32,9 @@ def _safe_exp(self, x: torch.Tensor):
         x.nan_to_num_(nan=0., posinf=float('inf'), neginf=float('-inf'))
         return torch.exp(x), max_output
 
-    def _scatter_logsumexp_forward(self, x: torch.Tensor, eps: float):
+    def _scatter_logsumexp_forward(self, x: torch.Tensor):
         x, max_output = self._safe_exp(x)
-        output = torch.full(self.out_shape, eps, dtype=x.dtype, device=x.device)
+        output = torch.full(self.out_shape, self._eps, dtype=x.dtype, device=x.device)
         output = torch.scatter_add(output, 0, index=self.ix_out, src=x)
         output = torch.log(output) + max_output
         return output
@@ -63,13 +64,13 @@ def forward(self, x):
 
 
 class LogSumLayer(CircuitLayer):
-    def forward(self, x, eps=10e-16):
-        return self._scatter_forward(x[self.ix_in], "logsumexp", eps=eps)
+    def forward(self, x):
+        return self._scatter_forward(x[self.ix_in], "logsumexp")
 
 
 class ProbabilisticCircuitLayer(CircuitLayer):
-    def __init__(self, ix_in, ix_out):
-        super().__init__(ix_in, ix_out)
+    def __init__(self, ix_in, ix_out, eps):
+        super().__init__(ix_in, ix_out, eps)
         self.weights = nn.Parameter(torch.randn_like(ix_in, dtype=torch.float32))
 
     def get_edge_weights(self):
@@ -79,15 +80,15 @@ def get_edge_weights(self):
 
     def renorm_weights(self, x):
         with torch.no_grad():
-            self.weights.data = self.get_log_edge_weights(0) + x
+            self.weights.data = self.get_log_edge_weights() + x
 
-    def get_log_edge_weights(self, eps):
-        norm = self._scatter_logsumexp_forward(self.weights, eps)
+    def get_log_edge_weights(self):
+        norm = self._scatter_logsumexp_forward(self.weights)
         return self.weights - norm[self.ix_out]
 
-    def sample(self, y, eps=10e-16):
-        weights = self.get_log_edge_weights(eps)
-        noise = -(-torch.log(torch.rand_like(weights) + eps) + eps).log()
+    def sample(self, y):
+        weights = self.get_log_edge_weights()
+        noise = -(-torch.log(torch.rand_like(weights) + self._eps) + self._eps).log()
         gumbels = weights + noise
         samples = self._scatter_forward(gumbels, "amax")
         samples = samples[self.ix_out] == gumbels
@@ -107,9 +108,9 @@ def condition(self, x):
 
 
 class ProbabilisticLogSumLayer(ProbabilisticCircuitLayer):
-    def forward(self, x, eps=10e-16):
-        x = self.get_log_edge_weights(eps) + x[self.ix_in]
-        return self._scatter_logsumexp_forward(x, eps)
+    def forward(self, x):
+        x = self.get_log_edge_weights() + x[self.ix_in]
+        return self._scatter_logsumexp_forward(x)
 
     def condition(self, x):
         y = self.forward(x)