Adding code

EiffL · EiffL · commit 49f9eea137cd · 2025-05-24T19:38:35.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -174,3 +174,4 @@ cython_debug/
 .pypirc
 notebooks
 data
+old_impl
diff --git a/aion/codecs/catalog.py b/aion/codecs/catalog.py
@@ -0,0 +1,102 @@
+from collections import OrderedDict
+from typing import Type, Optional, Dict, List
+
+from huggingface_hub import PyTorchModelHubMixin
+import torch
+from jaxtyping import Float
+from torch import Tensor
+
+from aion.codecs.base import Codec
+from aion.codecs.quantizers import Quantizer
+from aion.codecs.quantizers.scalar import ComposedScalarQuantizer
+from aion.modalities import Catalog
+
+__all__ = ["CatalogIdentityCodec"]
+
+
+class CatalogIdentityCodec(Codec, PyTorchModelHubMixin):
+    """Codec for catalog quantities.
+
+    A codec that embeds catalog quantities through an identity mapping. If
+    quantizers are given they are combined in a ComposedScalarQuantizer.
+
+    Args:
+        catalog_keys: List[str]
+            Catalog keys to encode; their order pairs them with `quantizers`.
+        quantizers: Optional[List[Quantizer]]
+            Optional list of quantizers, one per catalog key and in the same order.
+        mask_value: int
+            Sentinel marking masked/missing entries in inputs and in returned codes.
+    """
+
+    def __init__(
+        self,
+        catalog_keys: List[str],
+        quantizers: Optional[List[Quantizer]] = None,
+        mask_value: int = 9999,
+    ):
+        super().__init__()
+        self._modality = Catalog
+        self._catalog_keys = catalog_keys
+        self.mask_value = mask_value
+        if quantizers:  # None and an empty list both leave the codec without a quantizer
+            assert len(catalog_keys) == len(quantizers), (
+                "Number of catalog keys and quantizers must match"
+            )
+            _quantizer = OrderedDict()  # filled in catalog_keys order
+            for key, quantizer in zip(catalog_keys, quantizers):
+                _quantizer[key] = quantizer
+            self._quantizer = ComposedScalarQuantizer(_quantizer)
+        else:
+            self._quantizer = None
+
+    @property
+    def modality(self) -> Type[Catalog]:
+        return self._modality
+
+    @property
+    def quantizer(self) -> Optional[Quantizer]:
+        return self._quantizer
+
+    def _encode(self, x: Catalog) -> Dict[str, Tensor]:
+        encoded = OrderedDict()
+        for key in self._catalog_keys:
+            catalog_value = x[self.modality][key]  # looked up by modality class, then key
+            mask = catalog_value != self.mask_value  # True where the entry is not the sentinel
+            catalog_value = catalog_value[mask]  # keep only the unmasked entries
+            encoded[key] = catalog_value
+        encoded["mask"] = mask  # NOTE(review): only the last key's mask is kept
+        return encoded
+
+    def encode(self, x: Catalog) -> Float[Tensor, "b c1 *code_shape"]:
+        """Encodes a batch into (B, L * C) codes; masked positions hold mask_value."""
+        embedding = self._encode(x)
+        _encoded = self.quantizer.encode(  # NOTE(review): no None check, unlike decode
+            embedding
+        )  # (b, C), where b is the number of non-masked samples
+
+        mask = embedding["mask"]
+        # B: batch size, L: sequence length for each catalog key (nominally 20; read from mask.shape, not enforced)
+        B, L = mask.shape
+        C = len(self._catalog_keys)
+        encoded = self.mask_value * torch.ones(  # start with every position marked as masked
+            B, L, C, dtype=_encoded.dtype, device=_encoded.device
+        )
+        encoded[mask] = _encoded  # write the codes at the unmasked (B, L) positions
+        encoded = encoded.reshape(B, -1)  # merge the L and C axes
+        return encoded
+
+    def _decode(self, z: Dict[str, Tensor]) -> Catalog:
+        return Catalog(data=z)
+
+    def decode(self, z: Float[Tensor, "b c1 *code_shape"]) -> Catalog:
+        B, LC = z.shape  # LC: second axis, read as L * C below
+        C = len(self._catalog_keys)
+        L = LC // C
+        z = z[:, : C * L]  # Drop trailing columns that do not fill a whole group of C
+        z = z.reshape(B * L, C)  # one column per catalog key
+        if self._quantizer is not None:
+            z = self.quantizer.decode(z)
+        for key in self._catalog_keys:  # NOTE(review): z is still a Tensor here if no quantizer is set
+            z[key] = z[key].reshape(B, L)
+        return self._decode(z)
diff --git a/aion/codecs/quantizers/scalar.py b/aion/codecs/quantizers/scalar.py
@@ -1,5 +1,6 @@
 import math
-from typing import Optional
+from typing import Optional, Dict
+from collections import OrderedDict
 
 import scipy.interpolate
 import torch
@@ -504,3 +505,121 @@ def codebook(self) -> torch.Tensor:
     def embedding_dim(self) -> int:
         """Returns the dimension of the codebook entries."""
         return 1
+
+
+class ComposedScalarQuantizer(Quantizer):
+    """
+    Composed scalar quantizer module.
+
+    Combines multiple scalar quantizers into a single quantizer. Each quantizer
+    operates on a different channel/feature and maintains its own codebook.
+
+    Args:
+        quantizers: OrderedDict[str, Quantizer]
+            Ordered dictionary mapping feature names to their respective quantizers.
+    """
+
+    def __init__(self, quantizers: OrderedDict[str, Quantizer]):
+        super().__init__()
+        _offsets = [0]  # cumulative codebook sizes, starting at 0
+        for key, quantizer in quantizers.items():
+            _offsets.append(_offsets[-1] + quantizer.codebook_size)
+        self.offsets = _offsets[:-1]  # index shift applied to each quantizer's codes
+        self._codebook_size = _offsets[-1]  # sum of all codebook sizes
+        self.quantizers = nn.ModuleDict(quantizers)
+
+    def forward(
+        self, z_es: Dict[str, torch.Tensor]
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Performs a forward pass through each of the scalar quantizers.
+        Args:
+            z_es: Dict[str, torch.Tensor]
+                The input tensors to be quantized, keyed by feature name.
+        Returns:
+            z_qs: torch.Tensor
+                The quantized tensors, stacked along dim 1.
+            loss: torch.Tensor
+                The embedding loss, averaged over the quantizers.
+            codebook_usage: torch.Tensor
+                The fraction of codes used, averaged over the quantizers.
+        """
+        z_qs = []
+        loss = torch.tensor(0.0)  # NOTE(review): created without a device, so on the default one
+        codebook_usage = torch.tensor(0.0)  # NOTE(review): same as loss
+        for key, quantizer in self.quantizers.items():
+            z_e = z_es[key]
+            z_q, _loss, _usage = quantizer(z_e)
+            z_qs.append(z_q)
+            loss += _loss
+            codebook_usage += _usage
+
+        C = len(z_qs)
+        z_qs = torch.stack(z_qs, dim=1)  # (B, C)
+        loss /= C
+        codebook_usage /= C
+        return z_qs, loss, codebook_usage
+
+    def quantize(self, z: Dict[str, torch.Tensor]) -> torch.Tensor:
+        """Quantize each entry of z with its own quantizer, returns the
+        corresponding codebook entries stacked along dim 1.
+        """
+        quantized = []
+        for key, quantizer in self.quantizers.items():
+            quantized.append(quantizer.quantize(z[key]))
+
+        quantized = torch.stack(quantized, dim=1)  # (B, C)
+        return quantized
+
+    def encode(self, z: Dict[str, torch.Tensor]) -> torch.Tensor:
+        """Encodes each entry of z, returns the corresponding codebook
+        indices, each shifted by its quantizer's offset.
+
+        Args:
+            z: Dict[str, torch.Tensor]
+                The input tensors to be encoded, keyed by feature name.
+
+        Returns:
+            codes: torch.Tensor (B, C)
+                Offset code indices, one column per quantizer.
+        """
+        codes = []
+        for offset, (key, quantizer) in zip(self.offsets, self.quantizers.items()):
+            codes.append((quantizer.encode(z[key]) + offset))
+        codes = torch.stack(codes, dim=1)  # (B, C)
+        return codes
+
+    def decode(self, codes: torch.Tensor) -> Dict[str, torch.Tensor]:
+        """Decodes the input code index into corresponding codebook entry of
+        dimension (embedding_dim).
+
+        Args:
+            codes: torch.Tensor (B, C)
+                Offset codes to be decoded, one column per quantizer.
+
+        Returns:
+            z: Dict[str, torch.Tensor]
+                Decoded values keyed by feature name; -1 where a code was out of range.
+        """
+        z = {}
+        for i, (offset, (key, quantizer)) in enumerate(
+            zip(self.offsets, self.quantizers.items())
+        ):
+            codes_i = codes[:, i] - offset
+            # clamp the codes to the valid range
+            _codes_i = codes_i.clamp(0, quantizer.codebook_size - 1)
+            decoded_i = quantizer.decode(_codes_i)
+            # mark the values decoded from out-of-range codes with -1
+            is_clamped = _codes_i != codes_i
+            decoded_i[is_clamped] = -1
+            z[key] = decoded_i
+        return z
+
+    @property
+    def codebook_size(self) -> int:
+        """Returns the size of the codebook."""
+        return self._codebook_size
+
+    @property
+    def embedding_dim(self) -> int:
+        """Returns the dimension of the codebook entries."""
+        return 1
diff --git a/aion/modalities.py b/aion/modalities.py
@@ -2,7 +2,7 @@
 
-from typing import List, Union, ClassVar
+from typing import List, Union, ClassVar, Dict
 from pydantic import BaseModel, Field, ConfigDict
 from jaxtyping import Float, Bool
 from torch import Tensor
 
 
@@ -70,6 +70,18 @@ def __repr__(self) -> str:
         return f"{self.__class__.__name__}(shape={list(self.value.shape)})"
 
 
+# Catalog modality (nested dictionaries of scalar tensors)
+class Catalog(Modality):
+    """Catalog modality data.
+
+    Represents a catalog of scalar values held as nested dictionaries of tensors.
+    """
+
+    data: Dict[str, Dict[str, Float[Tensor, "b t"]]] = Field(
+        description="Dictionary of dictionaries of scalar values."
+    )
+
+
 # Flux measurements in different bands
 class FluxG(ScalarModality):
     """G-band flux measurement."""
@@ -310,4 +322,4 @@ class XpRp(ScalarModality):
 ]
 
 # Convenience type for any modality data
-ModalityType = Union[Image, Spectrum, ScalarModality]
+ModalityType = Union[Image, Spectrum, ScalarModality, Catalog]