from __future__ import annotations

__copyright__ = """MIT License

Copyright (c) 2025 - IBM Research

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE."""


import random
from pathlib import Path
from typing import TYPE_CHECKING, Generator

import torch
import tqdm
from typing_extensions import Unpack

from mblm.data.datasets import DistributedDataset, DistributedDatasetConfig
from mblm.data.types import BatchMaskedForMLM, ModelMode
from mblm.data.utils import Bytes

if TYPE_CHECKING:
    from mblm.train.mblm import TrainMaskedEntryConfig


# @masked_dataset_registry.register("maskedPG19")
class PG19Masked(DistributedDataset[BatchMaskedForMLM]):
| 45 | + """ |
| 46 | + https://github.com/google-deepmind/pg19 |
| 47 | +
|
| 48 | + The `data_dir` is expected to be the of the exact structure as the |
| 49 | + original dataset, although only the test, train and validation folders |
| 50 | + are strictly needed: |
| 51 | + ├── LICENSE |
| 52 | + ├── README.md |
| 53 | + ├── metadata.csv |
| 54 | + ├── test |
| 55 | + ├── train |
| 56 | + └── validation |
| 57 | +
|
| 58 | + """ |
| 59 | + |
    def __init__(
        self,
        data_dir: str | Path,
        mode: ModelMode,
        masked_token_id: int = -100,
        masking_proba: float = 0.15,
        load_mininterval: int = 30,
        display_load_progress: bool = True,
        padding_token_id: int = -101,
        **config: Unpack[DistributedDatasetConfig],
    ):
        if masked_token_id == padding_token_id:
            # Validate before the (potentially slow) data loading below
            raise ValueError("masked_token_id and padding_token_id must be distinct values")
        root = Path(data_dir)
        if mode == ModelMode.VALID:
            data_path = root / "validation"
        else:
            data_path = root / mode.value
        self.txt_files = list(data_path.iterdir())
        self.masking_proba = masking_proba
        data_buff = bytearray()
        for file in tqdm.tqdm(
            self.txt_files,
            desc=f"Loading pg19 {data_path}",
            mininterval=load_mininterval,
            disable=not display_load_progress,
        ):
            with file.open("rb") as f:
                data_buff.extend(f.read())
        # The whole split is kept in memory as a single flat byte tensor
        self.data = Bytes.bytes_to_tensor(data_buff)
        self.masked_token_id = masked_token_id
        self.padding_token_id = padding_token_id

        super().__init__(
            data_size=self.data.numel(),
            is_sequential=True,
            **config,
        )

    @staticmethod
    def from_train_entry_config(
        config: TrainMaskedEntryConfig,
        mode: ModelMode,
        worker_id: int,
        num_workers: int,
    ) -> DistributedDataset[BatchMaskedForMLM]:
        return PG19Masked(
            data_dir=config.io.dataset_dir,
            masking_proba=config.train.masking_proba,
            masked_token_id=config.params.mask_token_id,
            mode=mode,
            padding_token_id=config.params.mblm_config.pad_token_id,
            seq_len=config.params.input_seq_len,
            worker_id=worker_id,
            num_workers=num_workers,
        )

    @staticmethod
    def supports_test_mode() -> bool:
        return True

    def get_sample(self, from_idx: int) -> BatchMaskedForMLM:
        """
        Get a sample with a loss mask. This method is required by the
        DistributedDataset superclass.
        """
        sample = self.data[from_idx : from_idx + self.seq_len].long()
        # True marks masked positions, which are the only ones the loss is computed on
        mask = torch.rand(sample.size()) < self.masking_proba
        tokens_masked = sample.clone()

        # TODO: Implement the same strategy as BERT, where a masked position
        # sometimes keeps the correct token instead of the masked_token_id
        tokens_masked[mask] = self.masked_token_id
        # Pad if necessary; should only be needed when from_idx == len(self)
        if sample.size(-1) != self.seq_len:
            num_pad = self.seq_len - sample.size(-1)
            # Use a long pad tensor so the dtypes match in the concatenations below
            pad_tensor = torch.full((num_pad,), self.padding_token_id, dtype=torch.long)
            tokens_masked = torch.concat((tokens_masked, pad_tensor))
            # Extend the mask with False so the loss is never computed over the
            # padding tokens (True marks masked elements, False everything else)
            mask = torch.concat((mask, torch.zeros(num_pad, dtype=torch.bool)))
            sample = torch.concat((sample, pad_tensor))
        return tokens_masked.long(), mask.bool(), sample.long()

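    # A sketch of what `get_sample` yields, assuming seq_len=8 over the
    # hypothetical byte stream b"Verbatim" (masking is random, values illustrative):
    #   tokens_masked: tensor([ 86, -100,  114,   98,   97,  116, -100,  109])
    #   mask:          tensor([False, True, False, False, False, False, True, False])
    #   targets:       tensor([ 86,  101,  114,   98,   97,  116,  105,  109])
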
    def book(self, name: str) -> str:
        """
        Get a book by its name (e.g., `44381.txt`) and return its content as
        a string
        """
        for candidate in self.txt_files:
            if candidate.name == name:
                with candidate.open("r", encoding="utf8") as f:
                    return f.read()
        raise ValueError(f"Book {name} does not exist")

    def iter_sequences_rand(self) -> Generator[torch.Tensor, None, None]:
        """
        Iterate over random fixed-length byte sequences drawn uniformly from
        the concatenated books of PG19. A sequence may span book boundaries
        """
        max_sample_start_idx = len(self.data) - self.seq_len - 1
        while True:
            idx = random.randint(0, max_sample_start_idx)
            yield self.data[idx : idx + self.seq_len]

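    # Hypothetical usage: draw four random windows for a quick evaluation pass
    #   gen = ds.iter_sequences_rand()
    #   windows = [next(gen) for _ in range(4)]
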
    def iter_books(self, shuffle: bool = False) -> Generator[tuple[str, str], None, None]:
        """
        Iterate over all the books in PG19, possibly in random order. Return an
        iterator over the file name of each book and its content as a string
        """
        txt_file_idxs = list(range(len(self.txt_files)))
        if shuffle:
            random.shuffle(txt_file_idxs)
        for i in txt_file_idxs:
            book = self.txt_files[i]
            with book.open("r", encoding="utf8") as f:
                yield book.name, f.read()
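

# A minimal usage sketch, assuming a local copy of PG19 under ./pg19 (the path
# and seq_len are hypothetical; seq_len, worker_id and num_workers are part of
# DistributedDatasetConfig):
if __name__ == "__main__":
    ds = PG19Masked(
        data_dir="./pg19",
        mode=ModelMode.VALID,
        seq_len=1024,
        worker_id=0,
        num_workers=1,
    )
    tokens_masked, mask, targets = ds.get_sample(0)
    # The MLM loss is computed only at positions where mask is True
    print(tokens_masked.shape, mask.sum().item(), targets.shape)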