CAREamics
diff --git a/‎src/careamics/dataset_ng/demos/val_split.ipynb‎
Lines changed: 136 additions & 0 deletions b/‎src/careamics/dataset_ng/demos/val_split.ipynb‎
Lines changed: 136 additions & 0 deletions
diff --git a/‎src/careamics/dataset_ng/patching_strategies/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎src/careamics/dataset_ng/patching_strategies/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/careamics/dataset_ng/patching_strategies/fixed_patching.py‎
Lines changed: 76 additions & 0 deletions b/‎src/careamics/dataset_ng/patching_strategies/fixed_patching.py‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎src/careamics/dataset_ng/patching_strategies/stratified_patching.py‎
Lines changed: 38 additions & 4 deletions b/‎src/careamics/dataset_ng/patching_strategies/stratified_patching.py‎
Lines changed: 38 additions & 4 deletions
diff --git a/‎src/careamics/dataset_ng/val_split.py‎
Lines changed: 90 additions & 0 deletions b/‎src/careamics/dataset_ng/val_split.py‎
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,136 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections.abc import Sequence\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from numpy.typing import NDArray\n",
+    "\n",
+    "from careamics.dataset_ng.patching_strategies import (\n",
+    "    PatchingStrategy,\n",
+    "    StratifiedPatchingStrategy,\n",
+    ")\n",
+    "from careamics.dataset_ng.val_split import create_val_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def demo_selected_patches(\n",
+    "    patching_strategy: PatchingStrategy,\n",
+    "    data_shapes: Sequence[Sequence[int]],\n",
+    "    epochs: int,\n",
+    ") -> Sequence[NDArray[np.int_]]:\n",
+    "    \"\"\"Create a map of where all the patches have been selected from.\n",
+    "\n",
+    "    Every time a patch is selected that area is incremented by 1.\n",
+    "    \"\"\"\n",
+    "    tracking_arrays = [np.zeros(shape, dtype=int) for shape in data_shapes]\n",
+    "    for _ in range(epochs):\n",
+    "        for index in range(patching_strategy.n_patches):\n",
+    "            patch_spec = patching_strategy.get_patch_spec(index)\n",
+    "            data_idx = patch_spec[\"data_idx\"]\n",
+    "            sample_idx = patch_spec[\"sample_idx\"]\n",
+    "            coord = patch_spec[\"coords\"]\n",
+    "            patch_size = patch_spec[\"patch_size\"]\n",
+    "\n",
+    "            patch_slice = [\n",
+    "                slice(c, c + ps) for c, ps in zip(coord, patch_size, strict=True)\n",
+    "            ]\n",
+    "            tracking_arrays[data_idx][sample_idx, ..., *patch_slice] += 1\n",
+    "    return tracking_arrays"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rng = np.random.default_rng(42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_shapes = [(1, 1, 512, 620), (1, 1, 300, 335), (1, 1, 512, 512)]\n",
+    "patch_size = (64, 64)\n",
+    "\n",
+    "stratified_patching = StratifiedPatchingStrategy(data_shapes, patch_size, seed=42)\n",
+    "n_val_patches = int(np.ceil(stratified_patching.n_patches * 0.1))  # 10% of patches\n",
+    "print(\n",
+    "    f\"Selecting {n_val_patches} validation patches from \"\n",
+    "    f\"{stratified_patching.n_patches} total patches.\"\n",
+    ")\n",
+    "train_patching, val_patching = create_val_split(stratified_patching, n_val_patches, rng)\n",
+    "\n",
+    "train_1 = demo_selected_patches(train_patching, data_shapes, epochs=1)\n",
+    "train_200 = demo_selected_patches(train_patching, data_shapes, epochs=200)\n",
+    "val = demo_selected_patches(val_patching, data_shapes, epochs=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(3, len(data_shapes), figsize=(12, 12), constrained_layout=True)\n",
+    "for i in range(len(data_shapes)):\n",
+    "    axes[0, i].set_title(f\"Image {i}\")\n",
+    "    axes[0, i].imshow(train_1[i][0, 0])\n",
+    "    axes[1, i].imshow(train_200[i][0, 0])\n",
+    "    axes[2, i].imshow(val[i][0, 0])\n",
+    "axes[0, 0].set_ylabel(\"Train epochs 1\")\n",
+    "axes[1, 0].set_ylabel(\"Train epochs 200\")\n",
+    "axes[2, 0].set_ylabel(\"Validation\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "careamics",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -1,4 +1,5 @@
 __all__ = [
+    "FixedPatchingStrategy",
     "FixedRandomPatchingStrategy",
     "PatchSpecs",
     "PatchingStrategy",
@@ -13,6 +14,7 @@
     "is_tile_specs",
 ]
 
+from .fixed_patching import FixedPatchingStrategy
 from .patching_strategy_factory import create_patching_strategy
 from .patching_strategy_protocol import (
     PatchingStrategy,
 
@@ -0,0 +1,76 @@
+"""A module for a fixed coordinate patching strategy, useful for validation."""
+
+from collections.abc import Sequence
+
+from .patching_strategy_protocol import PatchSpecs
+
+
+class FixedPatchingStrategy:
+    """A simple patching strategy that returns patches from a fixed sequence.
+
+    This class implements the `PatchingStrategy` `Protocol`.
+    """
+
+    def __init__(self, fixed_patch_specs: Sequence[PatchSpecs]):
+        """A simple patching strategy that returns patches from a fixed list.
+
+        Parameters
+        ----------
+        fixed_patch_specs : Sequence[PatchSpecs]
+            A sequence of patch specifications, stored as given (not copied).
+        """
+        self.fixed_patch_specs = fixed_patch_specs
+
+    @property
+    def n_patches(self) -> int:
+        """The number of patches that this patching strategy will return.
+
+        It also determines the maximum index that can be given to `get_patch_spec`.
+        """
+        return len(self.fixed_patch_specs)
+
+    def get_patch_spec(self, index: int) -> PatchSpecs:
+        """Return the patch specs for a given index.
+
+        Parameters
+        ----------
+        index : int
+            A patch index.
+
+        Returns
+        -------
+        PatchSpecs
+            A dictionary that specifies a single patch in a series of `ImageStacks`.
+
+        Raises
+        ------
+        IndexError
+            If `index` is greater than or equal to `n_patches`.
+        """
+        if index >= self.n_patches:
+            raise IndexError(
+                f"Index {index} out of bounds for FixedPatchingStrategy with "
+                f"number of patches {self.n_patches}"
+            )
+        return self.fixed_patch_specs[index]
+
+    # Note: this is used by the FileIterSampler
+    def get_patch_indices(self, data_idx: int) -> Sequence[int]:
+        """Return all patch indices belonging to a specific `image_stack`.
+
+        Parameters
+        ----------
+        data_idx : int
+            An index that corresponds to a given `image_stack`.
+
+        Returns
+        -------
+        sequence of int
+            The indices of all the patches with a matching `data_idx`, they can be
+            used to index the `CAREamicsDataset`.
+        """
+        return [
+            i
+            for i, patch_spec in enumerate(self.fixed_patch_specs)
+            if patch_spec["data_idx"] == data_idx
+        ]
@@ -96,8 +96,6 @@ def n_patches(self) -> int:
             sum([sample.n_patches for sample in image]) for image in self.image_patching
         )
 
-    # TODO: add method to return valid grid coords for removal
-
     def exclude_patches(
         self, data_idx: int, sample_idx: int, grid_coords: Sequence[tuple[int, ...]]
     ):
@@ -194,6 +192,27 @@ def get_patch_indices(self, data_idx: int) -> Sequence[int]:
         start = 0 if data_idx == 0 else self.cumulative_image_patches[data_idx - 1]
         return np.arange(start, self.cumulative_image_patches[data_idx]).tolist()
 
+    def get_included_grid_coords(self) -> dict[tuple[int, int], list[tuple[int, ...]]]:
+        """
+        Collect the included grid coordinates of every sample of every image.
+
+        A grid coordinate that is not included marks a region,
+        `[grid_coord*patch_size, (grid_coord+1)*patch_size]`, that patches can never be
+        selected from.
+
+        Returns
+        -------
+        grid_coords : dict[tuple[int, int], list[tuple[int, ...]]]
+            A dictionary that maps each `(data_idx, sample_idx)` pair to the list of
+            grid coordinates included for that sample.
+        """
+        # keys are inserted image by image, and sample by sample within each image
+        return {
+            (image_idx, s_idx): per_sample.get_included_grid_coords()
+            for image_idx, per_image in enumerate(self.image_patching)
+            for s_idx, per_sample in enumerate(per_image)
+        }
+
     def _calc_bins(self) -> tuple[NDArray[np.int_], NDArray[np.int_], NDArray[np.int_]]:
         """
         Calculate bins to determine which image and sample a patch index maps to.
@@ -301,7 +320,7 @@ def __init__(
         self.areas: dict[tuple[int, ...], int] = {}
         self.probs: dict[tuple[int, ...], float]
 
-        self.excluded_patches: list[tuple[int, ...]] = []
+        self.excluded_patches: set[tuple[int, ...]] = set()
         self.bin_size: int
         self.bins: list[list[tuple[int, ...]]]
         self.n_patches: int
@@ -413,7 +432,7 @@ def exclude_patches(self, grid_coords: Sequence[tuple[int, ...]]):
             that will be excluded from sampling. The grid starts at (0, 0) and has a
             spacing of the given `patch_size`.
         """
-        self.excluded_patches.extend(grid_coords)
+        self.excluded_patches.update(grid_coords)
         for grid_coord in grid_coords:
             d: tuple[Literal[0, 1], ...] = (0, 1)
             # exclude the patch from all the sampling regions that cover it
@@ -438,6 +457,21 @@ def exclude_patches(self, grid_coords: Sequence[tuple[int, ...]]):
             self._recalculate_sampling()
         )
 
+    def get_included_grid_coords(self) -> list[tuple[int, ...]]:
+        """
+        Get all the included grid coordinates in the patching strategy.
+
+        If a grid coordinate is not included, a patch can never be selected from the
+        region `[grid_coord*patch_size, (grid_coord+1)*patch_size]`.
+
+        Returns
+        -------
+        grid_coords : list[tuple[int, ...]]
+            Included grid coordinates, in the iteration order of `self.regions`.
+        """
+        # filter the keys directly so the result does not depend on set ordering
+        return [gc for gc in self.regions if gc not in self.excluded_patches]
+
     def _recalculate_sampling(self):
         """
         Recalculate how patches will be sampled.
 
@@ -0,0 +1,90 @@
+"""A module for selecting data to be set aside for validation."""
+
+import numpy as np
+
+from .patching_strategies import (
+    FixedPatchingStrategy,
+    PatchSpecs,
+    StratifiedPatchingStrategy,
+)
+
+
+def create_val_split(
+    stratified_patching: StratifiedPatchingStrategy,
+    n_val_patches: int,
+    rng: np.random.Generator,
+) -> tuple[StratifiedPatchingStrategy, FixedPatchingStrategy]:
+    """
+    Create patching strategies for training and validation.
+
+    The patches from the training patching strategy will never overlap with the patches
+    from the validation patching strategy. Note that `stratified_patching` is modified
+    in place (the validation patches are excluded from it) and returned.
+
+    Parameters
+    ----------
+    stratified_patching : StratifiedPatchingStrategy
+        The patching strategy to select and exclude validation patches from.
+    n_val_patches : int
+        The number of validation patches.
+    rng : numpy.random.Generator
+        The random number generator used to choose the validation patches.
+
+    Returns
+    -------
+    training_patching_strategy : StratifiedPatchingStrategy
+        The patching strategy to be used for training. Patches will be sampled in a
+        stratified way, for each epoch. It excludes all the patches that should be used
+        for validation.
+    validation_patching_strategy : FixedPatchingStrategy
+        The patching strategy to be used for validation. It will return the same patches
+        every epoch.
+
+    Raises
+    ------
+    ValueError
+        If `n_val_patches` is greater than the total number of patches.
+    """
+    patch_size = stratified_patching.patch_size
+    # validation patches have to lie on this grid, keyed by (data_idx, sample_idx)
+    grid_coords = stratified_patching.get_included_grid_coords()
+    sample_ids = list(grid_coords)
+
+    # decide how many validation patches are taken from each sample
+    n_patches_per_image = np.array(
+        [stratified_patching.image_patching[d][s].n_patches for d, s in sample_ids]
+    )
+    if n_val_patches > n_patches_per_image.sum():
+        raise ValueError(
+            f"Cannot select {n_val_patches} validation patches, only "
+            f"{n_patches_per_image.sum()} patches are available."
+        )
+    n_selected_image_patches = np.zeros_like(n_patches_per_image)
+    for _ in range(n_val_patches):
+        probs = n_patches_per_image / n_patches_per_image.sum()
+        idx = rng.choice(np.arange(len(n_patches_per_image)), p=probs)
+        n_selected_image_patches[idx] += 1
+        n_patches_per_image[idx] -= 1
+
+    val_patch_specs: list[PatchSpecs] = []
+    for (data_idx, sample_idx), n_patches in zip(
+        sample_ids, n_selected_image_patches, strict=True
+    ):
+        sample_coords = grid_coords[(data_idx, sample_idx)]
+        # randomly choose the validation patches in the image
+        coord_indices = rng.choice(len(sample_coords), n_patches, replace=False)
+        coords = [sample_coords[coord_idx] for coord_idx in coord_indices]
+        # exclude the chosen validation patches from training
+        stratified_patching.exclude_patches(data_idx, sample_idx, coords)
+        # collect the validation patch specs; `tolist` gives coords as python ints
+        val_patch_specs.extend(
+            {
+                "data_idx": data_idx,
+                "sample_idx": sample_idx,
+                "coords": tuple((np.array(gc) * np.array(patch_size)).tolist()),
+                "patch_size": patch_size,
+            }
+            for gc in coords
+        )
+
+    return stratified_patching, FixedPatchingStrategy(val_patch_specs)