Skip to content

Commit 60156fd

Browse files
chinasaur and copybara-github
authored and committed
Externalize training example sampling utilities for BrainState.
PiperOrigin-RevId: 862056907
1 parent 94eebf8 commit 60156fd

File tree

2 files changed

+342
-0
lines changed

2 files changed

+342
-0
lines changed
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
# coding=utf-8
2+
# Copyright 2026 The Google Research Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Utils for selecting consistent dataset splits across experiments."""
16+
17+
from collections.abc import Sequence
18+
import dataclasses
19+
from typing import Self
20+
import numpy as np
21+
22+
23+
@dataclasses.dataclass
class DatasetMultiSplit:
  """Holds an arbitrary number of parallel dataset splits.

  The two lists are parallel: split i is the pair
  (sample_id_splits[i], label_splits[i]), with the two arrays aligned
  element-wise.
  """

  # Per-split arrays of sample IDs.
  sample_id_splits: list[np.ndarray]
  # Per-split label arrays, aligned element-wise with sample_id_splits.
  label_splits: list[np.ndarray]
27+
28+
29+
@dataclasses.dataclass
class DatasetSplit:
  """A train/valid/test partition of a dataset for ML experiments.

  The `*_ids` arrays identify the examples in each split, and the
  `*_labels` arrays are aligned element-wise with the corresponding IDs.
  """

  train_ids: np.ndarray
  valid_ids: np.ndarray
  test_ids: np.ndarray
  train_labels: np.ndarray
  valid_labels: np.ndarray
  test_labels: np.ndarray

  def upsampled(self, upsample_factor: int, dataset_len: int) -> Self:
    """Returns this split replicated `upsample_factor` times.

    Replica k has every sample ID shifted by k * dataset_len, keeping IDs
    unique across replicas; the label arrays are repeated unchanged.

    Args:
      upsample_factor: Number of replicas to produce.
      dataset_len: Length of the underlying dataset, used as the ID offset
        between consecutive replicas.

    Returns:
      A new DatasetSplit covering all replicas.
    """
    offsets = [rep * dataset_len for rep in range(upsample_factor)]
    return DatasetSplit(
        np.concatenate([self.train_ids + off for off in offsets]),
        np.concatenate([self.valid_ids + off for off in offsets]),
        np.concatenate([self.test_ids + off for off in offsets]),
        np.concatenate([self.train_labels] * upsample_factor),
        np.concatenate([self.valid_labels] * upsample_factor),
        np.concatenate([self.test_labels] * upsample_factor),
    )
57+
58+
59+
def concatenate_splits(
    splits: Sequence[DatasetSplit], dataset_lengths: Sequence[int]
) -> DatasetSplit:
  """Concatenates DatasetSplits, offsetting IDs by preceding dataset lengths.

  Args:
    splits: The per-dataset splits to merge, in order.
    dataset_lengths: The length of each underlying dataset. Split i's IDs are
      shifted by the total length of datasets 0..i-1.

  Returns:
    A single DatasetSplit covering the concatenated datasets.
  """
  # Offset for dataset i is the running sum of all preceding lengths.
  offsets, running = [], 0
  for length in dataset_lengths:
    offsets.append(running)
    running += length

  def _merged(attr: str, shift: bool) -> np.ndarray:
    # Gather the named field from every split, shifting IDs but not labels.
    parts = [
        getattr(split, attr) + (off if shift else 0)
        for split, off in zip(splits, offsets)
    ]
    return np.concatenate(parts)

  return DatasetSplit(
      _merged("train_ids", True),
      _merged("valid_ids", True),
      _merged("test_ids", True),
      _merged("train_labels", False),
      _merged("valid_labels", False),
      _merged("test_labels", False),
  )
82+
83+
84+
def split_indices_by_labels(
    labels: Sequence[int], ratios: Sequence[float],
    rng: np.random.RandomState) -> list[np.ndarray]:
  """Low-level function to generate arbitrary splits balanced by labels.

  Args:
    labels: The data labels to balance splits by.
    ratios: The ratios of the splits. A final implicit split will be included,
      so e.g. passing ratios=[0.8, 0.1] will result in an 80/10/10 percent
      split. (If ratios adds up to >=1 then the trailing splits will be empty.)
    rng: A np.random.RandomState to use for splitting.

  Returns:
    The indices into labels for each split (total len(ratios) + 1). This can be
    used to index into e.g. example IDs as well.
  """
  # Convert explicitly so `labels == label` below broadcasts element-wise even
  # for plain Python lists, rather than relying on np.unique's scalar type.
  labels = np.asarray(labels)
  split_indices = []
  for label in np.unique(labels):
    label_indices = np.flatnonzero(labels == label)
    rng.shuffle(label_indices)
    # Splits are rounded this way for backward compatibility.
    n = len(label_indices)
    boundaries = np.cumsum([int(ratio * n) for ratio in ratios])
    split_indices.append(np.split(label_indices, boundaries))

  # Transpose [per-label][per-split] -> [per-split], merging the labels.
  # np.concatenate (not the NumPy 2.0-only np.concat alias) keeps this
  # compatible with NumPy 1.x.
  return [np.concatenate(parts) for parts in zip(*split_indices)]
110+
111+
112+
def split_dataset_by_ratios(
    sample_ids: Sequence[int], seed: int, ratios: Sequence[float],
    labels: Sequence[int] | None = None,
) -> DatasetMultiSplit:
  """Splits dataset and labels by given ratios, balanced by labels.

  Args:
    sample_ids: IDs to identify examples, e.g. cell ids
    seed: random seed
    ratios: The ratios of the splits. A final implicit split will be included,
      so e.g. passing ratios=[0.8, 0.1] will result in an 80/10/10 percent
      split. (If ratios adds up to >=1 then the trailing splits will be empty.)
    labels: A label array of the same length as sample_ids. When passed, the
      samples for each label are distributed among the splits according to
      their ratios.

  Returns:
    DatasetMultiSplit
  """
  if len(np.unique(sample_ids)) != len(sample_ids):
    raise ValueError("Found repeated sample ids")

  num_samples = len(sample_ids)
  if labels is None:
    # No labels given: treat everything as a single balanced group.
    label_arr = np.zeros(num_samples, dtype=int)
  else:
    if len(labels) != num_samples:
      raise ValueError("labels must be of the same length as sample_ids")
    label_arr = np.array(labels, dtype=int)

  # Sort by sample id so the sampled splits are reproducible even when the
  # same samples arrive in a different order.
  id_arr = np.array(sample_ids, dtype=int)
  order = np.argsort(id_arr)
  id_arr = id_arr[order]
  label_arr = label_arr[order]

  split_indices = split_indices_by_labels(
      label_arr, ratios, np.random.RandomState(seed))
  return DatasetMultiSplit(
      [id_arr[idx] for idx in split_indices],
      [label_arr[idx] for idx in split_indices],
  )
153+
154+
155+
def split_dataset(
    sample_ids: Sequence[int], seed: int, train_ratio: float,
    valid_ratio: float = 0, labels: Sequence[int] | None = None,
) -> DatasetSplit:
  """Splits dataset into train / valid / test splits.

  Args:
    sample_ids: IDs to identify examples, e.g. cell ids
    seed: random seed
    train_ratio: ratio of training examples to sample (0-1)
    valid_ratio: ratio of validation examples to sample (0-1)
    labels: Optional label array of the same length as sample_ids. When passed,
      the samples for each label are distributed among the splits according to
      their ratios.

  Returns:
    DatasetSplit

  Raises:
    ValueError: If train_ratio + valid_ratio exceeds 1.
  """
  if train_ratio + valid_ratio > 1:
    # The constraint is on the sum of the ratios, so say so in the message.
    raise ValueError(
        "train_ratio + valid_ratio must be <= 1: "
        f"{train_ratio}, {valid_ratio}"
    )
  # The test ratio is implicit: 1 - train_ratio - valid_ratio.
  ratios = train_ratio, valid_ratio
  split = split_dataset_by_ratios(sample_ids, seed, ratios, labels)
  return DatasetSplit(
      train_ids=split.sample_id_splits[0],
      valid_ids=split.sample_id_splits[1],
      test_ids=split.sample_id_splits[2],
      train_labels=split.label_splits[0],
      valid_labels=split.label_splits[1],
      test_labels=split.label_splits[2],
  )
188+
189+
190+
def cross_validation_split_dataset(
    sample_ids: Sequence[int], seed: int, num_splits: int,
    labels: Sequence[int] | None = None) -> DatasetMultiSplit:
  """Splits dataset into num_splits, optionally balanced by labels.

  Args:
    sample_ids: IDs to identify examples, e.g. cell ids
    seed: random seed
    num_splits: The number of splits to produce; typically 5- or 10-fold.
      Must be >= 1.
    labels: Optional label array of the same length as sample_ids. When passed,
      the samples for each label are distributed among the splits according to
      their ratios.

  Returns:
    DatasetMultiSplit

  Raises:
    ValueError: If num_splits < 1.
  """
  # Validate explicitly rather than letting 1.0 / num_splits raise an opaque
  # ZeroDivisionError (or a negative count silently produce no ratios).
  if num_splits < 1:
    raise ValueError(f"num_splits must be >= 1, got {num_splits}")
  # Only num_splits - 1 explicit ratios; the final split is implicit.
  ratios = [1.0 / num_splits] * (num_splits - 1)
  return split_dataset_by_ratios(sample_ids, seed, ratios, labels)
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# coding=utf-8
2+
# Copyright 2026 The Google Research Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Tests for sampling module."""
16+
17+
from connectomics.brainstate import sampling
18+
import numpy as np
19+
from google3.testing.pybase import googletest
20+
21+
22+
class SamplingTest(googletest.TestCase):
  """Seed-pinned golden-value tests for the sampling split utilities.

  All expected arrays below are pinned to np.random.RandomState(22222);
  changing the seed or the shuffle order breaks these tests by design.
  """

  def test_split_indices_by_labels(self):
    # Two balanced label groups of five; ratio 0.8 keeps 4 of each group in
    # split 0 and leaves 1 of each in the implicit final split.
    labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    ratios = [0.8]
    rng = np.random.RandomState(22222)
    splits = sampling.split_indices_by_labels(labels, ratios, rng)
    np.testing.assert_array_equal(splits[0], [4, 0, 3, 2, 9, 6, 8, 5])
    np.testing.assert_array_equal(splits[1], [1, 7])

  def test_empty_split(self):
    # A 0.0 ratio yields an empty middle split without shifting the others.
    labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    ratios = [0.8, 0.0]
    rng = np.random.RandomState(22222)
    splits = sampling.split_indices_by_labels(labels, ratios, rng)
    np.testing.assert_array_equal(splits[0], [4, 0, 3, 2, 9, 6, 8, 5])
    np.testing.assert_array_equal(splits[1], [])
    np.testing.assert_array_equal(splits[2], [1, 7])

  def test_split_dataset(self):
    sample_ids = range(10)
    seed = 22222
    train_ratio = 0.7
    valid_ratio = 0.1  # Test 0.2 implicit.
    split = sampling.split_dataset(sample_ids, seed, train_ratio, valid_ratio)
    np.testing.assert_array_equal(split.train_ids, [3, 5, 9, 4, 6, 7, 0])
    np.testing.assert_array_equal(split.valid_ids, [8])
    np.testing.assert_array_equal(split.test_ids, [2, 1])
    # Without labels every sample falls into the single implicit label 0.
    np.testing.assert_array_equal(split.train_labels, [0, 0, 0, 0, 0, 0, 0])
    np.testing.assert_array_equal(split.valid_labels, [0])
    np.testing.assert_array_equal(split.test_labels, [0, 0])

    # Results should be balanced by labels.
    labels = [1, 1, 1, 2, 2, 2, 2, 2, 2, 2]
    split = sampling.split_dataset(
        sample_ids, seed, train_ratio, valid_ratio, labels)
    np.testing.assert_array_equal(split.train_ids, [2, 0, 7, 4, 6, 8])
    np.testing.assert_array_equal(split.valid_ids, [])
    np.testing.assert_array_equal(split.test_ids, [1, 9, 3, 5])
    np.testing.assert_array_equal(split.train_labels, [1, 1, 2, 2, 2, 2])
    np.testing.assert_array_equal(split.valid_labels, [])
    np.testing.assert_array_equal(split.test_labels, [1, 2, 2, 2])

  def test_upsample(self):
    sample_ids = range(10)
    labels = [1, 1, 1, 2, 2, 2, 2, 2, 2, 2]
    seed = 22222
    train_ratio = 0.7
    valid_ratio = 0.1  # Test 0.2 implicit.
    split = sampling.split_dataset(
        sample_ids, seed, train_ratio, valid_ratio, labels
    )
    # Replica 1's IDs are the originals shifted by dataset_len=10; labels
    # repeat unchanged.
    upsampled = split.upsampled(upsample_factor=2, dataset_len=10)
    np.testing.assert_array_equal(
        upsampled.train_ids, [2, 0, 7, 4, 6, 8, 12, 10, 17, 14, 16, 18]
    )
    np.testing.assert_array_equal(upsampled.valid_ids, [])
    np.testing.assert_array_equal(
        upsampled.test_ids, [1, 9, 3, 5, 11, 19, 13, 15]
    )
    np.testing.assert_array_equal(
        upsampled.train_labels, [1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2]
    )
    np.testing.assert_array_equal(upsampled.valid_labels, [])
    np.testing.assert_array_equal(
        upsampled.test_labels, [1, 2, 2, 2, 1, 2, 2, 2]
    )

  def test_concatenate_splits(self):
    sample_ids = range(10)
    seed = 22222
    train_ratio = 0.7
    valid_ratio = 0.1  # Test 0.2 implicit.
    split = sampling.split_dataset(sample_ids, seed, train_ratio, valid_ratio)

    labels = [1, 1, 1, 2, 2, 2, 2, 2, 2, 2]
    split2 = sampling.split_dataset(
        sample_ids, seed, train_ratio, valid_ratio, labels
    )

    # The second split's IDs are offset by the first dataset's length (10).
    dataset_lengths = [10, 10]
    concat = sampling.concatenate_splits([split, split2], dataset_lengths)
    np.testing.assert_array_equal(
        concat.train_ids, [3, 5, 9, 4, 6, 7, 0, 12, 10, 17, 14, 16, 18]
    )
    np.testing.assert_array_equal(concat.valid_ids, [8])
    np.testing.assert_array_equal(concat.test_ids, [2, 1, 11, 19, 13, 15])
    np.testing.assert_array_equal(
        concat.train_labels, [0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2]
    )
    np.testing.assert_array_equal(concat.valid_labels, [0])
    np.testing.assert_array_equal(concat.test_labels, [0, 0, 1, 2, 2, 2])

  def test_cross_validation_split_dataset(self):
    sample_ids = range(10)
    seed = 22222
    # 5-fold: five equal splits of two samples each.
    num_splits = 5
    splits = sampling.cross_validation_split_dataset(
        sample_ids, seed, num_splits).sample_id_splits
    np.testing.assert_array_equal(splits[0], [3, 5])
    np.testing.assert_array_equal(splits[1], [9, 4])
    np.testing.assert_array_equal(splits[2], [6, 7])
    np.testing.assert_array_equal(splits[3], [0, 8])
    np.testing.assert_array_equal(splits[4], [2, 1])

    # 2-fold: one explicit 50% split plus the implicit remainder.
    num_splits = 2
    splits = sampling.cross_validation_split_dataset(
        sample_ids, seed, num_splits).sample_id_splits
    np.testing.assert_array_equal(splits[0], [3, 5, 9, 4, 6])
    np.testing.assert_array_equal(splits[1], [7, 0, 8, 2, 1])
132+
133+
134+
if __name__ == "__main__":
  # Delegate to the googletest runner when this file is executed directly.
  googletest.main()

0 commit comments

Comments
 (0)