Refactor _get_class_balancing_data() (#191)

lukas-lightly · web-flow · commit 462b300e5217 · 2025-11-24T16:01:10.000+01:00
diff --git a/lightly_studio/src/lightly_studio/selection/select_via_db.py b/lightly_studio/src/lightly_studio/selection/select_via_db.py
@@ -12,7 +12,6 @@
 from numpy.typing import NDArray
 from sqlmodel import Session
 
-from lightly_studio.models.annotation.annotation_base import AnnotationBaseTable
 from lightly_studio.models.tag import TagCreate
 from lightly_studio.resolvers import (
     annotation_label_resolver,
@@ -75,16 +74,16 @@ def _aggregate_class_distributions(
 def _process_explicit_target_distribution(
     session: Session,
     target_distribution: dict[str, float],
-    annotations: Sequence[AnnotationBaseTable],
+    annotation_label_ids: Sequence[UUID],
 ) -> tuple[dict[UUID, float], set[UUID], float]:
     """Processes the explicit target distribution.
 
     Args:
         session: The SQLAlchemy session.
         target_distribution:
             A dictionary mapping annotation label names to their target proportions.
-        annotations:
-            A sequence of all annotations to consider for class balancing.
+        annotation_label_ids:
+            A sequence of all annotation label IDs to consider for class balancing.
 
     Returns:
         Tuple of:
@@ -111,7 +110,7 @@ def _process_explicit_target_distribution(
         label_id_to_target[annotation_label.annotation_label_id] = target
         total_targets += target
 
-    all_label_ids = {a.annotation_label_id for a in annotations}
+    all_label_ids = set(annotation_label_ids)
     unused_label_ids = all_label_ids - set(label_id_to_target.keys())
     # `total_targets` can be more or less than 1.0. Both can be ignored, selection will still
     # try correctly to reach the target.
@@ -122,18 +121,18 @@ def _process_explicit_target_distribution(
 def _get_class_balancing_data(
     session: Session,
     strat: AnnotationClassBalancingStrategy,
-    annotations: Sequence[AnnotationBaseTable],
+    annotation_label_ids: Sequence[UUID],
     input_sample_ids: Sequence[UUID],
     sample_id_to_annotation_label_ids: Mapping[UUID, list[UUID]],
 ) -> tuple[NDArray[np.float32], list[float]]:
     """Helper function to get class balancing data."""
     if strat.target_distribution == "uniform":
-        target_keys_set = {a.annotation_label_id for a in annotations}
+        target_keys_set = set(annotation_label_ids)
         target_keys = list(target_keys_set)
         target_values = [1.0 / len(target_keys)] * len(target_keys)
     elif strat.target_distribution == "input":
         # Count the number of times each label appears in the input
-        input_label_count = Counter(a.annotation_label_id for a in annotations)
+        input_label_count = Counter(annotation_label_ids)
         target_keys, target_values = (
             list(input_label_count.keys()),
             list(input_label_count.values()),
@@ -143,18 +142,18 @@ def _get_class_balancing_data(
             _process_explicit_target_distribution(
                 session=session,
                 target_distribution=strat.target_distribution,
-                annotations=annotations,
+                annotation_label_ids=annotation_label_ids,
             )
         )
         if len(unused_label_ids) >= 1:
             other_uuid = uuid4()
             # Handle the case when not all classes have a target.
             # We replace UUIDs that are present in `unused_label_ids` for `other_uuid` and the
             # target for `other_uuid` is `remaining_ratio`.
-            for annotation_label_ids in sample_id_to_annotation_label_ids.values():
-                for i, label_id in enumerate(annotation_label_ids):
+            for sample_annotation_label_ids in sample_id_to_annotation_label_ids.values():
+                for i, label_id in enumerate(sample_annotation_label_ids):
                     if label_id in unused_label_ids:
-                        annotation_label_ids[i] = other_uuid
+                        sample_annotation_label_ids[i] = other_uuid
             label_id_to_target[other_uuid] = remaining_ratio
 
         target_keys, target_values = (
@@ -230,6 +229,7 @@ def select_via_database(
                 session=session,
                 filters=AnnotationsFilter(sample_ids=input_sample_ids),
             ).annotations
+            annotation_label_ids = [a.annotation_label_id for a in annotations]
             sample_id_to_annotation_label_ids = defaultdict(list)
             for annotation in annotations:
                 sample_id_to_annotation_label_ids[annotation.parent_sample_id].append(
@@ -239,7 +239,7 @@ def select_via_database(
             class_distributions, target_values = _get_class_balancing_data(
                 session=session,
                 strat=strat,
-                annotations=annotations,
+                annotation_label_ids=annotation_label_ids,
                 input_sample_ids=input_sample_ids,
                 sample_id_to_annotation_label_ids=sample_id_to_annotation_label_ids,
             )
diff --git a/lightly_studio/tests/selection/test_select_via_db.py b/lightly_studio/tests/selection/test_select_via_db.py
@@ -9,7 +9,6 @@
 from pytest_mock import MockerFixture
 from sqlmodel import Session
 
-from lightly_studio.models.annotation.annotation_base import AnnotationBaseTable, AnnotationType
 from lightly_studio.models.tag import TagCreate
 from lightly_studio.resolvers import (
     image_resolver,
@@ -732,31 +731,11 @@ def test_get_class_balancing_data_input(test_db: Session) -> None:
     label_id_dog = UUID("00000000-0000-0000-0000-000000000002")
     sample_id_1 = UUID("11111111-1111-1111-1111-111111111111")
     sample_id_2 = UUID("22222222-2222-2222-2222-222222222222")
-    dataset_id = uuid4()
-
-    ann_cat_1 = AnnotationBaseTable(
-        annotation_label_id=label_id_cat,
-        parent_sample_id=sample_id_1,
-        dataset_id=dataset_id,
-        annotation_type=AnnotationType.CLASSIFICATION,
-    )
-    ann_cat_2 = AnnotationBaseTable(
-        annotation_label_id=label_id_cat,
-        parent_sample_id=sample_id_2,
-        dataset_id=dataset_id,
-        annotation_type=AnnotationType.CLASSIFICATION,
-    )
-    ann_dog_1 = AnnotationBaseTable(
-        annotation_label_id=label_id_dog,
-        parent_sample_id=sample_id_2,
-        dataset_id=dataset_id,
-        annotation_type=AnnotationType.CLASSIFICATION,
-    )
 
     # The order of target keys depends on the insertion order in this list.
     # 'cat' appears first, 'dog' appears second.
     # Target Keys: [cat, dog]
-    all_annotations = [ann_cat_1, ann_cat_2, ann_dog_1]
+    all_annotation_labels = [label_id_cat, label_id_cat, label_id_dog]
     input_sample_ids = [sample_id_1, sample_id_2]
 
     sample_id_to_annotation_label_ids = {
@@ -769,7 +748,7 @@ def test_get_class_balancing_data_input(test_db: Session) -> None:
     class_dist, target_vals = _get_class_balancing_data(
         session=test_db,
         strat=strat,
-        annotations=all_annotations,
+        annotation_label_ids=all_annotation_labels,
         input_sample_ids=input_sample_ids,
         sample_id_to_annotation_label_ids=sample_id_to_annotation_label_ids,
     )
@@ -790,28 +769,8 @@ def test_get_class_balancing_data_uniform(test_db: Session) -> None:
     label_id_dog = UUID("00000000-0000-0000-0000-000000000002")
     sample_id_1 = UUID("11111111-1111-1111-1111-111111111111")
     sample_id_2 = UUID("22222222-2222-2222-2222-222222222222")
-    dataset_id = uuid4()
 
-    ann_cat_1 = AnnotationBaseTable(
-        annotation_label_id=label_id_cat,
-        parent_sample_id=sample_id_1,
-        dataset_id=dataset_id,
-        annotation_type=AnnotationType.CLASSIFICATION,
-    )
-    ann_cat_2 = AnnotationBaseTable(
-        annotation_label_id=label_id_cat,
-        parent_sample_id=sample_id_2,
-        dataset_id=dataset_id,
-        annotation_type=AnnotationType.CLASSIFICATION,
-    )
-    ann_dog_1 = AnnotationBaseTable(
-        annotation_label_id=label_id_dog,
-        parent_sample_id=sample_id_2,
-        dataset_id=dataset_id,
-        annotation_type=AnnotationType.CLASSIFICATION,
-    )
-
-    all_annotations = [ann_cat_1, ann_cat_2, ann_dog_1]
+    all_annotation_labels = [label_id_cat, label_id_cat, label_id_dog]
     input_sample_ids = [sample_id_1, sample_id_2]
 
     sample_id_to_annotation_label_ids = {
@@ -824,7 +783,7 @@ def test_get_class_balancing_data_uniform(test_db: Session) -> None:
     class_dist, target_vals = _get_class_balancing_data(
         session=test_db,
         strat=strat,
-        annotations=all_annotations,
+        annotation_label_ids=all_annotation_labels,
         input_sample_ids=input_sample_ids,
         sample_id_to_annotation_label_ids=sample_id_to_annotation_label_ids,
     )
@@ -849,28 +808,8 @@ def test_get_class_balancing_data_target(test_db: Session) -> None:
 
     sample_id_1 = UUID("11111111-1111-1111-1111-111111111111")
     sample_id_2 = UUID("22222222-2222-2222-2222-222222222222")
-    dataset_id = uuid4()
-
-    ann_cat_1 = AnnotationBaseTable(
-        annotation_label_id=label_id_cat,
-        parent_sample_id=sample_id_1,
-        dataset_id=dataset_id,
-        annotation_type=AnnotationType.CLASSIFICATION,
-    )
-    ann_cat_2 = AnnotationBaseTable(
-        annotation_label_id=label_id_cat,
-        parent_sample_id=sample_id_2,
-        dataset_id=dataset_id,
-        annotation_type=AnnotationType.CLASSIFICATION,
-    )
-    ann_dog_1 = AnnotationBaseTable(
-        annotation_label_id=label_id_dog,
-        parent_sample_id=sample_id_2,
-        dataset_id=dataset_id,
-        annotation_type=AnnotationType.CLASSIFICATION,
-    )
 
-    all_annotations = [ann_cat_1, ann_cat_2, ann_dog_1]
+    all_annotation_labels = [label_id_cat, label_id_cat, label_id_dog]
     input_sample_ids = [sample_id_1, sample_id_2]
 
     sample_id_to_annotation_label_ids = {
@@ -888,7 +827,7 @@ def test_get_class_balancing_data_target(test_db: Session) -> None:
     class_dist, target_vals = _get_class_balancing_data(
         session=test_db,
         strat=strat,
-        annotations=all_annotations,
+        annotation_label_ids=all_annotation_labels,
         input_sample_ids=input_sample_ids,
         sample_id_to_annotation_label_ids=sample_id_to_annotation_label_ids,
     )