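Tidy up object detection: remove old helpers, rename ranking API

- computation.py: remove `compute_label_metadata` and the previous
  array-only `rank_pairs`; rename `rank_pairs_returning_indices` to
  `rank_pairs` and update its call site in `rank_table`.
- evaluator.py: move `create_mapping` to utilities.py and import it from
  there; drop the `labels` field from `Filter`; turn `_generate_meta` into
  the static method `generate_meta(dataset, labels_override)`; rename
  `create_ranked_cache` to `rank`; annotate the `ranked` property as
  returning `ds.Dataset`; move `iterate_pairs` and
  `iterate_pairs_with_table` above `filter`.
- loader.py: call `evaluator.rank(...)` instead of
  `evaluator.create_ranked_cache(...)`.
- utilities.py: add `create_mapping` (with a `pyarrow` import) and drop the
  `PairClassification` import.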

czaloom · czaloom · commit d89e0cfd0c8f · 2025-10-16T16:47:36.000-04:00
diff --git a/src/valor_lite/object_detection/computation.py b/src/valor_lite/object_detection/computation.py
@@ -174,54 +174,7 @@ def compute_polygon_iou(
     return ious
 
 
-def compute_label_metadata(
-    ids: NDArray[np.int32],
-    n_labels: int,
-) -> NDArray[np.uint32]:
-    """
-    Computes label metadata returning a count of annotations per label.
-
-    Parameters
-    ----------
-    detailed_pairs : NDArray[np.int32]
-        Detailed annotation pairings with shape (N, 7).
-            Index 0 - Datum Index
-            Index 1 - GroundTruth Index
-            Index 2 - Prediction Index
-            Index 3 - GroundTruth Label Index
-            Index 4 - Prediction Label Index
-    n_labels : int
-        The total number of unique labels.
-
-    Returns
-    -------
-    NDArray[np.int32]
-        The label metadata array with shape (n_labels, 2).
-            Index 0 - Ground truth label count
-            Index 1 - Prediction label count
-    """
-    label_metadata = np.zeros((n_labels, 2), dtype=np.uint32)
-
-    ground_truth_pairs = ids[:, (0, 1, 3)]
-    ground_truth_pairs = ground_truth_pairs[ground_truth_pairs[:, 1] >= 0]
-    unique_pairs = np.unique(ground_truth_pairs, axis=0)
-    label_indices, unique_counts = np.unique(
-        unique_pairs[:, 2], return_counts=True
-    )
-    label_metadata[label_indices.astype(np.int32), 0] = unique_counts
-
-    prediction_pairs = ids[:, (0, 2, 4)]
-    prediction_pairs = prediction_pairs[prediction_pairs[:, 1] >= 0]
-    unique_pairs = np.unique(prediction_pairs, axis=0)
-    label_indices, unique_counts = np.unique(
-        unique_pairs[:, 2], return_counts=True
-    )
-    label_metadata[label_indices.astype(np.int32), 1] = unique_counts
-
-    return label_metadata
-
-
-def rank_pairs_returning_indices(sorted_pairs: NDArray[np.float64]):
+def rank_pairs(sorted_pairs: NDArray[np.float64]):
     """
     Prunes and ranks prediction pairs.
 
@@ -327,7 +280,7 @@ def rank_table(tbl: pa.Table, number_of_labels: int) -> pa.Table:
     pairs = np.column_stack(
         [sorted_tbl[col].to_numpy() for col in numeric_columns]
     )
-    pairs, indices = rank_pairs_returning_indices(pairs)
+    pairs, indices = rank_pairs(pairs)
     ranked_tbl = sorted_tbl.take(indices)
     lower_iou_bound, winning_predictions = calculate_ranking_boundaries(
         pairs, number_of_labels=number_of_labels
@@ -344,57 +297,6 @@ def rank_table(tbl: pa.Table, number_of_labels: int) -> pa.Table:
     return ranked_tbl
 
 
-def rank_pairs(
-    detailed_pairs: NDArray[np.float64],
-) -> NDArray[np.float64]:
-    """
-    Highly optimized pair ranking for computing precision and recall based metrics.
-
-    Only ground truths and predictions that provide unique information are kept. The unkept
-    pairs are represented via the label metadata array.
-
-    Parameters
-    ----------
-    detailed_pairs : NDArray[np.float64]
-        Detailed annotation pairs with shape (n_pairs, 7).
-            Index 0 - Datum Index
-            Index 1 - GroundTruth Index
-            Index 2 - Prediction Index
-            Index 3 - GroundTruth Label Index
-            Index 4 - Prediction Label Index
-            Index 5 - IOU
-            Index 6 - Score
-
-    Returns
-    -------
-    NDArray[np.float64]
-        Array of ranked pairs for precision-recall metric computation.
-    """
-    # remove unmatched ground truths
-    pairs = detailed_pairs[detailed_pairs[:, 2] >= 0.0]
-
-    # find best fits for prediction
-    mask_label_match = np.isclose(pairs[:, 3], pairs[:, 4])
-    matched_predictions = np.unique(pairs[mask_label_match, 2])
-    mask_unmatched_predictions = ~np.isin(pairs[:, 2], matched_predictions)
-    pairs = pairs[mask_label_match | mask_unmatched_predictions]
-
-    # only keep the highest ranked pair
-    _, indices = np.unique(pairs[:, [0, 2, 4]], axis=0, return_index=True)
-    pairs = pairs[indices]
-
-    # np.unique orders its results by value, we need to sort the indices to maintain the results of the lexsort
-    indices = np.lexsort(
-        (
-            -pairs[:, 5],  # iou
-            -pairs[:, 6],  # score
-        )
-    )
-    pairs = pairs[indices]
-
-    return pairs
-
-
 def compute_counts(
     ranked_pairs: NDArray[np.float64],
     iou_thresholds: NDArray[np.float64],
diff --git a/src/valor_lite/object_detection/evaluator.py b/src/valor_lite/object_detection/evaluator.py
@@ -10,7 +10,6 @@
 import pyarrow.compute as pc
 import pyarrow.dataset as ds
 import pyarrow.parquet as pq
-from numpy.typing import NDArray
 
 from valor_lite.cache import CacheReader, CacheWriter, DataType
 from valor_lite.object_detection.computation import (
@@ -24,27 +23,13 @@
 )
 from valor_lite.object_detection.metric import Metric, MetricType
 from valor_lite.object_detection.utilities import (
+    create_mapping,
     unpack_confusion_matrix,
     unpack_examples,
     unpack_precision_recall_into_metric_lists,
 )
 
 
-def create_mapping(
-    tbl: pa.Table,
-    pairs: NDArray[np.float64],
-    index: int,
-    id_col: str,
-    uid_col: str,
-) -> dict[int, str]:
-    col = pairs[:, index].astype(np.int64)
-    values, indices = np.unique(col, return_index=True)
-    indices = indices[values >= 0]
-    return {
-        tbl[id_col][idx].as_py(): tbl[uid_col][idx].as_py() for idx in indices
-    }
-
-
 @dataclass
 class EvaluatorInfo:
     number_of_datums: int = 0
@@ -65,7 +50,6 @@ class Filter:
     datums: pc.Expression | None = None
     groundtruths: pc.Expression | None = None
     predictions: pc.Expression | None = None
-    labels: pc.Expression | None = None
 
 
 class Evaluator:
@@ -90,7 +74,7 @@ def __init__(
             self._index_to_label,
             self._number_of_groundtruths_per_label,
             self._info,
-        ) = self._generate_meta(labels_override)
+        ) = self.generate_meta(self._dataset, labels_override)
 
         with open(self._metadata_path, "r") as f:
             types = json.load(f)
@@ -135,19 +119,22 @@ def detailed(self) -> ds.Dataset:
         return self._dataset
 
     @property
-    def ranked(self):
+    def ranked(self) -> ds.Dataset:
         return ds.dataset(self._ranked_path, format="parquet")
 
     @property
     def info(self) -> EvaluatorInfo:
         return self._info
 
-    def _generate_meta(self, labels_override: dict[int, str] | None):
+    @staticmethod
+    def generate_meta(
+        dataset: ds.Dataset, labels_override: dict[int, str] | None
+    ):
         gt_counts_per_lbl = defaultdict(int)
         labels = labels_override if labels_override else {}
         info = EvaluatorInfo()
 
-        for fragment in self.detailed.get_fragments():
+        for fragment in dataset.get_fragments():
             tbl = fragment.to_table()
             columns = (
                 "datum_id",
@@ -217,6 +204,29 @@ def _generate_meta(self, labels_override: dict[int, str] | None):
 
         return labels, number_of_groundtruths_per_label, info
 
+    @staticmethod
+    def iterate_pairs(
+        dataset: ds.Dataset,
+        columns: list[str] | None = None,
+    ):
+        for fragment in dataset.get_fragments():
+            tbl = fragment.to_table(columns=columns)
+            yield np.column_stack(
+                [tbl.column(i).to_numpy() for i in range(tbl.num_columns)]
+            )
+
+    @staticmethod
+    def iterate_pairs_with_table(
+        dataset: ds.Dataset,
+        columns: list[str] | None = None,
+    ):
+        for fragment in dataset.get_fragments():
+            tbl = fragment.to_table()
+            columns = columns if columns else tbl.columns
+            yield tbl, np.column_stack(
+                [tbl[col].to_numpy() for col in columns]
+            )
+
     def filter(
         self,
         filter_expr: Filter,
@@ -234,7 +244,7 @@ def filter(
             filter_expr=filter_expr,
         )
 
-    def create_ranked_cache(
+    def rank(
         self,
         where: str | Path,
         rows_per_file: int | None = None,
@@ -338,29 +348,6 @@ def generate_heap_item(batches, batch_idx, row_idx):
                                 heap, generate_heap_item(batches, batch_idx, 0)
                             )
 
-    @staticmethod
-    def iterate_pairs(
-        dataset: ds.Dataset,
-        columns: list[str] | None = None,
-    ):
-        for fragment in dataset.get_fragments():
-            tbl = fragment.to_table(columns=columns)
-            yield np.column_stack(
-                [tbl.column(i).to_numpy() for i in range(tbl.num_columns)]
-            )
-
-    @staticmethod
-    def iterate_pairs_with_table(
-        dataset: ds.Dataset,
-        columns: list[str] | None = None,
-    ):
-        for fragment in dataset.get_fragments():
-            tbl = fragment.to_table()
-            columns = columns if columns else tbl.columns
-            yield tbl, np.column_stack(
-                [tbl[col].to_numpy() for col in columns]
-            )
-
     def compute_precision_recall(
         self,
         iou_thresholds: list[float],
diff --git a/src/valor_lite/object_detection/loader.py b/src/valor_lite/object_detection/loader.py
@@ -453,7 +453,7 @@ def filter(
             name=loader._name,
             labels_override=evaluator._index_to_label,
         )
-        evaluator.create_ranked_cache(where=loader._ranked_path)
+        evaluator.rank(where=loader._ranked_path)
         return evaluator
 
     def finalize(self):
@@ -473,5 +473,5 @@ def finalize(self):
             directory=self._directory,
             name=self._name,
         )
-        evaluator.create_ranked_cache(where=self._ranked_path)
+        evaluator.rank(where=self._ranked_path)
         return evaluator
diff --git a/src/valor_lite/object_detection/utilities.py b/src/valor_lite/object_detection/utilities.py
@@ -1,9 +1,9 @@
 from collections import defaultdict
 
 import numpy as np
+import pyarrow as pa
 from numpy.typing import NDArray
 
-from valor_lite.object_detection.computation import PairClassification
 from valor_lite.object_detection.metric import Metric, MetricType
 
 
@@ -328,6 +328,21 @@ def unpack_confusion_matrix(
     return metrics
 
 
+def create_mapping(
+    tbl: pa.Table,
+    pairs: NDArray[np.float64],
+    index: int,
+    id_col: str,
+    uid_col: str,
+) -> dict[int, str]:
+    col = pairs[:, index].astype(np.int64)
+    values, indices = np.unique(col, return_index=True)
+    indices = indices[values >= 0]
+    return {
+        tbl[id_col][idx].as_py(): tbl[uid_col][idx].as_py() for idx in indices
+    }
+
+
 def unpack_examples(
     detailed_pairs: NDArray[np.float64],
     mask_tp: NDArray[np.bool_],

Original file line number	Diff line number	Diff line change
`@@ -453,7 +453,7 @@ def filter(`
`453`	`453`	`name=loader._name,`
`454`	`454`	`labels_override=evaluator._index_to_label,`
`455`	`455`	`)`
`456`		`- evaluator.create_ranked_cache(where=loader._ranked_path)`
	`456`	`+ evaluator.rank(where=loader._ranked_path)`
`457`	`457`	`return evaluator`
`458`	`458`
`459`	`459`	`def finalize(self):`
`@@ -473,5 +473,5 @@ def finalize(self):`
`473`	`473`	`directory=self._directory,`
`474`	`474`	`name=self._name,`
`475`	`475`	`)`
`476`		`- evaluator.create_ranked_cache(where=self._ranked_path)`
	`476`	`+ evaluator.rank(where=self._ranked_path)`
`477`	`477`	`return evaluator`