4 changes: 4 additions & 0 deletions README.md
@@ -56,6 +56,10 @@ and correlate to human judgments.
 
 **TODO** list evaluation metrics here.
 
+### Contributing
+
+Please make sure to run `black pose_evaluation` before submitting a pull request.
+
 ## Cite
 
 If you use our toolkit in your research or projects, please consider citing the work.
43 changes: 33 additions & 10 deletions pose_evaluation/evaluation/evaluate_signclip.py
@@ -1,14 +1,17 @@
 import argparse
-from pathlib import Path
-import time
 import json
 import random
-import pandas as pd
+import time
+from pathlib import Path
+
 import numpy as np
+import pandas as pd
 import torch
 from tqdm import tqdm
+
 from pose_evaluation.metrics.embedding_distance_metric import EmbeddingDistanceMetric
 
+
 def load_embedding(file_path: Path) -> np.ndarray:
     """
     Load a SignCLIP embedding from a .npy file, ensuring it has the correct shape.
@@ -61,7 +64,10 @@ def get_embedding(video_file):
 
 
 def calculate_mean_distances(
-    distance_matrix: torch.Tensor, indices_a: torch.Tensor, indices_b: torch.Tensor, exclude_self: bool = False
+    distance_matrix: torch.Tensor,
+    indices_a: torch.Tensor,
+    indices_b: torch.Tensor,
+    exclude_self: bool = False,
 ) -> float:
     """
     Calculate the mean of distances between two sets of indices in a 2D distance matrix.
@@ -92,7 +98,6 @@ def calculate_mean_distances(
 
 
 def generate_synthetic_data(num_items, num_classes, num_items_per_class=4):
-
     torch.manual_seed(42)
     random.seed(42)
     # distance_matrix = torch.rand((num_items, num_items)) * 100
@@ -238,7 +243,7 @@ def evaluate_signclip(emb_dir: Path, split_file: Path, out_path: Path, kind: str
 
     find_class_distances_end = time.perf_counter()
 
-    print(f"Finding within and without took {find_class_distances_end-find_class_distances_start}")
+    print(f"Finding within and without took {find_class_distances_end - find_class_distances_start}")
 
     analysis_end = time.perf_counter()
     analysis_duration = analysis_end - analysis_start
@@ -288,8 +293,17 @@ def evaluate_signclip(emb_dir: Path, split_file: Path, out_path: Path, kind: str
 
 def main():
     parser = argparse.ArgumentParser(description="Evaluate SignCLIP embeddings with score_all.")
-    parser.add_argument("emb_dir", type=Path, help="Path to the directory containing SignCLIP .npy files")
-    parser.add_argument("--split_file", type=Path, required=True, help="Path to the split CSV file (e.g., test.csv)")
+    parser.add_argument(
+        "emb_dir",
+        type=Path,
+        help="Path to the directory containing SignCLIP .npy files",
+    )
+    parser.add_argument(
+        "--split_file",
+        type=Path,
+        required=True,
+        help="Path to the split CSV file (e.g., test.csv)",
+    )
     parser.add_argument(
         "--kind",
         type=str,
@@ -298,7 +312,11 @@ def main():
         help="Type of distance metric to use (default: cosine)",
     )
 
-    parser.add_argument("--out_path", type=Path, help="Where to save output distance npz matrix+file list")
+    parser.add_argument(
+        "--out_path",
+        type=Path,
+        help="Where to save output distance npz matrix+file list",
+    )
 
     args = parser.parse_args()
 
@@ -311,7 +329,12 @@
 
     print(f"Scores will be saved to {output_file}")
 
-    evaluate_signclip(emb_dir=args.emb_dir, split_file=args.split_file, out_path=output_file, kind=args.kind)
+    evaluate_signclip(
+        emb_dir=args.emb_dir,
+        split_file=args.split_file,
+        out_path=output_file,
+        kind=args.kind,
+    )
 
 
 if __name__ == "__main__":
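Reviewer note for context: only the reformatted signature of `calculate_mean_distances` appears in this diff, not its body. A minimal sketch of the semantics the name, signature, and docstring suggest — the indexing and self-exclusion logic below is an assumption for illustration, not the file's actual implementation:

```python
import torch


def mean_distance_sketch(
    distance_matrix: torch.Tensor,
    indices_a: torch.Tensor,
    indices_b: torch.Tensor,
    exclude_self: bool = False,
) -> float:
    # Sub-matrix of distances from every index in A to every index in B.
    sub = distance_matrix[indices_a][:, indices_b]
    if exclude_self:
        # Drop cells comparing an item with itself; their zero distance would bias the mean.
        self_mask = indices_a.unsqueeze(1) == indices_b.unsqueeze(0)
        sub = sub[~self_mask]
    return sub.float().mean().item()


# Toy 3x3 distance matrix; mean intra-class distance for items {0, 1}:
d = torch.tensor([[0.0, 1.0, 4.0], [1.0, 0.0, 2.0], [4.0, 2.0, 0.0]])
print(mean_distance_sketch(d, torch.tensor([0, 1]), torch.tensor([0, 1]), exclude_self=True))  # 1.0
```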
27 changes: 10 additions & 17 deletions pose_evaluation/examples/example_metric_construction.py
@@ -1,19 +1,18 @@
 from pathlib import Path
 
 from pose_format import Pose
-from pose_evaluation.metrics.distance_metric import DistanceMetric
-from pose_evaluation.metrics.distance_measure import AggregatedPowerDistance
+
 from pose_evaluation.metrics.base import BaseMetric
+from pose_evaluation.metrics.distance_measure import AggregatedPowerDistance
+from pose_evaluation.metrics.distance_metric import DistanceMetric
 from pose_evaluation.metrics.test_distance_metric import get_poses
 from pose_evaluation.utils.pose_utils import zero_pad_shorter_poses
 
 if __name__ == "__main__":
     # Define file paths for test pose data
-    reference_file = (
-        Path("pose_evaluation") / "utils" / "test" / "test_data" / "colin-1-HOUSE.pose"
-    )
-    hypothesis_file = (
-        Path("pose_evaluation") / "utils" / "test" / "test_data" / "colin-2-HOUSE.pose"
-    )
+    test_data_path = Path("pose_evaluation") / "utils" / "test" / "test_data"
+    reference_file = test_data_path / "colin-1-HOUSE.pose"
+    hypothesis_file = test_data_path / "colin-2-HOUSE.pose"
+
     # Choose whether to load real files or generate test poses
     # They have different lengths, and so some metrics will crash!
@@ -33,25 +32,19 @@
     poses = [hypothesis, reference]
 
     # Define distance metrics
-    mean_l1_metric = DistanceMetric(
-        "mean_l1_metric", distance_measure=AggregatedPowerDistance(1, 17)
-    )
+    mean_l1_metric = DistanceMetric("mean_l1_metric", distance_measure=AggregatedPowerDistance(1, 17))
     metrics = [
         BaseMetric("base"),
         DistanceMetric("PowerDistanceMetric", AggregatedPowerDistance(2, 1)),
         DistanceMetric("AnotherPowerDistanceMetric", AggregatedPowerDistance(1, 10)),
         mean_l1_metric,
         DistanceMetric(
             "max_l1_metric",
-            AggregatedPowerDistance(
-                order=1, aggregation_strategy="max", default_distance=0
-            ),
+            AggregatedPowerDistance(order=1, aggregation_strategy="max", default_distance=0),
         ),
         DistanceMetric(
             "MeanL2Score",
-            AggregatedPowerDistance(
-                order=2, aggregation_strategy="mean", default_distance=0
-            ),
+            AggregatedPowerDistance(order=2, aggregation_strategy="mean", default_distance=0),
         ),
     ]
 
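A note on the positional calls that black collapsed here: judging from the keyword form used further down in the same list, `AggregatedPowerDistance(1, 17)` reads as order=1 (an L1 distance) with 17 as the fill-in distance for masked values. A hedged sketch of the two spellings side by side — the positional argument order and the default aggregation are inferred, not confirmed by this diff:

```python
from pose_evaluation.metrics.distance_measure import AggregatedPowerDistance
from pose_evaluation.metrics.distance_metric import DistanceMetric

# Assumed equivalent, if the positional order is (order, default_distance)
# and the aggregation defaults to "mean":
terse = DistanceMetric("mean_l1_metric", AggregatedPowerDistance(1, 17))
spelled_out = DistanceMetric(
    "mean_l1_metric",
    AggregatedPowerDistance(order=1, aggregation_strategy="mean", default_distance=17),
)
```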
25 changes: 7 additions & 18 deletions pose_evaluation/metrics/base.py
@@ -1,5 +1,6 @@
 # pylint: disable=undefined-variable
 from typing import Any, Callable, Sequence
+
 from tqdm import tqdm
 
 
@@ -83,9 +84,7 @@ def __call__(self, hypothesis: T, reference: T) -> float:
     def score(self, hypothesis: T, reference: T) -> float:
         raise NotImplementedError
 
-    def score_with_signature(
-        self, hypothesis: T, reference: T, short: bool = False
-    ) -> Score:
+    def score_with_signature(self, hypothesis: T, reference: T, short: bool = False) -> Score:
         return Score(
             name=self.name,
             score=self.score(hypothesis, reference),
@@ -96,29 +95,19 @@ def score_max(self, hypothesis: T, references: Sequence[T]) -> float:
         all_scores = self.score_all([hypothesis], references)
         return max(max(scores) for scores in all_scores)
 
-    def validate_corpus_score_input(
-        self, hypotheses: Sequence[T], references: Sequence[Sequence[T]]
-    ):
+    def validate_corpus_score_input(self, hypotheses: Sequence[T], references: Sequence[Sequence[T]]):
         # This method is designed to avoid mistakes in the use of the corpus_score method
         for reference in references:
-            assert len(hypotheses) == len(
-                reference
-            ), "Hypothesis and reference must have the same number of instances"
+            assert len(hypotheses) == len(reference), "Hypothesis and reference must have the same number of instances"
 
-    def corpus_score(
-        self, hypotheses: Sequence[T], references: Sequence[list[T]]
-    ) -> float:
+    def corpus_score(self, hypotheses: Sequence[T], references: Sequence[list[T]]) -> float:
         """Default implementation: average over sentence scores."""
         self.validate_corpus_score_input(hypotheses, references)
         transpose_references = list(zip(*references))
-        scores = [
-            self.score_max(h, r) for h, r in zip(hypotheses, transpose_references)
-        ]
+        scores = [self.score_max(h, r) for h, r in zip(hypotheses, transpose_references)]
         return sum(scores) / len(hypotheses)
 
-    def score_all(
-        self, hypotheses: Sequence[T], references: Sequence[T], progress_bar=True
-    ) -> list[list[float]]:
+    def score_all(self, hypotheses: Sequence[T], references: Sequence[T], progress_bar=True) -> list[list[float]]:
         """Call the score function for each hypothesis-reference pair."""
         return [
             [self.score(h, r) for r in references]
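The collapsed `corpus_score` chain now reads in one pass: references arrive as parallel streams (one list per reference set, each aligned with the hypotheses, which `validate_corpus_score_input` enforces), `zip(*references)` regroups them per hypothesis, `score_max` keeps the best reference for each, and the mean is returned. A toy subclass to trace the flow — the exact-match metric is invented for the example; only the `BaseMetric(name)` construction is taken from the example script in this PR:

```python
from pose_evaluation.metrics.base import BaseMetric


class ExactMatchMetric(BaseMetric):
    def score(self, hypothesis, reference) -> float:
        # Invented toy scorer: 1.0 on exact equality, else 0.0.
        return float(hypothesis == reference)


metric = ExactMatchMetric("exact_match")
hypotheses = ["a", "b"]
references = [["a", "x"], ["y", "b"]]  # two streams, each aligned with the hypotheses
# transpose -> [("a", "y"), ("x", "b")]; best per hypothesis: 1.0 and 1.0; mean: 1.0
print(metric.corpus_score(hypotheses, references))
```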
3 changes: 2 additions & 1 deletion pose_evaluation/metrics/base_embedding_metric.py
@@ -1,7 +1,8 @@
 from typing import TypeVar
 
 import torch
-from pose_evaluation.metrics.base import BaseMetric
 
+from pose_evaluation.metrics.base import BaseMetric
+
 # Define a type alias for embeddings (e.g., torch.Tensor)
 Embedding = TypeVar("Embedding", bound=torch.Tensor)
12 changes: 7 additions & 5 deletions pose_evaluation/metrics/conftest.py
@@ -1,9 +1,10 @@
 import shutil
 from pathlib import Path
 from typing import Callable, Union
-import torch
 
 import numpy as np
 import pytest
+import torch
 
+
 @pytest.fixture(scope="session", autouse=True)
@@ -20,11 +21,12 @@ def clean_test_artifacts():
 @pytest.fixture(name="distance_matrix_shape_checker")
 def fixture_distance_matrix_shape_checker() -> Callable[[torch.Tensor, torch.Tensor], None]:
     def _check_shape(hyp_count: int, ref_count: int, distance_matrix: torch.Tensor):
-
         expected_shape = torch.Size([hyp_count, ref_count])
-        assert (
-            distance_matrix.shape == expected_shape
-        ), f"For M={hyp_count} hypotheses, N={ref_count} references, Distance Matrix should be MxN={expected_shape}. Instead, received {distance_matrix.shape}"
+        assert distance_matrix.shape == expected_shape, (
+            f"For M={hyp_count} hypotheses, N={ref_count} references, "
+            f"Distance Matrix should be MxN={expected_shape}. "
+            f"Instead, received {distance_matrix.shape}"
+        )
 
     return _check_shape
 
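The multi-line f-string also makes the fixture's failure message easier to scan in pytest output. Since the fixture is registered under `name="distance_matrix_shape_checker"`, tests request it by that name rather than by the function name. A hypothetical test using it — the `torch.cdist` call stands in for whatever metric produces the MxN matrix:

```python
import torch


def test_distance_matrix_is_m_by_n(distance_matrix_shape_checker):
    hyps = torch.rand(3, 768)
    refs = torch.rand(5, 768)
    # Stand-in scorer; any metric returning an MxN distance matrix fits the checker.
    distance_matrix = torch.cdist(hyps, refs)
    distance_matrix_shape_checker(hyps.shape[0], refs.shape[0], distance_matrix)
```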
25 changes: 14 additions & 11 deletions pose_evaluation/metrics/distance_measure.py
@@ -1,11 +1,15 @@
 from typing import Literal, Dict, Any
+
 import numpy.ma as ma  # pylint: disable=consider-using-from-import
+
 from pose_evaluation.metrics.base import Signature
 
 AggregationStrategy = Literal["max", "min", "mean", "sum"]
 
 
 class DistanceMeasureSignature(Signature):
+    """Signature for distance measure metrics."""
+
     def __init__(self, name: str, args: Dict[str, Any]) -> None:
         super().__init__(name=name, args=args)
         self.update_abbr("distance", "dist")
@@ -14,6 +18,7 @@ def __init__(self, name: str, args: Dict[str, Any]) -> None:
 
 class DistanceMeasure:
     """Abstract base class for distance measures."""
+
     _SIGNATURE_TYPE = DistanceMeasureSignature
 
     def __init__(self, name: str) -> None:
@@ -22,7 +27,7 @@ def __init__(self, name: str) -> None:
     def get_distance(self, hyp_data: ma.MaskedArray, ref_data: ma.MaskedArray) -> float:
         """
         Compute the distance between hypothesis and reference data.
-
+
         This method should be implemented by subclasses.
         """
         raise NotImplementedError
@@ -37,6 +42,7 @@ def get_signature(self) -> Signature:
 
 class PowerDistanceSignature(DistanceMeasureSignature):
     """Signature for power distance measures."""
+
     def __init__(self, name: str, args: Dict[str, Any]) -> None:
         super().__init__(name=name, args=args)
         self.update_signature_and_abbr("order", "ord", args)
@@ -46,6 +52,7 @@ def __init__(self, name: str, args: Dict[str, Any]) -> None:
 
 class AggregatedPowerDistance(DistanceMeasure):
     """Aggregated power distance metric using a specified aggregation strategy."""
+
     _SIGNATURE_TYPE = PowerDistanceSignature
 
     def __init__(
@@ -56,7 +63,7 @@ def __init__(
     ) -> None:
         """
         Initialize the aggregated power distance metric.
-
+
         :param order: The exponent to which differences are raised.
         :param default_distance: The value to fill in for masked entries.
         :param aggregation_strategy: Strategy to aggregate computed distances.
@@ -69,7 +76,7 @@ def __init__(
     def _aggregate(self, distances: ma.MaskedArray) -> float:
         """
         Aggregate computed distances using the specified strategy.
-
+
         :param distances: A masked array of computed distances.
         :return: A single aggregated distance value.
         """
@@ -82,23 +89,19 @@ def _aggregate(self, distances: ma.MaskedArray) -> float:
         if self.aggregation_strategy in aggregation_funcs:
             return aggregation_funcs[self.aggregation_strategy]()
 
-        raise NotImplementedError(
-            f"Aggregation Strategy {self.aggregation_strategy} not implemented"
-        )
+        raise NotImplementedError(f"Aggregation Strategy {self.aggregation_strategy} not implemented")
 
-    def _calculate_distances(
-        self, hyp_data: ma.MaskedArray, ref_data: ma.MaskedArray
-    ) -> ma.MaskedArray:
+    def _calculate_distances(self, hyp_data: ma.MaskedArray, ref_data: ma.MaskedArray) -> ma.MaskedArray:
         """
         Compute element-wise distances between hypothesis and reference data.
-
+
         Steps:
         1. Compute the absolute differences.
         2. Raise the differences to the specified power.
         3. Sum the powered differences along the last axis.
         4. Extract the root corresponding to the power.
         5. Fill masked values with the default distance.
-
+
         :param hyp_data: Hypothesis data as a masked array.
         :param ref_data: Reference data as a masked array.
         :return: A masked array of computed distances.
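The five docstring steps amount to a masked Minkowski distance per point, followed by the chosen aggregation. A minimal sketch of those steps with made-up 2-D points — `_calculate_distances` itself is not shown in this diff, so treat this as an illustration of the documented recipe rather than the class's code:

```python
import numpy as np
import numpy.ma as ma

order, default_distance = 2, 0.0
hyp = ma.masked_invalid(np.array([[0.0, 0.0], [1.0, np.nan]]))  # second point partly masked
ref = ma.masked_invalid(np.array([[3.0, 4.0], [1.0, 2.0]]))

powered = ma.abs(hyp - ref) ** order            # steps 1-2: absolute differences, raised to the power
summed = powered.sum(axis=-1)                   # step 3: sum along the last (coordinate) axis
per_point = ma.power(summed, 1.0 / order)       # step 4: extract the matching root
distances = per_point.filled(default_distance)  # step 5: fill masked entries with the default

print(distances)         # [5. 0.]
print(distances.mean())  # 2.5 -- what the "mean" aggregation strategy would return
```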
1 change: 1 addition & 0 deletions pose_evaluation/metrics/distance_metric.py
@@ -1,4 +1,5 @@
 from pose_format import Pose
+
 from pose_evaluation.metrics.base_pose_metric import PoseMetric
 from pose_evaluation.metrics.distance_measure import DistanceMeasure
 
13 changes: 4 additions & 9 deletions pose_evaluation/metrics/embedding_distance_metric.py
@@ -1,15 +1,14 @@
-from typing import Literal, List, Union
 import logging
+from typing import Literal, List, Union
-
+import numpy as np
 import torch
+from sentence_transformers import util as st_util
 from torch import Tensor
 from torch.types import Number
-import numpy as np
-from sentence_transformers import util as st_util
 
 from pose_evaluation.metrics.base_embedding_metric import EmbeddingMetric
 
 
 # Useful reference: https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/util.py#L31
 # * Helper functions such as batch_to_device, _convert_to_tensor, _convert_to_batch, _convert_to_batch_tensor
 # * a whole semantic search function, with chunking and top_k
@@ -86,11 +85,7 @@ def _to_batch_tensor_on_device(self, data: TensorConvertableType) -> Tensor:
 
         return st_util._convert_to_batch_tensor(data).to(device=self.device, dtype=self.dtype)
 
-    def score(
-        self,
-        hypothesis: TensorConvertableType,
-        reference: TensorConvertableType,
-    ) -> Number:
+    def score(self, hypothesis: TensorConvertableType, reference: TensorConvertableType) -> Number:
         """
         Compute the distance between two embeddings.
 
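After the collapse, `score` takes anything convertible to a batch tensor and returns a single `Number`. A hedged usage sketch — the `kind="cosine"` constructor argument is inferred from the `--kind` CLI flag in `evaluate_signclip.py` above and is not shown in this hunk:

```python
import torch

from pose_evaluation.metrics.embedding_distance_metric import EmbeddingDistanceMetric

metric = EmbeddingDistanceMetric(kind="cosine")  # assumed constructor; "cosine" mirrors the CLI default
hypothesis = torch.rand(768)
reference = torch.rand(768)
print(metric.score(hypothesis, reference))  # one scalar distance
```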