Add save and load options for top_k_tanimoto_scores

niekdejonge · niekdejonge · commit fb0107d1f72c · 2026-03-20T14:19:16.000+01:00
diff --git a/ms2query/benchmarking/TopKTanimotoScores.py b/ms2query/benchmarking/TopKTanimotoScores.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 import numpy as np
 import pandas as pd
 from ms2query.benchmarking.Fingerprints import Fingerprints
@@ -27,13 +28,21 @@ def _create_multi_index(
         combined_data = np.empty((len(inchikey_indexes), self.k * 2), dtype=object)
         combined_data[:, 0::2] = top_k_inchikeys
         combined_data[:, 1::2] = tanimoto_scores_for_top_k
-        return pd.DataFrame(combined_data, index=inchikey_indexes, columns=columns)
+        df = pd.DataFrame(combined_data, index=inchikey_indexes, columns=columns)
+
+        # Cast score columns to float64
+        score_cols = [(rank, "score") for rank in [f"Rank_{i + 1}" for i in range(self.k)]]
+        df[score_cols] = df[score_cols].astype(float)
+
+        return df
 
     @classmethod
     def calculate_from_fingerprints(cls, query_fingerprints: Fingerprints, target_fingerprints: Fingerprints, k):
         """
         Gets the top k highest inchikeys and scores for each inchikey in query_fingerprints from target_fingerprints
         """
+        if target_fingerprints.fingerprints.shape[0] < k:
+            raise ValueError("K cannot be larger than the number of fingerprints")
         similarity_scores = generalized_tanimoto_similarity_matrix(
             query_fingerprints.fingerprints, target_fingerprints.fingerprints
         )
@@ -67,3 +76,30 @@ def get_all_average_tanimoto_scores(self) -> dict[str, float]:
 
         average_per_inchikey_df = scores_df.mean(axis=1)
         return average_per_inchikey_df.to_dict()
+
+    def save(self, path: str | Path) -> None:
+        """Write the top-k table to a parquet file, creating missing parent directories.
+
+        ``path`` is given without extension, e.g. "/data/top_k_scores"; any suffix is replaced by ".parquet".
+        """
+        parquet_path = Path(path).with_suffix(".parquet")
+        parquet_path.parent.mkdir(parents=True, exist_ok=True)
+        self.top_k_inchikeys_and_scores.to_parquet(parquet_path)
+
+    @classmethod
+    def load(cls, path: str | Path) -> "TopKTanimotoScores":
+        """Rebuild a TopKTanimotoScores from a parquet file on disk.
+
+        Args:
+            path: File location, e.g. "/data/top_k_scores"; any suffix is replaced by ".parquet".
+
+        Returns:
+            An instance holding the stored table, with ``k`` set to its number of distinct result ranks.
+        """
+        stored = pd.read_parquet(Path(path).with_suffix(".parquet"))
+        stored.columns.names = ["result_rank", "attribute"]
+        distinct_ranks = stored.columns.get_level_values("result_rank").unique()
+        restored = cls.__new__(cls)  # bypasses __init__; both attributes are assigned directly
+        restored.k = len(distinct_ranks)
+        restored.top_k_inchikeys_and_scores = stored
+        return restored
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,6 +32,7 @@ ms2deepscore= ">=2.6.0"
 rdkit= ">2024.3.4"
 nmslib= ">=2.0.0"
 umap-learn= ">=0.5.7"
+pyarrow= ">=14.0.1"
 
 [tool.poetry.group.dev.dependencies]
 decorator = "^5.1.1"
diff --git a/tests/test_benchmarking/test_top_k_tanimoto_scores.py b/tests/test_benchmarking/test_top_k_tanimoto_scores.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 import pytest
 from ms2query.benchmarking.TopKTanimotoScores import TopKTanimotoScores
 from tests.helper_functions import make_test_fingerprints
@@ -26,3 +27,54 @@ def test_calculate_from_fingerprints():
         "AAAAAAAAAAAAAD": 0.75,
         "AAAAAAAAAAAAAE": 1.0,
     }
+
+
+@pytest.fixture
+def sample_scores():
+    """Build a TopKTanimotoScores from 3x3 score and InChIKey arrays indexed by three query keys."""
+    inchikey_indexes = np.array(["QUERY_1", "QUERY_2", "QUERY_3"])
+    top_k_inchikeys = np.array(
+        [
+            ["INCHI_A", "INCHI_B", "INCHI_C"],
+            ["INCHI_B", "INCHI_C", "INCHI_A"],
+            ["INCHI_C", "INCHI_A", "INCHI_B"],
+        ]
+    )
+    tanimoto_scores = np.array(
+        [
+            [0.9, 0.7, 0.5],
+            [0.8, 0.6, 0.4],
+            [0.95, 0.85, 0.75],
+        ]
+    )
+    return TopKTanimotoScores(tanimoto_scores, top_k_inchikeys, inchikey_indexes)
+
+
+# ----- save and load tests -----
+def test_save_creates_parquet_file(sample_scores, tmp_path):
+    sample_scores.save(tmp_path.joinpath("test_scores"))  # save() supplies the ".parquet" suffix
+    assert tmp_path.joinpath("test_scores.parquet").exists()
+
+
+def test_save_creates_parent_directories(sample_scores, tmp_path):
+    sample_scores.save(tmp_path.joinpath("nested", "dir", "test_scores"))  # "nested/dir" is not created beforehand
+    assert tmp_path.joinpath("nested", "dir", "test_scores.parquet").exists()
+
+
+def test_roundtrip_produces_identical_object(sample_scores, tmp_path):
+    """Saving and then loading must preserve k, the stored table and the per-query selections."""
+    stem = tmp_path / "test_scores"
+    sample_scores.save(stem)
+    loaded = TopKTanimotoScores.load(stem)
+    assert loaded.k == sample_scores.k
+    pd.testing.assert_frame_equal(loaded.top_k_inchikeys_and_scores, sample_scores.top_k_inchikeys_and_scores)
+    expected_top_k = sample_scores.select_top_k_inchikeys_and_scores("QUERY_1")
+    assert expected_top_k == loaded.select_top_k_inchikeys_and_scores("QUERY_1")
+    assert sample_scores.select_top_k_inchikeys("QUERY_2") == loaded.select_top_k_inchikeys("QUERY_2")
+    assert sample_scores.select_average_score("QUERY_3") == pytest.approx(loaded.select_average_score("QUERY_3"))
+
+
+def test_roundtrip_accepts_string_path(sample_scores, tmp_path):
+    sample_scores.save(str(tmp_path / "test_scores"))
+    reloaded = TopKTanimotoScores.load(str(tmp_path / "test_scores")).top_k_inchikeys_and_scores
+    pd.testing.assert_frame_equal(reloaded, sample_scores.top_k_inchikeys_and_scores)