Skip to content

Commit e484bee

Browse files
authored
Make tests reusable (#142)
This is helpful if you curate your own mappings somewhere else and still want a test case that checks they are internally valid.
1 parent a04784a commit e484bee

2 files changed

Lines changed: 205 additions & 185 deletions

File tree

src/biomappings/testing.py

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""Validation tests for :mod:`biomappings`."""
4+
5+
import itertools as itt
6+
import unittest
7+
from collections import defaultdict
8+
9+
import bioregistry
10+
11+
from biomappings.resources import (
12+
Mappings,
13+
MappingTuple,
14+
PredictionTuple,
15+
load_curators,
16+
mapping_sort_key,
17+
)
18+
from biomappings.resources.semapv import get_semapv
19+
from biomappings.utils import (
20+
InvalidIdentifierPattern,
21+
InvalidNormIdentifier,
22+
check_valid_prefix_id,
23+
get_canonical_tuple,
24+
)
25+
26+
# Loaded once at import time. The tests below check that the part of a
# "semapv:"-prefixed value following the prefix is contained in this object.
semapv = get_semapv()
27+
28+
29+
def _extract_redundant(counter):
30+
return [(key, values) for key, values in counter.items() if len(values) > 1]
31+
32+
33+
def _locations_str(locations):
34+
return ", ".join(f"{label}:{line}" for label, line in locations)
35+
36+
37+
class IntegrityTestCase(unittest.TestCase):
    """Data integrity tests.

    The tests read four collections of mappings from ``self``. They are only
    declared (not assigned) on this class, so a concrete subclass is expected
    to provide them before the tests are run.
    """

    # True curated mappings; reported under the "positive" label.
    mappings: Mappings
    # Predicted mappings; reported under the "predictions" label.
    predictions: Mappings
    # False curated mappings; reported under the "negative" label.
    incorrect: Mappings
    # Unsure mappings; reported under the "unsure" label.
    unsure: Mappings

    def _iter_groups(self):
        """Iterate over the mappings of all four groups.

        :yields: Triples of the group's label, the line number of the mapping
            within its group, and the mapping itself. Numbering starts at 2,
            presumably because line 1 of the underlying file is a header
            (TODO confirm against the loaders).
        """
        for group, label in [
            (self.mappings, "positive"),
            (self.incorrect, "negative"),
            (self.predictions, "predictions"),
            (self.unsure, "unsure"),
        ]:
            for i, mapping in enumerate(group, start=2):
                yield label, i, mapping

    def test_prediction_types(self):
        """Test that the prediction type is pulled in properly."""
        for line, mapping in enumerate(self.mappings, start=2):
            # Strip the looked-up value itself so that a missing, ``None``, or
            # whitespace-only prediction type is treated as "not annotated".
            pt = (mapping.get("prediction_type") or "").strip()
            if not pt:
                continue
            self.assertTrue(
                pt.startswith("semapv:"),
                msg=f"Prediction type should be annotated with semapv on line {line}",
            )
            self.assertIn(
                pt[len("semapv:") :],
                semapv,
                msg=f"Unknown semapv term in the prediction type on line {line}",
            )
            self.assertNotEqual(
                "semapv:ManualMappingCuration",
                pt,
                msg="Prediction can not be annotated with manual curation",
            )

        for label, line, mapping in self._iter_groups():
            tt = mapping["type"]
            self.assertTrue(
                tt.startswith("semapv:"),
                msg=f"[{label}] The 'type' column should be annotated with semapv on line {line}",
            )
            self.assertIn(
                tt[len("semapv:") :],
                semapv,
                msg=f"[{label}] Unknown semapv term in the 'type' column on line {line}",
            )

    def test_canonical_prefixes(self):
        """Test that all mappings use canonical bioregistry prefixes."""
        # Build the set once so each membership check below is a hash lookup.
        valid_prefixes = set(bioregistry.read_registry())
        for label, line, mapping in self._iter_groups():
            source_prefix, target_prefix = mapping["source prefix"], mapping["target prefix"]
            self.assertIn(
                source_prefix,
                valid_prefixes,
                msg=f"Invalid prefix: {source_prefix} on {label}:{line}",
            )
            self.assertIn(
                target_prefix,
                valid_prefixes,
                msg=f"Invalid prefix: {target_prefix} on {label}:{line}",
            )

    def test_normalized_identifiers(self):
        """Test that all identifiers have been normalized (based on bioregistry definition)."""
        for label, line, mapping in self._iter_groups():
            self.assert_canonical_identifier(
                mapping["source prefix"], mapping["source identifier"], label, line
            )
            self.assert_canonical_identifier(
                mapping["target prefix"], mapping["target identifier"], label, line
            )

    def assert_canonical_identifier(
        self, prefix: str, identifier: str, label: str, line: int
    ) -> None:
        """Assert a given identifier is canonical.

        Both exceptions are reported the same way: as a test failure whose
        message is prefixed with the location of the mapping.

        :param prefix: The prefix to check
        :param identifier: The identifier in the semantic space for the prefix
        :param label: The label of the mapping file
        :param line: The line number of the mapping
        """
        try:
            check_valid_prefix_id(prefix, identifier)
        except (InvalidNormIdentifier, InvalidIdentifierPattern) as e:
            self.fail(f"[{label}:{line}] {e}")

    def test_contributors(self):
        """Test all contributors have an entry in the curators.tsv file."""
        contributor_orcids = {row["orcid"] for row in load_curators()}
        # Predictions are not included here; only the three curated groups are checked.
        for mapping in itt.chain(self.mappings, self.incorrect, self.unsure):
            source = mapping["source"]
            # Sources that are not ORCID references are skipped.
            if not source.startswith("orcid:"):
                continue
            self.assertIn(
                source[len("orcid:") :],
                contributor_orcids,
                msg=f"No curator entry for {source}",
            )

    def test_cross_redundancy(self):
        """Test the redundancy of manually curated mappings and predicted mappings.

        A mapping is considered redundant here if its canonical tuple appears
        in more than one of the four groups.

        :raises ValueError: If any mapping appears in more than one group
        """
        # canonical tuple -> group label -> lines on which it appears in that group
        counter = defaultdict(lambda: defaultdict(list))
        for label, line, mapping in self._iter_groups():
            counter[get_canonical_tuple(mapping)][label].append(line)

        redundant = []
        for mapping, label_to_lines in counter.items():
            # Repeats within a single group are not this test's concern.
            if len(label_to_lines) <= 1:
                continue
            redundant.append((mapping, sorted(label_to_lines.items())))

        if redundant:
            msg = "".join(
                f"\n  {mapping}: {_locations_str(locations)}" for mapping, locations in redundant
            )
            raise ValueError(f"{len(redundant)} are redundant: {msg}")

    def assert_no_internal_redundancies(self, m: Mappings, tuple_cls):
        """Assert that the list of mappings doesn't have any redundancies.

        :param m: The mappings to check
        :param tuple_cls: A class with a ``from_dict`` constructor that turns a
            mapping into a hashable object exposing ``source_curie`` and
            ``target_curie``; two mappings are redundant if these objects are equal
        :raises ValueError: If any two mappings in ``m`` are redundant
        """
        counter = defaultdict(list)
        # Start at 2 so the reported lines agree with the numbering used by
        # ``_iter_groups`` and ``test_prediction_types`` for the same data.
        for line, mapping in enumerate(m, start=2):
            counter[tuple_cls.from_dict(mapping)].append(line)
        redundant = _extract_redundant(counter)
        if redundant:
            msg = "".join(
                f"\n  {mapping.source_curie}/{mapping.target_curie}: {locations}"
                for mapping, locations in redundant
            )
            raise ValueError(f"{len(redundant)} are redundant: {msg}")

    def test_predictions_sorted(self):
        """Test the predictions are in a canonical order."""
        self.assertEqual(
            self.predictions,
            sorted(self.predictions, key=mapping_sort_key),
            msg="Predictions are not sorted",
        )
        self.assert_no_internal_redundancies(self.predictions, PredictionTuple)

    def test_curations_sorted(self):
        """Test the true curated mappings are in a canonical order."""
        self.assertEqual(
            self.mappings,
            sorted(self.mappings, key=mapping_sort_key),
            msg="True curations are not sorted",
        )
        self.assert_no_internal_redundancies(self.mappings, MappingTuple)

    def test_false_mappings_sorted(self):
        """Test the false curated mappings are in a canonical order."""
        self.assertEqual(
            self.incorrect,
            sorted(self.incorrect, key=mapping_sort_key),
            msg="False curations are not sorted",
        )
        self.assert_no_internal_redundancies(self.incorrect, MappingTuple)

    def test_unsure_sorted(self):
        """Test the unsure mappings are in a canonical order."""
        self.assertEqual(
            self.unsure,
            sorted(self.unsure, key=mapping_sort_key),
            msg="Unsure curations are not sorted",
        )
        self.assert_no_internal_redundancies(self.unsure, MappingTuple)

0 commit comments

Comments
 (0)