Skip to content

Commit 1d22d6e

Browse files
authored
Improve CURIE generation for evidences (#49)
Based on #46, this PR isolates and refactors the code that generates hashes for evidences, so that each evidence's hash (and therefore its CURIE) is computed using the triple of the mapping it belongs to.
1 parent 2382ed5 commit 1d22d6e

2 files changed

Lines changed: 125 additions & 35 deletions

File tree

src/semra/api.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,13 @@ def get_test_evidence(n: None) -> SimpleEvidence: ...
136136
def get_test_evidence(n: int | None = None) -> SimpleEvidence | list[SimpleEvidence]:
137137
"""Get test evidence."""
138138
if isinstance(n, int):
139-
return [SimpleEvidence(mapping_set=TEST_MAPPING_SET) for _ in range(n)]
139+
return [
140+
SimpleEvidence(
141+
mapping_set=TEST_MAPPING_SET,
142+
author=Reference(prefix="orcid", identifier=f"0000-0000-0000-000{n}"),
143+
)
144+
for n in range(n)
145+
]
140146
return SimpleEvidence(mapping_set=TEST_MAPPING_SET)
141147

142148

@@ -212,7 +218,7 @@ def get_index(mappings: Iterable[Mapping], *, progress: bool = True, leave: bool
212218
dd: defaultdict[Triple, list[Evidence]] = defaultdict(list)
213219
for mapping in _tqdm(mappings, desc="Indexing mappings", progress=progress, leave=leave):
214220
dd[mapping.triple].extend(mapping.evidence)
215-
return {triple: deduplicate_evidence(evidence) for triple, evidence in dd.items()}
221+
return {triple: deduplicate_evidence(triple, evidence) for triple, evidence in dd.items()}
216222

217223

218224
def assemble_evidences(mappings: list[Mapping], *, progress: bool = True) -> list[Mapping]:
@@ -904,7 +910,7 @@ def get_many_to_many(mappings: list[Mapping]) -> list[Mapping]:
904910
# this is effectively the same as :func:`unindex` except the deduplicate_evidence is called
905911
# explicitly
906912
rv = [
907-
Mapping.from_triple(triple, deduplicate_evidence(evidence))
913+
Mapping.from_triple(triple, deduplicate_evidence(triple, evidence))
908914
for triple, evidence in index.items()
909915
]
910916
return rv
@@ -1095,9 +1101,9 @@ def unindex(index: Index, *, progress: bool = True) -> list[Mapping]:
10951101
]
10961102

10971103

1098-
def deduplicate_evidence(evidence: list[Evidence]) -> list[Evidence]:
1104+
def deduplicate_evidence(triple: Triple | Mapping, evidence: list[Evidence]) -> list[Evidence]:
10991105
"""Deduplicate a list of evidences based on their "key" function."""
1100-
d = {e.key(): e for e in evidence}
1106+
d = {e.key(triple): e for e in evidence}
11011107
return list(d.values())
11021108

11031109

src/semra/struct.py

Lines changed: 114 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from collections.abc import Iterable
1010
from hashlib import md5
1111
from itertools import islice
12-
from typing import Annotated, Any, ClassVar, Literal
12+
from typing import Annotated, Any, ClassVar, Generic, Literal, NamedTuple, ParamSpec, TypeVar, Union
1313

1414
import pydantic
1515
from more_itertools import triplewise
@@ -31,13 +31,24 @@
3131
"triple_key",
3232
]
3333

34+
P = ParamSpec("P")
35+
X = TypeVar("X")
36+
3437
#: A type annotation for a subject-predicate-object triple
3538
Triple = tuple[Reference, Reference, Reference]
3639

3740

38-
def triple_key(triple: Triple) -> tuple[str, str, str]:
41+
class StrTriple(NamedTuple):
42+
"""A triple of curies."""
43+
44+
subject: str
45+
predicate: str
46+
object: str
47+
48+
49+
def triple_key(triple: Triple) -> StrTriple:
3950
"""Get a sortable key for a triple."""
40-
return triple[0].curie, triple[2].curie, triple[1].curie
51+
return StrTriple(triple[0].curie, triple[2].curie, triple[1].curie)
4152

4253

4354
def _md5_hexdigest(picklable: object) -> str:
@@ -46,7 +57,7 @@ def _md5_hexdigest(picklable: object) -> str:
4657
return hasher.hexdigest()
4758

4859

49-
class KeyedMixin(ABC):
60+
class KeyedMixin(ABC, Generic[P, X]):
5061
"""A mixin for a class that can be hashed and CURIE-encoded."""
5162

5263
#: The prefix for CURIEs for instances of this class
@@ -56,23 +67,23 @@ def __init_subclass__(cls, *, prefix: str, **kwargs: Any) -> None:
5667
cls._prefix = prefix
5768

5869
@abstractmethod
59-
def key(self) -> object:
70+
def key(self, *args: P.args, **kwargs: P.kwargs) -> X:
6071
"""Return a picklable key."""
6172
raise NotImplementedError
6273

63-
def hexdigest(self) -> str:
74+
def hexdigest(self, *args: P.args, **kwargs: P.kwargs) -> str:
6475
"""Get a hex string for the MD5 hash of the pickled key() for this class."""
65-
key = self.key()
76+
key = self.key(*args, **kwargs)
6677
return _md5_hexdigest(key)
6778

68-
def get_reference(self) -> Reference:
79+
def get_reference(self, *args: P.args, **kwargs: P.kwargs) -> Reference:
6980
"""Get a CURIE reference using this class's prefix and its hexadecimal representation."""
70-
return Reference(prefix=self._prefix, identifier=self.hexdigest())
81+
return Reference(prefix=self._prefix, identifier=self.hexdigest(*args, **kwargs))
7182

7283
@property
73-
def curie(self) -> str:
84+
def curie(self, *args: P.args, **kwargs: P.kwargs) -> str:
7485
"""Get a string representing the CURIE."""
75-
return self.get_reference().curie
86+
return self.get_reference(*args, **kwargs).curie
7687

7788

7889
class ConfidenceMixin:
@@ -104,7 +115,20 @@ def mapping_set_names(self) -> set[str]:
104115
raise NotImplementedError
105116

106117

107-
class MappingSet(pydantic.BaseModel, ConfidenceMixin, KeyedMixin, prefix=SEMRA_MAPPING_SET_PREFIX):
118+
class MappingSetKey(NamedTuple):
119+
"""The key used for a mapping set."""
120+
121+
name: str
122+
version: str
123+
license: str
124+
125+
126+
class MappingSet(
127+
pydantic.BaseModel,
128+
ConfidenceMixin,
129+
KeyedMixin[[], MappingSetKey],
130+
prefix=SEMRA_MAPPING_SET_PREFIX,
131+
):
108132
"""Represents a set of semantic mappings.
109133
110134
For example, this might correspond to:
@@ -121,17 +145,30 @@ class MappingSet(pydantic.BaseModel, ConfidenceMixin, KeyedMixin, prefix=SEMRA_M
121145
license: str | None = Field(default=None, description="License name or URL for mapping set")
122146
confidence: float = Field(..., description="Mapping set level confidence")
123147

124-
def key(self) -> object:
148+
def key(self) -> MappingSetKey:
125149
"""Get a picklable key representing the mapping set."""
126-
return self.name, self.version or "", self.license or "", self.confidence
150+
return MappingSetKey(self.name, self.version or "", self.license or "")
127151

128152
def get_confidence(self) -> float:
129153
"""Get the explicit confidence for the mapping set."""
130154
return self.confidence
131155

132156

157+
class SimpleEvidenceKey(NamedTuple):
158+
"""The key used for a simple evidence."""
159+
160+
evidence_type: str
161+
justification: str
162+
author: str
163+
mapping_set: MappingSetKey
164+
165+
133166
class SimpleEvidence(
134-
pydantic.BaseModel, KeyedMixin, EvidenceMixin, ConfidenceMixin, prefix=SEMRA_EVIDENCE_PREFIX
167+
pydantic.BaseModel,
168+
KeyedMixin[[Union[Triple, "Mapping"]], tuple[StrTriple, SimpleEvidenceKey]],
169+
EvidenceMixin,
170+
ConfidenceMixin,
171+
prefix=SEMRA_EVIDENCE_PREFIX,
135172
):
136173
"""Evidence for a mapping.
137174
@@ -158,19 +195,24 @@ class SimpleEvidence(
158195
uuid: UUID4 = Field(default_factory=uuid.uuid4)
159196
confidence: float | None = Field(None, description="The confidence")
160197

161-
def key(self) -> object:
198+
def _simple_key(self) -> SimpleEvidenceKey:
199+
return SimpleEvidenceKey(
200+
self.evidence_type,
201+
self.justification.curie,
202+
self.author.curie if self.author else "",
203+
self.mapping_set.key(),
204+
)
205+
206+
def key(self, triple: Triple | Mapping) -> tuple[StrTriple, SimpleEvidenceKey]:
162207
"""Get a key suitable for hashing the evidence.
163208
164209
:returns: A key for deduplication based on the mapping set.
165210
166211
Note: this should be extended to include basically _all_ fields
167212
"""
168213
return (
169-
self.evidence_type,
170-
self.justification,
171-
self.author,
172-
self.mapping_set.key(),
173-
self.uuid,
214+
triple_key(triple.triple if isinstance(triple, Mapping) else triple),
215+
self._simple_key(),
174216
)
175217

176218
@property
@@ -183,8 +225,28 @@ def get_confidence(self) -> float:
183225
return self.confidence if self.confidence is not None else self.mapping_set.confidence
184226

185227

228+
def _sort_evidence_key(ev: Evidence) -> tuple[Any, ...]:
229+
# the first element of the simple key is the type of evidence,
230+
# so they can be compared
231+
return ev._simple_key()
232+
233+
234+
class ReasonedEvidenceKey(NamedTuple):
235+
"""The key used for a reasoned evidence."""
236+
237+
evidence_type: str
238+
justification: str
239+
rest: tuple[
240+
tuple[tuple[StrTriple, ReasonedEvidenceKey] | tuple[StrTriple, SimpleEvidenceKey], ...], ...
241+
]
242+
243+
186244
class ReasonedEvidence(
187-
pydantic.BaseModel, KeyedMixin, EvidenceMixin, ConfidenceMixin, prefix=SEMRA_EVIDENCE_PREFIX
245+
pydantic.BaseModel,
246+
KeyedMixin[[Union[Triple, "Mapping"]], tuple[StrTriple, ReasonedEvidenceKey]],
247+
EvidenceMixin,
248+
ConfidenceMixin,
249+
prefix=SEMRA_EVIDENCE_PREFIX,
188250
):
189251
"""A complex evidence based on multiple mappings."""
190252

@@ -200,12 +262,29 @@ class ReasonedEvidence(
200262
1.0, description="The probability that the reasoning method is correct"
201263
)
202264

203-
def key(self) -> object:
204-
"""Get a key for reasoned evidence."""
205-
return (
265+
def _simple_key(self) -> ReasonedEvidenceKey:
266+
return ReasonedEvidenceKey(
206267
self.evidence_type,
207-
self.justification,
208-
*((*m.triple, *(e.key() for e in m.evidence)) for m in self.mappings),
268+
self.justification.curie,
269+
tuple(
270+
tuple(
271+
evidence.key(mapping)
272+
for evidence in sorted(mapping.evidence, key=_sort_evidence_key)
273+
)
274+
for mapping in sorted(self.mappings, key=lambda m: triple_key(m.triple))
275+
),
276+
)
277+
278+
def key(self, triple: Triple | Mapping) -> tuple[StrTriple, ReasonedEvidenceKey]:
279+
"""Get a key suitable for hashing the evidence.
280+
281+
:returns: A key for deduplication based on the mapping set.
282+
283+
Note: this should be extended to include basically _all_ fields
284+
"""
285+
return (
286+
triple_key(triple.triple if isinstance(triple, Mapping) else triple),
287+
self._simple_key(),
209288
)
210289

211290
def get_confidence(self) -> float:
@@ -256,7 +335,12 @@ def explanation(self) -> str:
256335
]
257336

258337

259-
class Mapping(pydantic.BaseModel, ConfidenceMixin, KeyedMixin, prefix=SEMRA_MAPPING_PREFIX):
338+
class Mapping(
339+
pydantic.BaseModel,
340+
ConfidenceMixin,
341+
KeyedMixin[[], StrTriple],
342+
prefix=SEMRA_MAPPING_PREFIX,
343+
):
260344
"""A semantic mapping."""
261345

262346
model_config = ConfigDict(frozen=True)
@@ -271,9 +355,9 @@ def triple(self) -> Triple:
271355
"""Get the mapping's core triple as a tuple."""
272356
return self.s, self.p, self.o
273357

274-
def key(self) -> object:
358+
def key(self) -> StrTriple:
275359
"""Get a hashable key for the mapping, based on the subject, predicate, and object."""
276-
return self.triple
360+
return triple_key(self.triple)
277361

278362
@classmethod
279363
def from_triple(cls, triple: Triple, evidence: list[Evidence] | None = None) -> Mapping:

0 commit comments

Comments
 (0)