Skip to content

Commit 6ce243d

Browse files
committed
Reimplement hash
1 parent baf265e commit 6ce243d

1 file changed

Lines changed: 44 additions & 55 deletions

File tree

src/semra/struct.py

Lines changed: 44 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,9 @@
8585
from __future__ import annotations
8686

8787
import math
88-
import pickle
8988
from abc import ABC, abstractmethod
9089
from collections.abc import Iterable
91-
from hashlib import md5
90+
from hashlib import sha256
9291
from typing import Annotated, Any, ClassVar, Generic, Literal, NamedTuple, ParamSpec, TypeVar, Union
9392

9493
import pydantic
@@ -117,13 +116,7 @@
117116
X = TypeVar("X")
118117

119118

120-
def _md5_hexdigest(picklable: object) -> str:
121-
hasher = md5() # noqa: S324
122-
hasher.update(pickle.dumps(picklable))
123-
return hasher.hexdigest()
124-
125-
126-
class KeyedMixin(ABC, Generic[P, X]):
119+
class KeyedMixin(ABC, Generic[P]):
127120
"""A mixin for a class that can be hashed and CURIE-encoded."""
128121

129122
#: The prefix for CURIEs for instances of this class
@@ -133,14 +126,16 @@ def __init_subclass__(cls, *, prefix: str, **kwargs: Any) -> None:
133126
cls._prefix = prefix
134127

135128
@abstractmethod
136-
def key(self, *args: P.args, **kwargs: P.kwargs) -> X:
137-
"""Return a picklable key."""
129+
def key(self, *args: P.args, **kwargs: P.kwargs) -> str:
130+
"""Return a string key."""
138131
raise NotImplementedError
139132

140133
def hexdigest(self, *args: P.args, **kwargs: P.kwargs) -> str:
141134
"""Get a hex string for the SHA-256 hash of the UTF-8 encoded key() for this class."""
142-
key = self.key(*args, **kwargs)
143-
return _md5_hexdigest(key)
135+
s = self.key(*args, **kwargs)
136+
hasher = sha256()
137+
hasher.update(s.encode("utf-8"))
138+
return hasher.hexdigest()
144139

145140
def get_reference(self, *args: P.args, **kwargs: P.kwargs) -> Reference:
146141
"""Get a CURIE reference using this class's prefix and its hexadecimal representation."""
@@ -193,7 +188,7 @@ class MappingSetKey(NamedTuple):
193188
class MappingSet(
194189
pydantic.BaseModel,
195190
ConfidenceMixin,
196-
KeyedMixin[[], MappingSetKey],
191+
KeyedMixin[[]],
197192
prefix=SEMRA_MAPPING_SET_PREFIX,
198193
):
199194
"""Represents a set of semantic mappings.
@@ -239,9 +234,9 @@ class MappingSet(
239234
description="Mapping set level confidence. Corresponds to optional SSSOM field https://mapping-commons.github.io/sssom/mapping_set_confidence/",
240235
)
241236

242-
def key(self) -> MappingSetKey:
243-
"""Get a picklable key representing the mapping set."""
244-
return MappingSetKey(self.purl or "", self.name, self.version or "", self.license or "")
237+
def key(self) -> str:
238+
"""Get a string key representing the mapping set."""
239+
return "\n".join((self.purl or "", self.name, self.version or "", self.license or ""))
245240

246241
def get_confidence(self) -> float:
247242
"""Get the explicit confidence for the mapping set."""
@@ -259,7 +254,7 @@ class SimpleEvidenceKey(NamedTuple):
259254

260255
class SimpleEvidence(
261256
pydantic.BaseModel,
262-
KeyedMixin[[Union[Triple, "Mapping"]], tuple[StrTriple, SimpleEvidenceKey]],
257+
KeyedMixin[[Union[Triple, "Mapping"]]],
263258
EvidenceMixin,
264259
ConfidenceMixin,
265260
prefix=SEMRA_EVIDENCE_PREFIX,
@@ -288,25 +283,24 @@ class SimpleEvidence(
288283
)
289284
confidence: float | None = Field(None, description="The confidence")
290285

291-
def _simple_key(self) -> SimpleEvidenceKey:
292-
return SimpleEvidenceKey(
293-
self.evidence_type,
294-
self.justification.curie,
295-
self.author.curie if self.author else "",
296-
self.mapping_set.key(),
286+
def _simple_key(self) -> str:
287+
return "\t".join(
288+
(
289+
self.evidence_type,
290+
self.justification.curie,
291+
self.author.curie if self.author else "",
292+
self.mapping_set.key(),
293+
)
297294
)
298295

299-
def key(self, triple: Triple | Mapping) -> tuple[StrTriple, SimpleEvidenceKey]:
296+
def key(self, triple: Triple | Mapping) -> str:
300297
"""Get a key suitable for hashing the evidence.
301298
302299
:returns: A key for deduplication based on the mapping set.
303300
304301
Note: this should be extended to include basically _all_ fields
305302
"""
306-
return (
307-
triple.as_str_triple(),
308-
self._simple_key(),
309-
)
303+
return "\t".join((*triple.as_str_triple(), self._simple_key()))
310304

311305
@property
312306
def mapping_set_names(self) -> set[str]:
@@ -318,12 +312,6 @@ def get_confidence(self) -> float:
318312
return self.confidence if self.confidence is not None else self.mapping_set.confidence
319313

320314

321-
def _sort_evidence_key(ev: Evidence) -> tuple[Any, ...]:
322-
# the first element of the simple key is the type of evidence,
323-
# so they can be compared
324-
return ev._simple_key()
325-
326-
327315
class ReasonedEvidenceKey(NamedTuple):
328316
"""The key used for a reasoned evidence."""
329317

@@ -336,7 +324,7 @@ class ReasonedEvidenceKey(NamedTuple):
336324

337325
class ReasonedEvidence(
338326
pydantic.BaseModel,
339-
KeyedMixin[[Union[Triple, "Mapping"]], tuple[StrTriple, ReasonedEvidenceKey]],
327+
KeyedMixin[[Union[Triple, "Mapping"]]],
340328
EvidenceMixin,
341329
ConfidenceMixin,
342330
prefix=SEMRA_EVIDENCE_PREFIX,
@@ -355,30 +343,29 @@ class ReasonedEvidence(
355343
1.0, description="The probability that the reasoning method is correct"
356344
)
357345

358-
def _simple_key(self) -> ReasonedEvidenceKey:
359-
return ReasonedEvidenceKey(
360-
self.evidence_type,
361-
self.justification.curie,
362-
tuple(
363-
tuple(
364-
evidence.key(mapping)
365-
for evidence in sorted(mapping.evidence, key=_sort_evidence_key)
366-
)
367-
for mapping in sorted(self.mappings)
368-
),
346+
def _simple_key(self) -> str:
347+
return "\t".join(
348+
(
349+
self.evidence_type,
350+
self.justification.curie,
351+
*(
352+
"|".join(
353+
evidence.key(mapping)
354+
for evidence in sorted(mapping.evidence, key=lambda e: e._simple_key())
355+
)
356+
for mapping in sorted(self.mappings)
357+
),
358+
)
369359
)
370360

371-
def key(self, triple: Triple | Mapping) -> tuple[StrTriple, ReasonedEvidenceKey]:
361+
def key(self, triple: Triple | Mapping) -> str:
372362
"""Get a key suitable for hashing the evidence.
373363
374364
:returns: A key for deduplication based on the mapping set.
375365
376366
Note: this should be extended to include basically _all_ fields
377367
"""
378-
return (
379-
triple.as_str_triple(),
380-
self._simple_key(),
381-
)
368+
return "\t".join((*triple.as_str_triple(), self._simple_key()))
382369

383370
def get_confidence(self) -> float:
384371
r"""Calculate confidence for the reasoned evidence.
@@ -433,7 +420,7 @@ def explanation(self) -> str:
433420
class Mapping(
434421
Triple,
435422
ConfidenceMixin,
436-
KeyedMixin[[], StrTriple],
423+
KeyedMixin[[]],
437424
prefix=SEMRA_MAPPING_PREFIX,
438425
):
439426
"""A semantic mapping.
@@ -455,9 +442,11 @@ def triple(self) -> Triple:
455442
"""Get the mapping's core triple as a tuple."""
456443
return Triple(subject=self.subject, predicate=self.predicate, object=self.object)
457444

458-
def key(self) -> StrTriple:
445+
# TODO converge on reusing the curies definition of a triple/mapping identifier
446+
447+
def key(self) -> str:
459448
"""Get a hashable key for the mapping, based on the subject, predicate, and object."""
460-
return self.as_str_triple()
449+
return "\t".join(self.as_str_triple())
461450

462451
@classmethod
463452
def from_triple(

0 commit comments

Comments
 (0)