Skip to content

Commit e8e0adf

Browse files
authored
Merge pull request #3 from sdsc-ordes/refactor_rapidfuzz
refactor: implement rapidfuzz
2 parents e58f9a7 + 7a9a374 commit e8e0adf

7 files changed

Lines changed: 213 additions & 59 deletions

File tree

error.txt

Lines changed: 0 additions & 2 deletions
This file was deleted.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ dependencies = [
1919
"python-multipart>=0.0.20",
2020
"rdflib>=7.1.4",
2121
"SPARQLWrapper>=2.0.0",
22+
"rapidfuzz>=3.0.0",
2223
]
2324
[dependency-groups]
2425
dev = [

src/strings2things/app/api/endpoints.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,36 +3,48 @@
33
from fastapi import APIRouter, UploadFile, File, Form
44
from fastapi.responses import Response
55
from rdflib import Graph
6-
from src.strings2things.app.core.rdf_transformer import RDFTransformer
7-
from src.strings2things.app.core.ontology_manager import (
8-
OntologyManager,
9-
) # Assume this exists
10-
from src.strings2things.app.utils.rdf_utils import (
11-
parse_rdf,
12-
serialize_rdf,
13-
) # Also assume or create
6+
from strings2things.app.core.rdf_transformer import RDFTransformer
7+
from strings2things.app.core.ontology_manager import OntologyManager
8+
from strings2things.app.utils.rdf_utils import parse_rdf, serialize_rdf
149
import logging
1510

1611
router = APIRouter()
1712

1813
ontology_manager = OntologyManager()
1914
ontology_manager.load_ontologies()
2015

21-
transformer = RDFTransformer(ontology_manager.get_label_map())
16+
# Load the label map once at import time; a transformer is built from it per request
17+
base_label_map = ontology_manager.get_label_map()
2218

2319

2420
@router.post("/transform")
2521
async def transform_rdf(
26-
file: UploadFile = File(...), serialization: str = Form("turtle")
22+
file: UploadFile = File(...),
23+
serialization: str = Form("turtle"),
24+
fuzzy: bool = Form(False), # <-- new parameter
25+
fuzzy_threshold: int = Form(90), # <-- configurable fuzzy matching threshold
2726
) -> Response:
2827
"""
2928
Accepts an RDF file upload, transforms it using the label map,
3029
and returns the modified RDF graph in the requested format.
30+
31+
Args:
32+
file: The uploaded RDF file
33+
serialization: Desired output RDF serialization (default: turtle)
34+
fuzzy: Whether to use fuzzy matching (default: False)
35+
fuzzy_threshold: Similarity threshold for fuzzy matching (0-100, default: 90)
3136
"""
3237
content = await file.read()
3338

3439
try:
3540
input_graph = parse_rdf(content)
41+
42+
transformer = RDFTransformer(
43+
base_label_map,
44+
fuzzy=fuzzy,
45+
fuzzy_threshold=fuzzy_threshold,
46+
)
47+
3648
transformed_graph = transformer.transform(input_graph)
3749
serialized = serialize_rdf(transformed_graph, output_format=serialization)
3850

src/strings2things/app/core/rdf_transformer.py

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,48 @@
11
# app/core/rdf_transformer.py
22
"""
33
Transforms RDF graphs by replacing string literals with matching ontology IRIs.
4+
Supports exact and fuzzy matching (RapidFuzz).
45
"""
56

67
from rdflib import Graph, Literal, URIRef
78
from strings2things.app.core.transformation_log import TransformationLog
9+
from rapidfuzz import process
810

911

1012
class RDFTransformer:
11-
def __init__(self, label_map: dict[str, str]):
13+
def __init__(self, label_map: dict[str, str], fuzzy: bool, fuzzy_threshold: int = 90):
14+
"""
15+
:param label_map: dict of {label -> IRI}
16+
:param fuzzy_threshold: minimum score (compared with >=) for accepting a fuzzy match; only used when fuzzy is True
17+
"""
1218
self.label_map = label_map
19+
self.fuzzy = fuzzy
20+
self.fuzzy_threshold = fuzzy_threshold
1321
self.log = TransformationLog()
1422

23+
def _find_match(self, label: str) -> str | None:
24+
"""
25+
Find an IRI for the given label.
26+
First tries exact match, then (optionally) falls back to fuzzy.
27+
"""
28+
label = label.strip().lower()
29+
30+
# Exact match first (cheap lookup)
31+
iri = self.label_map.get(label)
32+
if iri:
33+
return iri
34+
35+
# Fuzzy fallback
36+
if self.fuzzy:
37+
best = process.extractOne(label, self.label_map.keys())
38+
if best:
39+
match, score, _ = best
40+
if score >= self.fuzzy_threshold:
41+
return self.label_map[match]
42+
43+
return None
44+
45+
1546
def transform(self, input_graph: Graph) -> Graph:
1647
"""
1748
Replace matching string literals in the RDF graph with IRIs.
@@ -20,29 +51,28 @@ def transform(self, input_graph: Graph) -> Graph:
2051
output_graph = Graph()
2152

2253
for s, p, o in input_graph:
23-
# if string matches object
2454
if isinstance(o, Literal) and isinstance(o.value, str):
25-
label = o.value.strip().lower()
26-
if label in self.label_map:
27-
iri = URIRef(self.label_map[label])
55+
iri_str = self._find_match(o.value)
56+
if iri_str:
57+
iri = URIRef(iri_str)
2858

29-
# Retain original triple (to retain backward compatibility for now)
59+
# Retain original triple (for backward compatibility)
3060
output_graph.add((s, p, o))
31-
output_graph.add(
32-
(iri, URIRef("http://wwww.example.org/thingOf"), o)
33-
)
34-
61+
output_graph.add((iri, URIRef("http://www.example.org/thingOf"), o))
3562
output_graph.add((s, p, iri))
3663

3764
self.log.add_entry(
3865
subject=str(s),
3966
predicate=str(p),
4067
original_value=str(o),
4168
replacement_iri=str(iri),
42-
reason="unambiguous match",
69+
reason="exact match"
70+
if o.value.strip().lower() in self.label_map
71+
else f"fuzzy match (threshold={self.fuzzy_threshold})",
4372
)
4473
continue
4574

75+
# No match found, or object is not a string literal → leave as-is
4676
output_graph.add((s, p, o))
4777
self.log.add_entry(
4878
subject=str(s),
@@ -52,7 +82,7 @@ def transform(self, input_graph: Graph) -> Graph:
5282
reason=(
5383
"not a string literal"
5484
if not isinstance(o, Literal)
55-
else "no match in label map"
85+
else "no match found"
5686
),
5787
)
5888

src/strings2things/app/core/transformation_log.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,37 @@
55

66

77
class TransformationLog:
8-
def __init__(self):
8+
def __init__(self, verbose: bool = True):
99
self.entries = []
10+
self.verbose = verbose # control printing
1011

1112
def add_entry(
1213
self, subject, predicate, original_value, replacement_iri, reason: str
1314
):
1415
"""
1516
Record a transformation decision.
1617
"""
17-
self.entries.append(
18-
{
19-
"subject": subject,
20-
"predicate": predicate,
21-
"original": original_value,
22-
"replacement": replacement_iri,
23-
"reason": reason,
24-
}
25-
)
18+
entry = {
19+
"subject": subject,
20+
"predicate": predicate,
21+
"original": original_value,
22+
"replacement": replacement_iri,
23+
"reason": reason,
24+
}
25+
self.entries.append(entry)
26+
27+
# Print a message for each entry as it is added (only when verbose is True)
28+
if self.verbose:
29+
if replacement_iri:
30+
print(
31+
f"[TRANSFORM] Replaced '{original_value}' "
32+
f"→ <{replacement_iri}> (subject: <{subject}>, predicate: <{predicate}>)"
33+
)
34+
else:
35+
print(
36+
f"[TRANSFORM] No replacement for '{original_value}' "
37+
f"(subject: <{subject}>, predicate: <{predicate}>, reason: {reason})"
38+
)
2639

2740
def get_summary(self):
2841
"""

tests/test_rdf_transformer.py

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,70 @@
11
import pytest
22
from rdflib import Graph, URIRef, Literal, Namespace
33
from strings2things.app.core.rdf_transformer import RDFTransformer
4-
from strings2things.app.core.transformation_log import TransformationLog
54

6-
EX = Namespace("http://example.org/")
5+
EX = Namespace("http://example.org/ontology#")
76

87

98
@pytest.fixture
109
def label_map():
10+
# Ontology label map: canonical labels → IRIs
1111
return {
12-
"geology": "http://example.org/ontology#Geology",
13-
"biology": "http://example.org/ontology#Biology",
12+
"geology": str(EX.Geology),
13+
"biology": str(EX.Biology),
14+
"physics": str(EX.Physics),
1415
}
1516

1617

1718
@pytest.fixture
1819
def input_graph():
1920
g = Graph()
20-
g.add((EX.subj1, EX.hasCategory, Literal("Geology")))
21-
g.add((EX.subj2, EX.hasCategory, Literal("UnknownLabel")))
22-
g.add((EX.subj3, EX.hasValue, URIRef("http://example.org/someIRI")))
21+
# Exact match example
22+
g.add((EX.subj1, EX.hasCategory, Literal("geology")))
23+
24+
# Fuzzy match example (slightly misspelled)
25+
g.add((EX.subj2, EX.hasCategory, Literal("biolgy")))
26+
27+
# Unknown label (should remain unchanged)
28+
g.add((EX.subj3, EX.hasCategory, Literal("unknownlabel")))
29+
30+
# Non-literal value (should remain untouched)
31+
g.add((EX.subj4, EX.hasValue, URIRef("http://example.org/someIRI")))
2332
return g
2433

2534

26-
def test_rdf_transformer(label_map, input_graph):
27-
transformer = RDFTransformer(label_map)
35+
def test_rdf_transformer_combined(label_map, input_graph):
36+
# Initialize transformer with fuzzy matching enabled
37+
transformer = RDFTransformer(label_map, fuzzy=True, fuzzy_threshold=90)
2838
output_graph = transformer.transform(input_graph)
2939

30-
# --- Check graph triples ---
31-
# Original triple must remain
32-
assert (EX.subj1, EX.hasCategory, Literal("Geology")) in output_graph
40+
# --- 1. Check graph triples ---
3341

34-
# Transformed triple must be present
35-
expected_iri = URIRef("http://example.org/ontology#Geology")
36-
assert (EX.subj1, EX.hasCategory, expected_iri) in output_graph
42+
# Exact match: IRI triple is added (the original literal triple is retained too)
43+
assert (EX.subj1, EX.hasCategory, URIRef("http://example.org/ontology#Geology")) in output_graph
3744

38-
# UnknownLabel should remain unchanged (no IRI added)
39-
assert (EX.subj2, EX.hasCategory, Literal("UnknownLabel")) in output_graph
40-
assert len([t for t in output_graph.triples((EX.subj2, EX.hasCategory, None))]) == 1
45+
# Fuzzy match: IRI triple is added (the original literal triple is retained too)
46+
assert (EX.subj2, EX.hasCategory, URIRef("http://example.org/ontology#Biology")) in output_graph
4147

42-
# Non-literals should remain untouched
43-
assert (EX.subj3, EX.hasValue, URIRef("http://example.org/someIRI")) in output_graph
48+
# Unknown label: literal remains unchanged
49+
assert (EX.subj3, EX.hasCategory, Literal("unknownlabel")) in output_graph
4450

45-
# --- Check log entries ---
51+
# Non-literals remain untouched
52+
assert (EX.subj4, EX.hasValue, URIRef("http://example.org/someIRI")) in output_graph
53+
54+
# --- 2. Check log entries ---
4655
log = transformer.log.entries
47-
geology_log = next(e for e in log if e["original"] == "Geology")
56+
57+
# Exact match log
58+
geology_log = next(e for e in log if e["original"] == "geology")
4859
assert geology_log["replacement"] == "http://example.org/ontology#Geology"
49-
assert geology_log["reason"] == "unambiguous match"
60+
assert geology_log["reason"] == "exact match"
61+
62+
# Fuzzy match log
63+
biolgy_log = next(e for e in log if e["original"] == "biolgy")
64+
assert biolgy_log["replacement"] == "http://example.org/ontology#Biology"
65+
assert "fuzzy match" in biolgy_log["reason"]
5066

51-
unknown_log = next(e for e in log if e["original"] == "UnknownLabel")
67+
# Unknown label log
68+
unknown_log = next(e for e in log if e["original"] == "unknownlabel")
5269
assert unknown_log["replacement"] is None
53-
assert unknown_log["reason"] == "no match in label map"
70+
assert unknown_log["reason"] == "no match found"

0 commit comments

Comments
 (0)