Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dependencies = [
"python-multipart>=0.0.20",
"rdflib>=7.1.4",
"SPARQLWrapper>=2.0.0",
"rapidfuzz>=3.0.0",
]
[dependency-groups]
dev = [
Expand Down
32 changes: 22 additions & 10 deletions src/strings2things/app/api/endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,48 @@
from fastapi import APIRouter, UploadFile, File, Form
from fastapi.responses import Response
from rdflib import Graph
from src.strings2things.app.core.rdf_transformer import RDFTransformer
from src.strings2things.app.core.ontology_manager import (
OntologyManager,
) # Assume this exists
from src.strings2things.app.utils.rdf_utils import (
parse_rdf,
serialize_rdf,
) # Also assume or create
from strings2things.app.core.rdf_transformer import RDFTransformer
from strings2things.app.core.ontology_manager import OntologyManager
from strings2things.app.utils.rdf_utils import parse_rdf, serialize_rdf
import logging

router = APIRouter()

ontology_manager = OntologyManager()
ontology_manager.load_ontologies()

transformer = RDFTransformer(ontology_manager.get_label_map())
# Keep transformer as a base instance
base_label_map = ontology_manager.get_label_map()


@router.post("/transform")
async def transform_rdf(
file: UploadFile = File(...), serialization: str = Form("turtle")
file: UploadFile = File(...),
serialization: str = Form("turtle"),
fuzzy: bool = Form(False), # <-- new parameter
fuzzy_threshold: int = Form(90), # <-- configurable fuzzy matching threshold
) -> Response:
"""
Accepts an RDF file upload, transforms it using the label map,
and returns the modified RDF graph in the requested format.

Args:
file: The uploaded RDF file
serialization: Desired output RDF serialization (default: turtle)
fuzzy: Whether to use fuzzy matching (default: False)
fuzzy_threshold: Similarity threshold for fuzzy matching (0-100, default: 90)
"""
content = await file.read()

try:
input_graph = parse_rdf(content)

transformer = RDFTransformer(
base_label_map,
fuzzy=fuzzy,
fuzzy_threshold=fuzzy_threshold,
)

transformed_graph = transformer.transform(input_graph)
serialized = serialize_rdf(transformed_graph, output_format=serialization)

Expand Down
51 changes: 39 additions & 12 deletions src/strings2things/app/core/rdf_transformer.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,45 @@
# app/core/rdf_transformer.py
"""
Transforms RDF graphs by replacing string literals with matching ontology IRIs.
Supports exact and fuzzy matching (RapidFuzz).
"""

from rdflib import Graph, Literal, URIRef
from strings2things.app.core.transformation_log import TransformationLog
from rapidfuzz import process


class RDFTransformer:
def __init__(self, label_map: dict[str, str]):
def __init__(
    self,
    label_map: dict[str, str],
    fuzzy: bool = False,
    fuzzy_threshold: int = 90,
):
    """
    Store the matching configuration and start an empty transformation log.

    :param label_map: dict of {label -> IRI}
    :param fuzzy: flag requesting fuzzy matching; stored as ``self.fuzzy``.
        Defaults to False so the one-argument form
        ``RDFTransformer(label_map)`` keeps working.
    :param fuzzy_threshold: minimum score for fuzzy fallback
    """
    self.label_map = label_map
    self.fuzzy = fuzzy
    self.fuzzy_threshold = fuzzy_threshold
    # One log per transformer instance; entries are appended during transform().
    self.log = TransformationLog()

def _find_match(self, label: str) -> str | None:
"""
Find an IRI for the given label.
First tries exact match, then falls back to fuzzy.
"""
label = label.strip().lower()

# Exact match
if label in self.label_map:
return self.label_map[label]

# Fuzzy fallback
best = process.extractOne(label, self.label_map.keys())
if best:
match, score, _ = best
if score >= self.fuzzy_threshold:
return self.label_map[match]
Comment thread
rmfranken marked this conversation as resolved.
Outdated

return None

def transform(self, input_graph: Graph) -> Graph:
"""
Replace matching string literals in the RDF graph with IRIs.
Expand All @@ -20,29 +48,28 @@ def transform(self, input_graph: Graph) -> Graph:
output_graph = Graph()

for s, p, o in input_graph:
# if string matches object
if isinstance(o, Literal) and isinstance(o.value, str):
label = o.value.strip().lower()
if label in self.label_map:
iri = URIRef(self.label_map[label])
iri_str = self._find_match(o.value)
if iri_str:
iri = URIRef(iri_str)

# Retain original triple (to retain backward compatibility for now)
# Retain original triple (for backward compatibility)
output_graph.add((s, p, o))
output_graph.add(
(iri, URIRef("http://wwww.example.org/thingOf"), o)
)

output_graph.add((iri, URIRef("http://wwww.example.org/thingOf"), o))
Comment thread
rmfranken marked this conversation as resolved.
Outdated
output_graph.add((s, p, iri))

self.log.add_entry(
subject=str(s),
predicate=str(p),
original_value=str(o),
replacement_iri=str(iri),
reason="unambiguous match",
reason="exact match"
if o.value.strip().lower() in self.label_map
else f"fuzzy match (threshold={self.fuzzy_threshold})",
)
continue

# If no match found → leave as-is
output_graph.add((s, p, o))
self.log.add_entry(
subject=str(s),
Expand All @@ -52,7 +79,7 @@ def transform(self, input_graph: Graph) -> Graph:
reason=(
"not a string literal"
if not isinstance(o, Literal)
else "no match in label map"
else "no match found"
),
)

Expand Down
67 changes: 42 additions & 25 deletions tests/test_rdf_transformer.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,70 @@
import pytest
from rdflib import Graph, URIRef, Literal, Namespace
from strings2things.app.core.rdf_transformer import RDFTransformer
from strings2things.app.core.transformation_log import TransformationLog

EX = Namespace("http://example.org/")
EX = Namespace("http://example.org/ontology#")


@pytest.fixture
def label_map():
# Ontology label map: canonical labels → IRIs
return {
"geology": "http://example.org/ontology#Geology",
"biology": "http://example.org/ontology#Biology",
"geology": str(EX.Geology),
"biology": str(EX.Biology),
"physics": str(EX.Physics),
}


@pytest.fixture
def input_graph():
g = Graph()
g.add((EX.subj1, EX.hasCategory, Literal("Geology")))
g.add((EX.subj2, EX.hasCategory, Literal("UnknownLabel")))
g.add((EX.subj3, EX.hasValue, URIRef("http://example.org/someIRI")))
# Exact match example
g.add((EX.subj1, EX.hasCategory, Literal("geology")))

# Fuzzy match example (slightly misspelled)
g.add((EX.subj2, EX.hasCategory, Literal("biolgy")))

# Unknown label (should remain unchanged)
g.add((EX.subj3, EX.hasCategory, Literal("unknownlabel")))

# Non-literal value (should remain untouched)
g.add((EX.subj4, EX.hasValue, URIRef("http://example.org/someIRI")))
return g


def test_rdf_transformer(label_map, input_graph):
transformer = RDFTransformer(label_map)
def test_rdf_transformer_combined(label_map, input_graph):
# Initialize transformer with fuzzy matching enabled
transformer = RDFTransformer(label_map, fuzzy=True, fuzzy_threshold=90)
output_graph = transformer.transform(input_graph)

# --- Check graph triples ---
# Original triple must remain
assert (EX.subj1, EX.hasCategory, Literal("Geology")) in output_graph
# --- 1. Check graph triples ---

# Transformed triple must be present
expected_iri = URIRef("http://example.org/ontology#Geology")
assert (EX.subj1, EX.hasCategory, expected_iri) in output_graph
# Exact match: literal replaced by IRI
assert (EX.subj1, EX.hasCategory, URIRef("http://example.org/ontology#Geology")) in output_graph

# UnknownLabel should remain unchanged (no IRI added)
assert (EX.subj2, EX.hasCategory, Literal("UnknownLabel")) in output_graph
assert len([t for t in output_graph.triples((EX.subj2, EX.hasCategory, None))]) == 1
# Fuzzy match: literal replaced by IRI
assert (EX.subj2, EX.hasCategory, URIRef("http://example.org/ontology#Biology")) in output_graph

# Non-literals should remain untouched
assert (EX.subj3, EX.hasValue, URIRef("http://example.org/someIRI")) in output_graph
# Unknown label: literal remains unchanged
assert (EX.subj3, EX.hasCategory, Literal("unknownlabel")) in output_graph

# --- Check log entries ---
# Non-literals remain untouched
assert (EX.subj4, EX.hasValue, URIRef("http://example.org/someIRI")) in output_graph

# --- 2. Check log entries ---
log = transformer.log.entries
geology_log = next(e for e in log if e["original"] == "Geology")

# Exact match log
geology_log = next(e for e in log if e["original"] == "geology")
assert geology_log["replacement"] == "http://example.org/ontology#Geology"
assert geology_log["reason"] == "unambiguous match"
assert geology_log["reason"] == "exact match"

# Fuzzy match log
biolgy_log = next(e for e in log if e["original"] == "biolgy")
assert biolgy_log["replacement"] == "http://example.org/ontology#Biology"
assert "fuzzy match" in biolgy_log["reason"]

unknown_log = next(e for e in log if e["original"] == "UnknownLabel")
# Unknown label log
unknown_log = next(e for e in log if e["original"] == "unknownlabel")
assert unknown_log["replacement"] is None
assert unknown_log["reason"] == "no match in label map"
assert unknown_log["reason"] == "no match found"
Loading