Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/strings2things/app/api/endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
ontology_manager.load_ontologies()

# Keep transformer as a base instance
base_label_map = ontology_manager.get_label_map()
base_label_map = ontology_manager.get_predicate_label_map()


@router.post("/transform")
Expand Down
1 change: 0 additions & 1 deletion src/strings2things/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ class Settings(BaseSettings):
GRAPHDB_PASSWORD: str
ONTOLOGY_SPARQL_ENDPOINT: str
ONTOLOGY_GRAPH_IRIS: str # raw string from .env
FAIL_ON_AMBIGUOUS_LABELS: bool = True

_graph_iris: List[str] = PrivateAttr()

Expand Down
125 changes: 69 additions & 56 deletions src/strings2things/app/core/ontology_manager.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,24 @@
from rdflib import Graph
from rdflib import Graph, Literal, URIRef, RDF, RDFS, XSD
from SPARQLWrapper import SPARQLWrapper, TURTLE
from strings2things.app.config import Settings
from rdflib import Literal
from rdflib import XSD

settings = Settings()


class OntologyManager:
def __init__(self):
    """
    Create an empty manager; nothing is fetched until ``load_ontologies`` runs.
    """
    # Accumulates the triples of every named graph that gets loaded.
    self.graph = Graph()
    # {predicate IRI -> {normalised label -> instance IRI}}
    self.label_map: dict[str, dict[str, str]] = {}

def load_ontologies(self):
    """
    Load every configured named graph into ``self.graph`` and rebuild the
    predicate-specific label map.

    The endpoint and the graph IRIs come from the module-level ``settings``.
    """
    endpoint = settings.ONTOLOGY_SPARQL_ENDPOINT
    print(f"[INFO] Connecting to SPARQL endpoint: {endpoint}")
    for graph_iri in settings.get_graph_iris():
        print(f"[INFO] Loading named graph: {graph_iri}")
        # Merge each named graph into the single in-memory graph.
        self.graph += self._load_named_graph(endpoint, graph_iri)
    print(f"[INFO] Loaded {len(self.graph)} triples.")
    self._build_predicate_label_map()

def _load_named_graph(self, endpoint: str, graph_iri: str) -> Graph:
sparql = SPARQLWrapper(endpoint)
Expand All @@ -40,55 +37,71 @@ def _load_named_graph(self, endpoint: str, graph_iri: str) -> Graph:
g = Graph()
g.parse(data=result, format="turtle")
return g

def _build_label_map(self):
seen = {}
for s, p, o in self.graph:
if str(p) not in (
"http://www.w3.org/2000/01/rdf-schema#label",
"http://www.w3.org/2004/02/skos/core#prefLabel",
):
continue

if not isinstance(o, Literal):
continue

if o.datatype and o.datatype != XSD.string:
continue

label = str(o).strip().lower()
iri = str(s)

if label in seen:
if seen[label] != iri:
# Mark ambiguity by storing list of IRIs
if isinstance(seen[label], list):
seen[label].append(iri)
else:
seen[label] = [seen[label], iri]
else:
seen[label] = iri

# Check ambiguities and build final label_map
self.label_map = self._check_ambiguities(seen)
print(f"[INFO] Label map built with {len(self.label_map)} unambiguous labels.")

def _check_ambiguities(self, seen: dict[str, str | list[str]]) -> dict[str, str]:
ambiguous_labels = {
label for label, iris in seen.items() if isinstance(iris, list)
}

if ambiguous_labels:
msg = f"Found ambiguous labels: {', '.join(sorted(ambiguous_labels))} \n Please resolve these in your ontology before proceeding."
if settings.FAIL_ON_AMBIGUOUS_LABELS:

def _check_predicate_ambiguities(
self, predicate_map: dict[str, dict[str, str]]
) -> dict[str, dict[str, str]]:
"""
Fail immediately if any predicate has duplicate labels.
Returns the same map if no ambiguities are found.
"""
for predicate, labels in predicate_map.items():
seen: set[str] = set()
duplicates: dict[str, list[str]] = {}

for label, iri in labels.items():
label_lower = label.lower().strip()
if label_lower in seen:
duplicates.setdefault(label_lower, []).append(iri)
else:
seen.add(label_lower)

if duplicates:
msg = (
f"Ambiguous labels detected for predicate {predicate}:\n"
+ "\n".join(f" '{lbl}' → {iris}" for lbl, iris in duplicates.items())
)
raise ValueError(msg)
else:
print(f"[WARNING] {msg}")

# Return only unambiguous labels (those with a single IRI string)
return {
label: iris for label, iris in seen.items() if not isinstance(iris, list)
}
return predicate_map

Comment thread
rmfranken marked this conversation as resolved.
def _build_predicate_label_map(self):
    """
    Build a predicate-specific label map and store it in ``self.label_map``:
    {predicate_iri -> {label -> instance_iri}}

    For every sh:PropertyShape in ``self.graph``, each sh:path predicate is
    mapped to the labels (rdfs:label / skos:prefLabel, plain or xsd:string
    literals only, stripped and lower-cased) of all instances of the shape's
    sh:class targets.

    Several shapes may share one sh:path; their labels are merged rather than
    overwritten. Meeting the same label again for the same instance is not an
    error (e.g. identical rdfs:label and skos:prefLabel).

    :raises ValueError: if one label maps to two different instances under
        the same predicate
    """
    SH_CLASS = URIRef("http://www.w3.org/ns/shacl#class")
    SH_PATH = URIRef("http://www.w3.org/ns/shacl#path")
    SH_PROPERTY_SHAPE = URIRef("http://www.w3.org/ns/shacl#PropertyShape")
    label_predicates = (
        RDFS.label,
        URIRef("http://www.w3.org/2004/02/skos/core#prefLabel"),
    )

    predicate_map: dict[str, dict[str, str]] = {}

    for shape in self.graph.subjects(RDF.type, SH_PROPERTY_SHAPE):
        # The target classes do not depend on the path, so look them up once per shape.
        target_classes = list(self.graph.objects(shape, SH_CLASS))

        for predicate in self.graph.objects(shape, SH_PATH):
            predicate_iri = str(predicate)
            # setdefault keeps labels gathered from an earlier shape with this path.
            labels = predicate_map.setdefault(predicate_iri, {})

            for cls in target_classes:
                for instance in self.graph.subjects(RDF.type, cls):
                    instance_iri = str(instance)
                    for label_pred in label_predicates:
                        for label in self.graph.objects(instance, label_pred):
                            if not isinstance(label, Literal):
                                continue
                            if label.datatype and label.datatype != XSD.string:
                                continue

                            normalized_label = str(label).strip().lower()
                            existing_iri = labels.setdefault(normalized_label, instance_iri)
                            # Only a clash between different instances is ambiguous.
                            if existing_iri != instance_iri:
                                raise ValueError(
                                    f"Ambiguous label '{normalized_label}' for predicate {predicate_iri}"
                                    f": {existing_iri} vs {instance_iri}"
                                )

    self.label_map = predicate_map




def get_predicate_label_map(self) -> dict[str, dict[str, str]]:
    """
    Hand out the {predicate_iri -> {label -> IRI}} mapping held in
    ``self.label_map``. The stored dict itself is returned, not a copy.
    """
    predicate_labels = self.label_map
    return predicate_labels
46 changes: 14 additions & 32 deletions src/strings2things/app/core/rdf_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,80 +10,62 @@


class RDFTransformer:
    """
    Replace string literals in an RDF graph with IRIs, using label maps that
    are scoped to the predicate of each triple.
    """

    def __init__(self, predicate_label_map: dict[str, dict[str, str]], fuzzy: bool, fuzzy_threshold: int = 90):
        """
        :param predicate_label_map: {predicate_iri: {label -> IRI}}
        :param fuzzy: whether to fall back to fuzzy matching when there is no exact match
        :param fuzzy_threshold: minimum score for fuzzy fallback
        """
        self.predicate_label_map = predicate_label_map
        self.fuzzy = fuzzy
        self.fuzzy_threshold = fuzzy_threshold
        self.log = TransformationLog()

    def _find_match(self, predicate: str, label: str) -> str | None:
        """
        Find an IRI for ``label`` among the labels known for ``predicate``.
        First tries an exact match on the stripped, lower-cased label, then
        (if ``self.fuzzy`` is set) falls back to fuzzy matching.

        :return: the matching IRI, or None if nothing qualifies
        """
        label = label.strip().lower()
        label_map = self.predicate_label_map.get(predicate, {})

        # Exact match first (cheap lookup)
        iri = label_map.get(label)
        if iri:
            return iri

        # Fuzzy fallback; skipped when the predicate has no labels at all.
        if self.fuzzy and label_map:
            best = process.extractOne(label, label_map.keys())
            if best:
                match, score, _ = best
                if score >= self.fuzzy_threshold:
                    return label_map[match]

        return None

    def transform(self, input_graph: Graph) -> Graph:
        """
        Replace matching string literals in the RDF graph with IRIs.
        Returns a transformed RDFLib Graph; ``input_graph`` is not modified.
        Every triple gets an entry in ``self.log``.
        """
        output_graph = Graph()

        for s, p, o in input_graph:
            predicate = str(p)
            iri_str = None
            if isinstance(o, Literal) and isinstance(o.value, str):
                iri_str = self._find_match(predicate, o.value)

            if not iri_str:
                # If no match found → leave as-is
                # NOTE(review): a typed, non-string Literal is also logged as
                # "no match found" although no lookup was attempted — confirm intended.
                output_graph.add((s, p, o))
                self.log.add_entry(
                    subject=str(s),
                    predicate=predicate,
                    original_value=str(o),
                    replacement_iri=None,
                    reason="not a string literal" if not isinstance(o, Literal) else "no match found",
                )
                continue

            iri = URIRef(iri_str)

            # Retain original triple (for backward compatibility)
            output_graph.add((s, p, o))
            output_graph.add((iri, URIRef("http://www.example.org/thingOf"), o))
            output_graph.add((s, p, iri))

            # The label is exact when its normalised form is a key for this predicate.
            is_exact = o.value.strip().lower() in self.predicate_label_map.get(predicate, {})
            self.log.add_entry(
                subject=str(s),
                predicate=predicate,
                original_value=str(o),
                replacement_iri=str(iri),
                reason="exact match" if is_exact else f"fuzzy match (threshold={self.fuzzy_threshold})",
            )

        return output_graph
52 changes: 52 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# tests/conftest.py
import os
from pathlib import Path
from dotenv import load_dotenv
import pytest
from rdflib import Graph

# --- Environment Setup ---
# Load the .env file that sits one directory above tests/ before anything
# from the project is imported.
env_path = Path(__file__).resolve().parents[1] / ".env"
load_dotenv(env_path)

# These imports deliberately come after load_dotenv(): ontology_manager
# creates a module-level Settings() on import, so the environment variables
# must already be in place.
from strings2things.app.config import Settings
from strings2things.app.core.ontology_manager import OntologyManager
from strings2things.app.core.rdf_transformer import RDFTransformer

settings = Settings() # now environment variables are loaded
# --- Fixtures ---

@pytest.fixture
def ontology_manager():
    """
    Provide an OntologyManager whose graph is parsed from
    examples/ontologies/test_ont.ttl (no SPARQL endpoint involved) and whose
    predicate-specific label map has already been built.
    """
    manager = OntologyManager()
    manager.graph.parse("examples/ontologies/test_ont.ttl", format="turtle")
    manager._build_predicate_label_map()
    return manager


@pytest.fixture
def input_graph():
    """
    Provide the sample input data, parsed from examples/data/test_data.ttl
    into a fresh RDF graph.
    """
    data_graph = Graph()
    data_graph.parse("examples/data/test_data.ttl", format="turtle")
    return data_graph


@pytest.fixture
def simple_label_map():
    """
    Provide a small flat {label -> IRI} map for RDFTransformer unit tests.

    NOTE(review): RDFTransformer takes {predicate_iri: {label -> IRI}};
    presumably tests nest this map under a predicate IRI — confirm.
    """
    from rdflib import Namespace

    ontology = Namespace("http://example.org/ontology#")
    return {
        name.lower(): str(ontology[name])
        for name in ("Geology", "Biology", "Physics")
    }
Loading