sdsc-ordes · rmfranken · Sep 25, 2025 · Oct 9, 2025 · Oct 14, 2025
diff --git a/src/strings2things/app/api/endpoints.py b/src/strings2things/app/api/endpoints.py
@@ -14,7 +14,7 @@
 ontology_manager.load_ontologies()
 
 # Keep transformer as a base instance
-base_label_map = ontology_manager.get_label_map()
+base_label_map = ontology_manager.get_predicate_label_map()
 
 
 @router.post("/transform")

diff --git a/src/strings2things/app/config.py b/src/strings2things/app/config.py
@@ -8,7 +8,6 @@ class Settings(BaseSettings):
     GRAPHDB_PASSWORD: str
     ONTOLOGY_SPARQL_ENDPOINT: str
     ONTOLOGY_GRAPH_IRIS: str  # raw string from .env
-    FAIL_ON_AMBIGUOUS_LABELS: bool = True
 
     _graph_iris: List[str] = PrivateAttr()
 

diff --git a/src/strings2things/app/core/ontology_manager.py b/src/strings2things/app/core/ontology_manager.py
@@ -1,27 +1,24 @@
-from rdflib import Graph
+from rdflib import Graph, Literal, URIRef, RDF, RDFS, XSD
 from SPARQLWrapper import SPARQLWrapper, TURTLE
 from strings2things.app.config import Settings
-from rdflib import Literal
-from rdflib import XSD
 
 settings = Settings()
 
 
 class OntologyManager:
     def __init__(self):
         self.graph = Graph()
-        self.label_map: dict[str, str] = {}
+        # {predicate_iri -> {normalized_label -> instance_iri}}; filled by load_ontologies()
+        self.label_map: dict[str, dict[str, str]] = {}
 
     def load_ontologies(self):
-        print(
-            f"[INFO] Connecting to SPARQL endpoint: {settings.ONTOLOGY_SPARQL_ENDPOINT}"
-        )
+        print(f"[INFO] Connecting to SPARQL endpoint: {settings.ONTOLOGY_SPARQL_ENDPOINT}")
         for graph_iri in settings.get_graph_iris():
             print(f"[INFO] Loading named graph: {graph_iri}")
             g = self._load_named_graph(settings.ONTOLOGY_SPARQL_ENDPOINT, graph_iri)
             self.graph += g
         print(f"[INFO] Loaded {len(self.graph)} triples.")
-        self._build_label_map()
+        self._build_predicate_label_map()
 
     def _load_named_graph(self, endpoint: str, graph_iri: str) -> Graph:
         sparql = SPARQLWrapper(endpoint)
@@ -40,55 +37,71 @@ def _load_named_graph(self, endpoint: str, graph_iri: str) -> Graph:
         g = Graph()
         g.parse(data=result, format="turtle")
         return g
-
-    def _build_label_map(self):
-        seen = {}
-        for s, p, o in self.graph:
-            if str(p) not in (
-                "http://www.w3.org/2000/01/rdf-schema#label",
-                "http://www.w3.org/2004/02/skos/core#prefLabel",
-            ):
-                continue
-
-            if not isinstance(o, Literal):
-                continue
-
-            if o.datatype and o.datatype != XSD.string:
-                continue
-
-            label = str(o).strip().lower()
-            iri = str(s)
-
-            if label in seen:
-                if seen[label] != iri:
-                    # Mark ambiguity by storing list of IRIs
-                    if isinstance(seen[label], list):
-                        seen[label].append(iri)
-                    else:
-                        seen[label] = [seen[label], iri]
-            else:
-                seen[label] = iri
-
-        # Check ambiguities and build final label_map
-        self.label_map = self._check_ambiguities(seen)
-        print(f"[INFO] Label map built with {len(self.label_map)} unambiguous labels.")
-
-    def _check_ambiguities(self, seen: dict[str, str | list[str]]) -> dict[str, str]:
-        ambiguous_labels = {
-            label for label, iris in seen.items() if isinstance(iris, list)
-        }
-
-        if ambiguous_labels:
-            msg = f"Found ambiguous labels: {', '.join(sorted(ambiguous_labels))} \n Please resolve these in your ontology before proceeding."
-            if settings.FAIL_ON_AMBIGUOUS_LABELS:
+
+    def _check_predicate_ambiguities(
+        self, predicate_map: dict[str, dict[str, str]]
+    ) -> dict[str, dict[str, str]]:
+        """
+        Fail if any predicate has labels that collide once lower-cased and stripped.
+
+        :param predicate_map: {predicate_iri -> {label -> instance_iri}}
+        :return: the same map, unchanged, when no collisions are found
+        :raises ValueError: listing every colliding label with all of its IRIs
+        """
+        for predicate, labels in predicate_map.items():
+            # Group IRIs by normalized label so the report names every conflicting
+            # entry, including the first one registered under that label.
+            by_label: dict[str, list[str]] = {}
+            for label, iri in labels.items():
+                by_label.setdefault(label.lower().strip(), []).append(iri)
+
+            duplicates = {lbl: iris for lbl, iris in by_label.items() if len(iris) > 1}
+            if duplicates:
+                msg = (
+                    f"Ambiguous labels detected for predicate {predicate}:\n"
+                    + "\n".join(f"  '{lbl}' → {iris}" for lbl, iris in duplicates.items())
+                )
                 raise ValueError(msg)
-            else:
-                print(f"[WARNING] {msg}")
 
-        # Return only unambiguous labels (those with a single IRI string)
-        return {
-            label: iris for label, iris in seen.items() if not isinstance(iris, list)
-        }
+        return predicate_map
+
+    def _build_predicate_label_map(self):
+        """
+        Build ``self.label_map`` as {predicate_iri -> {label -> instance_iri}} from SHACL shapes.
+        Labels are stripped and lower-cased; shapes sharing one sh:path are merged.
+        Raises ValueError if a label maps to two different instances for a predicate.
+        """
+        SH_CLASS = URIRef("http://www.w3.org/ns/shacl#class")
+        SH_PATH = URIRef("http://www.w3.org/ns/shacl#path")
+        SH_PROPERTY_SHAPE = URIRef("http://www.w3.org/ns/shacl#PropertyShape")
+        label_preds = (RDFS.label, URIRef("http://www.w3.org/2004/02/skos/core#prefLabel"))
+        predicate_map: dict[str, dict[str, str]] = {}
 
-    def get_label_map(self) -> dict[str, str]:
+        for shape in self.graph.subjects(RDF.type, SH_PROPERTY_SHAPE):
+            for predicate in self.graph.objects(shape, SH_PATH):
+                predicate_iri = str(predicate)
+                # setdefault, not "= {}": a second shape with the same sh:path must extend the map.
+                labels = predicate_map.setdefault(predicate_iri, {})
+                for cls in self.graph.objects(shape, SH_CLASS):
+                    for instance in self.graph.subjects(RDF.type, cls):
+                        instance_iri = str(instance)
+                        for label_pred in label_preds:
+                            for label in self.graph.objects(instance, label_pred):
+                                if isinstance(label, Literal) and (not label.datatype or label.datatype == XSD.string):
+                                    key = str(label).strip().lower()
+                                    # Ambiguous only if another instance already owns this label.
+                                    if labels.setdefault(key, instance_iri) != instance_iri:
+                                        raise ValueError(
+                                            f"Ambiguous label '{key}' for predicate {predicate_iri}"
+                                        )
+
+        self.label_map = predicate_map
+
+
+
+
+    def get_predicate_label_map(self) -> dict[str, dict[str, str]]:
+        """
+        Return ``self.label_map``: {predicate_iri -> {label -> instance_iri}} (live dict, not a copy).
+        """
         return self.label_map
diff --git a/src/strings2things/app/core/rdf_transformer.py b/src/strings2things/app/core/rdf_transformer.py
@@ -10,80 +10,62 @@
 
 
 class RDFTransformer:
-    def __init__(self, label_map: dict[str, str], fuzzy: bool, fuzzy_threshold: int = 90):
+    def __init__(self, predicate_label_map: dict[str, dict[str, str]], fuzzy: bool, fuzzy_threshold: int = 90):
         """
-        :param label_map: dict of {label -> IRI}
-        :param fuzzy_threshold: minimum score for fuzzy fallback
+        :param predicate_label_map: {predicate_iri: {label -> IRI}}
         """
-        self.label_map = label_map
+        self.predicate_label_map = predicate_label_map
         self.fuzzy = fuzzy
         self.fuzzy_threshold = fuzzy_threshold
         self.log = TransformationLog()
 
-    def _find_match(self, label: str) -> str | None:
-        """
-        Find an IRI for the given label.
-        First tries exact match, then (optionally) falls back to fuzzy.
-        """
+    def _find_match(self, predicate: str, label: str) -> str | None:
         label = label.strip().lower()
+        label_map = self.predicate_label_map.get(predicate, {})
 
-        # Exact match first (cheap lookup)
-        iri = self.label_map.get(label)
+        # Exact match
+        iri = label_map.get(label)
         if iri:
             return iri
 
         # Fuzzy fallback
-        if self.fuzzy:
-            best = process.extractOne(label, self.label_map.keys())
+        if self.fuzzy and label_map:
+            best = process.extractOne(label, label_map.keys())
             if best:
                 match, score, _ = best
                 if score >= self.fuzzy_threshold:
-                    return self.label_map[match]
+                    return label_map[match]
 
         return None
 
-
     def transform(self, input_graph: Graph) -> Graph:
-        """
-        Replace matching string literals in the RDF graph with IRIs.
-        Returns a transformed RDFLib Graph.
-        """
         output_graph = Graph()
 
         for s, p, o in input_graph:
+            # String literals matching a label of predicate p gain IRI triples; all other triples are copied unchanged.
             if isinstance(o, Literal) and isinstance(o.value, str):
-                iri_str = self._find_match(o.value)
+                iri_str = self._find_match(str(p), o.value)
                 if iri_str:
                     iri = URIRef(iri_str)
-
-                    # Retain original triple (for backward compatibility)
                     output_graph.add((s, p, o))
                     output_graph.add((iri, URIRef("http://www.example.org/thingOf"), o))
                     output_graph.add((s, p, iri))
-
                     self.log.add_entry(
                         subject=str(s),
                         predicate=str(p),
                         original_value=str(o),
                         replacement_iri=str(iri),
-                        reason="exact match"
-                        if o.value.strip().lower() in self.label_map
-                        else f"fuzzy match (threshold={self.fuzzy_threshold})",
+                        reason="exact match" if o.value.strip().lower() in self.predicate_label_map.get(str(p), {}) else f"fuzzy match (threshold={self.fuzzy_threshold})",
                     )
                     continue
 
-            # If no match found → leave as-is
             output_graph.add((s, p, o))
             self.log.add_entry(
                 subject=str(s),
                 predicate=str(p),
                 original_value=str(o),
                 replacement_iri=None,
-                reason=(
-                    "not a string literal"
-                    if not isinstance(o, Literal)
-                    else "no match found"
-                ),
+                reason="not a string literal" if not (isinstance(o, Literal) and isinstance(o.value, str)) else "no match found",
             )
 
         return output_graph
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,52 @@
+# tests/conftest.py
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+import pytest
+from rdflib import Graph
+
+# --- Environment Setup ---
+env_path = Path(__file__).resolve().parents[1] / ".env"
+load_dotenv(env_path)
+
+from strings2things.app.config import Settings
+from strings2things.app.core.ontology_manager import OntologyManager
+from strings2things.app.core.rdf_transformer import RDFTransformer
+
+settings = Settings()  # now environment variables are loaded
+# --- Fixtures ---
+
+@pytest.fixture
+def ontology_manager():
+    """
+    Return an OntologyManager whose predicate label map is built from examples/ontologies/test_ont.ttl.
+    """
+    repo_root = Path(__file__).resolve().parents[1]  # anchor on this file, not the CWD
+    om = OntologyManager()
+    om.graph.parse(str(repo_root / "examples" / "ontologies" / "test_ont.ttl"), format="turtle")
+    om._build_predicate_label_map()
+    return om
+
+
+@pytest.fixture
+def input_graph():
+    """Return the input RDF graph parsed from examples/data/test_data.ttl."""
+    # Anchor on this file rather than the CWD so pytest can be launched from any directory.
+    repo_root = Path(__file__).resolve().parents[1]
+    g = Graph()
+    g.parse(str(repo_root / "examples" / "data" / "test_data.ttl"), format="turtle")
+    return g
+
+
+@pytest.fixture
+def simple_label_map():
+    """
+    Returns a small flat {label -> IRI} map for RDFTransformer unit tests.
+
+    NOTE(review): RDFTransformer takes {predicate_iri -> {label -> IRI}};
+    confirm the tests nest this map under a predicate before passing it in.
+    """
+    from rdflib import Namespace
+    EX = Namespace("http://example.org/ontology#")
+    names = ("Geology", "Biology", "Physics")
+    return {name.lower(): str(EX[name]) for name in names}