diff --git a/src/strings2things/app/api/endpoints.py b/src/strings2things/app/api/endpoints.py index 5a37322..5c7039b 100644 --- a/src/strings2things/app/api/endpoints.py +++ b/src/strings2things/app/api/endpoints.py @@ -14,7 +14,7 @@ ontology_manager.load_ontologies() # Keep transformer as a base instance -base_label_map = ontology_manager.get_label_map() +base_label_map = ontology_manager.get_predicate_label_map() @router.post("/transform") diff --git a/src/strings2things/app/config.py b/src/strings2things/app/config.py index d956628..9bd3608 100644 --- a/src/strings2things/app/config.py +++ b/src/strings2things/app/config.py @@ -8,7 +8,6 @@ class Settings(BaseSettings): GRAPHDB_PASSWORD: str ONTOLOGY_SPARQL_ENDPOINT: str ONTOLOGY_GRAPH_IRIS: str # raw string from .env - FAIL_ON_AMBIGUOUS_LABELS: bool = True _graph_iris: List[str] = PrivateAttr() diff --git a/src/strings2things/app/core/ontology_manager.py b/src/strings2things/app/core/ontology_manager.py index 16768e5..40a7ca0 100644 --- a/src/strings2things/app/core/ontology_manager.py +++ b/src/strings2things/app/core/ontology_manager.py @@ -1,8 +1,6 @@ -from rdflib import Graph +from rdflib import Graph, Literal, URIRef, RDF, RDFS, XSD from SPARQLWrapper import SPARQLWrapper, TURTLE from strings2things.app.config import Settings -from rdflib import Literal -from rdflib import XSD settings = Settings() @@ -10,18 +8,17 @@ class OntologyManager: def __init__(self): self.graph = Graph() - self.label_map: dict[str, str] = {} + # Properly initialize as empty dict + self.label_map: dict[str, dict[str, str]] = {} def load_ontologies(self): - print( - f"[INFO] Connecting to SPARQL endpoint: {settings.ONTOLOGY_SPARQL_ENDPOINT}" - ) + print(f"[INFO] Connecting to SPARQL endpoint: {settings.ONTOLOGY_SPARQL_ENDPOINT}") for graph_iri in settings.get_graph_iris(): print(f"[INFO] Loading named graph: {graph_iri}") g = self._load_named_graph(settings.ONTOLOGY_SPARQL_ENDPOINT, graph_iri) self.graph += g print(f"[INFO] Loaded {len(self.graph)} triples.") - self._build_label_map() + self._build_predicate_label_map() def _load_named_graph(self, endpoint: str, graph_iri: str) -> Graph: sparql = SPARQLWrapper(endpoint) @@ -41,54 +38,67 @@ def _load_named_graph(self, endpoint: str, graph_iri: str) -> Graph: g.parse(data=result, format="turtle") return g - def _build_label_map(self): - seen = {} - for s, p, o in self.graph: - if str(p) not in ( - "http://www.w3.org/2000/01/rdf-schema#label", - "http://www.w3.org/2004/02/skos/core#prefLabel", - ): - continue - - if not isinstance(o, Literal): - continue - - if o.datatype and o.datatype != XSD.string: - continue - - label = str(o).strip().lower() - iri = str(s) - - if label in seen: - if seen[label] != iri: - # Mark ambiguity by storing list of IRIs - if isinstance(seen[label], list): - seen[label].append(iri) - else: - seen[label] = [seen[label], iri] - else: - seen[label] = iri - - # Check ambiguities and build final label_map - self.label_map = self._check_ambiguities(seen) - print(f"[INFO] Label map built with {len(self.label_map)} unambiguous labels.") - - def _check_ambiguities(self, seen: dict[str, str | list[str]]) -> dict[str, str]: - ambiguous_labels = { - label for label, iris in seen.items() if isinstance(iris, list) - } - - if ambiguous_labels: - msg = f"Found ambiguous labels: {', '.join(sorted(ambiguous_labels))} \n Please resolve these in your ontology before proceeding." - if settings.FAIL_ON_AMBIGUOUS_LABELS: + def _check_predicate_ambiguities( + self, predicate_map: dict[str, dict[str, str]] + ) -> dict[str, dict[str, str]]: + """ + Fail immediately if any predicate has duplicate labels. + Returns the same map if no ambiguities are found. + """ + for predicate, labels in predicate_map.items(): + seen: set[str] = set() + duplicates: dict[str, list[str]] = {} + + for label, iri in labels.items(): + label_lower = label.lower().strip() + if label_lower in seen: + duplicates.setdefault(label_lower, []).append(iri) + else: + seen.add(label_lower) + + if duplicates: + msg = ( + f"Ambiguous labels detected for predicate {predicate}:\n" + + "\n".join(f" '{lbl}' → {iris}" for lbl, iris in duplicates.items()) + ) raise ValueError(msg) - else: - print(f"[WARNING] {msg}") - # Return only unambiguous labels (those with a single IRI string) - return { - label: iris for label, iris in seen.items() if not isinstance(iris, list) - } + return predicate_map - def get_label_map(self) -> dict[str, str]: + def _build_predicate_label_map(self): + """ + Build a predicate-specific label map: + {predicate_iri -> {label -> instance_iri}} + Fail immediately if any predicate contains duplicate labels. + """ + predicate_map: dict[str, dict[str, str]] = {} + + SH_CLASS = URIRef("http://www.w3.org/ns/shacl#class") + SH_PATH = URIRef("http://www.w3.org/ns/shacl#path") + SH_PROPERTY_SHAPE = URIRef("http://www.w3.org/ns/shacl#PropertyShape") + + for shape in self.graph.subjects(RDF.type, SH_PROPERTY_SHAPE): + for predicate in self.graph.objects(shape, SH_PATH): + predicate_iri = str(predicate) + predicate_map[predicate_iri] = {} + + for cls in self.graph.objects(shape, SH_CLASS): + for instance in self.graph.subjects(RDF.type, cls): + for label_pred in [RDFS.label, URIRef("http://www.w3.org/2004/02/skos/core#prefLabel")]: + for label in self.graph.objects(instance, label_pred): + if isinstance(label, Literal) and (not label.datatype or label.datatype == XSD.string): + normalized_label = str(label).strip().lower() + if normalized_label in predicate_map[predicate_iri]: + raise ValueError( + f"Ambiguous label '{normalized_label}' for predicate {predicate_iri}" + ) + predicate_map[predicate_iri][normalized_label] = str(instance) + + # ✅ Validate the entire map before finalizing + self.label_map = self._check_predicate_ambiguities(predicate_map) + + def get_predicate_label_map(self) -> dict[str, dict[str, str]]: + """ + Returns the predicate-specific label map ready for RDFTransformer. + """ return self.label_map diff --git a/src/strings2things/app/core/rdf_transformer.py b/src/strings2things/app/core/rdf_transformer.py index 2248253..25f743a 100644 --- a/src/strings2things/app/core/rdf_transformer.py +++ b/src/strings2things/app/core/rdf_transformer.py @@ -10,80 +10,66 @@ class RDFTransformer: - def __init__(self, label_map: dict[str, str], fuzzy: bool, fuzzy_threshold: int = 90): + def __init__(self, predicate_label_map: dict[str, dict[str, str]], fuzzy: bool, fuzzy_threshold: int = 90): """ - :param label_map: dict of {label -> IRI} - :param fuzzy_threshold: minimum score for fuzzy fallback + :param predicate_label_map: {predicate_iri: {label -> IRI}} """ - self.label_map = label_map + self.predicate_label_map = predicate_label_map self.fuzzy = fuzzy self.fuzzy_threshold = fuzzy_threshold self.log = TransformationLog() - def _find_match(self, label: str) -> str | None: - """ - Find an IRI for the given label. - First tries exact match, then (optionally) falls back to fuzzy. - """ + def _find_match(self, predicate: str, label: str) -> str | None: label = label.strip().lower() + label_map = self.predicate_label_map.get(predicate, {}) - # Exact match first (cheap lookup) - iri = self.label_map.get(label) + # Exact match + iri = label_map.get(label) if iri: return iri # Fuzzy fallback - if self.fuzzy: - best = process.extractOne(label, self.label_map.keys()) + if self.fuzzy and label_map: + best = process.extractOne(label, label_map.keys()) if best: match, score, _ = best if score >= self.fuzzy_threshold: - return self.label_map[match] + return label_map[match] return None - def transform(self, input_graph: Graph) -> Graph: - """ - Replace matching string literals in the RDF graph with IRIs. - Returns a transformed RDFLib Graph. - """ output_graph = Graph() for s, p, o in input_graph: + iri_str = None if isinstance(o, Literal) and isinstance(o.value, str): - iri_str = self._find_match(o.value) + iri_str = self._find_match(str(p), o.value) if iri_str: iri = URIRef(iri_str) - - # Retain original triple (for backward compatibility) output_graph.add((s, p, o)) output_graph.add((iri, URIRef("http://www.example.org/thingOf"), o)) output_graph.add((s, p, iri)) - self.log.add_entry( subject=str(s), predicate=str(p), original_value=str(o), replacement_iri=str(iri), - reason="exact match" - if o.value.strip().lower() in self.label_map - else f"fuzzy match (threshold={self.fuzzy_threshold})", + reason=( + "exact match" + if o.value.strip().lower() in self.predicate_label_map.get(str(p), {}) + else f"fuzzy match (threshold={self.fuzzy_threshold})" + ), ) continue - # If no match found → leave as-is output_graph.add((s, p, o)) self.log.add_entry( subject=str(s), predicate=str(p), original_value=str(o), replacement_iri=None, - reason=( - "not a string literal" - if not isinstance(o, Literal) - else "no match found" - ), + reason="not a string literal" if not isinstance(o, Literal) else "no match found", ) return output_graph diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..9fcd20b --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,52 @@ +# tests/conftest.py +import os +from pathlib import Path +from dotenv import load_dotenv +import pytest +from rdflib import Graph + +# --- Environment Setup --- +env_path = Path(__file__).resolve().parents[1] / ".env" +load_dotenv(env_path) + +from strings2things.app.config import Settings +from strings2things.app.core.ontology_manager import OntologyManager +from strings2things.app.core.rdf_transformer import RDFTransformer + +settings = Settings() # now environment variables are loaded +# --- Fixtures --- + +@pytest.fixture +def ontology_manager(): + """ + Returns an OntologyManager with predicate-specific label maps built + from examples/ontologies/test_ont.ttl + """ + om = OntologyManager() + om.graph.parse("examples/ontologies/test_ont.ttl", format="turtle") + om._build_predicate_label_map() + return om + + +@pytest.fixture +def input_graph(): + """ + Returns the input RDF graph from examples/data/test_data.ttl + """ + g = Graph() + g.parse("examples/data/test_data.ttl", format="turtle") + return g + + +@pytest.fixture +def simple_label_map(): + """ + Returns a small label map for RDFTransformer unit tests + """ + from rdflib import Namespace + EX = Namespace("http://example.org/ontology#") + return { + "geology": str(EX.Geology), + "biology": str(EX.Biology), + "physics": str(EX.Physics), + } diff --git a/tests/test_ambigous.py b/tests/test_ambigous.py index 7d56155..24e0300 100644 --- a/tests/test_ambigous.py +++ b/tests/test_ambigous.py @@ -1,59 +1,55 @@ +# tests/test_ambiguous.py import pytest from rdflib import Graph, URIRef, Literal +from rdflib.namespace import RDF, RDFS from strings2things.app.core.ontology_manager import OntologyManager -import strings2things.app.config as config_module -settings = config_module.Settings() # use the global settings instance your app uses +SH = "http://www.w3.org/ns/shacl#" +EX = "http://example.org/" - -def create_ambiguous_graph(): +def create_mock_ontology_graph_with_duplicates() -> Graph: + """ + Returns a mock ontology graph containing: + - A PropertyShape constrained to a class + - Two instances of that class with the same label + """ g = Graph() - g.add( - ( - URIRef("http://example.org/resource1"), - URIRef("http://www.w3.org/2000/01/rdf-schema#label"), - Literal("ambiguouslabel"), - ) - ) - g.add( - ( - URIRef("http://example.org/resource2"), - URIRef("http://www.w3.org/2000/01/rdf-schema#label"), - Literal("ambiguouslabel"), - ) - ) - return g - -def test_ambiguous_labels_detection_fail(monkeypatch): - monkeypatch.setattr( - "strings2things.app.core.ontology_manager.settings.FAIL_ON_AMBIGUOUS_LABELS", - True, - ) + # Define a PropertyShape + prop_shape = URIRef(EX + "hasCategoryShape") + g.add((prop_shape, RDF.type, URIRef(SH + "PropertyShape"))) + g.add((prop_shape, URIRef(SH + "path"), URIRef(EX + "hasCategory"))) + g.add((prop_shape, URIRef(SH + "class"), URIRef(EX + "Category"))) - manager = OntologyManager() - manager.graph = create_ambiguous_graph() - - with pytest.raises(ValueError) as exc_info: - manager._build_label_map() - assert "ambiguouslabel" in str(exc_info.value) + # Two instances of Category with the same label + instance1 = URIRef(EX + "cat1") + instance2 = URIRef(EX + "cat2") + g.add((instance1, RDF.type, URIRef(EX + "Category"))) + g.add((instance2, RDF.type, URIRef(EX + "Category"))) + g.add((instance1, RDFS.label, Literal("ambiguouslabel"))) + g.add((instance2, RDFS.label, Literal("ambiguouslabel"))) + return g -def test_ambiguous_labels_detection_no_fail(monkeypatch): - # Patch the *module-level* settings inside ontology_manager to False - monkeypatch.setattr( - "strings2things.app.core.ontology_manager.settings.FAIL_ON_AMBIGUOUS_LABELS", - False, - ) +@pytest.fixture +def ontology_manager_with_duplicates() -> OntologyManager: + """ + Return an OntologyManager instance with the mocked duplicate-ontology graph. + """ manager = OntologyManager() - manager.graph = create_ambiguous_graph() + manager.graph = create_mock_ontology_graph_with_duplicates() + return manager - # Should NOT raise ValueError - try: - manager._build_label_map() - except ValueError: - pytest.fail("ValueError raised even though FAIL_ON_AMBIGUOUS_LABELS is False") - # The ambiguous label should not be in the map - assert "ambiguouslabel" not in manager.get_label_map() +def test_ambiguous_labels_detection_raises(ontology_manager_with_duplicates): + """ + Ensure that _build_predicate_label_map raises ValueError if there are duplicate labels + within the same predicate/class. + """ + manager = ontology_manager_with_duplicates + with pytest.raises(ValueError) as exc_info: + manager._build_predicate_label_map() + + # Check that the error message contains the duplicate label + assert "ambiguouslabel" in str(exc_info.value) diff --git a/tests/test_rdf_transformer.py b/tests/test_rdf_transformer.py index d94ca27..53a9128 100644 --- a/tests/test_rdf_transformer.py +++ b/tests/test_rdf_transformer.py @@ -1,3 +1,4 @@ +# tests/test_rdf_transformer.py import pytest from rdflib import Graph, URIRef, Literal, Namespace from strings2things.app.core.rdf_transformer import RDFTransformer @@ -6,8 +7,10 @@ @pytest.fixture -def label_map(): - # Ontology label map: canonical labels → IRIs +def simple_label_map(): + """ + Mock label map: labels -> instance IRIs + """ return { "geology": str(EX.Geology), "biology": str(EX.Biology), @@ -16,55 +19,35 @@ def label_map(): @pytest.fixture -def input_graph(): +def input_graph_simple(): g = Graph() - # Exact match example + # Exact match g.add((EX.subj1, EX.hasCategory, Literal("geology"))) - - # Fuzzy match example (slightly misspelled) + # Fuzzy match g.add((EX.subj2, EX.hasCategory, Literal("biolgy"))) - - # Unknown label (should remain unchanged) + # Unknown g.add((EX.subj3, EX.hasCategory, Literal("unknownlabel"))) - - # Non-literal value (should remain untouched) + # Non-literal g.add((EX.subj4, EX.hasValue, URIRef("http://example.org/someIRI"))) return g -def test_rdf_transformer_combined(label_map, input_graph): - # Initialize transformer with fuzzy matching enabled - transformer = RDFTransformer(label_map, fuzzy=True, fuzzy_threshold=90) - output_graph = transformer.transform(input_graph) - - # --- 1. Check graph triples --- - - # Exact match: literal replaced by IRI - assert (EX.subj1, EX.hasCategory, URIRef("http://example.org/ontology#Geology")) in output_graph +def test_rdf_transformer(simple_label_map, input_graph_simple): + # Use full predicate IRI in the label map + predicate_label_map = {str(EX.hasCategory): simple_label_map} - # Fuzzy match: literal replaced by IRI - assert (EX.subj2, EX.hasCategory, URIRef("http://example.org/ontology#Biology")) in output_graph + transformer = RDFTransformer( + predicate_label_map=predicate_label_map, + fuzzy=True, + fuzzy_threshold=90 + ) + output_graph = transformer.transform(input_graph_simple) - # Unknown label: literal remains unchanged + # Exact match → replaced with IRI + assert (EX.subj1, EX.hasCategory, EX.Geology) in output_graph + # Fuzzy match → replaced with closest IRI + assert (EX.subj2, EX.hasCategory, EX.Biology) in output_graph + # Unknown → remains literal assert (EX.subj3, EX.hasCategory, Literal("unknownlabel")) in output_graph - - # Non-literals remain untouched + # Non-literal → untouched assert (EX.subj4, EX.hasValue, URIRef("http://example.org/someIRI")) in output_graph - - # --- 2. Check log entries --- - log = transformer.log.entries - - # Exact match log - geology_log = next(e for e in log if e["original"] == "geology") - assert geology_log["replacement"] == "http://example.org/ontology#Geology" - assert geology_log["reason"] == "exact match" - - # Fuzzy match log - biolgy_log = next(e for e in log if e["original"] == "biolgy") - assert biolgy_log["replacement"] == "http://example.org/ontology#Biology" - assert "fuzzy match" in biolgy_log["reason"] - - # Unknown label log - unknown_log = next(e for e in log if e["original"] == "unknownlabel") - assert unknown_log["replacement"] is None - assert unknown_log["reason"] == "no match found"