Skip to content

Commit 75f6963

Browse files
committed
feat: stable version for semantic similarity based terminology mappings
1 parent 4f0c64c commit 75f6963

File tree

2 files changed

+94
-21
lines changed

2 files changed

+94
-21
lines changed

semantic_iot/RML_preprocess.py

Lines changed: 92 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import json
22
import logging
33
import os
4+
import time
45
from typing import List, Any
56
from rapidfuzz import fuzz
6-
from rdflib import Graph, RDF, RDFS, OWL
7+
from rdflib import Graph, RDF, RDFS, OWL, SKOS, DC
78
from sentence_transformers import SentenceTransformer, util
8-
from semantic_iot.JSON_preprocess import JSONPreprocessor, JSONPreprocessorHandler
9+
from semantic_iot.JSON_preprocess import JSONPreprocessorHandler
910

1011

1112
class MappingPreprocess:
@@ -55,6 +56,9 @@ def __init__(self,
5556
self.ontology_property_classes = None
5657
self.ontology_prefixes = None
5758
# self.ontology_prefixes_convert = None
59+
# only for semantic mode
60+
self.ontology_classes_semantic_info = None
61+
self.ontology_property_classes_semantic_info = None
5862

5963
if similarity_mode not in ["string", "semantic"]:
6064
logging.warning(f"Invalid similarity mode: {similarity_mode}. "
@@ -98,27 +102,72 @@ def load_ontology(self):
98102
for s, p, o in _graph.triples((None, RDF.type, OWL.Class)):
99103
label = _graph.value(subject=s, predicate=RDFS.label)
100104
if label:
101-
ontology_classes[str(label).lower()] = str(s)
105+
ontology_classes[str(label).lower()] = s
102106
else:
103107
local_name = s.split("#")[-1] if "#" in s else s.split("/")[-1]
104-
ontology_classes[local_name.lower()] = str(s)
108+
ontology_classes[local_name.lower()] = s
105109
self.ontology_classes = ontology_classes
106110

107111
# Extracting property classes
108112
property_classes = {}
109113
for s, p, o in _graph.triples((None, RDF.type, OWL.ObjectProperty)):
110114
label = _graph.value(subject=s, predicate=RDFS.label)
111115
if label:
112-
property_classes[str(label).lower()] = str(s)
116+
property_classes[str(label).lower()] = s
113117
else:
114118
local_name = s.split("#")[-1] if "#" in s else s.split("/")[-1]
115-
property_classes[local_name.lower()] = str(s)
119+
property_classes[local_name.lower()] = s
116120
self.ontology_property_classes = property_classes
117121

118122
# load namespaces
119123
self.ontology_prefixes = {p: str(ns) for p, ns in _graph.namespaces()}
120124
# self.ontology_prefixes_convert = {v: k for k, v in self.ontology_prefixes.items()}
121125

126+
# (beta) create embedding from ontology classes
127+
# Build semantic info for ontology classes
128+
if self.similarity_mode == "semantic":
129+
print("Building semantic info for ontology classes...")
130+
start_time = time.perf_counter()
131+
self.ontology_classes_semantic_info = self._build_semantic_info(
132+
self.ontology_classes, _graph)
133+
# Build semantic info for ontology property classes
134+
self.ontology_property_classes_semantic_info = self._build_semantic_info(
135+
self.ontology_property_classes, _graph)
136+
print("Embeddings are built.")
137+
end_time = time.perf_counter()
138+
print(f"Time taken to build semantic info: {end_time - start_time:.2f} seconds")
139+
140+
def _build_semantic_info(self, classes_dict, _graph):
141+
"""
142+
For each label and its corresponding IRI in the given dictionary,
143+
retrieve the descriptive text from the graph (using _graph and RDFS.comment),
144+
build the semantic string, and compute its embedding.
145+
"""
146+
semantic_info = {}
147+
for label, iri in classes_dict.items():
148+
# Find possible descriptive text from the graph
149+
description1 = _graph.value(subject=iri, predicate=RDFS.comment)
150+
description2 = _graph.value(subject=iri, predicate=SKOS.definition)
151+
description3 = _graph.value(subject=iri, predicate=DC.description)
152+
153+
# use the one that is not None
154+
description = description1 or description2 or description3 or None
155+
156+
if description:
157+
combined_string = f"{label.lower()}: {str(description).lower()}"
158+
else:
159+
combined_string = label.lower()
160+
161+
# Encode the combined string
162+
embedding = self.embedding_model.encode(combined_string)
163+
164+
semantic_info[label] = {
165+
"iri": iri,
166+
"string": combined_string,
167+
"embedding": embedding
168+
}
169+
return semantic_info
170+
122171
@staticmethod
123172
def string_similarity(str1: str, str2: str):
124173
"""
@@ -127,17 +176,33 @@ def string_similarity(str1: str, str2: str):
127176
score = fuzz.ratio(str1.lower(), str2.lower())
128177
return score
129178

130-
def semantic_similarity(self, str1: str, str2: str):
179+
def semantic_similarity_mappings(self, semantic_info: dict, string: str) -> List[tuple]:
    """
    Score ``string`` against every entry of a semantic info lookup.

    Args:
        semantic_info: Mapping ``label -> {"iri": ..., "embedding": ...}``,
            e.g. the lookups for ontology classes or ontology property classes.
        string: Text to compare against every entry.

    Returns:
        List of ``(iri, similarity)`` tuples, one per entry, where
        ``similarity`` is the cosine similarity scaled to a percentage.

    Raises:
        ValueError: If ``semantic_info`` is ``None``, i.e. the embeddings
            have not been built.
    """
    if semantic_info is None:
        # Fail with a clear message instead of an AttributeError on ``.items()``.
        raise ValueError("Semantic info is not available. Load the ontology with "
                         "similarity mode 'semantic' before requesting semantic mappings.")

    # The query is embedded once and reused for every comparison.
    embeddings_string = self.embedding_model.encode(string)
    return [
        (info["iri"], util.cos_sim(embeddings_string, info["embedding"]).item() * 100)
        for info in semantic_info.values()
    ]
192+
193+
def class_semantic_similarity_mappings(self, resource_type: str) -> List[tuple]:
    """
    (Beta) Score a resource type against all ontology classes.

    Delegates to ``semantic_similarity_mappings`` using
    ``self.ontology_classes_semantic_info`` as the candidates and returns
    its result unchanged.
    """
    class_semantic_info = self.ontology_classes_semantic_info
    return self.semantic_similarity_mappings(
        semantic_info=class_semantic_info,
        string=resource_type,
    )
199+
200+
def property_semantic_similarity_mappings(self, property_str: str) -> List[tuple]:
131201
"""
132-
(Beta) Compute the semantic similarity between two strings using the embedding model.
202+
(Beta) Compute the semantic similarity between the property string and all ontology property classes.
133203
"""
134-
# Encode the strings
135-
embeddings_1 = self.embedding_model.encode(str1)
136-
embeddings_2 = self.embedding_model.encode(str2)
137-
# Compute the cosine similarity
138-
similarity = util.cos_sim(embeddings_1, embeddings_2).item()
139-
# print("Similarity between '", str1, "' and '", str2, "'", ": ", similarity)
140-
return similarity * 100 # Convert to percentage
204+
return self.semantic_similarity_mappings(semantic_info=self.ontology_property_classes_semantic_info,
205+
string=property_str)
141206

142207
def suggestion_condition_top_matches(self, n: int, mappings: List[tuple]):
143208
"""
@@ -187,12 +252,13 @@ def suggest_class(self, entity_type):
187252
mappings = [(iri, self.string_similarity(keyword, label))
188253
for label, iri in self.ontology_classes.items()]
189254
elif self.similarity_mode == "semantic":
190-
mappings = [(iri, self.semantic_similarity(keyword, label))
191-
for label, iri in self.ontology_classes.items()]
255+
mappings = self.class_semantic_similarity_mappings(resource_type=keyword)
256+
else:
257+
raise ValueError(f"Invalid similarity mode: {self.similarity_mode}. "
258+
f"Choose either 'string' or 'semantic'.")
192259

193260
return self.suggestion_condition_top_matches(n=3, mappings=mappings)
194261

195-
196262
def suggest_property_class(self, attribute_path:str):
197263
"""
198264
Suggest a property class for the given attribute path based on the ontology classes.
@@ -204,8 +270,10 @@ def suggest_property_class(self, attribute_path:str):
204270
mappings = [(iri, self.string_similarity(keyword, label))
205271
for label, iri in self.ontology_property_classes.items()]
206272
elif self.similarity_mode == "semantic":
207-
mappings = [(iri, self.semantic_similarity(keyword, label))
208-
for label, iri in self.ontology_property_classes.items()]
273+
mappings = self.property_semantic_similarity_mappings(property_str=keyword)
274+
else:
275+
raise ValueError(f"Invalid similarity mode: {self.similarity_mode}. "
276+
f"Choose either 'string' or 'semantic'.")
209277

210278
return self.suggestion_condition_top_matches(n=3, mappings=mappings)
211279

@@ -352,6 +420,10 @@ def create_rdf_node_relationship_file(self, overwrite: bool = False):
352420
resource_type = resource['nodetype']
353421
suggested_class = self.suggest_class(resource_type)
354422
resource["class"] = f"**TODO: PLEASE CHECK** {suggested_class}"
423+
for relationship in resource["hasRelationship"]:
424+
attribute_path = relationship["rawdataidentifier"]
425+
suggested_property_class = self.suggest_property_class(attribute_path)
426+
relationship["propertyClass"] = f"**TODO: PLEASE CHECK** {suggested_property_class}"
355427

356428
# create namespaces from ontology prefixes
357429
context = self.ontology_prefixes

test/test_relationship_finder.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ def rml_preprocess(json_file_path, ontology_file_paths, platform_config):
3636
rdf_node_relationship_file_path=json_file_path.replace(".json", "_node_relationship.json"),
3737
ontology_file_paths=ontology_file_paths,
3838
platform_config=platform_config,
39-
similarity_mode="string",
39+
# similarity_mode="string", # levenshtein distance
40+
similarity_mode="semantic", # embedding model "sentence-transformers/all-MiniLM-L6-v2"
4041
)
4142
# Load JSON and ontologies
4243
processor.pre_process(overwrite=True)

0 commit comments

Comments
 (0)