11import json
22import logging
33import os
4+ import time
45from typing import List , Any
56from rapidfuzz import fuzz
6- from rdflib import Graph , RDF , RDFS , OWL
7+ from rdflib import Graph , RDF , RDFS , OWL , SKOS , DC
78from sentence_transformers import SentenceTransformer , util
8- from semantic_iot .JSON_preprocess import JSONPreprocessor , JSONPreprocessorHandler
9+ from semantic_iot .JSON_preprocess import JSONPreprocessorHandler
910
1011
1112class MappingPreprocess :
@@ -55,6 +56,9 @@ def __init__(self,
5556 self .ontology_property_classes = None
5657 self .ontology_prefixes = None
5758 # self.ontology_prefixes_convert = None
59+ # only for semantic mode
60+ self .ontology_classes_semantic_info = None
61+ self .ontology_property_classes_semantic_info = None
5862
5963 if similarity_mode not in ["string" , "semantic" ]:
6064 logging .warning (f"Invalid similarity mode: { similarity_mode } . "
@@ -98,27 +102,72 @@ def load_ontology(self):
98102 for s , p , o in _graph .triples ((None , RDF .type , OWL .Class )):
99103 label = _graph .value (subject = s , predicate = RDFS .label )
100104 if label :
101- ontology_classes [str (label ).lower ()] = str ( s )
105+ ontology_classes [str (label ).lower ()] = s
102106 else :
103107 local_name = s .split ("#" )[- 1 ] if "#" in s else s .split ("/" )[- 1 ]
104- ontology_classes [local_name .lower ()] = str ( s )
108+ ontology_classes [local_name .lower ()] = s
105109 self .ontology_classes = ontology_classes
106110
107111 # Extracting property classes
108112 property_classes = {}
109113 for s , p , o in _graph .triples ((None , RDF .type , OWL .ObjectProperty )):
110114 label = _graph .value (subject = s , predicate = RDFS .label )
111115 if label :
112- property_classes [str (label ).lower ()] = str ( s )
116+ property_classes [str (label ).lower ()] = s
113117 else :
114118 local_name = s .split ("#" )[- 1 ] if "#" in s else s .split ("/" )[- 1 ]
115- property_classes [local_name .lower ()] = str ( s )
119+ property_classes [local_name .lower ()] = s
116120 self .ontology_property_classes = property_classes
117121
118122 # load namespaces
119123 self .ontology_prefixes = {p : str (ns ) for p , ns in _graph .namespaces ()}
120124 # self.ontology_prefixes_convert = {v: k for k, v in self.ontology_prefixes.items()}
121125
126+ # (beta) create embedding from ontology classes
127+ # Build semantic info for ontology classes
128+ if self .similarity_mode == "semantic" :
129+ print ("Building semantic info for ontology classes..." )
130+ start_time = time .perf_counter ()
131+ self .ontology_classes_semantic_info = self ._build_semantic_info (
132+ self .ontology_classes , _graph )
133+ # Build semantic info for ontology property classes
134+ self .ontology_property_classes_semantic_info = self ._build_semantic_info (
135+ self .ontology_property_classes , _graph )
136+ print ("Embeddings are built." )
137+ end_time = time .perf_counter ()
138+ print (f"Time taken to build semantic info: { end_time - start_time :.2f} seconds" )
139+
def _build_semantic_info(self, classes_dict, _graph):
    """
    Build the semantic lookup table for a set of ontology terms.

    For each label and its IRI in ``classes_dict``, the first non-empty
    description found in ``_graph`` under RDFS.comment, SKOS.definition or
    DC.description (checked in that order) is combined with the label into
    one lower-cased string. All strings are then embedded with
    ``self.embedding_model``.

    Args:
        classes_dict: Mapping of label to the IRI of the ontology term.
        _graph: Graph queried through
            ``_graph.value(subject=..., predicate=...)``.

    Returns:
        dict: Maps each label to a dict with the keys ``"iri"``,
        ``"string"`` (the text that was embedded) and ``"embedding"``.
        Empty when ``classes_dict`` is empty.
    """
    if not classes_dict:
        # Nothing to embed, so the model is not called at all.
        return {}

    description_predicates = (RDFS.comment, SKOS.definition, DC.description)

    labels = []
    iris = []
    combined_strings = []
    for label, iri in classes_dict.items():
        # Stop at the first predicate that yields a non-empty description
        # instead of querying the graph for all three every time.
        description = None
        for predicate in description_predicates:
            description = _graph.value(subject=iri, predicate=predicate)
            if description:
                break

        if description:
            combined_string = f"{label.lower()}: {str(description).lower()}"
        else:
            combined_string = label.lower()

        labels.append(label)
        iris.append(iri)
        combined_strings.append(combined_string)

    # Encode every string in one call rather than once per label.
    # NOTE(review): assumes encode() accepts a list of strings and returns
    # one embedding per input in the same order (SentenceTransformer.encode
    # does); the construction of self.embedding_model is not visible here.
    embeddings = self.embedding_model.encode(combined_strings)

    return {
        label: {
            "iri": iri,
            "string": combined_string,
            "embedding": embedding,
        }
        for label, iri, combined_string, embedding
        in zip(labels, iris, combined_strings, embeddings)
    }
170+
122171 @staticmethod
123172 def string_similarity (str1 : str , str2 : str ):
124173 """
@@ -127,17 +176,33 @@ def string_similarity(str1: str, str2: str):
127176 score = fuzz .ratio (str1 .lower (), str2 .lower ())
128177 return score
129178
130- def semantic_similarity (self , str1 : str , str2 : str ):
def semantic_similarity_mappings(self, semantic_info: dict, string: str) -> List[tuple]:
    """
    Score a string against every entry of a semantic info table.

    Args:
        semantic_info: Table whose values each hold an ``"iri"`` and a
            precomputed ``"embedding"`` (the structure returned by
            ``_build_semantic_info``).
        string: Text to embed with ``self.embedding_model`` and compare
            against each stored embedding.

    Returns:
        One ``(iri, similarity)`` tuple per table entry, where similarity
        is the cosine similarity multiplied by 100. An empty table yields
        an empty list without calling the embedding model.

    Raises:
        ValueError: If ``semantic_info`` is None, i.e. the table was never
            built.
    """
    if semantic_info is None:
        # The tables start as None and are only filled while loading the
        # ontology in "semantic" mode; fail with an actionable message
        # instead of an AttributeError on None.
        raise ValueError("Semantic info has not been built. Load the ontology "
                         "with similarity mode 'semantic' first.")
    if not semantic_info:
        return []

    # Embed the query once and reuse it for every comparison.
    query_embedding = self.embedding_model.encode(string)

    mappings = []
    for info in semantic_info.values():
        # Cosine similarity, converted to a percentage
        similarity = util.cos_sim(query_embedding, info["embedding"]).item() * 100
        mappings.append((info["iri"], similarity))
    return mappings
192+
def class_semantic_similarity_mappings(self, resource_type: str) -> List[tuple]:
    """
    (Beta) Score a resource type against all ontology classes.

    Delegates to ``semantic_similarity_mappings`` using the class table
    stored in ``self.ontology_classes_semantic_info``.

    Args:
        resource_type: Text to compare against the ontology classes.

    Returns:
        Whatever ``semantic_similarity_mappings`` returns for that table.
    """
    score_against = self.semantic_similarity_mappings
    class_table = self.ontology_classes_semantic_info
    return score_against(semantic_info=class_table, string=resource_type)
199+
def property_semantic_similarity_mappings(self, property_str: str) -> List[tuple]:
    """
    (Beta) Score a property string against all ontology property classes.

    Delegates to ``semantic_similarity_mappings`` using the property table
    stored in ``self.ontology_property_classes_semantic_info``.

    Args:
        property_str: Text to compare against the ontology property classes.

    Returns:
        Whatever ``semantic_similarity_mappings`` returns for that table.
    """
    score_against = self.semantic_similarity_mappings
    property_table = self.ontology_property_classes_semantic_info
    return score_against(semantic_info=property_table, string=property_str)
141206
142207 def suggestion_condition_top_matches (self , n : int , mappings : List [tuple ]):
143208 """
@@ -187,12 +252,13 @@ def suggest_class(self, entity_type):
187252 mappings = [(iri , self .string_similarity (keyword , label ))
188253 for label , iri in self .ontology_classes .items ()]
189254 elif self .similarity_mode == "semantic" :
190- mappings = [(iri , self .semantic_similarity (keyword , label ))
191- for label , iri in self .ontology_classes .items ()]
255+ mappings = self .class_semantic_similarity_mappings (resource_type = keyword )
256+ else :
257+ raise ValueError (f"Invalid similarity mode: { self .similarity_mode } . "
258+ f"Choose either 'string' or 'semantic'." )
192259
193260 return self .suggestion_condition_top_matches (n = 3 , mappings = mappings )
194261
195-
196262 def suggest_property_class (self , attribute_path :str ):
197263 """
198264 Suggest a property class for the given attribute path based on the ontology classes.
@@ -204,8 +270,10 @@ def suggest_property_class(self, attribute_path:str):
204270 mappings = [(iri , self .string_similarity (keyword , label ))
205271 for label , iri in self .ontology_property_classes .items ()]
206272 elif self .similarity_mode == "semantic" :
207- mappings = [(iri , self .semantic_similarity (keyword , label ))
208- for label , iri in self .ontology_property_classes .items ()]
273+ mappings = self .property_semantic_similarity_mappings (property_str = keyword )
274+ else :
275+ raise ValueError (f"Invalid similarity mode: { self .similarity_mode } . "
276+ f"Choose either 'string' or 'semantic'." )
209277
210278 return self .suggestion_condition_top_matches (n = 3 , mappings = mappings )
211279
@@ -352,6 +420,10 @@ def create_rdf_node_relationship_file(self, overwrite: bool = False):
352420 resource_type = resource ['nodetype' ]
353421 suggested_class = self .suggest_class (resource_type )
354422 resource ["class" ] = f"**TODO: PLEASE CHECK** { suggested_class } "
423+ for relationship in resource ["hasRelationship" ]:
424+ attribute_path = relationship ["rawdataidentifier" ]
425+ suggested_property_class = self .suggest_property_class (attribute_path )
426+ relationship ["propertyClass" ] = f"**TODO: PLEASE CHECK** { suggested_property_class } "
355427
356428 # create namespaces from ontology prefixes
357429 context = self .ontology_prefixes
0 commit comments