Skip to content

Commit 8399dad

Browse files
committed
♻️ refactor encoders
1 parent 9308bfb commit 8399dad

4 files changed

Lines changed: 197 additions & 220 deletions

File tree

ontoaligner/encoder/encoders.py

Lines changed: 0 additions & 214 deletions
This file was deleted.

ontoaligner/encoder/lightweight.py

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group.
1+
# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -23,7 +23,74 @@
2323
"""
2424
from typing import Any, Dict
2525

26-
from .encoders import LightweightEncoder
26+
from ..base import BaseEncoder
27+
28+
class LightweightEncoder(BaseEncoder):
    """
    Base encoder that extracts ontology items and preprocesses their text.

    Subclasses supply ``get_owl_items`` to decide which fields of an ontology
    item are kept; ``parse`` then runs the ``"text"`` field of every extracted
    item through ``self.preprocess``.
    """

    def parse(self, **kwargs) -> Any:
        """
        Encode every item of the source and target ontologies.

        Each item is passed to ``get_owl_items`` and the ``"text"`` entry of
        the result is replaced with ``self.preprocess(...)`` of its old value.

        Parameters:
            **kwargs: Must contain ``source`` and ``target``, each an iterable
                of ontology items.

        Returns:
            list: ``[encoded_sources, encoded_targets]`` - two lists holding
            the encoded items in their original order.

        Raises:
            KeyError: If ``source`` or ``target`` is missing from ``kwargs``.
        """
        # Look up both keys before doing any work so a missing argument fails fast.
        source_onto, target_onto = kwargs["source"], kwargs["target"]
        encoded = []
        # Source items are processed first, then target items.
        for ontology in (source_onto, target_onto):
            items = []
            for entry in ontology:
                item = self.get_owl_items(owl=entry)
                item["text"] = self.preprocess(item["text"])
                items.append(item)
            encoded.append(items)
        return encoded

    def __str__(self) -> str:
        """
        Returns a string representation of the encoder.

        Returns:
            str: The string form of a one-entry mapping from the class label
            ``"LightweightEncoder"`` to ``self.items_in_owl``.
        """
        # __str__ must return a str; returning the dict itself makes str(),
        # print() and f-string formatting raise TypeError.
        return str({"LightweightEncoder": self.items_in_owl})

    def get_owl_items(self, owl: Dict) -> Any:
        """
        Hook for extracting the relevant fields of one ontology item.

        This base implementation does nothing and returns ``None``. ``parse``
        indexes the result with ``"text"``, so subclasses must override this
        method and return a mapping that contains a ``"text"`` entry.

        Parameters:
            owl (Dict): A dictionary representing an ontology item.

        Returns:
            Any: The extracted ontology data (``None`` in this base class).
        """
        pass

    def get_encoder_info(self) -> str:
        """
        Provides information about the encoder.

        Returns:
            str: A fixed description of what the encoder's output is used for.
        """
        return "INPUT CONSIST OF COMBINED INFORMATION TO FUZZY STRING MATCHING"
93+
2794

2895
class ConceptLightweightEncoder(LightweightEncoder):
2996
"""

ontoaligner/encoder/llm.py

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group.
1+
# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -13,7 +13,72 @@
1313
# limitations under the License.
1414
from typing import Any, Dict
1515

16-
from .encoders import LLMEncoder
16+
from ..base import BaseEncoder
17+
18+
19+
class LLMEncoder(BaseEncoder):
    """
    Base encoder that extracts ontology items for LLM-based alignment.

    Subclasses supply ``get_owl_items`` to decide which fields of an ontology
    item are kept. No text preprocessing is applied by this class.
    """

    def parse(self, **kwargs) -> Any:
        """
        Encode every item of the source and target ontologies.

        Each item is passed to ``get_owl_items`` and the result is collected
        unchanged.

        Parameters:
            **kwargs: Must contain ``source`` and ``target``, each an iterable
                of ontology items.

        Returns:
            list: ``[encoded_sources, encoded_targets]`` - two lists holding
            the encoded items in their original order.

        Raises:
            KeyError: If ``source`` or ``target`` is missing from ``kwargs``.
        """
        # Look up both keys before doing any work so a missing argument fails fast.
        source_onto, target_onto = kwargs["source"], kwargs["target"]
        # Source items are encoded first, then target items.
        source_ontos = [self.get_owl_items(owl=source) for source in source_onto]
        target_ontos = [self.get_owl_items(owl=target) for target in target_onto]
        return [source_ontos, target_ontos]

    def __str__(self) -> str:
        """
        Returns a string representation of the encoder.

        Returns:
            str: The string form of a one-entry mapping from the class label
            ``"LLMEncoder"`` to ``self.items_in_owl``.
        """
        # __str__ must return a str; returning the dict itself makes str(),
        # print() and f-string formatting raise TypeError.
        return str({"LLMEncoder": self.items_in_owl})

    def get_owl_items(self, owl: Dict) -> str:
        """
        Hook for extracting the relevant fields of one ontology item.

        This base implementation does nothing and returns ``None``; subclasses
        are expected to override it.

        Parameters:
            owl (Dict): A dictionary representing an ontology item.

        Returns:
            str: The extracted ontology data (``None`` in this base class).
        """
        pass

    def get_encoder_info(self) -> str:
        """
        Provides information about the encoder.

        Returns:
            str: A fixed description of the encoder's input.
        """
        return "INPUT CONSIST OF A DICTIONARY THAT CONSIST OF INFORMATION FOR THE GIVEN SOURCE-TARGET ONTOLOGIES."
81+
1782

1883
class ConceptLLMEncoder(LLMEncoder):
1984
"""

0 commit comments

Comments
 (0)