Skip to content

Commit 76d071e

Browse files
committed
refactor: complete rework into cli tool
1 parent 1d94a74 commit 76d071e

3 files changed

Lines changed: 167 additions & 49 deletions

File tree

src/strings2things/cli2.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import os
2+
import glob
3+
import argparse
4+
import rdflib
5+
from config import args
6+
from strings2things.sparql import (
7+
strings_to_things_query,
8+
things_to_strings_query
9+
)
10+
11+
12+
def load_graphs_from_path(path, graph, file_extension="*.ttl", format="turtle"):
    """
    Load all RDF files from a given path into the provided graph.

    Args:
        path: Directory that is searched for RDF files.
        graph: Target graph; every matching file is parsed into it via
            ``graph.parse``.
        file_extension: Glob pattern, joined onto ``path``, that selects the
            files to load.
        format: Serialization name handed to ``graph.parse``.
    """
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # keeps the parse order and the log output reproducible between runs.
    file_paths = sorted(glob.glob(os.path.join(path, file_extension)))
    if not file_paths:
        # A mistyped directory would otherwise silently produce an empty graph.
        print(f"Warning: no files matching {file_extension} found in {path}.")

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        graph.parse(file_path, format=format)

    # len(graph) is the graph's total size after loading.
    print(f"Loaded {len(graph)} triples from {path}.")
20+
21+
22+
def initialize_graphs(ontology_path, kg_path, ontology_uri, kg_uri):
    """
    Build one rdflib dataset holding the knowledge graph and the ontology.

    Each source directory is loaded into its own named graph of the dataset,
    the knowledge graph first and the ontology second, and the dataset is
    returned.
    """
    store = rdflib.Dataset()

    # (named-graph identifier, directory to load), in loading order.
    sources = (
        (kg_uri, kg_path),
        (ontology_uri, ontology_path),
    )
    for graph_uri, directory in sources:
        named_graph = store.graph(graph_uri)
        load_graphs_from_path(directory, named_graph)

    return store
37+
38+
39+
def strings_to_things(dataset, output_file="subgraph.ttl"):
    """
    Swap human-readable strings in the KG for the matching ontology IRIs.

    Runs ``strings_to_things_query`` against ``dataset``, copies the resulting
    triples into a fresh graph and writes that graph to ``output_file`` as
    Turtle.
    """
    constructed = dataset.query(strings_to_things_query).graph

    rewritten = rdflib.Graph()
    for subject, predicate, obj in constructed:
        rewritten.add((subject, predicate, obj))

    rewritten.serialize(destination=output_file, format="turtle")
    print(f"Strings replaced with IRIs and written to {output_file} ({len(rewritten)} triples).")
51+
52+
53+
def things_to_strings(dataset, output_file="output.ttl"):
    """
    Swap IRIs in the KG for the human-readable labels found in the ontology.

    Runs ``things_to_strings_query`` against ``dataset``, copies the resulting
    triples into a fresh graph and writes that graph to ``output_file`` as
    Turtle.
    """
    labelled = rdflib.Graph()

    query_result = dataset.query(things_to_strings_query)
    for subject, predicate, obj in query_result.graph:
        labelled.add((subject, predicate, obj))

    labelled.serialize(destination=output_file, format="turtle")
    print(f"IRIs replaced with labels and written to {output_file} ({len(labelled)} triples).")
65+
66+
67+
def main():
    """Parse the command line, load both graphs and run the requested conversion."""
    cli = argparse.ArgumentParser(description="Convert between strings and ontology terms.")
    cli.add_argument("direction", choices=["string2thing", "thing2string"], help="Conversion direction")
    cli.add_argument("--ontology", required=True, help="Path to ontology files directory")
    cli.add_argument("--kg", required=True, help="Path to knowledge graph files directory")
    cli.add_argument("--ontology-uri", default="https://imaging-plaza.epfl.ch/ontology#", help="Named graph URI for ontology")
    cli.add_argument("--kg-uri", default="https://imaging-plaza.epfl.ch/finalGraph", help="Named graph URI for knowledge graph")
    options = cli.parse_args()

    dataset = initialize_graphs(options.ontology, options.kg, options.ontology_uri, options.kg_uri)

    # One converter per accepted value of the positional "direction" argument.
    converters = {
        "string2thing": strings_to_things,
        "thing2string": things_to_strings,
    }
    convert = converters.get(options.direction)
    if convert is not None:
        convert(dataset)


if __name__ == "__main__":
    main()

src/strings2things/config.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import argparse
2+
3+
def _build_parser():
    """Create the command-line parser used to populate the module-level ``args``."""
    cli = argparse.ArgumentParser(description="Convert between strings and ontology terms.")
    cli.add_argument("direction", choices=["string2thing", "thing2string"], help="Conversion direction")
    cli.add_argument("--ontology", required=True, help="Path to ontology files directory")
    cli.add_argument("--kg", required=True, help="Path to knowledge graph files directory")
    cli.add_argument("--ontology-uri", default="https://imaging-plaza.epfl.ch/ontology#", help="Named graph URI for ontology")
    cli.add_argument("--kg-uri", default="https://imaging-plaza.epfl.ch/finalGraph", help="Named graph URI for knowledge graph")
    return cli


# NOTE: the command line is parsed when this module is imported, so every
# importer of `args` triggers argument parsing as a side effect.
parser = _build_parser()
args = parser.parse_args()

src/strings2things/sparql.py

Lines changed: 73 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,83 @@
1-
enumeration_query = r"""
2-
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
3-
PREFIX schema: <http://schema.org/>
4-
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
1+
import os
2+
from dotenv import load_dotenv
3+
from config import args
54

6-
construct { ?subject ?predicate ?object }
7-
WHERE {
8-
?subject a schema:Enumeration .
9-
?subject ?predicate ?object .
10-
}
11-
"""
5+
# Load environment variables from the .env file
6+
load_dotenv()
7+
8+
# Graph names are taken from the command-line arguments already parsed in config.py
9+
# Assign graph names
10+
INSTANCE_DATA_GRAPH = args.kg_uri
11+
ONTOLOGY_GRAPH = args.ontology_uri
1212

13-
find_matches_query = r"""
14-
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
13+
# # Parametrize the queries
14+
# enumeration_query = r"""
15+
# PREFIX schema: <http://schema.org/>
16+
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
17+
# PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
18+
# CONSTRUCT {?subject ?predicate ?object }
19+
# WHERE {
20+
# ?subject rdf:type/rdfs:subClassOf* schema:Enumeration .
21+
# ?subject ?predicate ?object .
22+
# }
23+
# """
24+
25+
# CONSTRUCT query that rewrites string objects of the instance-data graph into
# the IRI of the enumeration member carrying the same rdfs:label, restricted to
# the class that the property's SHACL shape declares via sh:class.
#
# The OPTIONAL must enclose the GRAPH block. Written the other way round
# (GRAPH {{ OPTIONAL {{ ... }} }}) the optional part has an empty left-hand side,
# evaluates to every shape/label match in the ontology, and is then inner-joined
# with the KG triple, which drops every triple without a match instead of
# keeping its original value.
#
# NOTE(review): `rdf:type*` also matches a zero-length path, so a class (or
# subclass) whose own label equals the string is returned too; confirm whether
# `rdf:type/rdfs:subClassOf*` was intended.
strings_to_things_query = f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX sh: <http://www.w3.org/ns/shacl#>

CONSTRUCT {{
  ?s ?p ?finalValue .
}}
WHERE {{
  GRAPH <{INSTANCE_DATA_GRAPH}> {{
    ?s ?p ?o .
  }}

  # Find the expected enumeration class for this property and a member of it
  # whose label equals the string value
  OPTIONAL {{
    GRAPH <{ONTOLOGY_GRAPH}> {{
      ?propertyShape sh:path ?p ;
                     sh:class ?expectedEnumClass .

      ?enumInstance rdf:type*/rdfs:subClassOf* ?expectedEnumClass ;
                    rdfs:label ?o .
    }}
  }}

  # Use the matching IRI when there is one; otherwise keep the original value
  BIND(COALESCE(?enumInstance, ?o) AS ?finalValue)
}}
"""
4554

46-
find_predicate_query = r"""
47-
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
55+
# CONSTRUCT query that rewrites IRI objects of the instance-data graph into the
# rdfs:label of that IRI, when the IRI is typed with the class that the
# property's SHACL shape declares via sh:class.
#
# The OPTIONAL must enclose the GRAPH block. Written the other way round
# (GRAPH {{ OPTIONAL {{ ... }} }}) the optional part has an empty left-hand side,
# evaluates to every shape/label match in the ontology, and is then inner-joined
# with the KG triple, which drops every triple without a match instead of
# keeping its original value.
things_to_strings_query = f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX sh: <http://www.w3.org/ns/shacl#>

CONSTRUCT {{
  ?s ?p ?finalValue .
}}
WHERE {{
  GRAPH <{INSTANCE_DATA_GRAPH}> {{
    ?s ?p ?o .
  }}

  # Get the expected enum class for this property and the label of ?o
  OPTIONAL {{
    GRAPH <{ONTOLOGY_GRAPH}> {{
      ?propertyShape sh:path ?p ;
                     sh:class ?expectedEnumClass .

      ?o a ?expectedEnumClass ;
         rdfs:label ?label .
    }}
  }}

  # If a label is found (meaning ?o is an enum instance), use it; otherwise keep the original value
  BIND(COALESCE(?label, ?o) AS ?finalValue)
}}
"""

0 commit comments

Comments
 (0)