-
Notifications
You must be signed in to change notification settings - Fork 7.5k
/
Copy pathrdfreader.py
92 lines (69 loc) · 2.7 KB
/
rdfreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# mypy: ignore-errors
"""Read RDF files.
This module is used to read RDF files.
It was created by llama-hub but it has not been ported
to llama-index==0.1.0; this copy includes multiple changes to fix the code.
Original code:
https://github.com/run-llama/llama-hub
"""
import logging
from pathlib import Path
from typing import Any

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import RDF, RDFS
# Module-level logger; RDFReader.load_data emits one debug record per triple through it.
logger = logging.getLogger(__name__)
class RDFReader(BaseReader):
    """Reader that renders the triples of an RDF file as one text ``Document``.

    Each triple of the parsed file, except ``rdfs:label`` statements, becomes
    a line of the form ``<subject> <predicate> <object>``. Every term is
    replaced by one of its ``rdfs:label`` values when one is found, and by
    its plain string form otherwise.

    Attributes:
        g_local: Graph parsed from the file given to the latest ``load_data``
            call (an empty graph before the first call).
        g_global: Graph holding the RDF and RDFS vocabularies, used as a
            fallback source of labels (an empty graph before the first
            ``load_data`` call).
    """

    def __init__(
        self,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Initialize loader."""
        super().__init__(*args, **kwargs)
        # Start with empty graphs so label lookups made before the first
        # load_data() call fall back to the URI instead of raising
        # AttributeError on a missing attribute.
        self.g_local = Graph()
        self.g_global = Graph()

    def fetch_labels(self, uri: URIRef, graph: Graph, lang: str) -> list[Literal]:
        """Fetch all labels of a URI by language.

        Args:
            uri: Term whose ``rdfs:label`` values are looked up.
            graph: Graph to search.
            lang: Language tag to accept.

        Returns:
            The ``rdfs:label`` literals of ``uri`` in ``graph`` whose language
            is ``lang`` or that carry no language tag. The order is whatever
            ``graph.objects`` yields.
        """
        accepted_languages = (lang, None)
        return [
            label
            for label in graph.objects(uri, RDFS.label)
            # Only literals have a language; a label given as a URI or blank
            # node is skipped rather than failing on the attribute access.
            if isinstance(label, Literal) and label.language in accepted_languages
        ]

    def fetch_label_in_graphs(self, uri: URIRef, lang: str = "en") -> Any:
        """Fetch one label of a URI by language from the local or global graph.

        The local graph is searched first, then the global one. When several
        labels match, the first one returned by ``fetch_labels`` is used.

        Args:
            uri: Term to label. ``load_data`` also passes triple objects here,
                which may be literals; those typically have no label and come
                back as their string form.
            lang: Language tag to accept; untagged labels are accepted too.

        Returns:
            The Python value of the chosen label literal, or ``str(uri)`` when
            neither graph has a matching label.
        """
        for graph in (self.g_local, self.g_global):
            labels = self.fetch_labels(uri, graph, lang)
            if labels:
                return labels[0].value
        return str(uri)

    def load_data(self, file: Path, extra_info: dict | None = None) -> list[Document]:
        """Parse file.

        Args:
            file: RDF file to parse; the format is left to ``Graph.parse``.
            extra_info: Optional metadata for the resulting document. A
                ``"lang"`` entry selects the label language (default ``"en"``).
                The dict is copied, never modified.

        Returns:
            A one-element list holding a ``Document`` whose text has one
            ``<subject> <predicate> <object>`` line per triple and whose
            metadata is ``extra_info`` plus ``"graph_type": "rdf"``.
        """
        # Work on a copy: adding "graph_type" must not leak into the caller's dict.
        extra_info = dict(extra_info or {})
        extra_info["graph_type"] = "rdf"
        lang = extra_info.get("lang", "en")

        self.g_local = Graph()
        self.g_local.parse(file)

        # The namespace IRIs themselves are handed to parse() as sources.
        # NOTE(review): this presumably fetches both vocabularies over the
        # network on every call -- confirm, and consider caching the graph.
        self.g_global = Graph()
        self.g_global.parse(str(RDF))
        self.g_global.parse(str(RDFS))

        text_list = []
        for s, p, o in self.g_local:
            logger.debug("s=%s, p=%s, o=%s", s, p, o)
            # Label statements are only used to name other terms; they are
            # not rendered as lines of their own.
            if p == RDFS.label:
                continue

            subj_label = self.fetch_label_in_graphs(s, lang=lang)
            pred_label = self.fetch_label_in_graphs(p, lang=lang)
            obj_label = self.fetch_label_in_graphs(o, lang=lang)
            # A label literal may convert to no Python value at all; drop
            # such triples instead of printing "None".
            if subj_label is None or pred_label is None or obj_label is None:
                continue

            text_list.append(f"<{subj_label}> <{pred_label}> <{obj_label}>")

        text = "\n".join(text_list)
        return [self._text_to_document(text, extra_info)]

    def _text_to_document(self, text: str, extra_info: dict | None = None) -> Document:
        """Wrap ``text`` in a ``Document`` carrying ``extra_info`` (``{}`` if missing)."""
        return Document(text=text, extra_info=extra_info or {})