-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcustom_processor_aap.py
More file actions
118 lines (98 loc) · 4.08 KB
/
custom_processor_aap.py
File metadata and controls
118 lines (98 loc) · 4.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""Custom metadata processor for AAP documentation.

This module provides the AAPMetadataProcessor class for processing AAP product
documentation metadata, and a main() entry point that generates a vector database.
"""
# pylint: disable=import-error
import functools
import json
from pathlib import Path
from aap_rag_content import utils
from aap_rag_content.document_processor import DocumentProcessor
from aap_rag_content.metadata_processor import MetadataProcessor
# Common parent directory of every AAP product documentation folder.
_PRODUCT_DOCS_ROOT = "aap-product-docs-plaintext/red_hat_content/documentation"

# Folders that hold the AAP product documentation as markdown (.md) files.
AAP_PRODUCT_DOCS = [
    f"{_PRODUCT_DOCS_ROOT}/ansible_on_clouds",
    f"{_PRODUCT_DOCS_ROOT}/red_hat_ansible_automation_platform/2.6",
    f"{_PRODUCT_DOCS_ROOT}/red_hat_ansible_lightspeed_with_ibm_watsonx_code_assistant",
]

# Folders that hold additional documents as plain text (.txt) files.
ADDITIONAL_DOCS = ["additional_docs"]
class AAPMetadataProcessor(MetadataProcessor):
    """Metadata processor for AAP documentation.

    Extends MetadataProcessor to handle AAP-specific metadata stored in JSON
    files. The metadata for a source document ``<dir>/<name>.<ext>`` is read
    from ``<dir>/.metadata/<name>.json``.
    """

    def __init__(self, suppress_ping_url: bool = False):
        super().__init__(suppress_ping_url=suppress_ping_url)

    @staticmethod
    @functools.lru_cache(maxsize=None)
    def _read_metadata_file(file_path_str: str) -> dict:
        """Read and parse the metadata JSON file of a source document.

        The cache lives on this static helper and is keyed by the path string
        only. Caching the instance method directly would key on ``self`` and
        keep every processor instance alive for the lifetime of the cache.

        Args:
            file_path_str: Path of the source document, as a string.

        Returns:
            The parsed JSON content (expected to be a JSON object). The same
            object is returned for repeated calls with the same path.

        Raises:
            RuntimeError: If the metadata JSON file does not exist.
        """
        file_path = Path(file_path_str)
        metadata_path = file_path.parent / ".metadata" / f"{file_path.stem}.json"
        if not metadata_path.exists():
            raise RuntimeError(f"Metadata JSON file {metadata_path} does not exist")
        return json.loads(metadata_path.read_text(encoding="utf8"))

    def _load_metadata(self, file_path_str: str) -> dict:
        """Return the metadata for the specified source document.

        Args:
            file_path_str: Path of the source document, as a string.

        Returns:
            The parsed content of the document's metadata JSON file.

        Raises:
            RuntimeError: If the metadata JSON file does not exist.
        """
        return self._read_metadata_file(file_path_str)

    def url_function(self, file_path: str) -> str:
        """Return a URL for the file, so it can be referenced when used
        in an answer.

        Raises:
            RuntimeError: If the metadata has no (non-empty) "url" entry, or
                the metadata JSON file does not exist.
        """
        url = self._load_metadata(file_path).get("url")
        if not url:
            raise RuntimeError(f"URL is not found for {file_path}")
        return url

    def get_file_title(self, file_path: str) -> str:
        """If a title is found in the metadata JSON file, return it.
        Otherwise, extract the title from the plaintext doc file.

        Raises:
            RuntimeError: If the metadata has no title and the file is not a
                ``.txt`` file, or the metadata JSON file does not exist.
        """
        title = self._load_metadata(file_path).get("title")
        if title:
            return title
        # Only plain text documents may fall back to their own first line.
        if not file_path.endswith(".txt"):
            raise RuntimeError(
                f"Title metadata is not found for the markdown file {file_path}"
            )
        first_line = Path(file_path).read_text(encoding="utf8").split("\n")[0]
        # Strip any leading '#' and space characters (heading-style markers).
        return first_line.lstrip("# ")
def main():
    """Process the AAP documentation folders and save the resulting vector database.

    Reads its settings from the common command-line argument parser, embeds
    every configured documentation folder, then saves the index to the output
    directory.
    """
    args = utils.get_common_arg_parser().parse_args()

    # Custom metadata processor that resolves URLs and titles from JSON files.
    aap_metadata = AAPMetadataProcessor(suppress_ping_url=args.suppress_ping_url)

    # Document processor that chunks and embeds the documents.
    doc_processor = DocumentProcessor(
        chunk_size=args.chunk,
        chunk_overlap=args.overlap,
        model_name=args.model_name,
        embeddings_model_dir=args.model_dir,
        num_workers=args.workers,
        vector_store_type=args.vector_store_type,
        manual_chunking=args.manual_chunking,
    )

    # Each entry pairs a group of sub-folders with the file extension to load
    # from them; process() is called once per sub-folder, in this order.
    document_sets = (
        (AAP_PRODUCT_DOCS, ".md"),
        (ADDITIONAL_DOCS, ".txt"),
    )
    for subfolders, extension in document_sets:
        for subfolder in subfolders:
            doc_processor.process(
                Path(args.folder, subfolder),
                metadata=aap_metadata,
                required_exts=[extension],
            )

    # Save the new vector database to the output directory.
    doc_processor.save(args.index, args.output)


if __name__ == "__main__":
    main()