# Source: scripts/custom_processor_aap.py (ansible/aap-rag-content on GitHub, main branch)
"""Custom metadata processor for AAP documentation.

This module provides AAPMetadataProcessor class for processing AAP product
documentation metadata and generating vector databases.
"""

# pylint: disable=import-error
import functools
import json
from pathlib import Path

from aap_rag_content import utils
from aap_rag_content.document_processor import DocumentProcessor
from aap_rag_content.metadata_processor import MetadataProcessor

# Common parent of every AAP product documentation folder listed below.
_PRODUCT_DOCS_ROOT = "aap-product-docs-plaintext/red_hat_content/documentation"

# Folders where AAP product documentation markdown (.md) files are stored.
AAP_PRODUCT_DOCS = [
    f"{_PRODUCT_DOCS_ROOT}/{subfolder}"
    for subfolder in (
        "ansible_on_clouds",
        "red_hat_ansible_automation_platform/2.6",
        "red_hat_ansible_lightspeed_with_ibm_watsonx_code_assistant",
    )
]

# Folders where additional documents are stored as plain text (.txt) files.
ADDITIONAL_DOCS = ["additional_docs"]


class AAPMetadataProcessor(MetadataProcessor):
    """Metadata processor for AAP documentation.

    Extends MetadataProcessor to handle AAP-specific metadata stored in JSON
    files. For a source document ``<dir>/<name>.<ext>`` the metadata is read
    from ``<dir>/.metadata/<name>.json``.
    """

    def __init__(self, suppress_ping_url: bool = False):
        super().__init__(suppress_ping_url=suppress_ping_url)

    @staticmethod
    @functools.lru_cache(maxsize=None)
    def _read_metadata_file(file_path_str: str) -> dict:
        """Read and parse the metadata JSON file of a source document.

        The cache lives on a static function keyed on the path only, so it
        does not hold references to processor instances (caching a bound
        method would key on ``self`` and keep every instance alive).

        Args:
            file_path_str: Path of the source document, as a string.

        Returns:
            The parsed content of the metadata JSON file.

        Raises:
            RuntimeError: If the metadata JSON file does not exist.
        """
        file_path = Path(file_path_str)
        metadata_path = file_path.parent / ".metadata" / f"{file_path.stem}.json"
        if not metadata_path.exists():
            raise RuntimeError(f"Metadata JSON file {metadata_path} does not exist")
        return json.loads(metadata_path.read_text(encoding="utf8"))

    def _load_metadata(self, file_path_str: str) -> dict:
        """Return a dict that contains metadata for the specified source
        document.

        Results are cached per path, so each metadata file is parsed once
        even though both the URL and the title lookups go through here.

        Raises:
            RuntimeError: If the metadata JSON file does not exist.
        """
        return self._read_metadata_file(file_path_str)

    def url_function(self, file_path: str) -> str:
        """Return a URL for the file, so it can be referenced when used
        in an answer.

        Raises:
            RuntimeError: If the metadata file is missing, or if it has no
                non-empty "url" entry.
        """
        url = self._load_metadata(file_path).get("url")
        if not url:
            raise RuntimeError(f"URL is not found for {file_path}")
        return url

    def get_file_title(self, file_path: str) -> str:
        """If a title is found in the metadata JSON file, return it.
        Otherwise, extract title from the plaintext doc file.

        The fallback is only applied to ``.txt`` files: the first line of the
        file is used, with any leading ``#`` and space characters removed.

        Raises:
            RuntimeError: If the metadata file is missing, or if the metadata
                has no title and the file is not a ``.txt`` file.
        """
        title = self._load_metadata(file_path).get("title")
        if title:
            return title
        if not file_path.endswith(".txt"):
            raise RuntimeError(f"Title metadata is not found for the markdown file {file_path}")
        # Only the first line is needed, so avoid reading the whole document.
        with Path(file_path).open(encoding="utf8") as doc_file:
            first_line = doc_file.readline()
        return first_line.rstrip("\n").lstrip("# ")


def main():
    """Process the AAP documentation folders and write the vector database.

    Command-line options come from the common argument parser. Markdown files
    are processed from every folder in AAP_PRODUCT_DOCS first, then plain text
    files from every folder in ADDITIONAL_DOCS; all folders are resolved
    relative to the ``folder`` argument. The result is saved using the
    ``index`` and ``output`` arguments.
    """
    cli_args = utils.get_common_arg_parser().parse_args()

    # Custom metadata processor that supplies URLs and titles.
    metadata_processor = AAPMetadataProcessor(suppress_ping_url=cli_args.suppress_ping_url)

    # Document processor configured from the command-line options.
    document_processor = DocumentProcessor(
        chunk_size=cli_args.chunk,
        chunk_overlap=cli_args.overlap,
        model_name=cli_args.model_name,
        embeddings_model_dir=cli_args.model_dir,
        num_workers=cli_args.workers,
        vector_store_type=cli_args.vector_store_type,
        manual_chunking=cli_args.manual_chunking,
    )

    # Each document set pairs a list of folders with the file extension to
    # pick up from them; process() is called once per folder, in this order.
    base_dir = Path(cli_args.folder)
    document_sets = (
        (AAP_PRODUCT_DOCS, ".md"),
        (ADDITIONAL_DOCS, ".txt"),
    )
    for relative_folders, extension in document_sets:
        for relative_folder in relative_folders:
            document_processor.process(
                base_dir / relative_folder,
                metadata=metadata_processor,
                required_exts=[extension],
            )

    # Save the new vector database to the output directory
    document_processor.save(cli_args.index, cli_args.output)


# Run the processing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()