-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest_documents_langchain.py
More file actions
114 lines (98 loc) · 3.88 KB
/
ingest_documents_langchain.py
File metadata and controls
114 lines (98 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import argparse
import logging
import os
from typing import Dict, List, Tuple

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

from local_rag import LocalRAG
# Configure the root logger at import time: each record is printed as
# "<timestamp>: <message>", and records below INFO are dropped.
logging.basicConfig(format="%(asctime)s: %(message)s", level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def smart_chunk_text(text: str, chunk_size: int = 2000, chunk_overlap: int = 200) -> List[str]:
    """
    Break ``text`` into overlapping chunks, cutting at the most natural boundary available.

    The work is delegated to a RecursiveCharacterTextSplitter whose separators are listed
    from coarsest to finest: blank line, newline, period, space, and finally the empty
    string. Chunk length is measured in characters via ``len``; ``chunk_size`` and
    ``chunk_overlap`` are passed straight through to the splitter.
    """
    # Coarsest boundary first, so paragraphs are preferred over lines, lines over
    # sentences, and sentences over words.
    boundary_preferences = ["\n\n", "\n", ".", " ", ""]
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": boundary_preferences,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
def process_pdf(file_path: str) -> Tuple[List[str], List[Dict]]:
    """
    Load a PDF, chunk every page, and return the chunks together with per-chunk metadata.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A ``(texts, metadatas)`` pair of equal-length lists. ``texts[k]`` is one chunk of
        page text and ``metadatas[k]`` describes it with the keys:

        * ``source``      - base name of ``file_path``
        * ``page``        - page number reported by the loader, or the page's position
                            in the loaded document when the loader reports none
        * ``chunk``       - index of the chunk within its page
        * ``total_pages`` - number of pages the loader returned

        If loading or chunking fails, the error is logged and ``([], [])`` is returned,
        so a broken PDF never contributes partial output.
    """
    logger.info(f"Processing PDF file: {file_path}")
    try:
        # Load PDF and get pages
        pages = PyPDFLoader(file_path).load()

        # Loop-invariant values, computed once instead of once per chunk.
        source_name = os.path.basename(file_path)
        total_pages = len(pages)

        texts: List[str] = []
        metadatas: List[Dict] = []
        for page_index, page in enumerate(pages):
            # Fall back to the enumeration index rather than raising KeyError (and
            # thereby discarding the whole file) if the loader omitted "page".
            page_number = page.metadata.get("page", page_index)
            for chunk_index, chunk in enumerate(smart_chunk_text(page.page_content)):
                texts.append(chunk)
                metadatas.append({
                    "source": source_name,
                    "page": page_number,
                    "chunk": chunk_index,
                    "total_pages": total_pages,
                })
        return texts, metadatas
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable PDF must not abort a batch.
        logger.error(f"Error processing PDF {file_path}: {str(e)}")
        return [], []
def ingest_documents(directory_path: str) -> None:
    """
    Ingest documents (PDFs, txt, md) from a directory into the RAG system.

    Only the directory's immediate entries are considered (no recursion). Entries are
    visited in sorted name order so repeated runs ingest chunks in the same order, and
    extensions are matched case-insensitively. PDFs are handled by ``process_pdf``;
    ``.txt`` / ``.md`` files are read as UTF-8 and chunked with ``smart_chunk_text``.
    A file that fails to process is logged and skipped. All collected chunks are then
    handed to ``LocalRAG.add_documents`` in a single call.

    Args:
        directory_path: Directory containing the documents to ingest.

    Raises:
        ValueError: If ``directory_path`` does not exist or is not a directory.
    """
    # Validate the path before building the RAG backend, so a bad argument fails
    # immediately and without touching LocalRAG.
    if not os.path.exists(directory_path):
        raise ValueError(f"Directory {directory_path} does not exist")
    if not os.path.isdir(directory_path):
        # os.listdir would otherwise fail with NotADirectoryError on a regular file.
        raise ValueError(f"{directory_path} is not a directory")

    rag = LocalRAG()
    texts = []
    metadatas = []

    # Process all files in the directory
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        lowered_name = filename.lower()  # so ".PDF" / ".TXT" are not silently skipped

        if lowered_name.endswith(".pdf"):
            # Handle PDF files
            pdf_texts, pdf_metadatas = process_pdf(file_path)
            texts.extend(pdf_texts)
            metadatas.extend(pdf_metadatas)
        elif lowered_name.endswith((".txt", ".md")):
            # Handle text files
            logger.info(f"Processing text file: {filename}")
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                # Split text content into chunks
                chunks = smart_chunk_text(content)
            except Exception as e:
                # Best-effort: one unreadable file must not abort the whole batch.
                logger.error(f"Error processing file {filename}: {str(e)}")
                continue
            # Append texts and metadata together so the two lists stay aligned.
            texts.extend(chunks)
            metadatas.extend(
                {"source": filename, "chunk": chunk_index}
                for chunk_index in range(len(chunks))
            )

    if texts:
        rag.add_documents(texts, metadatas)
        logger.info(f"Successfully ingested {len(texts)} document chunks")
    else:
        logger.warning("No valid documents found to ingest")
if __name__ == "__main__":
    # Script entry point: the mandatory --dir option names the folder whose
    # documents are passed to ingest_documents.
    cli = argparse.ArgumentParser(description="Ingest documents into the RAG system")
    cli.add_argument("--dir", type=str, required=True, help="Directory containing documents to ingest")
    ingest_documents(cli.parse_args().dir)