
Commit 87a655e

Merge pull request #3 from Ruchip16/knowledge_source
Pull docs from source and store in postgres db
2 parents 7091337 + 411f215 commit 87a655e

13 files changed, +748 -1 lines changed

.gitignore

Lines changed: 5 additions & 1 deletion
@@ -7,4 +7,8 @@ __pycache__/
 .env.*
 .idea/
 .vscode/
-*.db
+*.db
+.mypy_cache/
+.cache/
+.DS_Store
+output*

config/config.yaml

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+version: v1
+ingest_threads: 8
+collections:
+  - name: "Source Collection"
+    id: "source_collection"
+    mode: "overwrite"
+    chunk_size: 500
+    chunk_overlap: 250
+    embedding_model: "all-MiniLM-L6-v2"
+    metadata:
+      key: "value"
+    sources:
+      - type: "source"
+        url_fragment: "/"
+        recursive: true
+        attachments: true
+        metadata:
+          key: "value"

logs/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+*.csv

requirements.txt

Lines changed: 9 additions & 0 deletions
@@ -20,3 +20,12 @@ Jinja2==3.1.6
 MarkupSafe==3.0.2
 slack_sdk==3.35.0
 Werkzeug==3.1.3
+pyigloo @ git+https://github.com/xkahn/pyigloo.git
+langchain_huggingface
+langchain_postgres
+langchain_community
+types-beautifulsoup4  # can be removed after testing with the Igloo API
+hf_xet
+tf-keras
+selenium  # can be removed after testing with the Igloo API
+pdfminer.six

sample.env

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Igloo
+IGLOO_API_KEY=
+IGLOO_ACCESS_KEY=
+IGLOO_USER=
+IGLOO_PASS=
+
+# PGVector
+PGVECTOR_DRIVER="psycopg2"
+PGVECTOR_USER=
+PGVECTOR_PASS=
+PGVECTOR_DATABASE_NAME=
+PGVECTOR_URI="localhost"
+PGVECTOR_PORT="5432"

scripts/ingest_data.py

Whitespace-only changes.

vector_store/constants.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+import os
+import pathlib
+
+import torch
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# PATHS
+DIRECTORY_PATH = pathlib.Path.cwd()
+KNOWLEDGE_REPOSITORY_PATH = DIRECTORY_PATH / "knowledge"
+SOURCE_RESPOSITORY_PATH = KNOWLEDGE_REPOSITORY_PATH / "source"
+
+# INGEST
+DEVICE = (
+    "cuda"
+    if torch.cuda.is_available()
+    else ("mps" if torch.backends.mps.is_available() else "cpu")
+)
+
+# PGVECTOR
+PGVECTOR_USER = os.environ.get("PGVECTOR_USER")
+PGVECTOR_PASS = os.environ.get("PGVECTOR_PASS")
+PGVECTOR_DATABASE_NAME = os.environ.get("PGVECTOR_DATABASE_NAME")
+PGVECTOR_HOST = os.environ.get("PGVECTOR_URI", "localhost")
+PGVECTOR_PORT = int(os.environ.get("PGVECTOR_PORT", 5432))

vector_store/delete_knowledge.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+import logging
+import shutil
+
+from constants import KNOWLEDGE_REPOSITORY_PATH
+
+logger = logging.getLogger(__name__)
+
+
+def delete_knowledge():
+    """Delete everything in the knowledge folder."""
+    if KNOWLEDGE_REPOSITORY_PATH.exists():
+        logger.info(f"Deleting {KNOWLEDGE_REPOSITORY_PATH}")
+        shutil.rmtree(KNOWLEDGE_REPOSITORY_PATH)

vector_store/ingest_data.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+"""Data Ingestion"""
+
+import logging
+import pathlib
+from datetime import datetime
+
+import pandas as pd
+from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+from langchain_postgres import PGVector
+
+from constants import (
+    DEVICE,
+    DIRECTORY_PATH,
+    KNOWLEDGE_REPOSITORY_PATH,
+    PGVECTOR_DATABASE_NAME,
+    PGVECTOR_HOST,
+    PGVECTOR_PASS,
+    PGVECTOR_PORT,
+    PGVECTOR_USER,
+)
+from split import load_documents, split_document
+
+logger = logging.getLogger(__name__)
+
+
+def get_embedder(embedding_model_name: str) -> HuggingFaceEmbeddings:
+    """Initialize an embedder to convert text into vectors."""
+    return HuggingFaceEmbeddings(
+        model_name=embedding_model_name,
+        model_kwargs={"device": DEVICE},
+        show_progress=True,
+    )
+
+
+def ingest(
+    meta_lookup: dict[pathlib.Path, dict],
+    collection_name: str,
+    chunk_size: int,
+    chunk_overlap: int,
+    ingest_threads: int = 8,
+    embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+    mode: str = "overwrite",
+    collection_metadata: dict = {},
+):
+    """Load documents into a vectorstore."""
+    # Get documents
+    all_documents = []
+    origin_urls = {}
+    documents = load_documents(KNOWLEDGE_REPOSITORY_PATH, ingest_threads=ingest_threads)
+    for extension, document in documents:
+        # Split each document into chunks
+        document = document[0]
+        # Rename "source" to "_source" and save filename to "source"
+        source = pathlib.Path(document.metadata["source"])
+        file_name = source.stem
+        document.metadata["_source"] = document.metadata["source"]
+        document.metadata["source"] = file_name
+        chunks = split_document(
+            document, extension, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
+        # Attach metadata to each chunk
+        for chunk in chunks:
+            path_metadata = meta_lookup.get(source, {})
+            chunk.metadata = chunk.metadata | path_metadata
+        # Record how many chunks were made
+        rel_path = source.relative_to(KNOWLEDGE_REPOSITORY_PATH)
+        origin = rel_path.parts[0]
+        origin_url = (origin, chunk.metadata.get("url"))
+        origin_urls[origin_url] = len(chunks)
+        all_documents.extend(chunks)
+
+    # Create embeddings
+    embedder = get_embedder(embedding_model_name)
+
+    # Build the Postgres connection string
+    connection_string = PGVector.connection_string_from_db_params(
+        driver="psycopg",
+        host=PGVECTOR_HOST,
+        port=int(PGVECTOR_PORT),
+        database=PGVECTOR_DATABASE_NAME,
+        user=PGVECTOR_USER,
+        password=PGVECTOR_PASS,
+    )
+
+    # Connect to the db
+    db = PGVector(
+        connection=connection_string,
+        embeddings=embedder,
+        collection_name=collection_name,
+        collection_metadata=collection_metadata,
+        use_jsonb=True,
+    )
+
+    # Overwrite the collection (if requested)
+    if mode == "overwrite":
+        db.delete_collection()
+        logger.info(f"Collection {collection_name} deleted")
+        db.create_collection()
+        logger.info(f"Collection {collection_name} created")
+
+    # Load the documents
+    logger.info(
+        f"Loading {len(all_documents)} embeddings to {PGVECTOR_HOST} - {PGVECTOR_DATABASE_NAME} - {collection_name}"
+    )
+
+    # Add documents to the DB in batches to accommodate the large number of parameters
+    batch_size = 150
+    for i in range(0, len(all_documents), batch_size):
+        batch = all_documents[i:i + batch_size]
+        logger.info(f"Ingesting batch {i // batch_size + 1} of {len(batch)} documents")
+        db.add_documents(documents=batch)
+
+    logger.info(f"Successfully loaded {len(all_documents)} embeddings")
+
+    directory_source_url_chunks = [
+        list(origin_url) + [chunks] for origin_url, chunks in origin_urls.items()
+    ]
+    df = pd.DataFrame(directory_source_url_chunks, columns=["origin", "url", "chunks"])
+    filename = f"{PGVECTOR_HOST} - {collection_name} - {datetime.now()}.csv"
+    outpath = DIRECTORY_PATH / "logs" / filename
+    outpath.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(outpath, index=False)
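
Once ingest() has populated a collection, retrieval can reuse the same embedder and connection string. A minimal query sketch, assuming the same langchain_postgres API used above (the collection name, query text, and k are placeholders, not part of this commit):

# Hypothetical retrieval sketch, not part of this commit.
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_postgres import PGVector

from constants import (
    PGVECTOR_DATABASE_NAME,
    PGVECTOR_HOST,
    PGVECTOR_PASS,
    PGVECTOR_PORT,
    PGVECTOR_USER,
)

embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
connection_string = PGVector.connection_string_from_db_params(
    driver="psycopg",
    host=PGVECTOR_HOST,
    port=int(PGVECTOR_PORT),
    database=PGVECTOR_DATABASE_NAME,
    user=PGVECTOR_USER,
    password=PGVECTOR_PASS,
)
db = PGVector(
    connection=connection_string,
    embeddings=embedder,
    collection_name="source_collection",  # the id used in config/config.yaml
    use_jsonb=True,
)

# Return the k chunks whose embeddings are closest to the query embedding.
for doc in db.similarity_search("example query", k=4):
    print(doc.metadata.get("url"), doc.page_content[:80])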

vector_store/knowledge_source.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+# TODO (@abhikdps): Remove this file once the Igloo API keys
+# are acquired and rename the knowledge_source_igloo.py file to knowledge_source.py
+import pathlib
+import time
+import logging
+from typing import Any
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+from constants import SOURCE_RESPOSITORY_PATH
+
+logger = logging.getLogger(__name__)
+
+
+class SourceScraper:
+    def __init__(self, base_url: str = "https://source.redhat.com/"):
+        chrome_options = Options()
+        chrome_options.add_argument("--start-maximized")
+        self.driver = webdriver.Chrome(options=chrome_options)
+        self.base_url = base_url
+
+        self.driver.get(self.base_url)
+        print("\n Please log in manually and press ENTER here once done...")
+        input()
+        print(" Login confirmed. Proceeding with scraping.")
+
+    def fetch_all_pages(self, url_fragment: str, recursive: bool = False):
+        url = self.base_url.rstrip("/") + url_fragment
+        self.driver.get(url)
+        time.sleep(3)
+
+        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+        pages = [soup]
+
+        if recursive:
+            children_links = soup.select("a[href^='/']")
+            visited = set()
+
+            for link in children_links:
+                href = link.get("href")
+                full_url = self.base_url.rstrip("/") + href
+                if href and href.startswith("/") and full_url not in visited:
+                    visited.add(full_url)
+                    try:
+                        self.driver.get(full_url)
+                        time.sleep(2)
+                        sub_soup = BeautifulSoup(self.driver.page_source, "html.parser")
+                        pages.append(sub_soup)
+                    except Exception as e:
+                        logger.warning(f"Failed to visit {full_url}: {e}")
+
+        return pages
+
+    def extract_attachments(self, soup: BeautifulSoup):
+        attachments = []
+        links = soup.select("a")
+        for link in links:
+            href = link.get("href")
+            if href and any(ext in href for ext in [".pdf", ".docx", ".xlsx"]):
+                attachments.append(href)
+        return attachments
+
+    def save_page(self, soup: BeautifulSoup, path: pathlib.Path):
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(str(soup))
+
+    def download_attachments(self, attachments: list[str], base_path: pathlib.Path):
+        for link in attachments:
+            file_name = link.split("/")[-1]
+            full_path = base_path / file_name
+            try:
+                self.driver.get(
+                    link
+                    if link.startswith("http")
+                    else self.base_url.rstrip("/") + link
+                )
+                with open(full_path, "wb") as f:
+                    f.write(self.driver.page_source.encode("utf-8"))
+            except Exception as e:
+                logger.warning(f"Failed to download attachment {link}: {e}")
+
+    def scrape(
+        self,
+        url_fragment: str,
+        recursive: bool,
+        attachments: bool,
+        metadata: dict[str, Any],
+    ):
+        meta_lookup = {}
+        pages = self.fetch_all_pages(url_fragment, recursive)
+
+        for i, soup in enumerate(pages):
+            title = soup.title.string if soup.title else f"page_{i}"
+            safe_title = title.replace("/", "_").replace(" ", "_")[:50]
+            page_path = (
+                SOURCE_RESPOSITORY_PATH / url_fragment.strip("/") / f"{safe_title}.html"
+            )
+            page_path.parent.mkdir(parents=True, exist_ok=True)
+
+            self.save_page(soup, page_path)
+            file_metadata = metadata.copy()
+            file_metadata["url"] = self.base_url.rstrip("/") + url_fragment
+
+            if attachments:
+                attachment_links = self.extract_attachments(soup)
+                self.download_attachments(attachment_links, page_path.parent)
+
+            meta_lookup[page_path] = file_metadata
+
+        return meta_lookup
+
+
+def fetchall(
+    url_fragment: str,
+    recursive: bool = False,
+    attachments: bool = True,
+    metadata: dict = {},
+    **kwargs,
+):
+    scraper = SourceScraper()
+    return scraper.scrape(url_fragment, recursive, attachments, metadata)
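
scripts/ingest_data.py is still whitespace-only in this commit. A plausible driver would chain fetchall() from knowledge_source.py with ingest() from ingest_data.py for each collection in config/config.yaml; the sketch below only illustrates that wiring (PyYAML and the exact config keys are assumptions, not part of this commit):

# Hypothetical driver sketch, not part of this commit.
import logging

import yaml  # assumption: PyYAML is available

from ingest_data import ingest
from knowledge_source import fetchall

logging.basicConfig(level=logging.INFO)


def main(config_path: str = "config/config.yaml"):
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    for collection in config.get("collections", []):
        # Scrape every configured source and merge the per-file metadata.
        meta_lookup = {}
        for source in collection.get("sources", []):
            meta_lookup |= fetchall(
                url_fragment=source["url_fragment"],
                recursive=source.get("recursive", False),
                attachments=source.get("attachments", True),
                metadata=source.get("metadata", {}),
            )

        # Embed the scraped pages and load them into the PGVector collection.
        ingest(
            meta_lookup=meta_lookup,
            collection_name=collection["id"],
            chunk_size=collection["chunk_size"],
            chunk_overlap=collection["chunk_overlap"],
            ingest_threads=config.get("ingest_threads", 8),
            embedding_model_name=collection["embedding_model"],
            mode=collection.get("mode", "overwrite"),
            collection_metadata=collection.get("metadata", {}),
        )


if __name__ == "__main__":
    main()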
