
Commit 87a655e

Merge pull request #3 from Ruchip16/knowledge_source
Pull docs from source and store in postgres db
2 parents 7091337 + 411f215 commit 87a655e

13 files changed, +748 -1 lines changed

.gitignore

Lines changed: 5 additions & 1 deletion
@@ -7,4 +7,8 @@ __pycache__/
 .env.*
 .idea/
 .vscode/
-*.db
+*.db
+.mypy_cache/
+.cache/
+.DS_Store
+output*

config/config.yaml

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+version: v1
+ingest_threads: 8
+collections:
+  - name: "Source Collection"
+    id: "source_collection"
+    mode: "overwrite"
+    chunk_size: 500
+    chunk_overlap: 250
+    embedding_model: "all-MiniLM-L6-v2"
+    metadata:
+      key: "value"
+    sources:
+      - type: "source"
+        url_fragment: "/"
+        recursive: true
+        attachments: true
+        metadata:
+          key: "value"

logs/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+*.csv

requirements.txt

Lines changed: 9 additions & 0 deletions
@@ -20,3 +20,12 @@ Jinja2==3.1.6
 MarkupSafe==3.0.2
 slack_sdk==3.35.0
 Werkzeug==3.1.3
+pyigloo @ git+https://github.com/xkahn/pyigloo.git
+langchain_huggingface
+langchain_postgres
+langchain_community
+types-beautifulsoup4  # can be removed after testing with the Igloo API
+hf_xet
+tf-keras
+selenium  # can be removed after testing with the Igloo API
+pdfminer.six

sample.env

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Igloo
+IGLOO_API_KEY=
+IGLOO_ACCESS_KEY=
+IGLOO_USER=
+IGLOO_PASS=
+
+# PGVector
+PGVECTOR_DRIVER="psycopg2"
+PGVECTOR_USER=
+PGVECTOR_PASS=
+PGVECTOR_DATABASE_NAME=
+PGVECTOR_URI="localhost"
+PGVECTOR_PORT="5432"

scripts/ingest_data.py

Whitespace-only changes.

vector_store/constants.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+import os
+import pathlib
+
+import torch
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# PATHS
+DIRECTORY_PATH = pathlib.Path.cwd()
+KNOWLEDGE_REPOSITORY_PATH = DIRECTORY_PATH / "knowledge"
+SOURCE_RESPOSITORY_PATH = KNOWLEDGE_REPOSITORY_PATH / "source"
+
+# INGEST
+DEVICE = (
+    "cuda"
+    if torch.cuda.is_available()
+    else ("mps" if torch.backends.mps.is_available() else "cpu")
+)
+
+# PGVECTOR
+PGVECTOR_USER = os.environ.get("PGVECTOR_USER")
+PGVECTOR_PASS = os.environ.get("PGVECTOR_PASS")
+PGVECTOR_DATABASE_NAME = os.environ.get("PGVECTOR_DATABASE_NAME")
+PGVECTOR_HOST = os.environ.get("PGVECTOR_URI", "localhost")
+PGVECTOR_PORT = int(os.environ.get("PGVECTOR_PORT", 5432))

vector_store/delete_knowledge.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+import logging
+import shutil
+
+from constants import KNOWLEDGE_REPOSITORY_PATH
+
+logger = logging.getLogger(__name__)
+
+
+def delete_knowledge():
+    """Delete everything in the knowledge folder."""
+    if KNOWLEDGE_REPOSITORY_PATH.exists():
+        logger.info(f"Deleting {KNOWLEDGE_REPOSITORY_PATH}")
+        shutil.rmtree(KNOWLEDGE_REPOSITORY_PATH)

vector_store/ingest_data.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+"""Data Ingestion"""
+
+import logging
+import pathlib
+from datetime import datetime
+
+import pandas as pd
+from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+from langchain_postgres import PGVector
+
+from constants import (
+    DEVICE,
+    DIRECTORY_PATH,
+    KNOWLEDGE_REPOSITORY_PATH,
+    PGVECTOR_DATABASE_NAME,
+    PGVECTOR_HOST,
+    PGVECTOR_PASS,
+    PGVECTOR_PORT,
+    PGVECTOR_USER,
+)
+from split import load_documents, split_document
+
+logger = logging.getLogger(__name__)
+
+
+def get_embedder(embedding_model_name: str) -> HuggingFaceEmbeddings:
+    """Initialize an embedder to convert text into vectors."""
+    return HuggingFaceEmbeddings(
+        model_name=embedding_model_name,
+        model_kwargs={"device": DEVICE},
+        show_progress=True,
+    )
+
+
+def ingest(
+    meta_lookup: dict[pathlib.Path, dict],
+    collection_name: str,
+    chunk_size: int,
+    chunk_overlap: int,
+    ingest_threads: int = 8,
+    embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+    mode: str = "overwrite",
+    collection_metadata: dict = {},
+):
+    """Load documents into a vectorstore."""
+    # Get documents
+    all_documents = []
+    origin_urls = {}
+    documents = load_documents(KNOWLEDGE_REPOSITORY_PATH, ingest_threads=ingest_threads)
+    for extension, document in documents:
+        # Split each document into chunks
+        document = document[0]
+        # Rename "source" to "_source" and save filename to "source"
+        source = pathlib.Path(document.metadata["source"])
+        file_name = source.stem
+        document.metadata["_source"] = document.metadata["source"]
+        document.metadata["source"] = file_name
+        chunks = split_document(
+            document, extension, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
+        # Attach metadata to each chunk
+        for chunk in chunks:
+            path_metadata = meta_lookup.get(source, {})
+            chunk.metadata = chunk.metadata | path_metadata
+        # Record how many chunks were made
+        rel_path = source.relative_to(KNOWLEDGE_REPOSITORY_PATH)
+        origin = rel_path.parts[0]
+        origin_url = (origin, chunk.metadata.get("url"))
+        origin_urls[origin_url] = len(chunks)
+        all_documents.extend(chunks)
+
+    # Create embeddings
+    embedder = get_embedder(embedding_model_name)
+
+    # Build the Postgres connection string
+    connection_string = PGVector.connection_string_from_db_params(
+        driver="psycopg",
+        host=PGVECTOR_HOST,
+        port=int(PGVECTOR_PORT),
+        database=PGVECTOR_DATABASE_NAME,
+        user=PGVECTOR_USER,
+        password=PGVECTOR_PASS,
+    )
+
+    # Connect to the db
+    db = PGVector(
+        connection=connection_string,
+        embeddings=embedder,
+        collection_name=collection_name,
+        collection_metadata=collection_metadata,
+        use_jsonb=True,
+    )
+
+    # Overwrite the collection (if requested)
+    if mode == "overwrite":
+        db.delete_collection()
+        logger.info(f"Collection {collection_name} deleted")
+        db.create_collection()
+        logger.info(f"Collection {collection_name} created")
+
+    # Load the documents
+    logger.info(
+        f"Loading {len(all_documents)} embeddings to {PGVECTOR_HOST} - {PGVECTOR_DATABASE_NAME} - {collection_name}"
+    )
+
+    # Add documents to the DB in batches to accommodate the large number of parameters
+    batch_size = 150
+    for i in range(0, len(all_documents), batch_size):
+        batch = all_documents[i:i + batch_size]
+        logger.info(f"Ingesting batch {i // batch_size + 1} of {len(batch)} documents")
+        db.add_documents(documents=batch)
+
+    logger.info(f"Successfully loaded {len(all_documents)} embeddings")
+
+    directory_source_url_chunks = [
+        list(origin_url) + [chunks] for origin_url, chunks in origin_urls.items()
+    ]
+    df = pd.DataFrame(directory_source_url_chunks, columns=["origin", "url", "chunks"])
+    filename = f"{PGVECTOR_HOST} - {collection_name} - {datetime.now()}.csv"
+    outpath = DIRECTORY_PATH / "logs" / filename
+    outpath.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(outpath, index=False)
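
Once ingest() has populated a collection, retrieval can reuse the same embedder and connection string. A minimal query sketch, assuming the same langchain_postgres API used above (the collection name, query text, and k are placeholders, not part of this commit):

# Hypothetical retrieval sketch, not part of this commit.
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_postgres import PGVector

from constants import (
    PGVECTOR_DATABASE_NAME,
    PGVECTOR_HOST,
    PGVECTOR_PASS,
    PGVECTOR_PORT,
    PGVECTOR_USER,
)

embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
connection_string = PGVector.connection_string_from_db_params(
    driver="psycopg",
    host=PGVECTOR_HOST,
    port=int(PGVECTOR_PORT),
    database=PGVECTOR_DATABASE_NAME,
    user=PGVECTOR_USER,
    password=PGVECTOR_PASS,
)
db = PGVector(
    connection=connection_string,
    embeddings=embedder,
    collection_name="source_collection",  # the id used in config/config.yaml
    use_jsonb=True,
)

# Return the k chunks whose embeddings are closest to the query embedding.
for doc in db.similarity_search("example query", k=4):
    print(doc.metadata.get("url"), doc.page_content[:80])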

vector_store/knowledge_source.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+# TODO (@abhikdps): Remove this file once the Igloo API keys
+# are acquired and rename the knowledge_source_igloo.py file to knowledge_source.py
+import pathlib
+import time
+import logging
+from typing import Any
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+from constants import SOURCE_RESPOSITORY_PATH
+
+logger = logging.getLogger(__name__)
+
+
+class SourceScraper:
+    def __init__(self, base_url: str = "https://source.redhat.com/"):
+        chrome_options = Options()
+        chrome_options.add_argument("--start-maximized")
+        self.driver = webdriver.Chrome(options=chrome_options)
+        self.base_url = base_url
+
+        self.driver.get(self.base_url)
+        print("\n Please log in manually and press ENTER here once done...")
+        input()
+        print(" Login confirmed. Proceeding with scraping.")
+
+    def fetch_all_pages(self, url_fragment: str, recursive: bool = False):
+        url = self.base_url.rstrip("/") + url_fragment
+        self.driver.get(url)
+        time.sleep(3)
+
+        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+        pages = [soup]
+
+        if recursive:
+            children_links = soup.select("a[href^='/']")
+            visited = set()
+
+            for link in children_links:
+                href = link.get("href")
+                full_url = self.base_url.rstrip("/") + href
+                if href and href.startswith("/") and full_url not in visited:
+                    visited.add(full_url)
+                    try:
+                        self.driver.get(full_url)
+                        time.sleep(2)
+                        sub_soup = BeautifulSoup(self.driver.page_source, "html.parser")
+                        pages.append(sub_soup)
+                    except Exception as e:
+                        logger.warning(f"Failed to visit {full_url}: {e}")
+
+        return pages
+
+    def extract_attachments(self, soup: BeautifulSoup):
+        attachments = []
+        links = soup.select("a")
+        for link in links:
+            href = link.get("href")
+            if href and any(ext in href for ext in [".pdf", ".docx", ".xlsx"]):
+                attachments.append(href)
+        return attachments
+
+    def save_page(self, soup: BeautifulSoup, path: pathlib.Path):
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(str(soup))
+
+    def download_attachments(self, attachments: list[str], base_path: pathlib.Path):
+        for link in attachments:
+            file_name = link.split("/")[-1]
+            full_path = base_path / file_name
+            try:
+                self.driver.get(
+                    link
+                    if link.startswith("http")
+                    else self.base_url.rstrip("/") + link
+                )
+                with open(full_path, "wb") as f:
+                    f.write(self.driver.page_source.encode("utf-8"))
+            except Exception as e:
+                logger.warning(f"Failed to download attachment {link}: {e}")
+
+    def scrape(
+        self,
+        url_fragment: str,
+        recursive: bool,
+        attachments: bool,
+        metadata: dict[str, Any],
+    ):
+        meta_lookup = {}
+        pages = self.fetch_all_pages(url_fragment, recursive)
+
+        for i, soup in enumerate(pages):
+            title = soup.title.string if soup.title else f"page_{i}"
+            safe_title = title.replace("/", "_").replace(" ", "_")[:50]
+            page_path = (
+                SOURCE_RESPOSITORY_PATH / url_fragment.strip("/") / f"{safe_title}.html"
+            )
+            page_path.parent.mkdir(parents=True, exist_ok=True)
+
+            self.save_page(soup, page_path)
+            file_metadata = metadata.copy()
+            file_metadata["url"] = self.base_url.rstrip("/") + url_fragment
+
+            if attachments:
+                attachment_links = self.extract_attachments(soup)
+                self.download_attachments(attachment_links, page_path.parent)
+
+            meta_lookup[page_path] = file_metadata
+
+        return meta_lookup
+
+
+def fetchall(
+    url_fragment: str,
+    recursive: bool = False,
+    attachments: bool = True,
+    metadata: dict = {},
+    **kwargs,
+):
+    scraper = SourceScraper()
+    return scraper.scrape(url_fragment, recursive, attachments, metadata)
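
scripts/ingest_data.py is still whitespace-only in this commit. A plausible driver would chain fetchall() from knowledge_source.py with ingest() from ingest_data.py for each collection in config/config.yaml; the sketch below only illustrates that wiring (PyYAML and the exact config keys are assumptions, not part of this commit):

# Hypothetical driver sketch, not part of this commit.
import logging

import yaml  # assumption: PyYAML is available

from ingest_data import ingest
from knowledge_source import fetchall

logging.basicConfig(level=logging.INFO)


def main(config_path: str = "config/config.yaml"):
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    for collection in config.get("collections", []):
        # Scrape every configured source and merge the per-file metadata.
        meta_lookup = {}
        for source in collection.get("sources", []):
            meta_lookup |= fetchall(
                url_fragment=source["url_fragment"],
                recursive=source.get("recursive", False),
                attachments=source.get("attachments", True),
                metadata=source.get("metadata", {}),
            )

        # Embed the scraped pages and load them into the PGVector collection.
        ingest(
            meta_lookup=meta_lookup,
            collection_name=collection["id"],
            chunk_size=collection["chunk_size"],
            chunk_overlap=collection["chunk_overlap"],
            ingest_threads=config.get("ingest_threads", 8),
            embedding_model_name=collection["embedding_model"],
            mode=collection.get("mode", "overwrite"),
            collection_metadata=collection.get("metadata", {}),
        )


if __name__ == "__main__":
    main()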
