7 changes: 4 additions & 3 deletions src/mvt/ingest_process.py
@@ -2,7 +2,7 @@
 import time
 from os.path import isfile, join
 from os import listdir
-from utils import load_yaml_file, bs4_extract_linear_text, extract_video_id, save_transcript
+from utils import load_yaml_file, bs4_extract_linear_text, extract_video_id, save_transcript, bs4_lxml_improved
 from dotenv import load_dotenv, find_dotenv
 from langchain_community.vectorstores import FAISS
 from langchain_mistralai.embeddings import MistralAIEmbeddings
@@ -125,8 +125,9 @@ def process_web_urls_with_retry(dataset_dir, config_data, max_retries=3, retry_d
 
         loader = RecursiveUrlLoader(
             url=url,
-            extractor=bs4_extract_linear_text,
-            prevent_outside=True
+            extractor=bs4_lxml_improved,
+            prevent_outside=True,
+            max_depth=1
         )
         # Test the loader by trying to load one document
         test_docs = loader.load()
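For reference, a minimal standalone sketch of what the reconfigured loader does (the URL below is a placeholder, and RecursiveUrlLoader is assumed to come from langchain_community.document_loaders, matching the langchain_community imports already used in this file): with prevent_outside=True and the new max_depth=1, only the seed page itself should be fetched and run through bs4_lxml_improved.

from langchain_community.document_loaders import RecursiveUrlLoader
from utils import bs4_lxml_improved

# Placeholder URL, for illustration only
loader = RecursiveUrlLoader(
    url="https://example.com/docs/",
    extractor=bs4_lxml_improved,
    prevent_outside=True,
    max_depth=1,
)

docs = loader.load()
print(len(docs))
if docs:
    print(docs[0].page_content[:200])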
38 changes: 37 additions & 1 deletion src/mvt/utils.py
@@ -1,5 +1,5 @@
 import yaml
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString, Tag
 import re
 from urllib.parse import urlparse, parse_qs
 import os
@@ -192,6 +192,42 @@ def bs4_lxml(html: str) -> str:
     soup = BeautifulSoup(html, "lxml")
     return re.sub(r"\n\n+", "\n\n", soup.text).strip()
 
+# Tags that we want to skip in the final text extraction
+SKIP_TAGS = {"script", "style", "noscript", "header", "footer", "nav", "form"}
+
+# Regexes to compress runs of spaces and extra blank lines
+SPACE_RE = re.compile(r"[ \t]+")
+BLANK_RE = re.compile(r"\n{3,}")
+
+def visible_text_nodes(el: Tag | NavigableString):
+    """
+    Yields only visible text nodes (skips text inside script/style and the other SKIP_TAGS).
+    """
+    if isinstance(el, NavigableString):
+        # Skip whitespace-only strings and text inside skipped tags
+        if el.parent.name not in SKIP_TAGS:
+            txt = el.strip()
+            if txt:
+                yield txt
+    elif el.name not in SKIP_TAGS:
+        for child in el.contents:
+            yield from visible_text_nodes(child)
+
+def bs4_lxml_improved(html: str) -> str:
+    # Parse the HTML with the lxml parser
+    soup = BeautifulSoup(html, "lxml")
+
+    # Remove unwanted elements completely so their text never reaches the output
+    for tag in soup.find_all(SKIP_TAGS):
+        tag.decompose()
+
+    raw_lines = list(visible_text_nodes(soup.body or soup))
+    joined = "\n".join(raw_lines)
+
+    # Normalize spaces and empty lines
+    joined = SPACE_RE.sub(" ", joined)
+    joined = BLANK_RE.sub("\n\n", joined.strip())
+    return joined + "\n"
 
 
 def convert_youtube_short_to_full(short_url):
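As a rough sanity check of the new extractor (a sketch: the HTML fragment below is made up for illustration, and src/mvt/utils.py is assumed to be importable as utils), script, nav, and footer text should be dropped and whitespace normalized:

from utils import bs4_lxml_improved

sample_html = """
<html>
  <head><script>var tracking = 1;</script></head>
  <body>
    <nav>Home | About</nav>
    <h1>Title</h1>
    <p>First     paragraph.</p>
    <p>Second paragraph.</p>
    <footer>(c) example</footer>
  </body>
</html>
"""

print(bs4_lxml_improved(sample_html))
# Expected output, roughly:
# Title
# First paragraph.
# Second paragraph.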