
Commit 5576571

improved web text extractor (#128)
Signed-off-by: Gianluca Capuzzi <[email protected]>
1 parent: 50498aa

2 files changed (+41, -4 lines)


src/mvt/ingest_process.py

Lines changed: 4 additions & 3 deletions
@@ -2,7 +2,7 @@
 import time
 from os.path import isfile, join
 from os import listdir
-from utils import load_yaml_file, bs4_extract_linear_text, extract_video_id, save_transcript
+from utils import load_yaml_file, bs4_extract_linear_text, extract_video_id, save_transcript, bs4_lxml_improved
 from dotenv import load_dotenv, find_dotenv
 from langchain_community.vectorstores import FAISS
 from langchain_mistralai.embeddings import MistralAIEmbeddings
@@ -125,8 +125,9 @@ def process_web_urls_with_retry(dataset_dir, config_data, max_retries=3, retry_d
 
                 loader = RecursiveUrlLoader(
                     url=url,
-                    extractor=bs4_extract_linear_text,
-                    prevent_outside=True
+                    extractor=bs4_lxml_improved,
+                    prevent_outside=True,
+                    max_depth=1
                 )
                 # Test the loader by trying to load one document
                 test_docs = loader.load()
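For context, a minimal sketch of how the reworked loader call is meant to be used, assuming the standard langchain_community RecursiveUrlLoader API; the URL below is a hypothetical placeholder, while the extractor, prevent_outside and max_depth arguments mirror the diff above.

from langchain_community.document_loaders import RecursiveUrlLoader
from utils import bs4_lxml_improved  # extractor added by this commit

loader = RecursiveUrlLoader(
    url="https://example.com/docs/",  # placeholder; real URLs come from the config
    extractor=bs4_lxml_improved,      # turn each fetched page into cleaned plain text
    prevent_outside=True,             # do not follow links outside the start URL
    max_depth=1,                      # keep the crawl shallow: only the start page is fetched
)
docs = loader.load()
print(docs[0].page_content[:200])     # peek at the extracted text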

src/mvt/utils.py

Lines changed: 37 additions & 1 deletion
@@ -1,5 +1,5 @@
 import yaml
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString, Tag
 import re
 from urllib.parse import urlparse, parse_qs
 import os
@@ -192,6 +192,42 @@ def bs4_lxml(html: str) -> str:
     soup = BeautifulSoup(html, "lxml")
     return re.sub(r"\n\n+", "\n\n", soup.text).strip()
 
+# Tags that we want to skip in the final text extraction
+SKIP_TAGS = {"script", "style", "noscript", "header", "footer", "nav", "form"}
+
+# Regex to compress spaces and empty lines
+SPACE_RE = re.compile(r"[ \t]+")
+BLANK_RE = re.compile(r"\n{3,}")
+
+def visible_text_nodes(el: Tag | NavigableString):
+    """
+    Returns only visible text nodes (filters script/style, comments, etc.)
+    """
+    if isinstance(el, NavigableString):
+        # Avoid comments and extra spaces
+        if el.parent.name not in SKIP_TAGS:
+            txt = el.strip()
+            if txt:
+                yield txt
+    elif el.name not in SKIP_TAGS:
+        for child in el.contents:
+            yield from visible_text_nodes(child)
+
+def bs4_lxml_improved(html: str) -> str:
+    # Convert HTML to string
+    soup = BeautifulSoup(html, "lxml")
+
+    # Removes unwanted elements completely (optional but useful)
+    for tag in soup.find_all(SKIP_TAGS):
+        tag.decompose()
+
+    raw_lines = list(visible_text_nodes(soup.body or soup))
+    joined = "\n".join(raw_lines)
+
+    # Normalizes spaces and empty lines
+    joined = SPACE_RE.sub(" ", joined)
+    joined = BLANK_RE.sub("\n\n", joined.strip())
+    return joined + "\n"
 
 
 def convert_youtube_short_to_full(short_url):
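As a quick sanity check (not part of the commit; the snippet below is made up), feeding the new extractor a small HTML page shows the skip tags being dropped and whitespace collapsed.

from utils import bs4_lxml_improved  # assumes src/mvt is on the import path

html = """
<html>
  <head><style>body { color: red; }</style></head>
  <body>
    <nav>Home | About</nav>
    <h1>Title</h1>
    <p>Some    body     text.</p>
    <script>console.log("ignored");</script>
  </body>
</html>
"""

print(bs4_lxml_improved(html))
# Prints "Title" and "Some body text." on separate lines: the <style>, <nav> and
# <script> contents are removed and runs of spaces collapse to a single space.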
