
Commit 5576571

improved web text extractor (#128)
Signed-off-by: Gianluca Capuzzi <[email protected]>
1 parent: 50498aa

2 files changed (+41, -4 lines)


src/mvt/ingest_process.py

Lines changed: 4 additions & 3 deletions
@@ -2,7 +2,7 @@
 import time
 from os.path import isfile, join
 from os import listdir
-from utils import load_yaml_file, bs4_extract_linear_text, extract_video_id, save_transcript
+from utils import load_yaml_file, bs4_extract_linear_text, extract_video_id, save_transcript, bs4_lxml_improved
 from dotenv import load_dotenv, find_dotenv
 from langchain_community.vectorstores import FAISS
 from langchain_mistralai.embeddings import MistralAIEmbeddings
@@ -125,8 +125,9 @@ def process_web_urls_with_retry(dataset_dir, config_data, max_retries=3, retry_d
 
                 loader = RecursiveUrlLoader(
                     url=url,
-                    extractor=bs4_extract_linear_text,
-                    prevent_outside=True
+                    extractor=bs4_lxml_improved,
+                    prevent_outside=True,
+                    max_depth=1
                 )
                 # Test the loader by trying to load one document
                 test_docs = loader.load()
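For context, a minimal sketch of how the reworked loader call is meant to be used, assuming the standard langchain_community RecursiveUrlLoader API; the URL below is a hypothetical placeholder, while the extractor, prevent_outside and max_depth arguments mirror the diff above.

from langchain_community.document_loaders import RecursiveUrlLoader
from utils import bs4_lxml_improved  # extractor added by this commit

loader = RecursiveUrlLoader(
    url="https://example.com/docs/",  # placeholder; real URLs come from the config
    extractor=bs4_lxml_improved,      # turn each fetched page into cleaned plain text
    prevent_outside=True,             # do not follow links outside the start URL
    max_depth=1,                      # keep the crawl shallow: only the start page is fetched
)
docs = loader.load()
print(docs[0].page_content[:200])     # peek at the extracted text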

src/mvt/utils.py

Lines changed: 37 additions & 1 deletion
@@ -1,5 +1,5 @@
 import yaml
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString, Tag
 import re
 from urllib.parse import urlparse, parse_qs
 import os
@@ -192,6 +192,42 @@ def bs4_lxml(html: str) -> str:
     soup = BeautifulSoup(html, "lxml")
     return re.sub(r"\n\n+", "\n\n", soup.text).strip()
 
+# Tags that we want to skip in the final text extraction
+SKIP_TAGS = {"script", "style", "noscript", "header", "footer", "nav", "form"}
+
+# Regex to compress spaces and empty lines
+SPACE_RE = re.compile(r"[ \t]+")
+BLANK_RE = re.compile(r"\n{3,}")
+
+def visible_text_nodes(el: Tag | NavigableString):
+    """
+    Returns only visible text nodes (filters script/style, comments, etc.)
+    """
+    if isinstance(el, NavigableString):
+        # Avoid comments and extra spaces
+        if el.parent.name not in SKIP_TAGS:
+            txt = el.strip()
+            if txt:
+                yield txt
+    elif el.name not in SKIP_TAGS:
+        for child in el.contents:
+            yield from visible_text_nodes(child)
+
+def bs4_lxml_improved(html: str) -> str:
+    # Convert HTML to string
+    soup = BeautifulSoup(html, "lxml")
+
+    # Removes unwanted elements completely (optional but useful)
+    for tag in soup.find_all(SKIP_TAGS):
+        tag.decompose()
+
+    raw_lines = list(visible_text_nodes(soup.body or soup))
+    joined = "\n".join(raw_lines)
+
+    # Normalizes spaces and empty lines
+    joined = SPACE_RE.sub(" ", joined)
+    joined = BLANK_RE.sub("\n\n", joined.strip())
+    return joined + "\n"
 
 
 def convert_youtube_short_to_full(short_url):
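As a quick sanity check (not part of the commit; the snippet below is made up), feeding the new extractor a small HTML page shows the skip tags being dropped and whitespace collapsed.

from utils import bs4_lxml_improved  # assumes src/mvt is on the import path

html = """
<html>
  <head><style>body { color: red; }</style></head>
  <body>
    <nav>Home | About</nav>
    <h1>Title</h1>
    <p>Some    body     text.</p>
    <script>console.log("ignored");</script>
  </body>
</html>
"""

print(bs4_lxml_improved(html))
# Prints "Title" and "Some body text." on separate lines: the <style>, <nav> and
# <script> contents are removed and runs of spaces collapse to a single space.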
