|
1 | 1 | import yaml |
2 | | -from bs4 import BeautifulSoup |
| 2 | +from bs4 import BeautifulSoup, NavigableString, Tag |
3 | 3 | import re |
4 | 4 | from urllib.parse import urlparse, parse_qs |
5 | 5 | import os |
@@ -192,6 +192,42 @@ def bs4_lxml(html: str) -> str: |
192 | 192 | soup = BeautifulSoup(html, "lxml") |
193 | 193 | return re.sub(r"\n\n+", "\n\n", soup.text).strip() |
194 | 194 |
|
# Tags whose text content should never appear in the extracted output
# (scripts, styling, and page chrome such as navigation and forms).
SKIP_TAGS = {"script", "style", "noscript", "header", "footer", "nav", "form"}

# Collapses runs of spaces/tabs into a single space.
SPACE_RE = re.compile(r"[ \t]+")
# Matches 3+ consecutive newlines, to cap blank lines at one.
BLANK_RE = re.compile(r"\n{3,}")
| 201 | + |
def visible_text_nodes(el: Tag | NavigableString):
    """Yield the visible text fragments beneath *el*, depth-first.

    Text inside any tag in SKIP_TAGS is skipped, and non-text strings
    (comments, CDATA sections, doctypes) are ignored.

    Note: Comment/CData/Doctype are *subclasses* of NavigableString, so a
    plain isinstance() check would wrongly yield them as visible text —
    hence the exact type() comparison below.
    """
    if isinstance(el, NavigableString):
        # Only exact NavigableString instances are real visible text.
        if type(el) is not NavigableString:
            return
        parent = el.parent
        # A detached string has no parent; treat it as visible.
        if parent is None or parent.name not in SKIP_TAGS:
            text = el.strip()
            if text:
                yield text
    elif el.name not in SKIP_TAGS:
        for child in el.contents:
            yield from visible_text_nodes(child)
| 215 | + |
def bs4_lxml_improved(html: str) -> str:
    """Extract readable text from *html* with normalized whitespace.

    Parses the markup with lxml, removes non-content elements
    (SKIP_TAGS) entirely, joins the remaining visible text nodes one
    per line, squeezes horizontal whitespace, and caps runs of blank
    lines at a single empty line. The result ends with one newline.
    """
    soup = BeautifulSoup(html, "lxml")

    # Drop unwanted elements outright so none of their text survives.
    for unwanted in soup.find_all(SKIP_TAGS):
        unwanted.decompose()

    root = soup.body or soup
    text = "\n".join(visible_text_nodes(root))

    # Normalize spaces first, then collapse excess blank lines.
    text = SPACE_RE.sub(" ", text)
    text = BLANK_RE.sub("\n\n", text.strip())
    return text + "\n"
195 | 231 |
|
196 | 232 |
|
197 | 233 | def convert_youtube_short_to_full(short_url): |
|
0 commit comments