|
| 1 | +from typing import Dict, List, Optional |
| 2 | +from dataclasses import dataclass, field |
| 3 | +from bs4 import BeautifulSoup, Tag, NavigableString |
| 4 | + |
| 5 | + |
@dataclass
class Node:
    """One node of the parsed document tree.

    A node carries a tag name, an attribute mapping, an optional text
    payload, an ordered list of child nodes and a mapping of computed
    style values. ``children`` and ``computed_style`` default to fresh
    empty containers for every instance.
    """

    tag: str
    attrs: Dict[str, str]
    text: str = ""
    children: List["Node"] = field(default_factory=list)
    computed_style: Dict[str, str] = field(default_factory=dict)

    def __repr__(self) -> str:
        """Return a compact one-line summary of this node.

        Children are listed by tag name only (not expanded recursively) and
        the text is cut to its first 30 characters, which keeps the output
        short even for large subtrees.
        """
        text_preview = self.text[:30]
        tags_of_children = [kid.tag for kid in self.children]
        return (
            f"Node(tag={self.tag}, attrs={self.attrs}, "
            f"text='{text_preview}', children={tags_of_children}, "
            f"computed_style={self.computed_style})"
        )
| 19 | + |
| 20 | + |
class HTMLParser:
    """Convert raw HTML into a tree of Node objects."""

    @staticmethod
    def bs4_to_node(element: Tag) -> Node:
        """Recursively convert a BeautifulSoup Tag into a Node.

        Args:
            element: The BeautifulSoup tag (or the soup object itself) to
                convert.

        Returns:
            A Node mirroring ``element``. Child tags become nested Nodes and
            every non-blank text run becomes a ``"_text"`` leaf holding the
            whitespace-stripped text. Comments, doctypes, declarations and
            processing instructions are dropped.

        NOTE(review): attribute values are kept as BeautifulSoup produced
        them; multi-valued attributes such as ``class`` appear to arrive as
        lists, which does not match the ``Dict[str, str]`` annotation on
        ``Node.attrs`` -- confirm what consumers expect before normalizing.
        """
        # Local import: keeps this method self-contained without touching
        # the module-level import block.
        from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

        markup_only = (Comment, Declaration, Doctype, ProcessingInstruction)

        # Copy the attribute mapping (and any list values) so that mutating a
        # Node never writes through to the BeautifulSoup tree.
        attrs_copy = {
            key: list(value) if isinstance(value, list) else value
            for key, value in element.attrs.items()
        }
        node = Node(tag=element.name or "text", attrs=attrs_copy)

        for child in element.children:
            if isinstance(child, markup_only):
                # These classes subclass NavigableString, so without this
                # guard an HTML comment or doctype would become a text node.
                continue
            if isinstance(child, NavigableString):
                text = str(child).strip()
                if text:
                    node.children.append(Node(tag="_text", text=text, attrs={}))
            elif isinstance(child, Tag):
                node.children.append(HTMLParser.bs4_to_node(child))
        return node

    @staticmethod
    def parse_html(html: str) -> Node:
        """Parse raw HTML into our Node tree.

        Args:
            html: The HTML source text.

        Returns:
            The Node for the document's ``<html>`` element, or for the whole
            parsed soup when the input has no ``<html>`` element.
        """
        soup = BeautifulSoup(html, "html.parser")
        root_elem = soup.find("html") or soup
        return HTMLParser.bs4_to_node(root_elem)
| 44 | + |
| 45 | + |
| 46 | + |
"""
Example usage (manual smoke test; this string is never executed):

html_code = '''
<html>
    <body>
        <h1 class="hero">Hello</h1>
        <p id="greeting">Welcome to <b>your</b> browser</p>
    </body>
</html>
'''

root = HTMLParser.parse_html(html_code)

def print_tree(node: Node, depth=0):
    indent = " " * depth
    print(f"{indent}{node.tag} -> text='{node.text}' attrs={node.attrs}")
    for child in node.children:
        print_tree(child, depth + 1)

print_tree(root)

"""
0 commit comments