Skip to content

Commit c9a89b7

Browse files
committed
Parsing
1 parent 9ef480b commit c9a89b7

1 file changed

Lines changed: 69 additions & 0 deletions

File tree

src/Parser/HTMLParser.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from dataclasses import dataclass, field
from typing import Dict, List, Optional

from bs4 import (
    BeautifulSoup,
    Comment,
    Declaration,
    Doctype,
    NavigableString,
    ProcessingInstruction,
    Tag,
)
4+
5+
6+
@dataclass
class Node:
    """One node of the document tree produced by the HTML parser.

    Attributes:
        tag: Element name; text content is stored under the tag ``"_text"``.
        attrs: Attribute mapping of the element (empty for text nodes).
        text: Text carried by the node; empty string by default.
        children: Child nodes, in the order they were appended.
        computed_style: Style properties for the node; starts out empty.
            NOTE(review): nothing visible here fills it in -- presumably a
            later styling pass does; confirm against the caller.
    """

    tag: str
    attrs: Dict[str, str]
    text: str = ""
    # default_factory gives every instance its own list/dict instead of a
    # single shared mutable default.
    children: List["Node"] = field(default_factory=list)
    computed_style: Dict[str, str] = field(default_factory=dict)

    def __repr__(self) -> str:
        """Return a compact one-line summary of the node.

        Children are listed by tag only and ``text`` is cut to its first
        30 characters, so printing a node does not dump the whole subtree.
        """
        child_tags = [child.tag for child in self.children]
        return (
            f"Node(tag={self.tag}, attrs={self.attrs}, "
            f"text='{self.text[:30]}', children={child_tags}, "
            f"computed_style={self.computed_style})"
        )
19+
20+
21+
class HTMLParser:
    """Convert raw HTML into a tree of Node objects."""

    # NavigableString subclasses that are markup rather than page text.
    # They must not be turned into "_text" nodes.
    _NON_CONTENT_STRINGS = (Comment, Declaration, Doctype, ProcessingInstruction)

    @staticmethod
    def bs4_to_node(element: Tag) -> Node:
        """Recursively convert a BeautifulSoup ``Tag`` into a ``Node`` tree.

        Args:
            element: The tag (or whole ``BeautifulSoup`` document) to convert.

        Returns:
            A ``Node`` for ``element`` whose children mirror the element's
            child tags and non-empty text strings, in document order.

        Notes:
            * Text is stripped and whitespace-only strings are dropped, so
              spacing between inline siblings is not preserved.
            * Comments, doctypes, declarations and processing instructions
              are skipped instead of being emitted as text.
            * Attribute values are kept as BeautifulSoup produced them; only
              the containers are copied.
        """
        # Copy the mapping (and any list values) so that editing the Node
        # tree can never mutate the underlying soup, and vice versa.
        attrs = {
            key: list(value) if isinstance(value, list) else value
            for key, value in element.attrs.items()
        }
        node = Node(tag=element.name or "text", attrs=attrs)

        for child in element.children:
            if isinstance(child, HTMLParser._NON_CONTENT_STRINGS):
                # These are NavigableString subclasses too, so they have to
                # be filtered out before the generic string branch below.
                continue
            if isinstance(child, NavigableString):
                text = str(child).strip()
                if text:
                    node.children.append(Node(tag="_text", text=text, attrs={}))
            elif isinstance(child, Tag):
                node.children.append(HTMLParser.bs4_to_node(child))
        return node

    @staticmethod
    def parse_html(html: str) -> Node:
        """Parse raw HTML into our Node tree.

        Args:
            html: HTML source text.

        Returns:
            The ``Node`` for the ``<html>`` element. If the input has no
            ``<html>`` element, the whole parsed document is converted
            instead and becomes the root.
        """
        soup = BeautifulSoup(html, "html.parser")
        root_elem = soup.find("html")
        if root_elem is None:
            # Fragment without an <html> wrapper: use the document itself.
            root_elem = soup
        return HTMLParser.bs4_to_node(root_elem)
44+
45+
46+
47+
"""
tests:

html_code = '''
<html>
  <body>
    <h1 class="hero">Hello</h1>
    <p id="greeting">Welcome to <b>your</b> browser</p>
  </body>
</html>
'''

root = HTMLParser.parse_html(html_code)

def print_tree(node: Node, depth=0):
    indent = " " * depth
    print(f"{indent}{node.tag} -> text='{node.text}' attrs={node.attrs}")
    for child in node.children:
        print_tree(child, depth + 1)

print_tree(root)

"""

0 commit comments

Comments
 (0)