Skip to content

Commit cf0011b

Browse files
Feat: Upgrade html parser (infiniflow#9675)
### What problem does this PR solve? Parse more HTML content. ### Type of change - [x] Other (please describe):
1 parent 1f47001 commit cf0011b

File tree

2 files changed

+179
-13
lines changed

2 files changed

+179
-13
lines changed

deepdoc/parser/html_parser.py

Lines changed: 177 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,35 +15,200 @@
1515
# limitations under the License.
1616
#
1717

18-
from rag.nlp import find_codec
19-
import readability
20-
import html_text
18+
from rag.nlp import find_codec, rag_tokenizer
19+
import uuid
2120
import chardet
22-
21+
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
22+
import html
2323

2424
def get_encoding(file):
    """Detect the character encoding of *file* by sniffing its raw bytes."""
    with open(file, "rb") as fh:
        raw = fh.read()
    detection = chardet.detect(raw)
    return detection["encoding"]
2828

29+
# Tags that open a new logical text block when walking the DOM; text under
# one of these is grouped under a shared block_id by read_text_recursively.
BLOCK_TAGS = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p", "div", "article", "section", "aside",
    "ul", "ol", "li",
    "table", "pre", "code", "blockquote",
    "figure", "figcaption"
]
# Heading tag -> Markdown heading prefix.
# BUGFIX: "h4" previously mapped to five '#' characters (same as "h5")
# instead of four, producing the wrong Markdown heading level.
TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"}
37+
2938

3039
class RAGFlowHtmlParser:
    """Parse HTML into text sections suitable for chunked indexing.

    Pipeline: strip non-content markup (scripts, styles, inline style
    attributes, comments), walk the DOM collecting text fragments per
    block-level element, merge the fragments back into block strings,
    then split those blocks into chunks of at most ``chunk_token_num``
    tokens. Tables are kept intact and appended as their own sections.
    """

    def __call__(self, fnm, binary=None, chunk_token_num=None):
        """Read HTML from *binary* (preferred) or from file *fnm* and parse it.

        :param fnm: path to an HTML file (used when *binary* is falsy).
        :param binary: raw bytes of the document; codec auto-detected.
        :param chunk_token_num: max tokens per returned chunk
            (``None`` falls back to a default of 512 in ``parser_txt``).
        :return: list of text sections.
        """
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(fnm, "r", encoding=get_encoding(fnm)) as f:
                txt = f.read()
        return self.parser_txt(txt, chunk_token_num)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num):
        """Turn an HTML string into a list of chunked text sections.

        :param txt: HTML document as a string.
        :param chunk_token_num: token budget per chunk; ``None``/0 -> 512.
        :raises TypeError: if *txt* is not a string.
        :return: list of text sections (chunks first, then whole tables).
        """
        if not isinstance(txt, str):
            raise TypeError("txt type should be string!")
        # BUGFIX: __call__ defaults chunk_token_num to None, which previously
        # propagated into chunk_block() and crashed on `... > None`.
        chunk_token_num = chunk_token_num or 512

        temp_sections = []
        soup = BeautifulSoup(txt, "html5lib")
        # Delete <style> and <script> tags anywhere in the document.
        for style_tag in soup.find_all(["style", "script"]):
            style_tag.decompose()
        # Delete <script> tags nested in <div> (defensive; also covered above).
        for div_tag in soup.find_all("div"):
            for script_tag in div_tag.find_all("script"):
                script_tag.decompose()
        # Delete inline style attributes.
        for tag in soup.find_all(True):
            if 'style' in tag.attrs:
                del tag.attrs['style']
        # Delete HTML comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
        block_txt_list, table_list = cls.merge_block_text(temp_sections)
        sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
        # Tables are appended verbatim as their own sections.
        for table in table_list:
            sections.append(table.get("content", ""))
        return sections

    @classmethod
    def split_table(cls, html_table, chunk_token_num=512):
        """Split an HTML table into several ``<table>`` strings, each holding
        at most ``chunk_token_num`` tokens worth of rows.

        :param html_table: HTML markup containing ``<tr>`` rows.
        :param chunk_token_num: token budget per emitted table.
        :return: list of HTML ``<table>`` strings.
        """
        soup = BeautifulSoup(html_table, "html.parser")
        rows = soup.find_all("tr")
        tables = []
        current_table = []
        current_count = 0
        table_str_list = []
        for row in rows:
            tks_str = rag_tokenizer.tokenize(str(row))
            token_count = len(tks_str.split(" ")) if tks_str else 0
            # Start a new table once the budget would be exceeded.
            if current_count + token_count > chunk_token_num:
                tables.append(current_table)
                current_table = []
                current_count = 0
            current_table.append(row)
            current_count += token_count
        if current_table:
            tables.append(current_table)

        for table_rows in tables:
            new_table = soup.new_tag("table")
            for row in table_rows:
                new_table.append(row)
            table_str_list.append(str(new_table))

        return table_str_list

    @classmethod
    def read_text_recursively(cls, element, parser_result, chunk_token_num=512, parent_name=None, block_id=None):
        """Depth-first walk of the DOM collecting text fragments.

        Appends dicts of the form
        ``{"content": str, "tag_name": str, "metadata": {...}}`` to
        *parser_result*. Tables are captured whole (HTML-unescaped); text
        under a block tag shares that block's ``block_id`` so it can be
        re-merged later.

        :return: fragment dicts produced directly by this node (children
                 append their own results to *parser_result*).
        """
        if isinstance(element, NavigableString):
            content = element.strip()

            def is_valid_html(content):
                # True when the string itself parses into at least one tag,
                # i.e. the text node contains embedded/escaped HTML.
                try:
                    soup = BeautifulSoup(content, "html.parser")
                    return bool(soup.find())
                except Exception:
                    return False

            return_info = []
            if content:
                if is_valid_html(content):
                    # Re-parse embedded HTML found inside a text node.
                    soup = BeautifulSoup(content, "html.parser")
                    child_info = cls.read_text_recursively(soup, parser_result, chunk_token_num, element.name, block_id)
                    parser_result.extend(child_info)
                else:
                    info = {"content": element.strip(), "tag_name": "inner_text", "metadata": {"block_id": block_id}}
                    if parent_name:
                        info["tag_name"] = parent_name
                    return_info.append(info)
            return return_info
        elif isinstance(element, Tag):
            if str.lower(element.name) == "table":
                # Keep the table whole; every piece shares one table_id.
                table_info_list = []
                table_id = str(uuid.uuid1())
                table_list = [html.unescape(str(element))]
                # BUGFIX: use enumerate() instead of table_list.index(t) —
                # index() returns the first match, which is wrong when two
                # pieces have identical content (and is O(n) per lookup).
                for idx, t in enumerate(table_list):
                    table_info_list.append({"content": t, "tag_name": "table",
                                            "metadata": {"table_id": table_id, "index": idx}})
                return table_info_list
            else:
                block_id = None
                if str.lower(element.name) in BLOCK_TAGS:
                    # Open a new block scope: all descendants share this id.
                    block_id = str(uuid.uuid1())
                for child in element.children:
                    child_info = cls.read_text_recursively(child, parser_result, chunk_token_num, element.name,
                                                           block_id)
                    parser_result.extend(child_info)
                return []

    @classmethod
    def merge_block_text(cls, parser_result):
        """Merge per-node text fragments back into block-level strings.

        Fragments sharing a ``block_id`` are space-joined into one block;
        heading fragments get their Markdown prefix. Table fragments are
        returned separately, untouched.

        :param parser_result: fragment dicts from ``read_text_recursively``.
        :return: ``(block_strings, table_fragment_dicts)``.
        """
        block_content = []
        current_content = ""
        table_info_list = []
        last_block_id = None
        for item in parser_result:
            content = item.get("content")
            tag_name = item.get("tag_name")
            title_flag = tag_name in TITLE_TAGS
            block_id = item.get("metadata", {}).get("block_id")
            if block_id:
                if title_flag:
                    content = f"{TITLE_TAGS[tag_name]} {content}"
                if last_block_id != block_id:
                    # A new block starts: flush the previous one first.
                    if last_block_id is not None:
                        block_content.append(current_content)
                    current_content = content
                    last_block_id = block_id
                else:
                    current_content += (" " if current_content else "") + content
            else:
                if tag_name == "table":
                    table_info_list.append(item)
                else:
                    # BUGFIX: original read `(" " if cc else "" + content)` —
                    # the misplaced parenthesis appended only a space and
                    # silently dropped `content` whenever current_content
                    # was non-empty.
                    current_content += (" " if current_content else "") + content
        if current_content:
            block_content.append(current_content)
        return block_content, table_info_list

    @classmethod
    def chunk_block(cls, block_txt_list, chunk_token_num=512):
        """Pack block strings into chunks of at most ``chunk_token_num`` tokens.

        Blocks that individually exceed the budget are hard-split on token
        boundaries; smaller blocks are greedily packed together, joined by
        newlines.

        :param block_txt_list: block strings from ``merge_block_text``.
        :param chunk_token_num: token budget per chunk.
        :return: list of chunk strings.
        """
        chunks = []
        current_block = ""
        current_token_count = 0

        for block in block_txt_list:
            tks_str = rag_tokenizer.tokenize(block)
            block_token_count = len(tks_str.split(" ")) if tks_str else 0
            if block_token_count > chunk_token_num:
                # Oversized block: flush the accumulator, then hard-split.
                if current_block:
                    chunks.append(current_block)
                start = 0
                tokens = tks_str.split(" ")
                while start < len(tokens):
                    end = start + chunk_token_num
                    split_tokens = tokens[start:end]
                    chunks.append(" ".join(split_tokens))
                    start = end
                current_block = ""
                current_token_count = 0
            else:
                if current_token_count + block_token_count <= chunk_token_num:
                    current_block += ("\n" if current_block else "") + block
                    current_token_count += block_token_count
                else:
                    chunks.append(current_block)
                    current_block = block
                    current_token_count = block_token_count

        if current_block:
            chunks.append(current_block)

        return chunks

rag/app/naive.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
517517

518518
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
519519
callback(0.1, "Start to parse.")
520-
sections = HtmlParser()(filename, binary)
520+
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
521+
sections = HtmlParser()(filename, binary, chunk_token_num)
521522
sections = [(_, "") for _ in sections if _]
522523
callback(0.8, "Finish parsing.")
523524

0 commit comments

Comments
 (0)