|
15 | 15 | # limitations under the License. |
16 | 16 | # |
17 | 17 |
|
18 | | -from rag.nlp import find_codec |
19 | | -import readability |
20 | | -import html_text |
| 18 | +from rag.nlp import find_codec, rag_tokenizer |
| 19 | +import uuid |
21 | 20 | import chardet |
22 | | - |
| 21 | +from bs4 import BeautifulSoup, NavigableString, Tag, Comment |
| 22 | +import html |
23 | 23 |
|
def get_encoding(file):
    """Detect and return the character encoding of *file* via chardet."""
    with open(file, 'rb') as fp:
        raw = fp.read()
    return chardet.detect(raw)['encoding']
|
# Tags treated as logical block boundaries when walking the DOM; each gets
# its own block id so its text fragments can be merged back together.
BLOCK_TAGS = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p", "div", "article", "section", "aside",
    "ul", "ol", "li",
    "table", "pre", "code", "blockquote",
    "figure", "figcaption"
]
# Markdown prefix for each heading level ("hN" -> N hash marks).
# Fix: "h4" previously mapped to "#####" (five hashes), colliding with h5.
TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"}
| 37 | + |
29 | 38 |
|
class RAGFlowHtmlParser:
    """Parse an HTML document into text sections sized for RAG chunking."""

    def __call__(self, fnm, binary=None, chunk_token_num=None):
        """Load HTML from *binary* (preferred) or from the file *fnm*, then chunk it.

        :param fnm: path to an HTML file; read only when *binary* is falsy.
        :param binary: raw document bytes, decoded with a codec from find_codec.
        :param chunk_token_num: per-chunk token budget, forwarded to parser_txt.
        :return: list of text sections.
        """
        if binary:
            codec = find_codec(binary)
            txt = binary.decode(codec, errors="ignore")
        else:
            with open(fnm, "r", encoding=get_encoding(fnm)) as reader:
                txt = reader.read()
        return self.parser_txt(txt, chunk_token_num)
39 | 48 |
|
40 | 49 | @classmethod |
41 | | - def parser_txt(cls, txt): |
| 50 | + def parser_txt(cls, txt, chunk_token_num): |
42 | 51 | if not isinstance(txt, str): |
43 | 52 | raise TypeError("txt type should be string!") |
44 | | - html_doc = readability.Document(txt) |
45 | | - title = html_doc.title() |
46 | | - content = html_text.extract_text(html_doc.summary(html_partial=True)) |
47 | | - txt = f"{title}\n{content}" |
48 | | - sections = txt.split("\n") |
| 53 | + |
| 54 | + temp_sections = [] |
| 55 | + soup = BeautifulSoup(txt, "html5lib") |
| 56 | + # delete <style> tag |
| 57 | + for style_tag in soup.find_all(["style", "script"]): |
| 58 | + style_tag.decompose() |
| 59 | + # delete <script> tag in <div> |
| 60 | + for div_tag in soup.find_all("div"): |
| 61 | + for script_tag in div_tag.find_all("script"): |
| 62 | + script_tag.decompose() |
| 63 | + # delete inline style |
| 64 | + for tag in soup.find_all(True): |
| 65 | + if 'style' in tag.attrs: |
| 66 | + del tag.attrs['style'] |
| 67 | + # delete HTML comment |
| 68 | + for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): |
| 69 | + comment.extract() |
| 70 | + |
| 71 | + cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num) |
| 72 | + block_txt_list, table_list = cls.merge_block_text(temp_sections) |
| 73 | + sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num) |
| 74 | + for table in table_list: |
| 75 | + sections.append(table.get("content", "")) |
49 | 76 | return sections |
| 77 | + |
| 78 | + @classmethod |
| 79 | + def split_table(cls, html_table, chunk_token_num=512): |
| 80 | + soup = BeautifulSoup(html_table, "html.parser") |
| 81 | + rows = soup.find_all("tr") |
| 82 | + tables = [] |
| 83 | + current_table = [] |
| 84 | + current_count = 0 |
| 85 | + table_str_list = [] |
| 86 | + for row in rows: |
| 87 | + tks_str = rag_tokenizer.tokenize(str(row)) |
| 88 | + token_count = len(tks_str.split(" ")) if tks_str else 0 |
| 89 | + if current_count + token_count > chunk_token_num: |
| 90 | + tables.append(current_table) |
| 91 | + current_table = [] |
| 92 | + current_count = 0 |
| 93 | + current_table.append(row) |
| 94 | + current_count += token_count |
| 95 | + if current_table: |
| 96 | + tables.append(current_table) |
| 97 | + |
| 98 | + for table_rows in tables: |
| 99 | + new_table = soup.new_tag("table") |
| 100 | + for row in table_rows: |
| 101 | + new_table.append(row) |
| 102 | + table_str_list.append(str(new_table)) |
| 103 | + |
| 104 | + return table_str_list |
| 105 | + |
| 106 | + @classmethod |
| 107 | + def read_text_recursively(cls, element, parser_result, chunk_token_num=512, parent_name=None, block_id=None): |
| 108 | + if isinstance(element, NavigableString): |
| 109 | + content = element.strip() |
| 110 | + |
| 111 | + def is_valid_html(content): |
| 112 | + try: |
| 113 | + soup = BeautifulSoup(content, "html.parser") |
| 114 | + return bool(soup.find()) |
| 115 | + except Exception: |
| 116 | + return False |
| 117 | + |
| 118 | + return_info = [] |
| 119 | + if content: |
| 120 | + if is_valid_html(content): |
| 121 | + soup = BeautifulSoup(content, "html.parser") |
| 122 | + child_info = cls.read_text_recursively(soup, parser_result, chunk_token_num, element.name, block_id) |
| 123 | + parser_result.extend(child_info) |
| 124 | + else: |
| 125 | + info = {"content": element.strip(), "tag_name": "inner_text", "metadata": {"block_id": block_id}} |
| 126 | + if parent_name: |
| 127 | + info["tag_name"] = parent_name |
| 128 | + return_info.append(info) |
| 129 | + return return_info |
| 130 | + elif isinstance(element, Tag): |
| 131 | + |
| 132 | + if str.lower(element.name) == "table": |
| 133 | + table_info_list = [] |
| 134 | + table_id = str(uuid.uuid1()) |
| 135 | + table_list = [html.unescape(str(element))] |
| 136 | + for t in table_list: |
| 137 | + table_info_list.append({"content": t, "tag_name": "table", |
| 138 | + "metadata": {"table_id": table_id, "index": table_list.index(t)}}) |
| 139 | + return table_info_list |
| 140 | + else: |
| 141 | + block_id = None |
| 142 | + if str.lower(element.name) in BLOCK_TAGS: |
| 143 | + block_id = str(uuid.uuid1()) |
| 144 | + for child in element.children: |
| 145 | + child_info = cls.read_text_recursively(child, parser_result, chunk_token_num, element.name, |
| 146 | + block_id) |
| 147 | + parser_result.extend(child_info) |
| 148 | + return [] |
| 149 | + |
| 150 | + @classmethod |
| 151 | + def merge_block_text(cls, parser_result): |
| 152 | + block_content = [] |
| 153 | + current_content = "" |
| 154 | + table_info_list = [] |
| 155 | + lask_block_id = None |
| 156 | + for item in parser_result: |
| 157 | + content = item.get("content") |
| 158 | + tag_name = item.get("tag_name") |
| 159 | + title_flag = tag_name in TITLE_TAGS |
| 160 | + block_id = item.get("metadata", {}).get("block_id") |
| 161 | + if block_id: |
| 162 | + if title_flag: |
| 163 | + content = f"{TITLE_TAGS[tag_name]} {content}" |
| 164 | + if lask_block_id != block_id: |
| 165 | + if lask_block_id is not None: |
| 166 | + block_content.append(current_content) |
| 167 | + current_content = content |
| 168 | + lask_block_id = block_id |
| 169 | + else: |
| 170 | + current_content += (" " if current_content else "") + content |
| 171 | + else: |
| 172 | + if tag_name == "table": |
| 173 | + table_info_list.append(item) |
| 174 | + else: |
| 175 | + current_content += (" " if current_content else "" + content) |
| 176 | + if current_content: |
| 177 | + block_content.append(current_content) |
| 178 | + return block_content, table_info_list |
| 179 | + |
| 180 | + @classmethod |
| 181 | + def chunk_block(cls, block_txt_list, chunk_token_num=512): |
| 182 | + chunks = [] |
| 183 | + current_block = "" |
| 184 | + current_token_count = 0 |
| 185 | + |
| 186 | + for block in block_txt_list: |
| 187 | + tks_str = rag_tokenizer.tokenize(block) |
| 188 | + block_token_count = len(tks_str.split(" ")) if tks_str else 0 |
| 189 | + if block_token_count > chunk_token_num: |
| 190 | + if current_block: |
| 191 | + chunks.append(current_block) |
| 192 | + start = 0 |
| 193 | + tokens = tks_str.split(" ") |
| 194 | + while start < len(tokens): |
| 195 | + end = start + chunk_token_num |
| 196 | + split_tokens = tokens[start:end] |
| 197 | + chunks.append(" ".join(split_tokens)) |
| 198 | + start = end |
| 199 | + current_block = "" |
| 200 | + current_token_count = 0 |
| 201 | + else: |
| 202 | + if current_token_count + block_token_count <= chunk_token_num: |
| 203 | + current_block += ("\n" if current_block else "") + block |
| 204 | + current_token_count += block_token_count |
| 205 | + else: |
| 206 | + chunks.append(current_block) |
| 207 | + current_block = block |
| 208 | + current_token_count = block_token_count |
| 209 | + |
| 210 | + if current_block: |
| 211 | + chunks.append(current_block) |
| 212 | + |
| 213 | + return chunks |
| 214 | + |
0 commit comments