11import os
2+ from html .parser import HTMLParser
23from submodules .model import UploadTask , enums
34from submodules .model .business_objects import knowledge_term , organization
45from submodules .model .business_objects import general
78import pandas as pd
89
910
11+ def _parse_html_tables_to_dataframe (path : str ) -> pd .DataFrame :
12+ """Parse the first HTML table from a file using stdlib html.parser only (no lxml).
13+ Avoids pd.read_html so we do not trigger lxml/XXE-related parsers.
14+ """
15+ with open (path , encoding = "utf-8" , errors = "replace" ) as f :
16+ html_content = f .read ()
17+
18+ class TableParser (HTMLParser ):
19+ def __init__ (self ) -> None :
20+ super ().__init__ ()
21+ self .in_table = False
22+ self .in_row = False
23+ self .in_cell = False
24+ self .current_row : list [str ] = []
25+ self .rows : list [list [str ]] = []
26+ self .current_cell_text : list [str ] = []
27+
28+ def handle_starttag (self , tag : str , attrs : list [tuple [str , str | None ]]) -> None :
29+ if tag == "table" :
30+ self .in_table = True
31+ self .rows = []
32+ elif self .in_table and tag == "tr" :
33+ self .in_row = True
34+ self .current_row = []
35+ elif self .in_table and self .in_row and tag in ("td" , "th" ):
36+ self .in_cell = True
37+ self .current_cell_text = []
38+
39+ def handle_endtag (self , tag : str ) -> None :
40+ if tag == "table" :
41+ self .in_table = False
42+ elif tag == "tr" :
43+ if self .in_table and self .current_row :
44+ self .rows .append (self .current_row )
45+ self .in_row = False
46+ elif tag in ("td" , "th" ) and self .in_cell :
47+ self .current_row .append ("" .join (self .current_cell_text ).strip ())
48+ self .in_cell = False
49+
50+ def handle_data (self , data : str ) -> None :
51+ if self .in_cell :
52+ self .current_cell_text .append (data )
53+
54+ parser = TableParser ()
55+ parser .feed (html_content )
56+ if not parser .rows :
57+ return pd .DataFrame ()
58+ return pd .DataFrame (parser .rows [1 :], columns = parser .rows [0 ])
59+
60+
1061def import_knowledge_base_file (project_id : str , task : UploadTask ) -> None :
1162 upload_task_manager .update_task (project_id , task .id , state = enums .UploadStates .PENDING .value )
1263 general .commit ()
@@ -26,8 +77,7 @@ def import_knowledge_base_file(project_id: str, task: UploadTask) -> None:
2677 elif file_type == "xlsx" :
2778 df = pd .read_excel (download_file_name )
2879 elif file_type == "html" :
29- # Use built-in html.parser to avoid lxml (XXE-vulnerable); flavor='html.parser' uses stdlib only.
30- df = pd .read_html (download_file_name , flavor = "html.parser" )
80+ df = _parse_html_tables_to_dataframe (download_file_name )
3181 elif file_type == "json" :
3282 df = pd .read_json (download_file_name )
3383
0 commit comments