Skip to content

Commit 7d6c72e

Browse files
committed
opengrep: apply agent fixes
1 parent d6f46b3 commit 7d6c72e

File tree

1 file changed

+52
-2
lines changed

1 file changed

+52
-2
lines changed

controller/transfer/knowledge_base_transfer_manager.py

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
from html.parser import HTMLParser
23
from submodules.model import UploadTask, enums
34
from submodules.model.business_objects import knowledge_term, organization
45
from submodules.model.business_objects import general
@@ -7,6 +8,56 @@
78
import pandas as pd
89

910

11+
def _parse_html_tables_to_dataframe(path: str) -> pd.DataFrame:
12+
"""Parse the first HTML table from a file using stdlib html.parser only (no lxml).
13+
Avoids pd.read_html so we do not trigger lxml/XXE-related parsers.
14+
"""
15+
with open(path, encoding="utf-8", errors="replace") as f:
16+
html_content = f.read()
17+
18+
class TableParser(HTMLParser):
19+
def __init__(self) -> None:
20+
super().__init__()
21+
self.in_table = False
22+
self.in_row = False
23+
self.in_cell = False
24+
self.current_row: list[str] = []
25+
self.rows: list[list[str]] = []
26+
self.current_cell_text: list[str] = []
27+
28+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
29+
if tag == "table":
30+
self.in_table = True
31+
self.rows = []
32+
elif self.in_table and tag == "tr":
33+
self.in_row = True
34+
self.current_row = []
35+
elif self.in_table and self.in_row and tag in ("td", "th"):
36+
self.in_cell = True
37+
self.current_cell_text = []
38+
39+
def handle_endtag(self, tag: str) -> None:
40+
if tag == "table":
41+
self.in_table = False
42+
elif tag == "tr":
43+
if self.in_table and self.current_row:
44+
self.rows.append(self.current_row)
45+
self.in_row = False
46+
elif tag in ("td", "th") and self.in_cell:
47+
self.current_row.append("".join(self.current_cell_text).strip())
48+
self.in_cell = False
49+
50+
def handle_data(self, data: str) -> None:
51+
if self.in_cell:
52+
self.current_cell_text.append(data)
53+
54+
parser = TableParser()
55+
parser.feed(html_content)
56+
if not parser.rows:
57+
return pd.DataFrame()
58+
return pd.DataFrame(parser.rows[1:], columns=parser.rows[0])
59+
60+
1061
def import_knowledge_base_file(project_id: str, task: UploadTask) -> None:
1162
upload_task_manager.update_task(project_id, task.id, state=enums.UploadStates.PENDING.value)
1263
general.commit()
@@ -26,8 +77,7 @@ def import_knowledge_base_file(project_id: str, task: UploadTask) -> None:
2677
elif file_type == "xlsx":
2778
df = pd.read_excel(download_file_name)
2879
elif file_type == "html":
29-
# Use built-in html.parser to avoid lxml (XXE-vulnerable); flavor='html.parser' uses stdlib only.
30-
df = pd.read_html(download_file_name, flavor="html.parser")
80+
df = _parse_html_tables_to_dataframe(download_file_name)
3181
elif file_type == "json":
3282
df = pd.read_json(download_file_name)
3383

0 commit comments

Comments
 (0)