Skip to content

Commit b2ae936

Browse files
committed
feat(docreader): structure Excel table extraction
1 parent 122408b commit b2ae936

3 files changed

Lines changed: 396 additions & 2 deletions

File tree

docreader/parser/excel_parser.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
engine_for_format,
2121
normalize_excel_bytes,
2222
)
23+
from docreader.parser.excel_structured import build_structured_excel_document
2324
from docreader.parser.xlsx_merge import fill_merged_cells_xlsx
2425
from docreader.parser.xlsx_repair import repair_xlsx_bytes
2526

@@ -86,10 +87,16 @@ def parse_into_text(self, content: bytes) -> Document:
8687
start, end = 0, 0
8788

8889
excel_file = _open_excel_file(content, file_type=self.file_type)
90+
sheet_frames = [
91+
(sheet_name, _read_sheet_dataframe(excel_file, sheet_name))
92+
for sheet_name in excel_file.sheet_names
93+
]
94+
structured_doc = build_structured_excel_document(sheet_frames)
95+
if structured_doc is not None:
96+
return structured_doc
8997

9098
# Process each sheet in the Excel file
91-
for excel_sheet_name in excel_file.sheet_names:
92-
df = _read_sheet_dataframe(excel_file, excel_sheet_name)
99+
for excel_sheet_name, df in sheet_frames:
93100
# Remove rows where all values are NaN (completely empty rows)
94101
df.dropna(how="all", inplace=True)
95102

Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
"""Structured Excel extraction for table-like workbooks.
2+
3+
The fallback Excel parser flattens rows as ``A: value,B: value``. That is
4+
robust, but it loses table semantics and can duplicate long merged notes across
5+
columns. This module keeps a conservative structured path for common RAG
6+
workbooks: policy tables, FAQ sheets, catalogs, and other header-driven data.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
from dataclasses import dataclass
12+
import re
13+
from typing import Any, Dict, List, Optional
14+
15+
import pandas as pd
16+
17+
from docreader.models.document import Chunk, Document
18+
19+
20+
# Substrings that mark a header (or a repeated row value) as free-text
# commentary rather than tabular data. A header containing any of these is
# treated as a note column. English glosses are given for maintainers only;
# the literals themselves are matched verbatim.
NOTE_HEADER_HINTS = (
    "说明",  # explanation / description
    "注意事项",  # precautions / points to note
    "原则",  # principles
    "概述",  # overview
    "备注",  # remarks
    "用途",  # purpose / usage
    "须知",  # notice
)

# Cells longer than this are never accepted as a column header.
MAX_HEADER_CHARS = 40
# A workbook producing fewer chunks than this is rejected by the structured
# path (the builder returns None).
MIN_STRUCTURED_CHUNKS = 2
# Matches cell text that starts with a DISPIMG(/IMAGE( function call, with an
# optional leading "=" and optional "_xlfn." prefix, case-insensitively.
# Such cells are rendered as empty text.
IMAGE_FUNC_RE = re.compile(r"^=?(_xlfn\.)?(DISPIMG|IMAGE)\(", re.IGNORECASE)
33+
34+
35+
@dataclass
class StructuredSheet:
    """Detected table layout for one worksheet.

    Instances are produced by ``_detect_sheet`` and consumed by the
    formatting helpers that render sheet notes and per-row records.
    """

    # Worksheet name; appears in rendered headings.
    name: str
    # Sheet data with all-empty rows removed and the index reset, so row
    # positions are contiguous and 0-based.
    df: pd.DataFrame
    # Position in ``df`` of the row chosen as the header row.
    header_idx: int
    # Column label -> header text taken from the header row.
    headers: Dict[Any, str]
    # Column labels treated as notes: skipped in records, harvested as notes.
    note_columns: set[Any]
44+
45+
46+
def build_structured_excel_document(
    sheet_frames: List[tuple[str, pd.DataFrame]],
) -> Optional[Document]:
    """Render table-like worksheets into a chunked, structured document.

    Each detected sheet contributes one ``sheet_notes`` chunk followed by one
    ``table_record`` chunk per non-empty data row below the header.

    Args:
        sheet_frames: ``(sheet_name, dataframe)`` pairs in workbook order.

    Returns:
        A ``Document`` whose content is the concatenation of all chunk
        texts, or ``None`` when any sheet with content has no detectable
        table layout, or when fewer than ``MIN_STRUCTURED_CHUNKS`` chunks
        were produced. On ``None`` the caller is expected to fall back to
        the legacy row-flattening path.
    """

    chunks: List[Chunk] = []
    parts: List[str] = []
    offset = 0

    for sheet_name, frame in sheet_frames:
        layout = _detect_sheet(sheet_name, frame)
        if layout is None:
            # A sheet that has content but no recognisable table makes the
            # whole workbook ineligible; truly blank sheets are just skipped.
            if _sheet_has_content(frame):
                return None
            continue

        intro = _format_sheet_intro(layout)
        if intro:
            intro_metadata = {
                "parser": "structured_excel",
                "sheet": sheet_name,
                "kind": "sheet_notes",
            }
            offset = _append_chunk(chunks, parts, intro, offset, intro_metadata)

        first_data_row = layout.header_idx + 1
        for row_idx in range(first_data_row, len(layout.df)):
            record = _format_record(layout, row_idx)
            if record:
                record_metadata = {
                    "parser": "structured_excel",
                    "sheet": sheet_name,
                    "kind": "table_record",
                    # 1-based position within the cleaned frame.
                    "row": row_idx + 1,
                    # Fresh list per chunk so metadata is never shared.
                    "headers": list(layout.headers.values()),
                }
                offset = _append_chunk(
                    chunks, parts, record, offset, record_metadata
                )

    if len(chunks) >= MIN_STRUCTURED_CHUNKS:
        return Document(
            content="".join(parts),
            chunks=chunks,
            metadata={"parser": "structured_excel"},
        )
    return None
101+
102+
103+
def _append_chunk(
    chunks: List[Chunk],
    parts: List[str],
    content: str,
    start: int,
    metadata: Dict[str, Any],
) -> int:
    """Append one newline-terminated chunk and return the next offset.

    Args:
        chunks: Chunk list to extend in place; its current length becomes
            the new chunk's ``seq``.
        parts: Text fragments to extend in place with the chunk text.
        content: Chunk text; a trailing newline is added when missing.
        start: Character offset at which this chunk begins.
        metadata: Metadata stored on the chunk as-is.

    Returns:
        The end offset of the new chunk (``start`` plus the text length).
    """
    text = content if content.endswith("\n") else content + "\n"
    stop = start + len(text)
    parts.append(text)
    new_chunk = Chunk(
        content=text,
        seq=len(chunks),
        start=start,
        end=stop,
        metadata=metadata,
    )
    chunks.append(new_chunk)
    return stop
124+
125+
126+
def _detect_sheet(sheet_name: str, df: pd.DataFrame) -> Optional[StructuredSheet]:
    """Try to recognise a header-driven table in one worksheet.

    Args:
        sheet_name: Worksheet name stored on the result.
        df: Raw sheet data; it is not modified.

    Returns:
        A ``StructuredSheet`` built on a copy of ``df`` with all-empty rows
        removed, or ``None`` when the sheet is empty, no header row is
        found, fewer than two headers are usable, or fewer than two
        non-note columns remain.
    """
    if df.empty:
        return None

    table = df.dropna(how="all").reset_index(drop=True)
    if table.empty:
        return None

    header_idx = _find_header_row(table)
    if header_idx is None:
        return None

    headers = _headers_from_row(table.iloc[header_idx])
    if len(headers) < 2:
        return None

    # Note columns come from two signals: the header wording, and columns
    # whose cells are long, highly repetitive text.
    flagged_by_name = {
        col for col, label in headers.items() if _looks_like_note_header(label)
    }
    flagged_by_content = _detect_long_context_columns(table, header_idx, headers)
    note_columns = flagged_by_name | flagged_by_content

    # Keep at least two data columns; otherwise structured mode would be less
    # useful than the legacy parser.
    data_column_count = sum(1 for col in headers if col not in note_columns)
    if data_column_count < 2:
        return None

    return StructuredSheet(
        name=sheet_name,
        df=table,
        header_idx=header_idx,
        headers=headers,
        note_columns=note_columns,
    )
162+
163+
164+
def _sheet_has_content(df: pd.DataFrame) -> bool:
    """Return True when at least one cell renders to non-empty text."""
    if df.empty:
        return False
    return any(_cell_text(cell) for cell in df.to_numpy().flatten())
171+
172+
173+
def _find_header_row(df: pd.DataFrame) -> Optional[int]:
    """Pick the most header-like row among the first 20 rows.

    A candidate row needs at least two non-empty cells, at least two cells
    no longer than ``MAX_HEADER_CHARS``, and more than one distinct value.
    Candidates are scored by the number of short cells plus the ratio of
    distinct values; the earliest row wins ties.

    Returns:
        The positional index of the chosen row, or ``None`` when no row
        qualifies.
    """
    winner: Optional[int] = None
    top_score = 0.0

    for idx in range(min(len(df), 20)):
        filled = [
            text for text in map(_cell_text, df.iloc[idx].tolist()) if text
        ]
        if len(filled) < 2:
            continue

        distinct = set(filled)
        # Rows filled from a horizontally merged note usually contain the same
        # long value in every column. They should not become headers.
        if len(distinct) == 1:
            continue

        short_count = sum(1 for text in filled if len(text) <= MAX_HEADER_CHARS)
        if short_count < 2:
            continue

        row_score = short_count + len(distinct) / len(filled)
        # Strict comparison keeps the earliest row on equal scores.
        if row_score > top_score:
            top_score, winner = row_score, idx

    return winner
199+
200+
201+
def _headers_from_row(row: pd.Series) -> Dict[Any, str]:
    """Map column labels to unique header texts taken from a header row.

    Empty cells and cells longer than ``MAX_HEADER_CHARS`` are skipped, so
    their columns are absent from the result. Repeated header texts get a
    numeric suffix (``name``, ``name_2``, ``name_3``, ...).

    The suffix is advanced until the label is unused, so a generated label
    can never collide with a header that literally carries that name
    (e.g. the row ``A, A, A_2`` yields ``A, A_2, A_2_2``).

    Args:
        row: The header row; its index holds the column labels.

    Returns:
        Column label -> header text, in column order.
    """
    headers: Dict[Any, str] = {}
    occurrences: Dict[str, int] = {}
    taken: set[str] = set()

    for col, value in row.items():
        base = _cell_text(value)
        if not base or len(base) > MAX_HEADER_CHARS:
            continue

        ordinal = occurrences.get(base, 0) + 1
        label = base if ordinal == 1 else f"{base}_{ordinal}"
        # Skip past any label already claimed by an earlier column.
        while label in taken:
            ordinal += 1
            label = f"{base}_{ordinal}"

        occurrences[base] = ordinal
        taken.add(label)
        headers[col] = label

    return headers
216+
217+
218+
def _detect_long_context_columns(
    df: pd.DataFrame,
    header_idx: int,
    headers: Dict[Any, str],
) -> set[Any]:
    """Find columns whose data cells look like repeated long-form notes.

    A column qualifies when, below the header row, at least two non-empty
    cells exceed 180 characters and the number of distinct non-empty values
    is at most ``max(2, non_empty_count // 4)``.

    Args:
        df: Cleaned sheet data.
        header_idx: Position of the header row; only later rows are read.
        headers: Column label -> header text; only these columns are checked.

    Returns:
        The set of qualifying column labels.
    """
    note_columns: set[Any] = set()
    first_data_row = header_idx + 1

    for col in headers:
        # Read the column once instead of materialising a full row Series
        # for every single cell.
        column_cells = df[col].iloc[first_data_row:].tolist()
        non_empty = [text for text in map(_cell_text, column_cells) if text]
        if not non_empty:
            continue

        long_count = sum(1 for text in non_empty if len(text) > 180)
        distinct_count = len(set(non_empty))
        if long_count >= 2 and distinct_count <= max(2, len(non_empty) // 4):
            note_columns.add(col)

    return note_columns
237+
238+
239+
def _format_sheet_intro(sheet: StructuredSheet) -> str:
    """Render the sheet heading and, when present, a bullet list of notes.

    The result always contains the ``## Sheet:`` heading and ends with
    exactly one newline.
    """
    notes = _collect_notes(sheet)
    segments = [f"## Sheet: {sheet.name}\n"]
    if notes:
        segments.append("### Notes")
        segments.extend(f"- {note}" for note in notes)
    body = "\n".join(segments)
    return body.strip() + "\n"
247+
248+
249+
def _collect_notes(sheet: StructuredSheet) -> List[str]:
    """Gather the sheet's free-text notes, de-duplicated, in reading order.

    Notes come from three places:

    * every cell in the rows above the header row;
    * the note-column cells of each row below the header;
    * rows below the header that look like a horizontally merged note
      (the same value repeated across the data columns). Such rows are
      omitted from the table records, so they are captured here to avoid
      losing their text when the sheet has no note column.

    Values shorter than the ``_add_note`` length threshold are ignored.

    Returns:
        The collected note texts.
    """
    notes: List[str] = []
    seen: set[str] = set()

    for row_idx in range(sheet.header_idx):
        for value in sheet.df.iloc[row_idx].tolist():
            _add_note(notes, seen, _cell_text(value))

    data_columns = [
        (col, header)
        for col, header in sheet.headers.items()
        if col not in sheet.note_columns
    ]

    for row_idx in range(sheet.header_idx + 1, len(sheet.df)):
        # Fetch the row once; per-cell ``iloc`` lookups rebuild it each time.
        row = sheet.df.iloc[row_idx]

        for col in sheet.note_columns:
            _add_note(notes, seen, _cell_text(row[col]))

        # Mirror the record formatter's field selection so exactly the rows
        # it drops as merged notes are preserved here.
        fields: List[tuple[str, str]] = []
        for col, header in data_columns:
            text = _cell_text(row[col])
            if text and text != header:
                fields.append((header, text))
        if _looks_like_repeated_note_row(fields):
            _add_note(notes, seen, fields[0][1])

    return notes
262+
263+
264+
def _add_note(notes: List[str], seen: set[str], value: str) -> None:
265+
if len(value) < 20:
266+
return
267+
if value in seen:
268+
return
269+
seen.add(value)
270+
notes.append(value)
271+
272+
273+
def _format_record(sheet: StructuredSheet, row_idx: int) -> str:
    """Render one data row as a heading plus ``- header: value`` bullets.

    Note columns are skipped, as are cells that are empty or merely repeat
    their own header text.

    Args:
        sheet: Detected layout holding the data and headers.
        row_idx: Position of the row within ``sheet.df``.

    Returns:
        The rendered record ending in a newline, or ``""`` when the row has
        no usable fields or looks like a horizontally merged note.
    """
    row = sheet.df.iloc[row_idx]
    candidates = [
        (header, _cell_text(row[col]))
        for col, header in sheet.headers.items()
        if col not in sheet.note_columns
    ]
    fields: List[tuple[str, str]] = [
        (header, text) for header, text in candidates if text and text != header
    ]

    if not fields or _looks_like_repeated_note_row(fields):
        return ""

    title = f"### {sheet.name} - Row {row_idx + 1}"
    bullets = [f"- {header}: {text}" for header, text in fields]
    return "\n".join([title, *bullets]) + "\n"
293+
294+
295+
def _looks_like_repeated_note_row(fields: List[tuple[str, str]]) -> bool:
296+
"""Detect rows created by horizontally filled merged notes.
297+
298+
openpyxl merge filling copies a wide note into each covered column. Such a
299+
row should be represented once in the sheet notes, not as a table record.
300+
"""
301+
302+
if len(fields) < 2:
303+
return False
304+
values = [value for _, value in fields if value]
305+
unique_values = set(values)
306+
if len(unique_values) != 1:
307+
return False
308+
only_value = values[0]
309+
return len(only_value) > 80 or _looks_like_note_header(only_value)
310+
311+
312+
def _looks_like_note_header(value: str) -> bool:
    """Return True when ``value`` contains any of ``NOTE_HEADER_HINTS``."""
    for hint in NOTE_HEADER_HINTS:
        if hint in value:
            return True
    return False
314+
315+
316+
# Text of an integral number rendered as a float, e.g. "12.0" or "-3.0".
_INTEGRAL_FLOAT_RE = re.compile(r"[+-]?\d+\.0")


def _cell_text(value: Any) -> str:
    """Convert a raw cell value to trimmed display text.

    Args:
        value: Any cell value read from a worksheet.

    Returns:
        ``""`` for ``None``, missing values, and image-function cells
        (``DISPIMG(``/``IMAGE(``); otherwise the stripped string form, with
        a trailing ``.0`` removed from integral numbers.
    """
    if value is None:
        return ""
    try:
        if pd.isna(value):
            return ""
    except (TypeError, ValueError):
        # Array-like values make the truth test ambiguous; treat them as
        # present and fall through to their string form.
        pass

    text = str(value).strip()
    if IMAGE_FUNC_RE.match(text):
        return ""
    # Drop the ".0" only when the whole text is an integral number. A bare
    # suffix check would also truncate ordinary text such as "v1.0" or
    # "10.0.0.0".
    if _INTEGRAL_FLOAT_RE.fullmatch(text):
        text = text[:-2]
    return text

0 commit comments

Comments
 (0)