Skip to content

Commit a6869c1

Browse files
fix(parsing): handle page breaks within tables (#455)
* fixed routine for broken lines * remove comments * fix typecheck and linter * fixed small errors * fix: merging messed with snapshots * moved conditions text and fixed tests * fix whitespace * updated snapshot --------- Co-authored-by: kevin <[email protected]>
1 parent fb1e8d3 commit a6869c1

File tree

7 files changed

+4651
-4203
lines changed

7 files changed

+4651
-4203
lines changed

src/kohlrahbi/ahbtable/ahbsubtable.py

Lines changed: 107 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@
22
This module contains the AhbSubTable class.
33
"""
44

5-
from typing import Generator
5+
from typing import Generator, Union
66

7+
import numpy as np
78
import pandas as pd
89
from docx.table import Table as DocxTable
910
from docx.table import _Cell, _Row
11+
from docx.text.paragraph import Paragraph
12+
from numpy.typing import NDArray
1013
from pydantic import BaseModel, ConfigDict
1114

1215
from kohlrahbi.ahbtable.ahbtablerow import AhbTableRow
16+
from kohlrahbi.docxtablecells.bodycell import INDEX_OF_CODES_AND_QUALIFIER_COLUMN, KNOW_SUFFIXES
1317
from kohlrahbi.enums import RowType
1418
from kohlrahbi.row_type_checker import get_row_type
1519
from kohlrahbi.seed import Seed
@@ -30,6 +34,7 @@ class AhbSubTable(BaseModel):
3034
def _parse_docx_table(
3135
table_meta_data: Seed, ahb_table_dataframe: pd.DataFrame, docx_table: DocxTable
3236
) -> pd.DataFrame:
37+
"""Parse the docx table and add the information to the dataframe."""
3338
for row in docx_table.rows:
3439
sanitized_cells = list(AhbSubTable.iter_visible_cells(row=row))
3540

@@ -58,16 +63,45 @@ def _parse_docx_table(
5863

5964
if ahb_table_row_dataframe is not None:
6065
ahb_table_dataframe = pd.concat([ahb_table_dataframe, ahb_table_row_dataframe], ignore_index=True)
61-
# this case covers the page break situation
6266
else:
67+
# this case covers the page break situation
68+
69+
# check for conditions_text
70+
contains_condition_texts = any(paragraph.text != "" for paragraph in bedingung_cell.paragraphs)
71+
# conditions are always at the top of a dataelement
72+
# add condition texts
73+
if contains_condition_texts:
74+
AhbSubTable.combine_condition_text(ahb_table_dataframe, bedingung_cell)
75+
76+
# add new row regularly
6377
ahb_table_row = AhbTableRow(
6478
seed=table_meta_data,
6579
edifact_struktur_cell=edifact_struktur_cell,
6680
middle_cell=middle_cell,
6781
bedingung_cell=bedingung_cell,
6882
)
83+
ahb_table_row_dataframe = ahb_table_row.parse(row_type=current_row_type)
6984

70-
ahb_table_row.parse(row_type=table_meta_data.last_two_row_types[1])
85+
# look at first line to determine if it is broken
86+
first_paragraph = middle_cell.paragraphs[0]
87+
88+
if ahb_table_row_dataframe is not None:
89+
if AhbSubTable.is_broken_line(
90+
table=ahb_table_dataframe,
91+
table_meta_data=table_meta_data,
92+
paragraph=first_paragraph,
93+
):
94+
AhbSubTable.add_broken_line(ahb_table_dataframe, ahb_table_row_dataframe)
95+
# we have a broken line
96+
ahb_table_dataframe = pd.concat(
97+
[ahb_table_dataframe, ahb_table_row_dataframe.iloc[1:]],
98+
ignore_index=True,
99+
)
100+
else:
101+
ahb_table_dataframe = pd.concat(
102+
[ahb_table_dataframe, ahb_table_row_dataframe],
103+
ignore_index=True,
104+
)
71105

72106
# An AhbSubTable can span over two pages.
73107
# But after every page break, even if we're still in the same subtable,
@@ -131,3 +165,73 @@ def iter_visible_cells(row: _Row) -> Generator[_Cell, None, None]:
131165
table_row = row._tr # pylint:disable=protected-access
132166
for table_column in table_row.tc_lst:
133167
yield _Cell(table_column, row.table)
168+
169+
@staticmethod
def add_text_to_last_row(ahb_table_dataframe: pd.DataFrame, row_index: int, column_index: int, text: str) -> None:
    """
    Append ``text`` in place to the dataframe cell at the given position.

    ``row_index`` and ``column_index`` are integer positions (the cell is accessed via ``.iat``).
    An empty ``text`` leaves the dataframe untouched. Otherwise ``text`` is appended to the
    current cell content; a single space is put in between unless the cell is still empty or
    ``text`` starts with one of the ``KNOW_SUFFIXES`` followed by a space.
    """
    if len(text) == 0:
        return

    existing_text = ahb_table_dataframe.iat[row_index, column_index]
    # known suffixes are glued directly onto the existing content, everything else is space separated
    needs_separator = len(existing_text) > 0 and not any(
        text.startswith(suffix + " ") for suffix in KNOW_SUFFIXES
    )
    addition = " " + text if needs_separator else text
    ahb_table_dataframe.iat[row_index, column_index] = existing_text + addition
177+
178+
@staticmethod
def add_broken_line(ahb_table_dataframe: pd.DataFrame, broken_line: pd.DataFrame) -> None:
    """
    Merge the first row of ``broken_line`` into the last row of ``ahb_table_dataframe``.

    For every column starting at ``INDEX_OF_CODES_AND_QUALIFIER_COLUMN`` the value of the first
    row of ``broken_line`` is converted with ``str()`` and appended in place to the matching cell
    of the last row of ``ahb_table_dataframe``. Columns before that index are not touched.
    """
    # `.iat` addresses cells by integer position, so the last row has to be identified by its
    # position; the largest index label only equals that position for a default RangeIndex.
    last_row_position = len(ahb_table_dataframe) - 1
    for col_index in range(INDEX_OF_CODES_AND_QUALIFIER_COLUMN, len(ahb_table_dataframe.columns)):
        AhbSubTable.add_text_to_last_row(
            ahb_table_dataframe, last_row_position, col_index, str(broken_line.iat[0, col_index])
        )
185+
186+
@staticmethod
def combine_condition_text(ahb_table_dataframe: pd.DataFrame, bedingung_cell: _Cell) -> None:
    """
    Move the condition text stored in the dataframe into ``bedingung_cell``.

    The "Bedingung" entry at the last valid index of the dataframe is concatenated with the
    non-empty paragraph texts of ``bedingung_cell`` (joined by single spaces). Afterwards the
    dataframe entry is set to an empty string, every paragraph of the cell is emptied and the
    combined text is written into the last paragraph of the cell.
    Both the dataframe and the cell are modified in place.
    """
    non_empty_texts = [paragraph.text for paragraph in bedingung_cell.paragraphs if paragraph.text != ""]
    appended_text = " " + " ".join(non_empty_texts)

    row_label = ahb_table_dataframe["Bedingung"].last_valid_index()
    combined_text = ahb_table_dataframe.at[row_label, "Bedingung"] + appended_text

    # the text now lives in the cell only, so drop it from the dataframe
    ahb_table_dataframe.at[row_label, "Bedingung"] = ""

    # wipe all paragraphs first to avoid misplaced leftovers, then store everything in the last one
    for paragraph in bedingung_cell.paragraphs:
        paragraph.text = ""
    bedingung_cell.paragraphs[-1].text = combined_text
200+
201+
@staticmethod
202+
def is_broken_line(
203+
table: pd.DataFrame,
204+
table_meta_data: Seed,
205+
paragraph: Paragraph,
206+
) -> bool:
207+
"""
208+
Check for broken lines in the middle cell.
209+
"""
210+
tabsplit_text = paragraph.text.split("\t")
211+
212+
loc: Union[int, slice, NDArray[np.bool_]] = table.columns.get_loc("Beschreibung")
213+
214+
# Ensure loc is an int
215+
if isinstance(loc, int):
216+
beschreibung_index: int = loc
217+
else:
218+
raise ValueError("The location of the column 'Beschreibung' is not an integer.")
219+
220+
is_empty_middle_line = all(text == "" for text in tabsplit_text)
221+
is_broken_code_qualifier = (
222+
paragraph.paragraph_format.left_indent is not None
223+
and paragraph.paragraph_format.left_indent != table_meta_data.middle_cell_left_indent_position
224+
and table.iat[-1, beschreibung_index] != ""
225+
and table.iloc[-1, beschreibung_index + 1 :].ne("").any()
226+
)
227+
if is_broken_code_qualifier and len(tabsplit_text) == 1:
228+
# only broken code / qualifier
229+
assert (
230+
table.iat[-1, beschreibung_index] != "" and table.iloc[-1, beschreibung_index + 1 :].ne("").any()
231+
), "no condition expected in broken line"
232+
there_are_conditions = (
233+
len(tabsplit_text) > 1
234+
and paragraph.paragraph_format.left_indent != table_meta_data.middle_cell_left_indent_position
235+
)
236+
237+
return is_empty_middle_line or there_are_conditions or is_broken_code_qualifier

src/kohlrahbi/ahbtable/ahbtablerow.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,7 @@ class AhbTableRow(BaseModel):
2727

2828
model_config = ConfigDict(arbitrary_types_allowed=True)
2929

30-
def parse(
31-
self,
32-
row_type: RowType,
33-
) -> Optional[pd.DataFrame]:
30+
def parse(self, row_type: RowType) -> Optional[pd.DataFrame]:
3431
"""
3532
Writes the current row of the current table into the DataFrame depending on the type of the row.
3633
If the row is a header row, it will be skipped and None will be returned.

src/kohlrahbi/docxtablecells/bedinungscell.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,16 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
2323
"""
2424
Parses a cell in the Bedingung column and puts the information into the appropriate column of the dataframe.
2525
"""
26-
27-
bedingung = self.beautify_bedingungen()
26+
bedingung = self.table_cell.text
27+
bedingung = self.beautify_bedingungen(bedingung)
2828

2929
row_index = ahb_row_dataframe.index.max()
3030
ahb_row_dataframe.at[row_index, "Bedingung"] += bedingung
3131
return ahb_row_dataframe
3232

3333
# pylint: disable=line-too-long
34-
def beautify_bedingungen(self) -> str:
34+
@staticmethod
35+
def beautify_bedingungen(bedingung: str) -> str:
3536
"""
3637
Beautifies the Bedingungen by removing the given line breaks and insert the line breaks at the correct places.
3738
@@ -41,11 +42,11 @@ def beautify_bedingungen(self) -> str:
4142
[494] Das hier genannte Datum muss der Zeitpunkt sein, zu dem das Dokument erstellt wurde, oder ein Zeitpunkt, der davor liegt
4243
[931] Format: ZZZ = +00
4344
"""
44-
beautified_bedingung = self.table_cell.text.replace("\n", " ")
45+
beautified_bedingung = bedingung.replace("\n", " ")
4546

4647
matches = re.findall(r"\[\d+\]", beautified_bedingung)
4748
for match in matches[1:]:
4849
index = beautified_bedingung.find(match)
49-
beautified_bedingung = beautified_bedingung[:index] + "\n" + beautified_bedingung[index:]
50+
beautified_bedingung = beautified_bedingung[:index].rstrip() + "\n" + beautified_bedingung[index:]
5051

51-
return beautified_bedingung
52+
return beautified_bedingung.lstrip()

src/kohlrahbi/docxtablecells/bodycell.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,11 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
5252
def add_text_to_column(row_index: int, column_index: int, text: str) -> None:
5353
starts_with_known_suffix = any(text.startswith(suffix + " ") for suffix in KNOW_SUFFIXES)
5454
if len(text) > 0:
55-
if len(ahb_row_dataframe.iat[row_index, column_index]) > 0 and not starts_with_known_suffix:
55+
if (
56+
len(ahb_row_dataframe.iat[row_index, column_index]) > 0
57+
and not starts_with_known_suffix
58+
and len(text) > 1
59+
):
5660
text = " " + text
5761
ahb_row_dataframe.iat[row_index, column_index] += text
5862

0 commit comments

Comments
 (0)