22This module contains the AhbSubTable class.
33"""
44
5- from typing import Generator
5+ from typing import Generator , Union
66
7+ import numpy as np
78import pandas as pd
89from docx .table import Table as DocxTable
910from docx .table import _Cell , _Row
11+ from docx .text .paragraph import Paragraph
12+ from numpy .typing import NDArray
1013from pydantic import BaseModel , ConfigDict
1114
1215from kohlrahbi .ahbtable .ahbtablerow import AhbTableRow
16+ from kohlrahbi .docxtablecells .bodycell import INDEX_OF_CODES_AND_QUALIFIER_COLUMN , KNOW_SUFFIXES
1317from kohlrahbi .enums import RowType
1418from kohlrahbi .row_type_checker import get_row_type
1519from kohlrahbi .seed import Seed
@@ -30,6 +34,7 @@ class AhbSubTable(BaseModel):
3034 def _parse_docx_table (
3135 table_meta_data : Seed , ahb_table_dataframe : pd .DataFrame , docx_table : DocxTable
3236 ) -> pd .DataFrame :
37+ """Parse the docx table and add the information to the dataframe."""
3338 for row in docx_table .rows :
3439 sanitized_cells = list (AhbSubTable .iter_visible_cells (row = row ))
3540
@@ -58,16 +63,45 @@ def _parse_docx_table(
5863
5964 if ahb_table_row_dataframe is not None :
6065 ahb_table_dataframe = pd .concat ([ahb_table_dataframe , ahb_table_row_dataframe ], ignore_index = True )
61- # this case covers the page break situation
6266 else :
67+ # this case covers the page break situation
68+
69+ # check for conditions_text
70+ contains_condition_texts = any (paragraph .text != "" for paragraph in bedingung_cell .paragraphs )
71+ # conditions are always at the top of a dataelement
72+ # add condition texts
73+ if contains_condition_texts :
74+ AhbSubTable .combine_condition_text (ahb_table_dataframe , bedingung_cell )
75+
76+ # add new row regularly
6377 ahb_table_row = AhbTableRow (
6478 seed = table_meta_data ,
6579 edifact_struktur_cell = edifact_struktur_cell ,
6680 middle_cell = middle_cell ,
6781 bedingung_cell = bedingung_cell ,
6882 )
83+ ahb_table_row_dataframe = ahb_table_row .parse (row_type = current_row_type )
6984
70- ahb_table_row .parse (row_type = table_meta_data .last_two_row_types [1 ])
85+ # look at first line to determine if it is broken
86+ first_paragraph = middle_cell .paragraphs [0 ]
87+
88+ if ahb_table_row_dataframe is not None :
89+ if AhbSubTable .is_broken_line (
90+ table = ahb_table_dataframe ,
91+ table_meta_data = table_meta_data ,
92+ paragraph = first_paragraph ,
93+ ):
94+ AhbSubTable .add_broken_line (ahb_table_dataframe , ahb_table_row_dataframe )
95+ # we have a broken line
96+ ahb_table_dataframe = pd .concat (
97+ [ahb_table_dataframe , ahb_table_row_dataframe .iloc [1 :]],
98+ ignore_index = True ,
99+ )
100+ else :
101+ ahb_table_dataframe = pd .concat (
102+ [ahb_table_dataframe , ahb_table_row_dataframe ],
103+ ignore_index = True ,
104+ )
71105
72106 # An AhbSubTable can span over two pages.
73107 # But after every page break, even if we're still in the same subtable,
@@ -131,3 +165,73 @@ def iter_visible_cells(row: _Row) -> Generator[_Cell, None, None]:
131165 table_row = row ._tr # pylint:disable=protected-access
132166 for table_column in table_row .tc_lst :
133167 yield _Cell (table_column , row .table )
168+
@staticmethod
def add_text_to_last_row(ahb_table_dataframe: pd.DataFrame, row_index: int, column_index: int, text: str) -> None:
    """
    Append ``text`` to the string stored in one cell of the dataframe.

    The cell is addressed by position (``row_index``, ``column_index``) and is modified in place.
    An empty ``text`` leaves the dataframe untouched.
    A single blank is inserted between the existing cell content and ``text``, unless the cell is
    still empty or ``text`` begins with one of the ``KNOW_SUFFIXES`` followed by a blank.
    """
    suffix_prefixes = tuple(f"{suffix} " for suffix in KNOW_SUFFIXES)
    continues_with_suffix = text.startswith(suffix_prefixes)
    if not text:
        return

    current_content = ahb_table_dataframe.iat[row_index, column_index]
    # only separate the two parts if there is something to separate from
    needs_separator = len(current_content) > 0 and not continues_with_suffix
    addition = " " + text if needs_separator else text
    ahb_table_dataframe.iat[row_index, column_index] = current_content + addition
177+
@staticmethod
def add_broken_line(ahb_table_dataframe: pd.DataFrame, broken_line: pd.DataFrame) -> None:
    """
    Merge the first row of ``broken_line`` into the last row of ``ahb_table_dataframe``.

    Every column from ``INDEX_OF_CODES_AND_QUALIFIER_COLUMN`` up to the last column of
    ``ahb_table_dataframe`` is processed: the value of the first row of ``broken_line`` is converted
    with ``str`` and appended to the matching cell of the last row via ``add_text_to_last_row``.
    ``ahb_table_dataframe`` is modified in place; ``broken_line`` is only read.

    NOTE(review): both frames are expected to contain at least one row and ``broken_line`` at least
    as many columns as ``ahb_table_dataframe`` - confirm against the caller.
    """
    # ``DataFrame.iat`` is purely position based, so the last row has to be addressed by its position.
    # Using the largest index label only works as long as the frame carries a default RangeIndex.
    last_row_position = len(ahb_table_dataframe) - 1
    number_of_columns = len(ahb_table_dataframe.columns)
    for col_index in range(INDEX_OF_CODES_AND_QUALIFIER_COLUMN, number_of_columns):
        AhbSubTable.add_text_to_last_row(
            ahb_table_dataframe, last_row_position, col_index, str(broken_line.iat[0, col_index])
        )
185+
@staticmethod
def combine_condition_text(ahb_table_dataframe: pd.DataFrame, bedingung_cell: _Cell) -> None:
    """
    Move the condition text of the last filled "Bedingung" row into ``bedingung_cell``.

    The non-empty paragraph texts of ``bedingung_cell`` are joined with blanks and appended (after a
    blank) to the "Bedingung" value of the last row of the dataframe that holds a valid value.
    That dataframe entry is then reset to an empty string, all paragraphs of the cell are emptied
    and the combined text is written into the last paragraph of the cell.
    Both the dataframe and the cell are modified in place.
    """
    non_empty_texts = [paragraph.text for paragraph in bedingung_cell.paragraphs if paragraph.text != ""]
    continuation = " " + " ".join(non_empty_texts)

    target_row = ahb_table_dataframe["Bedingung"].last_valid_index()
    merged_text = ahb_table_dataframe.at[target_row, "Bedingung"] + continuation

    # the text now lives in the cell only, so drop it from the dataframe
    ahb_table_dataframe.at[target_row, "Bedingung"] = ""

    # wipe every paragraph first to avoid misplaced leftovers, then store the merged text in the last one
    for paragraph in bedingung_cell.paragraphs:
        paragraph.text = ""
    bedingung_cell.paragraphs[-1].text = merged_text
200+
@staticmethod
def is_broken_line(
    table: pd.DataFrame,
    table_meta_data: Seed,
    paragraph: Paragraph,
) -> bool:
    """
    Check whether ``paragraph`` of a middle cell is a broken line.

    The paragraph is reported as broken if at least one of the following holds:

    * every tab separated part of the paragraph text is empty,
    * the text consists of more than one tab separated part and the left indent of the paragraph
      differs from ``table_meta_data.middle_cell_left_indent_position``,
    * the paragraph has a left indent that differs from
      ``table_meta_data.middle_cell_left_indent_position`` while the last row of ``table`` has a
      non-empty "Beschreibung" entry and at least one non-empty entry in the columns after it.

    :param table: the dataframe collected so far; only its last row is inspected
    :param table_meta_data: provides the expected left indent of the middle cell
    :param paragraph: the paragraph to check
    :return: ``True`` if the paragraph is a broken line, ``False`` otherwise
    :raises ValueError: if the position of the column "Beschreibung" is not a plain integer
    """
    tabsplit_text = paragraph.text.split("\t")

    loc: Union[int, slice, NDArray[np.bool_]] = table.columns.get_loc("Beschreibung")
    # get_loc may also return a slice or a boolean mask; only a single position is usable here
    if not isinstance(loc, int):
        raise ValueError("The location of the column 'Beschreibung' is not an integer.")
    beschreibung_index: int = loc

    left_indent = paragraph.paragraph_format.left_indent
    has_deviating_indent = left_indent != table_meta_data.middle_cell_left_indent_position

    is_empty_middle_line = all(text == "" for text in tabsplit_text)
    there_are_conditions = len(tabsplit_text) > 1 and has_deviating_indent
    is_broken_code_qualifier = (
        left_indent is not None
        and has_deviating_indent
        and table.iat[-1, beschreibung_index] != ""
        and table.iloc[-1, beschreibung_index + 1 :].ne("").any()
    )

    # the pandas reduction yields a numpy boolean; convert so that the annotated return type holds
    return bool(is_empty_middle_line or there_are_conditions or is_broken_code_qualifier)
0 commit comments