1111from kohlrahbi .table_header import get_tabstop_positions
1212
# Positional index of the dataframe column into which code/qualifier entries are written.
INDEX_OF_CODES_AND_QUALIFIER_COLUMN = 4

# Text fragments that are glued onto the existing cell content WITHOUT a
# separating space: when a new piece of text starts with one of these entries,
# it is appended directly to what is already in the cell (presumably because it
# is the tail of a word that was split across two fragments - TODO confirm).
#
# Only a temporary and incomplete list to filter some cases, not intended as NLP.
# NOTE(review): the check is a plain prefix match, so the single-letter entries
# ("g", "n", "m", "t") match every fragment that starts with that letter.
# NOTE(review): the name reads like a typo of KNOWN_SUFFIXES; it is kept as is
# because other code refers to it by this name.
KNOW_SUFFIXES = {
    "g",
    "ung",
    "gs-",
    "vall",
    "n",
    "m",
    "t",
    "rage",
    "sgrund",
}
1425
1526
1627class BodyCell (BaseModel ):
@@ -38,6 +49,13 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
3849 tabstop_positions (list[int]): All tabstop positions of the indicator middle cell
3950 """
4051
def add_text_to_column(row_index: int, column_index: int, text: str) -> None:
    """Append ``text`` to one cell of ``ahb_row_dataframe``.

    The cell is addressed positionally via ``.iat``. Empty ``text`` leaves the
    dataframe untouched. If the cell already holds content, a single space is
    inserted before ``text`` unless ``text`` starts with one of the entries
    in ``KNOW_SUFFIXES``; in that case it is appended directly.

    Args:
        row_index: Positional row index of the target cell.
        column_index: Positional column index of the target cell.
        text: The text to append.
    """
    if not text:
        # Nothing to add; also avoids scanning the suffix list needlessly.
        return

    cell_has_content = len(ahb_row_dataframe.iat[row_index, column_index]) > 0
    # str.startswith accepts a tuple of prefixes, so one call covers all entries.
    starts_with_known_suffix = text.startswith(tuple(KNOW_SUFFIXES))

    if cell_has_content and not starts_with_known_suffix:
        text = " " + text
    ahb_row_dataframe.iat[row_index, column_index] += text
58+
4159 def handle_code_or_qualifier_entry (
4260 splitted_text_at_tabs : list [str ], row_index : int , is_first_iteration : bool
4361 ) -> int :
@@ -48,7 +66,7 @@ def handle_code_or_qualifier_entry(
4866 if not is_first_iteration :
4967 ahb_row_dataframe .loc [ahb_row_dataframe .index .max () + 1 , :] = ""
5068 row_index += 1
51- ahb_row_dataframe . iat [ row_index , INDEX_OF_CODES_AND_QUALIFIER_COLUMN ] += splitted_text_at_tabs .pop (0 )
69+ add_text_to_column ( row_index , INDEX_OF_CODES_AND_QUALIFIER_COLUMN , splitted_text_at_tabs .pop (0 ) )
5270 return row_index
5371
5472 def handle_tab_stops (
@@ -59,14 +77,16 @@ def handle_tab_stops(
5977 for indicator_tabstop_position , column_index in zip (self .indicator_tabstop_positions , column_indezes ):
6078 if len (tab_stops_in_current_paragraph ) == 1 :
6179 if indicator_tabstop_position in (tabstop , paragraph .paragraph_format .left_indent ):
62- ahb_row_dataframe . iat [ row_index , column_index ] += splitted_text_at_tabs .pop (0 )
80+ add_text_to_column ( row_index , column_index , splitted_text_at_tabs .pop (0 ) )
6381 else :
6482 if tabstop == indicator_tabstop_position :
65- ahb_row_dataframe . iat [ row_index , column_index ] += splitted_text_at_tabs .pop (0 )
83+ add_text_to_column ( row_index , column_index , splitted_text_at_tabs .pop (0 ) )
6684
def handle_no_tab_stops(splitted_text_at_tabs: list[str], row_index: int) -> None:
    """Append the first remaining text piece to the "Beschreibung" column.

    Does nothing when ``splitted_text_at_tabs`` is empty. Otherwise the first
    element is removed from the list (the list is mutated) and handed to
    ``add_text_to_column`` for the given row.

    Args:
        splitted_text_at_tabs: Remaining text pieces; the first one is consumed.
        row_index: Positional row index of the target cell.
    """
    if not splitted_text_at_tabs:
        return

    column_index = ahb_row_dataframe.columns.get_loc("Beschreibung")
    # get_loc can also return a slice or a boolean mask when the label is not
    # unique; add_text_to_column needs a plain positional int, so narrow it here.
    assert isinstance(column_index, int)
    add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
7090
7191 cell_is_empty = self .table_cell .paragraphs [0 ].text == ""
7292 if cell_is_empty :
0 commit comments