11
11
from kohlrahbi .table_header import get_tabstop_positions
12
12
13
13
INDEX_OF_CODES_AND_QUALIFIER_COLUMN = 4
14
# Text fragments that are appended to an existing cell WITHOUT a separating
# space when the new text starts with one of them (see add_text_to_column).
# NOTE(review): presumably these are tails of words that were split across
# paragraphs in the source document -- confirm with the author.
# This list is temporary and deliberately incomplete: it only filters a few
# known cases and is not meant to be any kind of NLP.
KNOW_SUFFIXES = {
    "g",
    "m",
    "n",
    "t",
    "gs-",
    "ung",
    "rage",
    "vall",
    "sgrund",
}
14
25
15
26
16
27
class BodyCell (BaseModel ):
@@ -38,6 +49,13 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
38
49
tabstop_positions (list[int]): All tabstop positions of the indicator middle cell
39
50
"""
40
51
52
def add_text_to_column(row_index: int, column_index: int, text: str) -> None:
    """Append ``text`` to one cell of the enclosing ``ahb_row_dataframe``.

    Empty ``text`` leaves the cell untouched. Otherwise ``text`` is appended
    to the cell at the given integer position. A single space is put between
    the existing content and ``text`` only if the cell already has content
    and ``text`` does not start with one of the ``KNOW_SUFFIXES`` entries.

    Args:
        row_index: Positional row index of the target cell.
        column_index: Positional column index of the target cell.
        text: The text to append.
    """
    if not text:
        return

    cell_has_content = len(ahb_row_dataframe.iat[row_index, column_index]) > 0
    # str.startswith accepts a tuple of candidates, so one call covers all entries.
    is_glued_to_previous_text = text.startswith(tuple(KNOW_SUFFIXES))

    separator = " " if cell_has_content and not is_glued_to_previous_text else ""
    ahb_row_dataframe.iat[row_index, column_index] += separator + text
58
+
41
59
def handle_code_or_qualifier_entry (
42
60
splitted_text_at_tabs : list [str ], row_index : int , is_first_iteration : bool
43
61
) -> int :
@@ -48,7 +66,7 @@ def handle_code_or_qualifier_entry(
48
66
if not is_first_iteration :
49
67
ahb_row_dataframe .loc [ahb_row_dataframe .index .max () + 1 , :] = ""
50
68
row_index += 1
51
- ahb_row_dataframe . iat [ row_index , INDEX_OF_CODES_AND_QUALIFIER_COLUMN ] += splitted_text_at_tabs .pop (0 )
69
+ add_text_to_column ( row_index , INDEX_OF_CODES_AND_QUALIFIER_COLUMN , splitted_text_at_tabs .pop (0 ) )
52
70
return row_index
53
71
54
72
def handle_tab_stops (
@@ -59,14 +77,16 @@ def handle_tab_stops(
59
77
for indicator_tabstop_position , column_index in zip (self .indicator_tabstop_positions , column_indezes ):
60
78
if len (tab_stops_in_current_paragraph ) == 1 :
61
79
if indicator_tabstop_position in (tabstop , paragraph .paragraph_format .left_indent ):
62
- ahb_row_dataframe . iat [ row_index , column_index ] += splitted_text_at_tabs .pop (0 )
80
+ add_text_to_column ( row_index , column_index , splitted_text_at_tabs .pop (0 ) )
63
81
else :
64
82
if tabstop == indicator_tabstop_position :
65
- ahb_row_dataframe . iat [ row_index , column_index ] += splitted_text_at_tabs .pop (0 )
83
+ add_text_to_column ( row_index , column_index , splitted_text_at_tabs .pop (0 ) )
66
84
67
85
def handle_no_tab_stops(splitted_text_at_tabs: list[str], row_index: int) -> None:
    """Append the first text fragment to the "Beschreibung" column of a row.

    Does nothing when ``splitted_text_at_tabs`` is empty. Otherwise the first
    element is removed from the list (the list is mutated) and handed to
    ``add_text_to_column`` for the given row.

    Args:
        splitted_text_at_tabs: Remaining text fragments; the first one is consumed.
        row_index: Positional row index of the row to write to.
    """
    if not splitted_text_at_tabs:
        return

    description_column_index = ahb_row_dataframe.columns.get_loc("Beschreibung")
    # get_loc may return a slice or mask for non-unique labels; narrow to int
    # before using it as a positional index.
    assert isinstance(description_column_index, int)

    first_fragment = splitted_text_at_tabs.pop(0)
    add_text_to_column(row_index, description_column_index, first_fragment)
70
90
71
91
cell_is_empty = self .table_cell .paragraphs [0 ].text == ""
72
92
if cell_is_empty :
0 commit comments