Skip to content

Commit 2b94ffd

Browse files
authored
style(whitespaces in text/new lines): Make text prettier (#447)
* WIP * Added comments * fixed test * make known suffixes a set * updated snapshots
1 parent 2456f60 commit 2b94ffd

File tree

3 files changed

+2561
-2541
lines changed

3 files changed

+2561
-2541
lines changed

src/kohlrahbi/docxtablecells/bodycell.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,17 @@
1111
from kohlrahbi.table_header import get_tabstop_positions
1212

1313
INDEX_OF_CODES_AND_QUALIFIER_COLUMN = 4
14+
# Text fragments that are treated as the continuation of the text already
# stored in a cell: a chunk starting with one of these is appended without a
# separating space. Only a temporary and incomplete list to filter some
# cases; it is not intended as NLP.
KNOW_SUFFIXES = {
    "g", "n", "m", "t",
    "ung", "gs-",
    "vall", "rage",
    "sgrund",
}
1425

1526

1627
class BodyCell(BaseModel):
@@ -38,6 +49,13 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
3849
tabstop_positions (list[int]): All tabstop positions of the indicator middle cell
3950
"""
4051

52+
def add_text_to_column(row_index: int, column_index: int, text: str) -> None:
    """Append ``text`` to one cell of ``ahb_row_dataframe``.

    Empty ``text`` leaves the cell untouched. When the cell already holds
    content, a single space is put in front of ``text`` unless ``text``
    begins with one of the entries in ``KNOW_SUFFIXES``; such text is
    appended directly.

    Args:
        row_index: Positional row index of the target cell.
        column_index: Positional column index of the target cell.
        text: The text chunk to append.
    """
    if not text:
        return

    # NOTE: this is a plain prefix match, so any chunk that merely begins
    # with a listed fragment (including the single letters) is glued on
    # without a space.
    continues_previous_text = text.startswith(tuple(KNOW_SUFFIXES))
    cell_has_content = len(ahb_row_dataframe.iat[row_index, column_index]) > 0

    separator = " " if cell_has_content and not continues_previous_text else ""
    ahb_row_dataframe.iat[row_index, column_index] += separator + text
58+
4159
def handle_code_or_qualifier_entry(
4260
splitted_text_at_tabs: list[str], row_index: int, is_first_iteration: bool
4361
) -> int:
@@ -48,7 +66,7 @@ def handle_code_or_qualifier_entry(
4866
if not is_first_iteration:
4967
ahb_row_dataframe.loc[ahb_row_dataframe.index.max() + 1, :] = ""
5068
row_index += 1
51-
ahb_row_dataframe.iat[row_index, INDEX_OF_CODES_AND_QUALIFIER_COLUMN] += splitted_text_at_tabs.pop(0)
69+
add_text_to_column(row_index, INDEX_OF_CODES_AND_QUALIFIER_COLUMN, splitted_text_at_tabs.pop(0))
5270
return row_index
5371

5472
def handle_tab_stops(
@@ -59,14 +77,16 @@ def handle_tab_stops(
5977
for indicator_tabstop_position, column_index in zip(self.indicator_tabstop_positions, column_indezes):
6078
if len(tab_stops_in_current_paragraph) == 1:
6179
if indicator_tabstop_position in (tabstop, paragraph.paragraph_format.left_indent):
62-
ahb_row_dataframe.iat[row_index, column_index] += splitted_text_at_tabs.pop(0)
80+
add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
6381
else:
6482
if tabstop == indicator_tabstop_position:
65-
ahb_row_dataframe.iat[row_index, column_index] += splitted_text_at_tabs.pop(0)
83+
add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
6684

6785
def handle_no_tab_stops(splitted_text_at_tabs: list[str], row_index: int) -> None:
    """Append the next text chunk to the "Beschreibung" column of the given row.

    Does nothing when ``splitted_text_at_tabs`` is empty; otherwise the first
    element is removed from the list and handed to ``add_text_to_column``.

    Args:
        splitted_text_at_tabs: Remaining text chunks; consumed from the front.
        row_index: Positional row index of the row to write to.
    """
    if not splitted_text_at_tabs:
        return

    description_column = ahb_row_dataframe.columns.get_loc("Beschreibung")
    # Narrow the lookup result to a plain int before using it as a
    # positional column index.
    assert isinstance(description_column, int)
    add_text_to_column(row_index, description_column, splitted_text_at_tabs.pop(0))
7090

7191
cell_is_empty = self.table_cell.paragraphs[0].text == ""
7292
if cell_is_empty:

0 commit comments

Comments
 (0)