Skip to content

Commit fb1e8d3

Browse files
authored
fix(parsing): mixed columns (#474)
fix mixed columns and some whitespace problems
1 parent cb228cb commit fb1e8d3

File tree

2 files changed

+196
-194
lines changed

2 files changed

+196
-194
lines changed

src/kohlrahbi/docxtablecells/bodycell.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
5050
"""
5151

5252
def add_text_to_column(row_index: int, column_index: int, text: str) -> None:
53-
starts_with_known_suffix = any(text.startswith(suffix) for suffix in KNOW_SUFFIXES)
53+
starts_with_known_suffix = any(text.startswith(suffix + " ") for suffix in KNOW_SUFFIXES)
5454
if len(text) > 0:
5555
if len(ahb_row_dataframe.iat[row_index, column_index]) > 0 and not starts_with_known_suffix:
5656
text = " " + text
@@ -73,14 +73,16 @@ def handle_tab_stops(
7373
paragraph: Paragraph, splitted_text_at_tabs: list[str], row_index: int, column_indezes: list[int]
7474
) -> None:
7575
tab_stops_in_current_paragraph = get_tabstop_positions(paragraph=paragraph)
76+
if len(tab_stops_in_current_paragraph) == len(splitted_text_at_tabs) - 1:
77+
# we have remaining parts from a qualifier or code
78+
tab_stops_in_current_paragraph = [
79+
paragraph.paragraph_format.left_indent,
80+
*tab_stops_in_current_paragraph,
81+
]
7682
for tabstop in tab_stops_in_current_paragraph:
7783
for indicator_tabstop_position, column_index in zip(self.indicator_tabstop_positions, column_indezes):
78-
if len(tab_stops_in_current_paragraph) == 1:
79-
if indicator_tabstop_position in (tabstop, paragraph.paragraph_format.left_indent):
80-
add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
81-
else:
82-
if tabstop == indicator_tabstop_position:
83-
add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
84+
if tabstop == indicator_tabstop_position:
85+
add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
8486

8587
def handle_no_tab_stops(splitted_text_at_tabs: list[str], row_index: int) -> None:
8688
if splitted_text_at_tabs:

0 commit comments

Comments
 (0)