33"""
44
55import logging
6+ import re
67import sys
78from pathlib import Path
89from typing import Any , Tuple , TypeAlias
@@ -183,6 +184,15 @@ def create_row(
183184 return row
184185
185186
187+ def normalize (value : str | None ) -> str :
188+ """
189+ normalizes strings like `Segmentname` values by removing all whitespaces, tabs, newlines, etc.
190+ """
191+ if value is None :
192+ return ""
193+ return re .sub (r"\s+" , "" , value )
194+
195+
186196# pylint:disable=too-many-branches, too-many-statements
187197def align_columns (
188198 previous_pruefid : DataFrame ,
@@ -192,6 +202,7 @@ def align_columns(
192202) -> DataFrame :
193203 """
194204 aligns `Segmentname` columns by adding empty cells each time the cell values do not match.
205+ during comparison, whitespaces are removed while preserving original values for the output.
195206 """
196207
197208 default_column_order = [
@@ -282,6 +293,17 @@ def align_columns(
282293 result_df = pd .DataFrame (result_rows )
283294 return result_df [column_order ]
284295
296+ # normalize `Segmentname` columns values by removing any whitespace
297+ segments_of_previous_formatversion_normalized = [
298+ normalize (s ) if isinstance (s , str ) else s
299+ for s in df_of_previous_formatversion [f"Segmentname_{ previous_formatversion } " ].tolist ()
300+ ]
301+ segments_of_subsequent_formatversion_normalized = [
302+ normalize (s ) if isinstance (s , str ) else s
303+ for s in df_of_subsequent_formatversion [f"Segmentname_{ subsequent_formatversion } " ].tolist ()
304+ ]
305+
306+ # keep original `Segmentname` values for output
285307 segments_of_previous_formatversion = df_of_previous_formatversion [f"Segmentname_{ previous_formatversion } " ].tolist ()
286308 segments_of_subsequent_formatversion = df_of_subsequent_formatversion [
287309 f"Segmentname_{ subsequent_formatversion } "
@@ -317,7 +339,7 @@ def align_columns(
317339 row ["changed_entries" ] = ""
318340 result_rows .append (row )
319341 i += 1
320- elif segments_of_previous_formatversion [i ] == segments_of_subsequent_formatversion [j ]:
342+ elif segments_of_previous_formatversion_normalized [i ] == segments_of_subsequent_formatversion_normalized [j ]:
321343 row = create_row (
322344 previous_df = df_of_previous_formatversion ,
323345 subsequent_df = df_of_subsequent_formatversion ,
@@ -341,8 +363,8 @@ def align_columns(
341363 prev_val = str (df_of_previous_formatversion .iloc [i ][col ])
342364 subs_val = str (df_of_subsequent_formatversion .iloc [j ][col ])
343365
344- # only consider cells/entries that are not empty for both formatversions.
345- if prev_val .strip () and subs_val .strip () and prev_val != subs_val :
366+ # consider a change when (1) at least one value is non-empty AND (2) the values are different
367+ if ( prev_val .strip () or subs_val .strip () ) and prev_val != subs_val :
346368 has_changes = True
347369 changed_entries .extend ([f"{ col } _{ previous_formatversion } " , f"{ col } _{ subsequent_formatversion } " ])
348370
@@ -354,19 +376,27 @@ def align_columns(
354376 else :
355377 try :
356378 # try to find next matching value.
357- next_match = segments_of_subsequent_formatversion [j :].index (segments_of_previous_formatversion [i ])
358- for k in range (next_match ):
359- row = create_row (
360- previous_df = df_of_previous_formatversion ,
361- subsequent_df = df_of_subsequent_formatversion ,
362- j = j + k ,
363- previous_formatversion = previous_formatversion ,
364- subsequent_formatversion = subsequent_formatversion ,
365- )
366- row ["Änderung" ] = "NEU"
367- row ["changed_entries" ] = ""
368- result_rows .append (row )
369- j += next_match
379+ next_match = - 1
380+ for k , subsequent_value in enumerate (segments_of_subsequent_formatversion_normalized [j :], start = j ):
381+ if subsequent_value == segments_of_previous_formatversion_normalized [i ]:
382+ next_match = k - j
383+ break
384+
385+ if next_match >= 0 :
386+ for k in range (next_match ):
387+ row = create_row (
388+ previous_df = df_of_previous_formatversion ,
389+ subsequent_df = df_of_subsequent_formatversion ,
390+ j = j + k ,
391+ previous_formatversion = previous_formatversion ,
392+ subsequent_formatversion = subsequent_formatversion ,
393+ )
394+ row ["Änderung" ] = "NEU"
395+ row ["changed_entries" ] = ""
396+ result_rows .append (row )
397+ j += next_match
398+ else :
399+ raise ValueError ("no match found." )
370400 except ValueError :
371401 # no match found: add old value and empty new cell.
372402 row = create_row (
0 commit comments