Skip to content

Commit f561abb

Browse files
authored
fix: remove whitespace during comparison to bypass scraping mismatches (#28)
* Remove whitespace during `Segmentname` comparison
* Add tests covering `Segmentname` comparison with arbitrary whitespaces
* Outsource regex into `normalize()` function
* Add corresponding tests for `\n`ewline and `\t`ab
1 parent ce49219 commit f561abb

File tree

2 files changed

+331
-16
lines changed

2 files changed

+331
-16
lines changed

src/ahlbatross/main.py

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import logging
6+
import re
67
import sys
78
from pathlib import Path
89
from typing import Any, Tuple, TypeAlias
@@ -183,6 +184,15 @@ def create_row(
183184
return row
184185

185186

187+
def normalize(value: str | None) -> str:
188+
"""
189+
normalizes strings like `Segmentname` values by removing all whitespaces, tabs, newlines, etc.
190+
"""
191+
if value is None:
192+
return ""
193+
return re.sub(r"\s+", "", value)
194+
195+
186196
# pylint:disable=too-many-branches, too-many-statements
187197
def align_columns(
188198
previous_pruefid: DataFrame,
@@ -192,6 +202,7 @@ def align_columns(
192202
) -> DataFrame:
193203
"""
194204
aligns `Segmentname` columns by adding empty cells each time the cell values do not match.
205+
during comparison, whitespaces are removed while preserving original values for the output.
195206
"""
196207

197208
default_column_order = [
@@ -282,6 +293,17 @@ def align_columns(
282293
result_df = pd.DataFrame(result_rows)
283294
return result_df[column_order]
284295

296+
# normalize `Segmentname` columns values by removing any whitespace
297+
segments_of_previous_formatversion_normalized = [
298+
normalize(s) if isinstance(s, str) else s
299+
for s in df_of_previous_formatversion[f"Segmentname_{previous_formatversion}"].tolist()
300+
]
301+
segments_of_subsequent_formatversion_normalized = [
302+
normalize(s) if isinstance(s, str) else s
303+
for s in df_of_subsequent_formatversion[f"Segmentname_{subsequent_formatversion}"].tolist()
304+
]
305+
306+
# keep original `Segmentname` values for output
285307
segments_of_previous_formatversion = df_of_previous_formatversion[f"Segmentname_{previous_formatversion}"].tolist()
286308
segments_of_subsequent_formatversion = df_of_subsequent_formatversion[
287309
f"Segmentname_{subsequent_formatversion}"
@@ -317,7 +339,7 @@ def align_columns(
317339
row["changed_entries"] = ""
318340
result_rows.append(row)
319341
i += 1
320-
elif segments_of_previous_formatversion[i] == segments_of_subsequent_formatversion[j]:
342+
elif segments_of_previous_formatversion_normalized[i] == segments_of_subsequent_formatversion_normalized[j]:
321343
row = create_row(
322344
previous_df=df_of_previous_formatversion,
323345
subsequent_df=df_of_subsequent_formatversion,
@@ -341,8 +363,8 @@ def align_columns(
341363
prev_val = str(df_of_previous_formatversion.iloc[i][col])
342364
subs_val = str(df_of_subsequent_formatversion.iloc[j][col])
343365

344-
# only consider cells/entries that are not empty for both formatversions.
345-
if prev_val.strip() and subs_val.strip() and prev_val != subs_val:
366+
# consider a change when (1) at least one value is non-empty AND (2) the values are different
367+
if (prev_val.strip() or subs_val.strip()) and prev_val != subs_val:
346368
has_changes = True
347369
changed_entries.extend([f"{col}_{previous_formatversion}", f"{col}_{subsequent_formatversion}"])
348370

@@ -354,19 +376,27 @@ def align_columns(
354376
else:
355377
try:
356378
# try to find next matching value.
357-
next_match = segments_of_subsequent_formatversion[j:].index(segments_of_previous_formatversion[i])
358-
for k in range(next_match):
359-
row = create_row(
360-
previous_df=df_of_previous_formatversion,
361-
subsequent_df=df_of_subsequent_formatversion,
362-
j=j + k,
363-
previous_formatversion=previous_formatversion,
364-
subsequent_formatversion=subsequent_formatversion,
365-
)
366-
row["Änderung"] = "NEU"
367-
row["changed_entries"] = ""
368-
result_rows.append(row)
369-
j += next_match
379+
next_match = -1
380+
for k, subsequent_value in enumerate(segments_of_subsequent_formatversion_normalized[j:], start=j):
381+
if subsequent_value == segments_of_previous_formatversion_normalized[i]:
382+
next_match = k - j
383+
break
384+
385+
if next_match >= 0:
386+
for k in range(next_match):
387+
row = create_row(
388+
previous_df=df_of_previous_formatversion,
389+
subsequent_df=df_of_subsequent_formatversion,
390+
j=j + k,
391+
previous_formatversion=previous_formatversion,
392+
subsequent_formatversion=subsequent_formatversion,
393+
)
394+
row["Änderung"] = "NEU"
395+
row["changed_entries"] = ""
396+
result_rows.append(row)
397+
j += next_match
398+
else:
399+
raise ValueError("no match found.")
370400
except ValueError:
371401
# no match found: add old value and empty new cell.
372402
row = create_row(

0 commit comments

Comments
 (0)