feat: normalize/sanitize Anwendungsfall.beschreibung (#176)

hf-kklein · Konstantin · hf-krechan · web-flow · commit ca3b88533511 · 2025-10-15T09:29:51.000Z
* feat: normalize/sanitize `Anwendungsfall.beschreibung` fixes #172 * fix: typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * bump sm --------- Co-authored-by: Konstantin <konstantin.klein+github@hochfrequenz.de> Co-authored-by: kevin <68426071+hf-krechan@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/src/fundamend/reader/ahbreader.py b/src/fundamend/reader/ahbreader.py
@@ -31,7 +31,7 @@
     _is_segment_group,
     _is_uebertragungsdatei,
 )
-from fundamend.utils import lstrip, strip
+from fundamend.utils import lstrip, remove_linebreaks_and_hyphens, strip
 
 # pylint:disable=duplicate-code
 # yes, it's very similar to the MigReader
@@ -252,7 +252,7 @@ def _read_anwendungsfall(self, original_element: ET.Element) -> Anwendungsfall:
             format_element = next((child for child in original_element[0] if child.tag.startswith("M_")))
         return Anwendungsfall(
             pruefidentifikator=original_element.attrib["Pruefidentifikator"],
-            beschreibung=original_element.attrib["Beschreibung"].strip(),
+            beschreibung=remove_linebreaks_and_hyphens(original_element.attrib["Beschreibung"]),
             kommunikation_von=original_element.attrib["Kommunikation_von"].strip(),
             format=EdifactFormat(lstrip("M_", format_element.tag)),
             elements=tuple(segments_and_groups),
diff --git a/src/fundamend/utils.py b/src/fundamend/utils.py
@@ -53,6 +53,31 @@ def strip(prefix: str, text: str, suffix: str) -> str:
     return lstrip(prefix, rstrip(text, suffix))
 
 
+_replacements: dict[str, str] = {
+    "-\r\n": "",
+    "\r\n": " ",
+    "\r": "",
+    "\n": "",
+}
+
+
+def remove_linebreaks_and_hyphens(original: str) -> str:
+    """
+    Normalize a multi-line string: undo hyphenation at CRLF line breaks, remove line breaks, collapse/strip whitespace.
+
+    Args:
+        original: The string to normalize.
+
+    Returns:
+        The normalized string.
+    """
+    result = original
+    for old, new in _replacements.items():
+        result = result.replace(old, new)
+    # if you add more replacement rules, please also add a unit test in test_utils.py
+    return " ".join(result.strip().split())
+
+
 _UNIFIED_SEPARATOR = "/"  # how multiple Marktrollen shall be split in the kommunikation_von attribute
 _ALTERNATIVE_SEPARATORS = [","]  # other separators that are used in the wild
 
@@ -118,4 +143,4 @@ def parse_kommunikation_von(kommunikation_von: Optional[str]) -> list[Kommunikat
     return result
 
 
-__all__ = ["lstrip", "rstrip", "strip", "parse_kommunikation_von"]
+__all__ = ["lstrip", "rstrip", "strip", "parse_kommunikation_von", "remove_linebreaks_and_hyphens"]
diff --git a/unittests/__snapshots__/test_ahbreader.ambr b/unittests/__snapshots__/test_ahbreader.ambr
diff --git a/unittests/test_ahbreader.py b/unittests/test_ahbreader.py
@@ -7,6 +7,8 @@
 from fundamend.models.anwendungshandbuch import Anwendungsfall, Anwendungshandbuch, Bedingung, Paket, UbBedingung
 from fundamend.reader import AhbReader
 
+from .conftest import is_private_submodule_checked_out
+
 
 @pytest.mark.parametrize(
     "ahb_xml_file_path, expected_date",
@@ -193,3 +195,28 @@ def test_anwendungshandbuch_hashable(ahb_xml_file_path: Path) -> None:
     assert isinstance(hash_code, int)
     hash_collection = set()
     hash_collection.add(ahb)
+
+
+_xml_submodule_root: Path = Path(__file__).parent.parent / "xml-migs-and-ahbs"
+
+
+@pytest.mark.snapshot()
+def test_sanitizing_all_awf_beschreibungen(snapshot: SnapshotAssertion) -> None:
+    """this test makes changes to the sanitization code visible."""
+    if not is_private_submodule_checked_out():
+        pytest.skip("Skipping test because of missing private submodule")
+    all_sanitized_awf_beschreibungen: set[str] = set()
+    for ahb_file_path in _xml_submodule_root.rglob("**/*AHB*.xml"):
+        reader = AhbReader(ahb_file_path)
+        ahb = reader.read()
+        for awf in ahb.anwendungsfaelle:
+            if awf.is_outdated:
+                continue
+            all_sanitized_awf_beschreibungen.add(awf.beschreibung)
+    # If you're unhappy with any specific entry in this list, better write a new unit test case in
+    # test_utils.py / test_anwendungsfall_beschreibung_normalization (add more parametrization).
+    # If the snapshot test fails because of updated data, just run
+    # tox -e snapshots
+    # and commit the updated .ambr file.
+    distinct_beschreibungen_as_list = list(sorted(all_sanitized_awf_beschreibungen))
+    snapshot.assert_match(distinct_beschreibungen_as_list)
diff --git a/unittests/test_utils.py b/unittests/test_utils.py
@@ -6,7 +6,7 @@
 from fundamend import AhbReader
 from fundamend.models.anwendungshandbuch import Anwendungsfall
 from fundamend.models.kommunikationsrichtung import Kommunikationsrichtung
-from fundamend.utils import parse_kommunikation_von
+from fundamend.utils import parse_kommunikation_von, remove_linebreaks_and_hyphens
 
 from .conftest import is_private_submodule_checked_out
 
@@ -126,3 +126,22 @@ def test_parsing_all_kommunikation_von_there_is() -> None:
         if not isinstance(kommunikation_von, str):
             pytest.skip("Skipping test because 'Kommunikation Von' is not a string (anymore)")
         _ = parse_kommunikation_von(kommunikation_von)  # must not crash
+
+
+@pytest.mark.parametrize(
+    "original, expected",
+    [
+        pytest.param("foo", "foo", id="no change"),
+        pytest.param("foo ", "foo", id="trailing whitespace"),
+        pytest.param(" foo", "foo", id="leading whitespace"),
+        pytest.param(" foo ", "foo", id="trailing and leading whitespaces"),
+        pytest.param(" foo\r\n ", "foo", id="trailing and leading whitespaces and line break"),
+        # hyphen requirements discussed here:
+        # https://github.com/Hochfrequenz/xml-fundamend-python/issues/172#issue-3427724092
+        pytest.param(" Foo-\r\nbar ", "Foobar", id="hyphen with line break"),
+        pytest.param(" Foo\r\n and bar ", "Foo and bar", id="line break w/o hyphen"),
+    ],
+)
+def test_anwendungsfall_beschreibung_normalization(original: str, expected: str) -> None:
+    actual = remove_linebreaks_and_hyphens(original)
+    assert actual == expected
diff --git a/xml-migs-and-ahbs b/xml-migs-and-ahbs
@@ -1 +1 @@
-Subproject commit 2e3fc58491de1e99a9f9407a959b3a6b503c6994
+Subproject commit c5d26876f1657b305995d0cb6c0d32a3f46b2abb