Skip to content

Commit ca3b885

Browse files
hf-kklein, Konstantin, hf-krechan, Copilot
authored
feat: normalize/sanitize Anwendungsfall.beschreibung (#176)
* feat: normalize/sanitize `Anwendungsfall.beschreibung` fixes #172 * fix: typo Co-authored-by: Copilot <[email protected]> * bump sm --------- Co-authored-by: Konstantin <[email protected]> Co-authored-by: kevin <[email protected]> Co-authored-by: Copilot <[email protected]>
1 parent e0af1af commit ca3b885

File tree

6 files changed

+522
-5
lines changed

6 files changed

+522
-5
lines changed

src/fundamend/reader/ahbreader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
_is_segment_group,
3232
_is_uebertragungsdatei,
3333
)
34-
from fundamend.utils import lstrip, strip
34+
from fundamend.utils import lstrip, remove_linebreaks_and_hyphens, strip
3535

3636
# pylint:disable=duplicate-code
3737
# yes, it's very similar to the MigReader
@@ -252,7 +252,7 @@ def _read_anwendungsfall(self, original_element: ET.Element) -> Anwendungsfall:
252252
format_element = next((child for child in original_element[0] if child.tag.startswith("M_")))
253253
return Anwendungsfall(
254254
pruefidentifikator=original_element.attrib["Pruefidentifikator"],
255-
beschreibung=original_element.attrib["Beschreibung"].strip(),
255+
beschreibung=remove_linebreaks_and_hyphens(original_element.attrib["Beschreibung"]),
256256
kommunikation_von=original_element.attrib["Kommunikation_von"].strip(),
257257
format=EdifactFormat(lstrip("M_", format_element.tag)),
258258
elements=tuple(segments_and_groups),

src/fundamend/utils.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,31 @@ def strip(prefix: str, text: str, suffix: str) -> str:
5353
return lstrip(prefix, rstrip(text, suffix))
5454

5555

56+
# Replacement rules used by remove_linebreaks_and_hyphens; they are applied in insertion order.
# Order matters:
# * the hyphen rules come first, otherwise the plain line break rules would consume the line break
#   and leave a dangling hyphen behind
# * the "\r\n" variants come before the single character variants, because "\r\n" contains both of them
_replacements: dict[str, str] = {
    # a hyphen directly followed by a line break is a word that was split across two lines: re-join it
    "-\r\n": "",
    "-\n": "",
    "-\r": "",
    # any other line break separates two words, regardless of the line ending style
    "\r\n": " ",
    "\r": " ",
    "\n": " ",
}


def remove_linebreaks_and_hyphens(original: str) -> str:
    """
    Normalize a multi line string to a single line.

    The following steps are applied:
    1. A hyphen that is directly followed by a line break is removed together with the line break
       ("Foo-\\r\\nbar" becomes "Foobar").
    2. Every remaining line break ("\\r\\n", "\\r" or "\\n") is treated as a word separator
       ("Foo\\r\\nbar" becomes "Foo bar").
    3. Leading and trailing whitespace is removed and each run of whitespace is collapsed to a single space.

    Hyphens that are not directly followed by a line break are kept as they are.

    Args:
        original: The string to normalize.

    Returns:
        The normalized string.
    """
    result = original
    for old, new in _replacements.items():
        result = result.replace(old, new)
    # if you add more replacement rules, please also add a unit test in test_utils.py
    # str.split() without arguments drops leading/trailing whitespace and splits on whitespace runs,
    # so no separate strip() is necessary
    return " ".join(result.split())
79+
80+
5681
_UNIFIED_SEPARATOR = "/" # how multiple Marktrollen shall be split in the kommunikation_von attribute
5782
_ALTERNATIVE_SEPARATORS = [","] # other separators that are used in the wild
5883

@@ -118,4 +143,4 @@ def parse_kommunikation_von(kommunikation_von: Optional[str]) -> list[Kommunikat
118143
return result
119144

120145

121-
__all__ = ["lstrip", "rstrip", "strip", "parse_kommunikation_von"]
146+
__all__ = ["lstrip", "rstrip", "strip", "parse_kommunikation_von", "remove_linebreaks_and_hyphens"]

unittests/__snapshots__/test_ahbreader.ambr

Lines changed: 446 additions & 0 deletions
Large diffs are not rendered by default.

unittests/test_ahbreader.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from fundamend.models.anwendungshandbuch import Anwendungsfall, Anwendungshandbuch, Bedingung, Paket, UbBedingung
88
from fundamend.reader import AhbReader
99

10+
from .conftest import is_private_submodule_checked_out
11+
1012

1113
@pytest.mark.parametrize(
1214
"ahb_xml_file_path, expected_date",
@@ -193,3 +195,28 @@ def test_anwendungshandbuch_hashable(ahb_xml_file_path: Path) -> None:
193195
assert isinstance(hash_code, int)
194196
hash_collection = set()
195197
hash_collection.add(ahb)
198+
199+
200+
# root directory of the private submodule that contains the AHB/MIG XML files
_xml_submodule_root: Path = Path(__file__).parent.parent / "xml-migs-and-ahbs"


@pytest.mark.snapshot()
def test_sanitizing_all_awf_beschreibungen(snapshot: SnapshotAssertion) -> None:
    """
    Snapshot all distinct, sanitized Anwendungsfall descriptions.
    This makes any change to the sanitization code visible as a snapshot diff.
    """
    if not is_private_submodule_checked_out():
        pytest.skip("Skipping test because of missing private submodule")
    # collect the beschreibung of every Anwendungsfall that is not outdated, across all AHB files
    distinct_beschreibungen: set[str] = {
        anwendungsfall.beschreibung
        for xml_path in _xml_submodule_root.rglob("**/*AHB*.xml")
        for anwendungsfall in AhbReader(xml_path).read().anwendungsfaelle
        if not anwendungsfall.is_outdated
    }
    # If you're unhappy with any specific entry in this list, better write a new unit test case in
    # test_utils.py / test_anwendungsfall_beschreibung_normalization (add more parametrization).
    # If the snapshot test fails because of updated data, just run
    # tox -e snapshots
    # and commit the updated .ambr file.
    # sorting makes the snapshot independent of the set iteration order
    snapshot.assert_match(sorted(distinct_beschreibungen))

unittests/test_utils.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from fundamend import AhbReader
77
from fundamend.models.anwendungshandbuch import Anwendungsfall
88
from fundamend.models.kommunikationsrichtung import Kommunikationsrichtung
9-
from fundamend.utils import parse_kommunikation_von
9+
from fundamend.utils import parse_kommunikation_von, remove_linebreaks_and_hyphens
1010

1111
from .conftest import is_private_submodule_checked_out
1212

@@ -126,3 +126,22 @@ def test_parsing_all_kommunikation_von_there_is() -> None:
126126
if not isinstance(kommunikation_von, str):
127127
pytest.skip("Skipping test because 'Kommunikation Von' is not a string (anymore)")
128128
_ = parse_kommunikation_von(kommunikation_von) # must not crash
129+
130+
131+
@pytest.mark.parametrize(
    "original, expected",
    [
        # plain whitespace handling
        pytest.param("foo", "foo", id="no change"),
        pytest.param("foo ", "foo", id="trailing whitespace"),
        pytest.param(" foo", "foo", id="leading whitespace"),
        pytest.param(" foo ", "foo", id="trailing and leading whitespaces"),
        pytest.param(" foo\r\n ", "foo", id="trailing and leading whitespaces and line break"),
        # hyphen requirements discussed here:
        # https://github.com/Hochfrequenz/xml-fundamend-python/issues/172#issue-3427724092
        pytest.param(" Foo-\r\nbar ", "Foobar", id="hyphen with line break"),
        pytest.param(" Foo\r\n and bar ", "Foo and bar", id="line break w/o hyphen"),
    ],
)
def test_anwendungsfall_beschreibung_normalization(original: str, expected: str) -> None:
    """Each raw description must be normalized to exactly the expected single line string."""
    assert remove_linebreaks_and_hyphens(original) == expected

xml-migs-and-ahbs

0 commit comments

Comments
 (0)