Skip to content

Commit 422c6c9

Browse files
fix(file filter): find latest document (#488)
* changed filter for most recent files * edit test to match more realistic file names * modified filter fct * refactor __lt__ function --------- Co-authored-by: kevin <[email protected]>
1 parent 4fc231e commit 422c6c9

File tree

3 files changed

+208
-56
lines changed

3 files changed

+208
-56
lines changed

src/kohlrahbi/ahb/command.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def validate_path(ctx, param, value) -> Path: # type:ignore[no-untyped-def]
8888
default=False,
8989
help="Confirm all prompts automatically.",
9090
)
91-
# pylint: disable=too-many-positional-arguments, too-many-arguments
91+
# pylint: disable=too-many-arguments, too-many-positional-arguments
9292
def ahb(
9393
pruefis: list[str],
9494
edi_energy_mirror_path: Path,

src/kohlrahbi/docxfilefinder.py

+126-28
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
This module contains the DocxFileFinder class.
33
"""
44

5+
import re
56
from itertools import groupby
67
from pathlib import Path
78

@@ -11,6 +12,127 @@
1112
from kohlrahbi.logger import logger
1213

1314

15+
class EdiEnergyDocument(BaseModel):
    """
    This class represents an EDI Energy document.

    All fields except ``filename`` are parsed from the file name by ``from_path``.
    Instances are ordered by
    ``(valid_from, valid_until, version_major, version_minor, version_suffix)``,
    so ``max()`` over a collection of documents yields the most recent one.
    """

    # Path the document was created from (stored unchanged).
    filename: Path
    # Version string as captured from the file name by the "document_version" group.
    document_version: str
    # Numeric parts of the version, converted to int so they compare numerically.
    version_major: int
    version_minor: int
    # Value of the "version_suffix" group; presumably a single letter or "" - confirm against the pattern.
    version_suffix: str
    # Validity dates from the file name, stored as integers.
    # NOTE(review): presumably YYYYMMDD, which would make integer order match date order - confirm.
    valid_from: int
    valid_until: int

    @classmethod
    def from_path(cls, path: Path) -> "EdiEnergyDocument":
        """
        Create an EdiEnergyDocument object from a file path.

        Args:
            path (Path): Path to the document; only ``path.name`` is parsed.

        Returns:
            EdiEnergyDocument: The document described by the file name.

        Raises:
            ValueError: If version and validity dates cannot be extracted from the file name.
        """
        name_parts = extract_document_version_and_valid_dates(path.name)
        if name_parts is None:
            # Raise explicitly instead of using ``assert``: asserts are stripped under ``python -O``,
            # which would turn this into an obscure TypeError on the subscripts below.
            raise ValueError(f"Could not extract document version and valid dates from {path.name}.")

        return cls(
            filename=path,
            document_version=name_parts["document_version"],
            version_major=int(name_parts["version_major"]),
            version_minor=int(name_parts["version_minor"]),
            version_suffix=name_parts["version_suffix"],
            valid_from=int(name_parts["valid_from"]),
            valid_until=int(name_parts["valid_until"]),
        )

    def __lt__(self, other: "EdiEnergyDocument") -> bool:
        """
        Compare two EdiEnergyDocument instances.

        The comparison is a lexicographic tuple comparison: the first differing element decides.
        The elements are compared in this order:

        1. valid_from
        2. valid_until
        3. version_major
        4. version_minor
        5. version_suffix

        Args:
            other (EdiEnergyDocument): The other document to compare against.

        Returns:
            bool: True if this document is considered less than the other document, False otherwise.
        """
        own_key = (
            self.valid_from,
            self.valid_until,
            self.version_major,
            self.version_minor,
            self.version_suffix,
        )
        other_key = (
            other.valid_from,
            other.valid_until,
            other.version_major,
            other.version_minor,
            other.version_suffix,
        )
        return own_key < other_key
73+
74+
75+
def extract_document_version_and_valid_dates(
76+
filename: str,
77+
) -> dict[str, str] | None:
78+
"""Extract the document version and valid dates from the filename.
79+
80+
Parameters:
81+
- filename (str): The filename of the document.
82+
83+
Returns:
84+
- tuple[str, str, str]: A tuple containing the document version, valid from date, and valid until date.
85+
"""
86+
87+
# Pattern to extract detailed version number, valid until and valid from dates
88+
document_name_pattern = re.compile(
89+
r"-informatorischeLesefassung"
90+
r"(?P<document_version>(?:S|G)?(?P<version_major>\d+)\.(?P<version_minor>\d+)(?P<version_suffix>[a-z]?))"
91+
r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)?"
92+
r"([A-Za-z0-9.]+)?"
93+
r"_(?P<valid_until>\d{8})_(?P<valid_from>\d{8})\.docx$",
94+
re.IGNORECASE,
95+
)
96+
matches = document_name_pattern.search(filename)
97+
try:
98+
if matches:
99+
return matches.groupdict()
100+
except ValueError as e:
101+
logger.error("Error extracting document version and valid dates: %s", e)
102+
return None
103+
104+
105+
def get_most_recent_file(group_items: list[Path]) -> Path | None:
    """
    Find the most recent file in a group of files based on specific criteria.

    Files whose (lower-cased) name contains one of the keywords are preferred: if at least one such
    file exists, only those files are considered; otherwise all files of the group are considered.
    Among the considered files the maximum according to the EdiEnergyDocument ordering is returned.

    Parameters:
    - group_items (List[Path]): A list of Path objects representing the file paths.

    Returns:
    - Path | None: A Path object representing the most recent file, or None if a ValueError occurred
      while processing the group (e.g. the group is empty); the error is logged.
    """

    # Define the keywords to filter relevant files
    keywords = ("konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung")
    files_containing_keywords = [
        path for path in group_items if any(keyword in path.name.lower() for keyword in keywords)
    ]
    # Fall back to the whole group when no file name contains a keyword.
    candidates = files_containing_keywords or group_items

    try:
        # max() raises ValueError for an empty sequence.
        most_recent_document = max(EdiEnergyDocument.from_path(path) for path in candidates)
    except ValueError as e:
        logger.error("Error processing group items: %s", e)
        return None

    return most_recent_document.filename
134+
135+
14136
class DocxFileFinder(BaseModel):
15137
"""
16138
This class is responsible for finding the docx files in the input directory.
@@ -118,36 +240,12 @@ def filter_latest_version(groups: dict[str, list[Path]]) -> list[Path]:
118240
Returns:
119241
- List[Path]: A list of Path objects representing the latest version of the files.
120242
"""
121-
result = []
243+
result: list[Path] = []
122244

123245
for group_items in groups.values():
124-
if len(group_items) == 1:
125-
result.append(group_items[0])
126-
else:
127-
try:
128-
# Define the keywords to filter relevant files
129-
keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"]
130-
131-
# Find the most recent file based on keywords and date suffixes
132-
most_recent_file = max(
133-
(path for path in group_items if any(keyword in path.name.lower() for keyword in keywords)),
134-
key=lambda path: (
135-
int(path.stem.split("_")[-1]), # "gültig von" date
136-
int(path.stem.split("_")[-2]), # "gültig bis" date
137-
),
138-
)
139-
140-
# Add the most recent file to the result and log ignored files
141-
for path in group_items:
142-
if path != most_recent_file:
143-
logger.debug("Ignoring file %s", path.name)
144-
else:
145-
result.append(most_recent_file)
146-
147-
except ValueError as e:
148-
logger.error("Error processing group items: %s", e)
149-
continue
150-
246+
most_recent_file = get_most_recent_file(group_items)
247+
assert most_recent_file is not None, "Could not find the most recent file."
248+
result.append(most_recent_file)
151249
return result
152250

153251
def filter_for_latest_mig_and_ahb_docx_files(self) -> None:

unittests/test_docxfilefinder.py

+81-27
Original file line numberDiff line numberDiff line change
@@ -1,63 +1,117 @@
1-
from pathlib import Path
1+
from pathlib import Path, PosixPath
22

33
import pytest
44

5-
from kohlrahbi.docxfilefinder import DocxFileFinder
5+
from kohlrahbi.docxfilefinder import DocxFileFinder, get_most_recent_file
66

77

88
class TestDocxFileFinder:
99
@pytest.mark.parametrize(
1010
["group_items", "expected"],
1111
[
1212
pytest.param(
13-
{
14-
"UTILTSAHB": [
15-
Path("UTILTSAHB_20240701_20240401.docx"),
16-
]
17-
},
18-
[Path("UTILTSAHB_20240701_20240401.docx")],
13+
{"UTILTSAHB": [Path("UTILTSAHB-informatorischeLesefassung4.0_20240701_20240401.docx")]},
14+
[Path("UTILTSAHB-informatorischeLesefassung4.0_20240701_20240401.docx")],
1915
id="Single File",
2016
),
2117
pytest.param(
2218
{
2319
"UTILTSAHB": [
24-
Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"),
25-
Path("UTILTSAHB-außerordentlicheveröffentlichung_20240701_20240501.docx"),
26-
Path("UTILTSAHB-außerordentlicheveröffentlichung_20240930_20240401.docx"),
27-
Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"),
28-
Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"),
20+
Path(
21+
"UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"
22+
),
23+
Path(
24+
"UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240701_20240501.docx"
25+
),
26+
Path(
27+
"UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240930_20240401.docx"
28+
),
29+
Path(
30+
"UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"
31+
),
32+
Path(
33+
"UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"
34+
),
2935
]
3036
},
31-
[Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx")],
37+
[
38+
Path(
39+
"UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"
40+
)
41+
],
3242
id="Standard Case",
3343
),
3444
pytest.param(
3545
{
3646
"UTILTSAHB": [
37-
Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"),
38-
Path("UTILTSAHB-außerordentlicheveröffentlichung_20240731_20240701.docx"),
39-
Path("UTILTSAHB-außerordentlicheveröffentlichung_20240930_20240401.docx"),
40-
Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"),
41-
Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"),
47+
Path(
48+
"UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"
49+
),
50+
Path(
51+
"UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240731_20240701.docx"
52+
),
53+
Path(
54+
"UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240930_20240401.docx"
55+
),
56+
Path(
57+
"UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"
58+
),
59+
Path(
60+
"UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"
61+
),
4262
]
4363
},
44-
[Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx")],
64+
[
65+
Path(
66+
"UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"
67+
)
68+
],
4569
id="Valid from tie",
4670
),
4771
pytest.param(
4872
{
49-
"UTILTSAHB": [
50-
Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"),
51-
Path("UTILTSAHB_20250731_20240901.docx"),
52-
Path("UTILTSAHB-außerordentlicheveröffentlichung_20240930_20240401.docx"),
53-
Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"),
54-
Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"),
73+
"UTILMDAHBMaBiS": [
74+
Path("UTILMDAHBMaBiS-informatorischeLesefassung4.0_99991231_20231001.docx"),
75+
Path(
76+
"UTILMDAHBMaBiS-informatorischeLesefassung4.1aKonsolidierteLesefassungmitFehlerkorrekturenStand11.03.2024_20250403_20240403.docx"
77+
),
78+
Path("UTILMDAHBMaBiS-informatorischeLesefassung4.1a_20250403_20240403.docx"),
5579
]
5680
},
57-
[Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx")],
81+
[
82+
Path(
83+
"UTILMDAHBMaBiS-informatorischeLesefassung4.1aKonsolidierteLesefassungmitFehlerkorrekturenStand11.03.2024_20250403_20240403.docx"
84+
)
85+
],
5886
id="different names",
5987
),
6088
],
6189
)
6290
def test_filter_latest_version(self, group_items, expected):
6391
assert DocxFileFinder.filter_latest_version(group_items) == expected
92+
93+
@pytest.mark.parametrize(
94+
["group_items", "expected"],
95+
[
96+
pytest.param(
97+
[
98+
Path("APERAKCONTRLAHB-informatorischeLesefassung2.4a_99991231_20250404.docx"),
99+
Path("APERAKCONTRLAHB-informatorischeLesefassung2.4_99991231_20250404.docx"),
100+
],
101+
Path("APERAKCONTRLAHB-informatorischeLesefassung2.4a_99991231_20250404.docx"),
102+
id="Two versions of the same file",
103+
),
104+
pytest.param(
105+
[
106+
Path("CodelistederKonfigurationen-informatorischeLesefassung1.3_99991231_20250404.docx"),
107+
Path("CodelistederKonfigurationen-informatorischeLesefassung1.1_99991231_20231001.docx"),
108+
Path("CodelistederKonfigurationen-informatorischeLesefassung1.3a_99991231_20250404.docx"),
109+
Path("CodelistederKonfigurationen-informatorischeLesefassung1.3b_99991231_20250404.docx"),
110+
],
111+
Path("CodelistederKonfigurationen-informatorischeLesefassung1.3b_99991231_20250404.docx"),
112+
id="Four versions of the same file",
113+
),
114+
],
115+
)
116+
def test_get_most_recent(self, group_items: list[Path], expected: Path):
117+
assert get_most_recent_file(group_items) == expected

0 commit comments

Comments
 (0)