fix(file filter): find latest document (#488)

DeltaDaniel · hf-krechan · web-flow · commit 422c6c9d00a2 · 2024-10-14T15:01:07.000+02:00
* changed filter for most recent files

* edit test to match more realistic file names

* modified filter fct

* refactor __lt__ function

---------

Co-authored-by: kevin <kevin.krechan@hochfrequenz.de>
diff --git a/src/kohlrahbi/ahb/command.py b/src/kohlrahbi/ahb/command.py
@@ -88,7 +88,7 @@ def validate_path(ctx, param, value) -> Path:  # type:ignore[no-untyped-def]
     default=False,
     help="Confirm all prompts automatically.",
 )
-# pylint: disable=too-many-positional-arguments, too-many-arguments
+# pylint: disable=too-many-arguments, too-many-positional-arguments
 def ahb(
     pruefis: list[str],
     edi_energy_mirror_path: Path,
diff --git a/src/kohlrahbi/docxfilefinder.py b/src/kohlrahbi/docxfilefinder.py
@@ -2,6 +2,7 @@
 This module contains the DocxFileFinder class.
 """
 
+import re
 from itertools import groupby
 from pathlib import Path
 
@@ -11,6 +12,127 @@
 from kohlrahbi.logger import logger
 
 
+class EdiEnergyDocument(BaseModel):
+    """
+    This class represents an EDI Energy document.
+    """
+
+    filename: Path
+    document_version: str
+    version_major: int
+    version_minor: int
+    version_suffix: str
+    valid_from: int
+    valid_until: int
+
+    @classmethod
+    def from_path(cls, path: Path) -> "EdiEnergyDocument":
+        """
+        Create an EdiEnergyDocument object from a file path.
+        """
+
+        file_name = extract_document_version_and_valid_dates(path.name)
+        assert file_name is not None, f"Could not extract document version and valid dates from {path.name}."
+        return cls(
+            filename=path,
+            document_version=file_name["document_version"],
+            version_major=int(file_name["version_major"]),
+            version_minor=int(file_name["version_minor"]),
+            version_suffix=file_name["version_suffix"],
+            valid_from=int(file_name["valid_from"]),
+            valid_until=int(file_name["valid_until"]),
+        )
+
+    def __lt__(self, other: "EdiEnergyDocument") -> bool:
+        """
+        Compare two EdiEnergyDocument instances based on their valid_from, valid_until,
+        and document version (major, minor and suffix), in that order.
+
+        I did not know how the tuple comparison works in Python, so I looked it up:
+
+        Python compares tuples lexicographically, meaning it compares the elements one by one from left to right.
+        The comparison starts with the first elements of both tuples:
+          If self.valid_from is less than other.valid_from, the entire expression evaluates to True.
+          If self.valid_from is greater than other.valid_from, the entire expression evaluates to False.
+          If self.valid_from is equal to other.valid_from, Python moves to the next elements in the tuples.
+        This process continues with self.valid_until vs. other.valid_until and then with the version numbers.
+
+        Args:
+            other (EdiEnergyDocument): The other document to compare against.
+
+        Returns:
+            bool: True if this document is considered less than the other document, False otherwise.
+        """
+        return (self.valid_from, self.valid_until, self.version_major, self.version_minor, self.version_suffix) < (
+            other.valid_from,
+            other.valid_until,
+            other.version_major,
+            other.version_minor,
+            other.version_suffix,
+        )
+
+
+def extract_document_version_and_valid_dates(
+    filename: str,
+) -> dict[str, str] | None:
+    """Extract the document version and valid dates from the filename.
+
+    Parameters:
+    - filename (str): The filename of the document.
+
+    Returns:
+    - dict[str, str] | None: The named regex groups (version parts, valid dates), or None if there is no match.
+    """
+
+    # Pattern to extract detailed version number, valid until and valid from dates
+    document_name_pattern = re.compile(
+        r"-informatorischeLesefassung"
+        r"(?P<document_version>(?:S|G)?(?P<version_major>\d+)\.(?P<version_minor>\d+)(?P<version_suffix>[a-z]?))"
+        r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)?"
+        r"([A-Za-z0-9.]+)?"
+        r"_(?P<valid_until>\d{8})_(?P<valid_from>\d{8})\.docx$",
+        re.IGNORECASE,
+    )
+    matches = document_name_pattern.search(filename)
+    try:
+        if matches:
+            return matches.groupdict()
+    except ValueError as e:
+        logger.error("Error extracting document version and valid dates: %s", e)
+    return None
+
+
+def get_most_recent_file(group_items: list[Path]) -> Path | None:
+    """
+    Find the most recent file in a group of files based on specific criteria.
+
+    Parameters:
+    - group_items (List[Path]): A list of Path objects representing the file paths.
+
+    Returns:
+    - Path | None: The most recent file, or None if a ValueError was raised and logged while processing the group.
+    """
+
+    try:
+        # Define the keywords to filter relevant files
+        keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"]
+        files_containing_keywords = [
+            path for path in group_items if any(keyword in path.name.lower() for keyword in keywords)
+        ]
+        if any(files_containing_keywords):
+            list_of_edi_energy_documents = [EdiEnergyDocument.from_path(path) for path in files_containing_keywords]
+        else:
+            list_of_edi_energy_documents = [EdiEnergyDocument.from_path(path) for path in group_items]
+        most_recent_file = max(list_of_edi_energy_documents)
+
+        return most_recent_file.filename
+
+    except ValueError as e:
+
+        logger.error("Error processing group items: %s", e)
+    return None
+
+
 class DocxFileFinder(BaseModel):
     """
     This class is responsible for finding the docx files in the input directory.
@@ -118,36 +240,12 @@ def filter_latest_version(groups: dict[str, list[Path]]) -> list[Path]:
         Returns:
         - List[Path]: A list of Path objects representing the latest version of the files.
         """
-        result = []
+        result: list[Path] = []
 
         for group_items in groups.values():
-            if len(group_items) == 1:
-                result.append(group_items[0])
-            else:
-                try:
-                    # Define the keywords to filter relevant files
-                    keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"]
-
-                    # Find the most recent file based on keywords and date suffixes
-                    most_recent_file = max(
-                        (path for path in group_items if any(keyword in path.name.lower() for keyword in keywords)),
-                        key=lambda path: (
-                            int(path.stem.split("_")[-1]),  # "gültig von" date
-                            int(path.stem.split("_")[-2]),  # "gültig bis" date
-                        ),
-                    )
-
-                    # Add the most recent file to the result and log ignored files
-                    for path in group_items:
-                        if path != most_recent_file:
-                            logger.debug("Ignoring file %s", path.name)
-                        else:
-                            result.append(most_recent_file)
-
-                except ValueError as e:
-                    logger.error("Error processing group items: %s", e)
-                    continue
-
+            most_recent_file = get_most_recent_file(group_items)
+            assert most_recent_file is not None, "Could not find the most recent file."
+            result.append(most_recent_file)
         return result
 
     def filter_for_latest_mig_and_ahb_docx_files(self) -> None:
diff --git a/unittests/test_docxfilefinder.py b/unittests/test_docxfilefinder.py
@@ -1,63 +1,117 @@
-from pathlib import Path
+from pathlib import Path, PosixPath
 
 import pytest
 
-from kohlrahbi.docxfilefinder import DocxFileFinder
+from kohlrahbi.docxfilefinder import DocxFileFinder, get_most_recent_file
 
 
 class TestDocxFileFinder:
     @pytest.mark.parametrize(
         ["group_items", "expected"],
         [
             pytest.param(
-                {
-                    "UTILTSAHB": [
-                        Path("UTILTSAHB_20240701_20240401.docx"),
-                    ]
-                },
-                [Path("UTILTSAHB_20240701_20240401.docx")],
+                {"UTILTSAHB": [Path("UTILTSAHB-informatorischeLesefassung4.0_20240701_20240401.docx")]},
+                [Path("UTILTSAHB-informatorischeLesefassung4.0_20240701_20240401.docx")],
                 id="Single File",
             ),
             pytest.param(
                 {
                     "UTILTSAHB": [
-                        Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"),
-                        Path("UTILTSAHB-außerordentlicheveröffentlichung_20240701_20240501.docx"),
-                        Path("UTILTSAHB-außerordentlicheveröffentlichung_20240930_20240401.docx"),
-                        Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"),
-                        Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"
+                        ),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240701_20240501.docx"
+                        ),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240930_20240401.docx"
+                        ),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"
+                        ),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"
+                        ),
                     ]
                 },
-                [Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx")],
+                [
+                    Path(
+                        "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"
+                    )
+                ],
                 id="Standard Case",
             ),
             pytest.param(
                 {
                     "UTILTSAHB": [
-                        Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"),
-                        Path("UTILTSAHB-außerordentlicheveröffentlichung_20240731_20240701.docx"),
-                        Path("UTILTSAHB-außerordentlicheveröffentlichung_20240930_20240401.docx"),
-                        Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"),
-                        Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"
+                        ),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240731_20240701.docx"
+                        ),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240930_20240401.docx"
+                        ),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"
+                        ),
+                        Path(
+                            "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"
+                        ),
                     ]
                 },
-                [Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx")],
+                [
+                    Path(
+                        "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"
+                    )
+                ],
                 id="Valid from tie",
             ),
             pytest.param(
                 {
-                    "UTILTSAHB": [
-                        Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx"),
-                        Path("UTILTSAHB_20250731_20240901.docx"),
-                        Path("UTILTSAHB-außerordentlicheveröffentlichung_20240930_20240401.docx"),
-                        Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx"),
-                        Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx"),
+                    "UTILMDAHBMaBiS": [
+                        Path("UTILMDAHBMaBiS-informatorischeLesefassung4.0_99991231_20231001.docx"),
+                        Path(
+                            "UTILMDAHBMaBiS-informatorischeLesefassung4.1aKonsolidierteLesefassungmitFehlerkorrekturenStand11.03.2024_20250403_20240403.docx"
+                        ),
+                        Path("UTILMDAHBMaBiS-informatorischeLesefassung4.1a_20250403_20240403.docx"),
                     ]
                 },
-                [Path("UTILTSAHB-konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx")],
+                [
+                    Path(
+                        "UTILMDAHBMaBiS-informatorischeLesefassung4.1aKonsolidierteLesefassungmitFehlerkorrekturenStand11.03.2024_20250403_20240403.docx"
+                    )
+                ],
                 id="different names",
             ),
         ],
     )
     def test_filter_latest_version(self, group_items, expected):
         assert DocxFileFinder.filter_latest_version(group_items) == expected
+
+    @pytest.mark.parametrize(
+        ["group_items", "expected"],
+        [
+            pytest.param(
+                [
+                    Path("APERAKCONTRLAHB-informatorischeLesefassung2.4a_99991231_20250404.docx"),
+                    Path("APERAKCONTRLAHB-informatorischeLesefassung2.4_99991231_20250404.docx"),
+                ],
+                Path("APERAKCONTRLAHB-informatorischeLesefassung2.4a_99991231_20250404.docx"),
+                id="Two versions of the same file",
+            ),
+            pytest.param(
+                [
+                    Path("CodelistederKonfigurationen-informatorischeLesefassung1.3_99991231_20250404.docx"),
+                    Path("CodelistederKonfigurationen-informatorischeLesefassung1.1_99991231_20231001.docx"),
+                    Path("CodelistederKonfigurationen-informatorischeLesefassung1.3a_99991231_20250404.docx"),
+                    Path("CodelistederKonfigurationen-informatorischeLesefassung1.3b_99991231_20250404.docx"),
+                ],
+                Path("CodelistederKonfigurationen-informatorischeLesefassung1.3b_99991231_20250404.docx"),
+                id="Four versions of the same file",
+            ),
+        ],
+    )
+    def test_get_most_recent(self, group_items: list[Path], expected: Path):
+        assert get_most_recent_file(group_items) == expected

Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ def validate_path(ctx, param, value) -> Path: # type:ignore[no-untyped-def]`
`88`	`88`	`default=False,`
`89`	`89`	`help="Confirm all prompts automatically.",`
`90`	`90`	`)`
`91`		`-# pylint: disable=too-many-positional-arguments, too-many-arguments`
	`91`	`+# pylint: disable=too-many-arguments, too-many-positional-arguments`
`92`	`92`	`def ahb(`
`93`	`93`	`pruefis: list[str],`
`94`	`94`	`edi_energy_mirror_path: Path,`