|
2 | 2 | This module contains the DocxFileFinder class.
|
3 | 3 | """
|
4 | 4 |
|
| 5 | +import re |
5 | 6 | from itertools import groupby
|
6 | 7 | from pathlib import Path
|
7 | 8 |
|
|
11 | 12 | from kohlrahbi.logger import logger
|
12 | 13 |
|
13 | 14 |
|
class EdiEnergyDocument(BaseModel):
    """
    This class represents an EDI Energy document.

    All fields except ``filename`` are derived from the name of the file (see ``from_path``).
    Instances can be ordered with ``<``; see ``__lt__`` for the exact ordering.
    """

    # Path of the file the metadata was extracted from.
    filename: Path
    # Full version string as matched in the file name (major.minor plus optional prefix/suffix).
    document_version: str
    version_major: int
    version_minor: int
    # Single trailing letter of the version, or an empty string if there is none.
    version_suffix: str
    # Validity dates as integers parsed from 8-digit groups in the file name
    # (presumably YYYYMMDD -- confirm against the naming convention).
    valid_from: int
    valid_until: int

    @classmethod
    def from_path(cls, path: Path) -> "EdiEnergyDocument":
        """
        Create an EdiEnergyDocument object from a file path.

        Args:
            path (Path): Path of the document; only ``path.name`` is parsed.

        Returns:
            EdiEnergyDocument: The document described by the file name.

        Raises:
            AssertionError: If the file name does not follow the expected naming pattern.
        """
        parsed = extract_document_version_and_valid_dates(path.name)
        if parsed is None:
            # Raised explicitly rather than via `assert` so the check is not stripped by `python -O`.
            # AssertionError is kept on purpose: callers observe the same exception type as before.
            raise AssertionError(f"Could not extract document version and valid dates from {path.name}.")

        return cls(
            filename=path,
            document_version=parsed["document_version"],
            version_major=int(parsed["version_major"]),
            version_minor=int(parsed["version_minor"]),
            version_suffix=parsed["version_suffix"],
            valid_from=int(parsed["valid_from"]),
            valid_until=int(parsed["valid_until"]),
        )

    def __lt__(self, other: "EdiEnergyDocument") -> bool:
        """
        Compare two EdiEnergyDocument instances.

        The comparison key is the tuple
        ``(valid_from, valid_until, version_major, version_minor, version_suffix)``.
        Python compares tuples lexicographically, i.e. element by element from left to right:
        ``valid_from`` decides first; only if both values are equal is ``valid_until`` consulted,
        then the major version, the minor version and finally the suffix.

        Args:
            other (EdiEnergyDocument): The other document to compare against.

        Returns:
            bool: True if this document is considered less than the other document, False otherwise.
        """
        own_key = (
            self.valid_from,
            self.valid_until,
            self.version_major,
            self.version_minor,
            self.version_suffix,
        )
        other_key = (
            other.valid_from,
            other.valid_until,
            other.version_major,
            other.version_minor,
            other.version_suffix,
        )
        return own_key < other_key
| 73 | + |
| 74 | + |
| 75 | +def extract_document_version_and_valid_dates( |
| 76 | + filename: str, |
| 77 | +) -> dict[str, str] | None: |
| 78 | + """Extract the document version and valid dates from the filename. |
| 79 | +
|
| 80 | + Parameters: |
| 81 | + - filename (str): The filename of the document. |
| 82 | +
|
| 83 | + Returns: |
| 84 | + - tuple[str, str, str]: A tuple containing the document version, valid from date, and valid until date. |
| 85 | + """ |
| 86 | + |
| 87 | + # Pattern to extract detailed version number, valid until and valid from dates |
| 88 | + document_name_pattern = re.compile( |
| 89 | + r"-informatorischeLesefassung" |
| 90 | + r"(?P<document_version>(?:S|G)?(?P<version_major>\d+)\.(?P<version_minor>\d+)(?P<version_suffix>[a-z]?))" |
| 91 | + r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)?" |
| 92 | + r"([A-Za-z0-9.]+)?" |
| 93 | + r"_(?P<valid_until>\d{8})_(?P<valid_from>\d{8})\.docx$", |
| 94 | + re.IGNORECASE, |
| 95 | + ) |
| 96 | + matches = document_name_pattern.search(filename) |
| 97 | + try: |
| 98 | + if matches: |
| 99 | + return matches.groupdict() |
| 100 | + except ValueError as e: |
| 101 | + logger.error("Error extracting document version and valid dates: %s", e) |
| 102 | + return None |
| 103 | + |
| 104 | + |
def get_most_recent_file(group_items: list[Path]) -> Path | None:
    """
    Pick the most recent file out of a group of files.

    Files whose (lower-cased) name contains one of the preferred keywords take precedence:
    if at least one such file exists, only those files are considered, otherwise the whole
    group is. The candidates are turned into EdiEnergyDocument objects and the greatest one
    according to their ordering is selected.

    Parameters:
    - group_items (List[Path]): A list of Path objects representing the file paths.

    Returns:
    - Path | None: The path of the most recent file, or None if a ValueError occurred
      while processing the group (the error is logged).
    """
    # Keywords marking the files that are preferred over all others in the group
    preferred_keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"]

    try:
        preferred_files = [
            candidate
            for candidate in group_items
            if any(keyword in candidate.name.lower() for keyword in preferred_keywords)
        ]
        # Path objects are always truthy, so a non-empty list is what matters here.
        candidates = preferred_files if preferred_files else group_items
        documents = [EdiEnergyDocument.from_path(candidate) for candidate in candidates]
        newest_document = max(documents)
    except ValueError as e:
        logger.error("Error processing group items: %s", e)
        return None

    return newest_document.filename
| 134 | + |
| 135 | + |
14 | 136 | class DocxFileFinder(BaseModel):
|
15 | 137 | """
|
16 | 138 | This class is responsible for finding the docx files in the input directory.
|
@@ -118,36 +240,12 @@ def filter_latest_version(groups: dict[str, list[Path]]) -> list[Path]:
|
118 | 240 | Returns:
|
119 | 241 | - List[Path]: A list of Path objects representing the latest version of the files.
|
120 | 242 | """
|
121 |
| - result = [] |
| 243 | + result: list[Path] = [] |
122 | 244 |
|
123 | 245 | for group_items in groups.values():
|
124 |
| - if len(group_items) == 1: |
125 |
| - result.append(group_items[0]) |
126 |
| - else: |
127 |
| - try: |
128 |
| - # Define the keywords to filter relevant files |
129 |
| - keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"] |
130 |
| - |
131 |
| - # Find the most recent file based on keywords and date suffixes |
132 |
| - most_recent_file = max( |
133 |
| - (path for path in group_items if any(keyword in path.name.lower() for keyword in keywords)), |
134 |
| - key=lambda path: ( |
135 |
| - int(path.stem.split("_")[-1]), # "gültig von" date |
136 |
| - int(path.stem.split("_")[-2]), # "gültig bis" date |
137 |
| - ), |
138 |
| - ) |
139 |
| - |
140 |
| - # Add the most recent file to the result and log ignored files |
141 |
| - for path in group_items: |
142 |
| - if path != most_recent_file: |
143 |
| - logger.debug("Ignoring file %s", path.name) |
144 |
| - else: |
145 |
| - result.append(most_recent_file) |
146 |
| - |
147 |
| - except ValueError as e: |
148 |
| - logger.error("Error processing group items: %s", e) |
149 |
| - continue |
150 |
| - |
| 246 | + most_recent_file = get_most_recent_file(group_items) |
| 247 | + assert most_recent_file is not None, "Could not find the most recent file." |
| 248 | + result.append(most_recent_file) |
151 | 249 | return result
|
152 | 250 |
|
153 | 251 | def filter_for_latest_mig_and_ahb_docx_files(self) -> None:
|
|
0 commit comments