diff --git a/edi_energy_mirror b/edi_energy_mirror index 48631fc3..c4a09456 160000 --- a/edi_energy_mirror +++ b/edi_energy_mirror @@ -1 +1 @@ -Subproject commit 48631fc3176da639f0cefd581bbbd1cc3a04811d +Subproject commit c4a094560fefc4d0ab7f9f953682853f6abdb4a3 diff --git a/pyproject.toml b/pyproject.toml index f3f95c4a..a6bab7d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,16 +13,16 @@ build-backend = "hatchling.build" dependencies = [ "click>=8.1.7", "colorlog>=6.9.0", + "edi_energy_scraper>=2.0.0", + "efoli>=1.4.0", "more_itertools>=10.5.0", "openpyxl>=3.1.5", "pandas>=2.2.3", + "pydantic>=2.9.2", "python-docx>=1.1.2", "pytz>=2024.2", - "pydantic>=2.9.2", "tomlkit>=0.13.2", "xlsxwriter>=3.2.0", - "efoli>=1.4.0", - "edi_energy_scraper>=2.0.0" ] name = "kohlrahbi" @@ -49,10 +49,7 @@ dynamic = ["readme", "version"] kohlrahbi = "kohlrahbi:cli" [project.optional-dependencies] -sqlmodels = [ - "sqlmodel>=0.0.22", - "sqlalchemy[mypy]>=2.0.37" -] +sqlmodels = ["sqlmodel>=0.0.22", "sqlalchemy[mypy]>=2.0.37"] dev = [ "kohlrahbi[sqlmodels]", "kohlrahbi[test]", diff --git a/src/kohlrahbi/ahb/__init__.py b/src/kohlrahbi/ahb/__init__.py index fac300c5..c1dc66a2 100644 --- a/src/kohlrahbi/ahb/__init__.py +++ b/src/kohlrahbi/ahb/__init__.py @@ -154,14 +154,15 @@ def get_ahb_documents_path(base_path: Path, version: str) -> Path: return path -def find_pruefidentifikatoren(path: Path) -> dict[str, str]: +def find_pruefidentifikatoren(path: Path, format_version: EdifactFormatVersion) -> dict[str, str]: """finds pruefis in given dir""" - pruefis = {} + pruefis: dict[str, str] = {} + + file_finder = DocxFileFinder(path_to_edi_energy_mirror=path, format_version=format_version) - ahb_file_finder = DocxFileFinder.from_input_path(input_path=path) - ahb_file_finder.filter_for_latest_ahb_docx_files() + file_finder.get_file_paths_for_ahbs() - for docx_path in ahb_file_finder.paths_to_docx_files: + for docx_path in file_finder.result_paths: 
pruefis.update(extract_pruefis_from_docx(docx_path)) return dict(sorted(pruefis.items())) @@ -228,8 +229,8 @@ def get_pruefi_to_file_mapping(basic_input_path: Path, format_version: EdifactFo raise ReferenceError(f"Could not find pruefidentifikatoren in {default_path_to_cache_file}") return dict(pruefi_to_file_mapping) - path_to_docx_files = basic_input_path / Path(f"edi_energy_de/{format_version}") - pruefi_to_file_mapping = find_pruefidentifikatoren(path_to_docx_files) + # path_to_docx_files = basic_input_path / Path(f"edi_energy_de/{format_version}") + pruefi_to_file_mapping = find_pruefidentifikatoren(basic_input_path, format_version=format_version) save_pruefi_map_to_toml(pruefi_to_file_mapping, format_version.value) return pruefi_to_file_mapping diff --git a/src/kohlrahbi/changehistory/__init__.py b/src/kohlrahbi/changehistory/__init__.py index e3633fe3..e9c833ab 100644 --- a/src/kohlrahbi/changehistory/__init__.py +++ b/src/kohlrahbi/changehistory/__init__.py @@ -9,6 +9,7 @@ - `create_sheet_name`: Creates a sheet name from the filename. """ +import re from datetime import datetime, timezone from pathlib import Path from typing import Optional @@ -17,6 +18,7 @@ import pandas as pd from docx.document import Document from docx.table import Table +from efoli import EdifactFormatVersion from kohlrahbi.changehistory.changehistorytable import ChangeHistoryTable from kohlrahbi.docxfilefinder import DocxFileFinder @@ -52,6 +54,63 @@ def get_change_history_table(document: Document) -> Optional[ChangeHistoryTable] return None +def extract_sheet_name(filename: str) -> str: + """ + Extract and format a valid Excel sheet name from a filename. + The sheet name will be no longer than 31 characters (Excel's limit). 
+ + Args: + filename (str): The full filename like 'AHB_COMDIS_1.0f_20250606_99991231_20250606_ooox_8871.docx' + or 'Entscheidungsbaum-DiagrammeundCodelisten-informatorischeLesefassung3.5.docx' + or 'EBD_4.0b_20250606_20250131_20241215_xoxx_11449.docx' + + Returns: + str: The extracted sheet name, shortened to max 31 chars + """ + # Remove .docx extension if present + filename = filename.replace(".docx", "") + + # Handle standard AHB/MIG/EBD files + parts = filename.split("_") + if parts[0] in ["AHB", "MIG", "EBD"]: + if parts[0] == "EBD": + # EBD files have format: EBD_4.0b_date_... + return f"{parts[0]}_{parts[1]}" + # AHB/MIG files have format: AHB_COMDIS_1.0f_date_... + return f"{parts[0]}_{parts[1]}_{parts[2]}" + + # Handle special cases + if filename.startswith("allgemeinefestlegungeninformatorischelesefassung"): + # Extract version if present + version_pattern = r".*?_?([0-9]+\.[0-9]+[a-zA-Z]*)(?:_\d{8}|$)" + version_match = re.search(version_pattern, filename) + version = version_match.group(1) if version_match else "" + return f"Allgemeine_Festlegungen_{version}" + + if filename.startswith("apiguidelineinformatorischelesefassung"): + version_pattern = r".*?_?([0-9]+\.[0-9]+[a-zA-Z]*)(?:_\d{8}|$)" + version_match = re.search(version_pattern, filename) + version = version_match.group(1) if version_match else "" + return f"API_Guideline_{version}" + + if filename.startswith("codeliste"): + version_pattern = r".*?_?([0-9]+\.[0-9]+[a-zA-Z]*)(?:_\d{8}|$)" + version_match = re.search(version_pattern, filename) + version = version_match.group(1) if version_match else "" + return f"CL_der_Konfigurationen_{version}" + + # Handle Entscheidungsbaum files + if filename.startswith("Entscheidungsbaum"): + version_pattern = r".*?([0-9]+\.[0-9]+)(?:_\d{8}|$)" + version_match = re.search(version_pattern, filename) + version = version_match.group(1) if version_match else "" + return f"EBDs_CL_{version}" + + # For any other cases, just return the filename without extension + 
# and ensure it's not longer than 31 characters + return filename[:31] + + def save_change_histories_to_excel(change_history_collection: dict[str, pd.DataFrame], output_path: Path) -> None: """ Save the collected change histories to an Excel file. @@ -73,11 +132,12 @@ def save_change_histories_to_excel(change_history_collection: dict[str, pd.DataF # https://github.com/PyCQA/pylint/issues/3060 pylint: disable=abstract-class-instantiated with pd.ExcelWriter(path_to_change_history_excel_file, engine="xlsxwriter") as writer: for sheet_name, df in change_history_collection.items(): - df.to_excel(writer, sheet_name=sheet_name) + shorten_sheet_name = extract_sheet_name(filename=sheet_name) + df.to_excel(writer, sheet_name=shorten_sheet_name) # Access the XlsxWriter workbook and worksheet objects workbook = writer.book - worksheet = writer.sheets[sheet_name] + worksheet = writer.sheets[shorten_sheet_name] # Use shortened name here # Create a text wrap format, this is needed to avoid the text being cut off in the cells wrap_format = workbook.add_format({"text_wrap": True}) @@ -92,36 +152,6 @@ def save_change_histories_to_excel(change_history_collection: dict[str, pd.DataF worksheet.set_column(col_num, col_num, width, wrap_format) -def create_sheet_name(filename: str) -> str: - """ - Creates a sheet name from the filename. - - We need to shorten the sheet name because Excel only allows 31 characters for sheet names. - This function replaces some words with acronyms and removes some words. 
- """ - sheet_name = filename.split("-informatorischeLesefassung")[0] - - if "Entscheidungsbaum-Diagramm" in sheet_name: - sheet_name = sheet_name.replace("Entscheidungsbaum", "EBDs") - if "Artikelnummern" in sheet_name: - sheet_name = sheet_name.replace("Artikelnummern", "Artikelnr") - if "Codeliste" in sheet_name: - sheet_name = sheet_name.replace("Codeliste", "CL") - if len(sheet_name) > 31: - # Excel only allows 31 characters for sheet names - # but REQOTEQUOTESORDERSORDRSPORDCHGAHB is 33 characters long - sheet_name = sheet_name.replace("HG", "") - return sheet_name - - -def find_docx_files(input_path: Path) -> list[Path]: - """ - Find all .docx files containing change histories. - """ - docx_file_finder = DocxFileFinder.from_input_path(input_path=input_path) - return docx_file_finder.get_all_docx_files_which_contain_change_histories() - - def process_docx_file(file_path: Path) -> Optional[pd.DataFrame]: """ Read and process change history from a .docx file. @@ -136,17 +166,19 @@ def process_docx_file(file_path: Path) -> Optional[pd.DataFrame]: return None -def scrape_change_histories(input_path: Path, output_path: Path) -> None: +def scrape_change_histories(input_path: Path, output_path: Path, format_version: EdifactFormatVersion) -> None: """ starts the scraping process of the change histories """ logger.info("👀 Start looking for change histories") - ahb_file_paths = find_docx_files(input_path) + path_to_files_with_changehistory = DocxFileFinder( + path_to_edi_energy_mirror=input_path, format_version=format_version + ).get_file_paths_for_change_history() change_history_collection = {} - for file_path in ahb_file_paths: + for file_path in path_to_files_with_changehistory: df = process_docx_file(file_path) if df is not None: - change_history_collection[create_sheet_name(file_path.name)] = df + change_history_collection[extract_sheet_name(file_path.name)] = df save_change_histories_to_excel(change_history_collection, output_path) diff --git 
a/src/kohlrahbi/changehistory/command.py b/src/kohlrahbi/changehistory/command.py index 1b8602a8..d85ee58d 100644 --- a/src/kohlrahbi/changehistory/command.py +++ b/src/kohlrahbi/changehistory/command.py @@ -61,5 +61,5 @@ def changehistory( check_python_version() if isinstance(format_version, str): format_version = EdifactFormatVersion(format_version) - input_path = edi_energy_mirror_path / "edi_energy_de" / format_version.value - scrape_change_histories(input_path=input_path, output_path=output_path) + + scrape_change_histories(input_path=edi_energy_mirror_path, output_path=output_path, format_version=format_version) diff --git a/src/kohlrahbi/conditions/__init__.py b/src/kohlrahbi/conditions/__init__.py index 17d67c96..5404d8ab 100644 --- a/src/kohlrahbi/conditions/__init__.py +++ b/src/kohlrahbi/conditions/__init__.py @@ -47,6 +47,7 @@ def scrape_conditions( for file in files: # pylint: disable=too-many-function-args path: Path = basic_input_path / path_to_file / Path(file) + assert path.exists(), f"File {path} does not exist" doc = docx.Document(str(path.absolute())) logger.info("Start scraping conditions for %s in %s", edifact_format, file) if not doc: diff --git a/src/kohlrahbi/docxfilefinder.py b/src/kohlrahbi/docxfilefinder.py index 14c81746..dfee3336 100644 --- a/src/kohlrahbi/docxfilefinder.py +++ b/src/kohlrahbi/docxfilefinder.py @@ -3,10 +3,10 @@ """ import re -from itertools import groupby from pathlib import Path -from efoli import EdifactFormat, get_format_of_pruefidentifikator +from edi_energy_scraper import DocumentMetadata +from efoli import EdifactFormat, EdifactFormatVersion from pydantic import BaseModel from kohlrahbi.logger import logger @@ -19,6 +19,7 @@ class EdiEnergyDocument(BaseModel): filename: Path document_version: str + version_prefix: str version_major: int version_minor: int version_suffix: str @@ -31,16 +32,27 @@ def from_path(cls, path: Path) -> "EdiEnergyDocument": Create an EdiEnergyDocument object from a file path. 
""" - file_name = extract_document_version_and_valid_dates(path.name) - assert file_name is not None, f"Could not extract document version and valid dates from {path.name}." + document_metadata = extract_document_meta_data(path.name) + assert document_metadata is not None, f"Could not extract document version and valid dates from {path.name}." + assert document_metadata.version is not None, "Document version is None." + + prefix, version_major, version_minor, version_suffix = split_version_string(document_metadata.version) + + valid_from = int(document_metadata.valid_from.strftime("%Y%m%d")) + valid_until = int(document_metadata.valid_until.strftime("%Y%m%d")) + # assert valid_from <= valid_until, "Valid from is greater than valid until." + assert isinstance(valid_from, int), "Valid from is not an integer." + assert isinstance(valid_until, int), "Valid until is not an integer." + return cls( filename=path, - document_version=file_name["document_version"], - version_major=int(file_name["version_major"]), - version_minor=int(file_name["version_minor"]), - version_suffix=file_name["version_suffix"], - valid_from=int(file_name["valid_from"]), - valid_until=int(file_name["valid_until"]), + document_version=document_metadata.version, + version_prefix=prefix, + version_major=version_major, + version_minor=version_minor, + version_suffix=version_suffix, + valid_from=valid_from, + valid_until=valid_until, ) def __lt__(self, other: "EdiEnergyDocument") -> bool: @@ -72,257 +84,308 @@ def __lt__(self, other: "EdiEnergyDocument") -> bool: ) -def extract_document_version_and_valid_dates( +def split_version_string(version_string: str) -> tuple[str, int, int, str]: + """ + Split the version string into a tuple of (prefix, major, minor, suffix). + The prefix is optional (can be empty string). 
+ + Examples: + >>> split_version_string("1.3a") + ('', 1, 3, 'a') + >>> split_version_string("G2.1e") + ('G', 2, 1, 'e') + >>> split_version_string("S1.2f") + ('S', 1, 2, 'f') + """ + pattern = r"^([GS])?(\d+)\.(\d+)([a-zA-Z]*)$" + match = re.match(pattern, version_string) + if not match: + raise ValueError(f"Invalid version string format: {version_string}") + + prefix, major, minor, suffix = match.groups() + return ( + prefix or "", # convert None to empty string if no prefix + int(major), + int(minor), + suffix or "", # convert None to empty string if no suffix + ) + + +def extract_document_meta_data( filename: str, -) -> dict[str, str] | None: - """Extract the document version and valid dates from the filename. +) -> DocumentMetadata: + """Extract the document metadata from the filename. Parameters: - filename (str): The filename of the document. Returns: - - tuple[str, str, str]: A tuple containing the document version, valid from date, and valid until date. + - DocumentMetadata: A DocumentMetadata object. """ - # Pattern to extract detailed version number, valid until and valid from dates - document_name_pattern = re.compile( - r"-informatorischeLesefassung" - r"(?P(?:S|G)?(?P\d+)\.(?P\d+)(?P[a-z]?))" - r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)?" - r"([A-Za-z0-9.]+)?" - r"_(?P\d{8})_(?P\d{8})\.docx$", - re.IGNORECASE, - ) - matches = document_name_pattern.search(filename) - try: - if matches: - return matches.groupdict() - except ValueError as e: - logger.error("Error extracting document version and valid dates: %s", e) - return None + document_metadata = DocumentMetadata.from_filename(filename) + assert document_metadata is not None, f"Could not extract document metadata from {filename}." + return document_metadata -def get_most_recent_file(group_items: list[Path]) -> Path | None: +class DocxFileFinder(BaseModel): + """ + This class is responsible for finding the correct docx files in the edi energy mirror. 
+ It is used for all commands which need to find docx files. """ - Find the most recent file in a group of files based on specific criteria. - Parameters: - - group_items (List[Path]): A list of Path objects representing the file paths. + path_to_edi_energy_mirror: Path - Returns: - - Path: A Path object representing the most recent file. - """ + format_version: EdifactFormatVersion - try: - # Define the keywords to filter relevant files - keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"] - files_containing_keywords = [ - path for path in group_items if any(keyword in path.name.lower() for keyword in keywords) - ] - if any(files_containing_keywords): - list_of_edi_energy_documents = [EdiEnergyDocument.from_path(path) for path in files_containing_keywords] - else: - list_of_edi_energy_documents = [EdiEnergyDocument.from_path(path) for path in group_items] - most_recent_file = max(list_of_edi_energy_documents) + result_paths: list[Path] = [] - return most_recent_file.filename + @property + def path_to_format_version_folders(self): + """Returns the path to the edi_energy_de directory containing format version folders like FV2410, FV2404, etc.""" + path_to_edi_energy_mirror = self.path_to_edi_energy_mirror + if not path_to_edi_energy_mirror.exists(): + raise ValueError(f"The edi_energy_mirror directory {path_to_edi_energy_mirror} does not exist.") + return path_to_edi_energy_mirror / Path("edi_energy_de") - except ValueError as e: + @property + def path_to_specific_format_version_folder(self): + """Returns the path to the specific format version folder.""" + specific_format_version_folder = self.path_to_format_version_folders / Path(self.format_version.value) + if not specific_format_version_folder.exists(): + raise ValueError(f"The specific format version folder {specific_format_version_folder} does not exist.") + return specific_format_version_folder - logger.error("Error processing group items: %s", e) - return None + 
def get_file_paths_for_change_history(self) -> list[Path]: + """Get all file paths that contain change history for a given format version.""" + self._get_valid_docx_files() + self._filter_informational_versions() + self._get_most_recent_versions() -class DocxFileFinder(BaseModel): - """ - This class is responsible for finding the docx files in the input directory. - It can find MIG and AHB docx files. - """ + return self.result_paths - paths_to_docx_files: list[Path] + def get_file_paths_for_ahbs(self) -> list[Path]: + """Get all AHB file paths for a given format version.""" - @classmethod - def from_input_path(cls, input_path: Path) -> "DocxFileFinder": - """ - Create an DocxFileFinder object from the input path. - """ + self._get_valid_docx_files() + self._filter_informational_versions() + self._filter_for_ahb_docx_files() + self._get_most_recent_versions() - ahb_file_paths: list[Path] = [path for path in input_path.iterdir() if path.is_file() if path.suffix == ".docx"] - if not any(ahb_file_paths): # this is suspicious at least - logger.warning("The directory '%s' does not contain any docx files.", input_path.absolute()) - return cls(paths_to_docx_files=ahb_file_paths) + return self.result_paths - @staticmethod - def get_first_part_of_ahb_docx_file_name(path_to_ahb_document: Path) -> str: + def get_file_paths_for_quality_map(self) -> list[Path]: """ - Return the first part of the AHB docx file name. - The first part contains the information about the EDIFACT formats. + This function returns a list of docx files which contain a quality map. + Only the UTILMD AHB Strom documents contain quality maps. + + Returns: + list[Path]: A list of paths to the most recent UTILMD AHB Strom documents. 
""" + self._get_valid_docx_files() + self._filter_informational_versions() - return path_to_ahb_document.name.split("-")[0] + # Group documents by kind and format + grouped_docs = self.group_documents_by_kind_and_format(self.result_paths) + + # Find the UTILMD AHB Strom group + utilmd_strom_docs = [] + for _, group_paths in grouped_docs.items(): + if any("AHB_UTILMD" in path.name for path in group_paths): + utilmd_strom_docs.extend(group_paths) + + if not utilmd_strom_docs: + self.result_paths = [] + return self.result_paths + + # Update result_paths with UTILMD Strom docs and get most recent version + self.result_paths = utilmd_strom_docs + self._get_most_recent_versions() + + return self.result_paths + + def _filter_for_ahb_docx_files(self) -> None: + """Filter the list of AHB docx paths for the latest AHB docx files. - def filter_for_latest_ahb_docx_files(self) -> None: - """ - Filter the list of AHB docx paths for the latest AHB docx files. The latest files contain `LesefassungmitFehlerkorrekturen` in their file names. - This method is _not_ pure. It changes the state of the object. """ - self.paths_to_docx_files = self.filter_ahb_docx_files(self.paths_to_docx_files) - grouped_files = self.group_files_by_name_prefix(self.paths_to_docx_files) - self.paths_to_docx_files = self.filter_latest_version(grouped_files) + self.result_paths = [path for path in self.result_paths if "AHB" in path.name] - @staticmethod - def filter_ahb_docx_files(paths_to_docx_files: list[Path]) -> list[Path]: - """ - This function filters the docx files which contain the string "AHB" in their file name. - """ - return [path for path in paths_to_docx_files if "AHB" in path.name] + def _get_valid_docx_files(self) -> None: + """Get all valid docx files from a directory, excluding temporary files. - @staticmethod - def filter_for_docx_files_with_change_history(paths_to_docx_files: list[Path]) -> list[Path]: - """ - This function filters the docx files which contain a change history. 
- At this time it seems that all docx files have a change history. - But this may change in the future, so search for some keywords in the file name. + Args: + directory (Path): The directory to search in. + + Returns: + list[Path]: A list of paths to valid docx files. """ - return [ + self.result_paths = [ path - for path in paths_to_docx_files - if "AHB" in path.name - or "MIG" in path.name - or "AllgemeineFestlegungen" in path.name - or "Codeliste" in path.name - or "Entscheidungsbaum" in path.name + for path in self.path_to_specific_format_version_folder.iterdir() + if path.name.endswith(".docx") and not path.name.startswith("~") ] - # pylint: disable=line-too-long - @staticmethod - def group_files_by_name_prefix(paths_to_docx_files: list[Path]) -> dict[str, list[Path]]: - """ - This function groups the docx files by their name prefix. - Groups may now look like this: - {'APERAKCONTRLAHB': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/APERAKCONTRLAHB-informatorischeLesefassung2.3m_99991231_20231001.docx')], - 'COMDISAHB': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/COMDISAHB-informatorischeLesefassung1.0dKonsolidierteLesefassungmitFehlerkorrekturenStand20.07.2023_99991231_20231001.docx'), WindowsPath('../edi_energy_mirror/edi_energy_de/future/COMDISAHB-informatorischeLesefassung1.0d_99991231_20231001.docx')], - 'HerkunftsnachweisregisterAHB': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/HerkunftsnachweisregisterAHB-informatorischeLesefassung2.3cKonsolidierteLesefassungmitFehlerkorrekturenStand19.06.2023_99991231_20231001.docx'), WindowsPath('../edi_energy_mirror/edi_energy_de/future/HerkunftsnachweisregisterAHB-informatorischeLesefassung2.3c_99991231_20231001.docx')], - 'IFTSTAAHB': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/IFTSTAAHB-informatorischeLesefassung2.0eKonsolidierteLesefassungmitFehlerkorrekturenStand20.07.2023_99991231_20231001.docx'), 
WindowsPath('../edi_energy_mirror/edi_energy_de/future/IFTSTAAHB-informatorischeLesefassung2.0e_99991231_20231001.docx')], - 'INVOICREMADVAHB': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/INVOICREMADVAHB-informatorischeLesefassung2.5bKonsolidierteLesefassungmitFehlerkorrekturenStand20.07.2023_99991231_20231001.docx'), WindowsPath('../edi_energy_mirror/edi_energy_de/future/INVOICREMADVAHB-informatorischeLesefassung2.5b_99991231_20231001.docx')], - 'MSCONSAHB': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/MSCONSAHB-informatorischeLesefassung3.1c_99991231_20231001.docx')], - 'ORDERSORDRSPAHBMaBiS': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/ORDERSORDRSPAHBMaBiS-informatorischeLesefassung2.2c_99991231_20231001.docx')], - 'PARTINAHB': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/PARTINAHB-informatorischeLesefassung1.0c_99991231_20231001.docx')], - 'PRICATAHB': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/PRICATAHB-informatorischeLesefassung2.0c_99991231_20231001.docx')], - 'REQOTEQUOTESORDERSORDRSPORDCHGAHB': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/REQOTEQUOTESORDERSORDRSPORDCHGAHB-informatorischeLesefassung2.2KonsolidierteLesefassungmitFehlerkorrekturenStand20.07.2023_99991231_20231001.docx'), WindowsPath('../edi_energy_mirror/edi_energy_de/future/REQOTEQUOTESORDERSORDRSPORDCHGAHB-informatorischeLesefassung2.2_99991231_20231001.docx')], - 'UTILMDAHBGas': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/UTILMDAHBGas-informatorischeLesefassung1.0aKonsolidierteLesefassungmitFehlerkorrekturenStand29.06.2023_99991231_20231001.docx'), WindowsPath('../edi_energy_mirror/edi_energy_de/future/UTILMDAHBGas-informatorischeLesefassung1.0a_99991231_20231001.docx')], - 'UTILMDAHBMaBiS': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/UTILMDAHBMaBiS-informatorischeLesefassung4.1_99991231_20231001.docx')], - 'UTILMDAHBStrom': 
[WindowsPath('../edi_energy_mirror/edi_energy_de/future/UTILMDAHBStrom-informatorischeLesefassung1.1KonsolidierteLesefassungmitFehlerkorrekturenStand29.06.2023_99991231_20231001.docx'), WindowsPath('../edi_energy_mirror/edi_energy_de/future/UTILMDAHBStrom-informatorischeLesefassung1.1_99991231_20231001.docx')], - 'UTILTSAHBBerechnungsformel': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/UTILTSAHBBerechnungsformel-informatorischeLesefassung1.0e_99991231_20231001.docx')], 'UTILTSAHBDefinitionen': [WindowsPath('../edi_energy_mirror/edi_energy_de/future/UTILTSAHBDefinitionen-informatorischeLesefassung1.1_99991231_20231001.docx')]} - """ - return { - group_key: list(group) - for group_key, group in groupby( - sorted(paths_to_docx_files, key=DocxFileFinder.get_first_part_of_ahb_docx_file_name), - key=DocxFileFinder.get_first_part_of_ahb_docx_file_name, - ) - } + def _filter_informational_versions(self) -> None: + """Filter paths to only include informational reading versions. - @staticmethod - def filter_latest_version(groups: dict[str, list[Path]]) -> list[Path]: + Args: + paths (list[Path]): List of paths to filter. + + Returns: + list[Path]: Filtered list containing only informational reading versions. """ - Filters and returns the latest version of the AHB or MIG .docx files - from the provided groups based on specific criteria. + informational_versions = [] + for path in self.result_paths: + document_metadata = extract_document_meta_data(path.name) + if document_metadata and document_metadata.is_informational_reading_version: + informational_versions.append(path) + self.result_paths = informational_versions - The latest version is determined based on the presence of specific - keywords in the filename and the numerical suffix in the filename. + def _get_most_recent_versions(self) -> None: + """Get the most recent version from each group of documents. 
- Parameters: - - groups (Dict[str, List[Path]]): A dictionary where keys are group identifiers - and values are lists of Path objects representing the file paths. + Args: + grouped_docs (dict[tuple[str, str], list[Path]]): Documents grouped by kind and format. Returns: - - List[Path]: A list of Path objects representing the latest version of the files. + list[Path]: List of the most recent version from each group. """ - result: list[Path] = [] - for group_items in groups.values(): - most_recent_file = get_most_recent_file(group_items) - assert most_recent_file is not None, "Could not find the most recent file." - result.append(most_recent_file) - return result + grouped_docs = self.group_documents_by_kind_and_format(self.result_paths) + + most_recent_versions = [] + for group in grouped_docs.values(): + if len(group) == 1: + most_recent_versions.append(group[0]) + continue + + filtered_group = self._filter_error_corrections(group) + sorted_group = self._sort_group_by_metadata(filtered_group) + if sorted_group: + most_recent_versions.append(sorted_group[0]) + + self.result_paths = sorted(most_recent_versions) - def filter_for_latest_mig_and_ahb_docx_files(self) -> None: + def _filter_error_corrections(self, group: list[Path]) -> list[Path]: + """Filter group to keep only error correction versions if they exist. + + Args: + group (list[Path]): List of paths in a group. + + Returns: + list[Path]: Filtered list containing only error correction versions if they exist, + otherwise returns the original group. """ - Filter the list of MIG docx paths for the latest MIG docx files. 
+ has_error_correction = any( + extract_document_meta_data(path.name).is_error_correction + for path in group + if extract_document_meta_data(path.name) + ) + + if has_error_correction: + return [ + path + for path in group + if extract_document_meta_data(path.name) and extract_document_meta_data(path.name).is_error_correction + ] + return group + + def _sort_group_by_metadata(self, group: list[Path]) -> list[Path]: + """Sort group by version, publication date, and validity dates. + + Args: + group (list[Path]): List of paths to sort. + + Returns: + list[Path]: Sorted list of paths. """ - self.paths_to_docx_files = self.filter_for_docx_files_with_change_history(self.paths_to_docx_files) - grouped_files = self.group_files_by_name_prefix(self.paths_to_docx_files) - self.paths_to_docx_files = self.filter_latest_version(grouped_files) + try: + return sorted( + group, + key=lambda x: ( + ( + extract_document_meta_data(x.name).version, + extract_document_meta_data(x.name).publication_date, + extract_document_meta_data(x.name).is_extraordinary_publication, + extract_document_meta_data(x.name).valid_from, + extract_document_meta_data(x.name).valid_until, + ) + if extract_document_meta_data(x.name) + else (None, None, None, None) + ), + reverse=True, + ) + except TypeError as e: + logger.exception("Could not sort group %s: %s", group, e) + return group - def filter_docx_files_for_edifact_format(self, edifact_format: EdifactFormat) -> None: + @staticmethod + def get_first_part_of_ahb_docx_file_name(path_to_ahb_document: Path) -> str: """ - Returns a list of docx files which contain the given edifact format. - This method is not pure. It changes the state of the object. + Return the first part of the AHB docx file name. + The first part contains the information about the EDIFACT formats. 
""" - self.paths_to_docx_files = [path for path in self.paths_to_docx_files if str(edifact_format) in path.name] + return path_to_ahb_document.name.split("-")[0] - def remove_temporary_files(self) -> None: - """ - This method removes all temporary files from paths_to_docx_files. - Temporary files lead to the exception `BadZipFile: File is not a zip file`. - It appears if a docx file is opened by Word. + def _filter_docx_files_for_edifact_format(self, edifact_format: EdifactFormat) -> None: """ + Filters the result_paths to only include files containing the given EDIFACT format in their name. - self.paths_to_docx_files = [path for path in self.paths_to_docx_files if not path.name.startswith("~")] + This method modifies the state of the object by updating result_paths. - def get_docx_files_which_may_contain_searched_pruefi(self, searched_pruefi: str) -> list[Path]: - """ - This functions takes a pruefidentifikator and returns a list of docx files which can contain the searched pruefi - Unfortunately, it is not clear in which docx the pruefidentifikator you are looking for is located. - A 11042 belongs to the UTILMD format. However, there are seven docx files that describe the UTILMD format. - A further reduction of the number of files is not possible with the pruefidentifikator only. - This method is _not_ pure. It changes the state of the object. + Args: + edifact_format (EdifactFormat): The EDIFACT format to filter for (e.g. 
UTILMD, MSCONS) + + Returns: + None """ - edifact_format = get_format_of_pruefidentifikator(searched_pruefi) - if edifact_format is None: - logger.exception("❌ There is no known format for the prüfi '%s'.", searched_pruefi) - raise ValueError(f"There is no known format for the prüfi '{searched_pruefi}'.") - - self.filter_for_latest_ahb_docx_files() - self.filter_docx_files_for_edifact_format(edifact_format=edifact_format) - if ( - edifact_format == EdifactFormat.UTILMD - and searched_pruefi.startswith("11") - and all("202310" in path.name for path in self.paths_to_docx_files) - ): - logger.info( - # pylint:disable=line-too-long - "You searched for a UTILMD prüfi %s starting with the soon deprecated prefix '11' but all relevant files %s are valid from 2023-10 onwards. They won't contain any match.", - searched_pruefi, - ", ".join([path.name for path in self.paths_to_docx_files]), - ) - return [] - return self.paths_to_docx_files + self.result_paths = [path for path in self.result_paths if str(edifact_format) in path.name] - def get_all_docx_files_which_contain_change_histories(self) -> list[Path]: - """ - This function returns a list of docx files which probably contain a change history. - Only format documents like UTILMD, MSCONS etc. contain a change history. + @staticmethod + def group_documents_by_kind_and_format(paths: list[Path]) -> dict[tuple[str, str], list[Path]]: """ + Groups documents by their kind and EDIFACT format. - self.filter_for_latest_mig_and_ahb_docx_files() - self.remove_temporary_files() + Args: + paths (list[Path]): List of paths to process - return self.paths_to_docx_files + Returns: + dict[tuple[str, str], list[Path]]: Dictionary where key is (kind, edifact_format) and value is list of paths - def get_docx_files_which_contain_quality_map(self) -> list[Path]: - """ - This function returns a list of docx files which contain a quality map. 
+ Example: + >>> paths = [Path("UTILMDAHB-1.0.docx"), Path("INVOICAHB-2.0.docx")] + >>> result = DocxFileFinder.group_documents_by_kind_and_format(paths) + >>> # Result might look like: {("AHB", "UTILMD"): [Path("UTILMDAHB-1.0.docx")], + >>> # ("AHB", "INVOIC"): [Path("INVOICAHB-2.0.docx")]} """ + result: dict[tuple[str, str], list[Path]] = {} + + for path in paths: + try: + metadata = extract_document_meta_data(path.name) + if metadata is None or metadata.kind is None or metadata.edifact_format is None: + + # fallback for files whose metadata, kind or EDIFACT format is missing, e.g. + # 'codelistederkonfigurationen_1.3b_20250606_99991231_20241213_xoxx_11124.docx' + # 'codelistederkonfigurationeninformatorischelesefassung_1.3b_20250606_99991231_20250606_ooox_8757.docx' + # 'allgemeinefestlegungeninformatorischelesefassung_6.1b_20250606_99991231_20250606_ooox_8638.docx' + # 'apiguidelineinformatorischelesefassung_1.0a_20250606_99991231_20250606_ooox_10824.docx' - self.filter_for_latest_ahb_docx_files() - self.remove_temporary_files() + x = path.name.split("_")[0] - indicator_string = "UTILMDAHBStrom" - self.paths_to_docx_files = [path for path in self.paths_to_docx_files if indicator_string in path.name] + key = (x, "") - return self.paths_to_docx_files + else: + key = (metadata.kind, metadata.edifact_format) + if key not in result: + result[key] = [] + result[key].append(path) + except Exception as e: + logger.warning(f"Could not process {path.name}: {str(e)}") + continue + + return result diff --git a/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/COMDISAHB-informatorischeLesefassung1.0dKonsolidierteLesefassungmitFehlerkorrekturenStand19.06.2023_20230719_20231001.docx b/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/AHB_COMDIS_1.0d_20231001_20230719_20230619_xoxx_8867.docx similarity index 100% rename from unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/COMDISAHB-informatorischeLesefassung1.0dKonsolidierteLesefassungmitFehlerkorrekturenStand19.06.2023_20230719_20231001.docx rename to 
unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/AHB_COMDIS_1.0d_20231001_20230719_20230619_xoxx_8867.docx diff --git a/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/ORDERSORDRSPAHBMaBiS-informatorischeLesefassung2.2c_99991231_20231001.docx b/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/AHB_ORDRSP_2.2c_20231001_20250605_20231001_oxox_9761.docx similarity index 100% rename from unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/ORDERSORDRSPAHBMaBiS-informatorischeLesefassung2.2c_99991231_20231001.docx rename to unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/AHB_ORDRSP_2.2c_20231001_20250605_20231001_oxox_9761.docx diff --git a/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/PARTINAHB-informatorischeLesefassung1.0cKonsolidierteLesefassungmitFehlerkorrekturenStand29.09.2023_20240402_20231001.docx b/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/AHB_PARTIN_1.0c_20231001_20240402_20230929_xoxx_9813.docx similarity index 100% rename from unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/PARTINAHB-informatorischeLesefassung1.0cKonsolidierteLesefassungmitFehlerkorrekturenStand29.09.2023_20240402_20231001.docx rename to unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/AHB_PARTIN_1.0c_20231001_20240402_20230929_xoxx_9813.docx diff --git a/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/expected-output/2024-03-30_change_histories.xlsx b/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/expected-output/2024-03-30_change_histories.xlsx index 17c54738..f9cfbcc8 100644 Binary files a/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/expected-output/2024-03-30_change_histories.xlsx and b/unittests/test-edi-energy-mirror-repo/edi_energy_de/FV2310/expected-output/2024-03-30_change_histories.xlsx differ diff --git a/unittests/test_ahb.py b/unittests/test_ahb.py index 6501eb96..0cd84d82 100644 --- a/unittests/test_ahb.py +++ b/unittests/test_ahb.py @@ -1,6 
+1,7 @@ from pathlib import Path import pytest +from efoli import EdifactFormatVersion from freezegun import freeze_time from kohlrahbi.ahb import find_pruefidentifikatoren, get_ahb_documents_path, save_pruefi_map_to_toml @@ -13,7 +14,9 @@ def test_find_pruefidentifikatoren(self, snapshot): """ test find_pruefidentifikatoren. """ - pruefis = find_pruefidentifikatoren(Path(__file__).parents[1] / "edi_energy_mirror/edi_energy_de/FV2310") + pruefis = find_pruefidentifikatoren( + Path(__file__).parents[1] / "edi_energy_mirror/", EdifactFormatVersion.FV2410 + ) assert pruefis == snapshot def test_get_ahb_documents_path(self): diff --git a/unittests/test_ahb_file_finder.py b/unittests/test_ahb_file_finder.py deleted file mode 100644 index 941b7bff..00000000 --- a/unittests/test_ahb_file_finder.py +++ /dev/null @@ -1,72 +0,0 @@ -from pathlib import Path - -import pytest -from efoli import EdifactFormat - -from kohlrahbi.docxfilefinder import DocxFileFinder - - -class TestDocxFileFinder: - """ - This class contains the unit tests for the DocxFileFinder class. - """ - - @pytest.mark.parametrize( - "searched_pruefi, expected_docx_count", - [ - pytest.param( - "11042", - 6, - id="11042 - Anmeldung MSB", - ), - pytest.param( - "13002", - 1, - id="13002 - Zaehlerstand (Gas)", - ), - ], - ) - def test_get_docx_files_which_may_contain_searched_pruefi(self, searched_pruefi: str, expected_docx_count: int): - """ - Test if the get_docx_files_which_may_contain_searched_pruefi method returns the correct number of docx files. 
- """ - path_to_ahb_documents: Path = Path.cwd() / Path("unittests/test-edi-energy-mirror-repo/docx_files") - - ahb_file_finder = DocxFileFinder.from_input_path(input_path=path_to_ahb_documents) - - ahb_file_finder.get_docx_files_which_may_contain_searched_pruefi(searched_pruefi=searched_pruefi) - - assert len(ahb_file_finder.paths_to_docx_files) == expected_docx_count - - def test_filter_docx_files_for_edifact_format(self) -> None: - """ - Test the filter_docx_files_for_edifact_format method. - This method filters the list of docx paths for the given EDIFACT format. - But it does not filter for the latest AHB docx files neither does it filter for AHBs or MIGs only. - So we assert two ahb files and two MIG files for the MSCONS format: - - MSCONSAHB-informatorischeLesefassung3.1aKonsolidierteLesefassungmitFehlerkorrekturenStand27.09.2022_20230331_20221001 - - MSCONSMIG-informatorischeLesefassung2.4a_99991231_20221001 - - MSCONSAHB-informatorischeLesefassung3.1a_20230331_20221001 - - MSCONSMIG-informatorischeLesefassung2.4aKonsolidierteLesefassungmitFehlerkorrekturenStand23.05.2022_99991231_20221001 - """ - path_to_ahb_documents: Path = Path.cwd() / Path("unittests/test-edi-energy-mirror-repo/docx_files") - - ahb_file_finder = DocxFileFinder.from_input_path(input_path=path_to_ahb_documents) - - ahb_file_finder.filter_docx_files_for_edifact_format(edifact_format=EdifactFormat.MSCONS) - - assert len(ahb_file_finder.paths_to_docx_files) == 4 - - def test_filter_for_latest_ahb_docx_files(self) -> None: - """ - Test the filter_for_latest_ahb_docx_files method. - This method filters the list of AHB docx paths for the latest AHB docx files. - The latest files contain `LesefassungmitFehlerkorrekturen` in their file names. 
- """ - path_to_ahb_documents: Path = Path.cwd() / Path("unittests/test-edi-energy-mirror-repo/docx_files") - - ahb_file_finder = DocxFileFinder.from_input_path(input_path=path_to_ahb_documents) - - ahb_file_finder.filter_for_latest_ahb_docx_files() - - assert len(ahb_file_finder.paths_to_docx_files) == 18 diff --git a/unittests/test_changehistory.py b/unittests/test_changehistory.py new file mode 100644 index 00000000..5796ff17 --- /dev/null +++ b/unittests/test_changehistory.py @@ -0,0 +1,165 @@ +"""Tests for the changehistory module.""" + +import pytest + +from kohlrahbi.changehistory import extract_sheet_name + + +@pytest.mark.parametrize( + "input_filename,expected_output", + [ + pytest.param( + "AHB_COMDIS_1.0f_20250606_99991231_20250606_oxox_11427.docx", + "AHB_COMDIS_1.0f", + id="AHB_COMDIS_1.0f", + ), + pytest.param( + "AHB_CONTRL_2.4a_20250606_99991231_20241213_xoxx_11128.docx", + "AHB_CONTRL_2.4a", + id="AHB_CONTRL_2.4a", + ), + pytest.param( + "AHB_IFTSTA_2.0g_20250606_99991231_20241213_xoxx_11132.docx", + "AHB_IFTSTA_2.0g", + id="AHB_IFTSTA_2.0g", + ), + pytest.param( + "AHB_MSCONS_3.1f_20250606_99991231_20250606_ooox_9612.docx", + "AHB_MSCONS_3.1f", + id="AHB_MSCONS_3.1f", + ), + pytest.param( + "AHB_ORDCHG_1.0a_20250606_99991231_20250606_ooox_11100.docx", + "AHB_ORDCHG_1.0a", + id="AHB_ORDCHG_1.0a", + ), + pytest.param( + "AHB_ORDERS_1.0a_20250606_99991231_20250131_xoxx_11441.docx", + "AHB_ORDERS_1.0a", + id="AHB_ORDERS_1.0a", + ), + pytest.param( + "AHB_ORDRSP_1.0a_20250606_99991231_20250606_ooox_11104.docx", + "AHB_ORDRSP_1.0a", + id="AHB_ORDRSP_1.0a", + ), + pytest.param( + "AHB_PARTIN_1.0e_20250606_99991231_20250606_ooox_9819.docx", + "AHB_PARTIN_1.0e", + id="AHB_PARTIN_1.0e", + ), + pytest.param( + "AHB_PRICAT_2.0e_20250606_99991231_20250606_ooox_9965.docx", + "AHB_PRICAT_2.0e", + id="AHB_PRICAT_2.0e", + ), + pytest.param( + "AHB_QUOTES_1.0_20250606_99991231_20241213_xoxx_11146.docx", + "AHB_QUOTES_1.0", + id="AHB_QUOTES_1.0", + ), + 
pytest.param( + "AHB_REMADV_2.5d_20250606_99991231_20250131_xoxx_11434.docx", + "AHB_REMADV_2.5d", + id="AHB_REMADV_2.5d", + ), + pytest.param( + "AHB_REQOTE_1.0a_20250606_99991231_20250606_ooox_11109.docx", + "AHB_REQOTE_1.0a", + id="AHB_REQOTE_1.0a", + ), + pytest.param( + "AHB_UTILMD_2.1_20250606_99991231_20241213_xoxx_11157.docx", + "AHB_UTILMD_2.1", + id="AHB_UTILMD_2.1", + ), + pytest.param( + "AHB_UTILTS_1.0_20250606_99991231_20241213_xoxx_11164.docx", + "AHB_UTILTS_1.0", + id="AHB_UTILTS_1.0", + ), + pytest.param( + "EBD_4.0b_20250606_99991231_20250131_xoxx_11425.docx", + "EBD_4.0b", + id="EBD_4.0b", + ), + pytest.param( + "MIG_APERAK_2.1i_20250606_99991231_20250606_ooox_8671.docx", + "MIG_APERAK_2.1i", + id="MIG_APERAK_2.1i", + ), + pytest.param( + "MIG_COMDIS_1.0e_20250606_99991231_20250606_ooox_8885.docx", + "MIG_COMDIS_1.0e", + id="MIG_COMDIS_1.0e", + ), + pytest.param( + "MIG_IFTSTA_2.0f_20250606_99991231_20250606_ooox_9326.docx", + "MIG_IFTSTA_2.0f", + id="MIG_IFTSTA_2.0f", + ), + pytest.param( + "MIG_INVOIC_2.8d_20250606_99991231_20250131_xoxx_11438.docx", + "MIG_INVOIC_2.8d", + id="MIG_INVOIC_2.8d", + ), + pytest.param( + "MIG_ORDERS_1.4a_20250606_99991231_20241213_xoxx_11139.docx", + "MIG_ORDERS_1.4a", + id="MIG_ORDERS_1.4a", + ), + pytest.param( + "MIG_ORDRSP_1.4_20250606_99991231_20250606_ooox_9797.docx", + "MIG_ORDRSP_1.4", + id="MIG_ORDRSP_1.4", + ), + pytest.param( + "MIG_PARTIN_1.0e_20250606_99991231_20250606_ooox_9836.docx", + "MIG_PARTIN_1.0e", + id="MIG_PARTIN_1.0e", + ), + pytest.param( + "MIG_PRICAT_2.0d_20250606_99991231_20250606_ooox_9982.docx", + "MIG_PRICAT_2.0d", + id="MIG_PRICAT_2.0d", + ), + pytest.param( + "MIG_QUOTES_1.3a_20250606_99991231_20241213_xoxx_11155.docx", + "MIG_QUOTES_1.3a", + id="MIG_QUOTES_1.3a", + ), + pytest.param( + "MIG_REQOTE_1.3b_20250606_99991231_20250606_ooox_10067.docx", + "MIG_REQOTE_1.3b", + id="MIG_REQOTE_1.3b", + ), + pytest.param( + "MIG_UTILMD_S2.1_20250606_99991231_20250131_xoxx_11449.docx", + 
"MIG_UTILMD_S2.1", + id="MIG_UTILMD_S2.1", + ), + pytest.param( + "MIG_UTILTS_1.1e_20250606_99991231_20241213_xoxx_11171.docx", + "MIG_UTILTS_1.1e", + id="MIG_UTILTS_1.1e", + ), + pytest.param( + "allgemeinefestlegungeninformatorischelesefassung_6.1b_20250606_99991231_20250606_ooox_8638.docx", + "Allgemeine_Festlegungen_6.1b", + id="allgemeinefestlegungeninformatorischelesefassung_6.1b", + ), + pytest.param( + "apiguidelineinformatorischelesefassung_1.0a_20250606_99991231_20250606_ooox_10824.docx", + "API_Guideline_1.0a", + id="apiguidelineinformatorischelesefassung_1.0a", + ), + pytest.param( + "codelistederkonfigurationen_1.3b_20250606_99991231_20241213_xoxx_11124.docx", + "CL_der_Konfigurationen_1.3b", + id="codelistederkonfigurationen_1.3b", + ), + ], +) +def test_extract_sheet_name(input_filename: str, expected_output: str) -> None: + """Test extraction of sheet names from standard AHB/MIG files.""" + assert extract_sheet_name(input_filename) == expected_output diff --git a/unittests/test_docxfilefinder.py b/unittests/test_docxfilefinder.py index 905981e8..b503464b 100644 --- a/unittests/test_docxfilefinder.py +++ b/unittests/test_docxfilefinder.py @@ -1,117 +1,343 @@ -from pathlib import Path, PosixPath +from pathlib import Path import pytest +from efoli import EdifactFormatVersion -from kohlrahbi.docxfilefinder import DocxFileFinder, get_most_recent_file +from kohlrahbi.docxfilefinder import DocxFileFinder, split_version_string class TestDocxFileFinder: + def test_split_version_string(self): + assert split_version_string("1.0f") == ("", 1, 0, "f") + assert split_version_string("1.0") == ("", 1, 0, "") + assert split_version_string("1.0a") == ("", 1, 0, "a") + assert split_version_string("2.0b") == ("", 2, 0, "b") + assert split_version_string("4.2c") == ("", 4, 2, "c") + assert split_version_string("S2.2c") == ("S", 2, 2, "c") + + def test_get_valid_docx_files(self, tmp_path): + """Test that _get_valid_docx_files correctly identifies and filters docx 
files.""" + # Create a mock format version directory structure + format_version = EdifactFormatVersion.FV2504 + format_dir = tmp_path / "edi_energy_de" / format_version.value + format_dir.mkdir(parents=True) + + # Create test files + valid_files = [format_dir / "test1.docx", format_dir / "test2.docx", format_dir / "AHB_UTILMD_2.1.docx"] + temp_files = [ + format_dir / "~$temp.docx", # Word temporary file + format_dir / "~WRL0001.tmp", # Another temp file + ] + other_files = [format_dir / "test.txt", format_dir / "test.pdf", format_dir / "test.doc"] + + # Create all the test files + for file in valid_files + temp_files + other_files: + file.touch() + + # Initialize DocxFileFinder with the test directory + docx_file_finder = DocxFileFinder(path_to_edi_energy_mirror=tmp_path, format_version=format_version) + + # Call the method + docx_file_finder._get_valid_docx_files() # pylint: disable=protected-access + + # Verify results + assert len(docx_file_finder.result_paths) == len(valid_files) + assert all(path in docx_file_finder.result_paths for path in valid_files) + assert all(path not in docx_file_finder.result_paths for path in temp_files) + assert all(path not in docx_file_finder.result_paths for path in other_files) + + def test_get_valid_docx_files_empty_directory(self, tmp_path): + """Test that _get_valid_docx_files handles empty directories correctly.""" + # Create empty format version directory + format_version = EdifactFormatVersion.FV2504 + format_dir = tmp_path / "edi_energy_de" / format_version.value + format_dir.mkdir(parents=True) + + # Initialize DocxFileFinder with the empty directory + docx_file_finder = DocxFileFinder(path_to_edi_energy_mirror=tmp_path, format_version=format_version) + + # Call the method + docx_file_finder._get_valid_docx_files() # pylint: disable=protected-access + + # Verify results + assert len(docx_file_finder.result_paths) == 0 + assert isinstance(docx_file_finder.result_paths, list) + @pytest.mark.parametrize( - ["group_items", 
"expected"], + ["initial_paths", "expected_paths"], [ pytest.param( - {"UTILTSAHB": [Path("UTILTSAHB-informatorischeLesefassung4.0_20240701_20240401.docx")]}, - [Path("UTILTSAHB-informatorischeLesefassung4.0_20240701_20240401.docx")], - id="Single File", - ), - pytest.param( - { - "UTILTSAHB": [ - Path( - "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx" - ), - Path( - "UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240701_20240501.docx" - ), - Path( - "UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240930_20240401.docx" - ), - Path( - "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx" - ), - Path( - "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx" - ), - ] - }, [ - Path( - "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx" - ) + Path("UTILMDAHBMaBiS-informatorischeLesefassung4.0.docx"), + Path("UTILMD-informatorischeLesefassung4.0.docx"), + Path("APERAKAHB-informatorischeLesefassung2.0.docx"), + Path("MIG-informatorischeLesefassung1.0.docx"), ], - id="Standard Case", - ), - pytest.param( - { - "UTILTSAHB": [ - Path( - "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240701_20240401.docx" - ), - Path( - "UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240731_20240701.docx" - ), - Path( - "UTILTSAHB-informatorischeLesefassung4.0-außerordentlicheveröffentlichung_20240930_20240401.docx" - ), - Path( - "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx" - ), - Path( - "UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240501.docx" - ), - ] - }, [ - Path( - 
"UTILTSAHB-informatorischeLesefassung4.0Konsolidiertelesefassungmitfehlerkorrekturen_20240930_20240701.docx" - ) + Path("UTILMDAHBMaBiS-informatorischeLesefassung4.0.docx"), + Path("APERAKAHB-informatorischeLesefassung2.0.docx"), ], - id="Valid from tie", + id="mixed_files", ), pytest.param( - { - "UTILMDAHBMaBiS": [ - Path("UTILMDAHBMaBiS-informatorischeLesefassung4.0_99991231_20231001.docx"), - Path( - "UTILMDAHBMaBiS-informatorischeLesefassung4.1aKonsolidierteLesefassungmitFehlerkorrekturenStand11.03.2024_20250403_20240403.docx" - ), - Path("UTILMDAHBMaBiS-informatorischeLesefassung4.1a_20250403_20240403.docx"), - ] - }, [ - Path( - "UTILMDAHBMaBiS-informatorischeLesefassung4.1aKonsolidierteLesefassungmitFehlerkorrekturenStand11.03.2024_20250403_20240403.docx" - ) + Path("UTILMD-informatorischeLesefassung4.0.docx"), + Path("MIG-informatorischeLesefassung1.0.docx"), ], - id="different names", + [], + id="no_ahb_files", ), - ], - ) - def test_filter_latest_version(self, group_items, expected): - assert DocxFileFinder.filter_latest_version(group_items) == expected - - @pytest.mark.parametrize( - ["group_items", "expected"], - [ pytest.param( [ - Path("APERAKCONTRLAHB-informatorischeLesefassung2.4a_99991231_20250404.docx"), - Path("APERAKCONTRLAHB-informatorischeLesefassung2.4_99991231_20250404.docx"), + Path("UTILMDAHBMaBiS-informatorischeLesefassung4.0.docx"), + Path("APERAKAHB-informatorischeLesefassung2.0.docx"), ], - Path("APERAKCONTRLAHB-informatorischeLesefassung2.4a_99991231_20250404.docx"), - id="Two versions of the same file", - ), - pytest.param( [ - Path("CodelistederKonfigurationen-informatorischeLesefassung1.3_99991231_20250404.docx"), - Path("CodelistederKonfigurationen-informatorischeLesefassung1.1_99991231_20231001.docx"), - Path("CodelistederKonfigurationen-informatorischeLesefassung1.3a_99991231_20250404.docx"), - Path("CodelistederKonfigurationen-informatorischeLesefassung1.3b_99991231_20250404.docx"), + 
Path("UTILMDAHBMaBiS-informatorischeLesefassung4.0.docx"), + Path("APERAKAHB-informatorischeLesefassung2.0.docx"), ], - Path("CodelistederKonfigurationen-informatorischeLesefassung1.3b_99991231_20250404.docx"), - id="Four versions of the same file", + id="only_ahb_files", ), ], ) - def test_get_most_recent(self, group_items: list[Path], expected: Path): - assert get_most_recent_file(group_items) == expected + def test_filter_for_ahb_docx_files(self, initial_paths, expected_paths): + """Test that _filter_for_ahb_docx_files correctly filters for AHB files.""" + # Create DocxFileFinder instance with required format_version + docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + # Set initial paths + docx_file_finder.result_paths = initial_paths + # Apply filter + docx_file_finder._filter_for_ahb_docx_files() # pylint: disable=protected-access + # Verify results + assert sorted(docx_file_finder.result_paths) == sorted(expected_paths) + + def test_filter_informational_versions(self): + """Test that _filter_informational_versions correctly filters for informational reading versions.""" + # Create DocxFileFinder instance + docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + + # Set up test paths with mix of informational and non-informational versions + docx_file_finder.result_paths = [ + # Informational reading versions + Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_ooox_8871.docx"), + Path("AHB_CONTRL_2.4a_20250606_99991231_20241213_xoxx_11128.docx"), + Path("MIG_UTILMD_S2.1_20250606_20250129_20241213_xoxx_11161.docx"), + # Non-informational versions + Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_oooo_8872.pdf"), + Path("AHB_CONTRL_2.4a_20250606_99991231_20250606_oooo_8927.pdf"), + Path("MIG_UTILMD_S2.1_20250606_20250129_20241213_xoxo_11160.pdf"), + ] + + # Expected results (only informational reading versions) + 
expected_paths = [ + Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_ooox_8871.docx"), + Path("AHB_CONTRL_2.4a_20250606_99991231_20241213_xoxx_11128.docx"), + Path("MIG_UTILMD_S2.1_20250606_20250129_20241213_xoxx_11161.docx"), + ] + + # Apply filter + docx_file_finder._filter_informational_versions() # pylint: disable=protected-access + + # Verify results + assert len(docx_file_finder.result_paths) == len(expected_paths) + assert sorted(docx_file_finder.result_paths) == sorted(expected_paths) + + def test_filter_informational_versions_empty(self): + """Test that _filter_informational_versions handles empty input correctly.""" + # Create DocxFileFinder instance + docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + + # Test with empty list + docx_file_finder.result_paths = [] + docx_file_finder._filter_informational_versions() # pylint: disable=protected-access + assert len(docx_file_finder.result_paths) == 0 + + def test_filter_informational_versions_no_informational(self): + """Test that _filter_informational_versions handles case with no informational versions.""" + # Create DocxFileFinder instance + docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + + # Set up test paths with only non-informational versions + docx_file_finder.result_paths = [ + Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_oooo_8872.pdf"), + Path("AHB_CONTRL_2.4a_20250606_99991231_20250606_oooo_8927.pdf"), + Path("MIG_ORDRSP_1.4_20250606_99991231_20250606_oooo_9796.pdf"), + ] + + # Apply filter + docx_file_finder._filter_informational_versions() # pylint: disable=protected-access + + # Verify results + assert len(docx_file_finder.result_paths) == 0 + + def test_get_most_recent_versions(self): + """Test that _get_most_recent_versions correctly identifies the most recent version of each document.""" + # Create DocxFileFinder instance + 
docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + + # Set up test paths with multiple versions of documents + docx_file_finder.result_paths = [ + # COMDIS versions + Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_ooox_8871.docx"), + Path( + "AHB_COMDIS_1.0f_20250606_99991231_20250606_oxox_11427.docx" + ), # most recent, extraordinary publication + # CONTRL versions + Path("AHB_CONTRL_2.4a_20250606_99991231_20250606_ooox_8928.docx"), + # UTILMD versions + Path("MIG_UTILMD_S2.1_20250606_20250129_20241213_xoxx_11161.docx"), + Path("MIG_UTILMD_S2.1_20250606_99991231_20250131_xoxx_11449.docx"), + Path("MIG_UTILMD_S2.1_20250606_99991231_20250606_ooox_10660.docx"), + # MSCONS versions + Path("AHB_MSCONS_3.1_20250606_99991231_20250606_ooox_9612.docx"), + ] + + # Expected results (only most recent versions) + expected_paths = [ + Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_oxox_11427.docx"), + Path("AHB_CONTRL_2.4a_20250606_99991231_20250606_ooox_8928.docx"), + Path("MIG_UTILMD_S2.1_20250606_99991231_20250131_xoxx_11449.docx"), + Path("AHB_MSCONS_3.1_20250606_99991231_20250606_ooox_9612.docx"), + ] + + # Apply filter + docx_file_finder._get_most_recent_versions() # pylint: disable=protected-access + + # Verify results + assert len(docx_file_finder.result_paths) == len(expected_paths) + assert sorted(docx_file_finder.result_paths) == sorted(expected_paths) + + def test_get_most_recent_versions_empty(self): + """Test that _get_most_recent_versions handles empty input correctly.""" + docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + docx_file_finder.result_paths = [] + docx_file_finder._get_most_recent_versions() # pylint: disable=protected-access + assert len(docx_file_finder.result_paths) == 0 + + def test_get_most_recent_versions_single_files(self): + """Test that _get_most_recent_versions correctly handles groups 
with single files.""" + docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + + # Set up test paths with single files of different types + input_paths = [ + Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_ooox_8871.docx"), + Path("AHB_CONTRL_2.4a_20250606_99991231_20241213_xoxx_11128.docx"), + Path("MIG_UTILMD_S2.1_20250606_20250129_20241213_xoxx_11161.docx"), + ] + docx_file_finder.result_paths = input_paths.copy() + + # Apply filter + docx_file_finder._get_most_recent_versions() # pylint: disable=protected-access + + # Verify results - should be same as input since each file is unique + assert len(docx_file_finder.result_paths) == len(input_paths) + assert sorted(docx_file_finder.result_paths) == sorted(input_paths) + + def test_get_most_recent_versions_with_error_corrections(self): + """Test that _get_most_recent_versions correctly prioritizes error correction versions.""" + docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + + # Set up test paths with error corrections and regular versions + docx_file_finder.result_paths = [ + # Regular versions + Path("AHB_CONTRL_2.4a_20250606_99991231_20250606_ooox_8928.docx"), + Path("MIG_QUOTES_1.3a_20250606_99991231_20250606_ooox_10001.docx"), + Path("MIG_UTILMD_S2.1_20250606_99991231_20250606_ooox_10660.docx"), + # Error correction versions (should be preferred) + Path("AHB_CONTRL_2.4a_20250606_99991231_20241213_xoxx_11128.docx"), + Path("MIG_QUOTES_1.3a_20250606_99991231_20241213_xoxx_11155.docx"), + Path("MIG_UTILMD_S2.1_20250606_99991231_20250131_xoxx_11449.docx"), + ] + + # Expected results (only error correction versions) + expected_paths = [ + Path("AHB_CONTRL_2.4a_20250606_99991231_20241213_xoxx_11128.docx"), + Path("MIG_QUOTES_1.3a_20250606_99991231_20241213_xoxx_11155.docx"), + Path("MIG_UTILMD_S2.1_20250606_99991231_20250131_xoxx_11449.docx"), + ] + + # Apply filter 
+ docx_file_finder._get_most_recent_versions() # pylint: disable=protected-access + + # Verify results + assert len(docx_file_finder.result_paths) == len(expected_paths) + assert sorted(docx_file_finder.result_paths) == sorted(expected_paths) + + def test_filter_error_corrections(self): + """Test that _filter_error_corrections correctly filters for error correction versions.""" + # Create DocxFileFinder instance + docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + + # Test case 1: Group with error corrections + group_with_corrections = [ + # Error correction versions (should be kept) + Path("AHB_CONTRL_2.4a_20250606_99991231_20241213_xoxx_11128.docx"), + Path("MIG_ORDERS_1.4a_20250606_99991231_20241213_xoxx_11139.docx"), + # Regular versions (should be filtered out) + Path("AHB_CONTRL_2.4a_20250606_99991231_20250606_ooox_8928.docx"), + Path("MIG_ORDERS_1.4a_20250606_99991231_20250606_ooox_9744.docx"), + ] + + # Expected results for group with corrections + expected_corrections = [ + Path("AHB_CONTRL_2.4a_20250606_99991231_20241213_xoxx_11128.docx"), + Path("MIG_ORDERS_1.4a_20250606_99991231_20241213_xoxx_11139.docx"), + ] + + result = docx_file_finder._filter_error_corrections(group_with_corrections) # pylint: disable=protected-access + assert sorted(result) == sorted(expected_corrections) + + # Test case 2: Group without error corrections + group_without_corrections = [ + Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_oooo_8872.docx"), + Path("AHB_CONTRL_2.4a_20250606_99991231_20250606_ooox_8928.docx"), + ] + + # Should return original group if no error corrections exist + result = docx_file_finder._filter_error_corrections( # pylint: disable=protected-access + group_without_corrections + ) + assert sorted(result) == sorted(group_without_corrections) + + def test_filter_error_corrections_empty_group(self): + """Test that _filter_error_corrections handles empty groups correctly.""" + 
docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + + result = docx_file_finder._filter_error_corrections([]) # pylint: disable=protected-access + assert result == [] + + def test_filter_error_corrections_single_file(self): + """Test that _filter_error_corrections handles single file groups correctly.""" + docx_file_finder = DocxFileFinder( + path_to_edi_energy_mirror=Path("dummy"), format_version=EdifactFormatVersion.FV2504 + ) + + # Test with a single 'ooox' file (treated as a regular, non-error-correction version in the other tests) + single_correction = [Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_ooox_8871.docx")] + result = docx_file_finder._filter_error_corrections(single_correction) # pylint: disable=protected-access + assert result == single_correction + + # Test with single non-error correction file + single_regular = [Path("AHB_COMDIS_1.0f_20250606_99991231_20250606_oooo_8872.docx")] + result = docx_file_finder._filter_error_corrections(single_regular) # pylint: disable=protected-access + assert result == single_regular