feat: scrape all valid documents per format version (#309)

DeltaDaniel · hf-kklein · web-flow · commit dadef21c2fed · 2025-03-10T09:26:49.000+01:00
* WIP * get valid format versions * WIP * Fixed tests and typings This reverts commit ef0b98f. * updated docstring for more clarity * Update src/edi_energy_scraper/utilities.py Co-authored-by: konstantin <konstantin.klein@hochfrequenz.de> * Update unittests/test_utilities.py Co-authored-by: konstantin <konstantin.klein@hochfrequenz.de> * fixed tests and modified docstrings --------- Co-authored-by: konstantin <konstantin.klein@hochfrequenz.de>
diff --git a/src/edi_energy_scraper/scraper.py b/src/edi_energy_scraper/scraper.py
@@ -7,11 +7,11 @@
 
 import aiohttp
 from aiohttp import ClientTimeout
-from efoli import get_edifact_format_version
+from efoli import EdifactFormatVersion, get_edifact_format_version
 from more_itertools import chunked
 
 from edi_energy_scraper.apidocument import Document, ResponseModel
-from edi_energy_scraper.utilities import _have_different_metadata
+from edi_energy_scraper.utilities import _get_valid_format_versions, _have_different_metadata
 
 _logger = logging.getLogger(__name__)
 
@@ -93,11 +93,47 @@ def _remove_old_files(self, documents: list[Document]) -> None:
                 number_of_files_removed += 1
         _logger.info("%i old files have been removed", number_of_files_removed)
 
-    async def download_document(self, document: Document) -> Path:
+    async def download_document_for_all_fv(self, document: Document) -> list[Path]:
         """
-        downloads the file related to the given document and returns its path
+        Downloads a specific document for all valid format versions.
+
+        This method takes a document provided by the API and determines all valid format versions based on the
+        `validFrom` and `validTo` dates. It then downloads the file for each valid format version individually.
+
+        Background: Sometimes valid documents stay valid, even if the format version changes.
+        One example is the ORDRSP AHB which is unchanged between FV2310 and FV2504.
+        We then store the same file in multiple format version directories, because downstream
+        tools and services rely on the file to be present in a FV folder.
+        This spares those tools from implementing their own fall-through logic.
+        All the paths returned contain the same file but at different locations.
+
+        :param document: The document to be downloaded.
+        :return: A list of paths where the downloaded files are stored, one path per valid format version.
+        """
+        format_versions = _get_valid_format_versions(document.validFrom, document.validTo)
+        file_paths = []
+
+        for format_version in format_versions:  # downloads are awaited one after another, not concurrently
+            file_paths.append(await self.download_document_per_fv(document, format_version))
+        return file_paths
+
+    async def download_document_per_fv(
+        self, document: Document, format_version: EdifactFormatVersion | None = None
+    ) -> Path:
+        """
+        Downloads a specific document for a single format version.
+
+        This method downloads a document for a given format version (usually, we want to download a single file for all
+        valid format versions, not only the format version determined by the 'validFrom' date).
+        If no format version is provided, it defaults to the format version corresponding to the `validFrom` date of
+        the document.
+
+        :param document: The document to be downloaded.
+        :param format_version: The format version for which the document should be downloaded. Defaults to None.
+        :return: The path where the downloaded file is stored.
         """
-        format_version = get_edifact_format_version(document.validFrom)
+        if format_version is None:
+            format_version = get_edifact_format_version(document.validFrom)
         fv_path = self._root_dir / Path(format_version)
         if not fv_path.exists():
             _logger.debug("Creating directory %s", fv_path.absolute())
@@ -127,18 +163,18 @@ async def download_document(self, document: Document) -> Path:
     async def mirror(self) -> None:
         """
         Main method of the scraper.
-        Downloads all the filefs and pages and stores them in the filesystem.
+        Downloads all the files and pages and stores them in the filesystem.
         """
         if not self._root_dir.exists() or not self._root_dir.is_dir():
             # we'll raise an error for the root dir, but create sub dirs on the fly
             raise ValueError(f"The path {self._root_dir} is either no directory or does not exist")
-        download_tasks: list[Awaitable[Path]] = []
+        download_tasks: list[Awaitable[list[Path]]] = []
         all_metadata = await self.get_documents_overview()
         for document in all_metadata:
             if not document.isFree:
                 _logger.debug("Skipping %s because it's not free", document.title)
                 continue
-            download_tasks.append(self.download_document(document))
+            download_tasks.append(self.download_document_for_all_fv(document))
         for download_chunk in chunked(download_tasks, 10):
             await asyncio.gather(*download_chunk)
         _logger.info("Downloaded %i files", len(download_tasks))
@@ -164,7 +200,7 @@ async def get_best_match(
         if not matching_document:
             _logger.debug("No document matches %s", matcher)
             return None
-        downloaded_path = await self.download_document(matching_document)
+        downloaded_path = await self.download_document_per_fv(matching_document)
         if path is None:
             return downloaded_path
         downloaded_path.rename(path)
diff --git a/src/edi_energy_scraper/utilities.py b/src/edi_energy_scraper/utilities.py
@@ -2,8 +2,10 @@
 helper functions
 """
 
+from datetime import date
 from pathlib import Path
 
+from efoli import EdifactFormatVersion, get_edifact_format_version
 from pypdf import PdfReader
 
 
@@ -28,4 +30,32 @@ def _have_different_metadata(path_new_file: Path, path_to_old_file: Path) -> boo
     return metadata_has_changed
 
 
-__all__ = ["_have_different_metadata"]
+def _get_valid_format_versions(valid_from: date, valid_to: date | None) -> list[EdifactFormatVersion]:
+    """
+    Get all EdifactFormatVersions from the one of valid_from up to and including the one of valid_to.
+    Tolerates valid_to <= valid_from (seen as erroneous data from the bdew-mako.de API): valid_from alone decides.
+    :param valid_from: Release date of document. Its format version is the lower bound of the result.
+    :param valid_to: Expiration date of document. Its format version (as determined by efoli) is the inclusive
+    upper bound. Might be None if not provided by BDEW/API; then the greatest format version is the upper bound.
+    :return: list of EdifactFormatVersions between both bounds (inclusive), in iteration order of the enum.
+    """
+    valid_from_fv: EdifactFormatVersion = get_edifact_format_version(valid_from)
+    valid_to_fv: EdifactFormatVersion  # last format version for which the document is valid
+    # there is no expiration date, so the greatest format version becomes the upper bound
+    if valid_to is None:
+        valid_to_fv = EdifactFormatVersion(max(EdifactFormatVersion))
+    # the expiration date is not after the release date (valid_to <= valid_from). This is treated as an error.
+    # We only take the release date to find the format version, so lower and upper bound coincide.
+    elif valid_to <= valid_from:
+        valid_to_fv = get_edifact_format_version(valid_from)
+    # The generic case: valid_to is after valid_from.
+    else:
+        valid_to_fv = get_edifact_format_version(valid_to)
+    return [
+        EdifactFormatVersion(format_version)
+        for format_version in EdifactFormatVersion
+        if valid_from_fv <= format_version <= valid_to_fv
+    ]
+
+
+__all__ = ["_have_different_metadata", "_get_valid_format_versions"]
diff --git a/tox.ini b/tox.ini
@@ -94,4 +94,4 @@ deps =
     -r requirements.txt
     .[tests]
 setenv = PYTHONPATH = {toxinidir}/src
-commands = python -m pytest -m snapshot --basetemp={envtmpdir} {posargs} --snapshot-update
+commands = python -m pytest -m snapshot --basetemp={envtmpdir} {posargs} --snapshot-update
diff --git a/unittests/test_downloads.py b/unittests/test_downloads.py
@@ -3,6 +3,7 @@
 
 import pytest
 from aioresponses import aioresponses
+from efoli import EdifactFormatVersion
 from more_itertools import last
 
 from edi_energy_scraper import DocumentMetadata
@@ -62,7 +63,7 @@ async def test_download_file(tmp_path: Path) -> None:
                 status=200,
                 body=example_pdf.read(),
             )
-            actual = await client.download_document(example_document)
+            actual = await client.download_document_per_fv(example_document)
     assert actual.is_file()
     assert actual.suffix == ".pdf"
 
@@ -152,7 +153,7 @@ async def get_fake_documents() -> list[Document]:
             Document.model_construct(fileId=789),
         ]
 
-    async def download_fake_document(document: Document) -> Path:
+    async def download_fake_document(document: Document, format_version: EdifactFormatVersion | None = None) -> Path:
         if document.fileId == 123:
             path123.touch()
             return path123
@@ -165,7 +166,7 @@ async def download_fake_document(document: Document) -> Path:
         raise NotImplementedError()
 
     client.get_documents_overview = get_fake_documents  # type:ignore[method-assign]
-    client.download_document = download_fake_document  # type:ignore[method-assign]
+    client.download_document_per_fv = download_fake_document  # type:ignore[method-assign]
     if with_own_path:
         own_path = tmp_path / "my_document"
         actual = await client.get_best_match(lambda ds: last(sorted(ds, key=lambda d: d.fileId)), own_path)
diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py
@@ -1,7 +1,11 @@
+from datetime import date
 from pathlib import Path
 
+import pytest
+from efoli import EdifactFormatVersion
+
 from edi_energy_scraper import DocumentMetadata
-from edi_energy_scraper.utilities import _have_different_metadata
+from edi_energy_scraper.utilities import _get_valid_format_versions, _have_different_metadata
 
 
 def test_have_different_metadata() -> None:
@@ -15,3 +19,32 @@ def test_have_different_metadata() -> None:
 def test_extraction() -> None:
     structured_information = DocumentMetadata.from_filename("MIG_REQOTE_1.3_20250604_20230930_20230930_oxox_10071.docx")
     print(structured_information)
+
+
+@pytest.mark.parametrize(
+    "valid_from, valid_to, expected_versions",
+    [
+        pytest.param(
+            date(2023, 10, 30),
+            None,  # no expiration date: every format version from FV2310 onwards is expected
+            [
+                edifact_format_version
+                for edifact_format_version in EdifactFormatVersion
+                if edifact_format_version >= EdifactFormatVersion.FV2310
+            ],
+        ),
+        pytest.param(date(2021, 9, 30), date(2021, 9, 29), [EdifactFormatVersion.FV2104]),  # valid_to < valid_from
+        pytest.param(date(2021, 9, 30), date(2021, 9, 30), [EdifactFormatVersion.FV2104]),  # valid_to == valid_from
+        pytest.param(
+            date(2021, 9, 30),
+            date(2021, 10, 1),
+            [EdifactFormatVersion.FV2104, EdifactFormatVersion.FV2110],
+            id="inlcusive valid_to",  # NOTE(review): typo in id ("inclusive")
+        ),
+        pytest.param(date(2023, 9, 30), date(2024, 4, 1), [EdifactFormatVersion.FV2304, EdifactFormatVersion.FV2310]),
+        pytest.param(date(2023, 4, 1), date(2023, 4, 1), [EdifactFormatVersion.FV2304]),  # valid_to == valid_from
+    ],
+)
+def test_get_formatversions(valid_from: date, valid_to: date, expected_versions: list[EdifactFormatVersion]) -> None:
+    """Checks _get_valid_format_versions for open-ended, inverted, equal and regular date ranges."""
+    assert _get_valid_format_versions(valid_from, valid_to) == expected_versions