Skip to content

Commit dadef21

Browse files
feat: scrape all valid documents per format version (#309)
* WIP * get valid format versions * WIP * Fixed tests and typings This reverts commit ef0b98f. * updated docstring for more clarity * Update src/edi_energy_scraper/utilities.py Co-authored-by: konstantin <[email protected]> * Update unittests/test_utilities.py Co-authored-by: konstantin <[email protected]> * fixed tests and modified docstrings --------- Co-authored-by: konstantin <[email protected]>
1 parent c93ddbc commit dadef21

File tree

5 files changed

+115
-15
lines changed

5 files changed

+115
-15
lines changed

src/edi_energy_scraper/scraper.py

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77

88
import aiohttp
99
from aiohttp import ClientTimeout
10-
from efoli import get_edifact_format_version
10+
from efoli import EdifactFormatVersion, get_edifact_format_version
1111
from more_itertools import chunked
1212

1313
from edi_energy_scraper.apidocument import Document, ResponseModel
14-
from edi_energy_scraper.utilities import _have_different_metadata
14+
from edi_energy_scraper.utilities import _get_valid_format_versions, _have_different_metadata
1515

1616
_logger = logging.getLogger(__name__)
1717

@@ -93,11 +93,47 @@ def _remove_old_files(self, documents: list[Document]) -> None:
9393
number_of_files_removed += 1
9494
_logger.info("%i old files have been removed", number_of_files_removed)
9595

96-
async def download_document(self, document: Document) -> Path:
96+
async def download_document_for_all_fv(self, document: Document) -> list[Path]:
    """
    Download one document into the directory of every format version it is valid for.

    The valid format versions are derived from the `validFrom` and `validTo` dates of the document provided
    by the API. The file is then downloaded once per format version via `download_document_per_fv`.

    Background: Sometimes a document stays valid although the format version changes.
    One example is the ORDRSP AHB which is unchanged between FV2310 and FV2504.
    Downstream tools and services rely on the file being present in a FV folder, so the same file is stored
    in multiple format version directories. This spares those tools from implementing a fallthrough logic.

    :param document: The document to be downloaded.
    :return: A list of paths where the downloaded files are stored, one per valid format version.
        All of them contain the same file, just at different locations.
    """
    # The downloads are awaited one after the other, in the order returned by _get_valid_format_versions.
    return [
        await self.download_document_per_fv(document, fv)
        for fv in _get_valid_format_versions(document.validFrom, document.validTo)
    ]
119+
120+
async def download_document_per_fv(
121+
self, document: Document, format_version: EdifactFormatVersion | None = None
122+
) -> Path:
123+
"""
124+
Downloads a specific document for a single format version.
125+
126+
This method downloads a document for a given format version (usually, we want to download a single file for all
127+
valid format versions, not only the format version determined by the 'validFrom' date).
128+
If no format version is provided, it defaults to the format version corresponding to the `validFrom` date of
129+
the document.
130+
131+
:param document: The document to be downloaded.
132+
:param format_version: The format version for which the document should be downloaded. Defaults to None.
133+
:return: The path where the downloaded file is stored.
99134
"""
100-
format_version = get_edifact_format_version(document.validFrom)
135+
if format_version is None:
136+
format_version = get_edifact_format_version(document.validFrom)
101137
fv_path = self._root_dir / Path(format_version)
102138
if not fv_path.exists():
103139
_logger.debug("Creating directory %s", fv_path.absolute())
@@ -127,18 +163,18 @@ async def download_document(self, document: Document) -> Path:
127163
async def mirror(self) -> None:
128164
"""
129165
Main method of the scraper.
130-
Downloads all the filefs and pages and stores them in the filesystem.
166+
Downloads all the files and pages and stores them in the filesystem.
131167
"""
132168
if not self._root_dir.exists() or not self._root_dir.is_dir():
133169
# we'll raise an error for the root dir, but create sub dirs on the fly
134170
raise ValueError(f"The path {self._root_dir} is either no directory or does not exist")
135-
download_tasks: list[Awaitable[Path]] = []
171+
download_tasks: list[Awaitable[list[Path]]] = []
136172
all_metadata = await self.get_documents_overview()
137173
for document in all_metadata:
138174
if not document.isFree:
139175
_logger.debug("Skipping %s because it's not free", document.title)
140176
continue
141-
download_tasks.append(self.download_document(document))
177+
download_tasks.append(self.download_document_for_all_fv(document))
142178
for download_chunk in chunked(download_tasks, 10):
143179
await asyncio.gather(*download_chunk)
144180
_logger.info("Downloaded %i files", len(download_tasks))
@@ -164,7 +200,7 @@ async def get_best_match(
164200
if not matching_document:
165201
_logger.debug("No document matches %s", matcher)
166202
return None
167-
downloaded_path = await self.download_document(matching_document)
203+
downloaded_path = await self.download_document_per_fv(matching_document)
168204
if path is None:
169205
return downloaded_path
170206
downloaded_path.rename(path)

src/edi_energy_scraper/utilities.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
helper functions
33
"""
44

5+
from datetime import date
56
from pathlib import Path
67

8+
from efoli import EdifactFormatVersion, get_edifact_format_version
79
from pypdf import PdfReader
810

911

@@ -28,4 +30,32 @@ def _have_different_metadata(path_new_file: Path, path_to_old_file: Path) -> boo
2830
return metadata_has_changed
2931

3032

31-
__all__ = ["_have_different_metadata"]
33+
def _get_valid_format_versions(valid_from: date, valid_to: date | None) -> list[EdifactFormatVersion]:
    """
    Return every EdifactFormatVersion that lies within the validity range of a document.

    Works around errors in the bdew-mako.de API where valid_to <= valid_from: such a range is treated as if
    it consisted of the valid_from date only.

    :param valid_from: Release date of the document.
    :param valid_to: Expiration date of the document. It is inclusive: the format version that efoli assigns
        to this date is still part of the result. Might be None if not provided by BDEW/API; the range then
        extends up to the greatest EdifactFormatVersion member.
    :return: list of EdifactFormatVersions from the one of valid_from up to (and including) the one of valid_to.
    """
    first_fv: EdifactFormatVersion = get_edifact_format_version(valid_from)
    last_fv: EdifactFormatVersion  # last format version for which the document is valid
    if valid_to is None:
        # no expiration date => valid until the latest format version
        last_fv = max(EdifactFormatVersion)
    elif valid_to > valid_from:
        # the generic case
        last_fv = get_edifact_format_version(valid_to)
    else:
        # expiration date is not after the release date; this is an error in the data,
        # so only the release date determines the format version
        last_fv = first_fv
    return [fv for fv in EdifactFormatVersion if first_fv <= fv <= last_fv]
59+
60+
61+
__all__ = ["_have_different_metadata", "_get_valid_format_versions"]

tox.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,4 +94,4 @@ deps =
9494
-r requirements.txt
9595
.[tests]
9696
setenv = PYTHONPATH = {toxinidir}/src
97-
commands = python -m pytest -m snapshot --basetemp={envtmpdir} {posargs} --snapshot-update
97+
commands = python -m pytest -m snapshot --basetemp={envtmpdir} {posargs} --snapshot-update

unittests/test_downloads.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import pytest
55
from aioresponses import aioresponses
6+
from efoli import EdifactFormatVersion
67
from more_itertools import last
78

89
from edi_energy_scraper import DocumentMetadata
@@ -62,7 +63,7 @@ async def test_download_file(tmp_path: Path) -> None:
6263
status=200,
6364
body=example_pdf.read(),
6465
)
65-
actual = await client.download_document(example_document)
66+
actual = await client.download_document_per_fv(example_document)
6667
assert actual.is_file()
6768
assert actual.suffix == ".pdf"
6869

@@ -152,7 +153,7 @@ async def get_fake_documents() -> list[Document]:
152153
Document.model_construct(fileId=789),
153154
]
154155

155-
async def download_fake_document(document: Document) -> Path:
156+
async def download_fake_document(document: Document, format_version: EdifactFormatVersion | None = None) -> Path:
156157
if document.fileId == 123:
157158
path123.touch()
158159
return path123
@@ -165,7 +166,7 @@ async def download_fake_document(document: Document) -> Path:
165166
raise NotImplementedError()
166167

167168
client.get_documents_overview = get_fake_documents # type:ignore[method-assign]
168-
client.download_document = download_fake_document # type:ignore[method-assign]
169+
client.download_document_per_fv = download_fake_document # type:ignore[method-assign]
169170
if with_own_path:
170171
own_path = tmp_path / "my_document"
171172
actual = await client.get_best_match(lambda ds: last(sorted(ds, key=lambda d: d.fileId)), own_path)

unittests/test_utilities.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1+
from datetime import date
12
from pathlib import Path
23

4+
import pytest
5+
from efoli import EdifactFormatVersion
6+
37
from edi_energy_scraper import DocumentMetadata
4-
from edi_energy_scraper.utilities import _have_different_metadata
8+
from edi_energy_scraper.utilities import _get_valid_format_versions, _have_different_metadata
59

610

711
def test_have_different_metadata() -> None:
@@ -15,3 +19,32 @@ def test_have_different_metadata() -> None:
1519
def test_extraction() -> None:
1620
structured_information = DocumentMetadata.from_filename("MIG_REQOTE_1.3_20250604_20230930_20230930_oxox_10071.docx")
1721
print(structured_information)
22+
23+
24+
@pytest.mark.parametrize(
    "valid_from, valid_to, expected_versions",
    [
        pytest.param(
            date(2023, 10, 30),
            None,
            [
                edifact_format_version
                for edifact_format_version in EdifactFormatVersion
                if edifact_format_version >= EdifactFormatVersion.FV2310
            ],
            id="valid_to is None",
        ),
        pytest.param(
            date(2021, 9, 30),
            date(2021, 9, 29),
            [EdifactFormatVersion.FV2104],
            id="valid_to before valid_from",
        ),
        pytest.param(
            date(2021, 9, 30),
            date(2021, 9, 30),
            [EdifactFormatVersion.FV2104],
            id="valid_to equals valid_from (2021-09-30)",
        ),
        pytest.param(
            date(2021, 9, 30),
            date(2021, 10, 1),
            [EdifactFormatVersion.FV2104, EdifactFormatVersion.FV2110],
            id="inclusive valid_to",
        ),
        pytest.param(
            date(2023, 9, 30),
            date(2024, 4, 1),
            [EdifactFormatVersion.FV2304, EdifactFormatVersion.FV2310],
            id="multiple format versions",
        ),
        pytest.param(
            date(2023, 4, 1),
            date(2023, 4, 1),
            [EdifactFormatVersion.FV2304],
            id="valid_to equals valid_from (2023-04-01)",
        ),
    ],
)
def test_get_formatversions(
    valid_from: date, valid_to: date | None, expected_versions: list[EdifactFormatVersion]
) -> None:
    """
    Tests the function _get_valid_format_versions.

    :param valid_from: release date passed to the function under test
    :param valid_to: expiration date passed to the function under test; None in the case without expiration date
    :param expected_versions: the exact list (including order) the function is expected to return
    """
    assert _get_valid_format_versions(valid_from, valid_to) == expected_versions

0 commit comments

Comments
 (0)