77
88import aiohttp
99from aiohttp import ClientTimeout
10- from efoli import get_edifact_format_version
10+ from efoli import EdifactFormatVersion , get_edifact_format_version
1111from more_itertools import chunked
1212
1313from edi_energy_scraper .apidocument import Document , ResponseModel
14- from edi_energy_scraper .utilities import _have_different_metadata
14+ from edi_energy_scraper .utilities import _get_valid_format_versions , _have_different_metadata
1515
1616_logger = logging .getLogger (__name__ )
1717
@@ -93,11 +93,47 @@ def _remove_old_files(self, documents: list[Document]) -> None:
9393 number_of_files_removed += 1
9494 _logger .info ("%i old files have been removed" , number_of_files_removed )
9595
96- async def download_document (self , document : Document ) -> Path :
async def download_document_for_all_fv(self, document: Document) -> list[Path]:
    """
    Download one document once for every format version in which it is valid.

    The valid format versions are derived from the document's `validFrom` and `validTo` dates.
    The file is then fetched separately for each of them via `download_document_per_fv`.

    Background: a document may remain valid across a format version change.
    One example is the ORDRSP AHB, which is unchanged between FV2310 and FV2504.
    The same file is therefore stored in several format version directories, because downstream
    tools and services rely on the file being present in a FV folder. This spares those tools
    from implementing their own fall-back lookup across format versions.
    All returned paths point to the same file content, just at different locations.

    :param document: The document to be downloaded.
    :return: A list of paths where the downloaded files are stored, one per valid format version.
    """
    valid_versions = _get_valid_format_versions(document.validFrom, document.validTo)
    # Each download is awaited before the next one starts, so the downloads for a
    # single document run sequentially and the result order follows `valid_versions`.
    return [await self.download_document_per_fv(document, version) for version in valid_versions]
119+
120+ async def download_document_per_fv (
121+ self , document : Document , format_version : EdifactFormatVersion | None = None
122+ ) -> Path :
123+ """
124+ Downloads a specific document for a single format version.
125+
126+ This method downloads a document for a given format version (usually, we want to download a single file for all
127+ valid format versions, not only the format version determined by the 'validFrom' date).
128+ If no format version is provided, it defaults to the format version corresponding to the `validFrom` date of
129+ the document.
130+
131+ :param document: The document to be downloaded.
132+ :param format_version: The format version for which the document should be downloaded. Defaults to None.
133+ :return: The path where the downloaded file is stored.
99134 """
100- format_version = get_edifact_format_version (document .validFrom )
135+ if format_version is None :
136+ format_version = get_edifact_format_version (document .validFrom )
101137 fv_path = self ._root_dir / Path (format_version )
102138 if not fv_path .exists ():
103139 _logger .debug ("Creating directory %s" , fv_path .absolute ())
@@ -127,18 +163,18 @@ async def download_document(self, document: Document) -> Path:
127163 async def mirror (self ) -> None :
128164 """
129165 Main method of the scraper.
130- Downloads all the filefs and pages and stores them in the filesystem.
166+ Downloads all the files and pages and stores them in the filesystem.
131167 """
132168 if not self ._root_dir .exists () or not self ._root_dir .is_dir ():
133169 # we'll raise an error for the root dir, but create sub dirs on the fly
134170 raise ValueError (f"The path { self ._root_dir } is either no directory or does not exist" )
135- download_tasks : list [Awaitable [Path ]] = []
171+ download_tasks : list [Awaitable [list [ Path ] ]] = []
136172 all_metadata = await self .get_documents_overview ()
137173 for document in all_metadata :
138174 if not document .isFree :
139175 _logger .debug ("Skipping %s because it's not free" , document .title )
140176 continue
141- download_tasks .append (self .download_document (document ))
177+ download_tasks .append (self .download_document_for_all_fv (document ))
142178 for download_chunk in chunked (download_tasks , 10 ):
143179 await asyncio .gather (* download_chunk )
144180 _logger .info ("Downloaded %i files" , len (download_tasks ))
@@ -164,7 +200,7 @@ async def get_best_match(
164200 if not matching_document :
165201 _logger .debug ("No document matches %s" , matcher )
166202 return None
167- downloaded_path = await self .download_document (matching_document )
203+ downloaded_path = await self .download_document_per_fv (matching_document )
168204 if path is None :
169205 return downloaded_path
170206 downloaded_path .rename (path )
0 commit comments