2929# pylint:disable=anomalous-backslash-in-string
3030def get_valid_pruefis (list_of_pruefis : list [str ], all_known_pruefis : Optional [list [str ]] = None ) -> list [str ]:
3131 """
32- This function returns a new list with only those pruefis which match the pruefi_pattern r"^[1-9]\d{4}$".
33- It also supports unix wildcards like '*' and '?' iff a list of known pruefis is given.
32+ This function returns a list with only those pruefis which match the pruefi_pattern r"^[1-9]\d{4}$".
33+ It also supports unix wildcards like '*' and '?' if a list of known pruefis is given.
3434 E.g. '11*' for all pruefis starting with '11' or '*01' for all pruefis ending with '01'.
3535 """
3636 result : set [str ] = set ()
@@ -83,7 +83,7 @@ def check_output_path(path: Path) -> None:
8383
8484def load_all_known_pruefis_from_file (
8585 path_to_all_known_pruefis : Path = Path (__file__ ).parent / Path ("all_known_pruefis.toml" ),
86- ) -> list [str ]:
86+ ) -> dict [str , str | None ]:
8787 """
8888 Loads the file which contains all known Prüfidentifikatoren.
8989 The file may be manually updated with the script `collect_pruefis.py`.
@@ -93,17 +93,18 @@ def load_all_known_pruefis_from_file(
9393 state_of_kohlrahbi : dict [str , Any ] = tomlkit .load (file )
9494
9595 meta_data_section = state_of_kohlrahbi .get ("meta_data" )
96- content_section = state_of_kohlrahbi .get ("content" )
96+ pruefi_to_file_mapping : dict [ str , str | None ] | None = state_of_kohlrahbi .get ("pruefidentifikatoren" , None )
9797
9898 if meta_data_section is None :
9999 click .secho (f"There is no 'meta_data' section in the provided toml file: { path_to_all_known_pruefis } " , fg = "red" )
100100 raise click .Abort ()
101- if content_section is None :
102- click .secho (f"There is no 'content' section in the toml file: { path_to_all_known_pruefis } " , fg = "red" )
101+ if pruefi_to_file_mapping is None :
102+ click .secho (
103+ f"There is no 'pruefidentifikatoren' section in the toml file: { path_to_all_known_pruefis } " , fg = "red"
104+ )
103105 raise click .Abort ()
104106
105- pruefis : list [str ] = content_section .get ("pruefidentifikatoren" )
106- return pruefis
107+ return pruefi_to_file_mapping
107108
108109
109110def create_sheet_name (filename : str ) -> str :
@@ -206,14 +207,15 @@ def scrape_change_histories(input_path: Path, output_path: Path) -> None:
206207 save_change_histories_to_excel (change_history_collection , output_path )
207208
208209
209- def load_pruefis_if_empty (pruefis : list [str ]) -> list [str ]:
210+ def load_pruefis_if_empty (pruefi_to_file_mapping : dict [str , str | None ]) -> dict [str , str | None ]:
210211 """
211- If the user did not provide any pruefis we load all known pruefis from the toml file.
212+ If the user did not provide any pruefis we load all known pruefis
213+ and the paths to the file containing them from the toml file.
212214 """
213- if not pruefis :
215+ if not pruefi_to_file_mapping :
214216 click .secho ("☝️ No pruefis were given. I will parse all known pruefis." , fg = "yellow" )
215217 return load_all_known_pruefis_from_file ()
216- return pruefis
218+ return pruefi_to_file_mapping
217219
218220
219221def validate_file_type (file_type : str ):
@@ -228,7 +230,7 @@ def validate_file_type(file_type: str):
228230
229231def validate_pruefis (pruefis : list [str ]) -> list [str ]:
230232 """
231- Validate the pruefis parameter.
233+ Validate the given list of pruefis.
232234 """
233235 valid_pruefis = get_valid_pruefis (pruefis )
234236 if not valid_pruefis :
@@ -248,9 +250,14 @@ def process_pruefi(
248250):
249251 """
250252 Process one pruefi.
253+ If the input path ends with .docx, we assume that the file containing the pruefi is given.
254+ Therefore we only access that file.
251255 """
252- ahb_file_finder = DocxFileFinder .from_input_path (input_path = input_path )
253- ahb_file_paths = ahb_file_finder .get_docx_files_which_may_contain_searched_pruefi (pruefi )
256+ if not input_path .suffix == ".docx" :
257+ ahb_file_finder = DocxFileFinder .from_input_path (input_path = input_path )
258+ ahb_file_paths = ahb_file_finder .get_docx_files_which_may_contain_searched_pruefi (pruefi )
259+ else :
260+ ahb_file_paths = [input_path ]
254261
255262 if not ahb_file_paths :
256263 logger .warning ("No docx file was found for pruefi '%s'" , pruefi )
@@ -259,11 +266,11 @@ def process_pruefi(
259266 for ahb_file_path in ahb_file_paths :
260267 doc = get_or_cache_document (ahb_file_path , path_to_document_mapping )
261268 if not doc :
262- continue
269+ return
263270
264271 ahb_table = get_ahb_table (document = doc , pruefi = pruefi )
265272 if not ahb_table :
266- continue
273+ return
267274
268275 process_ahb_table (ahb_table , pruefi , output_path , file_type , collected_conditions )
269276
@@ -306,21 +313,31 @@ def process_ahb_table(
306313
307314
308315def scrape_pruefis (
309- pruefis : list [str ], input_path : Path , output_path : Path , file_type : Literal ["flatahb" , "csv" , "xlsx" , "conditions" ]
316+ pruefi_to_file_mapping : dict [str , str | None ],
317+ basic_input_path : Path ,
318+ output_path : Path ,
319+ file_type : Literal ["flatahb" , "csv" , "xlsx" , "conditions" ],
310320) -> None :
311321 """
312- starts the scraping process for provided pruefis
322+ Starts the scraping process for the pruefis in the provided pruefi_to_file_mapping.
313323 """
314- pruefis = load_pruefis_if_empty (pruefis )
324+ pruefi_to_file_mapping = load_pruefis_if_empty (pruefi_to_file_mapping )
315325 validate_file_type (file_type )
316326
317- valid_pruefis = validate_pruefis (pruefis )
327+ valid_pruefis = validate_pruefis (list (pruefi_to_file_mapping .keys ()))
328+ valid_pruefi_to_file_mappings : dict [str , str | None ] = {}
329+ for pruefi in valid_pruefis :
330+ valid_pruefi_to_file_mappings .update ({pruefi : pruefi_to_file_mapping .get (pruefi , None )})
318331 path_to_document_mapping : dict [Path , docx .Document ] = {}
319332 collected_conditions : Optional [dict [EdifactFormat , dict [str , str ]]] = {} if "conditions" in file_type else None
320333
321- for pruefi in valid_pruefis :
334+ for pruefi , filename in valid_pruefi_to_file_mappings . items () :
322335 try :
323336 logger .info ("start looking for pruefi '%s'" , pruefi )
337+ input_path = basic_input_path # Start from the base path for every pruefi; otherwise the filenames
338+ # of earlier iterations would keep accumulating on the path.
339+ if filename is not None :
340+ input_path = basic_input_path / Path (filename )
324341 process_pruefi (pruefi , input_path , output_path , file_type , path_to_document_mapping , collected_conditions )
325342 # sorry for the pokemon catch
326343 except Exception as e : # pylint: disable=broad-except
@@ -394,12 +411,14 @@ def main(
394411 else :
395412 output_path .mkdir (parents = True )
396413 click .secho (f"I created a new directory at { output_path } " , fg = "yellow" )
397-
414+ pruefi_to_file_mapping : dict [str , str | None ] = {
415+ key : None for key in pruefis
416+ } # A mapping of a pruefi (key) to the name (+ path) of the file containing the pruefi
398417 match flavour :
399418 case "pruefi" :
400419 scrape_pruefis (
401- pruefis = pruefis ,
402- input_path = input_path ,
420+ pruefi_to_file_mapping = pruefi_to_file_mapping ,
421+ basic_input_path = input_path ,
403422 output_path = output_path ,
404423 file_type = file_type ,
405424 )
0 commit comments