
Commit e9b66d4

Memorize which File contains which Pruefi Tables (#209)
* 🚧 Update collect_pruefis.py to also collect filenames
* ✨ get_all_pruefis only looks through one file per pruefi: collect_pruefis.py collects the name of the file in which each pruefi is found, and get_all_pruefis then only looks through that file, which speeds up the search for all pruefis.
* Use "None" if no file path to a pruefi is given
* Replace Union[str, None] with str | None
* 🐛 Fix a bug when the dict has wildcards as keys; changed the tests to detect such bugs in the future
* 📝 Update Readme and comments
* ✅ Add a test for pruefis: None as input
* Update method naming and comments
* Only send pruefis to validation, reattach the filename afterwards
* Rename content.pruefidentifikatoren in the toml to pruefidentifikatoren
1 parent 642a2bc commit e9b66d4
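
The gist of the change, as a minimal hedged sketch (only the mapping idea comes from this commit; the file names below are made up for illustration): instead of scanning every .docx file under the input path for every pruefi, the known pruefi-to-file mapping narrows the lookup to the one file that is already known to contain the matching table.

```python
from pathlib import Path

# Illustrative only: keys are pruefis, values are the docx file name (or None if unknown).
basic_input_path = Path("../edi_energy_mirror/edi_energy_de/current")
pruefi_to_file_mapping: dict[str, str | None] = {
    "13002": "UTILMDAHBStrom-informatorischeLesefassung.docx",  # hypothetical file name
    "17201": None,  # location unknown: fall back to scanning all docx files
}

for pruefi, filename in pruefi_to_file_mapping.items():
    input_path = basic_input_path / filename if filename is not None else basic_input_path
    # process_pruefi(...) then opens only this single file when input_path ends with ".docx"
    print(pruefi, "->", input_path)
```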

File tree: 5 files changed (+584 −522 lines)


README.md

Lines changed: 3 additions & 3 deletions
@@ -96,21 +96,21 @@ The easiest way to be compliant with this naming schema is to clone our [edi_ene
 If you want to extract a specific prüfidentifikator, you can run the following command.
 
 ```bash
-kohlrahbi --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/ --pruefis 11039 --file-type xslx
+kohlrahbi --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/ --pruefis 13002 --file-type xlsx
 ```
 
 You can also provide multiple prüfidentifikatoren.
 
 ```bash
-kohlrahbi --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/ --pruefis 11039 --pruefis 11040 --pruefi 11041 --file-type csv
+kohlrahbi --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/ --pruefis 13002 --pruefis 13003 --pruefis 13005 --file-type csv
 ```
 
 ### Results
 There is a kohlrahbi based CI pipeline from the edi_energy_mirror mentioned above to the repository [machine-readable_anwendungshandbuecher](https://github.com/Hochfrequenz/machine-readable_anwendungshandbuecher) where you can find scraped AHBs as JSON, CSV or Excel files.
 
 ### Export ConditionKeys and ConditionTexts
 For example to export condition.json files to [edi_energy_ahb_conditions_and_packages](https://github.com/Hochfrequenz/edi_energy_ahb_conditions_and_packages). Works best if no flags for "Prüfindentifikatoren" (--pruefis). In this case all known "Prüfidentifikatoren" are scanned. Thus all related conditions are gathered.
 ```bash
-kohlrahbi --file-type conditions --input_path "Path\to\edi_energy_mirror\edi_energy_de\current" --output_path "Path\to\edi_energy_ahb_conditions_and_packages\aktuelleFV"
+kohlrahbi --file-type conditions --input_path ../edi_energy_mirror/edi_energy_de/current --output_path ./output/edi_energy_ahb_conditions_and_packages/aktuelleFV
 ```
 
 ## Workflow

src/kohlrahbi/__init__.py

Lines changed: 44 additions & 25 deletions
@@ -29,8 +29,8 @@
 # pylint:disable=anomalous-backslash-in-string
 def get_valid_pruefis(list_of_pruefis: list[str], all_known_pruefis: Optional[list[str]] = None) -> list[str]:
     """
-    This function returns a new list with only those pruefis which match the pruefi_pattern r"^[1-9]\d{4}$".
-    It also supports unix wildcards like '*' and '?' iff a list of known pruefis is given.
+    This function returns a list with only those pruefis which match the pruefi_pattern r"^[1-9]\d{4}$".
+    It also supports unix wildcards like '*' and '?' if a list of known pruefis is given.
     E.g. '11*' for all pruefis starting with '11' or '*01' for all pruefis ending with '01'.
     """
     result: set[str] = set()
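
The wildcard handling mentioned in this docstring is not part of the hunk above; as a rough idea, expansion against a list of known pruefis could be done with the standard library's fnmatch (a sketch under that assumption, not necessarily the actual implementation):

```python
import re
from fnmatch import fnmatch

_PRUEFI_PATTERN = re.compile(r"^[1-9]\d{4}$")


def expand_pruefis(requested: list[str], known: list[str]) -> list[str]:
    """Expand entries like '11*' against the known pruefis and keep only valid pruefis."""
    result: set[str] = set()
    for entry in requested:
        if "*" in entry or "?" in entry:
            # wildcard: collect every known pruefi matching the pattern
            result.update(p for p in known if fnmatch(p, entry))
        elif _PRUEFI_PATTERN.fullmatch(entry):
            # plain entry: keep it only if it looks like a valid pruefi
            result.add(entry)
    return sorted(result)


# expand_pruefis(["11*"], ["11039", "11040", "13002"]) == ["11039", "11040"]
```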
@@ -83,7 +83,7 @@ def check_output_path(path: Path) -> None:
 
 def load_all_known_pruefis_from_file(
     path_to_all_known_pruefis: Path = Path(__file__).parent / Path("all_known_pruefis.toml"),
-) -> list[str]:
+) -> dict[str, str | None]:
     """
     Loads the file which contains all known Prüfidentifikatoren.
     The file may be manually updated with the script `collect_pruefis.py`.
@@ -93,17 +93,18 @@ def load_all_known_pruefis_from_file(
         state_of_kohlrahbi: dict[str, Any] = tomlkit.load(file)
 
     meta_data_section = state_of_kohlrahbi.get("meta_data")
-    content_section = state_of_kohlrahbi.get("content")
+    pruefi_to_file_mapping: dict[str, str | None] | None = state_of_kohlrahbi.get("pruefidentifikatoren", None)
 
     if meta_data_section is None:
         click.secho(f"There is no 'meta_data' section in the provided toml file: {path_to_all_known_pruefis}", fg="red")
         raise click.Abort()
-    if content_section is None:
-        click.secho(f"There is no 'content' section in the toml file: {path_to_all_known_pruefis}", fg="red")
+    if pruefi_to_file_mapping is None:
+        click.secho(
+            f"There is no 'pruefidentifikatoren' section in the toml file: {path_to_all_known_pruefis}", fg="red"
+        )
         raise click.Abort()
 
-    pruefis: list[str] = content_section.get("pruefidentifikatoren")
-    return pruefis
+    return pruefi_to_file_mapping
 
 
 def create_sheet_name(filename: str) -> str:
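
For orientation, the data this loader now works with might look roughly as follows. The top-level `pruefidentifikatoren` table name and the `str | None` return shape come from this commit; the concrete keys, file names, and the `meta_data` contents below are illustrative assumptions (per the commit message, "None" is used when no file path for a pruefi is known).

```python
# Assumed excerpt of all_known_pruefis.toml after this change (values are made up):
#
#   [meta_data]
#   updated_on = 2024-01-01    # hypothetical key
#
#   [pruefidentifikatoren]
#   "13002" = "UTILMDAHBStrom-informatorischeLesefassung.docx"   # hypothetical file name
#
# load_all_known_pruefis_from_file() would then return a mapping such as:
pruefi_to_file_mapping: dict[str, str | None] = {
    "13002": "UTILMDAHBStrom-informatorischeLesefassung.docx",
    "17201": None,  # no file known for this pruefi: search all docx files as before
}
```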
@@ -206,14 +207,15 @@ def scrape_change_histories(input_path: Path, output_path: Path) -> None:
     save_change_histories_to_excel(change_history_collection, output_path)
 
 
-def load_pruefis_if_empty(pruefis: list[str]) -> list[str]:
+def load_pruefis_if_empty(pruefi_to_file_mapping: dict[str, str | None]) -> dict[str, str | None]:
     """
-    If the user did not provide any pruefis we load all known pruefis from the toml file.
+    If the user did not provide any pruefis we load all known pruefis
+    and the paths to the file containing them from the toml file.
     """
-    if not pruefis:
+    if not pruefi_to_file_mapping:
         click.secho("☝️ No pruefis were given. I will parse all known pruefis.", fg="yellow")
         return load_all_known_pruefis_from_file()
-    return pruefis
+    return pruefi_to_file_mapping
 
 
 def validate_file_type(file_type: str):
@@ -228,7 +230,7 @@ def validate_file_type(file_type: str):
 
 def validate_pruefis(pruefis: list[str]) -> list[str]:
     """
-    Validate the pruefis parameter.
+    Validate the pruefi_to_file_mapping parameter.
     """
     valid_pruefis = get_valid_pruefis(pruefis)
     if not valid_pruefis:
@@ -248,9 +250,14 @@ def process_pruefi(
 ):
     """
     Process one pruefi.
+    If the input path ends with .docx, we assume that the file containing the pruefi is given.
+    Therefore we only access that file.
     """
-    ahb_file_finder = DocxFileFinder.from_input_path(input_path=input_path)
-    ahb_file_paths = ahb_file_finder.get_docx_files_which_may_contain_searched_pruefi(pruefi)
+    if not input_path.suffix == ".docx":
+        ahb_file_finder = DocxFileFinder.from_input_path(input_path=input_path)
+        ahb_file_paths = ahb_file_finder.get_docx_files_which_may_contain_searched_pruefi(pruefi)
+    else:
+        ahb_file_paths = [input_path]
 
     if not ahb_file_paths:
         logger.warning("No docx file was found for pruefi '%s'", pruefi)
@@ -259,11 +266,11 @@ def process_pruefi(
     for ahb_file_path in ahb_file_paths:
         doc = get_or_cache_document(ahb_file_path, path_to_document_mapping)
         if not doc:
-            continue
+            return
 
         ahb_table = get_ahb_table(document=doc, pruefi=pruefi)
         if not ahb_table:
-            continue
+            return
 
         process_ahb_table(ahb_table, pruefi, output_path, file_type, collected_conditions)
 
@@ -306,21 +313,31 @@ def process_ahb_table(
 
 
 def scrape_pruefis(
-    pruefis: list[str], input_path: Path, output_path: Path, file_type: Literal["flatahb", "csv", "xlsx", "conditions"]
+    pruefi_to_file_mapping: dict[str, str | None],
+    basic_input_path: Path,
+    output_path: Path,
+    file_type: Literal["flatahb", "csv", "xlsx", "conditions"],
 ) -> None:
     """
-    starts the scraping process for provided pruefis
+    starts the scraping process for provided pruefi_to_file_mappings
     """
-    pruefis = load_pruefis_if_empty(pruefis)
+    pruefi_to_file_mapping = load_pruefis_if_empty(pruefi_to_file_mapping)
     validate_file_type(file_type)
 
-    valid_pruefis = validate_pruefis(pruefis)
+    valid_pruefis = validate_pruefis(list(pruefi_to_file_mapping.keys()))
+    valid_pruefi_to_file_mappings: dict[str, str | None] = {}
+    for pruefi in valid_pruefis:
+        valid_pruefi_to_file_mappings.update({pruefi: pruefi_to_file_mapping.get(pruefi, None)})
     path_to_document_mapping: dict[Path, docx.Document] = {}
     collected_conditions: Optional[dict[EdifactFormat, dict[str, str]]] = {} if "conditions" in file_type else None
 
-    for pruefi in valid_pruefis:
+    for pruefi, filename in valid_pruefi_to_file_mappings.items():
         try:
             logger.info("start looking for pruefi '%s'", pruefi)
+            input_path = basic_input_path  # To prevent multiple adding of filenames
+            # that would happen if filenames are added but never removed
+            if filename is not None:
+                input_path = basic_input_path / Path(filename)
             process_pruefi(pruefi, input_path, output_path, file_type, path_to_document_mapping, collected_conditions)
         # sorry for the pokemon catch
         except Exception as e:  # pylint: disable=broad-except
@@ -394,12 +411,14 @@ def main(
     else:
         output_path.mkdir(parents=True)
         click.secho(f"I created a new directory at {output_path}", fg="yellow")
-
+    pruefi_to_file_mapping: dict[str, str | None] = {
+        key: None for key in pruefis
+    }  # A mapping of a pruefi (key) to the name (+ path) of the file containing the prufi
     match flavour:
         case "pruefi":
             scrape_pruefis(
-                pruefis=pruefis,
-                input_path=input_path,
+                pruefi_to_file_mapping=pruefi_to_file_mapping,
+                basic_input_path=input_path,
                 output_path=output_path,
                 file_type=file_type,
             )
