Skip to content

Commit 6b65088

Browse files
OLILHR and hf-kklein authored
feat: facilitate comparison of multiple PIDs stored in separate xlsx tabs (#86)
* Facilitate comparison of different PIDs across multiple FVs
* Store multiple comparison in separate xlsx tabs
* Rename `main` command to `compare`
* Add PIDs to column headers
* Change list to List type

  Co-authored-by: konstantin <[email protected]>
* Outsource xlsx header formatting into separate helper functions
* Remove unused imports
* Cache PID file locations to scan through input directory only once
* Comment
* Use type `Path` instead of `str` for paths
* Refactor `_set_sheet_name` function to improve readability
* Add example worksheet/tab names as comment
* Use `strict: bool = True` to make sure there is no length mismatch

---------

Co-authored-by: konstantin <[email protected]>
1 parent a31f97b commit 6b65088

File tree

6 files changed

+358
-71
lines changed

6 files changed

+358
-71
lines changed

.github/workflows/cli_test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ jobs:
2222
- name: Try to install the script, then run the cli
2323
run: |
2424
pip install .
25-
ahlbatross -i data/machine-readable_anwendungshandbuecher -o data/output
25+
ahlbatross compare -i data/machine-readable_anwendungshandbuecher -o data/output
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""
2+
Interactive CLI PID comparison.
3+
Output = xlsx only.
4+
Multiple comparisons PID_A <-> PID_B, PID_A <-> PID_C, PID_A <-> PID_D, ... are merged in separate tabs.
5+
"""
6+
7+
import logging
8+
import sys
9+
from pathlib import Path
10+
11+
import typer
12+
from rich.console import Console
13+
from rich.prompt import Prompt
14+
15+
from ahlbatross.core.ahb_comparison import align_ahb_rows
16+
from ahlbatross.core.ahb_processing import _get_formatversion_dirs, _get_nachrichtenformat_dirs
17+
from ahlbatross.formats.csv import get_csv_files, load_csv_files
18+
from ahlbatross.formats.xlsx import export_to_xlsx_multicompare
19+
20+
logger = logging.getLogger(__name__)
21+
console = Console()
22+
23+
# Module-level cache of PID file locations, filled lazily by `find_pid`.
# Layout: formatversion -> {csv file stem -> (csv file path, name of its nachrichtenformat directory)}.
# NOTE(review): the outer key is the FV string only, not the scanned root directory.
_FORMATVERSION_PID_CACHE: dict[str, dict[str, tuple[Path, str]]] = {}
24+
25+
26+
def find_pid(root_dir: Path, formatversion: str, pruefid: str) -> tuple[Path, str] | None:
    """
    Look up the csv file that belongs to a PID inside a given FV.

    The first request for a FV scans the "csv" subdirectory of every nachrichtenformat
    directory below `root_dir / formatversion` and remembers each file by its stem in
    `_FORMATVERSION_PID_CACHE`; later requests for the same FV are answered from that cache.

    Returns a tuple (csv file path, nachrichtenformat directory name), or None if the FV
    directory does not exist or no file with the stem `pruefid` was found.

    NOTE(review): the cache is keyed by `formatversion` only, so a second call with a
    different `root_dir` but an already scanned FV reuses the earlier scan result.
    """
    pid_locations = _FORMATVERSION_PID_CACHE.get(formatversion)

    if pid_locations is None:
        fv_dir = root_dir / formatversion
        if not fv_dir.exists():
            # nothing is cached for a missing FV directory, so it is re-checked on the next call
            return None

        format_dirs = _get_nachrichtenformat_dirs(fv_dir)
        pid_locations = _FORMATVERSION_PID_CACHE[formatversion] = {}

        for format_dir in format_dirs:
            csv_dir = format_dir / "csv"
            if csv_dir.exists():
                for csv_file in get_csv_files(csv_dir):
                    pid_locations[csv_file.stem] = (csv_file, format_dir.name)

    return pid_locations.get(pruefid)
48+
49+
50+
def get_pids(root_dir: Path, formatversion: str) -> list[str]:
    """
    List every PID known for a FV, collected across all of its nachrichtenformat directories.

    The PIDs are the keys of the FV's entry in `_FORMATVERSION_PID_CACHE`; they are returned
    in sorted order and, being dictionary keys, each one appears only once.
    An empty list is returned if the FV has no cache entry after the scan.
    """
    if formatversion not in _FORMATVERSION_PID_CACHE:
        # The lookup result is discarded; the call is only made to trigger the scan that fills the cache.
        find_pid(root_dir, formatversion, "")

    known_pids = _FORMATVERSION_PID_CACHE.get(formatversion, {})
    return sorted(known_pids)
59+
60+
61+
# pylint:disable=too-many-locals, too-many-branches, too-many-statements
62+
def multicompare_command(
    input_dir: Path = typer.Option(..., "--input-dir", "-i", help="Directory containing AHB <PID>.json files."),
    output_dir: Path = typer.Option(
        ..., "--output-dir", "-o", help="Destination path to output directory containing merged xlsx files."
    ),
) -> None:
    """
    Interactive command to compare one reference PID against any number of other PIDs.

    The user first selects a FV and a PID (the reference). Afterwards further FV/PID pairs
    can be selected repeatedly; each pair is compared against the reference and collected.
    Pressing enter at the FV prompt finishes the selection. All collected comparisons are
    written to `<output_dir>/<reference PID>_comparisons.xlsx` via `export_to_xlsx_multicompare`.

    Exits with status 1 if the input directory is missing, no FVs are found, the reference
    FV has no PIDs, the reference PID file cannot be located, no comparison was collected,
    or an OSError/ValueError/TypeError occurs.
    """
    try:
        if not input_dir.exists():
            logger.error("❌ Input directory does not exist: %s", input_dir.absolute())
            sys.exit(1)

        formatversions = _get_formatversion_dirs(input_dir)
        if not formatversions:
            logger.error("❌ No format versions found in input directory")
            sys.exit(1)

        # The set of FVs does not change while the command runs, so render it only once.
        valid_fvs = [str(fv) for fv in formatversions]
        formatversions_list = ", ".join(valid_fvs)

        # show available FVs
        console.print(f"\nAVAILABLE FVs: {formatversions_list}")

        # get first FV
        while True:
            first_fv = Prompt.ask("\nSELECT FV")
            if first_fv in valid_fvs:
                break
            console.print("❌ Invalid FV.")

        # get first PID
        first_available_pids = get_pids(input_dir, first_fv)
        if not first_available_pids:
            logger.error("❌ No PIDs found in format version %s", first_fv)
            sys.exit(1)

        # show available PIDs
        first_pids_list = ", ".join(first_available_pids)
        console.print(f"\nAVAILABLE PIDs: {first_pids_list}")

        while True:
            first_pruefid = Prompt.ask("\nSELECT PID #1")
            if first_pruefid in first_available_pids:
                break
            console.print("❌ Invalid PID.")

        first_file = find_pid(input_dir, first_fv, first_pruefid)
        if not first_file:
            logger.error("❌ Could not find PID file for %s in %s", first_pruefid, first_fv)
            sys.exit(1)

        first_file_path, _ = first_file

        comparison_groups = []
        comparison_names = []

        comparison_number = 2
        while True:
            # show available FVs
            console.print(f"\nAVAILABLE FVs (🏁 PRESS ENTER TO FINISH): {formatversions_list}")

            next_fv = Prompt.ask(f"\nSELECT FV #{comparison_number}", default="")
            if not next_fv:
                # hitting enter finishes the selection.
                break

            if next_fv not in valid_fvs:
                console.print("❌ Invalid FV.")
                continue

            next_available_pids = get_pids(input_dir, next_fv)
            if not next_available_pids:
                logger.error("❌ No PIDs found for format version %s", next_fv)
                continue

            if next_fv == first_fv and next_available_pids == [first_pruefid]:
                # The only PID of this FV is the reference itself: without this guard the
                # PID prompt below could never be answered validly and would loop forever.
                console.print("❌ No other PID available in this format version.")
                continue

            # show available PIDs
            next_pids_list = ", ".join(next_available_pids)
            console.print(f"\nAVAILABLE PIDs (FV{next_fv}): {next_pids_list}")

            while True:
                next_pruefid = Prompt.ask(f"\nSELECT PID #{comparison_number}")
                if next_pruefid == first_pruefid and next_fv == first_fv:
                    console.print("❌ Cannot compare identical PIDs of the same format version.")
                elif next_pruefid in next_available_pids:
                    break
                else:
                    console.print("❌ Invalid PID.")

            next_file = find_pid(input_dir, next_fv, next_pruefid)
            if not next_file:
                logger.error("❌ Could not find PID file for %s in %s", next_pruefid, next_fv)
                continue

            next_file_path, _ = next_file

            try:
                first_rows, next_rows = load_csv_files(first_file_path, next_file_path, first_fv, next_fv)
                comparisons = align_ahb_rows(first_rows, next_rows)
            except (OSError, ValueError) as e:
                # A failed comparison is reported and skipped; the user may continue with another pair.
                logger.error(
                    "❌ Error comparing %s/%s with %s/%s: %s", first_fv, first_pruefid, next_fv, next_pruefid, str(e)
                )
                continue

            comparison_groups.append(comparisons)
            comparison_names.append(f"{first_pruefid}_{next_pruefid}")
            comparison_number += 1

        if not comparison_groups:
            # Tell the user why the command ends with a failure status instead of exiting silently.
            logger.error("❌ No comparison was selected, nothing to export.")
            sys.exit(1)

        output_dir.mkdir(parents=True, exist_ok=True)

        xlsx_path = output_dir / f"{first_pruefid}_comparisons.xlsx"
        export_to_xlsx_multicompare(comparison_groups, comparison_names, xlsx_path)

        logger.info("✅ Successfully processed: %s", xlsx_path)

    except (OSError, ValueError, TypeError) as e:
        logger.exception("❌ Error: %s", str(e))
        sys.exit(1)

src/ahlbatross/core/ahb_processing.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def _get_nachrichtenformat_dirs(formatversion_dir: Path) -> list[Path]:
4848
Fetch all <nachrichtenformat> directories that contain actual csv files.
4949
"""
5050
if not formatversion_dir.exists():
51-
raise FileNotFoundError(f"❌ Formatversion directory not found: {formatversion_dir.absolute()}")
51+
raise FileNotFoundError(f"❌ FV directory not found: {formatversion_dir.absolute()}")
5252

5353
return [d for d in formatversion_dir.iterdir() if d.is_dir() and (d / "csv").exists() and (d / "csv").is_dir()]
5454

@@ -77,15 +77,15 @@ def get_formatversion_pairs(root_dir: Path) -> list[tuple[EdifactFormatVersion,
7777

7878
if is_subsequent_empty or is_previous_empty:
7979
logger.warning(
80-
"❗️Skipping empty consecutive formatversions: %s -> %s",
80+
"❗️Skipping empty consecutive FVs: %s -> %s",
8181
subsequent_formatversion,
8282
previous_formatversion,
8383
)
8484
continue
8585

8686
consecutive_formatversions.append((subsequent_formatversion, previous_formatversion))
8787

88-
logger.debug("Consecutive formatversions: %s", consecutive_formatversions)
88+
logger.debug("Consecutive FVs: %s", consecutive_formatversions)
8989
return consecutive_formatversions
9090

9191

@@ -100,7 +100,7 @@ def get_matching_csv_files(
100100
subsequent_formatversion_dir = root_dir / subsequent_formatversion
101101

102102
if not all(d.exists() for d in [previous_formatversion_dir, subsequent_formatversion_dir]):
103-
logger.error("❌ At least one formatversion directory does not exist.")
103+
logger.error("❌ At least one FV directory does not exist.")
104104
return []
105105

106106
matching_files = []
@@ -140,13 +140,11 @@ def process_ahb_files(input_dir: Path, output_dir: Path) -> None:
140140

141141
consecutive_formatversions = get_formatversion_pairs(input_dir)
142142
if not consecutive_formatversions:
143-
logger.warning("❗️ No valid consecutive formatversion subdirectories found to compare.")
143+
logger.warning("❗️ No valid consecutive FVs subdirectories found to compare.")
144144
return
145145

146146
for subsequent_formatversion, previous_formatversion in consecutive_formatversions:
147-
logger.info(
148-
"⌛ Processing consecutive formatversions: %s -> %s", subsequent_formatversion, previous_formatversion
149-
)
147+
logger.info("⌛ Processing consecutive FVs: %s -> %s", subsequent_formatversion, previous_formatversion)
150148

151149
try:
152150
matching_files = get_matching_csv_files(input_dir, previous_formatversion, subsequent_formatversion)
@@ -184,7 +182,7 @@ def process_ahb_files(input_dir: Path, output_dir: Path) -> None:
184182

185183
except (OSError, IOError, ValueError) as e:
186184
logger.error(
187-
"❌ Error processing formatversions %s -> %s: %s",
185+
"❌ Error processing FVs %s -> %s: %s",
188186
subsequent_formatversion,
189187
previous_formatversion,
190188
str(e),

0 commit comments

Comments
 (0)