-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstart.py
More file actions
444 lines (374 loc) · 16.4 KB
/
start.py
File metadata and controls
444 lines (374 loc) · 16.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
import os
import sys
import subprocess
import threading
import webbrowser
import tkinter as tk
from tkinter import messagebox, ttk, filedialog
from metadata_enrichment.enrichment_dialog import EnrichmentProgressDialog
from metadata_enrichment.statistics_dialog import show_statistics
def _default_enriched_path(xml_path: str) -> str:
    """Return the fallback output path ``<name>_enriched<ext>`` for *xml_path*.

    Only the final extension is touched, so directory names containing
    ".xml" are left alone and the result never equals the input path.
    A missing extension falls back to ".xml".
    """
    base, ext = os.path.splitext(xml_path)
    return f"{base}_enriched{ext or '.xml'}"


def _count_first_isbns(xml_path: str) -> int:
    """Count records whose first 020 datafield has a non-empty subfield ``a``.

    The whole document is parsed into memory, so the caller restricts this
    to small files.
    """
    import xml.etree.ElementTree as ET

    count = 0
    for record in ET.parse(xml_path).getroot().findall("record"):
        for datafield in record.findall("datafield"):
            if datafield.get("tag") != "020":
                continue
            for subfield in datafield.findall("subfield"):
                if subfield.get("code") == "a" and subfield.text and subfield.text.strip():
                    count += 1
                    break  # Only first ISBN per record
            break  # Only first datafield 020
    return count


def run_enrichment(root: tk.Tk) -> None:
    """Start the metadata enrichment with a GUI progress dialog.

    Asks the user for an XML file, counts its ISBNs up front when the file
    is smaller than 100 MB, then runs ``enrich_metadata.main`` in a daemon
    thread. Progress and results are handed back to the Tk event loop via
    ``root.after``.

    Args:
        root: The application's main window; used as dialog parent and as
            the scheduler for callbacks coming from the worker thread.
    """
    # File selection
    xml_path = filedialog.askopenfilename(
        parent=root,
        title="XML-Datei für Anreicherung auswählen",
        filetypes=[("XML-Dateien", "*.xml"), ("Alle Dateien", "*.*")],
        initialdir=os.path.dirname(__file__)
    )
    if not xml_path:
        return  # dialog was cancelled

    if not os.path.exists(xml_path):
        messagebox.showerror("Fehler", f"Datei nicht gefunden: {xml_path}", parent=root)
        return

    # None = unknown; passed as-is to the progress dialog in that case.
    isbn_count = None
    try:
        file_size_mb = os.path.getsize(xml_path) / (1024 * 1024)
        if file_size_mb < 100:
            # Only count upfront for small files (<100MB)
            isbn_count = _count_first_isbns(xml_path)
            if isbn_count == 0:
                messagebox.showwarning(
                    "Keine ISBNs gefunden",
                    "Die ausgewählte Datei enthält keine ISBNs (Feld 020$a).",
                    parent=root
                )
                return
        else:
            print(f"Large file ({file_size_mb:.0f} MB) - ISBN count will be determined during processing")
    except Exception as e:
        # Top-level GUI boundary: report any analysis failure to the user.
        messagebox.showerror("Fehler", f"Fehler beim Analysieren der Datei:\n{e}", parent=root)
        return

    # Cancellation flag shared between the dialog and the worker thread.
    cancelled = False

    def check_cancelled():
        return cancelled

    def on_cancel():
        nonlocal cancelled
        cancelled = True

    progress_dialog = EnrichmentProgressDialog(root, isbn_count, on_cancel=on_cancel)

    def run_enrichment_thread():
        result = None
        try:
            # Import enrich_metadata locally to avoid circular import
            import enrich_metadata

            def progress_callback(processed, successful, failed, retry_1, retry_2, retry_3,
                                  isbn_not_found, conflicts_skipped, total=None):
                if not cancelled:  # Only send updates if not cancelled
                    try:
                        # Values are bound as lambda defaults so each scheduled
                        # update keeps the numbers of its own call.
                        root.after(
                            0,
                            lambda p=processed, s=successful, f=failed, r1=retry_1, r2=retry_2,
                            r3=retry_3, i=isbn_not_found, c=conflicts_skipped, t=total:
                            progress_dialog.update_progress(p, s, f, r1, r2, r3, i, c, total=t)
                        )
                    except (tk.TclError, AttributeError):
                        pass  # best effort: the window may already be gone

            result = enrich_metadata.main(
                xml_path,
                progress_callback=progress_callback,
                check_cancelled=check_cancelled,
            )

            if result:
                if result.get('cancelled'):
                    def show_cancel():
                        try:
                            progress_dialog.mark_complete(
                                success=False,
                                message="Die Anreicherung wurde abgebrochen."
                            )
                        except (tk.TclError, AttributeError):
                            pass
                    root.after(0, show_cancel)
                else:
                    # Fallback name, used only when the result does not
                    # report where the output was written.
                    output_path = _default_enriched_path(xml_path)
                    if result.get('output_path'):
                        # Iterative parsing: File was already written
                        output_path = result.get('output_path')
                    elif result.get('tree'):
                        # Old format: Tree returned (backward compatibility)
                        result['tree'].write(output_path, encoding='utf-8', xml_declaration=True)

                    # Export JSON statistics; a failure here is only logged.
                    json_path = None
                    try:
                        json_path = enrich_metadata.export_stats_to_json(result, xml_path, output_path)
                    except Exception as e:
                        print(f"Warnung: JSON-Export fehlgeschlagen: {e}")

                    def show_success():
                        try:
                            progress_dialog.dialog.destroy()
                            show_statistics(root, result)
                            success_msg = f"Angereicherte Datei gespeichert:\n{output_path}"
                            if json_path:
                                success_msg += f"\n\nStatistiken exportiert:\n{json_path}"
                            messagebox.showinfo(
                                "Erfolg",
                                success_msg,
                                parent=root
                            )
                        except (tk.TclError, AttributeError) as e:
                            print(f"Fehler beim Anzeigen der Statistiken: {e}")
                    root.after(0, show_success)
            else:
                def show_error():
                    try:
                        progress_dialog.mark_complete(
                            success=False,
                            message="Ein Fehler ist aufgetreten. Bitte prüfen Sie die Log-Datei."
                        )
                    except (tk.TclError, AttributeError):
                        pass
                root.after(0, show_error)
        except Exception as e:
            # The message is captured in a plain local because `e` is
            # unbound once the except block ends.
            error_msg = f"Fehler bei der Anreicherung:\n{str(e)}"

            def show_exception():
                try:
                    if progress_dialog.dialog.winfo_exists():
                        messagebox.showerror("Fehler", error_msg, parent=progress_dialog.dialog)
                        progress_dialog.mark_complete(success=False, message=error_msg)
                except (tk.TclError, AttributeError):
                    pass
            root.after(0, show_exception)

    # Daemon thread: does not keep the process alive after the GUI exits.
    thread = threading.Thread(target=run_enrichment_thread, daemon=True)
    thread.start()
def show_enrichment_statistics(root: tk.Tk) -> None:
    """Generate the enrichment charts with R and open them in the browser.

    Locates the statistics JSON next to this file, finds a working
    ``Rscript`` executable, runs ``generate_enrichment_charts.R`` in a
    background thread behind a modal "please wait" dialog, then starts the
    statistics web server on port 8080 and opens the default browser.

    Args:
        root: The application's main window; used as dialog parent and as
            the scheduler for callbacks coming from worker threads.
    """
    base_dir = os.path.dirname(__file__)

    # Check if statistics files exist
    stats_file = os.path.join(base_dir, "voebvoll-20241027_enriched_stats.json")
    if not os.path.exists(stats_file):
        messagebox.showerror(
            "Fehler",
            "Statistik-Datei nicht gefunden.\nBitte führen Sie zuerst die Metadaten-Anreicherung durch.",
            parent=root
        )
        return

    # Create output directory for charts
    output_dir = os.path.join(base_dir, "enrichment_charts")
    os.makedirs(output_dir, exist_ok=True)

    r_script = os.path.join(base_dir, "generate_enrichment_charts.R")

    # Candidate Rscript executables, tried in order.
    rscript_paths = [
        "Rscript",  # In PATH
        r"C:\Program Files\R\R-4.5.1\bin\Rscript.exe",  # Standard Windows installation
        r"C:\Program Files\R\R-4.4.1\bin\Rscript.exe",
        r"C:\Program Files\R\R-4.3.1\bin\Rscript.exe",
    ]
    rscript_cmd = None
    for path in rscript_paths:
        try:
            probe = subprocess.run(
                [path, "--version"],
                capture_output=True,
                text=True,
                timeout=5
            )
        except (OSError, subprocess.TimeoutExpired):
            # OSError covers FileNotFoundError as well as other launch
            # failures (e.g. PermissionError); try the next candidate.
            continue
        if probe.returncode == 0:
            rscript_cmd = path
            break

    if not rscript_cmd:
        messagebox.showerror(
            "R nicht gefunden",
            "R ist nicht installiert oder nicht im PATH.\n\n"
            "Bitte installieren Sie R 4.5.1 oder höher:\n"
            "https://cran.r-project.org/",
            parent=root
        )
        return

    def generate_charts():
        """Run the R script; return an error message, or None on success."""
        try:
            result = subprocess.run(
                [rscript_cmd, r_script, stats_file, output_dir],
                capture_output=True,
                text=True,
                timeout=60
            )
            if result.returncode != 0:
                return f"R-Script Fehler:\n{result.stderr}"
            print(result.stdout)  # R script output
            return None
        except subprocess.TimeoutExpired:
            return "R-Script Timeout: Diagramm-Generierung dauert zu lange."
        except Exception as e:
            return f"Fehler beim Generieren der Diagramme:\n{e}"

    # Modal status dialog shown while the R script runs.
    status_dialog = tk.Toplevel(root)
    status_dialog.title("Generiere Diagramme...")
    status_dialog.geometry("400x150")
    status_dialog.transient(root)
    status_dialog.grab_set()

    status_label = ttk.Label(
        status_dialog,
        text="Generiere Diagramme mit R...\nBitte warten...",
        font=('Arial', 11)
    )
    status_label.pack(pady=30)

    status_progress = ttk.Progressbar(status_dialog, mode="indeterminate", length=300)
    status_progress.pack(pady=10)
    status_progress.start(10)

    def run_chart_generation():
        error_msg = generate_charts()

        # Close the status dialog first so that a following error box is
        # not scheduled ahead of the dialog's destruction.
        root.after(0, status_dialog.destroy)

        if error_msg is not None:
            root.after(0, lambda msg=error_msg: messagebox.showerror("Fehler", msg, parent=root))
            return

        # Imported lazily so a missing module is reported in a dialog
        # instead of preventing the application from starting.
        try:
            from metadata_enrichment.enrichment_stats_server import start_stats_server
        except ImportError:
            root.after(0, lambda: messagebox.showerror(
                "Fehler",
                "Webserver-Modul nicht gefunden.\nBitte prüfen Sie die Installation.",
                parent=root
            ))
            return

        # NOTE(review): an exception raised inside start_stats_server (for
        # example because the port is already in use after a second click)
        # happens in the daemon thread and is not caught by this try block;
        # confirm how the server module reports such failures.
        try:
            port = 8080
            server_thread = threading.Thread(
                target=start_stats_server,
                args=(stats_file, output_dir, port),
                daemon=True
            )
            server_thread.start()

            def open_browser():
                import time
                time.sleep(0.5)  # Wait until server is ready
                webbrowser.open(f"http://localhost:{port}")

            browser_thread = threading.Thread(target=open_browser, daemon=True)
            browser_thread.start()

            root.after(0, lambda: messagebox.showinfo(
                "Webserver gestartet",
                f"Die Anreicherungsstatistik wurde im Browser geöffnet.\n\n"
                f"URL: http://localhost:{port}\n\n"
                f"Der Webserver läuft im Hintergrund und wird beim Beenden der Anwendung automatisch geschlossen.",
                parent=root
            ))
        except Exception as e:
            error_msg = f"Fehler beim Starten des Webservers:\n{e}"
            root.after(0, lambda msg=error_msg: messagebox.showerror(
                "Fehler",
                msg,
                parent=root
            ))

    chart_thread = threading.Thread(target=run_chart_generation, daemon=True)
    chart_thread.start()
def run_script(
    root: tk.Tk,
    progress_label: ttk.Label,
    progress: ttk.Progressbar,
    script_name: str,
) -> None:
    """Run a helper script in a background thread with a progress indicator.

    The script is looked up relative to this file's directory and executed
    with the current Python interpreter. The progress label and bar are
    shown while it runs and hidden again once it finishes or fails.

    Args:
        root: Main window; used as dialog parent and to schedule UI updates
            from the worker thread.
        progress_label: Label that displays the running script's name.
        progress: Indeterminate progress bar animated during execution.
        script_name: Path of the script relative to this file's directory.
    """
    progress_label.config(text=f"{script_name} wird ausgeführt...")
    progress_label.pack(pady=(15, 5))
    progress.pack(pady=(0, 15))
    progress.start(10)

    def on_finish() -> None:
        progress.stop()
        progress_label.pack_forget()
        progress.pack_forget()

    def execute_script() -> None:
        script_path = os.path.join(os.path.dirname(__file__), script_name)
        if not os.path.exists(script_path):
            root.after(0, lambda: messagebox.showerror(
                "Fehler", f"Skript nicht gefunden: {script_name}", parent=root
            ))
            root.after(0, on_finish)
            return

        try:
            subprocess.run([sys.executable, script_path], check=True)
        except (subprocess.CalledProcessError, OSError) as exc:
            # OSError: the interpreter itself could not be launched. Without
            # this the failure would only surface as a thread traceback.
            error_msg = (
                f"Beim Ausführen von {script_name} ist ein Fehler aufgetreten:\n{exc}"
            )
            root.after(0, lambda msg=error_msg: messagebox.showerror("Fehler", msg, parent=root))
        finally:
            root.after(0, on_finish)

    threading.Thread(target=execute_script, daemon=True).start()
def main() -> None:
    """Build the launcher window and run the Tk event loop.

    The window holds one button per analysis script, a button for the
    metadata enrichment, an optional statistics button and a quit button.
    """
    root = tk.Tk()
    root.title("P2 - Datenqualitätsanalyse")
    root.resizable(False, False)

    frm = tk.Frame(root, padx=20, pady=20)
    frm.pack()

    # Created here but only packed by run_script while a script is running.
    progress_label = ttk.Label(frm)
    progress_bar = ttk.Progressbar(frm, mode="indeterminate", length=300)

    def add_button(caption, action):
        """Append one uniformly sized button to the frame."""
        ttk.Button(frm, text=caption, width=30, command=action).pack(pady=5)

    # (button caption, script path relative to this file)
    script_buttons = (
        ("Nach Besitz splitten", "data_processing/split_by_possession.py"),
        ("Nach Katalogisierungsquelle splitten", "data_processing/split_by_source.py"),
        ("Metadatenelemente auflisten", "data_analysis/analyze_elements_list.py"),
        ("Metadatenelemente (Menge) analysieren", "data_analysis/analyze_elements_quantity.py"),
        ("Primärschlüssel prüfen", "data_quality/check_primary_key.py"),
        ("ISBN prüfen", "data_quality/check_isbn.py"),
        ("Leader prüfen", "data_quality/check_leader.py"),
        ("Datum prüfen", "data_quality/check_date_field.py"),
        ("Doppelte ISBN/ISSN prüfen", "data_quality/check_duplicate_identifiers.py"),
        ("ISIL-Codes validieren", "data_quality/validate_isil_codes.py"),
        ("Besitznachweise zählen", "data_analysis/analyze_possession_counts.py"),
        ("Sprachcodes korrigieren+anreichern", "data_processing/enrich_language.py"),
    )
    for caption, script in script_buttons:
        # The script path is bound as a default so every button keeps its own.
        add_button(
            caption,
            lambda s=script: run_script(root, progress_label, progress_bar, s),
        )

    # The enrichment has its own dialog and is not run as a plain script.
    add_button("Metadaten anreichern", lambda: run_enrichment(root))

    # The statistics button is offered only when both enriched files exist.
    base_dir = os.path.dirname(__file__)
    enriched_files = [
        os.path.join(base_dir, file_name)
        for file_name in (
            "voebvoll-20241027_enriched.xml",
            "voebvoll-20241027_enriched_stats.json",
        )
    ]
    if all(os.path.exists(candidate) for candidate in enriched_files):
        add_button("Anreicherungsstatistik", lambda: show_enrichment_statistics(root))

    add_button("Beenden", root.destroy)

    root.mainloop()
# Launch the GUI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()