Skip to content

Commit d7eae84

Browse files
authored
Merge pull request #24 from santi921/fix/out-files-zip-overwrite
fix: preserve out_files.zip across restart / parse_only reruns
2 parents be7191e + 980547e commit d7eae84

4 files changed

Lines changed: 233 additions & 13 deletions

File tree

qtaim_gen/source/core/omol.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
pull_ecp_dict,
4848
overwrite_molden_w_ecp,
4949
check_spin,
50+
merge_zip_into,
5051
)
5152

5253

@@ -1063,15 +1064,13 @@ def clean_jobs(
10631064
os.remove(os.path.join(folder, file))
10641065
logger.info(f"Zipped and removed {file}")
10651066

1066-
if move_results:
1067-
results_folder = os.path.join(folder, "generator")
1068-
if not os.path.exists(results_folder):
1069-
os.mkdir(results_folder)
1070-
os.rename(
1071-
zip_file_out,
1072-
os.path.join(results_folder, "out_files.zip"),
1073-
)
1074-
logger.info(f"Moved zipped out files to results folder")
1067+
if move_results:
1068+
results_folder = os.path.join(folder, "generator")
1069+
merge_zip_into(
1070+
zip_file_out,
1071+
os.path.join(results_folder, "out_files.zip"),
1072+
logger=logger,
1073+
)
10751074

10761075

10771076
def setup_logger(folder: str, name: str = "gbw_analysis") -> logging.Logger:

qtaim_gen/source/core/workflow.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -416,11 +416,13 @@ def process_folder_alcf(
416416
logger.info(f"Zipped and removed {file}")
417417

418418
if move_results:
419+
from qtaim_gen.source.utils.io import merge_zip_into
419420
results_folder = os.path.join(folder, "generator")
420-
if not os.path.exists(results_folder):
421-
os.makedirs(results_folder)
422-
shutil.move(zip_file_out, os.path.join(results_folder, "out_files.zip"))
423-
logger.info(f"Moved out_files.zip to results folder {results_folder}")
421+
merge_zip_into(
422+
zip_file_out,
423+
os.path.join(results_folder, "out_files.zip"),
424+
logger=logger,
425+
)
424426

425427
except Exception as e:
426428
logger.info(f"Couldn't zip .out files in {folder}: {e}")

qtaim_gen/source/utils/io.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,93 @@ def check_ecp_for_folder(folder_outputs: str) -> int:
298298
return ECP_NO_ZIP
299299

300300

301+
def merge_zip_into(
    src_zip: str,
    dest_zip: str,
    logger: Optional[Any] = None,
) -> None:
    """Move src_zip to dest_zip, merging if dest already exists.

    On filename collision, keeps whichever entry has the larger uncompressed
    size (preserves the richest available output). On equal size, keeps the
    existing dest entry. The merge is written to a temp file next to dest and
    swapped in with ``os.replace``, so a failure mid-merge cannot corrupt
    dest. src_zip is removed on success.

    Args:
        src_zip: Path of the freshly produced zip archive.
        dest_zip: Path of the archive that should end up holding the union of
            both archives. Missing parent directories are created.
        logger: Optional logger-like object; ``info`` and ``warning`` are
            called on it when it is not None.

    Raises:
        zipfile.BadZipFile: If dest_zip exists but either archive cannot be
            read as a zip. dest_zip and src_zip are left untouched.
        OSError: If moving, writing or replacing a file fails.

    Notes:
        If src_zip is missing, or src_zip and dest_zip refer to the same
        file, the call is a no-op (a warning is logged).
    """
    import shutil as _shutil

    if not os.path.exists(src_zip):
        if logger is not None:
            logger.warning(f"merge_zip_into: source {src_zip} missing, nothing to do")
        return

    # Merging an archive into itself would end with os.remove(src_zip)
    # deleting the only copy, so treat it as "nothing to do".
    if os.path.exists(dest_zip) and os.path.samefile(src_zip, dest_zip):
        if logger is not None:
            logger.warning(
                f"merge_zip_into: source and dest are the same file "
                f"({dest_zip}), nothing to do"
            )
        return

    dest_dir = os.path.dirname(dest_zip)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    if not os.path.exists(dest_zip):
        _shutil.move(src_zip, dest_zip)
        if logger is not None:
            logger.info(f"Moved {src_zip} to {dest_zip}")
        return

    tmp_dest = dest_zip + ".merge.tmp"
    if os.path.exists(tmp_dest):
        # Best-effort removal of a leftover from an interrupted merge; opening
        # the temp file in "w" mode below truncates it anyway.
        try:
            os.remove(tmp_dest)
        except OSError:
            pass

    added: List[str] = []
    replaced: List[Tuple[str, int, int]] = []
    kept: List[str] = []

    try:
        with zipfile.ZipFile(dest_zip, "r") as dest_zf, \
                zipfile.ZipFile(src_zip, "r") as src_zf:
            dest_infos = {i.filename: i for i in dest_zf.infolist()}
            src_infos = {i.filename: i for i in src_zf.infolist()}
            with zipfile.ZipFile(tmp_dest, "w", zipfile.ZIP_DEFLATED) as out_zf:
                for name in sorted(set(dest_infos) | set(src_infos)):
                    d_info = dest_infos.get(name)
                    s_info = src_infos.get(name)
                    if d_info is None:
                        # Only in src: new entry.
                        out_zf.writestr(s_info, src_zf.read(name))
                        added.append(name)
                    elif s_info is not None and s_info.file_size > d_info.file_size:
                        # Collision and src is strictly larger: src wins.
                        out_zf.writestr(s_info, src_zf.read(name))
                        replaced.append((name, d_info.file_size, s_info.file_size))
                    else:
                        # Only in dest, or collision where dest is >= src.
                        out_zf.writestr(d_info, dest_zf.read(name))
                        kept.append(name)
        # Both input archives are closed here, so the swap also works on
        # platforms that refuse to replace a file that is still open.
        os.replace(tmp_dest, dest_zip)
    except Exception:
        # Never leave a partial temp archive behind; dest is still intact.
        if os.path.exists(tmp_dest):
            try:
                os.remove(tmp_dest)
            except OSError:
                pass
        raise

    try:
        os.remove(src_zip)
    except OSError as exc:
        # The merge itself succeeded; report the leftover instead of hiding it.
        if logger is not None:
            logger.warning(
                f"merge_zip_into: merged into {dest_zip} but could not "
                f"remove {src_zip}: {exc}"
            )

    if logger is not None:
        logger.info(
            f"Merged {src_zip} into {dest_zip}: "
            f"{len(added)} added, {len(replaced)} replaced by larger, "
            f"{len(kept)} kept from existing"
        )
        if replaced:
            logger.info(f"Replaced (name, dest_size, src_size): {replaced}")
386+
387+
301388
def pull_ecp_dict(orca_out: str) -> Dict[int, Dict[str, Union[str, float]]]:
302389
"""
303390
Method to pull the ecp dictionary from the orca output file.

tests/test_merge_zip_into.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
"""Tests for merge_zip_into: preserve richest out_files.zip across reruns."""
2+
3+
import os
4+
import tempfile
5+
import zipfile
6+
7+
import pytest
8+
9+
from qtaim_gen.source.utils.io import merge_zip_into
10+
11+
12+
def _make_zip(path: str, entries: dict) -> None:
    """Write a deflate-compressed zip at ``path`` from a name -> data mapping."""
    with zipfile.ZipFile(path, "w", zipfile.ZIP_DEFLATED) as archive:
        for member in entries:
            archive.writestr(member, entries[member])
16+
17+
18+
def _read_zip(path: str) -> dict:
    """Return every member of the zip at ``path`` as a name -> bytes mapping."""
    with zipfile.ZipFile(path, "r") as archive:
        return {member: archive.read(member) for member in archive.namelist()}
24+
25+
26+
def test_move_when_dest_missing():
    """With no dest archive (or parent dir), the source is simply moved."""
    with tempfile.TemporaryDirectory() as workdir:
        source = os.path.join(workdir, "src.zip")
        target = os.path.join(workdir, "sub", "dest.zip")
        _make_zip(source, {"a.out": b"hello"})

        merge_zip_into(source, target)

        assert not os.path.exists(source)
        assert os.path.exists(target)
        expected = {"a.out": b"hello"}
        assert _read_zip(target) == expected
37+
38+
39+
def test_merge_adds_missing_entries():
    """Entries present only in the source are appended to the dest archive."""
    with tempfile.TemporaryDirectory() as workdir:
        source = os.path.join(workdir, "src.zip")
        target = os.path.join(workdir, "dest.zip")
        _make_zip(target, {"a.out": b"AAA"})
        _make_zip(source, {"b.out": b"BBB", "c.out": b"CCC"})

        merge_zip_into(source, target)

        assert not os.path.exists(source)
        expected = {"a.out": b"AAA", "b.out": b"BBB", "c.out": b"CCC"}
        assert _read_zip(target) == expected
54+
55+
56+
def test_collision_prefers_larger():
    """On a name collision the entry with the larger payload survives."""
    with tempfile.TemporaryDirectory() as workdir:
        source = os.path.join(workdir, "src.zip")
        target = os.path.join(workdir, "dest.zip")
        existing = {"adch.out": b"short", "cm5.out": b"rich cm5 content"}
        incoming = {"adch.out": b"richer adch content here", "cm5.out": b"x"}
        _make_zip(target, existing)
        _make_zip(source, incoming)

        merge_zip_into(source, target)

        result = _read_zip(target)
        # Source wins for adch (larger), dest wins for cm5 (larger).
        assert result["adch.out"] == b"richer adch content here"
        assert result["cm5.out"] == b"rich cm5 content"
68+
69+
70+
def test_collision_equal_size_keeps_existing():
    """A same-size collision is resolved in favour of the existing dest entry."""
    with tempfile.TemporaryDirectory() as workdir:
        source = os.path.join(workdir, "src.zip")
        target = os.path.join(workdir, "dest.zip")
        _make_zip(target, {"a.out": b"XXXXX"})
        _make_zip(source, {"a.out": b"YYYYY"})

        merge_zip_into(source, target)

        expected = {"a.out": b"XXXXX"}
        assert _read_zip(target) == expected
80+
81+
82+
def test_missing_src_is_noop():
    """A non-existent source leaves the dest archive unchanged."""
    with tempfile.TemporaryDirectory() as workdir:
        target = os.path.join(workdir, "dest.zip")
        absent_source = os.path.join(workdir, "missing.zip")
        _make_zip(target, {"a.out": b"AAA"})

        merge_zip_into(absent_source, target)

        expected = {"a.out": b"AAA"}
        assert _read_zip(target) == expected
91+
92+
93+
def test_dest_corrupt_raises_and_preserves_dest():
    """A corrupt dest raises BadZipFile and leaves src, dest and no temp file."""
    with tempfile.TemporaryDirectory() as workdir:
        source = os.path.join(workdir, "src.zip")
        target = os.path.join(workdir, "dest.zip")
        _make_zip(source, {"a.out": b"AAA"})
        with open(target, "wb") as handle:
            handle.write(b"not a zip file")

        with pytest.raises(zipfile.BadZipFile):
            merge_zip_into(source, target)

        # Nothing may have been consumed, rewritten or left behind.
        assert os.path.exists(source)
        with open(target, "rb") as handle:
            assert handle.read() == b"not a zip file"
        assert not os.path.exists(target + ".merge.tmp")
108+
109+
110+
def test_simulated_rerun_preserves_original_entries():
    """A partial rerun must not drop or shrink entries from the first run.

    Simulates a first full run producing many .out files, followed by a
    parse_only rerun producing only a subset; the merged zip should retain
    everything from the first run.
    """
    with tempfile.TemporaryDirectory() as workdir:
        target = os.path.join(workdir, "out_files.zip")
        first_run = {
            "adch.out": b"first adch full output",
            "cm5.out": b"first cm5 full output",
            "hirshfeld.out": b"first hirshfeld full output",
            "fuzzy_full.out": b"first fuzzy full output",
        }
        _make_zip(target, first_run)

        source = os.path.join(workdir, "out_files.zip.new")
        _make_zip(source, {"adch.out": b"short rerun"})

        merge_zip_into(source, target)

        result = _read_zip(target)
        assert set(result) == set(first_run)
        for member, payload in first_run.items():
            assert result[member] == payload

0 commit comments

Comments
 (0)