Skip to content

Commit a6be689

Browse files
committed
Add readme section for pause / resume
1 parent ee960b1 commit a6be689

File tree

6 files changed

+91
-79
lines changed

6 files changed

+91
-79
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,11 @@ not beneficial due to big difference of wavefunctions on neighboring images.
144144
On the other hand, `KubeVaspInteractive` has nearly linear scaling with worker pod numbers,
145145
if the workload per pod is balanced (see [examples/ex11_k8s_minimal.py](examples/ex11_k8s_minimal.py)).
146146
147+
By default, the MPI processes that run the VASP calculations occupy 100% CPU on the allocated cores / slots, even while waiting for input.
148+
This can lead to undesired effects when other CPU-intensive code runs between two `VaspInteractive` ionic steps.
149+
Starting from version `0.0.5`, `VaspInteractive` provides the `pause_calc` and `resume_calc` methods, so the user can temporarily free the CPU resources occupied by the VASP processes between two ionic steps.
150+
An example can be found in [ex13_pause_mpi.py](examples/ex13_pause_mpi.py). Note that this functionality is currently only tested with OpenMPI > 1.3.0. You may need to explicitly add the flag `--mca orte_forward_job_control 1` to your VASP command, or set it via the environment variable `export OMPI_MCA_orte_forward_job_control=1`.
151+
147152
148153
149154

examples/ex10_mlp_online.py

Lines changed: 58 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -49,34 +49,41 @@
4949
"txt": "-",
5050
}
5151

52-
def gen_online_calc(images, parent_calc,
53-
flare_config=default_flare_config,
54-
learner_params=default_learner_params):
55-
"""Use default parameters to generate online calc
56-
"""
52+
53+
def gen_online_calc(
54+
images,
55+
parent_calc,
56+
flare_config=default_flare_config,
57+
learner_params=default_learner_params,
58+
):
59+
"""Use default parameters to generate online calc"""
5760
from al_mlp.ml_potentials.flare_pp_calc import FlarePPCalc
5861
from al_mlp.online_learner.online_learner import OnlineLearner
62+
5963
ml_potential = FlarePPCalc(flare_config, images)
6064
calc = OnlineLearner(learner_params, images, ml_potential, parent_calc)
6165
return calc
6266

67+
6368
def gen_cluster(metal="Cu", number=10):
64-
"""Use random code in cluster_mlp to generate a cluster
65-
"""
69+
"""Use random code in cluster_mlp to generate a cluster"""
6670
from cluster_mlp.fillPool import fillPool
6771
from ase.data import atomic_numbers, covalent_radii
72+
6873
eleNames = [metal]
6974
eleNums = [number]
7075
eleRadii = [covalent_radii[atomic_numbers[ele]] for ele in eleNames]
7176
return fillPool(eleNames, eleNums, eleRadii, None)
7277

78+
7379
# curdir = Path("./").resolve()
7480
# example_dir = curdir / "mlp_benchmark"
7581

82+
7683
def parse_scf(output):
77-
"""Parse scf lines from stdout
78-
"""
84+
"""Parse scf lines from stdout"""
7985
import re
86+
8087
lines = output.split("\n")
8188
pat = r"DAV\:\s+([\d]+)"
8289
prev = -1
@@ -93,10 +100,15 @@ def parse_scf(output):
93100
elec_steps.append(nex)
94101
return elec_steps
95102

96-
def run_opt(initial_structure, vasp, optimizer=BFGS,
97-
use_al=True,
98-
store_wf=True,
99-
traj_name="oal_relax.traj"):
103+
104+
def run_opt(
105+
initial_structure,
106+
vasp,
107+
optimizer=BFGS,
108+
use_al=True,
109+
store_wf=True,
110+
traj_name="oal_relax.traj",
111+
):
100112
"""Choose a backend for vasp or VaspInteractive calculator"""
101113
os.system("rm -rf WAVECAR INCAR POTCAR POSCAR POTCAR")
102114
assert vasp.lower() in ("vasp", "vaspinteractive")
@@ -112,7 +124,7 @@ def run_opt(initial_structure, vasp, optimizer=BFGS,
112124
parent_calc.set(ibrion=-1, nsw=0)
113125
if not store_wf:
114126
parent_calc.set(istart=0, lwave=False)
115-
127+
116128
t_ = time.time()
117129
output = io.StringIO()
118130
# Use context lib to redirect output
@@ -123,16 +135,13 @@ def run_opt(initial_structure, vasp, optimizer=BFGS,
123135
else:
124136
real_calc = parent_calc
125137
images[0].calc = real_calc
126-
127-
128-
dyn = optimizer(images[0],
129-
trajectory=traj_name)
138+
139+
dyn = optimizer(images[0], trajectory=traj_name)
130140
if use_al:
131141
dyn.attach(replay_trajectory, 1, images[0].calc, dyn)
132142
dyn.run(fmax=0.05, steps=1000)
133143
t_elaps = time.time() - t_
134-
135-
144+
136145
output_string = output.getvalue()
137146
elec_steps = parse_scf(output_string)
138147
final_image = Trajectory(traj_name)[-1]
@@ -143,42 +152,48 @@ def run_opt(initial_structure, vasp, optimizer=BFGS,
143152
initial_structure = gen_cluster("Cu", 7)
144153
print("*" * 40)
145154
print("Running with BFGS + vasp -- no cache")
146-
t, steps, fin, _ = run_opt(initial_structure,
147-
"VASP",
148-
use_al=False,
149-
store_wf=False, )
155+
t, steps, fin, _ = run_opt(
156+
initial_structure,
157+
"VASP",
158+
use_al=False,
159+
store_wf=False,
160+
)
150161
print(f"Time: {t:.4s}")
151-
162+
152163
print("*" * 40)
153164
print("Running with BFGS + vasp -- cache")
154-
t, steps, fin, _ = run_opt(initial_structure,
155-
"VASP",
156-
use_al=False,
157-
store_wf=True, )
165+
t, steps, fin, _ = run_opt(
166+
initial_structure,
167+
"VASP",
168+
use_al=False,
169+
store_wf=True,
170+
)
158171
print(f"Time: {t:.4s}")
159172

160173
print("*" * 40)
161174
print("Running with BFGS + vasp inter")
162-
t, steps, fin, _ = run_opt(initial_structure,
163-
"VaspInteractive",
164-
use_al=False, )
175+
t, steps, fin, _ = run_opt(
176+
initial_structure,
177+
"VaspInteractive",
178+
use_al=False,
179+
)
165180
print(f"Time: {t:.4s}")
166181

167-
168-
169182
print("*" * 40)
170183
print("Running with OAL + vasp")
171-
t, steps, fin, _ = run_opt(initial_structure,
172-
"VASP",
173-
use_al=True,
174-
store_wf=True, )
184+
t, steps, fin, _ = run_opt(
185+
initial_structure,
186+
"VASP",
187+
use_al=True,
188+
store_wf=True,
189+
)
175190
print(f"Time: {t:.4s}")
176-
177191

178192
print("*" * 40)
179193
print("Running with OAL + vasp inter")
180-
t, steps, fin, _ = run_opt(initial_structure,
181-
"VaspInteractive",
182-
use_al=True, )
194+
t, steps, fin, _ = run_opt(
195+
initial_structure,
196+
"VaspInteractive",
197+
use_al=True,
198+
)
183199
print(f"Time: {t:.4s}")
184-

examples/ex13_pause_mpi.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222
random_seeds = [133, 357, 274, 331, 140]
2323
vasp_params = dict(xc="pbe", encut=250, istart=0, lwave=False)
2424

25-
mol = molecule("C2H4", vacuum=5, pbc=True)
25+
mol = molecule("CH3CH2NH2", vacuum=5, pbc=True)
26+
2627

2728
def expensive_function(seed=42):
28-
"""Useless function only to consume cpu
29-
"""
29+
"""Useless function only to consume cpu"""
3030
# t_start = time.time()
31-
size = 2048
31+
size = 1024
3232
A = np.random.random((size, size))
3333
B = np.random.random((size, size))
3434
C = np.dot(A, B)
@@ -37,9 +37,16 @@ def expensive_function(seed=42):
3737
t_elasp = timeit.timeit(lambda: np.dot(A, B), number=4)
3838
return t_elasp
3939

40+
4041
def multiproc_expensive_function(nprocs=8, seed=42):
4142
with Pool(nprocs) as pool:
42-
t_list = pool.map(expensive_function, [seed, ] * nprocs)
43+
t_list = pool.map(
44+
expensive_function,
45+
[
46+
seed,
47+
]
48+
* nprocs,
49+
)
4350
return t_list
4451

4552

@@ -67,16 +74,9 @@ def run_calc(pause=False):
6774
t_end = time.time()
6875
print(f"Total computation time {t_end - t_start} s.")
6976
return
70-
71-
72-
73-
74-
75-
76-
77-
7877

7978

8079
if __name__ == "__main__":
8180
run_calc(pause=False)
81+
time.sleep(3)
8282
run_calc(pause=True)

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66

77
setup(
88
name="vasp-interactive",
9-
version="0.0.4",
9+
version="0.0.5",
1010
packages=["vasp_interactive", "vasp_interactive.kubernetes"],
1111
install_requires=[
1212
"ase",
13+
"psutil",
1314
# "pymatgen",
1415
],
1516
)

tests/test_cell_tolerance.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,31 +18,33 @@
1818
# all default settings
1919
vasp_params = dict(xc="pbe", kpts=(1, 1, 1), gamma=True)
2020

21+
2122
def test_check_state():
22-
"""Unit test for check_state function """
23+
"""Unit test for check_state function"""
2324
vasp = Vasp(**vasp_params)
2425
vpi = VaspInteractive(**vasp_params)
25-
26+
2627
vasp.atoms = h2_origin
2728
# Both will report a cell change
2829
system_changes = vasp.check_state(h2_1)
2930
assert "positions" not in system_changes
3031
assert "cell" in system_changes
31-
32+
3233
system_changes = vasp.check_state(h2_2)
3334
assert "positions" not in system_changes
3435
assert "cell" in system_changes
35-
36+
3637
# for VaspInteractive, h2_1 is accepted
3738
vpi.atoms = h2_origin
3839
system_changes = vpi.check_state(h2_1)
3940
assert "positions" not in system_changes
4041
assert "cell" not in system_changes
41-
42+
4243
system_changes = vpi.check_state(h2_2)
4344
assert "positions" not in system_changes
4445
assert "cell" in system_changes
4546

47+
4648
def test_calculation():
4749
vpi = VaspInteractive(**vasp_params)
4850
with vpi:
@@ -56,9 +58,5 @@ def test_calculation():
5658
h2_2.calc = vpi
5759
with pytest.raises(Exception):
5860
e3 = h2_2.get_potential_energy()
59-
61+
6062
assert pytest.approx(e1) == e2
61-
62-
63-
64-

vasp_interactive/vasp_interactive.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525

2626
def _find_mpi_process(pid):
27-
"""Recursively search children processes with PID=pid and return the one
27+
"""Recursively search children processes with PID=pid and return the one
2828
that mpirun (or synonyms) are the main command
2929
"""
3030
allowed_names = ["mpirun", "mpiexec", "orterun", "oshrun", "shmemrun"]
@@ -37,7 +37,6 @@ def _find_mpi_process(pid):
3737
mpi_proc = proc
3838
break
3939
return mpi_proc
40-
4140

4241

4342
class VaspInteractive(Vasp):
@@ -135,7 +134,7 @@ def __init__(
135134
"In some cases the energy and forces can be wrong. "
136135
"Use such settings at your own risk."
137136
)
138-
137+
139138
# Cell tolerance parameter
140139
self.cell_tolerance = abs(cell_tolerance)
141140
if self.cell_tolerance > 1e-3:
@@ -162,7 +161,6 @@ def reset(self):
162161
self._force_kill_process()
163162
self.steps = 0
164163
self.final = False
165-
166164

167165
def _ensure_directory(self):
168166
"""Makesure self.directory exists, if not use `os.makedirs`"""
@@ -396,29 +394,25 @@ def close(self):
396394
self._stdout("VASP has been closed\n", out=out)
397395
self.process = None
398396
return
399-
400397

401398
def pause_calc(self, sig=signal.SIGTSTP):
402-
"""Pause the vasp processes by sending SIGTSTP to the master mpirun process
403-
"""
399+
"""Pause the vasp processes by sending SIGTSTP to the master mpirun process"""
404400
pid = self.process.pid
405401
mpi_process = _find_mpi_process(pid)
406402
if mpi_process is None:
407403
warn("Cannot find the mpi process. Will not send stop signal to mpi.")
408404
return
409405
mpi_process.send_signal(sig)
410406
return
411-
407+
412408
def resume_calc(self, sig=signal.SIGCONT):
413-
"""Resumt the vasp processes by sending SIGCONT to the master mpirun process
414-
"""
409+
"""Resumt the vasp processes by sending SIGCONT to the master mpirun process"""
415410
pid = self.process.pid
416411
mpi_process = _find_mpi_process(pid)
417412
if mpi_process is None:
418413
warn("Cannot find the mpi process. Will not send continue signal to mpi.")
419414
return
420415
mpi_process.send_signal(sig)
421-
422416

423417
def check_state(self, atoms, tol=1e-15):
424418
"""Modified check_state method to allow separate check for cell tolerance"""
@@ -430,7 +424,6 @@ def check_state(self, atoms, tol=1e-15):
430424
old_system_changes = [sc for sc in old_system_changes if sc != "cell"]
431425
return old_system_changes
432426

433-
434427
def calculate(
435428
self,
436429
atoms=None,

0 commit comments

Comments
 (0)