Add readme section for pause / resume

alchem0x2A · alchem0x2A · commit a6be689964e4 · 2022-02-09T19:48:12.000Z
diff --git a/README.md b/README.md
@@ -144,6 +144,11 @@ not beneficial due to big difference of wavefunctions on neighboring images.
 On the other hand, `KubeVaspInteractive` has nearly linear scaling with worker pod numbers,
 if the workload per pod is balanced (see [examples/ex11_k8s_minimal.py](examples/ex11_k8s_minimal.py)).
 
+By default, the MPI processes that run the VASP calculations occupy 100% CPU on the allocated cores / slots, even while waiting for inputs.
+This can lead to undesired effects when other CPU-intensive code runs between two `VaspInteractive` ionic steps.
+Starting from version `0.0.5`, `VaspInteractive` provides the `pause_calc` and `resume_calc` methods, so the user can temporarily free the resources occupied by the VASP processes between two ionic steps.
+An example can be found in [ex13_pause_mpi.py](examples/ex13_pause_mpi.py). Note that this functionality is currently only tested with OpenMPI > 1.3.0. You may need to explicitly add the flag `--mca orte_forward_job_control 1` to your VASP command, or set it via an environment variable: `export OMPI_MCA_orte_forward_job_control=1`.
+
 
 
 
diff --git a/examples/ex10_mlp_online.py b/examples/ex10_mlp_online.py
@@ -49,34 +49,41 @@
     "txt": "-",
 }
 
-def gen_online_calc(images, parent_calc, 
-                    flare_config=default_flare_config, 
-                    learner_params=default_learner_params):
-    """Use default parameters to generate online calc
-    """
+
+def gen_online_calc(
+    images,
+    parent_calc,
+    flare_config=default_flare_config,
+    learner_params=default_learner_params,
+):
+    """Use default parameters to generate online calc"""
     from al_mlp.ml_potentials.flare_pp_calc import FlarePPCalc
     from al_mlp.online_learner.online_learner import OnlineLearner
+
     ml_potential = FlarePPCalc(flare_config, images)
     calc = OnlineLearner(learner_params, images, ml_potential, parent_calc)
     return calc
 
+
 def gen_cluster(metal="Cu", number=10):
-    """Use random code in cluster_mlp to generate a cluster
-    """
+    """Use random code in cluster_mlp to generate a cluster"""
     from cluster_mlp.fillPool import fillPool
     from ase.data import atomic_numbers, covalent_radii
+
     eleNames = [metal]
     eleNums = [number]
     eleRadii = [covalent_radii[atomic_numbers[ele]] for ele in eleNames]
     return fillPool(eleNames, eleNums, eleRadii, None)
 
+
 # curdir = Path("./").resolve()
 # example_dir = curdir / "mlp_benchmark"
 
+
 def parse_scf(output):
-    """Parse scf lines from stdout
-    """
+    """Parse scf lines from stdout"""
     import re
+
     lines = output.split("\n")
     pat = r"DAV\:\s+([\d]+)"
     prev = -1
@@ -93,10 +100,15 @@ def parse_scf(output):
     elec_steps.append(nex)
     return elec_steps
 
-def run_opt(initial_structure, vasp, optimizer=BFGS, 
-            use_al=True, 
-            store_wf=True, 
-            traj_name="oal_relax.traj"):
+
+def run_opt(
+    initial_structure,
+    vasp,
+    optimizer=BFGS,
+    use_al=True,
+    store_wf=True,
+    traj_name="oal_relax.traj",
+):
     """Choose a backend for vasp or VaspInteractive calculator"""
     os.system("rm -rf WAVECAR INCAR POTCAR POSCAR POTCAR")
     assert vasp.lower() in ("vasp", "vaspinteractive")
@@ -112,7 +124,7 @@ def run_opt(initial_structure, vasp, optimizer=BFGS,
         parent_calc.set(ibrion=-1, nsw=0)
         if not store_wf:
             parent_calc.set(istart=0, lwave=False)
-    
+
     t_ = time.time()
     output = io.StringIO()
     # Use context lib to redirect output
@@ -123,16 +135,13 @@ def run_opt(initial_structure, vasp, optimizer=BFGS,
             else:
                 real_calc = parent_calc
             images[0].calc = real_calc
-            
-            
-            dyn = optimizer(images[0], 
-                            trajectory=traj_name)
+
+            dyn = optimizer(images[0], trajectory=traj_name)
             if use_al:
                 dyn.attach(replay_trajectory, 1, images[0].calc, dyn)
             dyn.run(fmax=0.05, steps=1000)
     t_elaps = time.time() - t_
-        
-    
+
     output_string = output.getvalue()
     elec_steps = parse_scf(output_string)
     final_image = Trajectory(traj_name)[-1]
@@ -143,42 +152,48 @@ def run_opt(initial_structure, vasp, optimizer=BFGS,
     initial_structure = gen_cluster("Cu", 7)
     print("*" * 40)
     print("Running with BFGS + vasp -- no cache")
-    t, steps, fin, _ = run_opt(initial_structure,
-                          "VASP", 
-                          use_al=False, 
-                          store_wf=False, )
+    t, steps, fin, _ = run_opt(
+        initial_structure,
+        "VASP",
+        use_al=False,
+        store_wf=False,
+    )
     print(f"Time: {t:.4s}")
-    
+
     print("*" * 40)
     print("Running with BFGS + vasp -- cache")
-    t, steps, fin, _ = run_opt(initial_structure,
-                          "VASP", 
-                          use_al=False, 
-                          store_wf=True, )
+    t, steps, fin, _ = run_opt(
+        initial_structure,
+        "VASP",
+        use_al=False,
+        store_wf=True,
+    )
     print(f"Time: {t:.4s}")
 
     print("*" * 40)
     print("Running with BFGS + vasp inter")
-    t, steps, fin, _ = run_opt(initial_structure,
-                          "VaspInteractive", 
-                          use_al=False, )
+    t, steps, fin, _ = run_opt(
+        initial_structure,
+        "VaspInteractive",
+        use_al=False,
+    )
     print(f"Time: {t:.4s}")
 
-
-
     print("*" * 40)
     print("Running with OAL + vasp")
-    t, steps, fin, _ = run_opt(initial_structure,
-                          "VASP", 
-                          use_al=True, 
-                          store_wf=True, )
+    t, steps, fin, _ = run_opt(
+        initial_structure,
+        "VASP",
+        use_al=True,
+        store_wf=True,
+    )
     print(f"Time: {t:.4s}")
-    
 
     print("*" * 40)
     print("Running with OAL + vasp inter")
-    t, steps, fin, _ = run_opt(initial_structure,
-                          "VaspInteractive", 
-                          use_al=True, )
+    t, steps, fin, _ = run_opt(
+        initial_structure,
+        "VaspInteractive",
+        use_al=True,
+    )
     print(f"Time: {t:.4s}")
-
diff --git a/examples/ex13_pause_mpi.py b/examples/ex13_pause_mpi.py
@@ -22,13 +22,13 @@
 random_seeds = [133, 357, 274, 331, 140]
 vasp_params = dict(xc="pbe", encut=250, istart=0, lwave=False)
 
-mol = molecule("C2H4", vacuum=5, pbc=True)
+mol = molecule("CH3CH2NH2", vacuum=5, pbc=True)
+
 
 def expensive_function(seed=42):
-    """Useless function only to consume cpu
-    """
+    """Useless function only to consume cpu"""
     # t_start = time.time()
-    size = 2048
+    size = 1024
     A = np.random.random((size, size))
     B = np.random.random((size, size))
     C = np.dot(A, B)
@@ -37,9 +37,16 @@ def expensive_function(seed=42):
     t_elasp = timeit.timeit(lambda: np.dot(A, B), number=4)
     return t_elasp
 
+
 def multiproc_expensive_function(nprocs=8, seed=42):
     with Pool(nprocs) as pool:
-        t_list = pool.map(expensive_function, [seed, ] * nprocs)
+        t_list = pool.map(
+            expensive_function,
+            [
+                seed,
+            ]
+            * nprocs,
+        )
     return t_list
 
 
@@ -67,16 +74,9 @@ def run_calc(pause=False):
     t_end = time.time()
     print(f"Total computation time {t_end - t_start} s.")
     return
-        
-                
-    
-
-
-    
-
-
 
 
 if __name__ == "__main__":
     run_calc(pause=False)
+    time.sleep(3)
     run_calc(pause=True)
diff --git a/setup.py b/setup.py
@@ -6,10 +6,11 @@
 
 setup(
     name="vasp-interactive",
-    version="0.0.4",
+    version="0.0.5",
     packages=["vasp_interactive", "vasp_interactive.kubernetes"],
     install_requires=[
         "ase",
+        "psutil",
         # "pymatgen",
     ],
 )
diff --git a/tests/test_cell_tolerance.py b/tests/test_cell_tolerance.py
@@ -18,31 +18,33 @@
 # all default settings
 vasp_params = dict(xc="pbe", kpts=(1, 1, 1), gamma=True)
 
+
 def test_check_state():
-    """Unit test for check_state function """
+    """Unit test for check_state function"""
     vasp = Vasp(**vasp_params)
     vpi = VaspInteractive(**vasp_params)
-    
+
     vasp.atoms = h2_origin
     # Both will report a cell change
     system_changes = vasp.check_state(h2_1)
     assert "positions" not in system_changes
     assert "cell" in system_changes
-    
+
     system_changes = vasp.check_state(h2_2)
     assert "positions" not in system_changes
     assert "cell" in system_changes
-    
+
     # for VaspInteractive, h2_1 is accepted
     vpi.atoms = h2_origin
     system_changes = vpi.check_state(h2_1)
     assert "positions" not in system_changes
     assert "cell" not in system_changes
-    
+
     system_changes = vpi.check_state(h2_2)
     assert "positions" not in system_changes
     assert "cell" in system_changes
 
+
 def test_calculation():
     vpi = VaspInteractive(**vasp_params)
     with vpi:
@@ -56,9 +58,5 @@ def test_calculation():
         h2_2.calc = vpi
         with pytest.raises(Exception):
             e3 = h2_2.get_potential_energy()
-        
+
         assert pytest.approx(e1) == e2
-    
-    
-    
-    
diff --git a/vasp_interactive/vasp_interactive.py b/vasp_interactive/vasp_interactive.py
@@ -24,7 +24,7 @@
 
 
 def _find_mpi_process(pid):
-    """Recursively search children processes with PID=pid and return the one 
+    """Recursively search children processes with PID=pid and return the one
     that mpirun (or synonyms) are the main command
     """
     allowed_names = ["mpirun", "mpiexec", "orterun", "oshrun", "shmemrun"]
@@ -37,7 +37,6 @@ def _find_mpi_process(pid):
             mpi_proc = proc
             break
     return mpi_proc
-    
 
 
 class VaspInteractive(Vasp):
@@ -135,7 +134,7 @@ def __init__(
                 "In some cases the energy and forces can be wrong. "
                 "Use such settings at your own risk."
             )
-            
+
         # Cell tolerance parameter
         self.cell_tolerance = abs(cell_tolerance)
         if self.cell_tolerance > 1e-3:
@@ -162,7 +161,6 @@ def reset(self):
             self._force_kill_process()
             self.steps = 0
             self.final = False
-    
 
     def _ensure_directory(self):
         """Makesure self.directory exists, if not use `os.makedirs`"""
@@ -396,29 +394,25 @@ def close(self):
                 self._stdout("VASP has been closed\n", out=out)
                 self.process = None
             return
-        
 
     def pause_calc(self, sig=signal.SIGTSTP):
-        """Pause the vasp processes by sending SIGTSTP to the master mpirun process
-        """
+        """Pause the vasp processes by sending SIGTSTP to the master mpirun process"""
         pid = self.process.pid
         mpi_process = _find_mpi_process(pid)
         if mpi_process is None:
             warn("Cannot find the mpi process. Will not send stop signal to mpi.")
             return
         mpi_process.send_signal(sig)
         return
-    
+
     def resume_calc(self, sig=signal.SIGCONT):
-        """Resumt the vasp processes by sending SIGCONT to the master mpirun process
-        """
+        """Resume the vasp processes by sending SIGCONT to the master mpirun process"""
         pid = self.process.pid
         mpi_process = _find_mpi_process(pid)
         if mpi_process is None:
             warn("Cannot find the mpi process. Will not send continue signal to mpi.")
             return
         mpi_process.send_signal(sig)
-        
 
     def check_state(self, atoms, tol=1e-15):
         """Modified check_state method to allow separate check for cell tolerance"""
@@ -430,7 +424,6 @@ def check_state(self, atoms, tol=1e-15):
                 old_system_changes = [sc for sc in old_system_changes if sc != "cell"]
         return old_system_changes
 
-
     def calculate(
         self,
         atoms=None,