bytedance
diff --git a/‎README.md‎
Lines changed: 6 additions & 5 deletions b/‎README.md‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎examples/00-h2o.py‎
Lines changed: 2 additions & 2 deletions b/‎examples/00-h2o.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎gpu4pyscf/__config__.py‎
Lines changed: 8 additions & 0 deletions b/‎gpu4pyscf/__config__.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎gpu4pyscf/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎gpu4pyscf/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎gpu4pyscf/df/df.py‎
Lines changed: 14 additions & 18 deletions b/‎gpu4pyscf/df/df.py‎
Lines changed: 14 additions & 18 deletions
diff --git a/‎gpu4pyscf/df/df_jk.py‎
Lines changed: 11 additions & 12 deletions b/‎gpu4pyscf/df/df_jk.py‎
Lines changed: 11 additions & 12 deletions
diff --git a/‎gpu4pyscf/df/grad/jk.py‎
Lines changed: 4 additions & 3 deletions b/‎gpu4pyscf/df/grad/jk.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎gpu4pyscf/df/grad/rhf.py‎
Lines changed: 2 additions & 2 deletions b/‎gpu4pyscf/df/grad/rhf.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎gpu4pyscf/df/grad/uhf.py‎
Lines changed: 1 addition & 1 deletion b/‎gpu4pyscf/df/grad/uhf.py‎
Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,8 @@ Features
 - MP2/DF-MP2 and CCSD (experimental);
 - Polarizability, IR, and NMR shielding (experimental);
 - QM/MM with PBC;
-- CHELPG, ESP, and RESP atomic charge
+- CHELPG, ESP, and RESP atomic charge;
+- Multi-GPU for density fitting (experimental)
 
 Limitations
 --------
@@ -134,22 +135,22 @@ References
 ---------
 ```
 @misc{li2024introducting,
-      title={Introducing GPU-acceleration into the Python-based Simulations of Chemistry Framework}, 
+      title={Introducing GPU-acceleration into the Python-based Simulations of Chemistry Framework},
       author={Rui Li and Qiming Sun and Xing Zhang and Garnet Kin-Lic Chan},
       year={2024},
       eprint={2407.09700},
       archivePrefix={arXiv},
       primaryClass={physics.comp-ph},
-      url={https://arxiv.org/abs/2407.09700}, 
+      url={https://arxiv.org/abs/2407.09700},
 }
 
 @misc{wu2024enhancing,
-      title={Enhancing GPU-acceleration in the Python-based Simulations of Chemistry Framework}, 
+      title={Enhancing GPU-acceleration in the Python-based Simulations of Chemistry Framework},
       author={Xiaojie Wu and Qiming Sun and Zhichen Pu and Tianze Zheng and Wenzhi Ma and Wen Yan and Xia Yu and Zhengxiao Wu and Mian Huo and Xiang Li and Weiluo Ren and Sheng Gong and Yumin Zhang and Weihao Gao},
       year={2024},
       eprint={2404.09452},
       archivePrefix={arXiv},
       primaryClass={physics.comp-ph},
-      url={https://arxiv.org/abs/2404.09452}, 
+      url={https://arxiv.org/abs/2404.09452},
 }
 ```
@@ -36,12 +36,12 @@
     atom=atom,                         # water molecule
     basis='def2-tzvpp',                # basis set
     output='./pyscf.log',              # save log file
-    verbose=6                          # control the level of print info
+    verbose=6                         # control the level of print info
     )
 
 mf_GPU = rks.RKS(                      # restricted Kohn-Sham DFT
     mol,                               # pyscf.gto.object
-    xc='b3lyp'                         # xc funtionals, such as pbe0, wb97m-v, tpss,
+    xc='b3lyp'                        # xc functionals, such as pbe0, wb97m-v, tpss,
     ).density_fit()                    # density fitting
 
 mf_GPU.grids.atom_grid = (99,590)      # (99,590) lebedev grids, (75,302) is often enough
 
@@ -24,3 +24,11 @@
 mem_fraction = 0.9
 cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
 
+# Check whether P2P data transfer is available between every pair of devices
+_p2p_access = True
+if _num_devices > 1:
+    for src in range(_num_devices):
+        for dst in range(_num_devices):
+            if src != dst:
+                can_access_peer = cupy.cuda.runtime.deviceCanAccessPeer(src, dst)
+                _p2p_access &= can_access_peer
@@ -1,4 +1,4 @@
-__version__ = '1.1.0'
+__version__ = '1.2.0'
 
 # monkey patch libxc reference due to a bug in nvcc
 from pyscf.dft import libxc
 
@@ -21,7 +21,7 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import lib
 from pyscf.df import df, addons, incore
-from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph
+from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -123,7 +123,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
         if key in self._rsh_df:
             rsh_df = self._rsh_df[key]
         else:
-            rsh_df = self._rsh_df[key] = copy.copy(self).reset()
+            rsh_df = self._rsh_df[key] = self.copy().reset()
             logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
 
         return df_jk.get_jk(rsh_df, dm, hermi, with_j, with_k, direct_scf_tol, omega=omega)
@@ -177,10 +177,10 @@ def loop(self, blksize=None, unpack=True):
             yield buf2, buf.T
             if isinstance(cderi_sparse, np.ndarray):
                 cupy.cuda.Device().synchronize()
-
+            
             if buf_prefetch is not None:
                 buf = buf_prefetch
-
+            
     def reset(self, mol=None):
         '''Reset mol and clean up relevant attributes for scanner mode'''
         if mol is not None:
@@ -208,13 +208,14 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     npairs = len(intopt.cderi_row)
     log = logger.new_logger(mol, mol.verbose)
 
-    # if the matrix exceeds the limit, store CDERI in CPU memory
-    # TODO: better estimate of memory consumption for each device
+    # Available memory on Device 0.
     avail_mem = get_avail_mem()
 
     if use_gpu_memory:
-        # If GPU memory is not enough
-        use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem
+        # CDERI is distributed evenly across the devices.
+        # Other devices usually have more memory available than Device 0,
+        # so allow CDERI up to 40% of Device 0's available memory per device.
+        use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices
 
     if use_gpu_memory:
         log.debug("Saving CDERI on GPU")
@@ -244,9 +245,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     cd_low_f = cupy.array(cd_low, order='F', copy=False)
     cd_low_f = tag_array(cd_low_f, tag=cd_low.tag)
 
-    for gpu_id in range(_num_devices):
-        cupy.cuda.Device(gpu_id).synchronize()
-
+    cupy.cuda.get_current_stream().synchronize()
     futures = []
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
@@ -258,9 +257,6 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     for future in futures:
         future.result()
 
-    for device_id in range(_num_devices):
-        cupy.cuda.Device(device_id).synchronize()
-
     if not use_gpu_memory:
         cupy.cuda.Device().synchronize()
 
@@ -344,14 +340,14 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
             # if CDERI is saved on CPU
             ij0 = pairs_loc[cp_ij_id]
             ij1 = pairs_loc[cp_ij_id+1]
-            if isinstance(_cderi, np.ndarray):
+            if isinstance(_cderi[0], np.ndarray):
                 for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
                     for i in range(p0,p1):
-                        cderi_block[i].get(out=_cderi[slice_id][i,ij0:ij1])
+                        cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
             else:
                 # Copy data to other Devices
                 for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                    _cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
-            
+                    #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
+                    p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1])
             t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
     return
@@ -47,7 +47,7 @@ def build_df():
             if key in mf.with_df._rsh_df:
                 rsh_df = mf.with_df._rsh_df[key]
             else:
-                rsh_df = mf.with_df._rsh_df[key] = copy.copy(mf.with_df).reset()
+                rsh_df = mf.with_df._rsh_df[key] = mf.with_df.copy().reset()
             rsh_df.build(omega=omega)
         return
 
@@ -101,7 +101,7 @@ def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False):
             mf.with_df = with_df
         elif getattr(mf.with_df, 'auxbasis', None) != auxbasis:
             #logger.warn(mf, 'DF might have been initialized twice.')
-            mf = copy.copy(mf)
+            mf = mf.copy()
             mf.with_df = with_df
             mf.only_dfj = only_dfj
         return mf
@@ -298,8 +298,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
                         rhok = rhok.reshape([-1,nao])
                         vk[i] += cupy.dot(rhok.T, rhok)
                     rhok = None
-                cupy.cuda.get_current_stream().synchronize()
-                
+
             if with_j:
                 vj = cupy.zeros(dms_shape)
                 vj[:,rows,cols] = vj_packed
@@ -390,13 +389,12 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
             else:
                 dm_sparse *= 2
             dm_sparse[:, intopt.cderi_diag] *= .5
-        
+            vj_sparse = cupy.zeros_like(dm_sparse)
+
         if with_k:
             vk = cupy.zeros_like(dms)
 
         nset = dms.shape[0]
-        if with_j:
-            vj_sparse = cupy.zeros_like(dm_sparse)
         blksize = dfobj.get_blksize()
         for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k):
             if with_j:
@@ -406,7 +404,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
                 for k in range(nset):
                     rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao])
                     #vk[k] += contract('Lki,Lkj->ij', rhok, cderi)
-                    vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
+                    vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))            
         if with_j:
             vj = cupy.zeros(dms_shape)
             vj[:,rows,cols] = vj_sparse
@@ -445,6 +443,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
     intopt = dfobj.intopt
     dms = intopt.sort_orbitals(dms, axis=[1,2])
 
+    cupy.cuda.get_current_stream().synchronize()
     if getattr(dms_tag, 'mo_coeff', None) is not None:
         mo_occ = dms_tag.mo_occ
         mo_coeff = dms_tag.mo_coeff
@@ -498,13 +497,13 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
     vj = vk = None
     if with_j:
         vj = [future.result()[0] for future in futures]
-        vj = reduce_to_device(vj)
+        vj = reduce_to_device(vj, inplace=True)
         vj = intopt.unsort_orbitals(vj, axis=[1,2])
         vj = vj.reshape(out_shape)
-    
+
     if with_k:
         vk = [future.result()[1] for future in futures]
-        vk = reduce_to_device(vk)
+        vk = reduce_to_device(vk, inplace=True)
         vk = intopt.unsort_orbitals(vk, axis=[1,2])
         vk = vk.reshape(out_shape)
 
@@ -529,7 +528,7 @@ def _get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True,
     if key in dfobj._rsh_df:
         rsh_df = dfobj._rsh_df[key]
     else:
-        rsh_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
+        rsh_df = dfobj._rsh_df[key] = dfobj.copy().reset()
         logger.info(dfobj, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
 
     with rsh_df.mol.with_range_coulomb(omega):
 
@@ -15,7 +15,7 @@
 
 from concurrent.futures import ThreadPoolExecutor
 import cupy
-from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.lib.cupy_helper import contract, concatenate
 from gpu4pyscf.lib import logger
 from gpu4pyscf.__config__ import _streams, _num_devices
 
@@ -58,6 +58,7 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
     ''' Calculate rhoj and rhok on Multi-GPU system
     '''
     futures = []
+    cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
@@ -74,8 +75,8 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
 
     rhoj = rhok = None
     if with_j:
-        rhoj = cupy.concatenate(rhoj_total)
+        rhoj = concatenate(rhoj_total)
     if with_k:
-        rhok = cupy.concatenate(rhok_total)
+        rhok = concatenate(rhok_total)
 
     return rhoj, rhok
@@ -71,7 +71,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
             with_df = mf_grad.base.with_df._rsh_df[key]
         else:
             dfobj = mf_grad.base.with_df
-            with_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
+            with_df = dfobj._rsh_df[key] = dfobj.copy().reset()
 
     auxmol = with_df.auxmol
     if not hasattr(with_df, 'intopt') or with_df._cderi is None:
@@ -282,4 +282,4 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
-Grad = Gradients
+Grad = Gradients
@@ -51,7 +51,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
             with_df = mf_grad.base.with_df._rsh_df[key]
         else:
             dfobj = mf_grad.base.with_df
-            with_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
+            with_df = dfobj._rsh_df[key] = dfobj.copy().reset()
 
     auxmol = with_df.auxmol
     if not hasattr(with_df, 'intopt') or with_df._cderi is None:
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = '1.1.0'`
	`1`	`+__version__ = '1.2.0'`
`2`	`2`
`3`	`3`	`# monkey patch libxc reference due to a bug in nvcc`
`4`	`4`	`from pyscf.dft import libxc`