Skip to content

Commit 213331e

Browse files
committed
Merge branch 'master' into int1e_1st_derivative
2 parents db718d0 + 010ca2d commit 213331e

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

99 files changed

+51328
-54685
lines changed

README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ Features
5353
- MP2/DF-MP2 and CCSD (experimental);
5454
- Polarizability, IR, and NMR shielding (experimental);
5555
- QM/MM with PBC;
56-
- CHELPG, ESP, and RESP atomic charge
56+
- CHELPG, ESP, and RESP atomic charge;
57+
- Multi-GPU for density fitting (experimental)
5758

5859
Limitations
5960
--------
@@ -134,22 +135,22 @@ References
134135
---------
135136
```
136137
@misc{li2024introducting,
137-
title={Introducing GPU-acceleration into the Python-based Simulations of Chemistry Framework},
138+
title={Introducing GPU-acceleration into the Python-based Simulations of Chemistry Framework},
138139
author={Rui Li and Qiming Sun and Xing Zhang and Garnet Kin-Lic Chan},
139140
year={2024},
140141
eprint={2407.09700},
141142
archivePrefix={arXiv},
142143
primaryClass={physics.comp-ph},
143-
url={https://arxiv.org/abs/2407.09700},
144+
url={https://arxiv.org/abs/2407.09700},
144145
}
145146
146147
@misc{wu2024enhancing,
147-
title={Enhancing GPU-acceleration in the Python-based Simulations of Chemistry Framework},
148+
title={Enhancing GPU-acceleration in the Python-based Simulations of Chemistry Framework},
148149
author={Xiaojie Wu and Qiming Sun and Zhichen Pu and Tianze Zheng and Wenzhi Ma and Wen Yan and Xia Yu and Zhengxiao Wu and Mian Huo and Xiang Li and Weiluo Ren and Sheng Gong and Yumin Zhang and Weihao Gao},
149150
year={2024},
150151
eprint={2404.09452},
151152
archivePrefix={arXiv},
152153
primaryClass={physics.comp-ph},
153-
url={https://arxiv.org/abs/2404.09452},
154+
url={https://arxiv.org/abs/2404.09452},
154155
}
155156
```

examples/00-h2o.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,12 @@
3636
atom=atom, # water molecule
3737
basis='def2-tzvpp', # basis set
3838
output='./pyscf.log', # save log file
39-
verbose=6 # control the level of print info
39+
verbose=6 # control the level of print info
4040
)
4141

4242
mf_GPU = rks.RKS( # restricted Kohn-Sham DFT
4343
mol, # pyscf.gto.object
44-
xc='b3lyp' # xc functionals, such as pbe0, wb97m-v, tpss,
44+
xc='b3lyp' # xc functionals, such as pbe0, wb97m-v, tpss,
4545
).density_fit() # density fitting
4646

4747
mf_GPU.grids.atom_grid = (99,590) # (99,590) Lebedev grids; (75,302) is often enough

gpu4pyscf/__config__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,11 @@
2424
mem_fraction = 0.9
2525
cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
2626

27+
# Check whether peer-to-peer (P2P) data transfer is available between every pair of devices
28+
_p2p_access = True
29+
if _num_devices > 1:
30+
for src in range(_num_devices):
31+
for dst in range(_num_devices):
32+
if src != dst:
33+
can_access_peer = cupy.cuda.runtime.deviceCanAccessPeer(src, dst)
34+
_p2p_access &= can_access_peer

gpu4pyscf/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = '1.1.0'
1+
__version__ = '1.2.0'
22

33
# monkey patch libxc reference due to a bug in nvcc
44
from pyscf.dft import libxc

gpu4pyscf/df/df.py

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from cupyx.scipy.linalg import solve_triangular
2222
from pyscf import lib
2323
from pyscf.df import df, addons, incore
24-
from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph
24+
from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer
2525
from gpu4pyscf.df import int3c2e, df_jk
2626
from gpu4pyscf.lib import logger
2727
from gpu4pyscf import __config__
@@ -123,7 +123,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
123123
if key in self._rsh_df:
124124
rsh_df = self._rsh_df[key]
125125
else:
126-
rsh_df = self._rsh_df[key] = copy.copy(self).reset()
126+
rsh_df = self._rsh_df[key] = self.copy().reset()
127127
logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
128128

129129
return df_jk.get_jk(rsh_df, dm, hermi, with_j, with_k, direct_scf_tol, omega=omega)
@@ -177,10 +177,10 @@ def loop(self, blksize=None, unpack=True):
177177
yield buf2, buf.T
178178
if isinstance(cderi_sparse, np.ndarray):
179179
cupy.cuda.Device().synchronize()
180-
180+
181181
if buf_prefetch is not None:
182182
buf = buf_prefetch
183-
183+
184184
def reset(self, mol=None):
185185
'''Reset mol and clean up relevant attributes for scanner mode'''
186186
if mol is not None:
@@ -208,13 +208,14 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
208208
npairs = len(intopt.cderi_row)
209209
log = logger.new_logger(mol, mol.verbose)
210210

211-
# if the matrix exceeds the limit, store CDERI in CPU memory
212-
# TODO: better estimate of memory consumption for each device
211+
# Available memory on Device 0.
213212
avail_mem = get_avail_mem()
214213

215214
if use_gpu_memory:
216-
# If GPU memory is not enough
217-
use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem
215+
# CDERI will be equally distributed to the devices
216+
# Other devices usually have more memory available than Device 0
217+
# CDERI will use up to 40% of the available memory
218+
use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices
218219

219220
if use_gpu_memory:
220221
log.debug("Saving CDERI on GPU")
@@ -244,9 +245,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
244245
cd_low_f = cupy.array(cd_low, order='F', copy=False)
245246
cd_low_f = tag_array(cd_low_f, tag=cd_low.tag)
246247

247-
for gpu_id in range(_num_devices):
248-
cupy.cuda.Device(gpu_id).synchronize()
249-
248+
cupy.cuda.get_current_stream().synchronize()
250249
futures = []
251250
with ThreadPoolExecutor(max_workers=_num_devices) as executor:
252251
for device_id in range(_num_devices):
@@ -258,9 +257,6 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
258257
for future in futures:
259258
future.result()
260259

261-
for device_id in range(_num_devices):
262-
cupy.cuda.Device(device_id).synchronize()
263-
264260
if not use_gpu_memory:
265261
cupy.cuda.Device().synchronize()
266262

@@ -344,14 +340,14 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
344340
# if CDERI is saved on CPU
345341
ij0 = pairs_loc[cp_ij_id]
346342
ij1 = pairs_loc[cp_ij_id+1]
347-
if isinstance(_cderi, np.ndarray):
343+
if isinstance(_cderi[0], np.ndarray):
348344
for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
349345
for i in range(p0,p1):
350-
cderi_block[i].get(out=_cderi[slice_id][i,ij0:ij1])
346+
cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
351347
else:
352348
# Copy data to other Devices
353349
for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
354-
_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
355-
350+
#_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
351+
p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1])
356352
t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
357353
return

gpu4pyscf/df/df_jk.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def build_df():
4747
if key in mf.with_df._rsh_df:
4848
rsh_df = mf.with_df._rsh_df[key]
4949
else:
50-
rsh_df = mf.with_df._rsh_df[key] = copy.copy(mf.with_df).reset()
50+
rsh_df = mf.with_df._rsh_df[key] = mf.with_df.copy().reset()
5151
rsh_df.build(omega=omega)
5252
return
5353

@@ -101,7 +101,7 @@ def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False):
101101
mf.with_df = with_df
102102
elif getattr(mf.with_df, 'auxbasis', None) != auxbasis:
103103
#logger.warn(mf, 'DF might have been initialized twice.')
104-
mf = copy.copy(mf)
104+
mf = mf.copy()
105105
mf.with_df = with_df
106106
mf.only_dfj = only_dfj
107107
return mf
@@ -298,8 +298,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
298298
rhok = rhok.reshape([-1,nao])
299299
vk[i] += cupy.dot(rhok.T, rhok)
300300
rhok = None
301-
cupy.cuda.get_current_stream().synchronize()
302-
301+
303302
if with_j:
304303
vj = cupy.zeros(dms_shape)
305304
vj[:,rows,cols] = vj_packed
@@ -390,13 +389,12 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
390389
else:
391390
dm_sparse *= 2
392391
dm_sparse[:, intopt.cderi_diag] *= .5
393-
392+
vj_sparse = cupy.zeros_like(dm_sparse)
393+
394394
if with_k:
395395
vk = cupy.zeros_like(dms)
396396

397397
nset = dms.shape[0]
398-
if with_j:
399-
vj_sparse = cupy.zeros_like(dm_sparse)
400398
blksize = dfobj.get_blksize()
401399
for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k):
402400
if with_j:
@@ -406,7 +404,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
406404
for k in range(nset):
407405
rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao])
408406
#vk[k] += contract('Lki,Lkj->ij', rhok, cderi)
409-
vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
407+
vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
410408
if with_j:
411409
vj = cupy.zeros(dms_shape)
412410
vj[:,rows,cols] = vj_sparse
@@ -445,6 +443,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
445443
intopt = dfobj.intopt
446444
dms = intopt.sort_orbitals(dms, axis=[1,2])
447445

446+
cupy.cuda.get_current_stream().synchronize()
448447
if getattr(dms_tag, 'mo_coeff', None) is not None:
449448
mo_occ = dms_tag.mo_occ
450449
mo_coeff = dms_tag.mo_coeff
@@ -498,13 +497,13 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
498497
vj = vk = None
499498
if with_j:
500499
vj = [future.result()[0] for future in futures]
501-
vj = reduce_to_device(vj)
500+
vj = reduce_to_device(vj, inplace=True)
502501
vj = intopt.unsort_orbitals(vj, axis=[1,2])
503502
vj = vj.reshape(out_shape)
504-
503+
505504
if with_k:
506505
vk = [future.result()[1] for future in futures]
507-
vk = reduce_to_device(vk)
506+
vk = reduce_to_device(vk, inplace=True)
508507
vk = intopt.unsort_orbitals(vk, axis=[1,2])
509508
vk = vk.reshape(out_shape)
510509

@@ -529,7 +528,7 @@ def _get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True,
529528
if key in dfobj._rsh_df:
530529
rsh_df = dfobj._rsh_df[key]
531530
else:
532-
rsh_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
531+
rsh_df = dfobj._rsh_df[key] = dfobj.copy().reset()
533532
logger.info(dfobj, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
534533

535534
with rsh_df.mol.with_range_coulomb(omega):

gpu4pyscf/df/grad/jk.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from concurrent.futures import ThreadPoolExecutor
1717
import cupy
18-
from gpu4pyscf.lib.cupy_helper import contract
18+
from gpu4pyscf.lib.cupy_helper import contract, concatenate
1919
from gpu4pyscf.lib import logger
2020
from gpu4pyscf.__config__ import _streams, _num_devices
2121

@@ -58,6 +58,7 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
5858
''' Calculate rhoj and rhok on Multi-GPU system
5959
'''
6060
futures = []
61+
cupy.cuda.get_current_stream().synchronize()
6162
with ThreadPoolExecutor(max_workers=_num_devices) as executor:
6263
for device_id in range(_num_devices):
6364
future = executor.submit(
@@ -74,8 +75,8 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
7475

7576
rhoj = rhok = None
7677
if with_j:
77-
rhoj = cupy.concatenate(rhoj_total)
78+
rhoj = concatenate(rhoj_total)
7879
if with_k:
79-
rhok = cupy.concatenate(rhok_total)
80+
rhok = concatenate(rhok_total)
8081

8182
return rhoj, rhok

gpu4pyscf/df/grad/rhf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
7171
with_df = mf_grad.base.with_df._rsh_df[key]
7272
else:
7373
dfobj = mf_grad.base.with_df
74-
with_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
74+
with_df = dfobj._rsh_df[key] = dfobj.copy().reset()
7575

7676
auxmol = with_df.auxmol
7777
if not hasattr(with_df, 'intopt') or with_df._cderi is None:
@@ -282,4 +282,4 @@ def extra_force(self, atom_id, envs):
282282
else:
283283
return 0
284284

285-
Grad = Gradients
285+
Grad = Gradients

gpu4pyscf/df/grad/uhf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
5151
with_df = mf_grad.base.with_df._rsh_df[key]
5252
else:
5353
dfobj = mf_grad.base.with_df
54-
with_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
54+
with_df = dfobj._rsh_df[key] = dfobj.copy().reset()
5555

5656
auxmol = with_df.auxmol
5757
if not hasattr(with_df, 'intopt') or with_df._cderi is None:

0 commit comments

Comments
 (0)