diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index bfd8d8d86..386706c86 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -32,7 +32,7 @@ # Use smaller blksize for old gaming GPUs if props['totalGlobalMem'] < 16 * GB: min_ao_blksize = 64 - min_grid_blksize = 64*64 + min_grid_blksize = 128*128 # Use 90% of the global memory for CuPy memory pool mem_fraction = 0.9 diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index c58c14284..a99e22c8b 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -20,7 +20,7 @@ from cupyx.scipy.linalg import solve_triangular from pyscf import lib from pyscf.df import df, addons, incore -from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, +from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer, copy_array) from gpu4pyscf.df import int3c2e, df_jk from gpu4pyscf.lib import logger @@ -269,7 +269,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, return _cderi -def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, +def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, omega=None, sr_only=False, device_id=0): ''' Execute CDERI tasks on one device ''' @@ -362,5 +362,5 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, _cderi[dev_id][:,ij0:ij1] = tmp else: _cderi[0][:,ij0:ij1] = cderi_block - t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) + t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) return diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 17816bc8d..9f0cc6d0c 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -92,7 +92,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega orbo = intopt.sort_orbitals(orbo, axis=[0]) rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k) - + # (d/dX P|Q) contributions if omega and omega > 
1e-10: with auxmol.with_range_coulomb(omega): @@ -151,7 +151,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega cart2sph = intopt.cart2sph orbo_cart = cart2sph @ orbo dm_cart = cart2sph @ dm @ cart2sph.T - + with_df._cderi = None # release GPU memory vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, with_j=with_j, with_k=with_k, omega=omega) diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py index 53acd7e02..a80786387 100644 --- a/gpu4pyscf/df/grad/uhf.py +++ b/gpu4pyscf/df/grad/uhf.py @@ -27,7 +27,7 @@ FREE_CUPY_CACHE = True BINSIZE = 128 -def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, +def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega=None, mo_coeff=None, mo_occ=None, dm2 = None): ''' Computes the first-order derivatives of the energy contributions from @@ -143,11 +143,11 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, nao_cart = intopt._sorted_mol.nao block_size = with_df.get_blksize(nao=nao_cart) - + intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=block_size)#, group_size=block_size) - + if not mol.cart: # sph2cart for ao cart2sph = intopt.cart2sph @@ -168,7 +168,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, with_df._cderi = None # release GPU memory vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, with_j=with_j, with_k=with_k, omega=omega) - + # NOTE: vj and vk are still in cartesian _sorted_mol = intopt._sorted_mol natm = _sorted_mol.natm diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index bb98e8575..9b677c970 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -210,13 +210,51 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', rho[tau_idx] *= .5 return 
rho +def eval_rho2_fast(ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', + with_lapl=False, verbose=None, out=None): + xctype = xctype.upper() + ngrids = ao.shape[-1] + + cpos = (mo_coeff * mo_occ**0.5)[:,mo_occ>0] + + nao, nocc = cpos.shape + buf1 = cupy.empty([nocc, ngrids]) + buf2 = cupy.empty([nocc, ngrids]) + + cublas_handle = cupy.cuda.device.get_cublas_handle() + stream = cupy.cuda.get_current_stream() + if xctype in ("LDA", "HF"): + xctype_code = 0 + rho = cupy.empty([ngrids]) + elif xctype in ("GGA", "NLC"): + xctype_code = 1 + rho = cupy.empty([4, ngrids]) + else: + xctype_code = 2 + rho = cupy.empty([5, ngrids]) + + assert ao.flags.c_contiguous and cpos.flags.c_contiguous + + err = libgdft.GDFTeval_rho2( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(cublas_handle, ctypes.c_void_p), + ctypes.cast(ao.data.ptr, ctypes.c_void_p), + ctypes.cast(cpos.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), ctypes.c_int(nocc), ctypes.c_int(nao), + ctypes.c_int(xctype_code), + ctypes.cast(buf1.data.ptr, ctypes.c_void_p), + ctypes.cast(buf2.data.ptr, ctypes.c_void_p), + ctypes.cast(rho.data.ptr, ctypes.c_void_p)) + + if err != 0: + raise RuntimeError('CUDA Error in GDFTeval_rho2') + + return rho + def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', with_lapl=False, verbose=None): xctype = xctype.upper() - if xctype == 'LDA' or xctype == 'HF': - _, ngrids = ao.shape - else: - _, ngrids = ao[0].shape + ngrids = ao.shape[-1] shls_slice = (0, mol.nbas) ao_loc = None #mol.ao_loc_nr() @@ -265,10 +303,7 @@ def eval_rho4(mol, ao, mo0, mo1, non0tab=None, xctype='LDA', hermi=0, log = logger.new_logger(mol, verbose) t0 = log.init_timer() xctype = xctype.upper() - if xctype == 'LDA' or xctype == 'HF': - _, ngrids = ao.shape - else: - _, ngrids = ao[0].shape + ngrids = ao.shape[-1] na = mo1.shape[0] if xctype == 'LDA' or xctype == 'HF': @@ -296,6 +331,41 @@ def eval_rho4(mol, ao, mo0, mo1, non0tab=None, xctype='LDA', hermi=0, t0 = log.timer_debug2('contract 
rho', *t0) return rho +def eval_vxc(ao, wv, idx, vmat, xctype_code): + nao = vmat.shape[-1] + wv = cupy.asarray(wv, order='C') + assert ao.flags['C_CONTIGUOUS'] + assert vmat.flags['C_CONTIGUOUS'] + assert idx.dtype == np.int32 + + vmat = vmat.reshape(-1, nao, nao) + nset = vmat.shape[0] + ngrids, nao_mask = ao.shape[-1], ao.shape[-2] + wv = wv.reshape(nset, -1, ngrids) + cublas_handle = cupy.cuda.device.get_cublas_handle() + stream = cupy.cuda.get_current_stream() + buf1 = cupy.empty([nao_mask, ngrids]) + buf2 = cupy.empty([nao_mask, nao_mask]) + for i in range(nset): + err = libgdft.GDFTeval_vxc( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(cublas_handle, ctypes.c_void_p), + ctypes.c_int(xctype_code), + ctypes.cast(ao.data.ptr, ctypes.c_void_p), + ctypes.cast(wv[i].data.ptr, ctypes.c_void_p), + ctypes.cast(buf1.data.ptr, ctypes.c_void_p), + ctypes.cast(buf2.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.c_int(nao_mask), + ctypes.c_int(nao), + ctypes.cast(idx.data.ptr, ctypes.c_void_p), + ctypes.cast(vmat[i].data.ptr, ctypes.c_void_p), + ) + + if err != 0: + raise RuntimeError('CUDA Error in GDFTeval_vxc') + return + def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): thresh=1e-10 @@ -383,7 +453,7 @@ def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): ctypes.c_int(coords.shape[0])) if err != 0: - raise RuntimeError('CUDA Error') + raise RuntimeError('CUDA Error in VXC_vv10nlc kernel') #exc is multiplied by Rho later exc[threshind] = Beta+0.5*F @@ -434,12 +504,12 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, rho_tot = cupy.empty([nset,4,ngrids_local]) else: rho_tot = cupy.empty([nset,5,ngrids_local]) - + p0 = p1 = 0 for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=None, grid_range=(grid_start, grid_end)): - p1 = p0 + weight.size + p0, p1 = p1, p1 + weight.size weights[p0:p1] = weight for i in range(nset): # If AO is sparse enough, use density matrix to 
calculate rho @@ -450,15 +520,14 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, else: assert hermi == 1 mo_coeff_mask = mo_coeff[idx,:] - rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, - None, xctype, with_lapl) - p0 = p1 + rho_tot[i,:,p0:p1] = eval_rho2_fast(ao_mask, mo_coeff_mask, mo_occ, + None, xctype, with_lapl) t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0) - + # libxc calls are still running on default stream nelec = cupy.zeros(nset) excsum = cupy.zeros(nset) - wv = [] + wv = cupy.empty_like(rho_tot) for i in range(nset): exc, vxc = ni.eval_xc_eff(xc_code, rho_tot[i], deriv=1, xctype=xctype)[:2] vxc = cupy.asarray(vxc, order='C') @@ -466,38 +535,27 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, den = rho_tot[i][0] * weights nelec[i] = den.sum() excsum[i] = cupy.dot(den, exc[:,0]) - wv.append(vxc * weights) + wv[i] = vxc * weights if xctype == 'GGA': wv[i][0] *= .5 if xctype == 'MGGA': wv[i][[0,4]] *= .5 t0 = log.timer_debug1(f'eval vxc on Device {device_id}', *t0) + if xctype in ("LDA", "HF"): + xctype_code = 0 + elif xctype in ("GGA", "NLC"): + xctype_code = 1 + else: + xctype_code = 2 + vmat = cupy.zeros((nset, nao, nao)) p0 = p1 = 0 for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=None, grid_range=(grid_start, grid_end)): - p1 = p0 + weight.size - for i in range(nset): - if xctype == 'LDA': - aow = _scale_ao(ao_mask, wv[i][0,p0:p1]) - add_sparse(vmat[i], ao_mask.dot(aow.T), idx) - elif xctype == 'GGA': - aow = _scale_ao(ao_mask, wv[i][:,p0:p1]) - add_sparse(vmat[i], ao_mask[0].dot(aow.T), idx) - elif xctype == 'NLC': - raise NotImplementedError('NLC') - elif xctype == 'MGGA': - aow = _scale_ao(ao_mask, wv[i][:4,p0:p1]) - vtmp = ao_mask[0].dot(aow.T) - vtmp+= _tau_dot(ao_mask, ao_mask, wv[i][4,p0:p1]) - add_sparse(vmat[i], vtmp, idx) - elif xctype == 'HF': - pass - else: - raise NotImplementedError(f'numint.nr_rks for functional 
{xc_code}') - p0 = p1 + p0, p1 = p1, p1 + weight.size + eval_vxc(ao_mask, wv[:,:,p0:p1], idx, vmat, xctype_code) t0 = log.timer_debug1(f'eval integration on {device_id}', *t0) return vmat, nelec.get(), excsum.get() @@ -691,7 +749,7 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, t1 = t0 = log.init_timer() for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): - p1 = p0 + weight.size + p0, p1 = p1, p1 + weight.size for i in range(nset): if mo_coeff is None: rho_tot[i,:,p0:p1] = eval_rho( @@ -702,7 +760,6 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mo_coeff_mask = mo_coeff[idx,:] rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) - p0 = p1 t1 = log.timer_debug2('eval rho slice', *t1) t0 = log.timer_debug1('eval rho', *t0) @@ -823,6 +880,13 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, else: ao_deriv = 1 + if xctype in ("LDA", "HF"): + xctype_code = 0 + elif xctype in ("GGA", "NLC"): + xctype_code = 1 + else: + xctype_code = 2 + ngrids_glob = grids.coords.shape[0] grid_start, grid_end = gen_grid_range(ngrids_glob, device_id) ngrids_local = grid_end - grid_start @@ -838,47 +902,36 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, rho_b = eval_rho(_sorted_mol, ao_mask, dmb[i][idx[:,None],idx], xctype=xctype, hermi=hermi) else: mo_coeff_mask = mo_coeff[:, idx,:] - rho_a = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) - rho_b = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) - + rho_a = eval_rho2_fast(ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype, with_lapl) + rho_b = eval_rho2_fast(ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype, with_lapl) rho = cupy.stack([rho_a, rho_b], axis=0) exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] t1 = log.timer_debug1('eval vxc', *t0) + if xctype == 'LDA':
weight - den_b = rho_b * weight wv = vxc[:,0] * weight - va = ao_mask.dot(_scale_ao(ao_mask, wv[0]).T) - vb = ao_mask.dot(_scale_ao(ao_mask, wv[1]).T) - add_sparse(vmata[i], va, idx) - add_sparse(vmatb[i], vb, idx) - elif xctype == 'GGA': - den_a = rho_a[0] * weight - den_b = rho_b[0] * weight wv = vxc * weight wv[:,0] *= .5 - va = ao_mask[0].dot(_scale_ao(ao_mask, wv[0]).T) - vb = ao_mask[0].dot(_scale_ao(ao_mask, wv[1]).T) - add_sparse(vmata[i], va, idx) - add_sparse(vmatb[i], vb, idx) elif xctype == 'NLC': raise NotImplementedError('NLC') elif xctype == 'MGGA': - den_a = rho_a[0] * weight - den_b = rho_b[0] * weight wv = vxc * weight wv[:,[0, 4]] *= .5 - va = ao_mask[0].dot(_scale_ao(ao_mask[:4], wv[0,:4]).T) - vb = ao_mask[0].dot(_scale_ao(ao_mask[:4], wv[1,:4]).T) - va += _tau_dot(ao_mask, ao_mask, wv[0,4]) - vb += _tau_dot(ao_mask, ao_mask, wv[1,4]) - add_sparse(vmata[i], va, idx) - add_sparse(vmatb[i], vb, idx) elif xctype == 'HF': pass else: raise NotImplementedError(f'numint.nr_uks for functional {xc_code}') + + eval_vxc(ao_mask, wv[0], idx, vmata[i], xctype_code) + eval_vxc(ao_mask, wv[1], idx, vmatb[i], xctype_code) + + if xctype == "LDA": + den_a = rho_a * weight + den_b = rho_b * weight + else: + den_a = rho_a[0] * weight + den_b = rho_b[0] * weight nelec[0,i] += den_a.sum() nelec[1,i] += den_b.sum() excsum[i] += cupy.dot(den_a, exc[:,0]) @@ -993,7 +1046,8 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None): if mo_coeff is None: rho[p0:p1] = eval_rho(_sorted_mol, ao, dm, xctype='LDA', hermi=1) else: - rho[p0:p1] = eval_rho2(_sorted_mol, ao, mo_coeff, mo_occ, None, 'LDA') + #rho[p0:p1] = eval_rho2(_sorted_mol, ao, mo_coeff, mo_occ, None, 'LDA') + rho[p0:p1] = eval_rho2_fast(ao, mo_coeff, mo_occ, None, 'LDA') t1 = log.timer_debug2('eval rho slice', *t1) t0 = log.timer_debug1('eval rho', *t0) @@ -1014,6 +1068,9 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, xctype = ni._xc_type(xc_code) opt = getattr(ni, 
'gdftopt', None) + if xctype == 'NLC': + raise NotImplementedError('NLC') + _sorted_mol = opt.mol nao = mol.nao dms = cupy.asarray(dms) @@ -1025,6 +1082,13 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, else: ao_deriv = 1 + if xctype in ("LDA", "HF"): + xctype_code = 0 + elif xctype in ("GGA", "NLC"): + xctype_code = 1 + else: + xctype_code = 2 + ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + num_devices - 1) // num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE @@ -1062,21 +1126,13 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, fxc_w = fxc[:,:,p0:p1] * weights wv = contract('axg,xyg->ayg', rho1, fxc_w) - for i in range(nset): - if xctype == 'LDA': - vmat_tmp = ao.dot(_scale_ao(ao, wv[i]).T) - elif xctype == 'GGA': - wv[i,0] *= .5 - aow = _scale_ao(ao, wv[i]) - vmat_tmp = aow.dot(ao[0].T) - elif xctype == 'NLC': - raise NotImplementedError('NLC') - else: - wv[i,0] *= .5 - wv[i,4] *= .5 - vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[i,:4]).T) - vmat_tmp+= _tau_dot(ao, ao, wv[i,4]) - add_sparse(vmat[i], vmat_tmp, mask) + if xctype == 'GGA': + wv[:,0] *= .5 + if xctype == 'MGGA': + wv[:,0] *= .5 + wv[:,4] *= .5 + + eval_vxc(ao, wv, mask, vmat, xctype_code) t1 = log.timer_debug2('integration', *t1) ao = rho1 = None @@ -1147,15 +1203,15 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None, def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, verbose=None, hermi=1, device_id=0): with cupy.cuda.Device(device_id), _streams[device_id]: - if dms is not None: + if dms is not None: dma, dmb = dms dma = cupy.asarray(dma) dmb = cupy.asarray(dmb) - if mo1 is not None: + if mo1 is not None: mo1a, mo1b = mo1 mo1a = cupy.asarray(mo1a) mo1b = cupy.asarray(mo1b) - if occ_coeff is not None: + if occ_coeff is not None: occ_coeff_a, occ_coeff_b = occ_coeff occ_coeff_a = cupy.asarray(occ_coeff_a) occ_coeff_b = 
cupy.asarray(occ_coeff_b) @@ -1164,6 +1220,8 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, assert isinstance(verbose, int) log = logger.new_logger(mol, verbose) xctype = ni._xc_type(xc_code) + if xctype == 'NLC': + raise NotImplementedError('NLC') opt = getattr(ni, 'gdftopt', None) _sorted_mol = opt.mol @@ -1177,6 +1235,13 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, else: ao_deriv = 1 + if xctype in ("LDA", "HF"): + xctype_code = 0 + elif xctype in ("GGA", "NLC"): + xctype_code = 1 + else: + xctype_code = 2 + ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + num_devices - 1) // num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE @@ -1187,10 +1252,10 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, p0 = p1 = grid_start t1 = t0 = log.init_timer() - for ao, mask, weights, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + for ao, mask, weights, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=None, grid_range=(grid_start, grid_end)): - + t0 = log.init_timer() p0, p1 = p1, p1+len(weights) # precompute fxc_w @@ -1221,25 +1286,17 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, wv_a+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,0]) wv_b = contract('xg,xyg->yg', rho1a[i], fxc_w[0,:,1]) wv_b+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,1]) - if xctype == 'LDA': - va = ao.dot(_scale_ao(ao, wv_a[0]).T) - vb = ao.dot(_scale_ao(ao, wv_b[0]).T) - elif xctype == 'GGA': + + if xctype == 'GGA': wv_a[0] *= .5 # for transpose_sum at the end wv_b[0] *= .5 - va = ao[0].dot(_scale_ao(ao, wv_a).T) - vb = ao[0].dot(_scale_ao(ao, wv_b).T) - elif xctype == 'NLC': - raise NotImplementedError('NLC') - else: + if xctype == 'MGGA': wv_a[[0,4]] *= .5 # for transpose_sum at the end wv_b[[0,4]] *= .5 - va = ao[0].dot(_scale_ao(ao[:4], wv_a[:4]).T) - vb = ao[0].dot(_scale_ao(ao[:4], wv_b[:4]).T) - 
va += _tau_dot(ao, ao, wv_a[4]) - vb += _tau_dot(ao, ao, wv_b[4]) - add_sparse(vmata[i], va, mask) - add_sparse(vmatb[i], vb, mask) + + eval_vxc(ao, wv_a, mask, vmata[i], xctype_code) + eval_vxc(ao, wv_b, mask, vmatb[i], xctype_code) + t1 = log.timer_debug2('integration', *t1) t0 = log.timer_debug1('vxc', *t0) return vmata, vmatb @@ -1254,7 +1311,7 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if opt is None or mol not in [opt.mol, opt._sorted_mol]: ni.build(mol, grids.coords) opt = ni.gdftopt - + nao, nao0 = opt.coeff.shape dma, dmb = dms dm_shape = dma.shape @@ -1284,13 +1341,13 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= ni, mol, grids, xc_code, fxc, (dma, dmb), mo1, occ_coeff, verbose=log.verbose, hermi=hermi, device_id=device_id) futures.append(future) - vmata_dist = [] + vmata_dist = [] vmatb_dist = [] for future in futures: vmata, vmatb = future.result() vmata_dist.append(vmata) vmatb_dist.append(vmatb) - + vmata = reduce_to_device(vmata_dist, inplace=True) vmatb = reduce_to_device(vmatb_dist, inplace=True) @@ -1364,13 +1421,13 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) ao_deriv = 1 vvrho = [] - for ao, idx, weight, coords \ + for ao, idx, weight, _ \ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): if mo_coeff is None: rho = eval_rho(_sorted_mol, ao, dms[0][idx[:,None],idx], xctype='GGA', hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] - rho = eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, None, 'GGA') + rho = eval_rho2_fast(ao, mo_coeff_mask, mo_occ, None, 'GGA') vvrho.append(rho) rho = cupy.hstack(vvrho) @@ -1393,13 +1450,12 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vmat = cupy.zeros((nao,nao)) p1 = 0 - for ao, mask, weight, coords \ + for ao, mask, weight, _ \ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): p0, p1 = 
p1, p1 + weight.size wv = vv_vxc[:,p0:p1] * weight wv[0] *= .5 - aow = _scale_ao(ao, wv) - add_sparse(vmat, ao[0].dot(aow.T), mask) + eval_vxc(ao, wv, mask, vmat, 0) t1 = log.timer_debug1('integration', *t1) transpose_sum(vmat) @@ -1432,10 +1488,10 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) rho = [] t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, - max_memory=max_memory): + for ao_mask, idx, _, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): mo_coeff_mask = mo_coeff[idx,:] - rho_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) + rho_slice = eval_rho2_fast(ao_mask, mo_coeff_mask, mo_occ, None, xctype) rho.append(rho_slice) t1 = log.timer_debug2('eval rho slice', *t1) rho = cupy.hstack(rho) @@ -1449,11 +1505,11 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, rhoa = [] rhob = [] t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + for ao_mask, idx, _, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): mo_coeff_mask = mo_coeff[:,idx,:] - rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) - rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) + rhoa_slice = eval_rho2_fast(ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) + rhob_slice = eval_rho2_fast(ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) rhoa.append(rhoa_slice) rhob.append(rhob_slice) t1 = log.timer_debug2('eval rho in fxc', *t1) @@ -1678,9 +1734,9 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, ni.non0ao_idx[lookup_key] = _sparse_index(_sorted_mol, coords, opt.l_ctr_offsets) pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[lookup_key] - if len(idx) == 0: + if len(idx) == 0: continue 
- + ao_mask = eval_ao( _sorted_mol, coords, deriv, nao_slice=len(idx), @@ -1874,23 +1930,28 @@ def _make_pairs2shls_idx(pair_mask, l_bas_loc, hermi=0): pair2bra + pair2ket).astype(np.int32).reshape(2,-1) return bas_pair2shls, bas_pairs_locs -def _contract_rho(bra, ket, rho=None): +def _contract_rho(bra, ket, rho=None, alpha=0.0, beta=1.0): + """ rho = alpha * rho + beta * contract('ig,ig->g', bra, ket) + """ if bra.flags.c_contiguous and ket.flags.c_contiguous: assert bra.shape == ket.shape nao, ngrids = bra.shape if rho is None: rho = cupy.empty(ngrids) stream = cupy.cuda.get_current_stream() + err = libgdft.GDFTcontract_rho( ctypes.cast(stream.ptr, ctypes.c_void_p), ctypes.cast(rho.data.ptr, ctypes.c_void_p), ctypes.cast(bra.data.ptr, ctypes.c_void_p), ctypes.cast(ket.data.ptr, ctypes.c_void_p), + ctypes.c_double(alpha), ctypes.c_double(beta), ctypes.c_int(ngrids), ctypes.c_int(nao)) if err != 0: - raise RuntimeError('CUDA Error') + raise RuntimeError('CUDA Error in _contract_rho kernel') else: - rho = contract('ig,ig->g', bra, ket) + raise NotImplementedError + #rho = contract('ig,ig->g', bra, ket) return rho def _contract_rho1(bra, ket, rho=None): @@ -1902,6 +1963,8 @@ def _contract_rho1(bra, ket, rho=None): if rho is None: rho = cupy.empty([nvar, ngrids]) + alpha = 0.0 + beta = 1.0 for i in range(nvar): stream = cupy.cuda.get_current_stream() err = libgdft.GDFTcontract_rho( @@ -1909,9 +1972,10 @@ def _contract_rho1(bra, ket, rho=None): ctypes.cast(rho[i].data.ptr, ctypes.c_void_p), ctypes.cast(bra[i].data.ptr, ctypes.c_void_p), ctypes.cast(ket.data.ptr, ctypes.c_void_p), + ctypes.c_double(alpha), ctypes.c_double(beta), ctypes.c_int(ngrids), ctypes.c_int(nao)) if err != 0: - raise RuntimeError('CUDA Error') + raise RuntimeError('CUDA Error in GDFTcontract_rho') return rho def _contract_rho_gga(bra, ket, rho=None): @@ -1929,7 +1993,7 @@ def _contract_rho_gga(bra, ket, rho=None): ctypes.cast(ket.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), ctypes.c_int(nao)) 
if err != 0: - raise RuntimeError('CUDA Error') + raise RuntimeError('CUDA Error in GDFTcontract_rho_gga') return rho def _contract_rho_mgga(bra, ket, rho=None): @@ -1947,7 +2011,7 @@ def _contract_rho_mgga(bra, ket, rho=None): ctypes.cast(ket.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), ctypes.c_int(nao)) if err != 0: - raise RuntimeError('CUDA Error') + raise RuntimeError('CUDA Error in GDFTcontract_rho_mgga') return rho def _dot_ao_dm(mol, ao, dm, non0tab, shls_slice, ao_loc, out=None): @@ -2052,7 +2116,7 @@ def _scale_ao(ao, wv, out=None): ctypes.cast(wv.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), ctypes.c_int(nao), ctypes.c_int(nvar)) if err != 0: - raise RuntimeError('CUDA Error') + raise RuntimeError('CUDA Error in GDFTscale_ao') return out def _tau_dot(bra, ket, wv): diff --git a/gpu4pyscf/dft/tests/test_numint.py b/gpu4pyscf/dft/tests/test_numint.py index d4fc1de16..88a16d4e2 100644 --- a/gpu4pyscf/dft/tests/test_numint.py +++ b/gpu4pyscf/dft/tests/test_numint.py @@ -182,7 +182,7 @@ def test_uks_fxc_mgga(self): self._check_uks_fxc(MGGA_M06, hermi=1) ''' # Not implemented yet - + def test_rks_fxc_st_lda(self): self._check_rks_fxc_st('lda', -0.06358425564270553) diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index 25f43ef1a..e289e2a1c 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -166,7 +166,8 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, grid_range=(grid_start, grid_end)): for idm in range(nset): mo_coeff_mask = mo_coeff[idx,:] - rho = numint.eval_rho2(_sorted_mol, ao_mask[0], mo_coeff_mask, mo_occ, None, xctype) + #rho = numint.eval_rho2(_sorted_mol, ao_mask[0], mo_coeff_mask, mo_occ, None, xctype) + rho = numint.eval_rho2_fast(ao_mask[0], mo_coeff_mask, mo_occ, None, xctype) vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] wv = weight * vxc[0] aow = numint._scale_ao(ao_mask[0], wv) @@ -178,7 +179,8 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, 
grid_range=(grid_start, grid_end)): for idm in range(nset): mo_coeff_mask = mo_coeff[idx,:] - rho = numint.eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype) + #rho = numint.eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype) + rho = numint.eval_rho2_fast(ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype) vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] wv = weight * vxc wv[0] *= .5 @@ -193,7 +195,8 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, grid_range=(grid_start, grid_end)): for idm in range(nset): mo_coeff_mask = mo_coeff[idx,:] - rho = numint.eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False) + #rho = numint.eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False) + rho = numint.eval_rho2_fast(ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype) vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] wv = weight * vxc wv[0] *= .5 @@ -269,10 +272,11 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ao_deriv = 2 vvrho = [] - for ao_mask, mask, weight, coords \ + for ao_mask, mask, weight, _ \ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): mo_coeff_mask = mo_coeff[mask] - rho = numint.eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False) + #rho = numint.eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False) + rho = numint.eval_rho2_fast(ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False) vvrho.append(rho) rho = cupy.hstack(vvrho) @@ -282,7 +286,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vmat = cupy.zeros((3,nao,nao)) p1 = 0 - for ao_mask, mask, weight, coords \ + for ao_mask, mask, weight, _ \ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory): p0, p1 = p1, p1 + weight.size wv = vv_vxc[:,p0:p1] * weight diff --git a/gpu4pyscf/grad/uks.py 
b/gpu4pyscf/grad/uks.py index 50e8fd055..7e7bcd646 100644 --- a/gpu4pyscf/grad/uks.py +++ b/gpu4pyscf/grad/uks.py @@ -25,7 +25,7 @@ from gpu4pyscf.grad import uhf as uhf_grad from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint, xc_deriv -from gpu4pyscf.dft.numint import eval_rho2 +from gpu4pyscf.dft.numint import eval_rho2, eval_rho2_fast from gpu4pyscf.lib.cupy_helper import ( contract, get_avail_mem, add_sparse, tag_array, reduce_to_device) from gpu4pyscf.lib import logger @@ -153,7 +153,7 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, opt = ni.gdftopt _sorted_mol = opt._sorted_mol nset = dms.shape[0] - + ngrids_glob = grids.coords.shape[0] grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id) ngrids_local = grid_end - grid_start @@ -162,11 +162,14 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, vmat = cupy.zeros((nset,3,nao,nao)) if xctype == 'LDA': ao_deriv = 1 - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None, + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None, grid_range=(grid_start, grid_end)): mo_coeff_mask = mo_coeff[:,idx,:] - rho_a = eval_rho2(_sorted_mol, ao_mask[0], mo_coeff_mask[0], mo_occ[0], None, xctype) - rho_b = eval_rho2(_sorted_mol, ao_mask[0], mo_coeff_mask[1], mo_occ[1], None, xctype) + #rho_a = eval_rho2(_sorted_mol, ao_mask[0], mo_coeff_mask[0], mo_occ[0], None, xctype) + #rho_b = eval_rho2(_sorted_mol, ao_mask[0], mo_coeff_mask[1], mo_occ[1], None, xctype) + + rho_a = eval_rho2_fast(ao_mask[0], mo_coeff_mask[0], mo_occ[0], None, xctype) + rho_b = eval_rho2_fast(ao_mask[0], mo_coeff_mask[1], mo_occ[1], None, xctype) vxc = ni.eval_xc_eff(xc_code, cupy.array([rho_a,rho_b]), 1, xctype=xctype)[1] wv = weight * vxc[:,0] @@ -181,8 +184,10 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None, 
grid_range=(grid_start, grid_end)): mo_coeff_mask = mo_coeff[:,idx,:] - rho_a = eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask[0], mo_occ[0], None, xctype) - rho_b = eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask[1], mo_occ[1], None, xctype) + #rho_a = eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask[0], mo_occ[0], None, xctype) + #rho_b = eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask[1], mo_occ[1], None, xctype) + rho_a = eval_rho2_fast(ao_mask[:4], mo_coeff_mask[0], mo_occ[0], None, xctype) + rho_b = eval_rho2_fast(ao_mask[:4], mo_coeff_mask[1], mo_occ[1], None, xctype) vxc = ni.eval_xc_eff(xc_code, cupy.array([rho_a,rho_b]), 1, xctype=xctype)[1] wv = weight * vxc @@ -199,8 +204,10 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None, grid_range=(grid_start, grid_end)): mo_coeff_mask = mo_coeff[:,idx,:] - rho_a = eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask[0], mo_occ[0], None, xctype) - rho_b = eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask[1], mo_occ[1], None, xctype) + #rho_a = eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask[0], mo_occ[0], None, xctype) + #rho_b = eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask[1], mo_occ[1], None, xctype) + rho_a = eval_rho2_fast(ao_mask[:4], mo_coeff_mask[0], mo_occ[0], None, xctype) + rho_b = eval_rho2_fast(ao_mask[:4], mo_coeff_mask[1], mo_occ[1], None, xctype) vxc = ni.eval_xc_eff(xc_code, cupy.array([rho_a,rho_b]), 1, xctype=xctype)[1] wv = weight * vxc wv[:,0] *= .5 @@ -381,12 +388,12 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, relativity=0, he ao_deriv = 2 vvrho = [] - for ao_mask, mask, weight, coords \ + for ao_mask, mask, weight, _ \ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): mo_coeff_mask_0 = mo_coeff_0[mask] mo_coeff_mask_1 = mo_coeff_1[mask] - rhoa = eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask_0, mo_occ[0], None, xctype) - rhob = 
eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask_1, mo_occ[1], None, xctype) + rhoa = eval_rho2_fast(ao_mask[:4], mo_coeff_mask_0, mo_occ[0], None, xctype) + rhob = eval_rho2_fast(ao_mask[:4], mo_coeff_mask_1, mo_occ[1], None, xctype) vvrho.append(rhoa + rhob) rho = cupy.hstack(vvrho) @@ -396,7 +403,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, relativity=0, he vmat = cupy.zeros((3,nao,nao)) p1 = 0 - for ao_mask, mask, weight, coords \ + for ao_mask, mask, weight, _ \ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory): p0, p1 = p1, p1 + weight.size wv = vv_vxc[:,p0:p1] * weight diff --git a/gpu4pyscf/lib/gdft/CMakeLists.txt b/gpu4pyscf/lib/gdft/CMakeLists.txt index d1bcadb9c..eec20f726 100644 --- a/gpu4pyscf/lib/gdft/CMakeLists.txt +++ b/gpu4pyscf/lib/gdft/CMakeLists.txt @@ -16,8 +16,9 @@ add_library(gdft SHARED nr_eval_gto.cu - contract_rho.cu + #contract_rho.cu gen_grids.cu + numint.cu nr_numint_sparse.cu vv10.cu libxc.cu @@ -25,3 +26,6 @@ add_library(gdft SHARED set_target_properties(gdft PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}) + +# Link cuBLAS +target_link_libraries(gdft PRIVATE cublas) diff --git a/gpu4pyscf/lib/gdft/contract_rho.cu b/gpu4pyscf/lib/gdft/contract_rho.cu index b73658a37..95b419f35 100644 --- a/gpu4pyscf/lib/gdft/contract_rho.cu +++ b/gpu4pyscf/lib/gdft/contract_rho.cu @@ -21,10 +21,15 @@ #include #include #include "contract_rho.cuh" -// TODO: improve this? 
+ +#define THREADS 32 +#define BLOCK_DIM 32 + __global__ -void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids, int nao) +void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, double alpha, double beta, + int ngrids, int nao) { + // rho = alpha * rho + beta * int grid_id = blockIdx.x * blockDim.x + threadIdx.x; const bool active = grid_id < ngrids; size_t Ngrids = ngrids; @@ -49,7 +54,7 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids, if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); if (iy == 0 && active) { - rho[grid_id] = buf[ix]; + rho[grid_id] = alpha * rho[grid_id] + beta * buf[ix]; } } @@ -211,7 +216,7 @@ void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngr } __global__ -void GDFTscale_ao_kernel(double *out, double *ket, double *wv, +void GDFTscale_ao_kernel(double *out, const double *ket, const double *wv, int ngrids, int nao, int nvar) { int grid_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -276,11 +281,12 @@ void GDFT_make_dR_dao_w_kernel(double *out, double *ket, double *wv, extern "C"{ __host__ -int GDFTcontract_rho(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao) +int GDFTcontract_rho(cudaStream_t stream, double *rho, double *bra, double *ket, double alpha, double beta, + int ngrids, int nao) { dim3 threads(BLKSIZEX, BLKSIZEY); dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); - GDFTcontract_rho_kernel<<>>(rho, bra, ket, ngrids, nao); + GDFTcontract_rho_kernel<<>>(rho, bra, ket, alpha, beta, ngrids, nao); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "CUDA Error of GDFTcontract_rho: %s\n", cudaGetErrorString(err)); @@ -342,7 +348,7 @@ int GDFT_make_dR_dao_w(cudaStream_t stream, double *out, double *ket, double *wv return 0; } -int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv, +int GDFTscale_ao(cudaStream_t stream, double *out, const 
/*
 * Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// NOTE(review): the original #include names were lost in extraction
// ("#include" lines with empty <...>); this list is reconstructed from
// the symbols used below (fprintf, cudaStream_t, cublasDgemm,
// GDFTcontract_rho / GDFTscale_ao) — verify against the build.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "contract_rho.cu"

#define THREADS 32
#define BLOCK_DIM 32

// Scatter-add `count` stacked dense m x m blocks of `b` into the n x n
// matrix `a` at the rows/columns selected by `indices` (length m):
//     a[indices[row], indices[col]] += b[row, col]
// for each of the `count` stacked matrices.  One thread per (row, col).
__global__
void _add_sparse(double *a, double *b, int *indices, int n, int m, int count)
{
    int row = blockIdx.x * BLOCK_DIM + threadIdx.x;
    int col = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if (row >= m || col >= m){
        return;
    }
    // size_t arithmetic: indices[row]*n and i*n*n overflow 32-bit int
    // once n exceeds ~46341 (the original used plain int here).
    size_t idx_a = (size_t)indices[row] * n + indices[col];
    size_t idx_b = (size_t)row * m + col;
    for (int i = 0; i < count; i++){
        a[idx_a + (size_t)i*n*n] += b[idx_b + (size_t)i*m*m];
    }
}


extern "C"{
__host__
int add_sparse(cudaStream_t stream, double *a, double *b, int *indices, int n, int m, int count){
    // Launch geometry must match the kernel's BLOCK_DIM indexing.  The
    // original mixed THREADS (launch) with BLOCK_DIM (kernel); they are
    // both 32, but using one macro keeps them in sync.
    int ntile = (m + BLOCK_DIM - 1) / BLOCK_DIM;
    dim3 threads(BLOCK_DIM, BLOCK_DIM);
    dim3 blocks(ntile, ntile);
    _add_sparse<<<blocks, threads, 0, stream>>>(a, b, indices, n, m, count);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        // Report the failure instead of returning 1 silently.
        fprintf(stderr, "CUDA Error of add_sparse: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

__host__
int GDFTeval_rho2(cudaStream_t stream, cublasHandle_t handle,
                  const double* ao, const double* mocc, int ngrids, int nocc, int nao,
                  const int xctype_code, double* buf1, double* buf2, double* rho)
{
    /* Evaluate the electron density (and its gradient / kinetic-energy
       density) on a batch of grid points from occupied MO coefficients.

       mocc : nao x nocc (C-order)
       ao   : AO values; component i is the ngrids-contiguous slab
              starting at ao + i*nao*ngrids
       rho  : 1 x ngrids for LDA   (xctype_code == 0)
              4 x ngrids for GGA   (xctype_code == 1)
              5 x ngrids for mGGA  (xctype_code  > 1)
       buf1, buf2 : nocc x ngrids scratch buffers.

       Returns 0 on success, 1 on any cuBLAS/CUDA failure (the original
       ignored the cublasDgemm and GDFTcontract_rho return values).
    */
    double alpha = 1.0;
    double beta = 0.0;
    cublasStatus_t stat;

    // LDA part:
    // buf1 = np.dot(mocc.T, ao) in C-order (np.dot(ao.T, mocc) in F-order)
    stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T,
                       ngrids, nocc, nao,
                       &alpha,
                       ao, ngrids,
                       mocc, nocc,
                       &beta,
                       buf1, ngrids);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS Error of GDFTeval_rho2\n");
        return 1;
    }
    // rho[0] = sum_occ buf1 * buf1
    if (GDFTcontract_rho(stream, rho, buf1, buf1, 0.0, 1.0, ngrids, nocc) != 0) {
        return 1;
    }

    if (xctype_code > 0) {
        // GGA: rho[i] = 2 * sum_occ buf1 * buf2 for the x,y,z derivatives
        double contract_factor = 0.0;   // first tau term overwrites rho[4]
        for (int i = 1; i < 4; i++){
            stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T,
                               ngrids, nocc, nao,
                               &alpha,
                               // size_t offset: i*nao*ngrids overflows int
                               // for large grids
                               ao + (size_t)i*nao*ngrids, ngrids,
                               mocc, nocc,
                               &beta,
                               buf2, ngrids);
            if (stat != CUBLAS_STATUS_SUCCESS) {
                fprintf(stderr, "cuBLAS Error of GDFTeval_rho2\n");
                return 1;
            }
            if (GDFTcontract_rho(stream, rho + (size_t)i*ngrids, buf1, buf2,
                                 0.0, 2.0, ngrids, nocc) != 0) {
                return 1;
            }

            if (xctype_code > 1) {
                // mGGA: rho[4] += 0.5 * sum_occ buf2 * buf2
                if (GDFTcontract_rho(stream, rho + (size_t)4*ngrids, buf2, buf2,
                                     contract_factor, 0.5, ngrids, nocc) != 0) {
                    return 1;
                }
                contract_factor = 1.0; // accumulate onto the previous result
            }
        }
    }

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        // Fixed: the original message blamed GDFTscale_ao (copy-paste).
        fprintf(stderr, "CUDA Error of GDFTeval_rho2: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

__host__
int GDFTeval_vxc(cudaStream_t stream, cublasHandle_t handle, int xctype_code,
                 const double* ao, const double* wv, double *buf1, double *buf2,
                 int ngrids, int nao_mask, int nao, int* idx, double* vmat){
    /* Build one masked vxc block and scatter-add it into the full
       nao x nao vmat.

       ao  : masked AO values, component i at ao + i*nao_mask*ngrids
       wv  : xc weights; wv + 4*ngrids holds the tau weights for mGGA
       idx : nao_mask AO indices mapping the block into vmat

       Returns 0 on success, non-zero on failure.  Fixed: the original
       assigned the GDFTscale_ao return values to `err` but never checked
       them before overwriting, so launch failures were silently lost.
    */
    int err;

    // buf1 = wv-weighted AOs; GGA/mGGA weight 4 components at once
    if (xctype_code == 0){
        err = GDFTscale_ao(stream, buf1, ao, wv, ngrids, nao_mask, 1);
    } else {
        err = GDFTscale_ao(stream, buf1, ao, wv, ngrids, nao_mask, 4);
    }
    if (err != 0) {
        return err;
    }

    // LDA/GGA part:
    // buf2 = ao.dot(buf1.T) in C-order
    double alpha = 1.0;
    double beta = 0.0;
    cublasStatus_t stat = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                                      nao_mask, nao_mask, ngrids,
                                      &alpha,
                                      ao, ngrids,
                                      buf1, ngrids,
                                      &beta,
                                      buf2, nao_mask);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS Error of GDFTeval_vxc\n");
        return 1;
    }

    // mGGA tau contribution:
    // buf2 += 0.5 * ao[1].dot((wv[4] * ao[1]).T)
    //       + 0.5 * ao[2].dot((wv[4] * ao[2]).T)
    //       + 0.5 * ao[3].dot((wv[4] * ao[3]).T)
    if (xctype_code > 1){
        alpha = 0.5;
        beta = 1.0;   // accumulate onto the LDA/GGA part already in buf2
        for (int i = 1; i < 4; i++){
            // size_t offset: i*nao_mask*ngrids overflows int for large grids
            const double *ao_i = ao + (size_t)i*nao_mask*ngrids;
            err = GDFTscale_ao(stream, buf1, ao_i, wv + (size_t)4*ngrids,
                               ngrids, nao_mask, 1);
            if (err != 0) {
                return err;
            }
            stat = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                               nao_mask, nao_mask, ngrids,
                               &alpha,
                               ao_i, ngrids,
                               buf1, ngrids,
                               &beta,
                               buf2, nao_mask);
            if (stat != CUBLAS_STATUS_SUCCESS) {
                fprintf(stderr, "cuBLAS Error of GDFTeval_vxc\n");
                return 1;
            }
        }
    }

    // Scatter-add the nao_mask x nao_mask block into the nao x nao vmat
    return add_sparse(stream, vmat, buf2, idx, nao, nao_mask, 1);
}

}  // extern "C"