diff --git a/benchmarks/benchmark_eval_rho.py b/benchmarks/benchmark_eval_rho.py new file mode 100644 index 000000000..5627caa67 --- /dev/null +++ b/benchmarks/benchmark_eval_rho.py @@ -0,0 +1,124 @@ +import pyscf +from gpu4pyscf.scf import hf + +mol = pyscf.M(atom = 'molecules/water_clusters/008.xyz') +mol.basis = { + 'H': [[0, (0.01, 0.0070011547)]], + 'O': [[0, (0.01, 0.0070011547)]]} +mol.build() + +vhfopt = hf._VHFOpt(mol, 'int2e').build() + +from gpu4pyscf.lib.cupy_helper import load_library +libgdft = load_library('libgdft') + +import numpy as np +import cupy +from gpu4pyscf.dft.gen_grid import Grids +grids = Grids(mol) +grids.build() +coords = cupy.asarray(grids.coords[:256*256*16], order='F')#.copy() + +ngrids, _ = coords.shape + +ao_loc = mol.ao_loc_nr() +ao_loc = cupy.asarray(ao_loc, dtype=np.int32) +nao = mol.nao +dm = cupy.random.rand(nao,nao) +dm += dm.T +print(dm[:4,:4]) +print(vhfopt.bas_pair2shls[:,:1000]) + +import ctypes +from pyscf import gto +from gpu4pyscf.dft import numint +ni = numint.NumInt().build(mol, coords) +env = cupy.asarray(ni.gdftopt._sorted_mol._env) +ish, jsh = vhfopt.bas_pair2shls +diag_idx = ish == jsh +npairs = len(ish) +bas = ni.gdftopt._sorted_mol._bas +exp_sparse = cupy.empty([npairs, 2], order='F') +bas_exp = bas[ish, gto.PTR_EXP] +exp_sparse[:,0] = cupy.asarray(env[bas_exp]) +bas_exp = bas[jsh, gto.PTR_EXP] +exp_sparse[:,1] = cupy.asarray(env[bas_exp]) + +coef_sparse = cupy.empty([npairs, 2], order='F') +bas_coef = bas[ish, gto.PTR_COEFF] +coef_sparse[:,0] = cupy.asarray(env[bas_coef]) +bas_coef = bas[jsh, gto.PTR_COEFF] +coef_sparse[:,1] = cupy.asarray(env[bas_coef]) + +atm_coords = cupy.asarray(mol.atom_coords()) +coord_pairs = cupy.empty([npairs, 6], order='F') +coord_pairs[:,:3] = atm_coords[ish,:] +coord_pairs[:,3:] = atm_coords[jsh,:] + +dm_sparse = dm[ish, jsh] * coef_sparse[:,0] * coef_sparse[:,1] +dm_sparse[diag_idx] *= 0.5 + +coords_fp32 = coords.astype(np.float32) +exp_sparse_fp32 = exp_sparse.astype(np.float32) +coef_sparse_fp32 = coef_sparse.astype(np.float32) +coord_pairs_fp32 = coord_pairs.astype(np.float32) +dm_sparse_fp32 = dm_sparse.astype(np.float32) + +def eval_rho0(dm): + with ni.gdftopt.gdft_envs_cache(): + rho = cupy.empty([4,ngrids]) + libgdft.eval_rho_fp32( + vhfopt.bpcache, + ctypes.cast(coords.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(rho.data.ptr, ctypes.c_void_p), + ctypes.cast(exp_sparse_fp32.data.ptr, ctypes.c_void_p), + ctypes.cast(coef_sparse_fp32.data.ptr, ctypes.c_void_p), + ctypes.cast(coord_pairs_fp32.data.ptr, ctypes.c_void_p), + ctypes.c_int(nao), + ctypes.cast(dm_sparse_fp32.data.ptr, ctypes.c_void_p)) + return rho + return eval_rho0 + +def eval_rho1(dm): + rho = cupy.empty([4,ngrids]) + with ni.gdftopt.gdft_envs_cache(): + libgdft.eval_rho_fp64( + vhfopt.bpcache, + ctypes.cast(coords.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(rho.data.ptr, ctypes.c_void_p), + ctypes.cast(exp_sparse.data.ptr, ctypes.c_void_p), + ctypes.cast(coef_sparse.data.ptr, ctypes.c_void_p), + ctypes.cast(coord_pairs.data.ptr, ctypes.c_void_p), + ctypes.c_int(nao), + ctypes.cast(dm_sparse.data.ptr, ctypes.c_void_p)) + return rho + +def eval_rho2(dm): + ao = numint.eval_ao(ni, ni.gdftopt._sorted_mol, coords, deriv=1) + rho0 = numint.eval_rho(ni.gdftopt._sorted_mol, ao, dm, xctype='GGA') + return rho0 + +from cupyx import profiler +perf = profiler.benchmark(eval_rho0, (dm_sparse,), n_repeat=20, n_warmup=3) +print('with eval_rho0', perf.gpu_times.mean()) +# 0.130s on V100 +# 0.115s on A10 + +perf = profiler.benchmark(eval_rho1, (dm_sparse,), n_repeat=20, n_warmup=3) +print('with eval_rho1', perf.gpu_times.mean()) +# 0.324s on V100 +# 5.401s on A10 + +perf = profiler.benchmark(eval_rho2, (dm,), n_repeat=20, n_warmup=3) +print('with eval_rho2', perf.gpu_times.mean()) +# 0.0858s on v100 +# 0.723s on A10 + +rho0 = eval_rho0(dm) +rho1 = eval_rho1(dm) +print(rho0[:4,:3]) +print(rho1[:4,:3]) + +print(cupy.max(rho0 - rho1), cupy.max(rho0)) diff --git a/gpu4pyscf/lib/gdft/nr_eval_gto.cu b/gpu4pyscf/lib/gdft/nr_eval_gto.cu index 12b9fe9a7..57da02b19 100644 --- a/gpu4pyscf/lib/gdft/nr_eval_gto.cu +++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cu @@ -27,6 +27,7 @@ #include "gint/cuda_alloc.cuh" #include "nr_eval_gto.cuh" #include "contract_rho.cuh" +#include "nr_eval_rho.cu" #define NG_PER_BLOCK 256 #define LMAX 8 @@ -1907,4 +1908,54 @@ int GDFTscreen_index(cudaStream_t stream, int *non0shl_idx, double cutoff, return 0; } +__host__ +int eval_rho_fp64(BasisProdCache *bpcache, double *grids, int ngrids, + double *rho, double *exp_sparse, double *coef_sparse, double *coord_pairs, + int nao, double *dm_sparse) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + int *bas_pairs_locs = bpcache->bas_pairs_locs; + int npairs = bas_pairs_locs[1]; + //printf("%d \n", npairs); + + BasOffsets offsets; + offsets.gridx = grids;//d_grids; + offsets.ngrids = ngrids; + offsets.data = rho; + offsets.nao = nao; + offsets.nprim = 1; + offsets.fac = CINTcommon_fac_sp(0); + dim3 threads(NG_PER_BLOCK); + dim3 blocks((ngrids+NG_PER_BLOCK-1)/NG_PER_BLOCK); + + _eval_rho<<>> (offsets, dm_sparse, npairs, exp_sparse, coef_sparse, coord_pairs); + + return 1; +} + +__host__ +int eval_rho_fp32(BasisProdCache *bpcache, double *grids, int ngrids, + double *rho, float *exp_sparse, float *coef_sparse, float *coord_pairs, + int nao, float *dm_sparse) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + int *bas_pairs_locs = bpcache->bas_pairs_locs; + int npairs = bas_pairs_locs[1]; + //printf("%d \n", npairs); + + BasOffsets offsets; + offsets.gridx = grids;//d_grids; + offsets.ngrids = ngrids; + offsets.data = rho; + offsets.nao = nao; + offsets.nprim = 1; + offsets.fac = CINTcommon_fac_sp(0); + dim3 threads(NG_PER_BLOCK); + dim3 blocks((ngrids+NG_PER_BLOCK-1)/NG_PER_BLOCK); + + _eval_rho<<>> (offsets, dm_sparse, npairs, exp_sparse, coef_sparse, coord_pairs); + + return 1; +} + } diff --git a/gpu4pyscf/lib/gdft/nr_eval_rho.cu b/gpu4pyscf/lib/gdft/nr_eval_rho.cu new file mode 100644 index 000000000..0a342b742 --- /dev/null +++ b/gpu4pyscf/lib/gdft/nr_eval_rho.cu @@ -0,0 +1,119 @@ + +__constant__ BasisProdCache c_bpcache; +#define NG_PER_BLOCK 256 +template +__global__ +static void _eval_rho(BasOffsets offsets, T2 *dm_sparse, int npairs, + T2 *exp_sparse, T2 *coef_sparse, T2 *coord_pairs){ + int ngrids = offsets.ngrids; + int grid_id = blockIdx.x * blockDim.x + threadIdx.x; + + double* __restrict__ rho = offsets.data; + double* __restrict__ rhox = offsets.data + 1 * ngrids; + double* __restrict__ rhoy = offsets.data + 2 * ngrids; + double* __restrict__ rhoz = offsets.data + 3 * ngrids; + + T2* __restrict__ xi = coord_pairs; + T2* __restrict__ yi = coord_pairs + npairs; + T2* __restrict__ zi = coord_pairs + 2*npairs; + T2* __restrict__ xj = coord_pairs + 3*npairs; + T2* __restrict__ yj = coord_pairs + 4*npairs; + T2* __restrict__ zj = coord_pairs + 5*npairs; + + T2* __restrict__ ei = exp_sparse; + T2* __restrict__ ej = exp_sparse + npairs; + + double *gridx = offsets.gridx; + double *gridy = offsets.gridx + ngrids; + double *gridz = offsets.gridx + ngrids * 2; + T2 xg = 0.0; + T2 yg = 0.0; + T2 zg = 0.0; + + if (grid_id < ngrids) { + xg = gridx[grid_id]; + yg = gridy[grid_id]; + zg = gridz[grid_id]; + } + + T2 rho_i = 0.0; + T2 rhox_i = 0.0; + T2 rhoy_i = 0.0; + T2 rhoz_i = 0.0; + + T2 __shared__ dm_shared[NG_PER_BLOCK]; + T2 __shared__ xi_shared[NG_PER_BLOCK]; + T2 __shared__ yi_shared[NG_PER_BLOCK]; + T2 __shared__ zi_shared[NG_PER_BLOCK]; + + T2 __shared__ xj_shared[NG_PER_BLOCK]; + T2 __shared__ yj_shared[NG_PER_BLOCK]; + T2 __shared__ zj_shared[NG_PER_BLOCK]; + + T2 __shared__ ei_shared[NG_PER_BLOCK]; + T2 __shared__ ej_shared[NG_PER_BLOCK]; + T2 fac_ij = offsets.fac * offsets.fac; + for (int block_id = 0; block_id < npairs; block_id+=NG_PER_BLOCK){ + int tx = threadIdx.x; + int bas_ij = block_id + tx; + + if (bas_ij < npairs){ + dm_shared[tx] = dm_sparse[bas_ij] * fac_ij; + xi_shared[tx] = xi[bas_ij]; + yi_shared[tx] = yi[bas_ij]; + zi_shared[tx] = zi[bas_ij]; + + xj_shared[tx] = xj[bas_ij]; + yj_shared[tx] = yj[bas_ij]; + zj_shared[tx] = zj[bas_ij]; + + ei_shared[tx] = ei[bas_ij]; + ej_shared[tx] = ej[bas_ij]; + } + __syncthreads(); + for (int task_id = 0; task_id < NG_PER_BLOCK && task_id + block_id < npairs; ++task_id){ + /* shell i */ + T2 rx = xg - xi_shared[task_id]; + T2 ry = yg - yi_shared[task_id]; + T2 rz = zg - zi_shared[task_id]; + T2 rri = rx * rx + ry * ry + rz * rz; + T2 ei_loc = ei_shared[task_id]; + T2 erri = ei_loc * rri; + T2 e_2a = -2.0 * ei_loc; + + T2 gtox = e_2a * rx; + T2 gtoy = e_2a * ry; + T2 gtoz = e_2a * rz; + + /* shell j*/ + rx = xg - xj_shared[task_id]; + ry = yg - yj_shared[task_id]; + rz = zg - zj_shared[task_id]; + T2 rrj = rx * rx + ry * ry + rz * rz; + T2 ej_loc = ej_shared[task_id]; + T2 errj = ej_loc * rrj; + e_2a = -2.0 * ej_loc; + + gtox += e_2a * rx; + gtoy += e_2a * ry; + gtoz += e_2a * rz; + + T2 eij = exp(-erri - errj); + T2 dme_ij = dm_shared[task_id] * eij; + + rho_i += dme_ij; + rhox_i += gtox * dme_ij; + rhoy_i += gtoy * dme_ij; + rhoz_i += gtoz * dme_ij; + } + __syncthreads(); + } + + // Due to the symmetry + if (grid_id < ngrids){ + rho[grid_id] = rho_i * 2.0; + rhox[grid_id] = rhox_i * 2.0; + rhoy[grid_id] = rhoy_i * 2.0; + rhoz[grid_id] = rhoz_i * 2.0; + } +}