Skip to content

Add eval_rho kernel #184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions benchmarks/benchmark_eval_rho.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import pyscf
from gpu4pyscf.scf import hf

mol = pyscf.M(atom = 'molecules/water_clusters/008.xyz')
mol.basis = {
'H': [[0, (0.01, 0.0070011547)]],
'O': [[0, (0.01, 0.0070011547)]]}
mol.build()

vhfopt = hf._VHFOpt(mol, 'int2e').build()

from gpu4pyscf.lib.cupy_helper import load_library
libgdft = load_library('libgdft')

import numpy as np
import cupy
from gpu4pyscf.dft.gen_grid import Grids
grids = Grids(mol)
grids.build()
coords = cupy.asarray(grids.coords[:256*256*16], order='F')#.copy()

ngrids, _ = coords.shape

ao_loc = mol.ao_loc_nr()
ao_loc = cupy.asarray(ao_loc, dtype=np.int32)
nao = mol.nao
dm = cupy.random.rand(nao,nao)
dm += dm.T
print(dm[:4,:4])
print(vhfopt.bas_pair2shls[:,:1000])

import ctypes
from pyscf import gto
from gpu4pyscf.dft import numint
ni = numint.NumInt().build(mol, coords)
env = cupy.asarray(ni.gdftopt._sorted_mol._env)
ish, jsh = vhfopt.bas_pair2shls
diag_idx = ish == jsh
npairs = len(ish)
bas = ni.gdftopt._sorted_mol._bas
exp_sparse = cupy.empty([npairs, 2], order='F')
bas_exp = bas[ish, gto.PTR_EXP]
exp_sparse[:,0] = cupy.asarray(env[bas_exp])
bas_exp = bas[jsh, gto.PTR_EXP]
exp_sparse[:,1] = cupy.asarray(env[bas_exp])

coef_sparse = cupy.empty([npairs, 2], order='F')
bas_coef = bas[ish, gto.PTR_COEFF]
coef_sparse[:,0] = cupy.asarray(env[bas_coef])
bas_coef = bas[jsh, gto.PTR_COEFF]
coef_sparse[:,1] = cupy.asarray(env[bas_coef])

atm_coords = cupy.asarray(mol.atom_coords())
coord_pairs = cupy.empty([npairs, 6], order='F')
coord_pairs[:,:3] = atm_coords[ish,:]
coord_pairs[:,3:] = atm_coords[jsh,:]

dm_sparse = dm[ish, jsh] * coef_sparse[:,0] * coef_sparse[:,1]
dm_sparse[diag_idx] *= 0.5

coords_fp32 = coords.astype(np.float32)
exp_sparse_fp32 = exp_sparse.astype(np.float32)
coef_sparse_fp32 = coef_sparse.astype(np.float32)
coord_pairs_fp32 = coord_pairs.astype(np.float32)
dm_sparse_fp32 = dm_sparse.astype(np.float32)

def eval_rho0(dm):
with ni.gdftopt.gdft_envs_cache():
rho = cupy.empty([4,ngrids])
libgdft.eval_rho_fp32(
vhfopt.bpcache,
ctypes.cast(coords.data.ptr, ctypes.c_void_p),
ctypes.c_int(ngrids),
ctypes.cast(rho.data.ptr, ctypes.c_void_p),
ctypes.cast(exp_sparse_fp32.data.ptr, ctypes.c_void_p),
ctypes.cast(coef_sparse_fp32.data.ptr, ctypes.c_void_p),
ctypes.cast(coord_pairs_fp32.data.ptr, ctypes.c_void_p),
ctypes.c_int(nao),
ctypes.cast(dm_sparse_fp32.data.ptr, ctypes.c_void_p))
return rho
return eval_rho0

def eval_rho1(dm):
rho = cupy.empty([4,ngrids])
with ni.gdftopt.gdft_envs_cache():
libgdft.eval_rho_fp64(
vhfopt.bpcache,
ctypes.cast(coords.data.ptr, ctypes.c_void_p),
ctypes.c_int(ngrids),
ctypes.cast(rho.data.ptr, ctypes.c_void_p),
ctypes.cast(exp_sparse.data.ptr, ctypes.c_void_p),
ctypes.cast(coef_sparse.data.ptr, ctypes.c_void_p),
ctypes.cast(coord_pairs.data.ptr, ctypes.c_void_p),
ctypes.c_int(nao),
ctypes.cast(dm_sparse.data.ptr, ctypes.c_void_p))
return rho

def eval_rho2(dm):
ao = numint.eval_ao(ni, ni.gdftopt._sorted_mol, coords, deriv=1)
rho0 = numint.eval_rho(ni.gdftopt._sorted_mol, ao, dm, xctype='GGA')
return rho0

from cupyx import profiler
perf = profiler.benchmark(eval_rho0, (dm_sparse,), n_repeat=20, n_warmup=3)
print('with eval_rho0', perf.gpu_times.mean())
# 0.130s on V100
# 0.115s on A10

perf = profiler.benchmark(eval_rho1, (dm_sparse,), n_repeat=20, n_warmup=3)
print('with eval_rho1', perf.gpu_times.mean())
# 0.324s on V100
# 5.401s on A10

perf = profiler.benchmark(eval_rho2, (dm,), n_repeat=20, n_warmup=3)
print('with eval_rho2', perf.gpu_times.mean())
# 0.0858s on v100
# 0.723s on A10

rho0 = eval_rho0(dm)
rho1 = eval_rho1(dm)
print(rho0[:4,:3])
print(rho1[:4,:3])

print(cupy.max(rho0 - rho1), cupy.max(rho0))
51 changes: 51 additions & 0 deletions gpu4pyscf/lib/gdft/nr_eval_gto.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "gint/cuda_alloc.cuh"
#include "nr_eval_gto.cuh"
#include "contract_rho.cuh"
#include "nr_eval_rho.cu"

#define NG_PER_BLOCK 256
#define LMAX 8
Expand Down Expand Up @@ -1907,4 +1908,54 @@ int GDFTscreen_index(cudaStream_t stream, int *non0shl_idx, double cutoff,
return 0;
}

__host__
int eval_rho_fp64(BasisProdCache *bpcache, double *grids, int ngrids,
double *rho, double *exp_sparse, double *coef_sparse, double *coord_pairs,
int nao, double *dm_sparse)
{
checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
int *bas_pairs_locs = bpcache->bas_pairs_locs;
int npairs = bas_pairs_locs[1];
//printf("%d \n", npairs);

BasOffsets offsets;
offsets.gridx = grids;//d_grids;
offsets.ngrids = ngrids;
offsets.data = rho;
offsets.nao = nao;
offsets.nprim = 1;
offsets.fac = CINTcommon_fac_sp(0);
dim3 threads(NG_PER_BLOCK);
dim3 blocks((ngrids+NG_PER_BLOCK-1)/NG_PER_BLOCK);

_eval_rho<double, double><<<blocks, threads>>> (offsets, dm_sparse, npairs, exp_sparse, coef_sparse, coord_pairs);

return 1;
}

__host__
int eval_rho_fp32(BasisProdCache *bpcache, double *grids, int ngrids,
double *rho, float *exp_sparse, float *coef_sparse, float *coord_pairs,
int nao, float *dm_sparse)
{
checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
int *bas_pairs_locs = bpcache->bas_pairs_locs;
int npairs = bas_pairs_locs[1];
//printf("%d \n", npairs);

BasOffsets offsets;
offsets.gridx = grids;//d_grids;
offsets.ngrids = ngrids;
offsets.data = rho;
offsets.nao = nao;
offsets.nprim = 1;
offsets.fac = CINTcommon_fac_sp(0);
dim3 threads(NG_PER_BLOCK);
dim3 blocks((ngrids+NG_PER_BLOCK-1)/NG_PER_BLOCK);

_eval_rho<double, float><<<blocks, threads>>> (offsets, dm_sparse, npairs, exp_sparse, coef_sparse, coord_pairs);

return 1;
}

}
119 changes: 119 additions & 0 deletions gpu4pyscf/lib/gdft/nr_eval_rho.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@

__constant__ BasisProdCache c_bpcache;
#define NG_PER_BLOCK 256
template <typename T1, typename T2>
__global__
static void _eval_rho(BasOffsets offsets, T2 *dm_sparse, int npairs,
T2 *exp_sparse, T2 *coef_sparse, T2 *coord_pairs){
int ngrids = offsets.ngrids;
int grid_id = blockIdx.x * blockDim.x + threadIdx.x;

double* __restrict__ rho = offsets.data;
double* __restrict__ rhox = offsets.data + 1 * ngrids;
double* __restrict__ rhoy = offsets.data + 2 * ngrids;
double* __restrict__ rhoz = offsets.data + 3 * ngrids;

T2* __restrict__ xi = coord_pairs;
T2* __restrict__ yi = coord_pairs + npairs;
T2* __restrict__ zi = coord_pairs + 2*npairs;
T2* __restrict__ xj = coord_pairs + 3*npairs;
T2* __restrict__ yj = coord_pairs + 4*npairs;
T2* __restrict__ zj = coord_pairs + 5*npairs;

T2* __restrict__ ei = exp_sparse;
T2* __restrict__ ej = exp_sparse + npairs;

double *gridx = offsets.gridx;
double *gridy = offsets.gridx + ngrids;
double *gridz = offsets.gridx + ngrids * 2;
T2 xg = 0.0;
T2 yg = 0.0;
T2 zg = 0.0;

if (grid_id < ngrids) {
xg = gridx[grid_id];
yg = gridy[grid_id];
zg = gridz[grid_id];
}

T2 rho_i = 0.0;
T2 rhox_i = 0.0;
T2 rhoy_i = 0.0;
T2 rhoz_i = 0.0;

T2 __shared__ dm_shared[NG_PER_BLOCK];
T2 __shared__ xi_shared[NG_PER_BLOCK];
T2 __shared__ yi_shared[NG_PER_BLOCK];
T2 __shared__ zi_shared[NG_PER_BLOCK];

T2 __shared__ xj_shared[NG_PER_BLOCK];
T2 __shared__ yj_shared[NG_PER_BLOCK];
T2 __shared__ zj_shared[NG_PER_BLOCK];

T2 __shared__ ei_shared[NG_PER_BLOCK];
T2 __shared__ ej_shared[NG_PER_BLOCK];
T2 fac_ij = offsets.fac * offsets.fac;
for (int block_id = 0; block_id < npairs; block_id+=NG_PER_BLOCK){
int tx = threadIdx.x;
int bas_ij = block_id + tx;

if (bas_ij < npairs){
dm_shared[tx] = dm_sparse[bas_ij] * fac_ij;
xi_shared[tx] = xi[bas_ij];
yi_shared[tx] = yi[bas_ij];
zi_shared[tx] = zi[bas_ij];

xj_shared[tx] = xj[bas_ij];
yj_shared[tx] = yj[bas_ij];
zj_shared[tx] = zj[bas_ij];

ei_shared[tx] = ei[bas_ij];
ej_shared[tx] = ej[bas_ij];
}
__syncthreads();
for (int task_id = 0; task_id < NG_PER_BLOCK && task_id + block_id < npairs; ++task_id){
/* shell i */
T2 rx = xg - xi_shared[task_id];
T2 ry = yg - yi_shared[task_id];
T2 rz = zg - zi_shared[task_id];
T2 rri = rx * rx + ry * ry + rz * rz;
T2 ei_loc = ei_shared[task_id];
T2 erri = ei_loc * rri;
T2 e_2a = -2.0 * ei_loc;

T2 gtox = e_2a * rx;
T2 gtoy = e_2a * ry;
T2 gtoz = e_2a * rz;

/* shell j*/
rx = xg - xj_shared[task_id];
ry = yg - yj_shared[task_id];
rz = zg - zj_shared[task_id];
T2 rrj = rx * rx + ry * ry + rz * rz;
T2 ej_loc = ej_shared[task_id];
T2 errj = ej_loc * rrj;
e_2a = -2.0 * ej_loc;

gtox += e_2a * rx;
gtoy += e_2a * ry;
gtoz += e_2a * rz;

T2 eij = exp(-erri - errj);
T2 dme_ij = dm_shared[task_id] * eij;

rho_i += dme_ij;
rhox_i += gtox * dme_ij;
rhoy_i += gtoy * dme_ij;
rhoz_i += gtoz * dme_ij;
}
__syncthreads();
}

// Due to the symmetry
if (grid_id < ngrids){
rho[grid_id] = rho_i * 2.0;
rhox[grid_id] = rhox_i * 2.0;
rhoy[grid_id] = rhoy_i * 2.0;
rhoz[grid_id] = rhoz_i * 2.0;
}
}
Loading