diff --git a/benchmarks/benchmark_eval_rho.py b/benchmarks/benchmark_eval_rho.py
new file mode 100644
index 000000000..5627caa67
--- /dev/null
+++ b/benchmarks/benchmark_eval_rho.py
@@ -0,0 +1,124 @@
+import pyscf
+from gpu4pyscf.scf import hf
+
+mol = pyscf.M(atom = 'molecules/water_clusters/008.xyz')
+mol.basis = {
+    'H': [[0, (0.01,  0.0070011547)]],
+    'O': [[0, (0.01,  0.0070011547)]]}
+mol.build()
+
+vhfopt = hf._VHFOpt(mol, 'int2e').build()
+
+from gpu4pyscf.lib.cupy_helper import load_library
+libgdft = load_library('libgdft')
+
+import numpy as np
+import cupy
+from gpu4pyscf.dft.gen_grid import Grids
+grids = Grids(mol)
+grids.build()
+coords = cupy.asarray(grids.coords[:256*256*16], order='F')#.copy()
+
+ngrids, _ = coords.shape
+
+ao_loc = mol.ao_loc_nr()
+ao_loc = cupy.asarray(ao_loc, dtype=np.int32)
+nao = mol.nao
+dm = cupy.random.rand(nao,nao)
+dm += dm.T
+print(dm[:4,:4])
+print(vhfopt.bas_pair2shls[:,:1000])
+
+import ctypes
+from pyscf import gto
+from gpu4pyscf.dft import numint
+ni = numint.NumInt().build(mol, coords)
+env = cupy.asarray(ni.gdftopt._sorted_mol._env)
+ish, jsh = vhfopt.bas_pair2shls
+diag_idx = ish == jsh
+npairs = len(ish)
+bas = ni.gdftopt._sorted_mol._bas
+exp_sparse = cupy.empty([npairs, 2], order='F')
+bas_exp = bas[ish, gto.PTR_EXP]
+exp_sparse[:,0] = cupy.asarray(env[bas_exp])
+bas_exp = bas[jsh, gto.PTR_EXP]
+exp_sparse[:,1] = cupy.asarray(env[bas_exp])
+
+coef_sparse = cupy.empty([npairs, 2], order='F')
+bas_coef = bas[ish, gto.PTR_COEFF]
+coef_sparse[:,0] = cupy.asarray(env[bas_coef])
+bas_coef = bas[jsh, gto.PTR_COEFF]
+coef_sparse[:,1] = cupy.asarray(env[bas_coef])
+
+atm_coords = cupy.asarray(mol.atom_coords())
+coord_pairs = cupy.empty([npairs, 6], order='F')
+coord_pairs[:,:3] = atm_coords[ish,:]
+coord_pairs[:,3:] = atm_coords[jsh,:]
+
+dm_sparse = dm[ish, jsh] * coef_sparse[:,0] * coef_sparse[:,1]
+dm_sparse[diag_idx] *= 0.5
+
+coords_fp32 = coords.astype(np.float32)
+exp_sparse_fp32 = exp_sparse.astype(np.float32)
+coef_sparse_fp32 = coef_sparse.astype(np.float32)
+coord_pairs_fp32 = coord_pairs.astype(np.float32)
+dm_sparse_fp32 = dm_sparse.astype(np.float32)
+
+def eval_rho0(dm):
+    with ni.gdftopt.gdft_envs_cache():
+        rho = cupy.empty([4,ngrids])
+        libgdft.eval_rho_fp32(
+            vhfopt.bpcache,
+            ctypes.cast(coords.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(ngrids),
+            ctypes.cast(rho.data.ptr, ctypes.c_void_p),
+            ctypes.cast(exp_sparse_fp32.data.ptr, ctypes.c_void_p),
+            ctypes.cast(coef_sparse_fp32.data.ptr, ctypes.c_void_p),
+            ctypes.cast(coord_pairs_fp32.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(nao),
+            ctypes.cast(dm_sparse_fp32.data.ptr, ctypes.c_void_p))
+        return rho
+    return eval_rho0
+
+def eval_rho1(dm):
+    rho = cupy.empty([4,ngrids])
+    with ni.gdftopt.gdft_envs_cache():
+        libgdft.eval_rho_fp64(
+            vhfopt.bpcache,
+            ctypes.cast(coords.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(ngrids),
+            ctypes.cast(rho.data.ptr, ctypes.c_void_p),
+            ctypes.cast(exp_sparse.data.ptr, ctypes.c_void_p),
+            ctypes.cast(coef_sparse.data.ptr, ctypes.c_void_p),
+            ctypes.cast(coord_pairs.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(nao),
+            ctypes.cast(dm_sparse.data.ptr, ctypes.c_void_p))
+    return rho
+
+def eval_rho2(dm):
+    ao = numint.eval_ao(ni, ni.gdftopt._sorted_mol, coords, deriv=1)
+    rho0 = numint.eval_rho(ni.gdftopt._sorted_mol, ao, dm, xctype='GGA')
+    return rho0
+
+from cupyx import profiler
+perf = profiler.benchmark(eval_rho0, (dm_sparse,), n_repeat=20, n_warmup=3)
+print('with eval_rho0', perf.gpu_times.mean())
+# 0.130s on V100
+# 0.115s on A10
+
+perf = profiler.benchmark(eval_rho1, (dm_sparse,), n_repeat=20, n_warmup=3)
+print('with eval_rho1', perf.gpu_times.mean())
+# 0.324s on V100
+# 5.401s on A10
+
+perf = profiler.benchmark(eval_rho2, (dm,), n_repeat=20, n_warmup=3)
+print('with eval_rho2', perf.gpu_times.mean())
+# 0.0858s on v100
+# 0.723s on A10
+
+rho0 = eval_rho0(dm)
+rho1 = eval_rho1(dm)
+print(rho0[:4,:3])
+print(rho1[:4,:3])
+
+print(cupy.max(rho0 - rho1), cupy.max(rho0))
diff --git a/gpu4pyscf/lib/gdft/nr_eval_gto.cu b/gpu4pyscf/lib/gdft/nr_eval_gto.cu
index 12b9fe9a7..57da02b19 100644
--- a/gpu4pyscf/lib/gdft/nr_eval_gto.cu
+++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cu
@@ -27,6 +27,7 @@
 #include "gint/cuda_alloc.cuh"
 #include "nr_eval_gto.cuh"
 #include "contract_rho.cuh"
+#include "nr_eval_rho.cu"
 
 #define NG_PER_BLOCK       256
 #define LMAX            8
@@ -1907,4 +1908,54 @@ int GDFTscreen_index(cudaStream_t stream, int *non0shl_idx, double cutoff,
     return 0;
 }
 
+__host__
+int eval_rho_fp64(BasisProdCache *bpcache, double *grids, int ngrids,
+            double *rho, double *exp_sparse, double *coef_sparse, double *coord_pairs,
+            int nao, double *dm_sparse)
+{
+    checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
+    int *bas_pairs_locs = bpcache->bas_pairs_locs;
+    int npairs = bas_pairs_locs[1];
+    //printf("%d \n", npairs);
+
+    BasOffsets offsets;
+    offsets.gridx = grids;//d_grids;
+    offsets.ngrids = ngrids;
+    offsets.data = rho;
+    offsets.nao = nao;
+    offsets.nprim = 1;
+    offsets.fac = CINTcommon_fac_sp(0);
+    dim3 threads(NG_PER_BLOCK);
+    dim3 blocks((ngrids+NG_PER_BLOCK-1)/NG_PER_BLOCK);
+
+    _eval_rho<double, double><<<blocks, threads>>> (offsets, dm_sparse, npairs, exp_sparse, coef_sparse, coord_pairs);
+
+    return 1;
+}
+
+__host__
+int eval_rho_fp32(BasisProdCache *bpcache, double *grids, int ngrids,
+            double *rho, float *exp_sparse, float *coef_sparse, float *coord_pairs,
+            int nao, float *dm_sparse)
+{
+    checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
+    int *bas_pairs_locs = bpcache->bas_pairs_locs;
+    int npairs = bas_pairs_locs[1];
+    //printf("%d \n", npairs);
+
+    BasOffsets offsets;
+    offsets.gridx = grids;//d_grids;
+    offsets.ngrids = ngrids;
+    offsets.data = rho;
+    offsets.nao = nao;
+    offsets.nprim = 1;
+    offsets.fac = CINTcommon_fac_sp(0);
+    dim3 threads(NG_PER_BLOCK);
+    dim3 blocks((ngrids+NG_PER_BLOCK-1)/NG_PER_BLOCK);
+
+    _eval_rho<double, float><<<blocks, threads>>> (offsets, dm_sparse, npairs, exp_sparse, coef_sparse, coord_pairs);
+
+    return 1;
+}
+
 }
diff --git a/gpu4pyscf/lib/gdft/nr_eval_rho.cu b/gpu4pyscf/lib/gdft/nr_eval_rho.cu
new file mode 100644
index 000000000..0a342b742
--- /dev/null
+++ b/gpu4pyscf/lib/gdft/nr_eval_rho.cu
@@ -0,0 +1,119 @@
+
+__constant__ BasisProdCache c_bpcache;
+#define NG_PER_BLOCK       256
+template <typename T1, typename T2>
+__global__
+static void _eval_rho(BasOffsets offsets, T2 *dm_sparse, int npairs, 
+    T2 *exp_sparse, T2 *coef_sparse, T2 *coord_pairs){
+    int ngrids = offsets.ngrids;
+    int grid_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+    double* __restrict__ rho = offsets.data;
+    double* __restrict__ rhox = offsets.data + 1 * ngrids;
+    double* __restrict__ rhoy = offsets.data + 2 * ngrids;
+    double* __restrict__ rhoz = offsets.data + 3 * ngrids;
+    
+    T2* __restrict__ xi = coord_pairs;
+    T2* __restrict__ yi = coord_pairs + npairs;
+    T2* __restrict__ zi = coord_pairs + 2*npairs;
+    T2* __restrict__ xj = coord_pairs + 3*npairs;
+    T2* __restrict__ yj = coord_pairs + 4*npairs;
+    T2* __restrict__ zj = coord_pairs + 5*npairs;
+
+    T2* __restrict__ ei = exp_sparse;
+    T2* __restrict__ ej = exp_sparse + npairs;
+    
+    double *gridx = offsets.gridx;
+    double *gridy = offsets.gridx + ngrids;
+    double *gridz = offsets.gridx + ngrids * 2;
+    T2 xg = 0.0;
+    T2 yg = 0.0;
+    T2 zg = 0.0;
+
+    if (grid_id < ngrids) {
+        xg = gridx[grid_id];
+        yg = gridy[grid_id];
+        zg = gridz[grid_id];
+    }
+
+    T2 rho_i  = 0.0;
+    T2 rhox_i = 0.0;
+    T2 rhoy_i = 0.0;
+    T2 rhoz_i = 0.0;
+    
+    T2 __shared__ dm_shared[NG_PER_BLOCK];
+    T2 __shared__ xi_shared[NG_PER_BLOCK];
+    T2 __shared__ yi_shared[NG_PER_BLOCK];
+    T2 __shared__ zi_shared[NG_PER_BLOCK];
+
+    T2 __shared__ xj_shared[NG_PER_BLOCK];
+    T2 __shared__ yj_shared[NG_PER_BLOCK];
+    T2 __shared__ zj_shared[NG_PER_BLOCK];
+
+    T2 __shared__ ei_shared[NG_PER_BLOCK];
+    T2 __shared__ ej_shared[NG_PER_BLOCK];
+    T2 fac_ij = offsets.fac * offsets.fac;
+    for (int block_id = 0; block_id < npairs; block_id+=NG_PER_BLOCK){
+        int tx = threadIdx.x;
+        int bas_ij = block_id + tx;
+
+        if (bas_ij < npairs){
+            dm_shared[tx] = dm_sparse[bas_ij] * fac_ij;
+            xi_shared[tx] = xi[bas_ij];
+            yi_shared[tx] = yi[bas_ij];
+            zi_shared[tx] = zi[bas_ij];
+
+            xj_shared[tx] = xj[bas_ij];
+            yj_shared[tx] = yj[bas_ij];
+            zj_shared[tx] = zj[bas_ij];
+
+            ei_shared[tx] = ei[bas_ij];
+            ej_shared[tx] = ej[bas_ij];
+        }
+        __syncthreads();
+        for (int task_id = 0; task_id < NG_PER_BLOCK && task_id + block_id < npairs; ++task_id){
+            /* shell i */
+            T2 rx = xg - xi_shared[task_id];
+            T2 ry = yg - yi_shared[task_id];
+            T2 rz = zg - zi_shared[task_id];
+            T2 rri = rx * rx + ry * ry + rz * rz;
+            T2 ei_loc = ei_shared[task_id];
+            T2 erri = ei_loc * rri;
+            T2 e_2a = -2.0 * ei_loc;
+
+            T2 gtox = e_2a * rx;
+            T2 gtoy = e_2a * ry;
+            T2 gtoz = e_2a * rz;
+
+            /* shell j*/
+            rx = xg - xj_shared[task_id];
+            ry = yg - yj_shared[task_id];
+            rz = zg - zj_shared[task_id];
+            T2 rrj = rx * rx + ry * ry + rz * rz;
+            T2 ej_loc = ej_shared[task_id];
+            T2 errj = ej_loc * rrj;
+            e_2a = -2.0 * ej_loc;
+
+            gtox += e_2a * rx;
+            gtoy += e_2a * ry;
+            gtoz += e_2a * rz;
+
+            T2 eij = exp(-erri - errj);
+            T2 dme_ij = dm_shared[task_id] * eij;
+            
+            rho_i  += dme_ij;
+            rhox_i += gtox * dme_ij;
+            rhoy_i += gtoy * dme_ij;
+            rhoz_i += gtoz * dme_ij;
+        }
+        __syncthreads();
+    }
+    
+    // Due to the symmetry
+    if (grid_id < ngrids){
+        rho[grid_id] = rho_i * 2.0;
+        rhox[grid_id] = rhox_i * 2.0;
+        rhoy[grid_id] = rhoy_i * 2.0;
+        rhoz[grid_id] = rhoz_i * 2.0;
+    }
+}