diff --git a/gpu4pyscf/lib/pbc/CMakeLists.txt b/gpu4pyscf/lib/pbc/CMakeLists.txt index 66b08c190..7b763bb29 100644 --- a/gpu4pyscf/lib/pbc/CMakeLists.txt +++ b/gpu4pyscf/lib/pbc/CMakeLists.txt @@ -3,6 +3,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=12 add_library(pbc SHARED pbc_driver.cu ft_ao.cu unrolled_ft_ao.cu ft_ao_bdiv.cu fill_int3c2e.cu unrolled_int3c2e.cu fill_int2c2e.cu + overlap.cu estimator.cu rys_roots_dat.cu nr_eval_gto.cu diff --git a/gpu4pyscf/lib/pbc/fill_int2c2e.cu b/gpu4pyscf/lib/pbc/fill_int2c2e.cu index aab2332d8..57de092fb 100644 --- a/gpu4pyscf/lib/pbc/fill_int2c2e.cu +++ b/gpu4pyscf/lib/pbc/fill_int2c2e.cu @@ -57,7 +57,7 @@ void pbc_int2c2e_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds double *env = envs.env; double *img_coords = envs.img_coords; - int gout_stride = bounds.gout_stride_lookup[lj*L_AUX1+li]; + int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj]; int nsp_per_block = THREADS / gout_stride; int sp_id = thread_id % nsp_per_block; int gout_id = thread_id / nsp_per_block; diff --git a/gpu4pyscf/lib/pbc/overlap.cu b/gpu4pyscf/lib/pbc/overlap.cu new file mode 100644 index 000000000..caaaa4d83 --- /dev/null +++ b/gpu4pyscf/lib/pbc/overlap.cu @@ -0,0 +1,866 @@ +/* + * Copyright 2025 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include "pbc.cuh" +#include "int3c2e.cuh" + +#define PI_POW_1_5 5.568327996831707845 + +typedef struct { + int8_t iprim; + int8_t jprim; + int8_t nfi; + int8_t nfj; +} PackedPGTO; + +#define GOUT_WIDTH 36 +#define GOUT_WIDTH_IP1 18 + +static __global__ +void int1e_ovlp_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds) +{ + int sp_block_id = blockIdx.x; + int thread_id = threadIdx.x; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.cell0_nbas * envs.bvk_ncells; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int *bas = envs.bas; + int li = bas[ish0*BAS_SLOTS+ANG_OF]; + int lj = bas[jsh0*BAS_SLOTS+ANG_OF]; + int8_t nfi = (li + 1) * (li + 2) / 2; + int8_t nfj = (lj + 1) * (lj + 2) / 2; + int8_t iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int8_t jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + PackedPGTO pdata = {iprim, jprim, nfi, nfj}; + double *env = envs.env; + double *img_coords = envs.img_coords; + + int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj]; + int nsp_per_block = THREADS / gout_stride; + int sp_id = thread_id % nsp_per_block; + int gout_id = thread_id / nsp_per_block; + + int g_size = (li + 1) * (lj + 1); + int gx_len = g_size * nsp_per_block; + extern __shared__ double g[]; + double *gx = g + sp_id; + double *gy = gx + gx_len; + double *gz = gx + gx_len * 2; + double *rjri = gx + gx_len * 3; + gx[0] = PI_POW_1_5; + gy[0] = 1.; + + for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) { + double gout[GOUT_WIDTH]; +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + gout[n] = 0.; + } + int pair_ij = task_id + sp_id; + if (pair_ij >= shl_pair1) { + pair_ij = shl_pair0; + } + int bas_ij = bounds.bas_ij_idx[pair_ij]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + for (int img = 0; img < envs.nimgs; img++) { + if (gout_id == 0) { + double xjL = img_coords[img*3+0]; + double yjL = img_coords[img*3+1]; + double zjL = img_coords[img*3+2]; + double xjxi = rj[0] + xjL - ri[0]; + double yjyi = rj[1] + yjL - ri[1]; + double zjzi = rj[2] + zjL - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nsp_per_block] = xjxi; + rjri[1*nsp_per_block] = yjyi; + rjri[2*nsp_per_block] = zjzi; + rjri[3*nsp_per_block] = rr_ij; + } + int iprim = pdata.iprim; + int jprim = pdata.jprim; + int ijprim = iprim * jprim; + int nfi = pdata.nfi; + int nfj = pdata.nfj; + int nfij = nfi * nfj; + int16_t *idx_ij = c_pair_idx + c_pair_offsets[li*L_AUX1+lj]; + int16_t *idy_ij = idx_ij + nfij; + int16_t *idz_ij = idx_ij + nfij * 2; + for (int ijp = 0; ijp < ijprim; ++ijp) { + __syncthreads(); + int ip = ijp % iprim; + int jp = ijp / iprim; + double ai = expi[ip]; + double aj = expj[jp]; + double aij = ai + aj; + double aj_aij = aj / aij; + if (gout_id == 0) { + double theta = ai * aj_aij; + double theta_rr = theta * rjri[3*nsp_per_block]; + double cicj = ci[ip] * cj[jp]; + gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr); + } + int lij = li + lj; + int stride_j = li + 1; + if (lij > 0) { + __syncthreads(); + double s0x, s1x, s2x; + double b = .5 / aij; + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * gx_len; + double xjxi = rjri[n*nsp_per_block]; + double xpa = xjxi * aj_aij; + s0x = _gx[0]; + s1x = xpa * s0x; + _gx[nsp_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = xpa * s1x + i * b * s0x; + _gx[(i+1)*nsp_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nsp_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nsp_per_block]; + _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + } + __syncthreads(); +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ij = n*gout_stride+gout_id; + if (ij >= nfij) break; + int addrx = idx_ij[ij] * nsp_per_block; + int addry = idy_ij[ij] * nsp_per_block; + int addrz = idz_ij[ij] * nsp_per_block; + gout[n] += gx[addrx] * gy[addry] * gz[addrz]; + } + } + } + + if (task_id + sp_id < shl_pair1) { + int *ao_loc = envs.ao_loc; + int nbas = envs.cell0_nbas; + int nao = ao_loc[nbas]; + size_t nao2 = nao * nao; + int cell_id = jsh / nbas; + int jshp = jsh % nbas; + int i0 = ao_loc[ish]; + int j0 = ao_loc[jshp]; + double *out_subblock = out + cell_id*nao2 + i0 * nao + j0; + int nfi = pdata.nfi; + int nfj = pdata.nfj; + int nfij = nfi * nfj; +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ij = n*gout_stride+gout_id; + if (ij >= nfij) break; + int j = ij / nfi; + int i = ij % nfi; + out_subblock[i*nao+j] = gout[n]; + } + } + } +} + +static __global__ +void int1e_kin_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds) +{ + int sp_block_id = blockIdx.x; + int thread_id = threadIdx.x; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.cell0_nbas * envs.bvk_ncells; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int *bas = envs.bas; + int li = bas[ish0*BAS_SLOTS+ANG_OF]; + int lj = bas[jsh0*BAS_SLOTS+ANG_OF]; + int8_t nfi = (li + 1) * (li + 2) / 2; + int8_t nfj = (lj + 1) * (lj + 2) / 2; + int8_t iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int8_t jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + PackedPGTO pdata = {iprim, jprim, nfi, nfj}; + double *env = envs.env; + double *img_coords = envs.img_coords; + + int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj]; + int nsp_per_block = THREADS / gout_stride; + int sp_id = thread_id % nsp_per_block; + int gout_id = thread_id / nsp_per_block; + + int g_size = (li + 3) * (lj + 1); + int gx_len = g_size * nsp_per_block; + extern __shared__ double g[]; + double *gx = g + sp_id; + double *gy = gx + gx_len; + double *gz = gx + gx_len * 2; + double *rjri = gx + gx_len * 3; + gx[0] = PI_POW_1_5; + gy[0] = -.5; + + for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) { + double gout[GOUT_WIDTH]; +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + gout[n] = 0.; + } + int pair_ij = task_id + sp_id; + if (pair_ij >= shl_pair1) { + pair_ij = shl_pair0; + } + int bas_ij = bounds.bas_ij_idx[pair_ij]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + for (int img = 0; img < envs.nimgs; img++) { + if (gout_id == 0) { + double xjL = img_coords[img*3+0]; + double yjL = img_coords[img*3+1]; + double zjL = img_coords[img*3+2]; + double xjxi = rj[0] + xjL - ri[0]; + double yjyi = rj[1] + yjL - ri[1]; + double zjzi = rj[2] + zjL - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nsp_per_block] = xjxi; + rjri[1*nsp_per_block] = yjyi; + rjri[2*nsp_per_block] = zjzi; + rjri[3*nsp_per_block] = rr_ij; + } + int iprim = pdata.iprim; + int jprim = pdata.jprim; + int ijprim = iprim * jprim; + int nfi = pdata.nfi; + int nfj = pdata.nfj; + int nfij = nfi * nfj; + int16_t *idx_i = c_pair_idx + c_pair_offsets[li*L_AUX1]; + int16_t *idy_i = idx_i + nfi; + int16_t *idz_i = idx_i + nfi * 2; + int16_t *idx_j = c_pair_idx + c_pair_offsets[lj*L_AUX1]; + int16_t *idy_j = idx_j + nfj; + int16_t *idz_j = idx_j + nfj * 2; + for (int ijp = 0; ijp < ijprim; ++ijp) { + __syncthreads(); + int ip = ijp % iprim; + int jp = ijp / iprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ai2 = ai * -2; + double aij = ai + aj; + double aj_aij = aj / aij; + if (gout_id == 0) { + double theta = ai * aj_aij; + double theta_rr = theta * rjri[3*nsp_per_block]; + double cicj = ci[ip] * cj[jp]; + gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr); + } + __syncthreads(); + int lij = li + lj + 2; + int stride_j = li + 3; + int i_1 = nsp_per_block; + double s0x, s1x, s2x; + double b = .5 / aij; + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * gx_len; + double xjxi = rjri[n*nsp_per_block]; + double xpa = xjxi * aj_aij; + s0x = _gx[0]; + s1x = xpa * s0x; + _gx[nsp_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = xpa * s1x + i * b * s0x; + _gx[(i+1)*nsp_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nsp_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nsp_per_block]; + _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + __syncthreads(); +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ij = n*gout_stride+gout_id; + if (ij >= nfij) break; + int i = ij % nfi; + int j = ij / nfi; + int ix = idx_i[i]; + int iy = idy_i[i]; + int iz = idz_i[i]; + int jx = idx_j[j]; + int jy = idy_j[j]; + int jz = idz_j[j]; + int addrx = (ix + jx*stride_j) * nsp_per_block; + int addry = (iy + jy*stride_j) * nsp_per_block; + int addrz = (iz + jz*stride_j) * nsp_per_block; + double fx0 = gx[addrx]; + double fy0 = gy[addry]; + double fz0 = gz[addrz]; + double fx2 = ai2 * ((ix*2+1)*fx0 + ai2*gx[addrx+i_1*2]); + double fy2 = ai2 * ((iy*2+1)*fy0 + ai2*gy[addry+i_1*2]); + double fz2 = ai2 * ((iz*2+1)*fz0 + ai2*gz[addrz+i_1*2]); + if (ix > 1) fx2 += ix*(ix-1) * gx[addrx-i_1*2]; + if (iy > 1) fy2 += iy*(iy-1) * gy[addry-i_1*2]; + if (iz > 1) fz2 += iz*(iz-1) * gz[addrz-i_1*2]; + gout[n] += fx2 * fy0 * fz0; + gout[n] += fx0 * fy2 * fz0; + gout[n] += fx0 * fy0 * fz2; + } + } + } + + if (task_id + sp_id < shl_pair1) { + int *ao_loc = envs.ao_loc; + int nbas = envs.cell0_nbas; + int nao = ao_loc[nbas]; + size_t nao2 = nao * nao; + int cell_id = jsh / nbas; + int jshp = jsh % nbas; + int i0 = ao_loc[ish]; + int j0 = ao_loc[jshp]; + double *out_subblock = out + cell_id*nao2 + i0 * nao + j0; + int nfi = pdata.nfi; + int nfj = pdata.nfj; + int nfij = nfi * nfj; +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ij = n*gout_stride+gout_id; + if (ij >= nfij) break; + int j = ij / nfi; + int i = ij % nfi; + out_subblock[i*nao+j] = gout[n]; + } + } + } +} + +static __global__ +void int1e_ipovlp_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds) +{ + int sp_block_id = blockIdx.x; + int thread_id = threadIdx.x; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.cell0_nbas * envs.bvk_ncells; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int *bas = envs.bas; + int li = bas[ish0*BAS_SLOTS+ANG_OF]; + int lj = bas[jsh0*BAS_SLOTS+ANG_OF]; + int8_t nfi = (li + 1) * (li + 2) / 2; + int8_t nfj = (lj + 1) * (lj + 2) / 2; + int8_t iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int8_t jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + PackedPGTO pdata = {iprim, jprim, nfi, nfj}; + double *env = envs.env; + double *img_coords = envs.img_coords; + + int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj]; + int nsp_per_block = THREADS / gout_stride; + int sp_id = thread_id % nsp_per_block; + int gout_id = thread_id / nsp_per_block; + + int g_size = (li + 2) * (lj + 1); + int gx_len = g_size * nsp_per_block; + extern __shared__ double g[]; + double *gx = g + sp_id; + double *gy = gx + gx_len; + double *gz = gx + gx_len * 2; + double *rjri = gx + gx_len * 3; + gx[0] = PI_POW_1_5; + gy[0] = 1.; + + for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) { + double goutx[GOUT_WIDTH_IP1]; + double gouty[GOUT_WIDTH_IP1]; + double goutz[GOUT_WIDTH_IP1]; +#pragma unroll + for (int n = 0; n < GOUT_WIDTH_IP1; ++n) { + goutx[n] = 0.; + gouty[n] = 0.; + goutz[n] = 0.; + } + int pair_ij = task_id + sp_id; + if (pair_ij >= shl_pair1) { + pair_ij = shl_pair0; + } + int bas_ij = bounds.bas_ij_idx[pair_ij]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + for (int img = 0; img < envs.nimgs; img++) { + if (gout_id == 0) { + double xjL = img_coords[img*3+0]; + double yjL = img_coords[img*3+1]; + double zjL = img_coords[img*3+2]; + double xjxi = rj[0] + xjL - ri[0]; + double yjyi = rj[1] + yjL - ri[1]; + double zjzi = rj[2] + zjL - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nsp_per_block] = xjxi; + rjri[1*nsp_per_block] = yjyi; + rjri[2*nsp_per_block] = zjzi; + rjri[3*nsp_per_block] = rr_ij; + } + int iprim = pdata.iprim; + int jprim = pdata.jprim; + int ijprim = iprim * jprim; + int nfi = pdata.nfi; + int nfj = pdata.nfj; + int nfij = nfi * nfj; + int16_t *idx_i = c_pair_idx + c_pair_offsets[li*L_AUX1]; + int16_t *idy_i = idx_i + nfi; + int16_t *idz_i = idx_i + nfi * 2; + int16_t *idx_j = c_pair_idx + c_pair_offsets[lj*L_AUX1]; + int16_t *idy_j = idx_j + nfj; + int16_t *idz_j = idx_j + nfj * 2; + for (int ijp = 0; ijp < ijprim; ++ijp) { + __syncthreads(); + int ip = ijp % iprim; + int jp = ijp / iprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ai2 = ai * -2; + double aij = ai + aj; + double aj_aij = aj / aij; + if (gout_id == 0) { + double theta = ai * aj_aij; + double theta_rr = theta * rjri[3*nsp_per_block]; + double cicj = ci[ip] * cj[jp]; + gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr); + } + int lij = li + lj + 1; + int stride_j = li + 2; + int i_1 = nsp_per_block; + __syncthreads(); + double s0x, s1x, s2x; + double b = .5 / aij; + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * gx_len; + double xjxi = rjri[n*nsp_per_block]; + double xpa = xjxi * aj_aij; + s0x = _gx[0]; + s1x = xpa * s0x; + _gx[nsp_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = xpa * s1x + i * b * s0x; + _gx[(i+1)*nsp_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nsp_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nsp_per_block]; + _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + __syncthreads(); +#pragma unroll + for (int n = 0; n < GOUT_WIDTH_IP1; ++n) { + int ij = n*gout_stride+gout_id; + if (ij >= nfij) break; + int i = ij % nfi; + int j = ij / nfi; + int ix = idx_i[i]; + int iy = idy_i[i]; + int iz = idz_i[i]; + int jx = idx_j[j]; + int jy = idy_j[j]; + int jz = idz_j[j]; + int addrx = (ix + jx*stride_j) * nsp_per_block; + int addry = (iy + jy*stride_j) * nsp_per_block; + int addrz = (iz + jz*stride_j) * nsp_per_block; + double fx0 = gx[addrx]; + double fy0 = gy[addry]; + double fz0 = gz[addrz]; + double fx1 = ai2 * gx[addrx+i_1]; + double fy1 = ai2 * gy[addry+i_1]; + double fz1 = ai2 * gz[addrz+i_1]; + if (ix > 0) fx1 += ix * gx[addrx-i_1]; + if (iy > 0) fy1 += iy * gy[addry-i_1]; + if (iz > 0) fz1 += iz * gz[addrz-i_1]; + goutx[n] += fx1 * fy0 * fz0; + gouty[n] += fx0 * fy1 * fz0; + goutz[n] += fx0 * fy0 * fz1; + } + } + } + + if (task_id + sp_id < shl_pair1) { + int *ao_loc = envs.ao_loc; + int nbas = envs.cell0_nbas; + int nao = ao_loc[nbas]; + size_t nao2 = nao * nao; + int cell_id = jsh / nbas; + int jshp = jsh % nbas; + int i0 = ao_loc[ish]; + int j0 = ao_loc[jshp]; + double *outx = out + cell_id*nao2*3 + i0 * nao + j0; + double *outy = outx + nao2; + double *outz = outx + nao2 * 2; + int nfi = pdata.nfi; + int nfj = pdata.nfj; + int nfij = nfi * nfj; +#pragma unroll + for (int n = 0; n < GOUT_WIDTH_IP1; ++n) { + int ij = n*gout_stride+gout_id; + if (ij >= nfij) break; + int j = ij / nfi; + int i = ij % nfi; + outx[i*nao+j] = goutx[n]; + outy[i*nao+j] = gouty[n]; + outz[i*nao+j] = goutz[n]; + } + } + } +} + +static __global__ +void int1e_ipkin_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds) +{ + int sp_block_id = blockIdx.x; + int thread_id = threadIdx.x; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.cell0_nbas * envs.bvk_ncells; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int *bas = envs.bas; + int li = bas[ish0*BAS_SLOTS+ANG_OF]; + int lj = bas[jsh0*BAS_SLOTS+ANG_OF]; + int8_t nfi = (li + 1) * (li + 2) / 2; + int8_t nfj = (lj + 1) * (lj + 2) / 2; + int8_t iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int8_t jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + PackedPGTO pdata = {iprim, jprim, nfi, nfj}; + double *env = envs.env; + double *img_coords = envs.img_coords; + + int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj]; + int nsp_per_block = THREADS / gout_stride; + int sp_id = thread_id % nsp_per_block; + int gout_id = thread_id / nsp_per_block; + + int g_size = (li + 4) * (lj + 1); + int gx_len = g_size * nsp_per_block; + extern __shared__ double g[]; + double *gx = g + sp_id; + double *gy = gx + gx_len; + double *gz = gx + gx_len * 2; + double *rjri = gx + gx_len * 3; + gx[0] = PI_POW_1_5; + gy[0] = -.5; + + for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) { + double goutx[GOUT_WIDTH_IP1]; + double gouty[GOUT_WIDTH_IP1]; + double goutz[GOUT_WIDTH_IP1]; +#pragma unroll + for (int n = 0; n < GOUT_WIDTH_IP1; ++n) { + goutx[n] = 0.; + gouty[n] = 0.; + goutz[n] = 0.; + } + int pair_ij = task_id + sp_id; + if (pair_ij >= shl_pair1) { + pair_ij = shl_pair0; + } + int bas_ij = bounds.bas_ij_idx[pair_ij]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + for (int img = 0; img < envs.nimgs; img++) { + if (gout_id == 0) { + double xjL = img_coords[img*3+0]; + double yjL = img_coords[img*3+1]; + double zjL = img_coords[img*3+2]; + double xjxi = rj[0] + xjL - ri[0]; + double yjyi = rj[1] + yjL - ri[1]; + double zjzi = rj[2] + zjL - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nsp_per_block] = xjxi; + rjri[1*nsp_per_block] = yjyi; + rjri[2*nsp_per_block] = zjzi; + rjri[3*nsp_per_block] = rr_ij; + } + int iprim = pdata.iprim; + int jprim = pdata.jprim; + int ijprim = iprim * jprim; + int nfi = pdata.nfi; + int nfj = pdata.nfj; + int nfij = nfi * nfj; + int16_t *idx_i = c_pair_idx + c_pair_offsets[li*L_AUX1]; + int16_t *idy_i = idx_i + nfi; + int16_t *idz_i = idx_i + nfi * 2; + int16_t *idx_j = c_pair_idx + c_pair_offsets[lj*L_AUX1]; + int16_t *idy_j = idx_j + nfj; + int16_t *idz_j = idx_j + nfj * 2; + for (int ijp = 0; ijp < ijprim; ++ijp) { + __syncthreads(); + int ip = ijp % iprim; + int jp = ijp / iprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ai2 = ai * -2; + double aij = ai + aj; + double aj_aij = aj / aij; + if (gout_id == 0) { + double theta = ai * aj_aij; + double theta_rr = theta * rjri[3*nsp_per_block]; + double cicj = ci[ip] * cj[jp]; + gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr); + } + __syncthreads(); + int lij = li + lj + 3; + int stride_j = li + 4; + int i_1 = nsp_per_block; + double s0x, s1x, s2x; + double b = .5 / aij; + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * gx_len; + double xjxi = rjri[n*nsp_per_block]; + double xpa = xjxi * aj_aij; + s0x = _gx[0]; + s1x = xpa * s0x; + _gx[nsp_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = xpa * s1x + i * b * s0x; + _gx[(i+1)*nsp_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nsp_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nsp_per_block]; + _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + __syncthreads(); +#pragma unroll + for (int n = 0; n < GOUT_WIDTH_IP1; ++n) { + int ij = n*gout_stride+gout_id; + if (ij >= nfij) break; + int i = ij % nfi; + int j = ij / nfi; + int ix = idx_i[i]; + int iy = idy_i[i]; + int iz = idz_i[i]; + int jx = idx_j[j]; + int jy = idy_j[j]; + int jz = idz_j[j]; + int addrx = (ix + jx*stride_j) * nsp_per_block; + int addry = (iy + jy*stride_j) * nsp_per_block; + int addrz = (iz + jz*stride_j) * nsp_per_block; + double fx0 = gx[addrx]; + double fy0 = gy[addry]; + double fz0 = gz[addrz]; + double fx1 = ai2 * gx[addrx+i_1]; + double fy1 = ai2 * gy[addry+i_1]; + double fz1 = ai2 * gz[addrz+i_1]; + double fx2 = ai2 * ((ix*2+1)*fx0 + ai2*gx[addrx+i_1*2]); + double fy2 = ai2 * ((iy*2+1)*fy0 + ai2*gy[addry+i_1*2]); + double fz2 = ai2 * ((iz*2+1)*fz0 + ai2*gz[addrz+i_1*2]); + double fx3 = ai2 * ((ix*3+3)*fx1 + ai2*ai2*gx[addrx+i_1*3]); + double fy3 = ai2 * ((iy*3+3)*fy1 + ai2*ai2*gy[addry+i_1*3]); + double fz3 = ai2 * ((iz*3+3)*fz1 + ai2*ai2*gz[addrz+i_1*3]); + if (ix > 0) { + double fx1m = ix * gx[addrx-i_1]; + fx1 += fx1m; + fx3 += ai2*(ix*2+1) * fx1m; + if (ix > 1) { fx2 += ix*(ix-1)*gx[addrx-i_1*2]; fx3 += ai2*(ix-1)*fx1m; } + if (ix > 2) fx3 += ix*(ix-1)*(ix-2) * gx[addrx-i_1*3]; + } + if (iy > 0) { + double fy1m = iy * gy[addry-i_1]; + fy1 += fy1m; + fy3 += ai2*(iy*2+1) * fy1m; + if (iy > 1) { fy2 += iy*(iy-1)*gy[addry-i_1*2]; fy3 += ai2*(iy-1)*fy1m; } + if (iy > 2) fy3 += iy*(iy-1)*(iy-2) * gy[addry-i_1*3]; + } + if (iz > 0) { + double fz1m = iz * gz[addrz-i_1]; + fz1 += fz1m; + fz3 += ai2*(iz*2+1) * fz1m; + if (iz > 1) { fz2 += iz*(iz-1)*gz[addrz-i_1*2]; fz3 += ai2*(iz-1)*fz1m; } + if (iz > 2) fz3 += iz*(iz-1)*(iz-2) * gz[addrz-i_1*3]; + } + goutx[n] += fx3 * fy0 * fz0; + goutx[n] += fx1 * fy2 * fz0; + goutx[n] += fx1 * fy0 * fz2; + gouty[n] += fx2 * fy1 * fz0; + gouty[n] += fx0 * fy3 * fz0; + gouty[n] += fx0 * fy1 * fz2; + goutz[n] += fx2 * fy0 * fz1; + goutz[n] += fx0 * fy2 * fz1; + goutz[n] += fx0 * fy0 * fz3; + } + } + } + + if (task_id + sp_id < shl_pair1) { + int *ao_loc = envs.ao_loc; + int nbas = envs.cell0_nbas; + int nao = ao_loc[nbas]; + size_t nao2 = nao * nao; + int cell_id = jsh / nbas; + int jshp = jsh % nbas; + int i0 = ao_loc[ish]; + int j0 = ao_loc[jshp]; + double *outx = out + cell_id*nao2*3 + i0 * nao + j0; + double *outy = outx + nao2; + double *outz = outx + nao2 * 2; + int nfi = pdata.nfi; + int nfj = pdata.nfj; + int nfij = nfi * nfj; +#pragma unroll + for (int n = 0; n < GOUT_WIDTH_IP1; ++n) { + int ij = n*gout_stride+gout_id; + if (ij >= nfij) break; + int j = ij / nfi; + int i = ij % nfi; + outx[i*nao+j] = goutx[n]; + outy[i*nao+j] = gouty[n]; + outz[i*nao+j] = goutz[n]; + } + } + } +} + +extern "C" { +int PBCint1e_ovlp(double *out, PBCIntEnvVars *envs, int shm_size, + int nbatches_shl_pair, int *bas_ij_idx, + int *shl_pair_offsets, int *gout_stride_lookup) +{ + cudaFuncSetAttribute(int1e_ovlp_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + PBCInt2c2eBounds bounds = { + bas_ij_idx, shl_pair_offsets, gout_stride_lookup, + }; + int1e_ovlp_kernel<<>>(out, *envs, bounds); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in int1e_ovlp kernel: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int PBCint1e_kin(double *out, PBCIntEnvVars *envs, int shm_size, + int nbatches_shl_pair, int *bas_ij_idx, + int *shl_pair_offsets, int *gout_stride_lookup) +{ + cudaFuncSetAttribute(int1e_kin_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + PBCInt2c2eBounds bounds = { + bas_ij_idx, shl_pair_offsets, gout_stride_lookup, + }; + int1e_kin_kernel<<>>(out, *envs, bounds); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in int1e_ovlp kernel: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int PBCint1e_ipovlp(double *out, PBCIntEnvVars *envs, int shm_size, + int nbatches_shl_pair, int *bas_ij_idx, + int *shl_pair_offsets, int *gout_stride_lookup) +{ + cudaFuncSetAttribute(int1e_ipovlp_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + PBCInt2c2eBounds bounds = { + bas_ij_idx, shl_pair_offsets, gout_stride_lookup, + }; + int1e_ipovlp_kernel<<>>(out, *envs, bounds); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in int1e_ipovlp kernel: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int PBCint1e_ipkin(double *out, PBCIntEnvVars *envs, int shm_size, + int nbatches_shl_pair, int *bas_ij_idx, + int *shl_pair_offsets, int *gout_stride_lookup) +{ + cudaFuncSetAttribute(int1e_ipkin_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + PBCInt2c2eBounds bounds = { + bas_ij_idx, shl_pair_offsets, gout_stride_lookup, + }; + int1e_ipkin_kernel<<>>(out, *envs, bounds); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in int1e_ipkin kernel: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} +} diff --git a/gpu4pyscf/lib/pbc/pbc_driver.cu b/gpu4pyscf/lib/pbc/pbc_driver.cu index 3fa0f77cf..3f4484de6 100644 --- a/gpu4pyscf/lib/pbc/pbc_driver.cu +++ b/gpu4pyscf/lib/pbc/pbc_driver.cu @@ -693,7 +693,7 @@ int fill_int2c2e(double *out, PBCIntEnvVars *envs, int shm_size, pbc_int2c2e_kernel<<>>(out, *envs, bounds); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - fprintf(stderr, "CUDA Error in int2ce kernel: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "CUDA Error in int2c2e kernel: %s\n", cudaGetErrorString(err)); return 1; } return 0; diff --git a/gpu4pyscf/pbc/df/fft_jk.py b/gpu4pyscf/pbc/df/fft_jk.py index 8c93bae88..963dd098f 100644 --- a/gpu4pyscf/pbc/df/fft_jk.py +++ b/gpu4pyscf/pbc/df/fft_jk.py @@ -370,7 +370,8 @@ def get_k_e1_kpts(mydf, dm_kpts, kpts=np.zeros((1,3)), exxdiv=None): def _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band=None): from pyscf.pbc.tools.pbc import madelung - s = cp.asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts)) + from gpu4pyscf.pbc.gto.int1e import int1e_ovlp + s = int1e_ovlp(cell, kpts=kpts) m = madelung(cell, kpts) if kpts is None: for i,dm in enumerate(dms): diff --git a/gpu4pyscf/pbc/df/int3c2e.py b/gpu4pyscf/pbc/df/int3c2e.py index 90a731b9f..f502d0b16 100644 --- a/gpu4pyscf/pbc/df/int3c2e.py +++ b/gpu4pyscf/pbc/df/int3c2e.py @@ -17,7 +17,6 @@ ''' import ctypes -import itertools import math import numpy as np import cupy as cp @@ -53,7 +52,6 @@ L_AUX_MAX = 6 GOUT_WIDTH = 45 THREADS = 256 -BVK_CELL_SHELLS = 400 def sr_aux_e2(cell, auxcell, omega, kpts=None, bvk_kmesh=None, j_only=False): r''' @@ -97,7 +95,6 @@ def sr_aux_e2(cell, auxcell, omega, kpts=None, bvk_kmesh=None, j_only=False): else: nf = uniq_l * 2 + 1 c_l_offsets = np.append(0, np.cumsum(c_shell_counts*nf)) - lmax = cell._bas[:,ANG_OF].max() c2s = [cart2sph_by_l(l) for l in range(lmax+1)] aux_coeff = asarray(int3c2e_opt.aux_coeff) diff --git a/gpu4pyscf/pbc/dft/krks.py b/gpu4pyscf/pbc/dft/krks.py index 5f496a78a..8df4ce6e6 100644 --- a/gpu4pyscf/pbc/dft/krks.py +++ b/gpu4pyscf/pbc/dft/krks.py @@ -26,6 +26,7 @@ from pyscf.pbc.dft import krks as krks_cpu from gpu4pyscf.lib import logger, utils from gpu4pyscf.lib.cupy_helper import tag_array, get_avail_mem +from gpu4pyscf.pbc.gto import int1e from gpu4pyscf.pbc.scf import khf from gpu4pyscf.pbc.dft import rks from gpu4pyscf.pbc.dft import multigrid, multigrid_v2 @@ -196,7 +197,7 @@ def get_hcore(self, cell=None, kpts=None): nuc = ni.get_nuc(kpts) if len(cell._ecpbas) > 0: raise NotImplementedError('ECP in PBC SCF') - t = cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpts)) + t = int1e.int1e_kin(cell, kpts) return nuc + t def Gradients(self): diff --git a/gpu4pyscf/pbc/dft/rks.py b/gpu4pyscf/pbc/dft/rks.py index c0cc6e1c7..cc664fd55 100644 --- a/gpu4pyscf/pbc/dft/rks.py +++ b/gpu4pyscf/pbc/dft/rks.py @@ -26,6 +26,7 @@ from pyscf.pbc.dft import rks as rks_cpu from gpu4pyscf.lib import logger, utils from gpu4pyscf.dft import rks as mol_ks +from gpu4pyscf.pbc.gto import int1e from gpu4pyscf.pbc.scf import hf as pbchf, khf from gpu4pyscf.pbc.df.df import GDF from gpu4pyscf.pbc.dft import gen_grid @@ -309,7 +310,8 @@ def get_hcore(self, cell=None, kpt=None): nuc = ni.get_nuc(kpt) if len(cell._ecpbas) > 0: raise NotImplementedError('ECP in PBC SCF') - return nuc + cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpt)) + t = int1e.int1e_kin(cell, kpt) + return nuc + t get_veff = get_veff energy_elec = mol_ks.energy_elec diff --git a/gpu4pyscf/pbc/grad/krhf.py b/gpu4pyscf/pbc/grad/krhf.py index 9f82975c3..444cdf90d 100644 --- a/gpu4pyscf/pbc/grad/krhf.py +++ b/gpu4pyscf/pbc/grad/krhf.py @@ -30,6 +30,7 @@ from gpu4pyscf.pbc.df.aft import _check_kpts from gpu4pyscf.pbc.df import ft_ao from gpu4pyscf.pbc import tools +from gpu4pyscf.pbc.gto import int1e from gpu4pyscf.lib.cupy_helper import contract __all__ = ['Gradients'] @@ -84,7 +85,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None): def get_hcore(cell, kpts): '''Part of the nuclear gradients of core Hamiltonian''' - h1 = cp.asarray(cell.pbc_intor('int1e_ipkin', kpts=kpts)) + h1 = int1e.int1e_ipkin(cell, kpts) dtype = h1.dtype if cell._pseudo: SI = cell.get_SI() @@ -269,7 +270,7 @@ def get_hcore(self, cell=None, kpts=None): def get_ovlp(self, cell=None, kpts=None): if cell is None: cell = self.cell if kpts is None: kpts = self.kpts - return -cp.asarray(cell.pbc_intor('int1e_ipovlp', kpts=kpts)) + return -int1e.int1e_ipovlp(cell, kpts) def get_jk(self, dm=None, kpts=None): if kpts is None: kpts = self.kpts diff --git a/gpu4pyscf/pbc/grad/rhf.py b/gpu4pyscf/pbc/grad/rhf.py index a72f50c34..3b0184c33 100644 --- a/gpu4pyscf/pbc/grad/rhf.py +++ b/gpu4pyscf/pbc/grad/rhf.py @@ -26,6 +26,7 @@ import gpu4pyscf.grad.rhf as mol_rhf from gpu4pyscf.lib.cupy_helper import return_cupy_array from gpu4pyscf.pbc.dft import multigrid_v2 +from gpu4pyscf.pbc.gto import int1e __all__ = ['Gradients'] @@ -73,7 +74,7 @@ def grad_elec( ni = mf._numint assert hasattr(mf, 'xc'), 'HF gradients not supported' de = multigrid_v2.get_veff_ip1(ni, mf.xc, dm0, with_j=True).get() - s1 = cell.pbc_intor('int1e_ipovlp') + s1 = int1e.int1e_ipovlp(cell)[0].get() de += cpu_rhf._contract_vhf_dm(self, s1, dme0) * 2 # the CPU code requires the attribute .rhoG @@ -82,7 +83,7 @@ def grad_elec( de += vpploc_part1_nuc_grad(ni, dm0_cpu) de += pp_int.vpploc_part2_nuc_grad(cell, dm0_cpu) de += pp_int.vppnl_nuc_grad(cell, dm0_cpu) - core_hamiltonian_gradient = cell.pbc_intor("int1e_ipkin") + core_hamiltonian_gradient = int1e.int1e_ipkin(cell)[0].get() kinetic_contribution = cpu_rhf._contract_vhf_dm( self, core_hamiltonian_gradient, dm0_cpu ) diff --git a/gpu4pyscf/pbc/grad/uhf.py b/gpu4pyscf/pbc/grad/uhf.py index f59b3efa8..55258c562 100644 --- a/gpu4pyscf/pbc/grad/uhf.py +++ b/gpu4pyscf/pbc/grad/uhf.py @@ -26,6 +26,7 @@ import gpu4pyscf.pbc.grad.rhf as rhf from gpu4pyscf.lib.cupy_helper import return_cupy_array from gpu4pyscf.pbc.dft import multigrid_v2 +from gpu4pyscf.pbc.gto import int1e __all__ = ['Gradients'] @@ -71,7 +72,7 @@ def grad_elec( ni = mf._numint assert hasattr(mf, 'xc'), 'HF gradients not supported' de = multigrid_v2.get_veff_ip1(ni, mf.xc, dm0, with_j=True).get() - s1 = cell.pbc_intor('int1e_ipovlp') + s1 = int1e.int1e_ipovlp(cell)[0].get() de += _contract_vhf_dm(self, s1, dme0_sf) * 2 # the CPU code requires the attribute .rhoG @@ -80,7 +81,7 @@ def grad_elec( de += vpploc_part1_nuc_grad(ni, dm0_cpu) de += pp_int.vpploc_part2_nuc_grad(cell, dm0_cpu) de += pp_int.vppnl_nuc_grad(cell, dm0_cpu) - core_hamiltonian_gradient = cell.pbc_intor("int1e_ipkin") + core_hamiltonian_gradient = int1e.int1e_ipkin(cell)[0].get() kinetic_contribution = _contract_vhf_dm( self, core_hamiltonian_gradient, dm0_cpu ) diff --git a/gpu4pyscf/pbc/gto/int1e.py b/gpu4pyscf/pbc/gto/int1e.py new file mode 100644 index 000000000..b8caa93db --- /dev/null +++ b/gpu4pyscf/pbc/gto/int1e.py @@ -0,0 +1,229 @@ +# Copyright 2025 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes +import numpy as np +import cupy as cp +from pyscf.gto import ATOM_OF, PTR_COORD +from pyscf.pbc import tools as pbctools +from pyscf.pbc.lib.kpts_helper import is_zero +from pyscf.pbc.tools.k2gamma import translation_vectors_for_kmesh +from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh +from gpu4pyscf.lib import logger +from gpu4pyscf.lib.cupy_helper import contract, asarray, sandwich_dot +from gpu4pyscf.gto.mole import group_basis, PTR_BAS_COORD +from gpu4pyscf.scf.jk import _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE +from gpu4pyscf.pbc.df.ft_ao import libpbc, PBCIntEnvVars +from gpu4pyscf.pbc.df.int3c2e import ( + _estimate_shl_pairs_per_block, fill_triu_bvk_conj, LMAX, L_AUX_MAX, THREADS +) + +__all__ = [ + 'int1e_ovlp', + 'int1e_kin', + 'int1e_ipovlp', + 'int1e_ipkin', +] + +libpbc.PBCint1e_ovlp.restype = ctypes.c_int +libpbc.PBCint1e_kin.restype = ctypes.c_int +libpbc.PBCint1e_ipovlp.restype = ctypes.c_int +libpbc.PBCint1e_ipkin.restype = ctypes.c_int + +def int1e_ovlp(cell, kpts=None, bvk_kmesh=None, opt=None): + opt = _check_opt(cell, kpts, bvk_kmesh, opt) + out = opt.intor('PBCint1e_ovlp', 1, 1, (0, 0)) + return out + +def int1e_kin(cell, kpts=None, bvk_kmesh=None, opt=None): + opt = _check_opt(cell, kpts, bvk_kmesh, opt) + out = opt.intor('PBCint1e_kin', 1, 1, (2, 0)) + return out + +def int1e_ipovlp(cell, kpts=None, bvk_kmesh=None, opt=None): + opt = _check_opt(cell, kpts, bvk_kmesh, opt) + out = opt.intor('PBCint1e_ipovlp', 0, 3, (1, 0)) + return out + +def int1e_ipkin(cell, kpts=None, bvk_kmesh=None, opt=None): + opt = _check_opt(cell, kpts, bvk_kmesh, opt) + out = opt.intor('PBCint1e_ipkin', 0, 3, (3, 0)) + return out + +def _check_opt(cell, kpts, bvk_kmesh, opt): + if opt is None: + opt = _Int1eOpt(cell, kpts, bvk_kmesh) + else: + assert kpts is opt.kpts + return opt + +class _Int1eOpt: + def __init__(self, cell, kpts=None, bvk_kmesh=None): + self.cell = cell + sorted_cell, coeff, uniq_l_ctr, l_ctr_counts = group_basis(cell, tile=1) + uniq_l = uniq_l_ctr[:,0] + lmax = uniq_l.max() + assert lmax <= LMAX + self.sorted_cell = sorted_cell + self.coeff = coeff + self.uniq_l_ctr = uniq_l_ctr + self.l_ctr_counts = l_ctr_counts + + if bvk_kmesh is None: + if kpts is None: + bvk_kmesh = np.ones(3, dtype=np.int32) + else: + bvk_kmesh = kpts_to_kmesh(cell, kpts.reshape(-1, 3)) + self.kpts = kpts + self.bvk_kmesh = bvk_kmesh + bvk_ncells = np.prod(bvk_kmesh) + if bvk_ncells == 1: + bvkcell = sorted_cell + else: + bvkcell = pbctools.super_cell(sorted_cell, bvk_kmesh, wrap_around=True) + # PTR_BAS_COORD was not initialized in pbctools.supe_rcell + bvkcell._bas[:,PTR_BAS_COORD] = bvkcell._atm[bvkcell._bas[:,ATOM_OF],PTR_COORD] + self.bvkcell = bvkcell + + Ls = asarray(bvkcell.get_lattice_Ls(rcut=cell.rcut)) + Ls = Ls[cp.linalg.norm(Ls-.5, axis=1).argsort()] + nimgs = len(Ls) + + _atm = cp.array(bvkcell._atm, dtype=np.int32) + _bas = cp.array(bvkcell._bas, dtype=np.int32) + _env = cp.array(_scale_sp_ctr_coeff(bvkcell), dtype=np.float64) + ao_loc = bvkcell.ao_loc_nr(cart=True) + ao_loc_gpu = cp.array(ao_loc, dtype=np.int32) + int1e_envs = PBCIntEnvVars( + sorted_cell.natm, sorted_cell.nbas, bvk_ncells, nimgs, + _atm.data.ptr, _bas.data.ptr, _env.data.ptr, + ao_loc_gpu.data.ptr, Ls.data.ptr, + ) + int1e_envs._env_ref_holder = (_atm, _bas, _env, ao_loc_gpu, Ls) + self.int1e_envs = int1e_envs + + def generate_shl_pairs(self, hermi=1): + sorted_cell = self.sorted_cell + l_ctr_offsets = np.append(0, np.cumsum(self.l_ctr_counts)) + uniq_l = self.uniq_l_ctr[:,0] + bas_ij_idx = [] # The effective shell pair = ish*nbas+jsh + shl_pair_offsets = [] # the bas_ij_idx offset for each blockIdx.x + sp0 = sp1 = 0 + nbas = sorted_cell.nbas + groups = len(uniq_l) + if hermi == 1: + ij_tasks = [(i, j) for i in range(groups) for j in range(i+1)] + else: + ij_tasks = [(i, j) for i in range(groups) for j in range(groups)] + bvk_ncells = np.prod(self.bvk_kmesh) + img = cp.arange(bvk_ncells, dtype=np.int32) + img_offsets = img * nbas + for i, j in ij_tasks: + li = uniq_l[i] + lj = uniq_l[j] + ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1] + jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1] + ish = cp.arange(ish0, ish1, dtype=np.int32) + jsh = cp.arange(jsh0, jsh1, dtype=np.int32) + ijsh = ish[:,None] * (nbas*bvk_ncells) + jsh + if hermi and i == j: + ijsh = ijsh[cp.tril_indices(ish1-ish0)] + else: + ijsh = ijsh.ravel() + idx = (img_offsets[:,None] + ijsh).ravel() + nshl_pair = len(idx) + bas_ij_idx.append(idx) + sp0, sp1 = sp1, sp1 + nshl_pair + nsp_per_block = _estimate_shl_pairs_per_block(li, lj, nshl_pair) + shl_pair_offsets.append(np.arange(sp0, sp1, nsp_per_block, dtype=np.int32)) + + shl_pair_offsets.append(np.array([sp1], dtype=np.int32)) + shl_pair_offsets = cp.array(np.hstack(shl_pair_offsets), dtype=np.int32) + bas_ij_idx = cp.array(cp.hstack(bas_ij_idx), dtype=np.int32) + return bas_ij_idx, shl_pair_offsets + + def create_gout_stride_lookup_table(self, deriv=None, gout_width=36): + # gout_width should be identical to the setting in cuda kernel + # based on the shm_size, find optimal gout_stride for each (li,lj) + # pattern, store them in the gout_stride_lookup + if deriv is None: + deriv = (0, 0) + i_inc, j_inc = deriv + lmax = self.uniq_l_ctr[:,0].max() + gout_stride_lookup = np.empty([L_AUX_MAX+1,L_AUX_MAX+1], dtype=np.int32) + gout_width = gout_width + shm_size = SHM_SIZE + ls = np.arange(lmax+1) + nf = (ls+1) * (ls+2) // 2 + max_shm_size = 0 + for li in range(lmax+1): + for lj in range(lmax+1): + unit = (li+1+i_inc)*(lj+1+j_inc)*3 + 4 + nsp_max = _nearest_power2(shm_size // (unit*8)) + gout_size = nf[li] * nf[lj] + gout_stride = (gout_size+gout_width-1) / gout_width + # Round up to the next 2^n + gout_stride = _nearest_power2(gout_stride, return_leq=False) + nsp_per_block = min(nsp_max, THREADS // gout_stride) + gout_stride_lookup[li, lj] = THREADS // nsp_per_block + max_shm_size = max(max_shm_size, nsp_per_block*unit*8) + return cp.array(gout_stride_lookup, dtype=np.int32), max_shm_size + + def intor(self, kern, hermi, comp, deriv_ij): + if comp == 1: + gout_width = 36 + else: + gout_width = 18 + + sorted_cell = self.sorted_cell + int1e_envs = self.int1e_envs + nao_cart, nao = self.coeff.shape + bas_ij_idx, shl_pair_offsets = self.generate_shl_pairs(hermi) + nbatches_shl_pair = len(shl_pair_offsets) - 1 + gout_stride_lookup, shm_size = self.create_gout_stride_lookup_table( + deriv_ij, gout_width) + bvk_kmesh = self.bvk_kmesh + bvk_ncells = np.prod(bvk_kmesh) + out = cp.empty((bvk_ncells, comp, nao_cart, nao_cart)) + drv = getattr(libpbc, kern) + err = drv( + ctypes.cast(out.data.ptr, ctypes.c_void_p), + ctypes.byref(int1e_envs), ctypes.c_int(shm_size), + ctypes.c_int(nbatches_shl_pair), + ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p), + ctypes.cast(gout_stride_lookup.data.ptr, ctypes.c_void_p), + sorted_cell._atm.ctypes, ctypes.c_int(sorted_cell.natm), + sorted_cell._bas.ctypes, ctypes.c_int(sorted_cell.nbas), + sorted_cell._env.ctypes) + if err != 0: + raise RuntimeError(f'{kern} failed') + + if hermi == 1: + assert comp == 1 + out = fill_triu_bvk_conj(out, nao_cart, bvk_kmesh) + out = sandwich_dot(out.reshape(-1,nao_cart,nao_cart), self.coeff) + out = out.reshape(bvk_ncells, comp, nao, nao) + + if self.kpts is not None and not is_zero(self.kpts): + bvkmesh_Ls = translation_vectors_for_kmesh(self.cell, bvk_kmesh, True) + kpts = self.kpts.reshape(-1, 3) + expLk = cp.exp(1j*asarray(bvkmesh_Ls.dot(kpts.T))) + out = contract('lk,lxpq->kxpq', expLk, out) + + if comp == 1: + out = out[:,0] + if self.kpts is not None and self.kpts.ndim == 1: + out = out[0] + return out diff --git a/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py b/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py new file mode 100644 index 000000000..8a0034242 --- /dev/null +++ b/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py @@ -0,0 +1,141 @@ +# Copyright 2025 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pyscf +from gpu4pyscf.pbc.gto import int1e + +def test_int1e_ovlp(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + C3 0. 0. 0. + ''', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*2, + basis='def2-tzvpp', + ) + kmesh = [6, 1, 1] + kpts = cell.make_kpts(kmesh) + pcell = cell.copy() + pcell.precision = 1e-14 + pcell.build(0, 0) + ref = pcell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts) + + dat = int1e.int1e_ovlp(cell).get()[0] + assert abs(dat - ref[0]).max() < 1e-10 + + dat = int1e.int1e_ovlp(cell, kpts=kpts).get() + assert abs(dat - ref).max() < 1e-10 + +def test_int1e_kin(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + C3 0. 0. 0. + ''', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*2, + basis='def2-tzvpp', + ) + kmesh = [6, 1, 1] + kpts = cell.make_kpts(kmesh) + pcell = cell.copy() + pcell.precision = 1e-14 + pcell.build(0, 0) + ref = pcell.pbc_intor('int1e_kin', hermi=1, kpts=kpts) + + dat = int1e.int1e_kin(cell).get()[0] + assert abs(dat - ref[0]).max() < 1e-10 + + dat = int1e.int1e_kin(cell, kpts=kpts).get() + assert abs(dat - ref).max() < 1e-10 + +def test_int1e_ipovlp(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + C3 0. 0. 0. + ''', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*2, + basis='def2-tzvpp', + ) + kmesh = [6, 1, 1] + kpts = cell.make_kpts(kmesh) + pcell = cell.copy() + pcell.precision = 1e-14 + pcell.build(0, 0) + ref = pcell.pbc_intor('int1e_ipovlp', hermi=0, kpts=kpts) + + dat = int1e.int1e_ipovlp(cell).get()[0] + assert abs(dat - ref[0]).max() < 1e-10 + + dat = int1e.int1e_ipovlp(cell, kpts=kpts).get() + assert abs(dat - ref).max() < 1e-10 + +def test_int1e_ipkin(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + C3 0. 0. 0. + ''', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*2, + basis='def2-tzvpp', + ) + kmesh = [6, 1, 1] + kpts = cell.make_kpts(kmesh) + pcell = cell.copy() + pcell.precision = 1e-14 + pcell.build(0, 0) + ref = pcell.pbc_intor('int1e_ipkin', hermi=0, kpts=kpts) + + dat = int1e.int1e_ipkin(cell).get()[0] + assert abs(dat - ref[0]).max() < 1e-10 + + dat = int1e.int1e_ipkin(cell, kpts=kpts).get() + assert abs(dat - ref).max() < 1e-10 + +def test_int1e_ovlp1(): + L = 4 + n = 21 + cell = pyscf.M(unit = 'B', + precision = 1e-10, + a = ((L,0,0),(0,L,0),(0,0,L)), + mesh = [n,n,n], + atom = [['He', (L/2.-.5,L/2.,L/2.-.5)], + ['He', (L/2. ,L/2.,L/2.+.5)]], + basis = { 'He': [[0, (0.8, 1.0)], + [0, (1.0, 1.0)], + [0, (1.2, 1.0)]]}) + nk = [2, 2, 1] + kpts = cell.make_kpts(nk, wrap_around=True) + s = int1e.int1e_ovlp(cell, kpts) + ref = cell.pbc_intor('int1e_ovlp', kpts=kpts) + assert abs(s.get() - ref).max() < 1e-10 + k = int1e.int1e_kin(cell, kpts) + ref = cell.pbc_intor('int1e_kin', kpts=kpts) + assert abs(k.get() - ref).max() < 1e-10 + + nk = [5, 4, 1] + kpts = cell.make_kpts(nk, wrap_around=True)[[3, 8, 11]] + s = int1e.int1e_ovlp(cell, kpts) + ref = cell.pbc_intor('int1e_ovlp', kpts=kpts) + assert abs(s.get() - ref).max() < 1e-10 + k = int1e.int1e_kin(cell, kpts) + ref = cell.pbc_intor('int1e_kin', kpts=kpts) + assert abs(k.get() - ref).max() < 1e-10 + +test_int1e_ovlp1() diff --git a/gpu4pyscf/pbc/scf/hf.py b/gpu4pyscf/pbc/scf/hf.py index e6ced9369..0c8a80a75 100644 --- a/gpu4pyscf/pbc/scf/hf.py +++ b/gpu4pyscf/pbc/scf/hf.py @@ -28,6 +28,7 @@ from gpu4pyscf.lib.cupy_helper import return_cupy_array, contract from gpu4pyscf.scf import hf as mol_hf from gpu4pyscf.pbc import df +from gpu4pyscf.pbc.gto import int1e def get_bands(mf, kpts_band, cell=None, dm=None, kpt=None): '''Get energy bands at the given (arbitrary) 'band' k-points. @@ -149,18 +150,22 @@ def kpt(self, x): get_bands = get_bands get_rho = get_rho - get_ovlp = return_cupy_array(hf_cpu.SCF.get_ovlp) + def get_ovlp(self, cell=None, kpt=None): + if kpt is None: kpt = self.kpt + if cell is None: cell = self.cell + return int1e.int1e_ovlp(cell, kpt) def get_hcore(self, cell=None, kpt=None): - if cell is None: cell = self.cell if kpt is None: kpt = self.kpt + if cell is None: cell = self.cell if cell.pseudo: nuc = self.with_df.get_pp(kpt) else: nuc = self.with_df.get_nuc(kpt) if len(cell._ecpbas) > 0: raise NotImplementedError('ECP in PBC SCF') - return nuc + cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpt)) + t = int1e.int1e_kin(cell, kpt) + return nuc + t def get_jk(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None, with_j=True, with_k=True, omega=None, **kwargs): diff --git a/gpu4pyscf/pbc/scf/khf.py b/gpu4pyscf/pbc/scf/khf.py index 61674cdad..0472491c4 100644 --- a/gpu4pyscf/pbc/scf/khf.py +++ b/gpu4pyscf/pbc/scf/khf.py @@ -30,6 +30,7 @@ from gpu4pyscf.scf import hf as mol_hf from gpu4pyscf.pbc.scf import hf as pbchf from gpu4pyscf.pbc import df +from gpu4pyscf.pbc.gto import int1e def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None, diis_start_cycle=None, level_shift_factor=None, damp_factor=None, @@ -253,6 +254,11 @@ def __init__(self, cell, kpts=None, exxdiv='ewald'): build = khf_cpu.KSCF.build reset = pbchf.SCF.reset + def get_ovlp(self, cell=None, kpts=None): + if cell is None: cell = self.cell + if kpts is None: kpts = self.kpts + return int1e.int1e_ovlp(cell, kpts) + def get_hcore(self, cell=None, kpts=None): if cell is None: cell = self.cell if kpts is None: kpts = self.kpts @@ -262,7 +268,7 @@ def get_hcore(self, cell=None, kpts=None): nuc = self.with_df.get_nuc(kpts) if len(cell._ecpbas) > 0: raise NotImplementedError('ECP in PBC SCF') - t = cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpts)) + t = int1e.int1e_kin(cell, kpts) return nuc + t def get_j(self, cell=None, dm_kpts=None, hermi=1, kpts=None, @@ -327,7 +333,6 @@ def make_rdm1(self, mo_coeff_kpts=None, mo_occ_kpts=None, **kwargs): make_rdm2 = NotImplemented init_direct_scf = NotImplemented - get_ovlp = return_cupy_array(khf_cpu.get_ovlp) get_fock = get_fock get_fermi = get_fermi get_occ = get_occ diff --git a/gpu4pyscf/pbc/tools/pbc.py b/gpu4pyscf/pbc/tools/pbc.py index 6f5c93977..2718ddfe5 100644 --- a/gpu4pyscf/pbc/tools/pbc.py +++ b/gpu4pyscf/pbc/tools/pbc.py @@ -14,6 +14,9 @@ import numpy as np import cupy as cp +import pyscf +from pyscf import lib +from pyscf.pbc.gto.cell import Cell from gpu4pyscf.lib.cupy_helper import asarray def fft(f, mesh): @@ -287,3 +290,69 @@ def get_coulG(cell, k=np.zeros(3), exx=False, mf=None, mesh=None, Gv=None, else: # for RangeSeparatedJKBuilder coulG[G0_idx] += Nk*cell.vol*madelung(cell, kpts, omega=0) return coulG + +def get_lattice_Ls(cell, nimgs=None, rcut=None, dimension=None, discard=True): + '''This version employs more strict criteria when discarding images in lattice sum. + It can be replaced by the built-in version available in PySCF 2.10. + ''' + if dimension is None: + # For atoms near the boundary of the cell, it is necessary (even in low- + # dimensional systems) to include lattice translations in all 3 dimensions. + if cell.dimension < 2 or cell.low_dim_ft_type == 'inf_vacuum': + dimension = cell.dimension + else: + dimension = 3 + if rcut is None: + rcut = cell.rcut + + if dimension == 0 or rcut <= 0 or cell.natm == 0: + return np.zeros((1, 3)) + + a = cell.lattice_vectors() + + scaled_atom_coords = cell.get_scaled_atom_coords() + atom_boundary_max = scaled_atom_coords[:,:dimension].max(axis=0) + atom_boundary_min = scaled_atom_coords[:,:dimension].min(axis=0) + if (np.any(atom_boundary_max > 1) or np.any(atom_boundary_min < -1)): + atom_boundary_max[atom_boundary_max > 1] = 1 + atom_boundary_min[atom_boundary_min <-1] = -1 + ovlp_penalty = atom_boundary_max - atom_boundary_min + dR = ovlp_penalty.dot(a[:dimension]) + dR_basis = np.diag(dR) + + # Search the minimal x,y,z requiring |x*a[0]+y*a[1]+z*a[2]+dR|^2 > rcut^2 + # Ls boundary should be derived by decomposing (a, Rij) for each atom-pair. + # For reasons unclear, the so-obtained Ls boundary seems not large enough. + # The upper-bound of the Ls boundary is generated by find_boundary function. + def find_boundary(a): + aR = np.vstack([a, dR_basis]) + r = np.linalg.qr(aR.T)[1] + ub = (rcut + abs(r[2,3:]).sum()) / abs(r[2,2]) + return ub + + xb = find_boundary(a[[1,2,0]]) + if dimension > 1: + yb = find_boundary(a[[2,0,1]]) + else: + yb = 0 + if dimension > 2: + zb = find_boundary(a) + else: + zb = 0 + bounds = np.ceil([xb, yb, zb]).astype(int) + Ts = lib.cartesian_prod((np.arange(-bounds[0], bounds[0]+1), + np.arange(-bounds[1], bounds[1]+1), + np.arange(-bounds[2], bounds[2]+1))) + Ls = np.dot(Ts[:,:dimension], a[:dimension]) + + if discard and len(Ls) > 1: + r = cell.atom_coords() + rr = r[:,None] - r + dist_max = np.linalg.norm(rr, axis=2).max() + Ls_mask = np.linalg.norm(Ls, axis=1) < rcut + dist_max + Ls = Ls[Ls_mask] + return np.asarray(Ls, order='C') + +if int(pyscf.__version__.split('.')[1]) < 10: + # Patch the get_lattice_Ls for pyscf-2.9 or older + Cell.get_lattice_Ls = get_lattice_Ls