diff --git a/gpu4pyscf/lib/pbc/CMakeLists.txt b/gpu4pyscf/lib/pbc/CMakeLists.txt
index 66b08c190..7b763bb29 100644
--- a/gpu4pyscf/lib/pbc/CMakeLists.txt
+++ b/gpu4pyscf/lib/pbc/CMakeLists.txt
@@ -3,6 +3,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=12
 add_library(pbc SHARED
   pbc_driver.cu ft_ao.cu unrolled_ft_ao.cu ft_ao_bdiv.cu
   fill_int3c2e.cu unrolled_int3c2e.cu fill_int2c2e.cu
+  overlap.cu
   estimator.cu
   rys_roots_dat.cu
   nr_eval_gto.cu
diff --git a/gpu4pyscf/lib/pbc/fill_int2c2e.cu b/gpu4pyscf/lib/pbc/fill_int2c2e.cu
index aab2332d8..57de092fb 100644
--- a/gpu4pyscf/lib/pbc/fill_int2c2e.cu
+++ b/gpu4pyscf/lib/pbc/fill_int2c2e.cu
@@ -57,7 +57,7 @@ void pbc_int2c2e_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds
     double *env = envs.env;
     double *img_coords = envs.img_coords;
 
-    int gout_stride = bounds.gout_stride_lookup[lj*L_AUX1+li];
+    int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj];
     int nsp_per_block = THREADS / gout_stride;
     int sp_id = thread_id % nsp_per_block;
     int gout_id = thread_id / nsp_per_block;
diff --git a/gpu4pyscf/lib/pbc/overlap.cu b/gpu4pyscf/lib/pbc/overlap.cu
new file mode 100644
index 000000000..caaaa4d83
--- /dev/null
+++ b/gpu4pyscf/lib/pbc/overlap.cu
@@ -0,0 +1,866 @@
+/*
+ * Copyright 2025 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "pbc.cuh"
+#include "int3c2e.cuh"
+
+#define PI_POW_1_5       5.568327996831707845
+
+typedef struct {
+    int8_t iprim;
+    int8_t jprim;
+    int8_t nfi;
+    int8_t nfj;
+} PackedPGTO;
+
+#define GOUT_WIDTH      36
+#define GOUT_WIDTH_IP1  18
+
+static __global__
+void int1e_ovlp_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds)
+{
+    int sp_block_id = blockIdx.x;
+    int thread_id = threadIdx.x;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
+    int nbas = envs.cell0_nbas * envs.bvk_ncells;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int *bas = envs.bas;
+    int li = bas[ish0*BAS_SLOTS+ANG_OF];
+    int lj = bas[jsh0*BAS_SLOTS+ANG_OF];
+    int8_t nfi = (li + 1) * (li + 2) / 2;
+    int8_t nfj = (lj + 1) * (lj + 2) / 2;
+    int8_t iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int8_t jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    PackedPGTO pdata = {iprim, jprim, nfi, nfj};
+    double *env = envs.env;
+    double *img_coords = envs.img_coords;
+
+    int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj];
+    int nsp_per_block = THREADS / gout_stride;
+    int sp_id = thread_id % nsp_per_block;
+    int gout_id = thread_id / nsp_per_block;
+
+    int g_size = (li + 1) * (lj + 1);
+    int gx_len = g_size * nsp_per_block;
+    extern __shared__ double g[];
+    double *gx = g + sp_id;
+    double *gy = gx + gx_len;
+    double *gz = gx + gx_len * 2;
+    double *rjri = gx + gx_len * 3;
+    gx[0] = PI_POW_1_5;
+    gy[0] = 1.;
+
+    for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) {
+        double gout[GOUT_WIDTH];
+#pragma unroll
+        for (int n = 0; n < GOUT_WIDTH; ++n) {
+            gout[n] = 0.;
+        }
+        int pair_ij = task_id + sp_id;
+        if (pair_ij >= shl_pair1) {
+            pair_ij = shl_pair0;
+        }
+        int bas_ij = bounds.bas_ij_idx[pair_ij];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        for (int img = 0; img < envs.nimgs; img++) {
+            if (gout_id == 0) {
+                double xjL = img_coords[img*3+0];
+                double yjL = img_coords[img*3+1];
+                double zjL = img_coords[img*3+2];
+                double xjxi = rj[0] + xjL - ri[0];
+                double yjyi = rj[1] + yjL - ri[1];
+                double zjzi = rj[2] + zjL - ri[2];
+                double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+                rjri[0*nsp_per_block] = xjxi;
+                rjri[1*nsp_per_block] = yjyi;
+                rjri[2*nsp_per_block] = zjzi;
+                rjri[3*nsp_per_block] = rr_ij;
+            }
+            int iprim = pdata.iprim;
+            int jprim = pdata.jprim;
+            int ijprim = iprim * jprim;
+            int nfi = pdata.nfi;
+            int nfj = pdata.nfj;
+            int nfij = nfi * nfj;
+            int16_t *idx_ij = c_pair_idx + c_pair_offsets[li*L_AUX1+lj];
+            int16_t *idy_ij = idx_ij + nfij;
+            int16_t *idz_ij = idx_ij + nfij * 2;
+            for (int ijp = 0; ijp < ijprim; ++ijp) {
+                __syncthreads();
+                int ip = ijp % iprim;
+                int jp = ijp / iprim;
+                double ai = expi[ip];
+                double aj = expj[jp];
+                double aij = ai + aj;
+                double aj_aij = aj / aij;
+                if (gout_id == 0) {
+                    double theta = ai * aj_aij;
+                    double theta_rr = theta * rjri[3*nsp_per_block];
+                    double cicj = ci[ip] * cj[jp];
+                    gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr);
+                }
+                int lij = li + lj;
+                int stride_j = li + 1;
+                if (lij > 0) {
+                    __syncthreads();
+                    double s0x, s1x, s2x;
+                    double b = .5 / aij;
+                    for (int n = gout_id; n < 3; n += gout_stride) {
+                        double *_gx = gx + n * gx_len;
+                        double xjxi = rjri[n*nsp_per_block];
+                        double xpa = xjxi * aj_aij;
+                        s0x = _gx[0];
+                        s1x = xpa * s0x;
+                        _gx[nsp_per_block] = s1x;
+                        for (int i = 1; i < lij; ++i) {
+                            s2x = xpa * s1x + i * b * s0x;
+                            _gx[(i+1)*nsp_per_block] = s2x;
+                            s0x = s1x;
+                            s1x = s2x;
+                        }
+                        for (int j = 0; j < lj; ++j) {
+                            int ij = (lij-j) + j*stride_j;
+                            s1x = _gx[ij*nsp_per_block];
+                            for (--ij; ij >= j*stride_j; --ij) {
+                                s0x = _gx[ij*nsp_per_block];
+                                _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x;
+                                s1x = s0x;
+                            }
+                        }
+                    }
+                }
+                __syncthreads();
+#pragma unroll
+                for (int n = 0; n < GOUT_WIDTH; ++n) {
+                    int ij = n*gout_stride+gout_id;
+                    if (ij >= nfij) break;
+                    int addrx = idx_ij[ij] * nsp_per_block;
+                    int addry = idy_ij[ij] * nsp_per_block;
+                    int addrz = idz_ij[ij] * nsp_per_block;
+                    gout[n] += gx[addrx] * gy[addry] * gz[addrz];
+                }
+            }
+        }
+
+        if (task_id + sp_id < shl_pair1) {
+            int *ao_loc = envs.ao_loc;
+            int nbas = envs.cell0_nbas;
+            int nao = ao_loc[nbas];
+            size_t nao2 = nao * nao;
+            int cell_id = jsh / nbas;
+            int jshp = jsh % nbas;
+            int i0 = ao_loc[ish];
+            int j0 = ao_loc[jshp];
+            double *out_subblock = out + cell_id*nao2 + i0 * nao + j0;
+            int nfi = pdata.nfi;
+            int nfj = pdata.nfj;
+            int nfij = nfi * nfj;
+#pragma unroll
+            for (int n = 0; n < GOUT_WIDTH; ++n) {
+                int ij = n*gout_stride+gout_id;
+                if (ij >= nfij) break;
+                int j = ij / nfi;
+                int i = ij % nfi;
+                out_subblock[i*nao+j] = gout[n];
+            }
+        }
+    }
+}
+
+static __global__
+void int1e_kin_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds)
+{
+    int sp_block_id = blockIdx.x;
+    int thread_id = threadIdx.x;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
+    int nbas = envs.cell0_nbas * envs.bvk_ncells;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int *bas = envs.bas;
+    int li = bas[ish0*BAS_SLOTS+ANG_OF];
+    int lj = bas[jsh0*BAS_SLOTS+ANG_OF];
+    int8_t nfi = (li + 1) * (li + 2) / 2;
+    int8_t nfj = (lj + 1) * (lj + 2) / 2;
+    int8_t iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int8_t jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    PackedPGTO pdata = {iprim, jprim, nfi, nfj};
+    double *env = envs.env;
+    double *img_coords = envs.img_coords;
+
+    int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj];
+    int nsp_per_block = THREADS / gout_stride;
+    int sp_id = thread_id % nsp_per_block;
+    int gout_id = thread_id / nsp_per_block;
+
+    int g_size = (li + 3) * (lj + 1);
+    int gx_len = g_size * nsp_per_block;
+    extern __shared__ double g[];
+    double *gx = g + sp_id;
+    double *gy = gx + gx_len;
+    double *gz = gx + gx_len * 2;
+    double *rjri = gx + gx_len * 3;
+    gx[0] = PI_POW_1_5;
+    gy[0] = -.5;
+
+    for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) {
+        double gout[GOUT_WIDTH];
+#pragma unroll
+        for (int n = 0; n < GOUT_WIDTH; ++n) {
+            gout[n] = 0.;
+        }
+        int pair_ij = task_id + sp_id;
+        if (pair_ij >= shl_pair1) {
+            pair_ij = shl_pair0;
+        }
+        int bas_ij = bounds.bas_ij_idx[pair_ij];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        for (int img = 0; img < envs.nimgs; img++) {
+            if (gout_id == 0) {
+                double xjL = img_coords[img*3+0];
+                double yjL = img_coords[img*3+1];
+                double zjL = img_coords[img*3+2];
+                double xjxi = rj[0] + xjL - ri[0];
+                double yjyi = rj[1] + yjL - ri[1];
+                double zjzi = rj[2] + zjL - ri[2];
+                double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+                rjri[0*nsp_per_block] = xjxi;
+                rjri[1*nsp_per_block] = yjyi;
+                rjri[2*nsp_per_block] = zjzi;
+                rjri[3*nsp_per_block] = rr_ij;
+            }
+            int iprim = pdata.iprim;
+            int jprim = pdata.jprim;
+            int ijprim = iprim * jprim;
+            int nfi = pdata.nfi;
+            int nfj = pdata.nfj;
+            int nfij = nfi * nfj;
+            int16_t *idx_i = c_pair_idx + c_pair_offsets[li*L_AUX1];
+            int16_t *idy_i = idx_i + nfi;
+            int16_t *idz_i = idx_i + nfi * 2;
+            int16_t *idx_j = c_pair_idx + c_pair_offsets[lj*L_AUX1];
+            int16_t *idy_j = idx_j + nfj;
+            int16_t *idz_j = idx_j + nfj * 2;
+            for (int ijp = 0; ijp < ijprim; ++ijp) {
+                __syncthreads();
+                int ip = ijp % iprim;
+                int jp = ijp / iprim;
+                double ai = expi[ip];
+                double aj = expj[jp];
+                double ai2 = ai * -2;
+                double aij = ai + aj;
+                double aj_aij = aj / aij;
+                if (gout_id == 0) {
+                    double theta = ai * aj_aij;
+                    double theta_rr = theta * rjri[3*nsp_per_block];
+                    double cicj = ci[ip] * cj[jp];
+                    gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr);
+                }
+                __syncthreads();
+                int lij = li + lj + 2;
+                int stride_j = li + 3;
+                int i_1 = nsp_per_block;
+                double s0x, s1x, s2x;
+                double b = .5 / aij;
+                for (int n = gout_id; n < 3; n += gout_stride) {
+                    double *_gx = gx + n * gx_len;
+                    double xjxi = rjri[n*nsp_per_block];
+                    double xpa = xjxi * aj_aij;
+                    s0x = _gx[0];
+                    s1x = xpa * s0x;
+                    _gx[nsp_per_block] = s1x;
+                    for (int i = 1; i < lij; ++i) {
+                        s2x = xpa * s1x + i * b * s0x;
+                        _gx[(i+1)*nsp_per_block] = s2x;
+                        s0x = s1x;
+                        s1x = s2x;
+                    }
+                    for (int j = 0; j < lj; ++j) {
+                        int ij = (lij-j) + j*stride_j;
+                        s1x = _gx[ij*nsp_per_block];
+                        for (--ij; ij >= j*stride_j; --ij) {
+                            s0x = _gx[ij*nsp_per_block];
+                            _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x;
+                            s1x = s0x;
+                        }
+                    }
+                }
+                __syncthreads();
+#pragma unroll
+                for (int n = 0; n < GOUT_WIDTH; ++n) {
+                    int ij = n*gout_stride+gout_id;
+                    if (ij >= nfij) break;
+                    int i = ij % nfi;
+                    int j = ij / nfi;
+                    int ix = idx_i[i];
+                    int iy = idy_i[i];
+                    int iz = idz_i[i];
+                    int jx = idx_j[j];
+                    int jy = idy_j[j];
+                    int jz = idz_j[j];
+                    int addrx = (ix + jx*stride_j) * nsp_per_block;
+                    int addry = (iy + jy*stride_j) * nsp_per_block;
+                    int addrz = (iz + jz*stride_j) * nsp_per_block;
+                    double fx0 = gx[addrx];
+                    double fy0 = gy[addry];
+                    double fz0 = gz[addrz];
+                    double fx2 = ai2 * ((ix*2+1)*fx0 + ai2*gx[addrx+i_1*2]);
+                    double fy2 = ai2 * ((iy*2+1)*fy0 + ai2*gy[addry+i_1*2]);
+                    double fz2 = ai2 * ((iz*2+1)*fz0 + ai2*gz[addrz+i_1*2]);
+                    if (ix > 1) fx2 += ix*(ix-1) * gx[addrx-i_1*2];
+                    if (iy > 1) fy2 += iy*(iy-1) * gy[addry-i_1*2];
+                    if (iz > 1) fz2 += iz*(iz-1) * gz[addrz-i_1*2];
+                    gout[n] += fx2 * fy0 * fz0;
+                    gout[n] += fx0 * fy2 * fz0;
+                    gout[n] += fx0 * fy0 * fz2;
+                }
+            }
+        }
+
+        if (task_id + sp_id < shl_pair1) {
+            int *ao_loc = envs.ao_loc;
+            int nbas = envs.cell0_nbas;
+            int nao = ao_loc[nbas];
+            size_t nao2 = nao * nao;
+            int cell_id = jsh / nbas;
+            int jshp = jsh % nbas;
+            int i0 = ao_loc[ish];
+            int j0 = ao_loc[jshp];
+            double *out_subblock = out + cell_id*nao2 + i0 * nao + j0;
+            int nfi = pdata.nfi;
+            int nfj = pdata.nfj;
+            int nfij = nfi * nfj;
+#pragma unroll
+            for (int n = 0; n < GOUT_WIDTH; ++n) {
+                int ij = n*gout_stride+gout_id;
+                if (ij >= nfij) break;
+                int j = ij / nfi;
+                int i = ij % nfi;
+                out_subblock[i*nao+j] = gout[n];
+            }
+        }
+    }
+}
+
+static __global__
+void int1e_ipovlp_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds)
+{
+    int sp_block_id = blockIdx.x;
+    int thread_id = threadIdx.x;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
+    int nbas = envs.cell0_nbas * envs.bvk_ncells;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int *bas = envs.bas;
+    int li = bas[ish0*BAS_SLOTS+ANG_OF];
+    int lj = bas[jsh0*BAS_SLOTS+ANG_OF];
+    int8_t nfi = (li + 1) * (li + 2) / 2;
+    int8_t nfj = (lj + 1) * (lj + 2) / 2;
+    int8_t iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int8_t jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    PackedPGTO pdata = {iprim, jprim, nfi, nfj};
+    double *env = envs.env;
+    double *img_coords = envs.img_coords;
+
+    int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj];
+    int nsp_per_block = THREADS / gout_stride;
+    int sp_id = thread_id % nsp_per_block;
+    int gout_id = thread_id / nsp_per_block;
+
+    int g_size = (li + 2) * (lj + 1);
+    int gx_len = g_size * nsp_per_block;
+    extern __shared__ double g[];
+    double *gx = g + sp_id;
+    double *gy = gx + gx_len;
+    double *gz = gx + gx_len * 2;
+    double *rjri = gx + gx_len * 3;
+    gx[0] = PI_POW_1_5;
+    gy[0] = 1.;
+
+    for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) {
+        double goutx[GOUT_WIDTH_IP1];
+        double gouty[GOUT_WIDTH_IP1];
+        double goutz[GOUT_WIDTH_IP1];
+#pragma unroll
+        for (int n = 0; n < GOUT_WIDTH_IP1; ++n) {
+            goutx[n] = 0.;
+            gouty[n] = 0.;
+            goutz[n] = 0.;
+        }
+        int pair_ij = task_id + sp_id;
+        if (pair_ij >= shl_pair1) {
+            pair_ij = shl_pair0;
+        }
+        int bas_ij = bounds.bas_ij_idx[pair_ij];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        for (int img = 0; img < envs.nimgs; img++) {
+            if (gout_id == 0) {
+                double xjL = img_coords[img*3+0];
+                double yjL = img_coords[img*3+1];
+                double zjL = img_coords[img*3+2];
+                double xjxi = rj[0] + xjL - ri[0];
+                double yjyi = rj[1] + yjL - ri[1];
+                double zjzi = rj[2] + zjL - ri[2];
+                double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+                rjri[0*nsp_per_block] = xjxi;
+                rjri[1*nsp_per_block] = yjyi;
+                rjri[2*nsp_per_block] = zjzi;
+                rjri[3*nsp_per_block] = rr_ij;
+            }
+            int iprim = pdata.iprim;
+            int jprim = pdata.jprim;
+            int ijprim = iprim * jprim;
+            int nfi = pdata.nfi;
+            int nfj = pdata.nfj;
+            int nfij = nfi * nfj;
+            int16_t *idx_i = c_pair_idx + c_pair_offsets[li*L_AUX1];
+            int16_t *idy_i = idx_i + nfi;
+            int16_t *idz_i = idx_i + nfi * 2;
+            int16_t *idx_j = c_pair_idx + c_pair_offsets[lj*L_AUX1];
+            int16_t *idy_j = idx_j + nfj;
+            int16_t *idz_j = idx_j + nfj * 2;
+            for (int ijp = 0; ijp < ijprim; ++ijp) {
+                __syncthreads();
+                int ip = ijp % iprim;
+                int jp = ijp / iprim;
+                double ai = expi[ip];
+                double aj = expj[jp];
+                double ai2 = ai * -2;
+                double aij = ai + aj;
+                double aj_aij = aj / aij;
+                if (gout_id == 0) {
+                    double theta = ai * aj_aij;
+                    double theta_rr = theta * rjri[3*nsp_per_block];
+                    double cicj = ci[ip] * cj[jp];
+                    gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr);
+                }
+                int lij = li + lj + 1;
+                int stride_j = li + 2;
+                int i_1 = nsp_per_block;
+                __syncthreads();
+                double s0x, s1x, s2x;
+                double b = .5 / aij;
+                for (int n = gout_id; n < 3; n += gout_stride) {
+                    double *_gx = gx + n * gx_len;
+                    double xjxi = rjri[n*nsp_per_block];
+                    double xpa = xjxi * aj_aij;
+                    s0x = _gx[0];
+                    s1x = xpa * s0x;
+                    _gx[nsp_per_block] = s1x;
+                    for (int i = 1; i < lij; ++i) {
+                        s2x = xpa * s1x + i * b * s0x;
+                        _gx[(i+1)*nsp_per_block] = s2x;
+                        s0x = s1x;
+                        s1x = s2x;
+                    }
+                    for (int j = 0; j < lj; ++j) {
+                        int ij = (lij-j) + j*stride_j;
+                        s1x = _gx[ij*nsp_per_block];
+                        for (--ij; ij >= j*stride_j; --ij) {
+                            s0x = _gx[ij*nsp_per_block];
+                            _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x;
+                            s1x = s0x;
+                        }
+                    }
+                }
+                __syncthreads();
+#pragma unroll
+                for (int n = 0; n < GOUT_WIDTH_IP1; ++n) {
+                    int ij = n*gout_stride+gout_id;
+                    if (ij >= nfij) break;
+                    int i = ij % nfi;
+                    int j = ij / nfi;
+                    int ix = idx_i[i];
+                    int iy = idy_i[i];
+                    int iz = idz_i[i];
+                    int jx = idx_j[j];
+                    int jy = idy_j[j];
+                    int jz = idz_j[j];
+                    int addrx = (ix + jx*stride_j) * nsp_per_block;
+                    int addry = (iy + jy*stride_j) * nsp_per_block;
+                    int addrz = (iz + jz*stride_j) * nsp_per_block;
+                    double fx0 = gx[addrx];
+                    double fy0 = gy[addry];
+                    double fz0 = gz[addrz];
+                    double fx1 = ai2 * gx[addrx+i_1];
+                    double fy1 = ai2 * gy[addry+i_1];
+                    double fz1 = ai2 * gz[addrz+i_1];
+                    if (ix > 0) fx1 += ix * gx[addrx-i_1];
+                    if (iy > 0) fy1 += iy * gy[addry-i_1];
+                    if (iz > 0) fz1 += iz * gz[addrz-i_1];
+                    goutx[n] += fx1 * fy0 * fz0;
+                    gouty[n] += fx0 * fy1 * fz0;
+                    goutz[n] += fx0 * fy0 * fz1;
+                }
+            }
+        }
+
+        if (task_id + sp_id < shl_pair1) {
+            int *ao_loc = envs.ao_loc;
+            int nbas = envs.cell0_nbas;
+            int nao = ao_loc[nbas];
+            size_t nao2 = nao * nao;
+            int cell_id = jsh / nbas;
+            int jshp = jsh % nbas;
+            int i0 = ao_loc[ish];
+            int j0 = ao_loc[jshp];
+            double *outx = out + cell_id*nao2*3 + i0 * nao + j0;
+            double *outy = outx + nao2;
+            double *outz = outx + nao2 * 2;
+            int nfi = pdata.nfi;
+            int nfj = pdata.nfj;
+            int nfij = nfi * nfj;
+#pragma unroll
+            for (int n = 0; n < GOUT_WIDTH_IP1; ++n) {
+                int ij = n*gout_stride+gout_id;
+                if (ij >= nfij) break;
+                int j = ij / nfi;
+                int i = ij % nfi;
+                outx[i*nao+j] = goutx[n];
+                outy[i*nao+j] = gouty[n];
+                outz[i*nao+j] = goutz[n];
+            }
+        }
+    }
+}
+
+static __global__
+void int1e_ipkin_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds)
+{
+    int sp_block_id = blockIdx.x;
+    int thread_id = threadIdx.x;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
+    int nbas = envs.cell0_nbas * envs.bvk_ncells;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int *bas = envs.bas;
+    int li = bas[ish0*BAS_SLOTS+ANG_OF];
+    int lj = bas[jsh0*BAS_SLOTS+ANG_OF];
+    int8_t nfi = (li + 1) * (li + 2) / 2;
+    int8_t nfj = (lj + 1) * (lj + 2) / 2;
+    int8_t iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int8_t jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    PackedPGTO pdata = {iprim, jprim, nfi, nfj};
+    double *env = envs.env;
+    double *img_coords = envs.img_coords;
+
+    int gout_stride = bounds.gout_stride_lookup[li*L_AUX1+lj];
+    int nsp_per_block = THREADS / gout_stride;
+    int sp_id = thread_id % nsp_per_block;
+    int gout_id = thread_id / nsp_per_block;
+
+    int g_size = (li + 4) * (lj + 1);
+    int gx_len = g_size * nsp_per_block;
+    extern __shared__ double g[];
+    double *gx = g + sp_id;
+    double *gy = gx + gx_len;
+    double *gz = gx + gx_len * 2;
+    double *rjri = gx + gx_len * 3;
+    gx[0] = PI_POW_1_5;
+    gy[0] = -.5;
+
+    for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) {
+        double goutx[GOUT_WIDTH_IP1];
+        double gouty[GOUT_WIDTH_IP1];
+        double goutz[GOUT_WIDTH_IP1];
+#pragma unroll
+        for (int n = 0; n < GOUT_WIDTH_IP1; ++n) {
+            goutx[n] = 0.;
+            gouty[n] = 0.;
+            goutz[n] = 0.;
+        }
+        int pair_ij = task_id + sp_id;
+        if (pair_ij >= shl_pair1) {
+            pair_ij = shl_pair0;
+        }
+        int bas_ij = bounds.bas_ij_idx[pair_ij];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        for (int img = 0; img < envs.nimgs; img++) {
+            if (gout_id == 0) {
+                double xjL = img_coords[img*3+0];
+                double yjL = img_coords[img*3+1];
+                double zjL = img_coords[img*3+2];
+                double xjxi = rj[0] + xjL - ri[0];
+                double yjyi = rj[1] + yjL - ri[1];
+                double zjzi = rj[2] + zjL - ri[2];
+                double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+                rjri[0*nsp_per_block] = xjxi;
+                rjri[1*nsp_per_block] = yjyi;
+                rjri[2*nsp_per_block] = zjzi;
+                rjri[3*nsp_per_block] = rr_ij;
+            }
+            int iprim = pdata.iprim;
+            int jprim = pdata.jprim;
+            int ijprim = iprim * jprim;
+            int nfi = pdata.nfi;
+            int nfj = pdata.nfj;
+            int nfij = nfi * nfj;
+            int16_t *idx_i = c_pair_idx + c_pair_offsets[li*L_AUX1];
+            int16_t *idy_i = idx_i + nfi;
+            int16_t *idz_i = idx_i + nfi * 2;
+            int16_t *idx_j = c_pair_idx + c_pair_offsets[lj*L_AUX1];
+            int16_t *idy_j = idx_j + nfj;
+            int16_t *idz_j = idx_j + nfj * 2;
+            for (int ijp = 0; ijp < ijprim; ++ijp) {
+                __syncthreads();
+                int ip = ijp % iprim;
+                int jp = ijp / iprim;
+                double ai = expi[ip];
+                double aj = expj[jp];
+                double ai2 = ai * -2;
+                double aij = ai + aj;
+                double aj_aij = aj / aij;
+                if (gout_id == 0) {
+                    double theta = ai * aj_aij;
+                    double theta_rr = theta * rjri[3*nsp_per_block];
+                    double cicj = ci[ip] * cj[jp];
+                    gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr);
+                }
+                __syncthreads();
+                int lij = li + lj + 3;
+                int stride_j = li + 4;
+                int i_1 = nsp_per_block;
+                double s0x, s1x, s2x;
+                double b = .5 / aij;
+                for (int n = gout_id; n < 3; n += gout_stride) {
+                    double *_gx = gx + n * gx_len;
+                    double xjxi = rjri[n*nsp_per_block];
+                    double xpa = xjxi * aj_aij;
+                    s0x = _gx[0];
+                    s1x = xpa * s0x;
+                    _gx[nsp_per_block] = s1x;
+                    for (int i = 1; i < lij; ++i) {
+                        s2x = xpa * s1x + i * b * s0x;
+                        _gx[(i+1)*nsp_per_block] = s2x;
+                        s0x = s1x;
+                        s1x = s2x;
+                    }
+                    for (int j = 0; j < lj; ++j) {
+                        int ij = (lij-j) + j*stride_j;
+                        s1x = _gx[ij*nsp_per_block];
+                        for (--ij; ij >= j*stride_j; --ij) {
+                            s0x = _gx[ij*nsp_per_block];
+                            _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x;
+                            s1x = s0x;
+                        }
+                    }
+                }
+                __syncthreads();
+#pragma unroll
+                for (int n = 0; n < GOUT_WIDTH_IP1; ++n) {
+                    int ij = n*gout_stride+gout_id;
+                    if (ij >= nfij) break;
+                    int i = ij % nfi;
+                    int j = ij / nfi;
+                    int ix = idx_i[i];
+                    int iy = idy_i[i];
+                    int iz = idz_i[i];
+                    int jx = idx_j[j];
+                    int jy = idy_j[j];
+                    int jz = idz_j[j];
+                    int addrx = (ix + jx*stride_j) * nsp_per_block;
+                    int addry = (iy + jy*stride_j) * nsp_per_block;
+                    int addrz = (iz + jz*stride_j) * nsp_per_block;
+                    double fx0 = gx[addrx];
+                    double fy0 = gy[addry];
+                    double fz0 = gz[addrz];
+                    double fx1 = ai2 * gx[addrx+i_1];
+                    double fy1 = ai2 * gy[addry+i_1];
+                    double fz1 = ai2 * gz[addrz+i_1];
+                    double fx2 = ai2 * ((ix*2+1)*fx0 + ai2*gx[addrx+i_1*2]);
+                    double fy2 = ai2 * ((iy*2+1)*fy0 + ai2*gy[addry+i_1*2]);
+                    double fz2 = ai2 * ((iz*2+1)*fz0 + ai2*gz[addrz+i_1*2]);
+                    double fx3 = ai2 * ((ix*3+3)*fx1 + ai2*ai2*gx[addrx+i_1*3]);
+                    double fy3 = ai2 * ((iy*3+3)*fy1 + ai2*ai2*gy[addry+i_1*3]);
+                    double fz3 = ai2 * ((iz*3+3)*fz1 + ai2*ai2*gz[addrz+i_1*3]);
+                    if (ix > 0) {
+                        double fx1m = ix * gx[addrx-i_1];
+                        fx1 += fx1m;
+                        fx3 += ai2*(ix*2+1) * fx1m;
+                        if (ix > 1) { fx2 += ix*(ix-1)*gx[addrx-i_1*2]; fx3 += ai2*(ix-1)*fx1m; }
+                        if (ix > 2) fx3 += ix*(ix-1)*(ix-2) * gx[addrx-i_1*3];
+                    }
+                    if (iy > 0) {
+                        double fy1m = iy * gy[addry-i_1];
+                        fy1 += fy1m;
+                        fy3 += ai2*(iy*2+1) * fy1m;
+                        if (iy > 1) { fy2 += iy*(iy-1)*gy[addry-i_1*2]; fy3 += ai2*(iy-1)*fy1m; }
+                        if (iy > 2) fy3 += iy*(iy-1)*(iy-2) * gy[addry-i_1*3];
+                    }
+                    if (iz > 0) {
+                        double fz1m = iz * gz[addrz-i_1];
+                        fz1 += fz1m;
+                        fz3 += ai2*(iz*2+1) * fz1m;
+                        if (iz > 1) { fz2 += iz*(iz-1)*gz[addrz-i_1*2]; fz3 += ai2*(iz-1)*fz1m; }
+                        if (iz > 2) fz3 += iz*(iz-1)*(iz-2) * gz[addrz-i_1*3];
+                    }
+                    goutx[n] += fx3 * fy0 * fz0;
+                    goutx[n] += fx1 * fy2 * fz0;
+                    goutx[n] += fx1 * fy0 * fz2;
+                    gouty[n] += fx2 * fy1 * fz0;
+                    gouty[n] += fx0 * fy3 * fz0;
+                    gouty[n] += fx0 * fy1 * fz2;
+                    goutz[n] += fx2 * fy0 * fz1;
+                    goutz[n] += fx0 * fy2 * fz1;
+                    goutz[n] += fx0 * fy0 * fz3;
+                }
+            }
+        }
+
+        if (task_id + sp_id < shl_pair1) {
+            int *ao_loc = envs.ao_loc;
+            int nbas = envs.cell0_nbas;
+            int nao = ao_loc[nbas];
+            size_t nao2 = nao * nao;
+            int cell_id = jsh / nbas;
+            int jshp = jsh % nbas;
+            int i0 = ao_loc[ish];
+            int j0 = ao_loc[jshp];
+            double *outx = out + cell_id*nao2*3 + i0 * nao + j0;
+            double *outy = outx + nao2;
+            double *outz = outx + nao2 * 2;
+            int nfi = pdata.nfi;
+            int nfj = pdata.nfj;
+            int nfij = nfi * nfj;
+#pragma unroll
+            for (int n = 0; n < GOUT_WIDTH_IP1; ++n) {
+                int ij = n*gout_stride+gout_id;
+                if (ij >= nfij) break;
+                int j = ij / nfi;
+                int i = ij % nfi;
+                outx[i*nao+j] = goutx[n];
+                outy[i*nao+j] = gouty[n];
+                outz[i*nao+j] = goutz[n];
+            }
+        }
+    }
+}
+
+extern "C" {
+int PBCint1e_ovlp(double *out, PBCIntEnvVars *envs, int shm_size,
+                  int nbatches_shl_pair, int *bas_ij_idx,
+                  int *shl_pair_offsets, int *gout_stride_lookup)
+{
+    cudaFuncSetAttribute(int1e_ovlp_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    PBCInt2c2eBounds bounds = {
+        bas_ij_idx, shl_pair_offsets, gout_stride_lookup,
+    };
+    int1e_ovlp_kernel<<<nbatches_shl_pair, THREADS, shm_size>>>(out, *envs, bounds);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in int1e_ovlp kernel: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int PBCint1e_kin(double *out, PBCIntEnvVars *envs, int shm_size,
+                 int nbatches_shl_pair, int *bas_ij_idx,
+                 int *shl_pair_offsets, int *gout_stride_lookup)
+{
+    cudaFuncSetAttribute(int1e_kin_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    PBCInt2c2eBounds bounds = {
+        bas_ij_idx, shl_pair_offsets, gout_stride_lookup,
+    };
+    int1e_kin_kernel<<<nbatches_shl_pair, THREADS, shm_size>>>(out, *envs, bounds);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in int1e_ovlp kernel: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int PBCint1e_ipovlp(double *out, PBCIntEnvVars *envs, int shm_size,
+                    int nbatches_shl_pair, int *bas_ij_idx,
+                    int *shl_pair_offsets, int *gout_stride_lookup)
+{
+    cudaFuncSetAttribute(int1e_ipovlp_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    PBCInt2c2eBounds bounds = {
+        bas_ij_idx, shl_pair_offsets, gout_stride_lookup,
+    };
+    int1e_ipovlp_kernel<<<nbatches_shl_pair, THREADS, shm_size>>>(out, *envs, bounds);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in int1e_ipovlp kernel: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int PBCint1e_ipkin(double *out, PBCIntEnvVars *envs, int shm_size,
+                    int nbatches_shl_pair, int *bas_ij_idx,
+                    int *shl_pair_offsets, int *gout_stride_lookup)
+{
+    cudaFuncSetAttribute(int1e_ipkin_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    PBCInt2c2eBounds bounds = {
+        bas_ij_idx, shl_pair_offsets, gout_stride_lookup,
+    };
+    int1e_ipkin_kernel<<<nbatches_shl_pair, THREADS, shm_size>>>(out, *envs, bounds);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in int1e_ipkin kernel: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+}
diff --git a/gpu4pyscf/lib/pbc/pbc_driver.cu b/gpu4pyscf/lib/pbc/pbc_driver.cu
index 3fa0f77cf..3f4484de6 100644
--- a/gpu4pyscf/lib/pbc/pbc_driver.cu
+++ b/gpu4pyscf/lib/pbc/pbc_driver.cu
@@ -693,7 +693,7 @@ int fill_int2c2e(double *out, PBCIntEnvVars *envs, int shm_size,
     pbc_int2c2e_kernel<<<nbatches_shl_pair, THREADS, shm_size>>>(out, *envs, bounds);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA Error in int2ce kernel: %s\n", cudaGetErrorString(err));
+        fprintf(stderr, "CUDA Error in int2c2e kernel: %s\n", cudaGetErrorString(err));
         return 1;
     }
     return 0;
diff --git a/gpu4pyscf/pbc/df/fft_jk.py b/gpu4pyscf/pbc/df/fft_jk.py
index 8c93bae88..963dd098f 100644
--- a/gpu4pyscf/pbc/df/fft_jk.py
+++ b/gpu4pyscf/pbc/df/fft_jk.py
@@ -370,7 +370,8 @@ def get_k_e1_kpts(mydf, dm_kpts, kpts=np.zeros((1,3)), exxdiv=None):
 
 def _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band=None):
     from pyscf.pbc.tools.pbc import madelung
-    s = cp.asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+    from gpu4pyscf.pbc.gto.int1e import int1e_ovlp
+    s = int1e_ovlp(cell, kpts=kpts)
     m = madelung(cell, kpts)
     if kpts is None:
         for i,dm in enumerate(dms):
diff --git a/gpu4pyscf/pbc/df/int3c2e.py b/gpu4pyscf/pbc/df/int3c2e.py
index 90a731b9f..f502d0b16 100644
--- a/gpu4pyscf/pbc/df/int3c2e.py
+++ b/gpu4pyscf/pbc/df/int3c2e.py
@@ -17,7 +17,6 @@
 '''
 
 import ctypes
-import itertools
 import math
 import numpy as np
 import cupy as cp
@@ -53,7 +52,6 @@
 L_AUX_MAX = 6
 GOUT_WIDTH = 45
 THREADS = 256
-BVK_CELL_SHELLS = 400
 
 def sr_aux_e2(cell, auxcell, omega, kpts=None, bvk_kmesh=None, j_only=False):
     r'''
@@ -97,7 +95,6 @@ def sr_aux_e2(cell, auxcell, omega, kpts=None, bvk_kmesh=None, j_only=False):
     else:
         nf = uniq_l * 2 + 1
     c_l_offsets = np.append(0, np.cumsum(c_shell_counts*nf))
-    lmax = cell._bas[:,ANG_OF].max()
     c2s = [cart2sph_by_l(l) for l in range(lmax+1)]
 
     aux_coeff = asarray(int3c2e_opt.aux_coeff)
diff --git a/gpu4pyscf/pbc/dft/krks.py b/gpu4pyscf/pbc/dft/krks.py
index 5f496a78a..8df4ce6e6 100644
--- a/gpu4pyscf/pbc/dft/krks.py
+++ b/gpu4pyscf/pbc/dft/krks.py
@@ -26,6 +26,7 @@
 from pyscf.pbc.dft import krks as krks_cpu
 from gpu4pyscf.lib import logger, utils
 from gpu4pyscf.lib.cupy_helper import tag_array, get_avail_mem
+from gpu4pyscf.pbc.gto import int1e
 from gpu4pyscf.pbc.scf import khf
 from gpu4pyscf.pbc.dft import rks
 from gpu4pyscf.pbc.dft import multigrid, multigrid_v2
@@ -196,7 +197,7 @@ def get_hcore(self, cell=None, kpts=None):
             nuc = ni.get_nuc(kpts)
         if len(cell._ecpbas) > 0:
             raise NotImplementedError('ECP in PBC SCF')
-        t = cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpts))
+        t = int1e.int1e_kin(cell, kpts)
         return nuc + t
 
     def Gradients(self):
diff --git a/gpu4pyscf/pbc/dft/rks.py b/gpu4pyscf/pbc/dft/rks.py
index c0cc6e1c7..cc664fd55 100644
--- a/gpu4pyscf/pbc/dft/rks.py
+++ b/gpu4pyscf/pbc/dft/rks.py
@@ -26,6 +26,7 @@
 from pyscf.pbc.dft import rks as rks_cpu
 from gpu4pyscf.lib import logger, utils
 from gpu4pyscf.dft import rks as mol_ks
+from gpu4pyscf.pbc.gto import int1e
 from gpu4pyscf.pbc.scf import hf as pbchf, khf
 from gpu4pyscf.pbc.df.df import GDF
 from gpu4pyscf.pbc.dft import gen_grid
@@ -309,7 +310,8 @@ def get_hcore(self, cell=None, kpt=None):
             nuc = ni.get_nuc(kpt)
         if len(cell._ecpbas) > 0:
             raise NotImplementedError('ECP in PBC SCF')
-        return nuc + cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpt))
+        t = int1e.int1e_kin(cell, kpt)
+        return nuc + t
 
     get_veff = get_veff
     energy_elec = mol_ks.energy_elec
diff --git a/gpu4pyscf/pbc/grad/krhf.py b/gpu4pyscf/pbc/grad/krhf.py
index 9f82975c3..444cdf90d 100644
--- a/gpu4pyscf/pbc/grad/krhf.py
+++ b/gpu4pyscf/pbc/grad/krhf.py
@@ -30,6 +30,7 @@
 from gpu4pyscf.pbc.df.aft import _check_kpts
 from gpu4pyscf.pbc.df import ft_ao
 from gpu4pyscf.pbc import tools
+from gpu4pyscf.pbc.gto import int1e
 from gpu4pyscf.lib.cupy_helper import contract
 
 __all__ = ['Gradients']
@@ -84,7 +85,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None):
 
 def get_hcore(cell, kpts):
     '''Part of the nuclear gradients of core Hamiltonian'''
-    h1 = cp.asarray(cell.pbc_intor('int1e_ipkin', kpts=kpts))
+    h1 = int1e.int1e_ipkin(cell, kpts)
     dtype = h1.dtype
     if cell._pseudo:
         SI = cell.get_SI()
@@ -269,7 +270,7 @@ def get_hcore(self, cell=None, kpts=None):
     def get_ovlp(self, cell=None, kpts=None):
         if cell is None: cell = self.cell
         if kpts is None: kpts = self.kpts
-        return -cp.asarray(cell.pbc_intor('int1e_ipovlp', kpts=kpts))
+        return -int1e.int1e_ipovlp(cell, kpts)
 
     def get_jk(self, dm=None, kpts=None):
         if kpts is None: kpts = self.kpts
diff --git a/gpu4pyscf/pbc/grad/rhf.py b/gpu4pyscf/pbc/grad/rhf.py
index a72f50c34..3b0184c33 100644
--- a/gpu4pyscf/pbc/grad/rhf.py
+++ b/gpu4pyscf/pbc/grad/rhf.py
@@ -26,6 +26,7 @@
 import gpu4pyscf.grad.rhf as mol_rhf
 from gpu4pyscf.lib.cupy_helper import return_cupy_array
 from gpu4pyscf.pbc.dft import multigrid_v2
+from gpu4pyscf.pbc.gto import int1e
 
 __all__ = ['Gradients']
 
@@ -73,7 +74,7 @@ def grad_elec(
         ni = mf._numint
         assert hasattr(mf, 'xc'), 'HF gradients not supported'
         de = multigrid_v2.get_veff_ip1(ni, mf.xc, dm0, with_j=True).get()
-        s1 = cell.pbc_intor('int1e_ipovlp')
+        s1 = int1e.int1e_ipovlp(cell)[0].get()
         de += cpu_rhf._contract_vhf_dm(self, s1, dme0) * 2
 
         # the CPU code requires the attribute .rhoG
@@ -82,7 +83,7 @@ def grad_elec(
             de += vpploc_part1_nuc_grad(ni, dm0_cpu)
         de += pp_int.vpploc_part2_nuc_grad(cell, dm0_cpu)
         de += pp_int.vppnl_nuc_grad(cell, dm0_cpu)
-        core_hamiltonian_gradient = cell.pbc_intor("int1e_ipkin")
+        core_hamiltonian_gradient = int1e.int1e_ipkin(cell)[0].get()
         kinetic_contribution = cpu_rhf._contract_vhf_dm(
             self, core_hamiltonian_gradient, dm0_cpu
         )
diff --git a/gpu4pyscf/pbc/grad/uhf.py b/gpu4pyscf/pbc/grad/uhf.py
index f59b3efa8..55258c562 100644
--- a/gpu4pyscf/pbc/grad/uhf.py
+++ b/gpu4pyscf/pbc/grad/uhf.py
@@ -26,6 +26,7 @@
 import gpu4pyscf.pbc.grad.rhf as rhf
 from gpu4pyscf.lib.cupy_helper import return_cupy_array
 from gpu4pyscf.pbc.dft import multigrid_v2
+from gpu4pyscf.pbc.gto import int1e
 
 __all__ = ['Gradients']
 
@@ -71,7 +72,7 @@ def grad_elec(
         ni = mf._numint
         assert hasattr(mf, 'xc'), 'HF gradients not supported'
         de = multigrid_v2.get_veff_ip1(ni, mf.xc, dm0, with_j=True).get()
-        s1 = cell.pbc_intor('int1e_ipovlp')
+        s1 = int1e.int1e_ipovlp(cell)[0].get()
         de += _contract_vhf_dm(self, s1, dme0_sf) * 2
 
         # the CPU code requires the attribute .rhoG
@@ -80,7 +81,7 @@ def grad_elec(
             de += vpploc_part1_nuc_grad(ni, dm0_cpu)
         de += pp_int.vpploc_part2_nuc_grad(cell, dm0_cpu)
         de += pp_int.vppnl_nuc_grad(cell, dm0_cpu)
-        core_hamiltonian_gradient = cell.pbc_intor("int1e_ipkin")
+        core_hamiltonian_gradient = int1e.int1e_ipkin(cell)[0].get()
         kinetic_contribution = _contract_vhf_dm(
             self, core_hamiltonian_gradient, dm0_cpu
         )
diff --git a/gpu4pyscf/pbc/gto/int1e.py b/gpu4pyscf/pbc/gto/int1e.py
new file mode 100644
index 000000000..b8caa93db
--- /dev/null
+++ b/gpu4pyscf/pbc/gto/int1e.py
@@ -0,0 +1,229 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes
+import numpy as np
+import cupy as cp
+from pyscf.gto import ATOM_OF, PTR_COORD
+from pyscf.pbc import tools as pbctools
+from pyscf.pbc.lib.kpts_helper import is_zero
+from pyscf.pbc.tools.k2gamma import translation_vectors_for_kmesh
+from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract, asarray, sandwich_dot
+from gpu4pyscf.gto.mole import group_basis, PTR_BAS_COORD
+from gpu4pyscf.scf.jk import _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE
+from gpu4pyscf.pbc.df.ft_ao import libpbc, PBCIntEnvVars
+from gpu4pyscf.pbc.df.int3c2e import (
+    _estimate_shl_pairs_per_block, fill_triu_bvk_conj, LMAX, L_AUX_MAX, THREADS
+)
+
+__all__ = [
+    'int1e_ovlp',
+    'int1e_kin',
+    'int1e_ipovlp',
+    'int1e_ipkin',
+]
+
+libpbc.PBCint1e_ovlp.restype = ctypes.c_int
+libpbc.PBCint1e_kin.restype = ctypes.c_int
+libpbc.PBCint1e_ipovlp.restype = ctypes.c_int
+libpbc.PBCint1e_ipkin.restype = ctypes.c_int
+
+def int1e_ovlp(cell, kpts=None, bvk_kmesh=None, opt=None):
+    opt = _check_opt(cell, kpts, bvk_kmesh, opt)
+    out = opt.intor('PBCint1e_ovlp', 1, 1, (0, 0))
+    return out
+
+def int1e_kin(cell, kpts=None, bvk_kmesh=None, opt=None):
+    opt = _check_opt(cell, kpts, bvk_kmesh, opt)
+    out = opt.intor('PBCint1e_kin', 1, 1, (2, 0))
+    return out
+
+def int1e_ipovlp(cell, kpts=None, bvk_kmesh=None, opt=None):
+    opt = _check_opt(cell, kpts, bvk_kmesh, opt)
+    out = opt.intor('PBCint1e_ipovlp', 0, 3, (1, 0))
+    return out
+
+def int1e_ipkin(cell, kpts=None, bvk_kmesh=None, opt=None):
+    opt = _check_opt(cell, kpts, bvk_kmesh, opt)
+    out = opt.intor('PBCint1e_ipkin', 0, 3, (3, 0))
+    return out
+
+def _check_opt(cell, kpts, bvk_kmesh, opt):
+    if opt is None:
+        opt = _Int1eOpt(cell, kpts, bvk_kmesh)
+    else:
+        assert kpts is opt.kpts
+    return opt
+
+class _Int1eOpt:
+    def __init__(self, cell, kpts=None, bvk_kmesh=None):
+        self.cell = cell
+        sorted_cell, coeff, uniq_l_ctr, l_ctr_counts = group_basis(cell, tile=1)
+        uniq_l = uniq_l_ctr[:,0]
+        lmax = uniq_l.max()
+        assert lmax <= LMAX
+        self.sorted_cell = sorted_cell
+        self.coeff = coeff
+        self.uniq_l_ctr = uniq_l_ctr
+        self.l_ctr_counts = l_ctr_counts
+
+        if bvk_kmesh is None:
+            if kpts is None:
+                bvk_kmesh = np.ones(3, dtype=np.int32)
+            else:
+                bvk_kmesh = kpts_to_kmesh(cell, kpts.reshape(-1, 3))
+        self.kpts = kpts
+        self.bvk_kmesh = bvk_kmesh
+        bvk_ncells = np.prod(bvk_kmesh)
+        if bvk_ncells == 1:
+            bvkcell = sorted_cell
+        else:
+            bvkcell = pbctools.super_cell(sorted_cell, bvk_kmesh, wrap_around=True)
+            # PTR_BAS_COORD was not initialized in pbctools.supe_rcell
+            bvkcell._bas[:,PTR_BAS_COORD] = bvkcell._atm[bvkcell._bas[:,ATOM_OF],PTR_COORD]
+        self.bvkcell = bvkcell
+
+        Ls = asarray(bvkcell.get_lattice_Ls(rcut=cell.rcut))
+        Ls = Ls[cp.linalg.norm(Ls-.5, axis=1).argsort()]
+        nimgs = len(Ls)
+
+        _atm = cp.array(bvkcell._atm, dtype=np.int32)
+        _bas = cp.array(bvkcell._bas, dtype=np.int32)
+        _env = cp.array(_scale_sp_ctr_coeff(bvkcell), dtype=np.float64)
+        ao_loc = bvkcell.ao_loc_nr(cart=True)
+        ao_loc_gpu = cp.array(ao_loc, dtype=np.int32)
+        int1e_envs = PBCIntEnvVars(
+            sorted_cell.natm, sorted_cell.nbas, bvk_ncells, nimgs,
+            _atm.data.ptr, _bas.data.ptr, _env.data.ptr,
+            ao_loc_gpu.data.ptr, Ls.data.ptr,
+        )
+        int1e_envs._env_ref_holder = (_atm, _bas, _env, ao_loc_gpu, Ls)
+        self.int1e_envs = int1e_envs
+
+    def generate_shl_pairs(self, hermi=1):
+        sorted_cell = self.sorted_cell
+        l_ctr_offsets = np.append(0, np.cumsum(self.l_ctr_counts))
+        uniq_l = self.uniq_l_ctr[:,0]
+        bas_ij_idx = [] # The effective shell pair = ish*nbas+jsh
+        shl_pair_offsets = [] # the bas_ij_idx offset for each blockIdx.x
+        sp0 = sp1 = 0
+        nbas = sorted_cell.nbas
+        groups = len(uniq_l)
+        if hermi == 1:
+            ij_tasks = [(i, j) for i in range(groups) for j in range(i+1)]
+        else:
+            ij_tasks = [(i, j) for i in range(groups) for j in range(groups)]
+        bvk_ncells = np.prod(self.bvk_kmesh)
+        img = cp.arange(bvk_ncells, dtype=np.int32)
+        img_offsets = img * nbas
+        for i, j in ij_tasks:
+            li = uniq_l[i]
+            lj = uniq_l[j]
+            ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1]
+            jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1]
+            ish = cp.arange(ish0, ish1, dtype=np.int32)
+            jsh = cp.arange(jsh0, jsh1, dtype=np.int32)
+            ijsh = ish[:,None] * (nbas*bvk_ncells) + jsh
+            if hermi and i == j:
+                ijsh = ijsh[cp.tril_indices(ish1-ish0)]
+            else:
+                ijsh = ijsh.ravel()
+            idx = (img_offsets[:,None] + ijsh).ravel()
+            nshl_pair = len(idx)
+            bas_ij_idx.append(idx)
+            sp0, sp1 = sp1, sp1 + nshl_pair
+            nsp_per_block = _estimate_shl_pairs_per_block(li, lj, nshl_pair)
+            shl_pair_offsets.append(np.arange(sp0, sp1, nsp_per_block, dtype=np.int32))
+
+        shl_pair_offsets.append(np.array([sp1], dtype=np.int32))
+        shl_pair_offsets = cp.array(np.hstack(shl_pair_offsets), dtype=np.int32)
+        bas_ij_idx = cp.array(cp.hstack(bas_ij_idx), dtype=np.int32)
+        return bas_ij_idx, shl_pair_offsets
+
+    def create_gout_stride_lookup_table(self, deriv=None, gout_width=36):
+        # gout_width should be identical to the setting in cuda kernel
+        # based on the shm_size, find optimal gout_stride for each (li,lj)
+        # pattern, store them in the gout_stride_lookup
+        if deriv is None:
+            deriv = (0, 0)
+        i_inc, j_inc = deriv
+        lmax = self.uniq_l_ctr[:,0].max()
+        gout_stride_lookup = np.empty([L_AUX_MAX+1,L_AUX_MAX+1], dtype=np.int32)
+        gout_width = gout_width
+        shm_size = SHM_SIZE
+        ls = np.arange(lmax+1)
+        nf = (ls+1) * (ls+2) // 2
+        max_shm_size = 0
+        for li in range(lmax+1):
+            for lj in range(lmax+1):
+                unit = (li+1+i_inc)*(lj+1+j_inc)*3 + 4
+                nsp_max = _nearest_power2(shm_size // (unit*8))
+                gout_size = nf[li] * nf[lj]
+                gout_stride = (gout_size+gout_width-1) / gout_width
+                # Round up to the next 2^n
+                gout_stride = _nearest_power2(gout_stride, return_leq=False)
+                nsp_per_block = min(nsp_max, THREADS // gout_stride)
+                gout_stride_lookup[li, lj] = THREADS // nsp_per_block
+                max_shm_size = max(max_shm_size, nsp_per_block*unit*8)
+        return cp.array(gout_stride_lookup, dtype=np.int32), max_shm_size
+
+    def intor(self, kern, hermi, comp, deriv_ij):
+        if comp == 1:
+            gout_width = 36
+        else:
+            gout_width = 18
+
+        sorted_cell = self.sorted_cell
+        int1e_envs = self.int1e_envs
+        nao_cart, nao = self.coeff.shape
+        bas_ij_idx, shl_pair_offsets = self.generate_shl_pairs(hermi)
+        nbatches_shl_pair = len(shl_pair_offsets) - 1
+        gout_stride_lookup, shm_size = self.create_gout_stride_lookup_table(
+            deriv_ij, gout_width)
+        bvk_kmesh = self.bvk_kmesh
+        bvk_ncells = np.prod(bvk_kmesh)
+        out = cp.empty((bvk_ncells, comp, nao_cart, nao_cart))
+        drv = getattr(libpbc, kern)
+        err = drv(
+            ctypes.cast(out.data.ptr, ctypes.c_void_p),
+            ctypes.byref(int1e_envs), ctypes.c_int(shm_size),
+            ctypes.c_int(nbatches_shl_pair),
+            ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p),
+            ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p),
+            ctypes.cast(gout_stride_lookup.data.ptr, ctypes.c_void_p),
+            sorted_cell._atm.ctypes, ctypes.c_int(sorted_cell.natm),
+            sorted_cell._bas.ctypes, ctypes.c_int(sorted_cell.nbas),
+            sorted_cell._env.ctypes)
+        if err != 0:
+            raise RuntimeError(f'{kern} failed')
+
+        if hermi == 1:
+            assert comp == 1
+            out = fill_triu_bvk_conj(out, nao_cart, bvk_kmesh)
+        out = sandwich_dot(out.reshape(-1,nao_cart,nao_cart), self.coeff)
+        out = out.reshape(bvk_ncells, comp, nao, nao)
+
+        if self.kpts is not None and not is_zero(self.kpts):
+            bvkmesh_Ls = translation_vectors_for_kmesh(self.cell, bvk_kmesh, True)
+            kpts = self.kpts.reshape(-1, 3)
+            expLk = cp.exp(1j*asarray(bvkmesh_Ls.dot(kpts.T)))
+            out = contract('lk,lxpq->kxpq', expLk, out)
+
+        if comp == 1:
+            out = out[:,0]
+        if self.kpts is not None and self.kpts.ndim == 1:
+            out = out[0]
+        return out
diff --git a/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py b/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py
new file mode 100644
index 000000000..8a0034242
--- /dev/null
+++ b/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py
@@ -0,0 +1,141 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pyscf
+from gpu4pyscf.pbc.gto import int1e
+
+def test_int1e_ovlp():
+    cell = pyscf.M(
+        atom='''C1  1.3    .2       .3
+                C2  .19   .1      1.1
+                C3  0.  0.  0.
+        ''',
+        precision = 1e-8,
+        a=np.diag([2.5, 1.9, 2.2])*2,
+        basis='def2-tzvpp',
+    )
+    kmesh = [6, 1, 1]
+    kpts = cell.make_kpts(kmesh)
+    pcell = cell.copy()
+    pcell.precision = 1e-14
+    pcell.build(0, 0)
+    ref = pcell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts)
+
+    dat = int1e.int1e_ovlp(cell).get()[0]
+    assert abs(dat - ref[0]).max() < 1e-10
+
+    dat = int1e.int1e_ovlp(cell, kpts=kpts).get()
+    assert abs(dat - ref).max() < 1e-10
+
+def test_int1e_kin():
+    cell = pyscf.M(
+        atom='''C1  1.3    .2       .3
+                C2  .19   .1      1.1
+                C3  0.  0.  0.
+        ''',
+        precision = 1e-8,
+        a=np.diag([2.5, 1.9, 2.2])*2,
+        basis='def2-tzvpp',
+    )
+    kmesh = [6, 1, 1]
+    kpts = cell.make_kpts(kmesh)
+    pcell = cell.copy()
+    pcell.precision = 1e-14
+    pcell.build(0, 0)
+    ref = pcell.pbc_intor('int1e_kin', hermi=1, kpts=kpts)
+
+    dat = int1e.int1e_kin(cell).get()[0]
+    assert abs(dat - ref[0]).max() < 1e-10
+
+    dat = int1e.int1e_kin(cell, kpts=kpts).get()
+    assert abs(dat - ref).max() < 1e-10
+
+def test_int1e_ipovlp():
+    cell = pyscf.M(
+        atom='''C1  1.3    .2       .3
+                C2  .19   .1      1.1
+                C3  0.  0.  0.
+        ''',
+        precision = 1e-8,
+        a=np.diag([2.5, 1.9, 2.2])*2,
+        basis='def2-tzvpp',
+    )
+    kmesh = [6, 1, 1]
+    kpts = cell.make_kpts(kmesh)
+    pcell = cell.copy()
+    pcell.precision = 1e-14
+    pcell.build(0, 0)
+    ref = pcell.pbc_intor('int1e_ipovlp', hermi=0, kpts=kpts)
+
+    dat = int1e.int1e_ipovlp(cell).get()[0]
+    assert abs(dat - ref[0]).max() < 1e-10
+
+    dat = int1e.int1e_ipovlp(cell, kpts=kpts).get()
+    assert abs(dat - ref).max() < 1e-10
+
+def test_int1e_ipkin():
+    cell = pyscf.M(
+        atom='''C1  1.3    .2       .3
+                C2  .19   .1      1.1
+                C3  0.  0.  0.
+        ''',
+        precision = 1e-8,
+        a=np.diag([2.5, 1.9, 2.2])*2,
+        basis='def2-tzvpp',
+    )
+    kmesh = [6, 1, 1]
+    kpts = cell.make_kpts(kmesh)
+    pcell = cell.copy()
+    pcell.precision = 1e-14
+    pcell.build(0, 0)
+    ref = pcell.pbc_intor('int1e_ipkin', hermi=0, kpts=kpts)
+
+    dat = int1e.int1e_ipkin(cell).get()[0]
+    assert abs(dat - ref[0]).max() < 1e-10
+
+    dat = int1e.int1e_ipkin(cell, kpts=kpts).get()
+    assert abs(dat - ref).max() < 1e-10
+
+def test_int1e_ovlp1():
+    L = 4
+    n = 21
+    cell = pyscf.M(unit = 'B',
+               precision = 1e-10,
+               a = ((L,0,0),(0,L,0),(0,0,L)),
+               mesh = [n,n,n],
+               atom = [['He', (L/2.-.5,L/2.,L/2.-.5)],
+                       ['He', (L/2.   ,L/2.,L/2.+.5)]],
+               basis = { 'He': [[0, (0.8, 1.0)],
+                                [0, (1.0, 1.0)],
+                                [0, (1.2, 1.0)]]})
+    nk = [2, 2, 1]
+    kpts = cell.make_kpts(nk, wrap_around=True)
+    s = int1e.int1e_ovlp(cell, kpts)
+    ref = cell.pbc_intor('int1e_ovlp', kpts=kpts)
+    assert abs(s.get() - ref).max() < 1e-10
+    k = int1e.int1e_kin(cell, kpts)
+    ref = cell.pbc_intor('int1e_kin', kpts=kpts)
+    assert abs(k.get() - ref).max() < 1e-10
+
+    nk = [5, 4, 1]
+    kpts = cell.make_kpts(nk, wrap_around=True)[[3, 8, 11]]
+    s = int1e.int1e_ovlp(cell, kpts)
+    ref = cell.pbc_intor('int1e_ovlp', kpts=kpts)
+    assert abs(s.get() - ref).max() < 1e-10
+    k = int1e.int1e_kin(cell, kpts)
+    ref = cell.pbc_intor('int1e_kin', kpts=kpts)
+    assert abs(k.get() - ref).max() < 1e-10
+
+test_int1e_ovlp1()
diff --git a/gpu4pyscf/pbc/scf/hf.py b/gpu4pyscf/pbc/scf/hf.py
index e6ced9369..0c8a80a75 100644
--- a/gpu4pyscf/pbc/scf/hf.py
+++ b/gpu4pyscf/pbc/scf/hf.py
@@ -28,6 +28,7 @@
 from gpu4pyscf.lib.cupy_helper import return_cupy_array, contract
 from gpu4pyscf.scf import hf as mol_hf
 from gpu4pyscf.pbc import df
+from gpu4pyscf.pbc.gto import int1e
 
 def get_bands(mf, kpts_band, cell=None, dm=None, kpt=None):
     '''Get energy bands at the given (arbitrary) 'band' k-points.
@@ -149,18 +150,22 @@ def kpt(self, x):
     get_bands = get_bands
     get_rho = get_rho
 
-    get_ovlp = return_cupy_array(hf_cpu.SCF.get_ovlp)
+    def get_ovlp(self, cell=None, kpt=None):
+        if kpt is None: kpt = self.kpt
+        if cell is None: cell = self.cell
+        return int1e.int1e_ovlp(cell, kpt)
 
     def get_hcore(self, cell=None, kpt=None):
-        if cell is None: cell = self.cell
         if kpt is None: kpt = self.kpt
+        if cell is None: cell = self.cell
         if cell.pseudo:
             nuc = self.with_df.get_pp(kpt)
         else:
             nuc = self.with_df.get_nuc(kpt)
         if len(cell._ecpbas) > 0:
             raise NotImplementedError('ECP in PBC SCF')
-        return nuc + cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpt))
+        t = int1e.int1e_kin(cell, kpt)
+        return nuc + t
 
     def get_jk(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None,
                with_j=True, with_k=True, omega=None, **kwargs):
diff --git a/gpu4pyscf/pbc/scf/khf.py b/gpu4pyscf/pbc/scf/khf.py
index 61674cdad..0472491c4 100644
--- a/gpu4pyscf/pbc/scf/khf.py
+++ b/gpu4pyscf/pbc/scf/khf.py
@@ -30,6 +30,7 @@
 from gpu4pyscf.scf import hf as mol_hf
 from gpu4pyscf.pbc.scf import hf as pbchf
 from gpu4pyscf.pbc import df
+from gpu4pyscf.pbc.gto import int1e
 
 def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
              diis_start_cycle=None, level_shift_factor=None, damp_factor=None,
@@ -253,6 +254,11 @@ def __init__(self, cell, kpts=None, exxdiv='ewald'):
     build = khf_cpu.KSCF.build
     reset = pbchf.SCF.reset
 
+    def get_ovlp(self, cell=None, kpts=None):
+        if cell is None: cell = self.cell
+        if kpts is None: kpts = self.kpts
+        return int1e.int1e_ovlp(cell, kpts)
+
     def get_hcore(self, cell=None, kpts=None):
         if cell is None: cell = self.cell
         if kpts is None: kpts = self.kpts
@@ -262,7 +268,7 @@ def get_hcore(self, cell=None, kpts=None):
             nuc = self.with_df.get_nuc(kpts)
         if len(cell._ecpbas) > 0:
             raise NotImplementedError('ECP in PBC SCF')
-        t = cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpts))
+        t = int1e.int1e_kin(cell, kpts)
         return nuc + t
 
     def get_j(self, cell=None, dm_kpts=None, hermi=1, kpts=None,
@@ -327,7 +333,6 @@ def make_rdm1(self, mo_coeff_kpts=None, mo_occ_kpts=None, **kwargs):
     make_rdm2 = NotImplemented
 
     init_direct_scf = NotImplemented
-    get_ovlp = return_cupy_array(khf_cpu.get_ovlp)
     get_fock = get_fock
     get_fermi = get_fermi
     get_occ = get_occ
diff --git a/gpu4pyscf/pbc/tools/pbc.py b/gpu4pyscf/pbc/tools/pbc.py
index 6f5c93977..2718ddfe5 100644
--- a/gpu4pyscf/pbc/tools/pbc.py
+++ b/gpu4pyscf/pbc/tools/pbc.py
@@ -14,6 +14,9 @@
 
 import numpy as np
 import cupy as cp
+import pyscf
+from pyscf import lib
+from pyscf.pbc.gto.cell import Cell
 from gpu4pyscf.lib.cupy_helper import asarray
 
 def fft(f, mesh):
@@ -287,3 +290,69 @@ def get_coulG(cell, k=np.zeros(3), exx=False, mf=None, mesh=None, Gv=None,
         else: # for RangeSeparatedJKBuilder
             coulG[G0_idx] += Nk*cell.vol*madelung(cell, kpts, omega=0)
     return coulG
+
+def get_lattice_Ls(cell, nimgs=None, rcut=None, dimension=None, discard=True):
+    '''This version employs more strict criteria when discarding images in lattice sum.
+    It can be replaced by the built-in version available in PySCF 2.10.
+    '''
+    if dimension is None:
+        # For atoms near the boundary of the cell, it is necessary (even in low-
+        # dimensional systems) to include lattice translations in all 3 dimensions.
+        if cell.dimension < 2 or cell.low_dim_ft_type == 'inf_vacuum':
+            dimension = cell.dimension
+        else:
+            dimension = 3
+    if rcut is None:
+        rcut = cell.rcut
+
+    if dimension == 0 or rcut <= 0 or cell.natm == 0:
+        return np.zeros((1, 3))
+
+    a = cell.lattice_vectors()
+
+    scaled_atom_coords = cell.get_scaled_atom_coords()
+    atom_boundary_max = scaled_atom_coords[:,:dimension].max(axis=0)
+    atom_boundary_min = scaled_atom_coords[:,:dimension].min(axis=0)
+    if (np.any(atom_boundary_max > 1) or np.any(atom_boundary_min < -1)):
+        atom_boundary_max[atom_boundary_max > 1] = 1
+        atom_boundary_min[atom_boundary_min <-1] = -1
+    ovlp_penalty = atom_boundary_max - atom_boundary_min
+    dR = ovlp_penalty.dot(a[:dimension])
+    dR_basis = np.diag(dR)
+
+    # Search the minimal x,y,z requiring |x*a[0]+y*a[1]+z*a[2]+dR|^2 > rcut^2
+    # Ls boundary should be derived by decomposing (a, Rij) for each atom-pair.
+    # For reasons unclear, the so-obtained Ls boundary seems not large enough.
+    # The upper-bound of the Ls boundary is generated by find_boundary function.
+    def find_boundary(a):
+        aR = np.vstack([a, dR_basis])
+        r = np.linalg.qr(aR.T)[1]
+        ub = (rcut + abs(r[2,3:]).sum()) / abs(r[2,2])
+        return ub
+
+    xb = find_boundary(a[[1,2,0]])
+    if dimension > 1:
+        yb = find_boundary(a[[2,0,1]])
+    else:
+        yb = 0
+    if dimension > 2:
+        zb = find_boundary(a)
+    else:
+        zb = 0
+    bounds = np.ceil([xb, yb, zb]).astype(int)
+    Ts = lib.cartesian_prod((np.arange(-bounds[0], bounds[0]+1),
+                             np.arange(-bounds[1], bounds[1]+1),
+                             np.arange(-bounds[2], bounds[2]+1)))
+    Ls = np.dot(Ts[:,:dimension], a[:dimension])
+
+    if discard and len(Ls) > 1:
+        r = cell.atom_coords()
+        rr = r[:,None] - r
+        dist_max = np.linalg.norm(rr, axis=2).max()
+        Ls_mask = np.linalg.norm(Ls, axis=1) < rcut + dist_max
+        Ls = Ls[Ls_mask]
+    return np.asarray(Ls, order='C')
+
+if int(pyscf.__version__.split('.')[1]) < 10:
+    # Patch the get_lattice_Ls for pyscf-2.9 or older
+    Cell.get_lattice_Ls = get_lattice_Ls