From fcd42d83d37e9e365b9196f5ce2278b4ae828510 Mon Sep 17 00:00:00 2001
From: Qiming Sun <qiming.sun@bytedance.com>
Date: Thu, 17 Jul 2025 09:18:03 +0800
Subject: [PATCH 1/9] DFT+U gradients for molecular DFT

---
 gpu4pyscf/dft/rkspu.py                  | 350 ++++++++++++++++++++++++
 gpu4pyscf/dft/tests/test_dftu.py        |  45 +++
 gpu4pyscf/dft/ukspu.py                  | 242 ++++++++++++++++
 gpu4pyscf/grad/rhf.py                   |   2 +-
 gpu4pyscf/grad/rkspu.py                 | 130 +++++++++
 gpu4pyscf/grad/tests/test_grad_rkspu.py |  72 +++++
 gpu4pyscf/grad/tests/test_grad_ukspu.py |  55 ++++
 gpu4pyscf/grad/ukspu.py                 |  77 ++++++
 8 files changed, 972 insertions(+), 1 deletion(-)
 create mode 100644 gpu4pyscf/dft/rkspu.py
 create mode 100644 gpu4pyscf/dft/tests/test_dftu.py
 create mode 100644 gpu4pyscf/dft/ukspu.py
 create mode 100644 gpu4pyscf/grad/rkspu.py
 create mode 100644 gpu4pyscf/grad/tests/test_grad_rkspu.py
 create mode 100644 gpu4pyscf/grad/tests/test_grad_ukspu.py
 create mode 100644 gpu4pyscf/grad/ukspu.py

diff --git a/gpu4pyscf/dft/rkspu.py b/gpu4pyscf/dft/rkspu.py
new file mode 100644
index 00000000..9ff988d5
--- /dev/null
+++ b/gpu4pyscf/dft/rkspu.py
@@ -0,0 +1,350 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+DFT+U for molecules
+
+See also the pbc.dft.krkspu and pbc.dft.kukspu module
+
+Refs:
+    Heather J. Kulik, J. Chem. Phys. 142, 240901 (2015)
+'''
+
+import itertools
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from pyscf.data.nist import HARTREE2EV
+from pyscf.lo.iao import reference_mol
+from gpu4pyscf.dft import rks
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import asarray
+
+def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+    """
+    Coulomb + XC functional + Hubbard U terms for RKS+U.
+
+    .. note::
+        This function will change the ks object.
+
+    Args:
+        ks : an instance of :class:`RKS`
+            XC functional are controlled by ks.xc attribute.  Attribute
+            ks.grids might be initialized.
+        dm : ndarray or list of ndarrays
+            A density matrix or a list of density matrices
+
+    Returns:
+        Veff : ``(nao, nao)`` or ``(*, nao, nao)`` ndarray
+        Veff = J + Vxc + V_U.
+    """
+    if mol is None: mol = ks.mol
+    if dm is None: dm = ks.make_rdm1()
+
+    # J + V_xc
+    vxc = rks.get_veff(ks, mol, dm, dm_last=dm_last, vhf_last=vhf_last,
+                       hermi=hermi)
+
+    # V_U
+    ovlp = asarray(mol.intor('int1e_ovlp', hermi=1))
+    pmol = reference_mol(mol, ks.minao_ref)
+    U_idx, U_val, U_lab = _set_U(mol, pmol, ks.U_idx, ks.U_val)
+    # Construct orthogonal minao local orbitals.
+    assert ks.C_ao_lo is None
+    C_ao_lo = _make_minao_lo(mol, pmol)
+
+    alphas = ks.alpha
+    if not hasattr(alphas, '__len__'): # not a list or tuple
+        alphas = [alphas] * len(U_idx)
+
+    E_U = 0.0
+    logger.info(ks, "-" * 79)
+    lab_string = " "
+    with np.printoptions(precision=5, suppress=True, linewidth=1000):
+        for idx, val, lab, alpha in zip(U_idx, U_val, U_lab, alphas):
+            if ks.verbose >= logger.INFO:
+                lab_string = " "
+                for l in lab:
+                    lab_string += "%9s" %(l.split()[-1])
+                lab_sp = lab[0].split()
+                logger.info(ks, "local rdm1 of atom %s: ",
+                            " ".join(lab_sp[:2]) + " " + lab_sp[2][:2])
+            C_loc = C_ao_lo[:,idx]
+            SC = ovlp.dot(C_loc) # ~ C^{-1}
+            P = SC.conj().T.dot(dm).dot(SC)
+            loc_sites = P.shape[-1]
+            vhub_loc = (cp.eye(loc_sites) - P) * (val * 0.5)
+            if alpha is not None:
+                # The alpha perturbation is only applied to the linear term of
+                # the local density.
+                E_U += alpha * P.trace()
+                vhub_loc += cp.eye(loc_sites) * alpha
+            # vxc is a tagged array. The inplace updating avoids loosing the
+            # tagged attributes.
+            vxc[:] += SC.dot(vhub_loc).dot(SC.conj().T)
+            E_U += (val * 0.5) * (P.trace() - P.dot(P).trace() * 0.5)
+            logger.info(ks, "%s\n%s", lab_string, P)
+            logger.info(ks, "-" * 79)
+
+    E_U = E_U.real.get()[()]
+    if E_U < 0.0 and all(np.asarray(U_val) > 0):
+        logger.warn(ks, "E_U (%s) is negative...", E_U)
+    vxc.E_U = E_U
+    return vxc
+
+def energy_elec(mf, dm=None, h1e=None, vhf=None):
+    """
+    Electronic energy for RKSpU.
+    """
+    if dm is None: dm = mf.make_rdm1()
+    if h1e is None: h1e = mf.get_hcore()
+    if vhf is None: vhf = mf.get_veff(mf.mol, dm)
+    e1 = cp.einsum('ij,ji->', h1e, dm).get()[()].real
+    ecoul = vhf.ecoul.real
+    exc = vhf.exc.real
+    E_U = vhf.E_U
+    if isinstance(ecoul, cp.ndarray):
+        ecoul = ecoul.get()[()]
+    if isinstance(exc, cp.ndarray):
+        exc = exc.get()[()]
+    e2 = ecoul + exc + E_U
+    mf.scf_summary['e1'] = e1
+    mf.scf_summary['coul'] = ecoul
+    mf.scf_summary['exc'] = exc
+    mf.scf_summary['E_U'] = E_U
+    logger.debug(mf, 'E1 = %s  Ecoul = %s  Exc = %s  EU = %s', e1, ecoul, exc, E_U)
+    return e1+e2, e2
+
+def _groupby(inp, labels):
+    _, where, counts = np.unique(labels, return_index=True, return_counts=True)
+    return [inp[start:start+count] for start, count in zip(where, counts)]
+
+def _set_U(mol, minao_mol, U_idx, U_val):
+    """
+    Regularize the U_idx and U_val to each atom,
+    """
+    assert len(U_idx) == len(U_val)
+
+    ao_loc = minao_mol.ao_loc_nr()
+    dims = ao_loc[1:] - ao_loc[:-1]
+    # atm_ids labels the atom Id for each function
+    atm_ids = np.repeat(minao_mol._bas[:,gto.ATOM_OF], dims)
+
+    ao_labels = mol.ao_labels()
+    minao_labels = minao_mol.ao_labels()
+
+    U_indices = []
+    U_values = []
+    for i, idx in enumerate(U_idx):
+        if isinstance(idx, str):
+            lab_idx = minao_mol.search_ao_label(idx)
+            # Group basis functions centered on the same atom
+            for idxj in _groupby(lab_idx, atm_ids[lab_idx]):
+                U_indices.append(idxj)
+                U_values.append(U_val[i])
+        else:
+            # Map to MINAO indices
+            idx_minao = [minao_labels.index(ao_labels[i]) for i in idx]
+            U_indices.append(idx_minao)
+            U_values.append(U_val[i])
+
+    if len(U_indices) == 0:
+        logger.warn(mol, "No sites specified for Hubbard U. "
+                    "Please check if 'U_idx' is correctly specified")
+
+    U_values = np.asarray(U_values) / HARTREE2EV
+
+    U_labels = [[minao_labels[i] for i in idx] for idx in U_indices]
+    return U_indices, U_values, U_labels
+
+def _make_minao_lo(mol, minao_ref='minao'):
+    '''
+    Construct orthogonal minao local orbitals.
+    '''
+    if isinstance(minao_ref, str):
+        minao_mol = reference_mol(mol, minao_ref)
+    else:
+        minao_mol = minao_ref
+    ovlp = asarray(mol.intor('int1e_ovlp', hermi=1))
+    s12 = asarray(gto.intor_cross('int1e_ovlp', mol, minao_mol))
+    C_minao = cp.linalg.solve(ovlp, s12)
+    S0 = C_minao.T.dot(ovlp).dot(C_minao)
+    w2, v = cp.linalg.eigh(S0)
+    C_minao = C_minao.dot((v*cp.sqrt(1./w2)).dot(v.T))
+    return C_minao
+
+def _format_idx(idx_list):
+    string = ''
+    for k, g in itertools.groupby(enumerate(idx_list), lambda ix: ix[0] - ix[1]):
+        g = list(g)
+        if len(g) > 1:
+            string += '%d-%d, '%(g[0][1], g[-1][1])
+        else:
+            string += '%d, '%(g[0][1])
+    return string[:-2]
+
+def _print_U_info(mf, log):
+    mol = mf.mol
+    pmol = reference_mol(mol, mf.minao_ref)
+    U_idx, U_val, U_lab = _set_U(mol, pmol, mf.U_idx, mf.U_val)
+    alphas = mf.alpha
+    if not hasattr(alphas, '__len__'): # not a list or tuple
+        alphas = [alphas] * len(U_idx)
+    log.info("-" * 79)
+    log.info('U indices and values: ')
+    for idx, val, lab, alpha in zip(U_idx, U_val, U_lab, alphas):
+        log.info('%6s [%.6g eV] ==> %-100s', _format_idx(idx),
+                    val * HARTREE2EV, "".join(lab))
+        if alpha is not None:
+            log.info('              alpha for LR-cDFT %s (eV)',
+                     alpha * HARTREE2EV)
+    log.info("-" * 79)
+
+class RKSpU(rks.RKS):
+    """
+    DFT+U for RKS
+    """
+
+    _keys = {"U_idx", "U_val", "C_ao_lo", "U_lab", 'minao_ref', 'alpha'}
+
+    get_veff = get_veff
+    energy_elec = energy_elec
+    to_hf = NotImplemented
+
+    def __init__(self, mol, xc='LDA,VWN',
+                 U_idx=[], U_val=[], C_ao_lo=None, minao_ref='MINAO'):
+        """
+        Args:
+            U_idx: can be
+                   list of list: each sublist is a set indices for AO orbitals
+                                 (indcies corresponding to the large-basis-set mol).
+                   list of string: each string is one kind of LO orbitals,
+                                   e.g. ['Ni 3d', '1 O 2pz'].
+                   or a combination of these two.
+            U_val: a list of effective U [in eV], i.e. U-J in Dudarev's DFT+U.
+                   each U corresponds to one kind of LO orbitals, should have
+                   the same length as U_idx.
+            C_ao_lo: Customized LO coefficients, can be
+                     np.array, shape ((spin,), nao, nlo),
+            minao_ref: reference for minao orbitals, default is 'MINAO'.
+
+        Attributes:
+            U_idx: same as the input.
+            U_val: effectiv U-J [in AU]
+            C_ao_loc: np.array
+            alpha: the perturbation [in AU] used to compute U in LR-cDFT.
+                Refs: Cococcioni and de Gironcoli, PRB 71, 035105 (2005)
+        """
+        super().__init__(mol, xc=xc)
+
+        self.U_idx = U_idx
+        self.U_val = U_val
+        if isinstance(C_ao_lo, str):
+            assert C_ao_lo.upper() == 'MINAO'
+            C_ao_lo = None # API backward compatibility
+        self.C_ao_lo = C_ao_lo
+        self.minao_ref = minao_ref
+        # The perturbation (eV) used to compute U in LR-cDFT.
+        self.alpha = None
+
+    def dump_flags(self, verbose=None):
+        log = logger.new_logger(self, verbose)
+        super().dump_flags(log)
+        if log.verbose >= logger.INFO:
+            _print_U_info(self, log)
+        return self
+
+    def Gradients(self):
+        from gpu4pyscf.grad.rkspu import Gradients
+        return Gradients(self)
+
+    def nuc_grad_method(self):
+        return self.Gradients()
+
+def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
+    '''
+    Refs:
+        [1] M. Cococcioni and S. de Gironcoli, Phys. Rev. B 71, 035105 (2005)
+        [2] H. J. Kulik, M. Cococcioni, D. A. Scherlis, and N. Marzari, Phys. Rev. Lett. 97, 103001 (2006)
+        [3] Heather J. Kulik, J. Chem. Phys. 142, 240901 (2015)
+        [4] https://hjkgrp.mit.edu/tutorials/2011-05-31-calculating-hubbard-u/
+        [5] https://hjkgrp.mit.edu/tutorials/2011-06-28-hubbard-u-multiple-sites/
+
+    Args:
+        alphalist :
+            alpha parameters (in eV) are the displacements for the linear
+            response calculations. For each alpha in this list, the DFT+U with
+            U=u0+alpha, U=u0-alpha are evaluated. u0 is the U value from the
+            reference mf_plus_u object, which will be treated as a standard DFT
+            functional.
+    '''
+    assert isinstance(mf_plus_u, RKSpU)
+    assert len(mf_plus_u.U_idx) > 0
+    if not mf_plus_u.converged:
+        mf_plus_u.run()
+    assert mf_plus_u.converged
+    # The bare density matrix without adding U
+    bare_dm = mf_plus_u.make_rdm1()
+
+    mf = mf_plus_u.copy()
+    log = logger.new_logger(mf)
+
+    alphalist = np.asarray(alphalist)
+    alphalist = np.append(-alphalist[::-1], alphalist)
+
+    mol = mf.mol
+    pmol = reference_mol(mol, mf.minao_ref)
+    U_idx, U_val, U_lab = _set_U(mol, pmol, mf.U_idx, mf.U_val)
+    # Construct orthogonal minao local orbitals.
+    assert mf.C_ao_lo is None
+    C_ao_lo = _make_minao_lo(mol, pmol)
+    ovlp = asarray(mol.intor('int1e_ovlp', hermi=1))
+    C_inv = []
+    for idx in U_idx:
+        c = C_ao_lo[:,idx]
+        C_inv.append(c.conj().T.dot(ovlp))
+
+    bare_occupancies = []
+    final_occupancies = []
+    for alpha in alphalist:
+        # All in atomic unit
+        mf.alpha = alpha / HARTREE2EV
+        mf.kernel(dm0=bare_dm)
+        local_occ = 0
+        for c in C_inv:
+            C_on_site = c.dot(mf.mo_coeff)
+            rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
+            local_occ += rdm1_lo.trace()
+        final_occupancies.append(local_occ.get())
+
+        # The first iteration of SCF
+        fock = mf.get_fock(dm=bare_dm)
+        e, mo = mf.eig(fock, ovlp)
+        local_occ = 0
+        for c in C_inv:
+            C_on_site = c.dot(mo)
+            rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
+            local_occ += rdm1_lo.trace()
+        bare_occupancies.append(local_occ.get())
+        log.info('alpha=%f bare_occ=%g final_occ=%g',
+                 alpha, bare_occupancies[-1], final_occupancies[-1])
+
+    chi0, occ0 = np.polyfit(alphalist, bare_occupancies, deg=1)
+    chif, occf = np.polyfit(alphalist, final_occupancies, deg=1)
+    log.info('Line fitting chi0 = %f x + %f', chi0, occ0)
+    log.info('Line fitting chif = %f x + %f', chif, occf)
+    Uresp = 1./chi0 - 1./chif
+    log.note('Uresp = %f, chi0 = %f, chif = %f', Uresp, chi0, chif)
+    return Uresp
diff --git a/gpu4pyscf/dft/tests/test_dftu.py b/gpu4pyscf/dft/tests/test_dftu.py
new file mode 100644
index 00000000..741d2f6c
--- /dev/null
+++ b/gpu4pyscf/dft/tests/test_dftu.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+import numpy as np
+from pyscf import gto
+from gpu4pyscf.dft import rkspu, ukspu
+
+class KnownValues(unittest.TestCase):
+    def test_RKSpU_linear_response(self):
+        mol = gto.M(atom='''
+        O  0.   0.       0.
+        H  0.   -0.757   0.587
+        H  0.   0.757    0.587''', basis='6-31g')
+        mf = rkspu.RKSpU(mol, xc='pbe', U_idx=['O 2p'], U_val=[3.5])
+        mf.run()
+        u0 = rkspu.linear_response_u(mf)
+        assert abs(u0 - 5.8926) < 1e-2
+
+    def test_UKSpU_linear_response(self):
+        mol = gto.M(atom='''
+        O  0.   0.       0.
+        H  0.   -0.757   0.587
+        H  0.   0.757    0.587''', basis='6-31g')
+        mf = ukspu.UKSpU(mol, xc='pbe', U_idx=['O 2p'], U_val=[3.5])
+        mf.run()
+        u0 = ukspu.linear_response_u(mf)
+        assert abs(u0 - 5.8926) < 1e-2
+
+if __name__ == '__main__':
+    print("Full Tests for dft+U")
+    unittest.main()
diff --git a/gpu4pyscf/dft/ukspu.py b/gpu4pyscf/dft/ukspu.py
new file mode 100644
index 00000000..14492b33
--- /dev/null
+++ b/gpu4pyscf/dft/ukspu.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+DFT+U on molecules
+
+See also the pbc.dft.krkspu and pbc.dft.kukspu module
+'''
+
+import numpy as np
+import cupy as cp
+from pyscf.data.nist import HARTREE2EV
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import asarray
+from gpu4pyscf.dft import uks
+from gpu4pyscf.dft.rkspu import (
+    _set_U, _make_minao_lo, _print_U_info, reference_mol)
+
+def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+    """
+    Coulomb + XC functional + (Hubbard - double counting) for UKS+U.
+    """
+    if mol is None: mol = ks.mol
+    if dm is None: dm = ks.make_rdm1()
+
+    # J + V_xc
+    vxc = uks.get_veff(ks, mol, dm, dm_last=dm_last, vhf_last=vhf_last,
+                       hermi=hermi)
+
+    # V_U
+    ovlp = asarray(mol.intor('int1e_ovlp', hermi=1))
+    pmol = reference_mol(mol, ks.minao_ref)
+    U_idx, U_val, U_lab = _set_U(mol, pmol, ks.U_idx, ks.U_val)
+    # Construct orthogonal minao local orbitals.
+    assert ks.C_ao_lo is None
+    C_ao_lo = _make_minao_lo(mol, pmol)
+
+    alphas = ks.alpha
+    if not hasattr(alphas, '__len__'): # not a list or tuple
+        alphas = [alphas] * len(U_idx)
+
+    E_U = 0.0
+    logger.info(ks, "-" * 79)
+    lab_string = " "
+    with np.printoptions(precision=5, suppress=True, linewidth=1000):
+        for idx, val, lab, alpha in zip(U_idx, U_val, U_lab, alphas):
+            if ks.verbose >= logger.INFO:
+                lab_string = " "
+                for l in lab:
+                    lab_string += "%9s" %(l.split()[-1])
+                lab_sp = lab[0].split()
+                logger.info(ks, "local rdm1 of atom %s: ",
+                            " ".join(lab_sp[:2]) + " " + lab_sp[2][:2])
+            C_loc = C_ao_lo[:,idx]
+            SC = ovlp.dot(C_loc) # ~ C^{-1}
+            for s in range(2):
+                P = SC.conj().T.dot(dm[s]).dot(SC)
+                loc_sites = P.shape[-1]
+                vhub_loc = (cp.eye(loc_sites) - P * 2.0) * (val * 0.5)
+                if alpha is not None:
+                    # LR-cDFT perturbation for Hubbard U
+                    E_U += alpha * P.trace()
+                    vhub_loc += cp.eye(loc_sites) * alpha
+                vxc[s] += SC.dot(vhub_loc).dot(SC.conj().T)
+                E_U += (val * 0.5) * (P.trace() - P.dot(P).trace())
+                logger.info(ks, "spin %s\n%s\n%s", s, lab_string, P)
+            logger.info(ks, "-" * 79)
+
+    E_U = E_U.real.get()[()]
+    if E_U < 0.0 and all(np.asarray(U_val) > 0):
+        logger.warn(ks, "E_U (%s) is negative...", E_U)
+    vxc.E_U = E_U
+    return vxc
+
+def energy_elec(mf, dm=None, h1e=None, vhf=None):
+    """
+    Electronic energy for UKSpU.
+    """
+    if dm is None: dm = mf.make_rdm1()
+    if h1e is None: h1e = mf.get_hcore()
+    if vhf is None: vhf = mf.get_veff(mf.mol, dm)
+    e1 = cp.einsum('ij,nji->', h1e, dm).get()[()].real
+    ecoul = vhf.ecoul.real
+    exc = vhf.exc.real
+    E_U = vhf.E_U
+    if isinstance(ecoul, cp.ndarray):
+        ecoul = ecoul.get()[()]
+    if isinstance(exc, cp.ndarray):
+        exc = exc.get()[()]
+    e2 = ecoul + exc + E_U
+    mf.scf_summary['e1'] = e1
+    mf.scf_summary['coul'] = ecoul
+    mf.scf_summary['exc'] = exc
+    mf.scf_summary['E_U'] = E_U
+    logger.debug(mf, 'E1 = %s  Ecoul = %s  Exc = %s  EU = %s', e1, ecoul, exc, E_U)
+    return e1+e2, e2
+
+class UKSpU(uks.UKS):
+    """
+    UKSpU class adapted for PBCs with k-point sampling.
+    """
+
+    _keys = {"U_idx", "U_val", "C_ao_lo", "U_lab", 'minao_ref', 'alpha'}
+
+    get_veff = get_veff
+    energy_elec = energy_elec
+    to_hf = NotImplemented
+
+    def __init__(self, mol, xc='LDA,VWN',
+                 U_idx=[], U_val=[], C_ao_lo=None, minao_ref='MINAO'):
+        """
+        DFT+U args:
+            U_idx: can be
+                   list of list: each sublist is a set indices for AO orbitals
+                                 (indcies corresponding to the large-basis-set mol).
+                   list of string: each string is one kind of LO orbitals,
+                                   e.g. ['Ni 3d', '1 O 2pz'].
+                   or a combination of these two.
+            U_val: a list of effective U [in eV], i.e. U-J in Dudarev's DFT+U.
+                   each U corresponds to one kind of LO orbitals, should have
+                   the same length as U_idx.
+            C_ao_lo: LO coefficients, can be
+                     np.array, shape ((spin,), nao, nlo),
+                     string, in 'minao'.
+            minao_ref: reference for minao orbitals, default is 'MINAO'.
+
+        Attributes:
+            U_idx: same as the input.
+            U_val: effectiv U-J [in AU]
+            C_ao_loc: np.array
+            alpha: the perturbation [in AU] used to compute U in LR-cDFT.
+                Refs: Cococcioni and de Gironcoli, PRB 71, 035105 (2005)
+        """
+        super(self.__class__, self).__init__(mol, xc=xc)
+        self.U_idx = U_idx
+        self.U_val = U_val
+        if isinstance(C_ao_lo, str):
+            assert C_ao_lo.upper() == 'MINAO'
+            C_ao_lo = None # API backward compatibility
+        self.C_ao_lo = C_ao_lo
+        self.minao_ref = minao_ref
+        self.alpha = None
+
+    def dump_flags(self, verbose=None):
+        super().dump_flags(verbose)
+        log = logger.new_logger(self, verbose)
+        if log.verbose >= logger.INFO:
+            _print_U_info(self, log)
+        return self
+
+    def Gradients(self):
+        from gpu4pyscf.grad.ukspu import Gradients
+        return Gradients(self)
+
+    def nuc_grad_method(self):
+        return self.Gradients()
+
+def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
+    '''
+    Refs:
+        [1] M. Cococcioni and S. de Gironcoli, Phys. Rev. B 71, 035105 (2005)
+        [2] H. J. Kulik, M. Cococcioni, D. A. Scherlis, and N. Marzari, Phys. Rev. Lett. 97, 103001 (2006)
+        [3] Heather J. Kulik, J. Chem. Phys. 142, 240901 (2015)
+        [4] https://hjkgrp.mit.edu/tutorials/2011-05-31-calculating-hubbard-u/
+        [5] https://hjkgrp.mit.edu/tutorials/2011-06-28-hubbard-u-multiple-sites/
+
+    Args:
+        alphalist :
+            alpha parameters (in eV) are the displacements for the linear
+            response calculations. For each alpha in this list, the DFT+U with
+            U=u0+alpha, U=u0-alpha are evaluated. u0 is the U value from the
+            reference mf_plus_u object, which will be treated as a standard DFT
+            functional.
+    '''
+    assert isinstance(mf_plus_u, UKSpU)
+    assert len(mf_plus_u.U_idx) > 0
+    if not mf_plus_u.converged:
+        mf_plus_u.run()
+    assert mf_plus_u.converged
+    # The bare density matrix without adding U
+    bare_dm = mf_plus_u.make_rdm1()
+
+    mf = mf_plus_u.copy()
+    log = logger.new_logger(mf)
+
+    alphalist = np.asarray(alphalist)
+    alphalist = np.append(-alphalist[::-1], alphalist)
+
+    mol = mf.mol
+    pmol = reference_mol(mol, mf.minao_ref)
+    U_idx, U_val, U_lab = _set_U(mol, pmol, mf.U_idx, mf.U_val)
+    assert mf.C_ao_lo is None
+    C_ao_lo = _make_minao_lo(mol, pmol)
+    ovlp = asarray(mol.intor('int1e_ovlp', hermi=1))
+    C_inv = []
+    for idx in U_idx:
+        c = C_ao_lo[:,idx]
+        C_inv.append(c.conj().T.dot(ovlp))
+
+    bare_occupancies = []
+    final_occupancies = []
+    for alpha in alphalist:
+        mf.alpha = alpha / HARTREE2EV
+        mf.kernel(dm0=bare_dm)
+        local_occ = 0
+        for c in C_inv:
+            C_on_site = [c.dot(mf.mo_coeff[0]), c.dot(mf.mo_coeff[1])]
+            rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
+            local_occ += rdm1_lo[0].trace() + rdm1_lo[1].trace()
+        final_occupancies.append(local_occ.get())
+
+        # The first iteration of SCF
+        fock = mf.get_fock(dm=bare_dm)
+        e, mo = mf.eig(fock, ovlp)
+        local_occ = 0
+        for c in C_inv:
+            C_on_site = [c.dot(mo[0]), c.dot(mo[1])]
+            rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
+            local_occ += rdm1_lo[0].trace() + rdm1_lo[1].trace()
+        bare_occupancies.append(local_occ.get())
+        log.info('alpha=%f bare_occ=%g final_occ=%g',
+                 alpha, bare_occupancies[-1], final_occupancies[-1])
+
+    chi0, occ0 = np.polyfit(alphalist, bare_occupancies, deg=1)
+    chif, occf = np.polyfit(alphalist, final_occupancies, deg=1)
+    log.info('Line fitting chi0 = %f x + %f', chi0, occ0)
+    log.info('Line fitting chif = %f x + %f', chif, occf)
+    Uresp = 1./chi0 - 1./chif
+    log.note('Uresp = %f, chi0 = %f, chif = %f', Uresp, chi0, chif)
+    return Uresp
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index e7a25640..41d75389 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -295,7 +295,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     dm0 = tag_array(dm0, mo_coeff=mo_coeff, mo_occ=mo_occ)
     extra_force = cupy.zeros((len(atmlst),3))
     for k, ia in enumerate(atmlst):
-        extra_force[k] += mf_grad.extra_force(ia, locals())
+        extra_force[k] += cupy.asarray(mf_grad.extra_force(ia, locals()))
 
     log.timer_debug1('gradients of 2e part', *t3)
 
diff --git a/gpu4pyscf/grad/rkspu.py b/gpu4pyscf/grad/rkspu.py
new file mode 100644
index 00000000..07dec9c9
--- /dev/null
+++ b/gpu4pyscf/grad/rkspu.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Analytical derivatives for DFT+U
+'''
+
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf.grad import rks as rks_grad
+from gpu4pyscf.dft.rkspu import _set_U, _make_minao_lo, reference_mol
+from gpu4pyscf.lib.cupy_helper import asarray, contract
+
+def generate_first_order_local_orbitals(mol, minao_ref='MINAO'):
+    if isinstance(minao_ref, str):
+        pmol = reference_mol(mol, minao_ref)
+    else:
+        pmol = minao_ref
+    sAA = asarray(mol.intor('int1e_ovlp', hermi=1))
+    sAB = asarray(gto.intor_cross('int1e_ovlp', mol, pmol))
+    C0_minao = cp.linalg.solve(sAA, sAB)
+
+    # Lowdin orthogonalization coefficients = S^{-1/2}
+    S0 = sAB.conj().T.dot(C0_minao)
+    w2, v = cp.linalg.eigh(S0)
+    w = cp.sqrt(w2)
+    S0_lowdin = (v/w).dot(v.conj().T)
+
+    sAA_ip1 = asarray(mol.intor('int1e_ipovlp'))
+    sAB_ip1 = asarray(gto.intor_cross('int1e_ipovlp', mol, pmol))
+    sBA_ip1 = asarray(gto.intor_cross('int1e_ipovlp', pmol, mol))
+
+    nao, n_minao = C0_minao.shape
+    aoslice = mol.aoslice_by_atom()
+    minao_slice = pmol.aoslice_by_atom()
+
+    def make_coeff(atm_id):
+        p0, p1 = aoslice[atm_id,2:]
+        q0, q1 = minao_slice[atm_id,2:]
+        C1 = cp.empty((3, nao, n_minao))
+        for n in range(3):
+            sAA1 = cp.zeros((nao, nao))
+            sAA1[p0:p1,:] -= sAA_ip1[n,p0:p1]
+            sAA1[:,p0:p1] -= sAA_ip1[n,p0:p1].conj().T
+            sAB1 = cp.zeros((nao, n_minao))
+            sAB1[p0:p1,:] -= sAB_ip1[n,p0:p1]
+            sAB1[:,q0:q1] -= sBA_ip1[n,q0:q1].conj().T
+
+            # The first order of A = S^{-1/2}
+            # A S A = 1
+            # A1 S0 A0 + A0 S1 A0 + A0 S0 A1 = 0
+            # inv(A0) A1 S0 + S1 + S0 A1 inv(A0) = 0
+            # A0 = (U/w) U^T = U (U/w)^T
+            # U (U w)^T A1 U w^2 U^T + U w^2 U^T A1 U w U^T = -S1
+            # (Uw)^T A1 Uw = - U^T S1 U / (w[:,None] + w)
+            S1 = sAB1.conj().T.dot(C0_minao)
+            S1 = S1 + S1.conj().T
+            S1 -= C0_minao.conj().T.dot(sAA1).dot(C0_minao)
+            S1 = v.conj().T.dot(-S1).dot(v)
+            S1 /= (w[:,None] + w)
+            vw = v / w
+            S1_lowdin = vw.dot(S1).dot(vw.conj().T)
+
+            C1_minao = cp.linalg.solve(sAA, sAB1 - sAA1.dot(C0_minao))
+            C1[n] = C1_minao.dot(S0_lowdin)
+            C1[n] += C0_minao.dot(S1_lowdin)
+        return C1
+    return make_coeff
+
+def _hubbard_U_deriv1(mf, dm=None):
+    assert mf.alpha is None
+    assert mf.C_ao_lo is None
+    assert mf.minao_ref is not None
+    if dm is None:
+        dm = mf.make_rdm1()
+
+    mol = mf.mol
+    # Construct orthogonal minao local orbitals.
+    pmol = reference_mol(mol, mf.minao_ref)
+    C_ao_lo = _make_minao_lo(mol, pmol)
+    U_idx, U_val = _set_U(mol, pmol, mf.U_idx, mf.U_val)[:2]
+    U_idx_stack = np.hstack(U_idx)
+    C0 = C_ao_lo[:,U_idx_stack]
+
+    ovlp0 = mf.get_ovlp()
+    C_inv = C0.conj().T.dot(ovlp0)
+    dm_deriv0 = C_inv.dot(dm).dot(C_inv.conj().T)
+    ovlp1 = asarray(mol.intor('int1e_ipovlp'))
+    f_local_ao = generate_first_order_local_orbitals(mol, pmol)
+
+    ao_slices = mol.aoslice_by_atom()
+    natm = mol.natm
+    dE_U = cp.zeros((natm, 3))
+    for atm_id, (p0, p1) in enumerate(ao_slices[:,2:]):
+        C1 = f_local_ao(atm_id)[:,:,U_idx_stack]
+        SC1 = contract('pq,xqi->xpi', ovlp0, C1)
+        SC1 -= contract('xqp,qi->xpi', ovlp1[:,p0:p1], C0[p0:p1])
+        SC1[:,p0:p1] -= contract('xpq,qi->xpi', ovlp1[:,p0:p1], C0)
+        dm_deriv1 = contract('pj,xjq->xpq', C_inv.dot(dm), SC1)
+        i0 = i1 = 0
+        for idx, val in zip(U_idx, U_val):
+            i0, i1 = i1, i1 + len(idx)
+            P0 = dm_deriv0[i0:i1,i0:i1]
+            P1 = dm_deriv1[:,i0:i1,i0:i1]
+            dE_U[atm_id] += (val * 0.5) * (
+                cp.einsum('xii->x', P1).real * 2 # *2 for P1+P1.T
+                - cp.einsum('xij,ji->x', P1, P0).real * 2)
+    return dE_U
+
+class Gradients(rks_grad.Gradients):
+    def get_veff(self, mol=None, dm=None):
+        self._dE_U = _hubbard_U_deriv1(self.base, dm)
+        return rks_grad.get_veff(self, mol, dm)
+
+    def extra_force(self, atom_id, envs):
+        val = super().extra_force(atom_id, envs)
+        return self._dE_U[atom_id] + val
diff --git a/gpu4pyscf/grad/tests/test_grad_rkspu.py b/gpu4pyscf/grad/tests/test_grad_rkspu.py
new file mode 100644
index 00000000..b3a14303
--- /dev/null
+++ b/gpu4pyscf/grad/tests/test_grad_rkspu.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy
+from pyscf import gto, lib
+from gpu4pyscf.dft import rkspu
+from gpu4pyscf.grad import rkspu as rkspu_grad
+
+class KnownValues(unittest.TestCase):
+    def test_finite_diff_local_orbitals(self):
+        mol = gto.M(atom='C 0 1.6 0; O 0 0 1', basis='ccpvdz', unit='B')
+        f_local = rkspu_grad.generate_first_order_local_orbitals(mol)
+
+        C1 = f_local(0)
+        pmol = mol.copy()
+        C0p = rkspu._make_minao_lo(pmol.set_geom_('C 0 1.601 0; O 0 0 1', unit='B'))
+        C0m = rkspu._make_minao_lo(pmol.set_geom_('C 0 1.599 0; O 0 0 1', unit='B'))
+        ref = (C0p - C0m) / 2e-3
+        self.assertAlmostEqual(abs(C1[1] - ref).max().get(), 0, 6)
+
+        C1 = f_local(1)
+        C0p = rkspu._make_minao_lo(pmol.set_geom_('C 0 1.6 0; O 0 0 1.001', unit='B'))
+        C0m = rkspu._make_minao_lo(pmol.set_geom_('C 0 1.6 0; O 0 0 0.999', unit='B'))
+        ref = (C0p - C0m) / 2e-3
+        self.assertAlmostEqual(abs(C1[2] - ref).max().get(), 0, 6)
+
+    def test_finite_diff_hubbard_U_grad(self):
+        mol = gto.M(atom='C 0 1.6 0; O 0 0 1', basis='ccpvdz', unit='B')
+        U_idx = ["C 2p"]
+        U_val = [5.0]
+        mf = rkspu.RKSpU(mol, U_idx=U_idx, U_val=U_val)
+        mf.__dict__.update(mol.RHF().to_gpu().run().__dict__)
+        de = rkspu_grad._hubbard_U_deriv1(mf).get()
+
+        mf.mol.set_geom_('C 0 1.6 0; O 0 0 1.001', unit='B')
+        e1 = mf.get_veff().E_U
+
+        mf.mol.set_geom_('C 0 1.6 0; O 0 0 0.999', unit='B')
+        e2 = mf.get_veff().E_U
+        self.assertAlmostEqual(de[1,2], (e1 - e2)/2e-3, 6)
+
+    def test_finite_diff_rkspu_grad(self):
+        mol = gto.M(atom='C 0 1.6 0; O 0 0 1', basis='ccpvdz', unit='B', verbose=0)
+        U_idx = ["C 2p"]
+        U_val = [5.0]
+        mf = rkspu.RKSpU(mol, xc='pbe', U_idx=U_idx, U_val=U_val)
+        e, g = mf.nuc_grad_method().as_scanner()(mol)
+        self.assertAlmostEqual(e, -113.0468869772191, 8)
+        self.assertAlmostEqual(lib.fp(g), -0.7728822285688215, 5)
+
+        mol1 = mol.copy()
+        mf_scanner = mf.as_scanner()
+        e1 = mf_scanner(mol1.set_geom_('C 0 1.6 0; O 0 0 1.001', unit='B'))
+        e2 = mf_scanner(mol1.set_geom_('C 0 1.6 0; O 0 0 0.999', unit='B'))
+        self.assertAlmostEqual(g[1,2], (e1-e2)/2e-3, 4)
+
+if __name__ == '__main__':
+    print("Full Tests for RKS+U Gradients")
+    unittest.main()
diff --git a/gpu4pyscf/grad/tests/test_grad_ukspu.py b/gpu4pyscf/grad/tests/test_grad_ukspu.py
new file mode 100644
index 00000000..fa1f4c85
--- /dev/null
+++ b/gpu4pyscf/grad/tests/test_grad_ukspu.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy
+from pyscf import gto, lib
+from gpu4pyscf.dft import ukspu
+from gpu4pyscf.grad import ukspu as ukspu_grad
+
+class KnownValues(unittest.TestCase):
+    def test_finite_diff_hubbard_U_grad(self):
+        mol = gto.M(atom='C 0 1.6 0; O 0 0 1', spin=2, basis='ccpvdz', unit='B')
+        U_idx = ["C 2p"]
+        U_val = [5.0]
+        mf = ukspu.UKSpU(mol, U_idx=U_idx, U_val=U_val)
+        mf.__dict__.update(mol.UHF().to_gpu().run().__dict__)
+        de = ukspu_grad._hubbard_U_deriv1(mf).get()
+
+        mf.mol.set_geom_('C 0 1.6 0; O 0 0 1.001', unit='B')
+        e1 = mf.get_veff().E_U
+
+        mf.mol.set_geom_('C 0 1.6 0; O 0 0 0.999', unit='B')
+        e2 = mf.get_veff().E_U
+        self.assertAlmostEqual(de[1,2], (e1 - e2)/2e-3, 6)
+
+    def test_finite_diff_ukspu_grad(self):
+        mol = gto.M(atom='C 0 1.6 0; O 0 0 1', spin=2, basis='ccpvdz', unit='B', verbose=0)
+        U_idx = ["C 2p"]
+        U_val = [5.0]
+        mf = ukspu.UKSpU(mol, xc='pbe', U_idx=U_idx, U_val=U_val)
+        e, g = mf.nuc_grad_method().as_scanner()(mol)
+        self.assertAlmostEqual(e, -112.7869835493314, 8)
+        self.assertAlmostEqual(lib.fp(g), -1.0489136822191656, 5)
+
+        mol1 = mol.copy()
+        mf_scanner = mf.as_scanner()
+        e1 = mf_scanner(mol1.set_geom_('C 0 1.6 0; O 0 0 1.001', unit='B'))
+        e2 = mf_scanner(mol1.set_geom_('C 0 1.6 0; O 0 0 0.999', unit='B'))
+        self.assertAlmostEqual(g[1,2], (e1-e2)/2e-3, 4)
+
+if __name__ == '__main__':
+    print("Full Tests for UKS+U Gradients")
+    unittest.main()
diff --git a/gpu4pyscf/grad/ukspu.py b/gpu4pyscf/grad/ukspu.py
new file mode 100644
index 00000000..de7b6395
--- /dev/null
+++ b/gpu4pyscf/grad/ukspu.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Analytical derivatives for DFT+U
+'''
+
+import numpy as np
+import cupy as cp
+from gpu4pyscf.dft.ukspu import UKSpU
+from gpu4pyscf.dft.rkspu import _set_U, _make_minao_lo, reference_mol
+from gpu4pyscf.grad import uks as uks_grad
+from gpu4pyscf.grad.rkspu import generate_first_order_local_orbitals
+from gpu4pyscf.lib.cupy_helper import asarray, contract
+
+def _hubbard_U_deriv1(mf, dm=None):
+    assert mf.alpha is None
+    assert mf.C_ao_lo is None
+    assert mf.minao_ref is not None
+    if dm is None:
+        dm = mf.make_rdm1()
+
+    mol = mf.mol
+    # Construct orthogonal minao local orbitals.
+    pmol = reference_mol(mol, mf.minao_ref)
+    C_ao_lo = _make_minao_lo(mol, pmol)
+    U_idx, U_val = _set_U(mol, pmol, mf.U_idx, mf.U_val)[:2]
+    U_idx_stack = np.hstack(U_idx)
+    C0 = C_ao_lo[:,U_idx_stack]
+
+    ovlp0 = mf.get_ovlp()
+    C_inv = C0.conj().T.dot(ovlp0)
+    dm_deriv0 = [C_inv.dot(dm[0]).dot(C_inv.conj().T),
+                 C_inv.dot(dm[1]).dot(C_inv.conj().T)]
+    ovlp1 = asarray(mol.intor('int1e_ipovlp'))
+    f_local_ao = generate_first_order_local_orbitals(mol, pmol)
+
+    ao_slices = mol.aoslice_by_atom()
+    natm = mol.natm
+    dE_U = cp.zeros((natm, 3))
+    for atm_id, (p0, p1) in enumerate(ao_slices[:,2:]):
+        C1 = f_local_ao(atm_id)[:,:,U_idx_stack]
+        SC1 = contract('pq,xqi->xpi', ovlp0, C1)
+        SC1 -= contract('xqp,qi->xpi', ovlp1[:,p0:p1], C0[p0:p1])
+        SC1[:,p0:p1] -= contract('xpq,qi->xpi', ovlp1[:,p0:p1], C0)
+        for s in range(2):
+            dm_deriv1 = contract('pj,xjq->xpq', C_inv.dot(dm[s]), SC1)
+            i0 = i1 = 0
+            for idx, val in zip(U_idx, U_val):
+                i0, i1 = i1, i1 + len(idx)
+                P0 = dm_deriv0[s][i0:i1,i0:i1]
+                P1 = dm_deriv1[:,i0:i1,i0:i1]
+                dE_U[atm_id] += (val * 0.5) * (
+                    cp.einsum('xii->x', P1).real * 2 # *2 for P1+P1.T
+                    - cp.einsum('xij,ji->x', P1, P0).real * 4)
+    return dE_U
+
+class Gradients(uks_grad.Gradients):
+    def get_veff(self, mol=None, dm=None):
+        self._dE_U = _hubbard_U_deriv1(self.base, dm)
+        return uks_grad.get_veff(self, mol, dm)
+
+    def extra_force(self, atom_id, envs):
+        val = super().extra_force(atom_id, envs)
+        return self._dE_U[atom_id] + val

From bec98b4f9de131e3b0615e6114db90f5de627f3b Mon Sep 17 00:00:00 2001
From: Qiming Sun <qiming.sun@bytedance.com>
Date: Thu, 17 Jul 2025 09:47:20 +0800
Subject: [PATCH 2/9] Refactor kpoint DFT+U

---
 gpu4pyscf/pbc/dft/krkspu.py                   | 272 +++++++++++-------
 gpu4pyscf/pbc/dft/kukspu.py                   | 215 +++++++++-----
 .../tests/{test_dftu.py => test_pbc_dftu.py}  |  56 ++++
 3 files changed, 366 insertions(+), 177 deletions(-)
 rename gpu4pyscf/pbc/dft/tests/{test_dftu.py => test_pbc_dftu.py} (58%)

diff --git a/gpu4pyscf/pbc/dft/krkspu.py b/gpu4pyscf/pbc/dft/krkspu.py
index b9504b8b..71648e74 100644
--- a/gpu4pyscf/pbc/dft/krkspu.py
+++ b/gpu4pyscf/pbc/dft/krkspu.py
@@ -19,14 +19,16 @@
 Refs: PRB, 1998, 57, 1505.
 """
 
-import cupy as cp
 import numpy as np
-from pyscf.lo import vec_lowdin
-from pyscf.pbc.dft import krkspu as krkspu_cpu
+import cupy as cp
+
+from pyscf import __config__
 from pyscf.data.nist import HARTREE2EV
+from pyscf.pbc import gto as pgto
+from gpu4pyscf.dft.rkspu import _set_U, reference_mol
 from gpu4pyscf.lib import logger
 from gpu4pyscf.pbc.dft import krks
-from gpu4pyscf.lib.cupy_helper import asarray, tag_array
+from gpu4pyscf.lib.cupy_helper import asarray
 
 def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
              kpts=None, kpts_band=None):
@@ -61,66 +63,66 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
 def _add_Vhubbard(vxc, ks, dm, kpts):
     '''Add Hubbard U to Vxc matrix inplace.
     '''
-    C_ao_lo = asarray(ks.C_ao_lo)
-    ovlp = ks.get_ovlp()
-    nkpts = len(kpts)
-    nlo = C_ao_lo.shape[-1]
-
-    rdm1_lo = cp.empty((nkpts, nlo, nlo), dtype=np.complex128)
-    for k in range(nkpts):
-        C_inv = C_ao_lo[k].conj().T.dot(ovlp[k])
-        rdm1_lo[k] = C_inv.dot(dm[k]).dot(C_inv.conj().T)
+    cell = ks.cell
+    pcell = reference_mol(cell, ks.minao_ref)
 
     is_ibz = hasattr(kpts, "kpts_ibz")
+    kpts_input = kpts
     if is_ibz:
-        rdm1_lo_0 = kpts.dm_at_ref_cell(rdm1_lo)
+        raise NotImplementedError('DFT+U for k-point symmetry')
+    kpts = kpts.reshape(-1, 3)
+    nkpts = len(kpts)
+
+    ovlp = asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+    U_idx, U_val, U_lab = _set_U(cell, pcell, ks.U_idx, ks.U_val)
+    assert ks.C_ao_lo is None
+    C_ao_lo = _make_minao_lo(cell, pcell, kpts)
 
     alphas = ks.alpha
     if not hasattr(alphas, '__len__'): # not a list or tuple
-        alphas = [alphas] * len(ks.U_idx)
+        alphas = [alphas] * len(U_idx)
 
     E_U = 0.0
-    weight = getattr(kpts, "weights_ibz", np.repeat(1.0/nkpts, nkpts))
+    weight = getattr(kpts_input, "weights_ibz", np.repeat(1.0/nkpts, nkpts))
     logger.info(ks, "-" * 79)
+    lab_string = " "
     with np.printoptions(precision=5, suppress=True, linewidth=1000):
-        for idx, val, lab, alpha in zip(ks.U_idx, ks.U_val, ks.U_lab, alphas):
-            lab_string = " "
-            for l in lab:
-                lab_string += "%9s" %(l.split()[-1])
-            lab_sp = lab[0].split()
-            logger.info(ks, "local rdm1 of atom %s: ",
-                        " ".join(lab_sp[:2]) + " " + lab_sp[2][:2])
-            U_mesh = np.ix_(idx, idx)
-            P_loc = 0.0
+        for idx, val, lab, alpha in zip(U_idx, U_val, U_lab, alphas):
+            if ks.verbose >= logger.INFO:
+                lab_string = " "
+                for l in lab:
+                    lab_string += "%9s" %(l.split()[-1])
+                lab_sp = lab[0].split()
+                logger.info(ks, "local rdm1 of atom %s: ",
+                            " ".join(lab_sp[:2]) + " " + lab_sp[2][:2])
+
+            P_loc = []
             for k in range(nkpts):
-                S_k = ovlp[k]
-                C_k = C_ao_lo[k][:, idx]
-                P_k = rdm1_lo[k][U_mesh]
+                C_loc = C_ao_lo[k][:,idx]
+                SC = ovlp[k].dot(C_loc) # ~ C^{-1}
+                P_k = SC.conj().T.dot(dm[k]).dot(SC)
                 E_U += weight[k] * (val * 0.5) * (P_k.trace() - P_k.dot(P_k).trace() * 0.5)
-                vhub_loc = (cp.eye(P_k.shape[-1]) - P_k) * (val * 0.5)
+                loc_sites = P_k.shape[-1]
+                vhub_loc = (cp.eye(loc_sites) - P_k) * (val * 0.5)
                 if alpha is not None:
                     # The alpha perturbation is only applied to the linear term of
                     # the local density.
                     E_U += weight[k] * alpha * P_k.trace()
-                    vhub_loc += cp.eye(P_k.shape[-1]) * alpha
-                SC = S_k.dot(C_k)
+                    vhub_loc += cp.eye(loc_sites) * alpha
                 vhub_loc = SC.dot(vhub_loc).dot(SC.conj().T)
-                if vxc.dtype == np.float64:
+                if vxc[k].dtype == np.float64:
                     vhub_loc = vhub_loc.real
                 vxc[k] += vhub_loc
-                if not is_ibz:
-                    P_loc += P_k
-            if is_ibz:
-                P_loc = rdm1_lo_0[U_mesh].real
-            else:
-                P_loc = P_loc.real / nkpts
-            logger.info(ks, "%s\n%s", lab_string, P_loc)
-            logger.info(ks, "-" * 79)
-
-    E_U = E_U.get()[()]
-    if E_U.real < 0.0 and all(np.asarray(ks.U_val) > 0):
-        logger.warn(ks, "E_U (%g) is negative...", E_U.real)
-    vxc = tag_array(vxc, E_U=E_U)
+                P_loc.append(P_k)
+            if ks.verbose >= logger.INFO:
+                P_loc = sum(P_loc).real / nkpts
+                logger.info(ks, "%s\n%s", lab_string, P_loc)
+                logger.info(ks, "-" * 79)
+
+    E_U = E_U.real.get()[()]
+    if E_U < 0.0 and all(np.asarray(U_val) > 0):
+        logger.warn(ks, "E_U (%s) is negative...", E_U)
+    vxc.E_U = E_U
     return vxc
 
 def energy_elec(mf, dm_kpts=None, h1e_kpts=None, vhf=None):
@@ -143,66 +145,61 @@ def energy_elec(mf, dm_kpts=None, h1e_kpts=None, vhf=None):
     e2 = ecoul + exc + E_U
     tot_e = e1 + e2
     mf.scf_summary['e1'] = e1.real
-    mf.scf_summary['coul'] = ecoul.real
-    mf.scf_summary['exc'] = exc.real
-    mf.scf_summary['E_U'] = E_U.real
-    logger.debug(mf, 'E1 = %s  Ecoul = %s  Exc = %s  EU = %s', e1, ecoul, exc, E_U)
+    mf.scf_summary['coul'] = vhf.ecoul.real
+    mf.scf_summary['exc'] = vhf.exc.real
+    mf.scf_summary['E_U'] = vhf.E_U.real
     if abs(ecoul.imag) > mf.cell.precision*10:
         logger.warn(mf, "Coulomb energy has imaginary part %s. "
                     "Coulomb integrals (e-e, e-N) may not converge !",
                     ecoul.imag)
     return tot_e.real, e2.real
 
-def make_minao_lo(ks, minao_ref):
-    """
-    Construct minao local orbitals.
-    """
-    cell = ks.cell
-    nao = cell.nao
-    kpts = ks.kpts
-    nkpts = len(kpts)
-    ovlp = ks.get_ovlp()
-    C_ao_minao, labels = krkspu_cpu.proj_ref_ao(cell, minao=minao_ref, kpts=kpts,
-                                                return_labels=True)
-    for k in range(nkpts):
-        C_ao_minao[k] = vec_lowdin(C_ao_minao[k], ovlp[k].get())
-
-    C_ao_lo = np.zeros((nkpts, nao, nao), dtype=np.complex128)
-    for idx, lab in zip(ks.U_idx, ks.U_lab):
-        idx_minao = [i for i, l in enumerate(labels) if l in lab]
-        assert len(idx_minao) == len(idx)
-        C_ao_sub = C_ao_minao[:, :, idx_minao]
-        C_ao_lo[:, :, idx] = C_ao_sub
-    return C_ao_lo
+def _make_minao_lo(cell, minao_ref='minao', kpts=None):
+    '''
+    Construct orthogonal minao local orbitals.
+    '''
+    assert kpts is not None
+    if isinstance(minao_ref, str):
+        pcell = reference_mol(cell, minao_ref)
+    else:
+        pcell = minao_ref
+    ovlp = asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+    s12 = asarray(pgto.cell.intor_cross('int1e_ovlp', cell, pcell, kpts=kpts))
+    C_minao = []
+    for k, S_k in enumerate(ovlp):
+        C = cp.linalg.solve(S_k, s12[k])
+        S0 = C.conj().T.dot(S_k).dot(C)
+        w2, v = cp.linalg.eigh(S0)
+        C_minao.append(C.dot((v*cp.sqrt(1./w2)).dot(v.conj().T)))
+    return C_minao
 
 class KRKSpU(krks.KRKS):
     """
     RKSpU (DFT+U) class adapted for PBCs with k-point sampling.
     """
 
-    _keys = {"U_idx", "U_val", "C_ao_lo", "U_lab", 'alpha'}
+    _keys = {"U_idx", "U_val", "C_ao_lo", "U_lab", 'minao_ref', 'alpha'}
 
     get_veff = get_veff
     energy_elec = energy_elec
     to_hf = NotImplemented
 
     def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN',
-                 exxdiv='ewald', U_idx=[], U_val=[], C_ao_lo='minao',
-                 minao_ref='MINAO', **kwargs):
+                 exxdiv=getattr(__config__, 'pbc_scf_SCF_exxdiv', 'ewald'),
+                 U_idx=[], U_val=[], C_ao_lo=None, minao_ref='MINAO', **kwargs):
         """
         DFT+U args:
             U_idx: can be
-                   list of list: each sublist is a set of LO indices to add U.
+                   list of list: each sublist is a set indices for AO orbitals
+                                 (indcies corresponding to the large-basis-set mol).
                    list of string: each string is one kind of LO orbitals,
-                                   e.g. ['Ni 3d', '1 O 2pz'], in this case,
-                                   LO should be aranged as ao_labels order.
+                                   e.g. ['Ni 3d', '1 O 2pz'].
                    or a combination of these two.
             U_val: a list of effective U [in eV], i.e. U-J in Dudarev's DFT+U.
                    each U corresponds to one kind of LO orbitals, should have
                    the same length as U_idx.
             C_ao_lo: LO coefficients, can be
                      np.array, shape ((spin,), nkpts, nao, nlo),
-                     string, in 'minao'.
             minao_ref: reference for minao orbitals, default is 'MINAO'.
 
         Attributes:
@@ -214,18 +211,13 @@ def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN',
         """
         super(self.__class__, self).__init__(cell, kpts, xc=xc, exxdiv=exxdiv, **kwargs)
 
-        krkspu_cpu.set_U(self, U_idx, U_val)
-
+        self.U_idx = U_idx
+        self.U_val = U_val
         if isinstance(C_ao_lo, str):
-            if C_ao_lo.upper() == 'MINAO':
-                self.C_ao_lo = make_minao_lo(self, minao_ref)
-            else:
-                raise NotImplementedError
-        else:
-            self.C_ao_lo = C_ao_lo
-        if self.C_ao_lo.ndim == 4:
-            self.C_ao_lo = self.C_ao_lo[0]
-
+            assert C_ao_lo.upper() == 'MINAO'
+            C_ao_lo = None # API backward compatibility
+        self.C_ao_lo = C_ao_lo
+        self.minao_ref = minao_ref
         # The perturbation (eV) used to compute U in LR-cDFT.
         self.alpha = None
 
@@ -233,27 +225,97 @@ def dump_flags(self, verbose=None):
         super().dump_flags(verbose)
         log = logger.new_logger(self, verbose)
         if log.verbose >= logger.INFO:
+            from gpu4pyscf.dft.rkspu import _print_U_info
             _print_U_info(self, log)
         return self
 
     def Gradients(self):
-        raise NotImplementedError
+        from gpu4pyscf.pbc.grad.krkspu import Gradients
+        return Gradients(self)
 
-def _print_U_info(mf, log):
-    from pyscf.pbc.dft.krkspu import format_idx
-    alphas = mf.alpha
-    if not hasattr(alphas, '__len__'): # not a list or tuple
-        alphas = [alphas] * len(mf.U_idx)
-    log.info("-" * 79)
-    log.info('U indices and values: ')
-    for idx, val, lab, alpha in zip(mf.U_idx, mf.U_val, mf.U_lab, alphas):
-        log.info('%6s [%.6g eV] ==> %-100s', format_idx(idx),
-                    val * HARTREE2EV, "".join(lab))
-        if alpha is not None:
-            log.info('alpha for LR-cDFT %s (eV)', alpha*HARTREE2EV)
-    log.info("-" * 79)
+    def nuc_grad_method(self):
+        return self.Gradients()
 
 def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
-    # LR-cDFT for Hubbard U is only available fro pyscf>2.9
-    from pyscf.pbc.dft.krkspu import linear_response_u
-    return linear_response_u(mf_plus_u, alphalist)
+    '''
+    Refs:
+        [1] M. Cococcioni and S. de Gironcoli, Phys. Rev. B 71, 035105 (2005)
+        [2] H. J. Kulik, M. Cococcioni, D. A. Scherlis, and N. Marzari, Phys. Rev. Lett. 97, 103001 (2006)
+        [3] Heather J. Kulik, J. Chem. Phys. 142, 240901 (2015)
+        [4] https://hjkgrp.mit.edu/tutorials/2011-05-31-calculating-hubbard-u/
+        [5] https://hjkgrp.mit.edu/tutorials/2011-06-28-hubbard-u-multiple-sites/
+
+    Args:
+        alphalist :
+            alpha parameters (in eV) are the displacements for the linear
+            response calculations. For each alpha in this list, the DFT+U with
+            U=u0+alpha, U=u0-alpha are evaluated. u0 is the U value from the
+            reference mf_plus_u object, which will be treated as a standard DFT
+            functional.
+    '''
+    is_ibz = hasattr(mf_plus_u.kpts, "kpts_ibz")
+    if is_ibz:
+        raise NotImplementedError
+
+    assert isinstance(mf_plus_u, KRKSpU)
+    assert len(mf_plus_u.U_idx) > 0
+    if not mf_plus_u.converged:
+        mf_plus_u.run()
+    assert mf_plus_u.converged
+    # The bare density matrix without adding U
+    bare_dm = mf_plus_u.make_rdm1()
+
+    mf = mf_plus_u.copy()
+    log = logger.new_logger(mf)
+
+    alphalist = np.asarray(alphalist)
+    alphalist = np.append(-alphalist[::-1], alphalist)
+
+    kpts = mf.kpts.reshape(-1, 3)
+    nkpts = len(kpts)
+    cell = mf.cell
+
+    ovlp = asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+    pcell = reference_mol(cell, mf.minao_ref)
+    U_idx, U_val, U_lab = _set_U(cell, pcell, mf.U_idx, mf.U_val)
+    if mf.C_ao_lo is None:
+        C_ao_lo = _make_minao_lo(cell, pcell, kpts)
+    else:
+        C_ao_lo = mf.C_ao_lo
+    C_inv = [[C_k[:,local_idx].conj().T.dot(S_k) for C_k, S_k in zip(C_ao_lo, ovlp)]
+             for local_idx in U_idx]
+
+    bare_occupancies = []
+    final_occupancies = []
+    for alpha in alphalist:
+        # All in atomic unit
+        mf.alpha = alpha / HARTREE2EV
+        mf.kernel(dm0=bare_dm)
+        local_occ = 0
+        for c in C_inv:
+            C_on_site = [c[k].dot(mf.mo_coeff[k]) for k in range(nkpts)]
+            rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
+            local_occ += sum(x.trace().real for x in rdm1_lo)
+        local_occ /= nkpts
+        final_occupancies.append(local_occ)
+
+        # The first iteration of SCF
+        fock = mf.get_fock(dm=bare_dm)
+        e, mo = mf.eig(fock, ovlp)
+        local_occ = 0
+        for c in C_inv:
+            C_on_site = [c[k].dot(mo[k]) for k in range(nkpts)]
+            rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
+            local_occ += sum(x.trace().real for x in rdm1_lo)
+        local_occ /= nkpts
+        bare_occupancies.append(local_occ)
+        log.info('alpha=%f bare_occ=%g final_occ=%g',
+                 alpha, bare_occupancies[-1], final_occupancies[-1])
+
+    chi0, occ0 = np.polyfit(alphalist, bare_occupancies, deg=1)
+    chif, occf = np.polyfit(alphalist, final_occupancies, deg=1)
+    log.info('Line fitting chi0 = %f x + %f', chi0, occ0)
+    log.info('Line fitting chif = %f x + %f', chif, occf)
+    Uresp = 1./chi0 - 1./chif
+    log.note('Uresp = %f, chi0 = %f, chif = %f', Uresp, chi0, chif)
+    return Uresp
diff --git a/gpu4pyscf/pbc/dft/kukspu.py b/gpu4pyscf/pbc/dft/kukspu.py
index 946a719c..3c37adbf 100644
--- a/gpu4pyscf/pbc/dft/kukspu.py
+++ b/gpu4pyscf/pbc/dft/kukspu.py
@@ -21,10 +21,12 @@
 
 import numpy as np
 import cupy as cp
+from pyscf import __config__
+from pyscf.data.nist import HARTREE2EV
 from pyscf.pbc.dft import kukspu as kukspu_cpu
 from gpu4pyscf.lib import logger
 from gpu4pyscf.pbc.dft import kuks
-from gpu4pyscf.pbc.dft.krkspu import make_minao_lo
+from gpu4pyscf.pbc.dft.krkspu import _set_U, _make_minao_lo, reference_mol
 from gpu4pyscf.lib.cupy_helper import asarray, tag_array
 
 def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
@@ -45,68 +47,65 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
 def _add_Vhubbard(vxc, ks, dm, kpts):
     '''Add Hubbard U to Vxc matrix inplace.
     '''
-    C_ao_lo = asarray(ks.C_ao_lo)
-    ovlp = ks.get_ovlp()
-    nkpts = len(kpts)
-    nlo = C_ao_lo.shape[-1]
-
-    rdm1_lo  = cp.zeros((2, nkpts, nlo, nlo), dtype=np.complex128)
-    for s in range(2):
-        for k in range(nkpts):
-            C_inv = C_ao_lo[s, k].conj().T.dot(ovlp[k])
-            rdm1_lo[s, k] = C_inv.dot(dm[s][k]).dot(C_inv.conj().T)
+    cell = ks.cell
+    pcell = reference_mol(cell, ks.minao_ref)
 
     is_ibz = hasattr(kpts, "kpts_ibz")
+    kpts_input = kpts
     if is_ibz:
-        rdm1_lo_0 = kpts.dm_at_ref_cell(rdm1_lo)
+        raise NotImplementedError('DFT+U for k-point symmetry')
+    kpts = kpts.reshape(-1, 3)
+    nkpts = len(kpts)
+
+    ovlp = asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+    U_idx, U_val, U_lab = _set_U(cell, pcell, ks.U_idx, ks.U_val)
+    assert ks.C_ao_lo is None
+    C_ao_lo = _make_minao_lo(cell, pcell, kpts)
 
     alphas = ks.alpha
     if not hasattr(alphas, '__len__'): # not a list or tuple
-        alphas = [alphas] * len(ks.U_idx)
+        alphas = [alphas] * len(U_idx)
 
     E_U = 0.0
-    weight = getattr(kpts, "weights_ibz", np.repeat(1.0/nkpts, nkpts))
+    weight = getattr(kpts_input, "weights_ibz", np.repeat(1.0/nkpts, nkpts))
     logger.info(ks, "-" * 79)
+    lab_string = " "
     with np.printoptions(precision=5, suppress=True, linewidth=1000):
-        for idx, val, lab, alpha in zip(ks.U_idx, ks.U_val, ks.U_lab, alphas):
-            lab_string = " "
-            for l in lab:
-                lab_string += "%9s" %(l.split()[-1])
-            lab_sp = lab[0].split()
-            logger.info(ks, "local rdm1 of atom %s: ",
-                        " ".join(lab_sp[:2]) + " " + lab_sp[2][:2])
-            U_mesh = np.ix_(idx, idx)
+        for idx, val, lab, alpha in zip(U_idx, U_val, U_lab, alphas):
+            if ks.verbose >= logger.INFO:
+                lab_string = " "
+                for l in lab:
+                    lab_string += "%9s" %(l.split()[-1])
+                lab_sp = lab[0].split()
+                logger.info(ks, "local rdm1 of atom %s: ",
+                            " ".join(lab_sp[:2]) + " " + lab_sp[2][:2])
             for s in range(2):
-                P_loc = 0.0
+                P_loc = []
                 for k in range(nkpts):
-                    S_k = ovlp[k]
-                    C_k = C_ao_lo[s, k][:, idx]
-                    P_k = rdm1_lo[s, k][U_mesh]
-                    E_U += weight[k] * (val * 0.5) * (P_k.trace() - cp.dot(P_k, P_k).trace())
+                    C_loc = C_ao_lo[k][:,idx]
+                    SC = ovlp[k].dot(C_loc) # ~ C^{-1}
+                    P_k = SC.conj().T.dot(dm[s][k]).dot(SC)
+                    E_U += weight[k] * (val * 0.5) * (P_k.trace() - P_k.dot(P_k).trace())
                     vhub_loc = (cp.eye(P_k.shape[-1]) - P_k * 2.0) * (val * 0.5)
                     if alpha is not None:
                         # The alpha perturbation is only applied to the linear term of
                         # the local density.
                         E_U += weight[k] * alpha * P_k.trace()
                         vhub_loc += cp.eye(P_k.shape[-1]) * alpha
-                    SC = cp.dot(S_k, C_k)
                     vhub_loc = SC.dot(vhub_loc).dot(SC.conj().T)
-                    if vxc.dtype == np.float64:
+                    if vxc[s,k].dtype == np.float64:
                         vhub_loc = vhub_loc.real
                     vxc[s,k] += vhub_loc
-                    if not is_ibz:
-                        P_loc += P_k
-                if is_ibz:
-                    P_loc = rdm1_lo_0[s][U_mesh].real
-                else:
-                    P_loc = P_loc.real / nkpts
-                logger.info(ks, "spin %s\n%s\n%s", s, lab_string, P_loc)
+                    P_loc.append(P_k)
+                if ks.verbose >= logger.INFO:
+                    P_loc = sum(P_loc).real / nkpts
+                    logger.info(ks, "spin %s\n%s\n%s", s, lab_string, P_loc)
             logger.info(ks, "-" * 79)
 
-    E_U = E_U.get()[()]
-    if E_U.real < 0.0 and all(np.asarray(ks.U_val) > 0):
-        logger.warn(ks, "E_U (%g) is negative...", E_U.real)
-    vxc = tag_array(vxc, E_U=E_U.real)
+    E_U = E_U.real.get()[()]
+    if E_U < 0.0 and all(np.asarray(U_val) > 0):
+        logger.warn(ks, "E_U (%s) is negative...", E_U)
+    vxc.E_U = E_U
     return vxc
 
 def energy_elec(mf, dm_kpts=None, h1e_kpts=None, vhf=None):
@@ -118,33 +117,34 @@ def energy_elec(mf, dm_kpts=None, h1e_kpts=None, vhf=None):
     if vhf is None or getattr(vhf, 'ecoul', None) is None:
         vhf = mf.get_veff(mf.cell, dm_kpts)
 
-    weight = getattr(mf.kpts, "weights_ibz",
-                     np.array([1.0/len(h1e_kpts),]*len(h1e_kpts)))
-    e1 = cp.einsum('k,kij,nkji->', weight, h1e_kpts, dm_kpts).get()[()]
-    tot_e = e1 + vhf.ecoul + vhf.exc + vhf.E_U
+    if hasattr(mf.kpts, "weights_ibz"):
+        raise NotImplementedError('DFT+U for k-point symmetry')
+    nkpts = len(h1e_kpts)
+    e1 = cp.einsum('kij,nkji->', h1e_kpts, dm_kpts).get()[()] / nkpts
+    e2 = vhf.ecoul + vhf.exc + vhf.E_U
+    tot_e = e1 + e2
     mf.scf_summary['e1'] = e1.real
     mf.scf_summary['coul'] = vhf.ecoul.real
     mf.scf_summary['exc'] = vhf.exc.real
     mf.scf_summary['E_U'] = vhf.E_U.real
-
     logger.debug(mf, 'E1 = %s  Ecoul = %s  Exc = %s  EU = %s',
                  e1, vhf.ecoul, vhf.exc, vhf.E_U)
-    return tot_e.real, (vhf.ecoul + vhf.exc + vhf.E_U).real
+    return tot_e.real, e2
 
 class KUKSpU(kuks.KUKS):
     """
     UKSpU class adapted for PBCs with k-point sampling.
     """
 
-    _keys = {"U_idx", "U_val", "C_ao_lo", "U_lab", 'alpha'}
+    _keys = {"U_idx", "U_val", "C_ao_lo", "U_lab", 'minao_ref', 'alpha'}
 
     get_veff = get_veff
     energy_elec = energy_elec
     to_hf = NotImplemented
 
     def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN',
-                 exxdiv='ewald', U_idx=[], U_val=[], C_ao_lo='minao',
-                 minao_ref='MINAO', **kwargs):
+                 exxdiv=getattr(__config__, 'pbc_scf_SCF_exxdiv', 'ewald'),
+                 U_idx=[], U_val=[], C_ao_lo=None, minao_ref='MINAO', **kwargs):
         """
         DFT+U args:
             U_idx: can be
@@ -158,7 +158,6 @@ def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN',
                    the same length as U_idx.
             C_ao_lo: LO coefficients, can be
                      np.array, shape ((spin,), nkpts, nao, nlo),
-                     string, in 'minao'.
             minao_ref: reference for minao orbitals, default is 'MINAO'.
 
         Attributes:
@@ -170,24 +169,13 @@ def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN',
         """
         super(self.__class__, self).__init__(cell, kpts, xc=xc, exxdiv=exxdiv, **kwargs)
 
-        kukspu_cpu.set_U(self, U_idx, U_val)
-
+        self.U_idx = U_idx
+        self.U_val = U_val
         if isinstance(C_ao_lo, str):
-            if C_ao_lo.upper() == 'MINAO':
-                self.C_ao_lo = make_minao_lo(self, minao_ref)
-            else:
-                raise NotImplementedError
-        else:
-            self.C_ao_lo = np.asarray(C_ao_lo)
-        if self.C_ao_lo.ndim == 3:
-            self.C_ao_lo = np.asarray((self.C_ao_lo, self.C_ao_lo))
-        elif self.C_ao_lo.ndim == 4:
-            if self.C_ao_lo.shape[0] == 1:
-                self.C_ao_lo = np.asarray((self.C_ao_lo[0], self.C_ao_lo[0]))
-            assert self.C_ao_lo.shape[0] == 2
-        else:
-            raise ValueError
-
+            assert C_ao_lo.upper() == 'MINAO'
+            C_ao_lo = None # API backward compatibility
+        self.C_ao_lo = C_ao_lo
+        self.minao_ref = minao_ref
         # The perturbation (eV) used to compute U in LR-cDFT.
         self.alpha = None
 
@@ -195,14 +183,97 @@ def dump_flags(self, verbose=None):
         super().dump_flags(verbose)
         log = logger.new_logger(self, verbose)
         if log.verbose >= logger.INFO:
-            from gpu4pyscf.pbc.dft.krkspu import _print_U_info
+            from gpu4pyscf.dft.krkspu import _print_U_info
             _print_U_info(self, log)
         return self
 
     def Gradients(self):
-        raise NotImplementedError
+        from gpu4pyscf.pbc.grad.kukspu import Gradients
+        return Gradients(self)
+
+    def nuc_grad_method(self):
+        return self.Gradients()
 
 def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
-    # LR-cDFT for Hubbard U is only available fro pyscf>2.9
-    from pyscf.pbc.dft.kukspu import linear_response_u
-    return linear_response_u(mf_plus_u, alphalist)
+    '''
+    Refs:
+        [1] M. Cococcioni and S. de Gironcoli, Phys. Rev. B 71, 035105 (2005)
+        [2] H. J. Kulik, M. Cococcioni, D. A. Scherlis, and N. Marzari, Phys. Rev. Lett. 97, 103001 (2006)
+        [3] Heather J. Kulik, J. Chem. Phys. 142, 240901 (2015)
+        [4] https://hjkgrp.mit.edu/tutorials/2011-05-31-calculating-hubbard-u/
+        [5] https://hjkgrp.mit.edu/tutorials/2011-06-28-hubbard-u-multiple-sites/
+
+    Args:
+        alphalist :
+            alpha parameters (in eV) are the displacements for the linear
+            response calculations. For each alpha in this list, the DFT+U with
+            U=u0+alpha, U=u0-alpha are evaluated. u0 is the U value from the
+            reference mf_plus_u object, which will be treated as a standard DFT
+            functional.
+    '''
+    is_ibz = hasattr(mf_plus_u.kpts, "kpts_ibz")
+    if is_ibz:
+        raise NotImplementedError
+
+    assert isinstance(mf_plus_u, KUKSpU)
+    assert len(mf_plus_u.U_idx) > 0
+    if not mf_plus_u.converged:
+        mf_plus_u.run()
+    assert mf_plus_u.converged
+    # The bare density matrix without adding U
+    bare_dm = mf_plus_u.make_rdm1()
+
+    mf = mf_plus_u.copy()
+    log = logger.new_logger(mf)
+
+    alphalist = np.asarray(alphalist)
+    alphalist = np.append(-alphalist[::-1], alphalist)
+
+    kpts = mf.kpts.reshape(-1, 3)
+    nkpts = len(kpts)
+    cell = mf.cell
+
+    ovlp = asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+    pcell = reference_mol(cell, mf.minao_ref)
+    U_idx, U_val, U_lab = _set_U(cell, pcell, mf.U_idx, mf.U_val)
+    C_ao_lo = _make_minao_lo(cell, pcell, kpts)
+    C_inv = []
+    for local_idx in U_idx:
+        C_inv.append([C_k[:,local_idx].conj().T.dot(S_k) for C_k, S_k in zip(C_ao_lo, ovlp)])
+
+    bare_occupancies = []
+    final_occupancies = []
+    for alpha in alphalist:
+        mf.alpha = alpha / HARTREE2EV
+        mf.kernel(dm0=bare_dm)
+        local_occ = 0
+        for c in C_inv:
+            C_on_site = [[c[k].dot(mf.mo_coeff[0][k]) for k in range(nkpts)],
+                         [c[k].dot(mf.mo_coeff[1][k]) for k in range(nkpts)]]
+            rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
+            local_occ += sum(x.trace().real for x in rdm1_lo[0])
+            local_occ += sum(x.trace().real for x in rdm1_lo[1])
+        local_occ /= nkpts
+        final_occupancies.append(local_occ)
+
+        # The first iteration of SCF
+        fock = mf.get_fock(dm=bare_dm)
+        e, mo = mf.eig(fock, ovlp)
+        for c in C_inv:
+            C_on_site = [[c[k].dot(mf.mo_coeff[0][k]) for k in range(nkpts)],
+                         [c[k].dot(mf.mo_coeff[1][k]) for k in range(nkpts)]]
+            rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
+            local_occ += sum(x.trace().real for x in rdm1_lo[0])
+            local_occ += sum(x.trace().real for x in rdm1_lo[1])
+        local_occ /= nkpts
+        bare_occupancies.append(local_occ)
+        log.info('alpha=%f bare_occ=%g final_occ=%g',
+                 alpha, bare_occupancies[-1], final_occupancies[-1])
+
+    chi0, occ0 = np.polyfit(alphalist, bare_occupancies, deg=1)
+    chif, occf = np.polyfit(alphalist, final_occupancies, deg=1)
+    log.info('Line fitting chi0 = %f x + %f', chi0, occ0)
+    log.info('Line fitting chif = %f x + %f', chif, occf)
+    Uresp = 1./chi0 - 1./chif
+    log.note('Uresp = %f, chi0 = %f, chif = %f', Uresp, chi0, chif)
+    return Uresp
diff --git a/gpu4pyscf/pbc/dft/tests/test_dftu.py b/gpu4pyscf/pbc/dft/tests/test_pbc_dftu.py
similarity index 58%
rename from gpu4pyscf/pbc/dft/tests/test_dftu.py
rename to gpu4pyscf/pbc/dft/tests/test_pbc_dftu.py
index 0c076a27..6e2b90bb 100644
--- a/gpu4pyscf/pbc/dft/tests/test_dftu.py
+++ b/gpu4pyscf/pbc/dft/tests/test_pbc_dftu.py
@@ -75,6 +75,62 @@ def test_get_veff(self):
         self.assertAlmostEqual(vxc.E_U, 0.07587726255165786, 11)
         self.assertAlmostEqual(lib.fp(vxc.get()), 12.77643098220399, 8)
 
+    def test_KRKSpU_linear_response(self):
+        cell = pgto.Cell()
+        cell.unit = 'A'
+        cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
+        cell.a = '''0.      1.7834  1.7834
+                    1.7834  0.      1.7834
+                    1.7834  1.7834  0.    '''
+
+        cell.basis = 'gth-szv'
+        cell.pseudo = 'gth-pade'
+        cell.verbose = 7
+        cell.output = '/dev/null'
+        cell.mesh = [29]*3
+        cell.build()
+        kmesh = [2, 1, 1]
+        kpts = cell.make_kpts(kmesh, wrap_around=True)
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        mf = krkspu.KRKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
+                minao_ref='gth-szv')
+        mf.conv_tol = 1e-10
+        e_tot = mf.kernel()
+        self.assertAlmostEqual(e_tot, -10.6191452297714, 8)
+
+        uresp = krkspu.linear_response_u(mf, (0.03, 0.08))
+        self.assertAlmostEqual(uresp, 6.279179, 2)
+
+    def test_KUKSpU_linear_response(self):
+        cell = pgto.Cell()
+        cell.unit = 'A'
+        cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
+        cell.a = '''0.      1.7834  1.7834
+                    1.7834  0.      1.7834
+                    1.7834  1.7834  0.    '''
+
+        cell.basis = 'gth-szv'
+        cell.pseudo = 'gth-pade'
+        cell.verbose = 7
+        cell.output = '/dev/null'
+        cell.mesh = [29]*3
+        cell.build()
+        kmesh = [2, 1, 1]
+        kpts = cell.make_kpts(kmesh, wrap_around=True)
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        mf = kukspu.KUKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
+                minao_ref='gth-szv')
+        mf.conv_tol = 1e-10
+        e_tot = mf.kernel()
+        self.assertAlmostEqual(e_tot, -10.6191452297714, 8)
+
+        uresp = kukspu.linear_response_u(mf, (0.03, 0.08))
+        self.assertAlmostEqual(uresp, 6.279179, 2)
+
 if __name__ == '__main__':
     print("Full Tests for pbc.dft dft+U")
     unittest.main()

From 30c5ecf6f794961cd596e41beccb23b4a304e95e Mon Sep 17 00:00:00 2001
From: Qiming Sun <qiming.sun@bytedance.com>
Date: Thu, 17 Jul 2025 10:07:48 +0800
Subject: [PATCH 3/9] Analytical gradients for KDFT+U

---
 gpu4pyscf/grad/rhf.py                         |   7 +-
 gpu4pyscf/grad/rkspu.py                       |   2 +-
 gpu4pyscf/grad/tests/test_grad_rkspu.py       |   2 +-
 gpu4pyscf/grad/tests/test_grad_ukspu.py       |   2 +-
 gpu4pyscf/grad/uhf.py                         |   8 +-
 gpu4pyscf/grad/ukspu.py                       |   2 +-
 gpu4pyscf/lib/cupy_helper.py                  |   5 +
 gpu4pyscf/pbc/grad/krhf.py                    |   4 +-
 gpu4pyscf/pbc/grad/krkspu.py                  | 140 ++++++++++++++++++
 gpu4pyscf/pbc/grad/kuhf.py                    |   4 +-
 gpu4pyscf/pbc/grad/kukspu.py                  |  85 +++++++++++
 .../pbc/grad/tests/test_pbc_grad_krkspu.py    |  96 ++++++++++++
 .../pbc/grad/tests/test_pbc_grad_kukspu.py    |  76 ++++++++++
 13 files changed, 418 insertions(+), 15 deletions(-)
 create mode 100644 gpu4pyscf/pbc/grad/krkspu.py
 create mode 100644 gpu4pyscf/pbc/grad/kukspu.py
 create mode 100644 gpu4pyscf/pbc/grad/tests/test_pbc_grad_krkspu.py
 create mode 100644 gpu4pyscf/pbc/grad/tests/test_pbc_grad_kukspu.py

diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index 41d75389..eb632310 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -26,7 +26,8 @@
 from gpu4pyscf.gto.ecp import get_ecp_ip
 from gpu4pyscf.lib import utils
 from gpu4pyscf.scf.hf import KohnShamDFT
-from gpu4pyscf.lib.cupy_helper import tag_array, contract, condense, reduce_to_device, transpose_sum
+from gpu4pyscf.lib.cupy_helper import (
+    tag_array, contract, condense, reduce_to_device, transpose_sum, ensure_numpy)
 from gpu4pyscf.__config__ import props as gpu_specs
 from gpu4pyscf.__config__ import _streams, num_devices
 from gpu4pyscf.df import int3c2e      #TODO: move int3c2e to out of df
@@ -293,9 +294,9 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     log.debug('Computing Gradients of NR-HF Coulomb repulsion')
 
     dm0 = tag_array(dm0, mo_coeff=mo_coeff, mo_occ=mo_occ)
-    extra_force = cupy.zeros((len(atmlst),3))
+    extra_force = np.zeros((len(atmlst),3))
     for k, ia in enumerate(atmlst):
-        extra_force[k] += cupy.asarray(mf_grad.extra_force(ia, locals()))
+        extra_force[k] += ensure_numpy(mf_grad.extra_force(ia, locals()))
 
     log.timer_debug1('gradients of 2e part', *t3)
 
diff --git a/gpu4pyscf/grad/rkspu.py b/gpu4pyscf/grad/rkspu.py
index 07dec9c9..fb822227 100644
--- a/gpu4pyscf/grad/rkspu.py
+++ b/gpu4pyscf/grad/rkspu.py
@@ -118,7 +118,7 @@ def _hubbard_U_deriv1(mf, dm=None):
             dE_U[atm_id] += (val * 0.5) * (
                 cp.einsum('xii->x', P1).real * 2 # *2 for P1+P1.T
                 - cp.einsum('xij,ji->x', P1, P0).real * 2)
-    return dE_U
+    return dE_U.get()
 
 class Gradients(rks_grad.Gradients):
     def get_veff(self, mol=None, dm=None):
diff --git a/gpu4pyscf/grad/tests/test_grad_rkspu.py b/gpu4pyscf/grad/tests/test_grad_rkspu.py
index b3a14303..df85149f 100644
--- a/gpu4pyscf/grad/tests/test_grad_rkspu.py
+++ b/gpu4pyscf/grad/tests/test_grad_rkspu.py
@@ -43,7 +43,7 @@ def test_finite_diff_hubbard_U_grad(self):
         U_val = [5.0]
         mf = rkspu.RKSpU(mol, U_idx=U_idx, U_val=U_val)
         mf.__dict__.update(mol.RHF().to_gpu().run().__dict__)
-        de = rkspu_grad._hubbard_U_deriv1(mf).get()
+        de = rkspu_grad._hubbard_U_deriv1(mf)
 
         mf.mol.set_geom_('C 0 1.6 0; O 0 0 1.001', unit='B')
         e1 = mf.get_veff().E_U
diff --git a/gpu4pyscf/grad/tests/test_grad_ukspu.py b/gpu4pyscf/grad/tests/test_grad_ukspu.py
index fa1f4c85..947ad41a 100644
--- a/gpu4pyscf/grad/tests/test_grad_ukspu.py
+++ b/gpu4pyscf/grad/tests/test_grad_ukspu.py
@@ -26,7 +26,7 @@ def test_finite_diff_hubbard_U_grad(self):
         U_val = [5.0]
         mf = ukspu.UKSpU(mol, U_idx=U_idx, U_val=U_val)
         mf.__dict__.update(mol.UHF().to_gpu().run().__dict__)
-        de = ukspu_grad._hubbard_U_deriv1(mf).get()
+        de = ukspu_grad._hubbard_U_deriv1(mf)
 
         mf.mol.set_geom_('C 0 1.6 0; O 0 0 1.001', unit='B')
         e1 = mf.get_veff().E_U
diff --git a/gpu4pyscf/grad/uhf.py b/gpu4pyscf/grad/uhf.py
index d756cbca..50cf0548 100644
--- a/gpu4pyscf/grad/uhf.py
+++ b/gpu4pyscf/grad/uhf.py
@@ -22,7 +22,7 @@
 from pyscf.grad import rhf as rhf_grad_cpu
 from gpu4pyscf.gto.ecp import get_ecp_ip
 from gpu4pyscf.lib import utils
-from gpu4pyscf.lib.cupy_helper import tag_array, contract
+from gpu4pyscf.lib.cupy_helper import tag_array, contract, ensure_numpy
 from gpu4pyscf.df import int3c2e      #TODO: move int3c2e to out of df
 from gpu4pyscf.lib import logger
 from gpu4pyscf.grad import rhf as rhf_grad
@@ -83,10 +83,10 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     t1 = log.timer_debug1('gradients of h1e', *t1)
     log.debug('Computing Gradients of NR-HF Coulomb repulsion')
     dvhf = mf_grad.get_veff(mol, dm0)
-    
-    extra_force = cupy.zeros((len(atmlst),3))
+
+    extra_force = np.zeros((len(atmlst),3))
     for k, ia in enumerate(atmlst):
-        extra_force[k] += mf_grad.extra_force(ia, locals())
+        extra_force[k] += ensure_numpy(mf_grad.extra_force(ia, locals()))
     log.timer_debug1('gradients of 2e part', *t1)
 
     dh = contract('xij,ij->xi', h1, dm0_sf)
diff --git a/gpu4pyscf/grad/ukspu.py b/gpu4pyscf/grad/ukspu.py
index de7b6395..7d67787d 100644
--- a/gpu4pyscf/grad/ukspu.py
+++ b/gpu4pyscf/grad/ukspu.py
@@ -65,7 +65,7 @@ def _hubbard_U_deriv1(mf, dm=None):
                 dE_U[atm_id] += (val * 0.5) * (
                     cp.einsum('xii->x', P1).real * 2 # *2 for P1+P1.T
                     - cp.einsum('xij,ji->x', P1, P0).real * 4)
-    return dE_U
+    return dE_U.get()
 
 class Gradients(uks_grad.Gradients):
     def get_veff(self, mol=None, dm=None):
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index c47ccc94..4724aa46 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -203,6 +203,11 @@ def asarray(a, **kwargs):
 
     return cupy.asarray(a, **kwargs)
 
+def ensure_numpy(a):
+    if isinstance(a, cupy.ndarray):
+        a = a.get()
+    return np.asarray(a)
+
 def to_cupy(a):
     '''Converts a numpy (and subclass) object to a cupy object'''
     if isinstance(a, lib.NPArrayWithTag):
diff --git a/gpu4pyscf/pbc/grad/krhf.py b/gpu4pyscf/pbc/grad/krhf.py
index 9f82975c..57ca116b 100644
--- a/gpu4pyscf/pbc/grad/krhf.py
+++ b/gpu4pyscf/pbc/grad/krhf.py
@@ -30,7 +30,7 @@
 from gpu4pyscf.pbc.df.aft import _check_kpts
 from gpu4pyscf.pbc.df import ft_ao
 from gpu4pyscf.pbc import tools
-from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.lib.cupy_helper import contract, ensure_numpy
 
 __all__ = ['Gradients']
 
@@ -68,7 +68,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None):
     for ia in range(natm):
         h1ao = hcore_deriv(ia)
         dh1e[ia] = cp.einsum('kxij,kji->x', h1ao, dm0).real
-        extra_force[ia] = mf_grad.extra_force(ia, locals())
+        extra_force[ia] = ensure_numpy(mf_grad.extra_force(ia, locals()))
     log.timer('gradients of 1e part', *t1)
 
     # nabla is applied on bra in vhf. *2 for the contributions of nabla|ket>
diff --git a/gpu4pyscf/pbc/grad/krkspu.py b/gpu4pyscf/pbc/grad/krkspu.py
new file mode 100644
index 00000000..bc5995cc
--- /dev/null
+++ b/gpu4pyscf/pbc/grad/krkspu.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Analytical derivatives for DFT+U with kpoints sampling
+'''
+
+import numpy as np
+import cupy as cp
+from pyscf.pbc import gto
+from gpu4pyscf.pbc.grad import krks as krks_grad
+from gpu4pyscf.pbc.dft.krkspu import _set_U, _make_minao_lo, reference_mol
+from gpu4pyscf.lib.cupy_helper import asarray, contract
+
+def generate_first_order_local_orbitals(cell, minao_ref='MINAO', kpts=None):
+    kpts = kpts.reshape(-1, 3)
+    nkpts = len(kpts)
+    if isinstance(minao_ref, str):
+        pcell = reference_mol(cell, minao_ref)
+    else:
+        pcell = minao_ref
+    sAA = asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+    sAB = asarray(gto.intor_cross('int1e_ovlp', cell, pcell, kpts=kpts))
+
+    C0_minao = []
+    wv_ks = []
+    S0_lowdin = []
+    for k in range(nkpts):
+        C0_minao.append(cp.linalg.solve(sAA[k], sAB[k]))
+
+        # Lowdin orthogonalization coefficients = S^{-1/2}
+        S0 = sAB[k].conj().T.dot(C0_minao[k])
+        w2, v = cp.linalg.eigh(S0)
+        w = np.sqrt(w2)
+        wv_ks.append((w, v))
+        S0_lowdin.append((v/w).dot(v.conj().T))
+
+    sAA_ip1 = asarray(cell.pbc_intor('int1e_ipovlp', kpts=kpts))
+    sAB_ip1 = asarray(gto.intor_cross('int1e_ipovlp', cell, pcell, kpts=kpts))
+    sBA_ip1 = asarray(gto.intor_cross('int1e_ipovlp', pcell, cell, kpts=kpts))
+
+    nao, n_minao = C0_minao[0].shape
+    ao_slice = cell.aoslice_by_atom()
+    minao_slice = pcell.aoslice_by_atom()
+    dtype = np.result_type(*C0_minao)
+
+    def make_coeff(atm_id):
+        p0, p1 = ao_slice[atm_id,2:]
+        q0, q1 = minao_slice[atm_id,2:]
+        C1 = cp.empty((nkpts, 3, nao, n_minao), dtype=dtype)
+        for k in range(nkpts):
+            w, v = wv_ks[k]
+            for n in range(3):
+                sAA1 = cp.zeros((nao, nao), dtype=dtype)
+                sAA1[p0:p1,:] -= sAA_ip1[k][n,p0:p1]
+                sAA1[:,p0:p1] -= sAA_ip1[k][n,p0:p1].conj().T
+                sAB1 = cp.zeros((nao, n_minao), dtype=dtype)
+                sAB1[p0:p1,:] -= sAB_ip1[k][n,p0:p1]
+                sAB1[:,q0:q1] -= sBA_ip1[k][n,q0:q1].conj().T
+
+                S1 = C0_minao[k].conj().T.dot(sAB1)
+                S1 = S1 + S1.conj().T
+                S1 -= C0_minao[k].conj().T.dot(sAA1).dot(C0_minao[k])
+                S1 = v.conj().T.dot(-S1).dot(v)
+                S1 /= (w[:,None] + w)
+                vw = v / w
+                S1_lowdin = vw.dot(S1).dot(vw.conj().T)
+
+                C1_minao = cp.linalg.solve(sAA[k], sAB1 - sAA1.dot(C0_minao[k]))
+                C1[k,n] = C1_minao.dot(S0_lowdin[k])
+                C1[k,n] += C0_minao[k].dot(S1_lowdin)
+        return C1
+    return make_coeff
+
+def _hubbard_U_deriv1(mf, dm=None, kpts=None):
+    assert mf.alpha is None
+    assert mf.C_ao_lo is None
+    assert mf.minao_ref is not None
+    if dm is None:
+        dm = mf.make_rdm1()
+    if kpts is None:
+        kpts = mf.kpts.reshape(-1, 3)
+    nkpts = len(kpts)
+    cell = mf.cell
+
+    # Construct orthogonal minao local orbitals.
+    pcell = reference_mol(cell, mf.minao_ref)
+    C_ao_lo = _make_minao_lo(cell, pcell, kpts=kpts)
+    U_idx, U_val = _set_U(cell, pcell, mf.U_idx, mf.U_val)[:2]
+    U_idx_stack = np.hstack(U_idx)
+    C0 = [C_k[:,U_idx_stack] for C_k in C_ao_lo]
+
+    ovlp0 = asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+    ovlp1 = asarray(cell.pbc_intor('int1e_ipovlp', kpts=kpts))
+    C_inv = [C_k.conj().T.dot(S_k) for C_k, S_k in zip(C0, ovlp0)]
+    dm_deriv0 = [C_k.dot(dm_k).dot(C_k.conj().T) for C_k, dm_k in zip(C_inv, dm)]
+    f_local_ao = generate_first_order_local_orbitals(cell, pcell, kpts)
+
+    ao_slices = cell.aoslice_by_atom()
+    natm = cell.natm
+    dE_U = cp.zeros((natm, 3))
+    weight = 1. / nkpts
+    for atm_id, (p0, p1) in enumerate(ao_slices[:,2:]):
+        C1 = f_local_ao(atm_id)
+        for k in range(nkpts):
+            C1_k = C1[k][:,:,U_idx_stack]
+            SC1 = contract('pq,xqi->xpi', ovlp0[k], C1_k)
+            SC1 -= contract('xqp,qi->xpi', ovlp1[k][:,p0:p1].conj(), C0[k][p0:p1])
+            SC1[:,p0:p1] -= contract('xpq,qi->xpi', ovlp1[k][:,p0:p1], C0[k])
+            dm_deriv1 = contract('pj,xjq->xpq', C_inv[k].dot(dm[k]), SC1)
+            i0 = i1 = 0
+            for idx, val in zip(U_idx, U_val):
+                i0, i1 = i1, i1 + len(idx)
+                P0 = dm_deriv0[k][i0:i1,i0:i1]
+                P1 = dm_deriv1[:,i0:i1,i0:i1]
+                dE_U[atm_id] += weight * (val * 0.5) * (
+                    cp.einsum('xii->x', P1).real * 2 # *2 for P1+P1.T
+                    - cp.einsum('xij,ji->x', P1, P0).real * 2)
+    return dE_U.get()
+
+class Gradients(krks_grad.Gradients):
+    def get_veff(self, dm=None, kpts=None):
+        self._dE_U = _hubbard_U_deriv1(self.base, dm, kpts)
+        return krks_grad.get_veff(self, dm, kpts)
+
+    def extra_force(self, atom_id, envs):
+        val = super().extra_force(atom_id, envs)
+        return self._dE_U[atom_id] + val
diff --git a/gpu4pyscf/pbc/grad/kuhf.py b/gpu4pyscf/pbc/grad/kuhf.py
index cadca42f..380843f9 100644
--- a/gpu4pyscf/pbc/grad/kuhf.py
+++ b/gpu4pyscf/pbc/grad/kuhf.py
@@ -22,7 +22,7 @@
 from pyscf import lib
 from gpu4pyscf.lib import logger
 from gpu4pyscf.pbc.grad import krhf as krhf_grad
-from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.lib.cupy_helper import contract, ensure_numpy
 
 __all__ = ['Gradients']
 
@@ -57,7 +57,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     for ia in range(natm):
         h1ao = hcore_deriv(ia)
         dh1e[ia] = cp.einsum('kxij,kji->x', h1ao, dm0_sf).real
-        extra_force[ia] = mf_grad.extra_force(ia, locals())
+        extra_force[ia] = ensure_numpy(mf_grad.extra_force(ia, locals()))
     log.timer('gradients of 1e part', *t1)
 
     # nabla is applied on bra in vhf. *2 for the contributions of nabla|ket>
diff --git a/gpu4pyscf/pbc/grad/kukspu.py b/gpu4pyscf/pbc/grad/kukspu.py
new file mode 100644
index 00000000..13e27f6f
--- /dev/null
+++ b/gpu4pyscf/pbc/grad/kukspu.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Analytical derivatives for DFT+U with kpoints sampling
+'''
+
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from gpu4pyscf.pbc.dft.krkspu import _set_U, _make_minao_lo, reference_mol
+from gpu4pyscf.pbc.grad import kuks as kuks_grad
+from gpu4pyscf.pbc.grad.krkspu import generate_first_order_local_orbitals
+from gpu4pyscf.lib.cupy_helper import asarray, contract
+
+def _hubbard_U_deriv1(mf, dm=None, kpts=None):
+    assert mf.alpha is None
+    assert mf.C_ao_lo is None
+    assert mf.minao_ref is not None
+    if dm is None:
+        dm = mf.make_rdm1()
+    if kpts is None:
+        kpts = mf.kpts.reshape(-1, 3)
+    nkpts = len(kpts)
+    cell = mf.cell
+
+    # Construct orthogonal minao local orbitals.
+    pcell = reference_mol(cell, mf.minao_ref)
+    C_ao_lo = _make_minao_lo(cell, pcell, kpts=kpts)
+    U_idx, U_val = _set_U(cell, pcell, mf.U_idx, mf.U_val)[:2]
+    U_idx_stack = np.hstack(U_idx)
+    C0 = [C_k[:,U_idx_stack] for C_k in C_ao_lo]
+
+    ovlp0 = asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+    ovlp1 = asarray(cell.pbc_intor('int1e_ipovlp', kpts=kpts))
+    C_inv = [C_k.conj().T.dot(S_k) for C_k, S_k in zip(C0, ovlp0)]
+    dm_deriv0 = [
+        [C_k.dot(dm_k).dot(C_k.conj().T) for C_k, dm_k in zip(C_inv, dm_s)]
+        for dm_s in dm
+    ]
+    f_local_ao = generate_first_order_local_orbitals(cell, pcell, kpts)
+
+    ao_slices = cell.aoslice_by_atom()
+    natm = cell.natm
+    dE_U = cp.zeros((natm, 3))
+    weight = 1. / nkpts
+    for atm_id, (p0, p1) in enumerate(ao_slices[:,2:]):
+        C1 = f_local_ao(atm_id)
+        for k in range(nkpts):
+            C1_k = C1[k][:,:,U_idx_stack]
+            SC1 = contract('pq,xqi->xpi', ovlp0[k], C1_k)
+            SC1 -= contract('xqp,qi->xpi', ovlp1[k][:,p0:p1].conj(), C0[k][p0:p1])
+            SC1[:,p0:p1] -= contract('xpq,qi->xpi', ovlp1[k][:,p0:p1], C0[k])
+            for s in range(2):
+                dm_deriv1 = contract('pj,xjq->xpq', C_inv[k].dot(dm[s][k]), SC1)
+                i0 = i1 = 0
+                for idx, val in zip(U_idx, U_val):
+                    i0, i1 = i1, i1 + len(idx)
+                    P0 = dm_deriv0[s][k][i0:i1,i0:i1]
+                    P1 = dm_deriv1[:,i0:i1,i0:i1]
+                    dE_U[atm_id] += weight * (val * 0.5) * (
+                        cp.einsum('xii->x', P1).real * 2 # *2 for P1+P1.T
+                        - cp.einsum('xij,ji->x', P1, P0).real * 4)
+    return dE_U.get()
+
+class Gradients(kuks_grad.Gradients):
+    def get_veff(self, dm=None, kpts=None):
+        self._dE_U = _hubbard_U_deriv1(self.base, dm, kpts)
+        return kuks_grad.get_veff(self, dm, kpts)
+
+    def extra_force(self, atom_id, envs):
+        val = super().extra_force(atom_id, envs)
+        return self._dE_U[atom_id] + val
diff --git a/gpu4pyscf/pbc/grad/tests/test_pbc_grad_krkspu.py b/gpu4pyscf/pbc/grad/tests/test_pbc_grad_krkspu.py
new file mode 100644
index 00000000..69b72294
--- /dev/null
+++ b/gpu4pyscf/pbc/grad/tests/test_pbc_grad_krkspu.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import pyscf
+from pyscf import lib
+from gpu4pyscf.pbc.dft import krkspu
+from gpu4pyscf.pbc.grad import krkspu as krkspu_grad
+from pyscf.data.nist import BOHR
+
+class KnownValues(unittest.TestCase):
+    def test_finite_diff_local_orbitals(self):
+        cell = pyscf.M(
+            unit = 'A',
+            atom = 'C 0.,  0.,  0.; O 0.5,  0.8,  1.1',
+            a = '''0.      1.7834  1.7834
+                   1.7834  0.      1.7834
+                   1.7834  1.7834  0.    ''',
+            basis = [[0, [1.3, 1]], [1, [0.8, 1]]],
+            pseudo = 'gth-pbe')
+        kpts = cell.make_kpts([3,1,1])
+        minao = 'gth-szv'
+
+        f_local = krkspu_grad.generate_first_order_local_orbitals(cell, minao, kpts)
+        C1 = f_local(1)
+        pcell = cell.copy()
+        C0p = krkspu._make_minao_lo(pcell.set_geom_('C 0 0 0 0; O 0.5 0.801 1.1'), minao, kpts=kpts)
+        C0m = krkspu._make_minao_lo(pcell.set_geom_('C 0 0 0 0; O 0.5 0.799 1.1'), minao, kpts=kpts)
+        for k in range(len(kpts)):
+            ref = (C0p[k] - C0m[k]) / 2e-3 * BOHR
+            self.assertAlmostEqual(abs(C1[k][1] - ref).max().get(), 0, 6)
+
+    def test_finite_diff_hubbard_U_grad(self):
+        cell = pyscf.M(
+            unit = 'A',
+            atom = 'C 0.,  0.,  0.; O 0.5,  0.8,  1.1',
+            a = '''0.      1.7834  1.7834
+                   1.7834  0.      1.7834
+                   1.7834  1.7834  0.    ''',
+            basis = [[0, [1.3, 1]], [1, [0.8, 1]]],
+            pseudo = 'gth-pbe')
+        kpts = cell.make_kpts([3,1,1])
+        minao = 'gth-szv'
+
+        U_idx = ['C 2p']
+        U_val = [5]
+        mf = krkspu.KRKSpU(cell, kpts=kpts, U_idx=U_idx, U_val=U_val, minao_ref=minao)
+        mf.__dict__.update(cell.KRKS(kpts=kpts).to_gpu().run(max_cycle=1).__dict__)
+        de = krkspu_grad._hubbard_U_deriv1(mf)
+
+        mf.cell.set_geom_('C 0 0 0 0; O 0.5 0.801 1.1')
+        e1 = mf.get_veff().E_U.real
+
+        mf.cell.set_geom_('C 0 0 0 0; O 0.5 0.799 1.1')
+        e2 = mf.get_veff().E_U.real
+        self.assertAlmostEqual(de[1,1], (e1 - e2)/2e-3*BOHR, 6)
+
+    def test_finite_diff_krkspu_grad(self):
+        cell = pyscf.M(
+            unit = 'A',
+            atom = 'C 0.,  0.,  0.; O 0.5,  0.8,  1.1',
+            a = '''0.      1.7834  1.7834
+                   1.7834  0.      1.7834
+                   1.7834  1.7834  0.    ''',
+            basis = [[0, [1.3, 1]], [1, [0.8, 1]]],
+            pseudo = 'gth-pbe')
+        kpts = cell.make_kpts([3,1,1])
+        minao = 'gth-szv'
+
+        U_idx = ['C 2p']
+        U_val = [5]
+        mf = krkspu.KRKSpU(cell, kpts=kpts, U_idx=U_idx, U_val=U_val, minao_ref=minao)
+        e, g = mf.nuc_grad_method().as_scanner()(cell)
+        self.assertAlmostEqual(e, -15.939464667807849, 8)
+        self.assertAlmostEqual(lib.fp(g), -0.42370983409650914, 5)
+
+        mf_scanner = mf.as_scanner()
+        e1 = mf_scanner(cell.set_geom_('C 0 0 0 0; O 0.5 0.801 1.1'))
+        e2 = mf_scanner(cell.set_geom_('C 0 0 0 0; O 0.5 0.799 1.1'))
+        self.assertAlmostEqual(g[1,1], (e1-e2)/2e-3*BOHR, 5)
+
+if __name__ == '__main__':
+    print("Full Tests for KRKS+U Gradients")
+    unittest.main()
diff --git a/gpu4pyscf/pbc/grad/tests/test_pbc_grad_kukspu.py b/gpu4pyscf/pbc/grad/tests/test_pbc_grad_kukspu.py
new file mode 100644
index 00000000..06872912
--- /dev/null
+++ b/gpu4pyscf/pbc/grad/tests/test_pbc_grad_kukspu.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import pyscf
+from pyscf import lib
+from gpu4pyscf.pbc.dft import kukspu
+from gpu4pyscf.pbc.grad import kukspu as kukspu_grad
+from pyscf.data.nist import BOHR
+
+class KnownValues(unittest.TestCase):
+    def test_finite_diff_hubbard_U_grad(self):
+        cell = pyscf.M(
+            unit = 'A',
+            atom = 'C 0.,  0.,  0.; O 0.5,  0.8,  1.1',
+            a = '''0.      1.7834  1.7834
+                   1.7834  0.      1.7834
+                   1.7834  1.7834  0.    ''',
+            basis = [[0, [1.3, 1]], [1, [0.8, 1]]],
+            spin = 2,
+            pseudo = 'gth-pbe')
+        kpts = cell.make_kpts([3,1,1])
+        minao = 'gth-szv'
+
+        U_idx = ['C 2p']
+        U_val = [5]
+        mf = kukspu.KUKSpU(cell, kpts=kpts, U_idx=U_idx, U_val=U_val, minao_ref=minao)
+        mf.__dict__.update(cell.KUKS(kpts=kpts).to_gpu().run(max_cycle=1).__dict__)
+        de = kukspu_grad._hubbard_U_deriv1(mf)
+
+        mf.cell.set_geom_('C 0 0 0 0; O 0.5 0.801 1.1')
+        e1 = mf.get_veff().E_U.real
+
+        mf.cell.set_geom_('C 0 0 0 0; O 0.5 0.799 1.1')
+        e2 = mf.get_veff().E_U.real
+        self.assertAlmostEqual(de[1,1], (e1 - e2)/2e-3*BOHR, 6)
+
+    def test_finite_diff_kukspu_grad(self):
+        cell = pyscf.M(
+            unit = 'A',
+            atom = 'C 0.,  0.,  0.; O 0.5,  0.8,  1.1',
+            a = '''0.      1.7834  1.7834
+                   1.7834  0.      1.7834
+                   1.7834  1.7834  0.    ''',
+            basis = [[0, [1.3, 1]], [1, [0.8, 1]]],
+            pseudo = 'gth-pbe')
+        kpts = cell.make_kpts([3,1,1])
+        minao = 'gth-szv'
+
+        U_idx = ['C 2p']
+        U_val = [5]
+        mf = kukspu.KUKSpU(cell, kpts=kpts, U_idx=U_idx, U_val=U_val, minao_ref=minao)
+        e, g = mf.nuc_grad_method().as_scanner()(cell)
+        self.assertAlmostEqual(e, -15.939464667807849, 8)
+        self.assertAlmostEqual(lib.fp(g), -0.42370983409650914, 5)
+
+        mf_scanner = mf.as_scanner()
+        e1 = mf_scanner(cell.set_geom_('C 0 0 0 0; O 0.5 0.801 1.1'))
+        e2 = mf_scanner(cell.set_geom_('C 0 0 0 0; O 0.5 0.799 1.1'))
+        self.assertAlmostEqual(g[1,1], (e1-e2)/2e-3*BOHR, 5)
+
+if __name__ == '__main__':
+    print("Full Tests for KUKS+U Gradients")
+    unittest.main()

From b34b3bd196678e348eb95704be8dee8ca98dec31 Mon Sep 17 00:00:00 2001
From: Qiming Sun <qiming.sun@bytedance.com>
Date: Fri, 18 Jul 2025 06:03:13 +0800
Subject: [PATCH 4/9] Fix KUKS+U linear response U method

---
 gpu4pyscf/pbc/dft/kukspu.py                |   5 +-
 gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py | 126 ++++++++++++++++++++
 gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py | 127 +++++++++++++++++++++
 3 files changed, 256 insertions(+), 2 deletions(-)
 create mode 100644 gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py
 create mode 100644 gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py

diff --git a/gpu4pyscf/pbc/dft/kukspu.py b/gpu4pyscf/pbc/dft/kukspu.py
index 3c37adbf..ab43aaa6 100644
--- a/gpu4pyscf/pbc/dft/kukspu.py
+++ b/gpu4pyscf/pbc/dft/kukspu.py
@@ -259,9 +259,10 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
         # The first iteration of SCF
         fock = mf.get_fock(dm=bare_dm)
         e, mo = mf.eig(fock, ovlp)
+        local_occ = 0
         for c in C_inv:
-            C_on_site = [[c[k].dot(mf.mo_coeff[0][k]) for k in range(nkpts)],
-                         [c[k].dot(mf.mo_coeff[1][k]) for k in range(nkpts)]]
+            C_on_site = [[c[k].dot(mo[0][k]) for k in range(nkpts)],
+                         [c[k].dot(mo[1][k]) for k in range(nkpts)]]
             rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
             local_occ += sum(x.trace().real for x in rdm1_lo[0])
             local_occ += sum(x.trace().real for x in rdm1_lo[1])
diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py b/gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py
new file mode 100644
index 00000000..732d1a6b
--- /dev/null
+++ b/gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# Copyright 2014-2023 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Authors: Zhi-Hao Cui <zhcui0408@gmail.com>
+#
+
+import unittest
+import numpy as np
+
+from pyscf import lib
+from pyscf.pbc import gto as pgto
+from pyscf.pbc import dft as pdft
+from pyscf.pbc.dft import krkspu
+
+def setUpModule():
+    global cell
+    cell = pgto.Cell()
+    cell.unit = 'A'
+    cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
+    cell.a = '''0.      1.7834  1.7834
+                1.7834  0.      1.7834
+                1.7834  1.7834  0.    '''
+
+    cell.basis = 'gth-dzvp'
+    cell.pseudo = 'gth-pade'
+    cell.verbose = 7
+    cell.output = '/dev/null'
+    cell.mesh = [29]*3
+    cell.space_group_symmetry=True
+    cell.build()
+
+def tearDownModule():
+    global cell
+    cell.stdout.close()
+    del cell
+
+class KnownValues(unittest.TestCase):
+    def test_KRKSpU_high_cost(self):
+        kmesh = [2, 1, 1]
+        kpts = cell.make_kpts(kmesh, wrap_around=True)
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        mf = pdft.KRKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
+                minao_ref='gth-szv')
+        mf.conv_tol = 1e-10
+        e1 = mf.kernel()
+        self.assertAlmostEqual(e1, -10.694460059491741, 8)
+
+    def test_KRKSpU_ksymm(self):
+        cell1 = cell.copy()
+        cell1.basis = 'gth-szv'
+        cell1.mesh = [16,]*3
+        cell1.build()
+
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        kmesh = [2, 2, 1]
+        kpts0 = cell1.make_kpts(kmesh, wrap_around=True)
+        mf0 = pdft.KRKSpU(cell1, kpts0, U_idx=U_idx, U_val=U_val, C_ao_lo='minao')
+        e0 = mf0.kernel()
+
+        kpts = cell1.make_kpts(kmesh, wrap_around=True,
+                               space_group_symmetry=True, time_reversal_symmetry=True)
+        assert kpts.nkpts_ibz == 3
+        mf = pdft.KRKSpU(cell1, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao')
+        e1 = mf.kernel()
+        self.assertAlmostEqual(e1, e0, 8)
+
+    def test_get_veff(self):
+        kmesh = [2, 1, 1]
+        kpts = cell.make_kpts(kmesh, wrap_around=True)
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        mf = pdft.KRKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
+                         minao_ref='gth-szv')
+        dm = mf.get_init_guess(cell, 'minao')
+        vxc = mf.get_veff(cell, dm)
+        self.assertAlmostEqual(vxc.E_U, 0.07587726255165786, 11)
+        self.assertAlmostEqual(lib.fp(vxc), 12.77643098220399, 8)
+
+    def test_KRKSpU_linear_response(self):
+        cell = pgto.Cell()
+        cell.unit = 'A'
+        cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
+        cell.a = '''0.      1.7834  1.7834
+                    1.7834  0.      1.7834
+                    1.7834  1.7834  0.    '''
+
+        cell.basis = 'gth-szv'
+        cell.pseudo = 'gth-pade'
+        cell.verbose = 7
+        cell.output = '/dev/null'
+        cell.mesh = [29]*3
+        cell.build()
+        kmesh = [2, 1, 1]
+        kpts = cell.make_kpts(kmesh, wrap_around=True)
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        mf = pdft.KRKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
+                minao_ref='gth-szv')
+        mf.conv_tol = 1e-10
+        e_tot = mf.kernel()
+        self.assertAlmostEqual(e_tot, -10.6191452297714, 8)
+
+        uresp = krkspu.linear_response_u(mf, (0.03, 0.08))
+        self.assertAlmostEqual(uresp, 6.279179, 2)
+
+if __name__ == '__main__':
+    print("Full Tests for pbc.dft.krkspu")
+    unittest.main()
diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py b/gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py
new file mode 100644
index 00000000..b4d0ee91
--- /dev/null
+++ b/gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+# Copyright 2014-2023 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Authors: Zhi-Hao Cui <zhcui0408@gmail.com>
+#
+
+import unittest
+import numpy as np
+
+from pyscf import lib
+from pyscf.pbc import gto as pgto
+from pyscf.pbc import dft as pdft
+from pyscf.pbc.dft import kukspu
+
+def setUpModule():
+    global cell
+    cell = pgto.Cell()
+    cell.unit = 'A'
+    cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
+    cell.a = '''0.      1.7834  1.7834
+                1.7834  0.      1.7834
+                1.7834  1.7834  0.    '''
+
+    cell.basis = 'gth-dzvp'
+    cell.pseudo = 'gth-pade'
+    cell.verbose = 7
+    cell.output = '/dev/null'
+    cell.mesh = [29]*3
+    cell.space_group_symmetry = True
+    cell.build()
+
+def tearDownModule():
+    global cell
+    cell.stdout.close()
+    del cell
+
+class KnownValues(unittest.TestCase):
+    def test_KUKSpU_high_cost(self):
+        kmesh = [2, 1, 1]
+        kpts = cell.make_kpts(kmesh, wrap_around=True)
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        mf = pdft.KUKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
+                minao_ref='gth-szv')
+        mf.conv_tol = 1e-10
+        e1 = mf.kernel()
+        self.assertAlmostEqual(e1, -10.694460059491741, 8)
+
+    def test_KUKSpU_ksymm(self):
+        cell1 = cell.copy()
+        cell1.basis = 'gth-szv'
+        cell1.mesh = [16,]*3
+        cell1.build()
+
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        kmesh = [2, 2, 1]
+        kpts0 = cell1.make_kpts(kmesh, wrap_around=True)
+        mf0 = pdft.KUKSpU(cell1, kpts0, U_idx=U_idx, U_val=U_val, C_ao_lo='minao')
+        e0 = mf0.kernel()
+
+        kpts = cell1.make_kpts(kmesh, wrap_around=True,
+                               space_group_symmetry=True, time_reversal_symmetry=True)
+        assert kpts.nkpts_ibz == 3
+        mf = pdft.KUKSpU(cell1, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao')
+        e1 = mf.kernel()
+        self.assertAlmostEqual(e1, e0, 8)
+
+    def test_get_veff(self):
+        kmesh = [2, 1, 1]
+        kpts = cell.make_kpts(kmesh, wrap_around=True)
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        mf = pdft.KUKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
+                         minao_ref='gth-szv')
+        dm = mf.get_init_guess(cell, 'minao')
+        vxc = mf.get_veff(cell, dm)
+        self.assertAlmostEqual(vxc.E_U, 0.07587726255165786, 11)
+        self.assertAlmostEqual(lib.fp(vxc), 6.37407828665724, 8)
+
+    def test_KUKSpU_linear_response(self):
+        cell = pgto.Cell()
+        cell.unit = 'A'
+        cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
+        cell.a = '''0.      1.7834  1.7834
+                    1.7834  0.      1.7834
+                    1.7834  1.7834  0.    '''
+
+        cell.basis = 'gth-szv'
+        cell.pseudo = 'gth-pade'
+        cell.verbose = 7
+        cell.output = '/dev/null'
+        cell.mesh = [29]*3
+        cell.build()
+        kmesh = [2, 1, 1]
+        kpts = cell.make_kpts(kmesh, wrap_around=True)
+        U_idx = ["1 C 2p"]
+        U_val = [5.0]
+
+        mf = pdft.KUKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
+                minao_ref='gth-szv')
+        mf.conv_tol = 1e-10
+        e_tot = mf.kernel()
+        self.assertAlmostEqual(e_tot, -10.6191452297714, 8)
+
+        uresp = kukspu.linear_response_u(mf, (0.03, 0.08))
+        self.assertAlmostEqual(uresp, 6.279179, 2)
+
+
+if __name__ == '__main__':
+    print("Full Tests for pbc.dft.kukspu")
+    unittest.main()

From 7f3576571154bc567d25c0b8582ad38ab9d6da78 Mon Sep 17 00:00:00 2001
From: Qiming Sun <qiming.sun@bytedance.com>
Date: Fri, 18 Jul 2025 06:43:38 +0800
Subject: [PATCH 5/9] Fix bug introduced in the gradients base method

---
 gpu4pyscf/grad/rhf.py | 2 +-
 gpu4pyscf/grad/uhf.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index eb632310..92a4841c 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -305,7 +305,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     delec = 2.0*(dh - ds)
 
     delec = cupy.asarray([cupy.sum(delec[:, p0:p1], axis=1) for p0, p1 in aoslices[:,2:]])
-    de = 2.0 * dvhf + dh1e + delec + extra_force
+    de = 2.0 * dvhf + dh1e + delec + asarray(extra_force)
 
     # for backforward compatiability
     if(hasattr(mf, 'disp') and mf.disp is not None):
diff --git a/gpu4pyscf/grad/uhf.py b/gpu4pyscf/grad/uhf.py
index 50cf0548..f24a1b86 100644
--- a/gpu4pyscf/grad/uhf.py
+++ b/gpu4pyscf/grad/uhf.py
@@ -94,7 +94,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     delec = 2.0*(dh - ds)
     delec = cupy.asarray([cupy.sum(delec[:, p0:p1], axis=1) for p0, p1 in aoslices[:,2:]])
 
-    de = 2.0 * dvhf + dh1e + delec + extra_force
+    de = 2.0 * dvhf + dh1e + delec + asarray(extra_force)
 
     # for backward compatiability
     if(hasattr(mf, 'disp') and mf.disp is not None):

From 819d1dd5c8e5f46b2c079d88cf0eaeb9adec32b5 Mon Sep 17 00:00:00 2001
From: Qiming Sun <qiming.sun@bytedance.com>
Date: Sat, 19 Jul 2025 03:50:45 +0800
Subject: [PATCH 6/9] miss import

---
 gpu4pyscf/grad/rhf.py | 2 +-
 gpu4pyscf/grad/uhf.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index 92a4841c..322e1841 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -305,7 +305,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     delec = 2.0*(dh - ds)
 
     delec = cupy.asarray([cupy.sum(delec[:, p0:p1], axis=1) for p0, p1 in aoslices[:,2:]])
-    de = 2.0 * dvhf + dh1e + delec + asarray(extra_force)
+    de = 2.0 * dvhf + dh1e + delec + cupy.asarray(extra_force)
 
     # for backforward compatiability
     if(hasattr(mf, 'disp') and mf.disp is not None):
diff --git a/gpu4pyscf/grad/uhf.py b/gpu4pyscf/grad/uhf.py
index f24a1b86..426820fe 100644
--- a/gpu4pyscf/grad/uhf.py
+++ b/gpu4pyscf/grad/uhf.py
@@ -94,7 +94,7 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     delec = 2.0*(dh - ds)
     delec = cupy.asarray([cupy.sum(delec[:, p0:p1], axis=1) for p0, p1 in aoslices[:,2:]])
 
-    de = 2.0 * dvhf + dh1e + delec + asarray(extra_force)
+    de = 2.0 * dvhf + dh1e + delec + cupy.asarray(extra_force)
 
     # for backward compatiability
     if(hasattr(mf, 'disp') and mf.disp is not None):

From e4a57d2cae2ac2b609153a0d278834bfcc0270f0 Mon Sep 17 00:00:00 2001
From: Qiming Sun <qiming.sun@bytedance.com>
Date: Tue, 22 Jul 2025 01:11:12 +0800
Subject: [PATCH 7/9] DFT+U gradients fixes

---
 gpu4pyscf/grad/tests/test_grad_rkspu.py       |  7 ++-
 gpu4pyscf/grad/tests/test_grad_ukspu.py       |  7 ++-
 gpu4pyscf/pbc/dft/__init__.py                 |  4 ++
 gpu4pyscf/pbc/dft/krkspu.py                   | 11 ++--
 gpu4pyscf/pbc/dft/kukspu.py                   | 12 ++--
 gpu4pyscf/pbc/dft/tests/test_pbc_dftu.py      | 55 -------------------
 .../pbc/grad/tests/test_pbc_grad_krkspu.py    |  2 +-
 .../pbc/grad/tests/test_pbc_grad_kukspu.py    |  2 +-
 8 files changed, 23 insertions(+), 77 deletions(-)

diff --git a/gpu4pyscf/grad/tests/test_grad_rkspu.py b/gpu4pyscf/grad/tests/test_grad_rkspu.py
index df85149f..adacb67c 100644
--- a/gpu4pyscf/grad/tests/test_grad_rkspu.py
+++ b/gpu4pyscf/grad/tests/test_grad_rkspu.py
@@ -53,13 +53,14 @@ def test_finite_diff_hubbard_U_grad(self):
         self.assertAlmostEqual(de[1,2], (e1 - e2)/2e-3, 6)
 
     def test_finite_diff_rkspu_grad(self):
-        mol = gto.M(atom='C 0 1.6 0; O 0 0 1', basis='ccpvdz', unit='B', verbose=0)
+        mol = gto.M(atom='C 0 0 0; O 1 2 1', basis='ccpvdz', unit='B', verbose=0)
         U_idx = ["C 2p"]
         U_val = [5.0]
         mf = rkspu.RKSpU(mol, xc='pbe', U_idx=U_idx, U_val=U_val)
+        mol = gto.M(atom='C 0 1.6 0; O 0 0 1', basis='ccpvdz', unit='B', verbose=0)
         e, g = mf.nuc_grad_method().as_scanner()(mol)
-        self.assertAlmostEqual(e, -113.0468869772191, 8)
-        self.assertAlmostEqual(lib.fp(g), -0.7728822285688215, 5)
+        self.assertAlmostEqual(e, -113.00924201782233, 8)
+        self.assertAlmostEqual(lib.fp(g), -0.7753406769593958, 5)
 
         mol1 = mol.copy()
         mf_scanner = mf.as_scanner()
diff --git a/gpu4pyscf/grad/tests/test_grad_ukspu.py b/gpu4pyscf/grad/tests/test_grad_ukspu.py
index 947ad41a..3bffd88b 100644
--- a/gpu4pyscf/grad/tests/test_grad_ukspu.py
+++ b/gpu4pyscf/grad/tests/test_grad_ukspu.py
@@ -36,13 +36,14 @@ def test_finite_diff_hubbard_U_grad(self):
         self.assertAlmostEqual(de[1,2], (e1 - e2)/2e-3, 6)
 
     def test_finite_diff_ukspu_grad(self):
-        mol = gto.M(atom='C 0 1.6 0; O 0 0 1', spin=2, basis='ccpvdz', unit='B', verbose=0)
+        mol = gto.M(atom='C 0 0 0; O 1 2 1', basis='ccpvdz', unit='B', verbose=0)
         U_idx = ["C 2p"]
         U_val = [5.0]
         mf = ukspu.UKSpU(mol, xc='pbe', U_idx=U_idx, U_val=U_val)
+        mol = gto.M(atom='C 0 1.6 0; O 0 0 1', spin=2, basis='ccpvdz', unit='B', verbose=0)
         e, g = mf.nuc_grad_method().as_scanner()(mol)
-        self.assertAlmostEqual(e, -112.7869835493314, 8)
-        self.assertAlmostEqual(lib.fp(g), -1.0489136822191656, 5)
+        self.assertAlmostEqual(e, -112.76620903336628, 8)
+        self.assertAlmostEqual(lib.fp(g), -1.0568299810814519, 5)
 
         mol1 = mol.copy()
         mf_scanner = mf.as_scanner()
diff --git a/gpu4pyscf/pbc/dft/__init__.py b/gpu4pyscf/pbc/dft/__init__.py
index 3d5a64b7..c31cb0b3 100644
--- a/gpu4pyscf/pbc/dft/__init__.py
+++ b/gpu4pyscf/pbc/dft/__init__.py
@@ -20,9 +20,13 @@
 from . import uks
 from . import krks
 from . import kuks
+from . import krkspu
+from . import kukspu
 from .rks import KohnShamDFT
 
 RKS = rks.RKS
 UKS = uks.UKS
 KRKS = krks.KRKS
 KUKS = kuks.KUKS
+KRKSpU = krkspu.KRKSpU
+KUKSpU = kukspu.KUKSpU
diff --git a/gpu4pyscf/pbc/dft/krkspu.py b/gpu4pyscf/pbc/dft/krkspu.py
index 71648e74..35cc0c91 100644
--- a/gpu4pyscf/pbc/dft/krkspu.py
+++ b/gpu4pyscf/pbc/dft/krkspu.py
@@ -28,7 +28,7 @@
 from gpu4pyscf.dft.rkspu import _set_U, reference_mol
 from gpu4pyscf.lib import logger
 from gpu4pyscf.pbc.dft import krks
-from gpu4pyscf.lib.cupy_helper import asarray
+from gpu4pyscf.lib.cupy_helper import asarray, contract
 
 def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
              kpts=None, kpts_band=None):
@@ -171,7 +171,7 @@ def _make_minao_lo(cell, minao_ref='minao', kpts=None):
         S0 = C.conj().T.dot(S_k).dot(C)
         w2, v = cp.linalg.eigh(S0)
         C_minao.append(C.dot((v*cp.sqrt(1./w2)).dot(v.conj().T)))
-    return C_minao
+    return cp.asarray(C_minao)
 
 class KRKSpU(krks.KRKS):
     """
@@ -282,8 +282,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
         C_ao_lo = _make_minao_lo(cell, pcell, kpts)
     else:
         C_ao_lo = mf.C_ao_lo
-    C_inv = [[C_k[:,local_idx].conj().T.dot(S_k) for C_k, S_k in zip(C_ao_lo, ovlp)]
-             for local_idx in U_idx]
+    C_inv = [contract('kpi,kpq->kiq', C_ao_lo[:,:,local_idx].conj(), ovlp) for local_idx in U_idx]
 
     bare_occupancies = []
     final_occupancies = []
@@ -293,7 +292,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
         mf.kernel(dm0=bare_dm)
         local_occ = 0
         for c in C_inv:
-            C_on_site = [c[k].dot(mf.mo_coeff[k]) for k in range(nkpts)]
+            C_on_site = contract('kiq,kqj->kij', c, mf.mo_coeff)
             rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
             local_occ += sum(x.trace().real for x in rdm1_lo)
         local_occ /= nkpts
@@ -304,7 +303,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
         e, mo = mf.eig(fock, ovlp)
         local_occ = 0
         for c in C_inv:
-            C_on_site = [c[k].dot(mo[k]) for k in range(nkpts)]
+            C_on_site = contract('kiq,kqj->kij', c, mo)
             rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
             local_occ += sum(x.trace().real for x in rdm1_lo)
         local_occ /= nkpts
diff --git a/gpu4pyscf/pbc/dft/kukspu.py b/gpu4pyscf/pbc/dft/kukspu.py
index ab43aaa6..99557011 100644
--- a/gpu4pyscf/pbc/dft/kukspu.py
+++ b/gpu4pyscf/pbc/dft/kukspu.py
@@ -27,7 +27,7 @@
 from gpu4pyscf.lib import logger
 from gpu4pyscf.pbc.dft import kuks
 from gpu4pyscf.pbc.dft.krkspu import _set_U, _make_minao_lo, reference_mol
-from gpu4pyscf.lib.cupy_helper import asarray, tag_array
+from gpu4pyscf.lib.cupy_helper import asarray, contract, tag_array
 
 def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
              kpts=None, kpts_band=None):
@@ -237,9 +237,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
     pcell = reference_mol(cell, mf.minao_ref)
     U_idx, U_val, U_lab = _set_U(cell, pcell, mf.U_idx, mf.U_val)
     C_ao_lo = _make_minao_lo(cell, pcell, kpts)
-    C_inv = []
-    for local_idx in U_idx:
-        C_inv.append([C_k[:,local_idx].conj().T.dot(S_k) for C_k, S_k in zip(C_ao_lo, ovlp)])
+    C_inv = [contract('kpi,kpq->kiq', C_ao_lo[:,:,local_idx].conj(), ovlp) for local_idx in U_idx]
 
     bare_occupancies = []
     final_occupancies = []
@@ -248,8 +246,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
         mf.kernel(dm0=bare_dm)
         local_occ = 0
         for c in C_inv:
-            C_on_site = [[c[k].dot(mf.mo_coeff[0][k]) for k in range(nkpts)],
-                         [c[k].dot(mf.mo_coeff[1][k]) for k in range(nkpts)]]
+            C_on_site = contract('kiq,nkqj->nkij', c, mf.mo_coeff)
             rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
             local_occ += sum(x.trace().real for x in rdm1_lo[0])
             local_occ += sum(x.trace().real for x in rdm1_lo[1])
@@ -261,8 +258,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
         e, mo = mf.eig(fock, ovlp)
         local_occ = 0
         for c in C_inv:
-            C_on_site = [[c[k].dot(mo[0][k]) for k in range(nkpts)],
-                         [c[k].dot(mo[1][k]) for k in range(nkpts)]]
+            C_on_site = contract('kiq,nkqj->nkij', c, mo)
             rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
             local_occ += sum(x.trace().real for x in rdm1_lo[0])
             local_occ += sum(x.trace().real for x in rdm1_lo[1])
diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_dftu.py b/gpu4pyscf/pbc/dft/tests/test_pbc_dftu.py
index 6e2b90bb..61561623 100644
--- a/gpu4pyscf/pbc/dft/tests/test_pbc_dftu.py
+++ b/gpu4pyscf/pbc/dft/tests/test_pbc_dftu.py
@@ -75,61 +75,6 @@ def test_get_veff(self):
         self.assertAlmostEqual(vxc.E_U, 0.07587726255165786, 11)
         self.assertAlmostEqual(lib.fp(vxc.get()), 12.77643098220399, 8)
 
-    def test_KRKSpU_linear_response(self):
-        cell = pgto.Cell()
-        cell.unit = 'A'
-        cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
-        cell.a = '''0.      1.7834  1.7834
-                    1.7834  0.      1.7834
-                    1.7834  1.7834  0.    '''
-
-        cell.basis = 'gth-szv'
-        cell.pseudo = 'gth-pade'
-        cell.verbose = 7
-        cell.output = '/dev/null'
-        cell.mesh = [29]*3
-        cell.build()
-        kmesh = [2, 1, 1]
-        kpts = cell.make_kpts(kmesh, wrap_around=True)
-        U_idx = ["1 C 2p"]
-        U_val = [5.0]
-
-        mf = krkspu.KRKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
-                minao_ref='gth-szv')
-        mf.conv_tol = 1e-10
-        e_tot = mf.kernel()
-        self.assertAlmostEqual(e_tot, -10.6191452297714, 8)
-
-        uresp = krkspu.linear_response_u(mf, (0.03, 0.08))
-        self.assertAlmostEqual(uresp, 6.279179, 2)
-
-    def test_KUKSpU_linear_response(self):
-        cell = pgto.Cell()
-        cell.unit = 'A'
-        cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
-        cell.a = '''0.      1.7834  1.7834
-                    1.7834  0.      1.7834
-                    1.7834  1.7834  0.    '''
-
-        cell.basis = 'gth-szv'
-        cell.pseudo = 'gth-pade'
-        cell.verbose = 7
-        cell.output = '/dev/null'
-        cell.mesh = [29]*3
-        cell.build()
-        kmesh = [2, 1, 1]
-        kpts = cell.make_kpts(kmesh, wrap_around=True)
-        U_idx = ["1 C 2p"]
-        U_val = [5.0]
-
-        mf = kukspu.KUKSpU(cell, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao',
-                minao_ref='gth-szv')
-        mf.conv_tol = 1e-10
-        e_tot = mf.kernel()
-        self.assertAlmostEqual(e_tot, -10.6191452297714, 8)
-
-        uresp = kukspu.linear_response_u(mf, (0.03, 0.08))
-        self.assertAlmostEqual(uresp, 6.279179, 2)
 
 if __name__ == '__main__':
     print("Full Tests for pbc.dft dft+U")
diff --git a/gpu4pyscf/pbc/grad/tests/test_pbc_grad_krkspu.py b/gpu4pyscf/pbc/grad/tests/test_pbc_grad_krkspu.py
index 69b72294..c8e46ed8 100644
--- a/gpu4pyscf/pbc/grad/tests/test_pbc_grad_krkspu.py
+++ b/gpu4pyscf/pbc/grad/tests/test_pbc_grad_krkspu.py
@@ -84,7 +84,7 @@ def test_finite_diff_krkspu_grad(self):
         mf = krkspu.KRKSpU(cell, kpts=kpts, U_idx=U_idx, U_val=U_val, minao_ref=minao)
         e, g = mf.nuc_grad_method().as_scanner()(cell)
         self.assertAlmostEqual(e, -15.939464667807849, 8)
-        self.assertAlmostEqual(lib.fp(g), -0.42370983409650914, 5)
+        self.assertAlmostEqual(lib.fp(g), -0.42370983409650914, 4)
 
         mf_scanner = mf.as_scanner()
         e1 = mf_scanner(cell.set_geom_('C 0 0 0 0; O 0.5 0.801 1.1'))
diff --git a/gpu4pyscf/pbc/grad/tests/test_pbc_grad_kukspu.py b/gpu4pyscf/pbc/grad/tests/test_pbc_grad_kukspu.py
index 06872912..0669b0d7 100644
--- a/gpu4pyscf/pbc/grad/tests/test_pbc_grad_kukspu.py
+++ b/gpu4pyscf/pbc/grad/tests/test_pbc_grad_kukspu.py
@@ -64,7 +64,7 @@ def test_finite_diff_kukspu_grad(self):
         mf = kukspu.KUKSpU(cell, kpts=kpts, U_idx=U_idx, U_val=U_val, minao_ref=minao)
         e, g = mf.nuc_grad_method().as_scanner()(cell)
         self.assertAlmostEqual(e, -15.939464667807849, 8)
-        self.assertAlmostEqual(lib.fp(g), -0.42370983409650914, 5)
+        self.assertAlmostEqual(lib.fp(g), -0.42370983409650914, 4)
 
         mf_scanner = mf.as_scanner()
         e1 = mf_scanner(cell.set_geom_('C 0 0 0 0; O 0.5 0.801 1.1'))

From 0cf6a74697e6c19f333575a7c2a3097a7b638dce Mon Sep 17 00:00:00 2001
From: Qiming Sun <qiming.sun@bytedance.com>
Date: Tue, 22 Jul 2025 04:43:32 +0800
Subject: [PATCH 8/9] Update KRKSpU and KUKSpU tests

---
 gpu4pyscf/pbc/dft/krkspu.py                |  2 ++
 gpu4pyscf/pbc/dft/kukspu.py                |  4 +++-
 gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py | 28 +++-------------------
 gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py | 28 +++-------------------
 4 files changed, 11 insertions(+), 51 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/krkspu.py b/gpu4pyscf/pbc/dft/krkspu.py
index 35cc0c91..50143bb5 100644
--- a/gpu4pyscf/pbc/dft/krkspu.py
+++ b/gpu4pyscf/pbc/dft/krkspu.py
@@ -295,6 +295,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
             C_on_site = contract('kiq,kqj->kij', c, mf.mo_coeff)
             rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
             local_occ += sum(x.trace().real for x in rdm1_lo)
+        local_occ = local_occ.get()
         local_occ /= nkpts
         final_occupancies.append(local_occ)
 
@@ -306,6 +307,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
             C_on_site = contract('kiq,kqj->kij', c, mo)
             rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
             local_occ += sum(x.trace().real for x in rdm1_lo)
+        local_occ = local_occ.get()
         local_occ /= nkpts
         bare_occupancies.append(local_occ)
         log.info('alpha=%f bare_occ=%g final_occ=%g',
diff --git a/gpu4pyscf/pbc/dft/kukspu.py b/gpu4pyscf/pbc/dft/kukspu.py
index 99557011..22691b29 100644
--- a/gpu4pyscf/pbc/dft/kukspu.py
+++ b/gpu4pyscf/pbc/dft/kukspu.py
@@ -183,7 +183,7 @@ def dump_flags(self, verbose=None):
         super().dump_flags(verbose)
         log = logger.new_logger(self, verbose)
         if log.verbose >= logger.INFO:
-            from gpu4pyscf.dft.krkspu import _print_U_info
+            from gpu4pyscf.dft.rkspu import _print_U_info
             _print_U_info(self, log)
         return self
 
@@ -250,6 +250,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
             rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
             local_occ += sum(x.trace().real for x in rdm1_lo[0])
             local_occ += sum(x.trace().real for x in rdm1_lo[1])
+        local_occ = local_occ.get()
         local_occ /= nkpts
         final_occupancies.append(local_occ)
 
@@ -262,6 +263,7 @@ def linear_response_u(mf_plus_u, alphalist=(0.02, 0.05, 0.08)):
             rdm1_lo = mf.make_rdm1(C_on_site, mf.mo_occ)
             local_occ += sum(x.trace().real for x in rdm1_lo[0])
             local_occ += sum(x.trace().real for x in rdm1_lo[1])
+        local_occ = local_occ.get()
         local_occ /= nkpts
         bare_occupancies.append(local_occ)
         log.info('alpha=%f bare_occ=%g final_occ=%g',
diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py b/gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py
index 732d1a6b..d1ac0052 100644
--- a/gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py
+++ b/gpu4pyscf/pbc/dft/tests/test_pbc_krkspu.py
@@ -21,8 +21,8 @@
 
 from pyscf import lib
 from pyscf.pbc import gto as pgto
-from pyscf.pbc import dft as pdft
-from pyscf.pbc.dft import krkspu
+from gpu4pyscf.pbc import dft as pdft
+from gpu4pyscf.pbc.dft import krkspu
 
 def setUpModule():
     global cell
@@ -38,7 +38,6 @@ def setUpModule():
     cell.verbose = 7
     cell.output = '/dev/null'
     cell.mesh = [29]*3
-    cell.space_group_symmetry=True
     cell.build()
 
 def tearDownModule():
@@ -59,27 +58,6 @@ def test_KRKSpU_high_cost(self):
         e1 = mf.kernel()
         self.assertAlmostEqual(e1, -10.694460059491741, 8)
 
-    def test_KRKSpU_ksymm(self):
-        cell1 = cell.copy()
-        cell1.basis = 'gth-szv'
-        cell1.mesh = [16,]*3
-        cell1.build()
-
-        U_idx = ["1 C 2p"]
-        U_val = [5.0]
-
-        kmesh = [2, 2, 1]
-        kpts0 = cell1.make_kpts(kmesh, wrap_around=True)
-        mf0 = pdft.KRKSpU(cell1, kpts0, U_idx=U_idx, U_val=U_val, C_ao_lo='minao')
-        e0 = mf0.kernel()
-
-        kpts = cell1.make_kpts(kmesh, wrap_around=True,
-                               space_group_symmetry=True, time_reversal_symmetry=True)
-        assert kpts.nkpts_ibz == 3
-        mf = pdft.KRKSpU(cell1, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao')
-        e1 = mf.kernel()
-        self.assertAlmostEqual(e1, e0, 8)
-
     def test_get_veff(self):
         kmesh = [2, 1, 1]
         kpts = cell.make_kpts(kmesh, wrap_around=True)
@@ -91,7 +69,7 @@ def test_get_veff(self):
         dm = mf.get_init_guess(cell, 'minao')
         vxc = mf.get_veff(cell, dm)
         self.assertAlmostEqual(vxc.E_U, 0.07587726255165786, 11)
-        self.assertAlmostEqual(lib.fp(vxc), 12.77643098220399, 8)
+        self.assertAlmostEqual(lib.fp(vxc.get()), 12.77643098220399, 8)
 
     def test_KRKSpU_linear_response(self):
         cell = pgto.Cell()
diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py b/gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py
index b4d0ee91..d1f53763 100644
--- a/gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py
+++ b/gpu4pyscf/pbc/dft/tests/test_pbc_kukspu.py
@@ -21,8 +21,8 @@
 
 from pyscf import lib
 from pyscf.pbc import gto as pgto
-from pyscf.pbc import dft as pdft
-from pyscf.pbc.dft import kukspu
+from gpu4pyscf.pbc import dft as pdft
+from gpu4pyscf.pbc.dft import kukspu
 
 def setUpModule():
     global cell
@@ -38,7 +38,6 @@ def setUpModule():
     cell.verbose = 7
     cell.output = '/dev/null'
     cell.mesh = [29]*3
-    cell.space_group_symmetry = True
     cell.build()
 
 def tearDownModule():
@@ -59,27 +58,6 @@ def test_KUKSpU_high_cost(self):
         e1 = mf.kernel()
         self.assertAlmostEqual(e1, -10.694460059491741, 8)
 
-    def test_KUKSpU_ksymm(self):
-        cell1 = cell.copy()
-        cell1.basis = 'gth-szv'
-        cell1.mesh = [16,]*3
-        cell1.build()
-
-        U_idx = ["1 C 2p"]
-        U_val = [5.0]
-
-        kmesh = [2, 2, 1]
-        kpts0 = cell1.make_kpts(kmesh, wrap_around=True)
-        mf0 = pdft.KUKSpU(cell1, kpts0, U_idx=U_idx, U_val=U_val, C_ao_lo='minao')
-        e0 = mf0.kernel()
-
-        kpts = cell1.make_kpts(kmesh, wrap_around=True,
-                               space_group_symmetry=True, time_reversal_symmetry=True)
-        assert kpts.nkpts_ibz == 3
-        mf = pdft.KUKSpU(cell1, kpts, U_idx=U_idx, U_val=U_val, C_ao_lo='minao')
-        e1 = mf.kernel()
-        self.assertAlmostEqual(e1, e0, 8)
-
     def test_get_veff(self):
         kmesh = [2, 1, 1]
         kpts = cell.make_kpts(kmesh, wrap_around=True)
@@ -91,7 +69,7 @@ def test_get_veff(self):
         dm = mf.get_init_guess(cell, 'minao')
         vxc = mf.get_veff(cell, dm)
         self.assertAlmostEqual(vxc.E_U, 0.07587726255165786, 11)
-        self.assertAlmostEqual(lib.fp(vxc), 6.37407828665724, 8)
+        self.assertAlmostEqual(lib.fp(vxc.get()), 6.37407828665724, 8)
 
     def test_KUKSpU_linear_response(self):
         cell = pgto.Cell()

From f0ffcc331f97b6714a497b4788a1a4f46fb63bed Mon Sep 17 00:00:00 2001
From: Qiming Sun <qiming.sun@bytedance.com>
Date: Fri, 1 Aug 2025 04:04:23 +0800
Subject: [PATCH 9/9] Update pcm tests

---
 gpu4pyscf/solvent/tests/test_pcm_hessian.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
index 6e19ec96..7e2f11f7 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
@@ -226,7 +226,7 @@ def test_grad_vmat_cpcm(self):
         test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
         ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
 
-        cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
+        assert abs(ref_grad_vmat - test_grad_vmat).max() < 1e-9
 
     def test_grad_vmat_iefpcm(self):
         print("testing IEF-PCM dV_solv/dx")
@@ -240,7 +240,7 @@ def test_grad_vmat_iefpcm(self):
         test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
         ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
 
-        cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
+        assert abs(ref_grad_vmat - test_grad_vmat).max() < 1e-9
 
     def test_grad_vmat_ssvpe(self):
         print("testing SS(V)PE dV_solv/dx")
@@ -254,7 +254,7 @@ def test_grad_vmat_ssvpe(self):
         test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
         ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
 
-        cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
+        assert abs(ref_grad_vmat - test_grad_vmat).max() < 1e-9
 
     def test_hess_nuc_iefpcm(self):
         print("testing IEF-PCM d2E_nuc/dx2")
@@ -266,7 +266,7 @@ def test_hess_nuc_iefpcm(self):
         from gpu4pyscf.solvent.grad.pcm import grad_nuc
         ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_nuc)
 
-        cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
+        assert abs(ref_grad_vmat - test_grad_vmat).max() < 1e-9
 
     def test_hess_qv_iefpcm(self):
         print("testing IEF-PCM d2E_elec/dx2")
@@ -278,7 +278,7 @@ def test_hess_qv_iefpcm(self):
         from gpu4pyscf.solvent.grad.pcm import grad_qv
         ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_qv)
 
-        cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
+        assert abs(ref_grad_vmat - test_grad_vmat).max() < 1e-9
 
     def test_hess_solver_cpcm(self):
         print("testing C-PCM d2E_KR/dx2")
@@ -290,7 +290,7 @@ def test_hess_solver_cpcm(self):
         from gpu4pyscf.solvent.grad.pcm import grad_solver
         ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver)
 
-        cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
+        assert abs(ref_grad_vmat - test_grad_vmat).max() < 1e-9
 
     def test_hess_solver_iefpcm(self):
         print("testing IEF-PCM d2E_KR/dx2")
@@ -302,7 +302,7 @@ def test_hess_solver_iefpcm(self):
         from gpu4pyscf.solvent.grad.pcm import grad_solver
         ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver)
 
-        cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
+        assert abs(ref_grad_vmat - test_grad_vmat).max() < 1e-9
 
     def test_hess_solver_ssvpe(self):
         print("testing SS(V)PE d2E_KR/dx2")
@@ -314,7 +314,7 @@ def test_hess_solver_ssvpe(self):
         from gpu4pyscf.solvent.grad.pcm import grad_solver
         ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver)
 
-        cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
+        assert abs(ref_grad_vmat - test_grad_vmat).max() < 1e-9
 
     @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
     def test_to_gpu(self):