pyscf · wxj6000 · Feb 1, 2025 · Feb 1, 2025 · Feb 2, 2025 · Feb 2, 2025
diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
@@ -32,7 +32,7 @@
 # Use smaller blksize for old gaming GPUs
 if props['totalGlobalMem'] < 16 * GB:
     min_ao_blksize = 64
-    min_grid_blksize = 64*64
+    min_grid_blksize = 128*128
 
 # Use 90% of the global memory for CuPy memory pool
 mem_fraction = 0.9

diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
@@ -20,7 +20,7 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import lib
 from pyscf.df import df, addons, incore
-from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, 
+from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem,
                                        cart2sph, p2p_transfer, copy_array)
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
@@ -269,7 +269,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
 
     return _cderi
 
-def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, 
+def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
                 omega=None, sr_only=False, device_id=0):
     ''' Execute CDERI tasks on one device
     '''
@@ -362,5 +362,5 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
                         _cderi[dev_id][:,ij0:ij1] = tmp
             else:
                 _cderi[0][:,ij0:ij1] = cderi_block
-            t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)    
+            t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
     return
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
@@ -92,7 +92,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     orbo = intopt.sort_orbitals(orbo, axis=[0])
 
     rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k)
-    
+
     # (d/dX P|Q) contributions
     if omega and omega > 1e-10:
         with auxmol.with_range_coulomb(omega):
@@ -151,7 +151,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         cart2sph = intopt.cart2sph
         orbo_cart = cart2sph @ orbo
         dm_cart = cart2sph @ dm @ cart2sph.T
-        
+
     with_df._cderi = None # release GPU memory
     vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
                                         with_j=with_j, with_k=with_k, omega=omega)

diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py
@@ -27,7 +27,7 @@
 FREE_CUPY_CACHE = True
 BINSIZE = 128
 
-def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, 
+def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
            omega=None, mo_coeff=None, mo_occ=None, dm2 = None):
     '''
     Computes the first-order derivatives of the energy contributions from
@@ -143,11 +143,11 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
 
     nao_cart = intopt._sorted_mol.nao
     block_size = with_df.get_blksize(nao=nao_cart)
-    
+
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
     intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False,
                  group_size_aux=block_size)#, group_size=block_size)
-    
+
     if not mol.cart:
         # sph2cart for ao
         cart2sph = intopt.cart2sph
@@ -168,7 +168,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
     with_df._cderi = None  # release GPU memory
     vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
                                         with_j=with_j, with_k=with_k, omega=omega)
-    
+
     # NOTE: vj and vk are still in cartesian
     _sorted_mol = intopt._sorted_mol
     natm = _sorted_mol.natm