From f2e91bd5919cb4529170aeef41a4e48e71ec6869 Mon Sep 17 00:00:00 2001
From: Qianrui Liu <76200646+Qianruipku@users.noreply.github.com>
Date: Fri, 15 Nov 2024 09:58:18 +0800
Subject: [PATCH] Feature: make force and stress of sDFT support GPU (#5487)

* refactor force in sdft

* refactor stress in sDFT

* make stress_ekin GPU

* finish sdft GPU

* fix compile

* add annotations

* fix bug of stress and force

* modify
---
 source/Makefile.Objects                       |   2 +
 source/module_base/CMakeLists.txt             |   1 +
 source/module_base/module_device/device.cpp   |  31 +-
 source/module_base/module_device/device.h     |   8 +-
 source/module_base/parallel_common.cpp        |  13 +-
 source/module_base/parallel_device.cpp        |  38 ++
 source/module_base/parallel_device.h          |  45 +-
 source/module_elecstate/elecstate_pw_sdft.cpp |   9 +-
 .../module_charge/charge_extra.cpp            |  17 +-
 source/module_esolver/esolver_ks_pw.cpp       |   1 -
 source/module_esolver/esolver_sdft_pw.cpp     |  34 +-
 .../hamilt_pwdft/CMakeLists.txt               |   1 +
 .../module_hamilt_pw/hamilt_pwdft/forces.cpp  |   3 +-
 source/module_hamilt_pw/hamilt_pwdft/forces.h |   3 +-
 .../hamilt_pwdft/forces_nl.cpp                |  13 +-
 .../hamilt_pwdft/fs_kin_tools.cpp             | 160 ++++++
 .../hamilt_pwdft/fs_kin_tools.h               |  72 +++
 .../hamilt_pwdft/fs_nonlocal_tools.cpp        | 340 ++++++++-----
 .../hamilt_pwdft/fs_nonlocal_tools.h          |  96 +++-
 .../hamilt_pwdft/kernels/cuda/force_op.cu     |  62 ++-
 .../hamilt_pwdft/kernels/cuda/stress_op.cu    |  92 +++-
 .../hamilt_pwdft/kernels/force_op.cpp         |  57 ++-
 .../hamilt_pwdft/kernels/force_op.h           |   5 +
 .../hamilt_pwdft/kernels/rocm/force_op.hip.cu |  62 ++-
 .../kernels/rocm/stress_op.hip.cu             |  90 +++-
 .../hamilt_pwdft/kernels/stress_op.cpp        |  78 ++-
 .../hamilt_pwdft/kernels/stress_op.h          |  24 +
 .../kernels/test/force_op_test.cpp            |   2 +
 .../kernels/test/stress_op_test.cpp           |   2 +
 .../hamilt_pwdft/stress_func.h                |   6 +-
 .../hamilt_pwdft/stress_func_kin.cpp          | 168 +------
 .../hamilt_pwdft/stress_func_nl.cpp           |  24 +-
 .../hamilt_pwdft/stress_pw.cpp                |   3 +-
 .../module_hamilt_pw/hamilt_pwdft/stress_pw.h |   1 -
 .../hamilt_stodft/sto_forces.cpp              | 382 +++++----------
 .../hamilt_stodft/sto_forces.h                |  25 +-
 .../hamilt_stodft/sto_iter.cpp                |   8 +
 .../hamilt_stodft/sto_stress_pw.cpp           | 461 ++++--------------
 .../hamilt_stodft/sto_stress_pw.h             |  23 +-
 source/module_hsolver/hsolver_pw_sdft.cpp     |   4 +-
 source/module_io/input_conv.cpp               |   2 +-
 source/module_io/read_input_item_system.cpp   |   6 +
 source/module_psi/psi.cpp                     |   2 +-
 tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT  |  43 ++
 tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT    |   4 +
 tests/integrate/187_PW_MD_SDFT_ALL_GPU/README |  11 +
 tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU   |  19 +
 tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd     |   1 +
 .../187_PW_MD_SDFT_ALL_GPU/result.ref         |   5 +
 tests/integrate/187_PW_SDFT_ALL_GPU/INPUT     |  38 ++
 tests/integrate/187_PW_SDFT_ALL_GPU/KPT       |   4 +
 tests/integrate/187_PW_SDFT_ALL_GPU/README    |  11 +
 tests/integrate/187_PW_SDFT_ALL_GPU/STRU      |  19 +
 tests/integrate/187_PW_SDFT_ALL_GPU/jd        |   1 +
 .../integrate/187_PW_SDFT_ALL_GPU/result.ref  |   5 +
 tests/integrate/187_PW_SDFT_MALL_GPU/INPUT    |  38 ++
 tests/integrate/187_PW_SDFT_MALL_GPU/KPT      |   4 +
 tests/integrate/187_PW_SDFT_MALL_GPU/README   |  11 +
 tests/integrate/187_PW_SDFT_MALL_GPU/STRU     |  19 +
 tests/integrate/187_PW_SDFT_MALL_GPU/jd       |   1 +
 .../integrate/187_PW_SDFT_MALL_GPU/result.ref |   5 +
 tests/integrate/CASES_GPU.txt                 |   3 +
 62 files changed, 1645 insertions(+), 1073 deletions(-)
 create mode 100644 source/module_base/parallel_device.cpp
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h
 create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT
 create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT
 create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/README
 create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU
 create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd
 create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/result.ref
 create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/INPUT
 create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/KPT
 create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/README
 create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/STRU
 create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/jd
 create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/result.ref
 create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/INPUT
 create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/KPT
 create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/README
 create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/STRU
 create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/jd
 create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/result.ref

diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 78e3080356..430e7b05c9 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -617,6 +617,7 @@ OBJS_PARALLEL=parallel_common.o\
     parallel_grid.o\
     parallel_kpoints.o\
     parallel_reduce.o\
+    parallel_device.o
       
 OBJS_SRCPW=H_Ewald_pw.o\
     dnrm2.o\
@@ -640,6 +641,7 @@ OBJS_SRCPW=H_Ewald_pw.o\
     forces_cc.o\
     forces_scc.o\
     fs_nonlocal_tools.o\
+    fs_kin_tools.o\
     force_op.o\
     stress_op.o\
     wf_op.o\
diff --git a/source/module_base/CMakeLists.txt b/source/module_base/CMakeLists.txt
index 712cb902ba..b5129f0c6f 100644
--- a/source/module_base/CMakeLists.txt
+++ b/source/module_base/CMakeLists.txt
@@ -50,6 +50,7 @@ add_library(
     parallel_global.cpp
     parallel_comm.cpp
     parallel_reduce.cpp
+    parallel_device.cpp
     spherical_bessel_transformer.cpp
     cubic_spline.cpp
     module_mixing/mixing_data.cpp
diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp
index 4897ef1e8a..b20ea9f3ad 100644
--- a/source/module_base/module_device/device.cpp
+++ b/source/module_base/module_device/device.cpp
@@ -191,27 +191,30 @@ else { return "cpu";
 }
 }
 
-int get_device_kpar(const int &kpar) {
+int get_device_kpar(const int& kpar, const int& bndpar)
+{
 #if __MPI && (__CUDA || __ROCM)
-  int temp_nproc;
-  MPI_Comm_size(MPI_COMM_WORLD, &temp_nproc);
-  if (temp_nproc != kpar) {
-    ModuleBase::WARNING("Input_conv",
-                        "None kpar set in INPUT file, auto set kpar value.");
-  }
-  // GlobalV::KPAR = temp_nproc;
-  // band the CPU processor to the devices
-  int node_rank = base_device::information::get_node_rank();
+    int temp_nproc = 0;
+    int new_kpar = kpar;
+    MPI_Comm_size(MPI_COMM_WORLD, &temp_nproc);
+    if (temp_nproc != kpar * bndpar)
+    {
+        new_kpar = temp_nproc / bndpar;
+        ModuleBase::WARNING("Input_conv", "kpar is not compatible with the number of processors, auto set kpar value.");
+    }
+    
+    // get the CPU rank of current node
+    int node_rank = base_device::information::get_node_rank();
 
-  int device_num = -1;
+    int device_num = -1;
 #if defined(__CUDA)
-  cudaGetDeviceCount(&device_num);
-  cudaSetDevice(node_rank % device_num);
+  cudaGetDeviceCount(&device_num); // get the number of GPU devices of current node
+  cudaSetDevice(node_rank % device_num); // band the CPU processor to the devices
 #elif defined(__ROCM)
   hipGetDeviceCount(&device_num);
   hipSetDevice(node_rank % device_num);
 #endif
-  return temp_nproc;
+  return new_kpar;
 #endif
   return kpar;
 }
diff --git a/source/module_base/module_device/device.h b/source/module_base/module_device/device.h
index 6be4952f14..c89d3bc9cd 100644
--- a/source/module_base/module_device/device.h
+++ b/source/module_base/module_device/device.h
@@ -40,7 +40,7 @@ std::string get_device_info(std::string device_flag);
  * @brief Get the device kpar object
  * for module_io GlobalV::KPAR
  */
-int get_device_kpar(const int& kpar);
+int get_device_kpar(const int& kpar, const int& bndpar);
 
 /**
  * @brief Get the device flag object
@@ -50,6 +50,12 @@ std::string get_device_flag(const std::string& device,
                             const std::string& basis_type);
 
 #if __MPI
+/**
+ * @brief Get the rank of current node
+ *        Note that GPU can only be binded with CPU in the same node
+ * 
+ * @return int 
+ */
 int get_node_rank();
 int get_node_rank_with_mpi_shared(const MPI_Comm mpi_comm = MPI_COMM_WORLD);
 int stringCmp(const void* a, const void* b);
diff --git a/source/module_base/parallel_common.cpp b/source/module_base/parallel_common.cpp
index 1bc3a9e627..6f8ce79fbc 100644
--- a/source/module_base/parallel_common.cpp
+++ b/source/module_base/parallel_common.cpp
@@ -11,15 +11,16 @@ void Parallel_Common::bcast_string(std::string& object) // Peize Lin fix bug 201
 {
     int size = object.size();
     MPI_Bcast(&size, 1, MPI_INT, 0, MPI_COMM_WORLD);
-    char* swap = new char[size + 1];
+    
     int my_rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-    if (0 == my_rank)
-        strcpy(swap, object.c_str());
-    MPI_Bcast(swap, size + 1, MPI_CHAR, 0, MPI_COMM_WORLD);
+    
     if (0 != my_rank)
-        object = static_cast<std::string>(swap);
-    delete[] swap;
+    {
+        object.resize(size);
+    }
+
+    MPI_Bcast(&object[0], size, MPI_CHAR, 0, MPI_COMM_WORLD);
     return;
 }
 
diff --git a/source/module_base/parallel_device.cpp b/source/module_base/parallel_device.cpp
new file mode 100644
index 0000000000..269a41821e
--- /dev/null
+++ b/source/module_base/parallel_device.cpp
@@ -0,0 +1,38 @@
+#include "parallel_device.h"
+#ifdef __MPI
+namespace Parallel_Common
+{
+void bcast_data(std::complex<double>* object, const int& n, const MPI_Comm& comm)
+{
+    MPI_Bcast(object, n * 2, MPI_DOUBLE, 0, comm);
+}
+void bcast_data(std::complex<float>* object, const int& n, const MPI_Comm& comm)
+{
+    MPI_Bcast(object, n * 2, MPI_FLOAT, 0, comm);
+}
+void bcast_data(double* object, const int& n, const MPI_Comm& comm)
+{
+    MPI_Bcast(object, n, MPI_DOUBLE, 0, comm);
+}
+void bcast_data(float* object, const int& n, const MPI_Comm& comm)
+{
+    MPI_Bcast(object, n, MPI_FLOAT, 0, comm);
+}
+void reduce_data(std::complex<double>* object, const int& n, const MPI_Comm& comm)
+{
+    MPI_Allreduce(MPI_IN_PLACE, object, n * 2, MPI_DOUBLE, MPI_SUM, comm);
+}
+void reduce_data(std::complex<float>* object, const int& n, const MPI_Comm& comm)
+{
+    MPI_Allreduce(MPI_IN_PLACE, object, n * 2, MPI_FLOAT, MPI_SUM, comm);
+}
+void reduce_data(double* object, const int& n, const MPI_Comm& comm)
+{
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, comm);
+}
+void reduce_data(float* object, const int& n, const MPI_Comm& comm)
+{
+    MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_FLOAT, MPI_SUM, comm);
+}
+}
+#endif
\ No newline at end of file
diff --git a/source/module_base/parallel_device.h b/source/module_base/parallel_device.h
index 8d867ba4fc..09625f6303 100644
--- a/source/module_base/parallel_device.h
+++ b/source/module_base/parallel_device.h
@@ -1,39 +1,34 @@
+#ifndef __PARALLEL_DEVICE_H__
+#define __PARALLEL_DEVICE_H__
 #ifdef __MPI
 #include "mpi.h"
 #include "module_base/module_device/device.h"
+#include "module_base/module_device/memory_op.h"
 #include <complex>
-#include <string>
-#include <vector>
 namespace Parallel_Common
 {
-void bcast_complex(std::complex<double>* object, const int& n, const MPI_Comm& comm)
-{
-    MPI_Bcast(object, n * 2, MPI_DOUBLE, 0, comm);
-}
-void bcast_complex(std::complex<float>* object, const int& n, const MPI_Comm& comm)
-{
-    MPI_Bcast(object, n * 2, MPI_FLOAT, 0, comm);
-}
-void bcast_real(double* object, const int& n, const MPI_Comm& comm)
-{
-    MPI_Bcast(object, n, MPI_DOUBLE, 0, comm);
-}
-void bcast_real(float* object, const int& n, const MPI_Comm& comm)
-{
-    MPI_Bcast(object, n, MPI_FLOAT, 0, comm);
-}
+void bcast_data(std::complex<double>* object, const int& n, const MPI_Comm& comm);
+void bcast_data(std::complex<float>* object, const int& n, const MPI_Comm& comm);
+void bcast_data(double* object, const int& n, const MPI_Comm& comm);
+void bcast_data(float* object, const int& n, const MPI_Comm& comm);
+void reduce_data(std::complex<double>* object, const int& n, const MPI_Comm& comm);
+void reduce_data(std::complex<float>* object, const int& n, const MPI_Comm& comm);
+void reduce_data(double* object, const int& n, const MPI_Comm& comm);
+void reduce_data(float* object, const int& n, const MPI_Comm& comm);
 
-template <typename T, typename Device>
 /**
- * @brief bcast complex in Device
+ * @brief bcast data in Device
  * 
+ * @tparam T: float, double, std::complex<float>, std::complex<double>
+ * @tparam Device 
  * @param ctx Device ctx
  * @param object complex arrays in Device
  * @param n the size of complex arrays
  * @param comm MPI_Comm
  * @param tmp_space tmp space in CPU
  */
-void bcast_complex(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
+template <typename T, typename Device>
+void bcast_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
 {
     const base_device::DEVICE_CPU* cpu_ctx = {};
     T* object_cpu = nullptr;
@@ -56,7 +51,7 @@ void bcast_complex(const Device* ctx, T* object, const int& n, const MPI_Comm& c
         object_cpu = object;
     }
 
-    bcast_complex(object_cpu, n, comm);
+    bcast_data(object_cpu, n, comm);
 
     if (base_device::get_device_type<Device>(ctx) == base_device::GpuDevice)
     {
@@ -70,7 +65,7 @@ void bcast_complex(const Device* ctx, T* object, const int& n, const MPI_Comm& c
 }
 
 template <typename T, typename Device>
-void bcast_real(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
+void reduce_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
 {
     const base_device::DEVICE_CPU* cpu_ctx = {};
     T* object_cpu = nullptr;
@@ -93,7 +88,7 @@ void bcast_real(const Device* ctx, T* object, const int& n, const MPI_Comm& comm
         object_cpu = object;
     }
 
-    bcast_real(object_cpu, n, comm);
+    reduce_data(object_cpu, n, comm);
 
     if (base_device::get_device_type<Device>(ctx) == base_device::GpuDevice)
     {
@@ -105,7 +100,9 @@ void bcast_real(const Device* ctx, T* object, const int& n, const MPI_Comm& comm
     }
     return;
 }
+
 }
     
 
+#endif
 #endif
\ No newline at end of file
diff --git a/source/module_elecstate/elecstate_pw_sdft.cpp b/source/module_elecstate/elecstate_pw_sdft.cpp
index 9a3be4a42f..ad6f98c3c3 100644
--- a/source/module_elecstate/elecstate_pw_sdft.cpp
+++ b/source/module_elecstate/elecstate_pw_sdft.cpp
@@ -14,14 +14,13 @@ void ElecStatePW_SDFT<T, Device>::psiToRho(const psi::Psi<T, Device>& psi)
     ModuleBase::TITLE(this->classname, "psiToRho");
     ModuleBase::timer::tick(this->classname, "psiToRho");
     const int nspin = PARAM.inp.nspin;
+    for (int is = 0; is < nspin; is++)
+    {
+        setmem_var_op()(this->ctx, this->rho[is], 0, this->charge->nrxx);
+    }
 
     if (GlobalV::MY_STOGROUP == 0)
     {
-        for (int is = 0; is < nspin; is++)
-        {
-            setmem_var_op()(this->ctx, this->rho[is], 0, this->charge->nrxx);
-        }
-
         for (int ik = 0; ik < psi.get_nk(); ++ik)
         {
             psi.fix_k(ik);
diff --git a/source/module_elecstate/module_charge/charge_extra.cpp b/source/module_elecstate/module_charge/charge_extra.cpp
index fc40a2d494..d0e0ff1299 100644
--- a/source/module_elecstate/module_charge/charge_extra.cpp
+++ b/source/module_elecstate/module_charge/charge_extra.cpp
@@ -47,9 +47,20 @@ void Charge_Extra::Init_CE(const int& nspin, const int& natom, const int& nrxx,
 
     if (pot_order > 0)
     {
-        delta_rho1.resize(this->nspin, std::vector<double>(nrxx, 0.0));
-        delta_rho2.resize(this->nspin, std::vector<double>(nrxx, 0.0));
-        delta_rho3.resize(this->nspin, std::vector<double>(nrxx, 0.0));
+        // delta_rho1.resize(this->nspin, std::vector<double>(nrxx, 0.0));
+        // delta_rho2.resize(this->nspin, std::vector<double>(nrxx, 0.0));
+        // delta_rho3.resize(this->nspin, std::vector<double>(nrxx, 0.0));
+        // qianrui replace the above code with the following code.
+        // The above code cannot passed valgrind tests, which has an invalid read of size 32.
+        delta_rho1.resize(this->nspin);
+        delta_rho2.resize(this->nspin);
+        delta_rho3.resize(this->nspin);
+        for (int is = 0; is < this->nspin; is++)
+        {
+            delta_rho1[is].resize(nrxx, 0.0);
+            delta_rho2[is].resize(nrxx, 0.0);
+            delta_rho3[is].resize(nrxx, 0.0);
+        }
     }
 
     if(pot_order == 3)
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 8af664799d..f6a4e335dc 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -623,7 +623,6 @@ void ESolver_KS_PW<T, Device>::cal_stress(ModuleBase::matrix& stress)
                   &this->sf,
                   &this->kv,
                   this->pw_wfc,
-                  this->psi,
                   this->__kspw_psi);
 
     // external stress
diff --git a/source/module_esolver/esolver_sdft_pw.cpp b/source/module_esolver/esolver_sdft_pw.cpp
index 8ee6098fe2..19ec08137e 100644
--- a/source/module_esolver/esolver_sdft_pw.cpp
+++ b/source/module_esolver/esolver_sdft_pw.cpp
@@ -169,6 +169,9 @@ void ESolver_SDFT_PW<T, Device>::after_scf(const int istep)
 template <typename T, typename Device>
 void ESolver_SDFT_PW<T, Device>::hamilt2density_single(int istep, int iter, double ethr)
 {
+    ModuleBase::TITLE("ESolver_SDFT_PW", "hamilt2density");
+    ModuleBase::timer::tick("ESolver_SDFT_PW", "hamilt2density");
+
     // reset energy
     this->pelec->f_en.eband = 0.0;
     this->pelec->f_en.demet = 0.0;
@@ -241,6 +244,7 @@ void ESolver_SDFT_PW<T, Device>::hamilt2density_single(int istep, int iter, doub
 #ifdef __MPI
     MPI_Bcast(&(this->pelec->f_en.deband), 1, MPI_DOUBLE, 0, PARAPW_WORLD);
 #endif
+    ModuleBase::timer::tick("ESolver_SDFT_PW", "hamilt2density");
 }
 
 template <typename T, typename Device>
@@ -249,10 +253,10 @@ double ESolver_SDFT_PW<T, Device>::cal_energy()
     return this->pelec->f_en.etot;
 }
 
-template <>
-void ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_CPU>::cal_force(ModuleBase::matrix& force)
+template <typename T, typename Device>
+void ESolver_SDFT_PW<T, Device>::cal_force(ModuleBase::matrix& force)
 {
-    Sto_Forces ff(GlobalC::ucell.nat);
+    Sto_Forces<double, Device> ff(GlobalC::ucell.nat);
 
     ff.cal_stoforce(force,
                     *this->pelec,
@@ -261,20 +265,16 @@ void ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_CPU>::cal_force(M
                     &this->sf,
                     &this->kv,
                     this->pw_wfc,
-                    this->psi,
+                    GlobalC::ppcell,
+                    GlobalC::ucell,
+                    *this->kspw_psi,
                     this->stowf);
 }
 
-template <>
-void ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_GPU>::cal_force(ModuleBase::matrix& force)
-{
-    ModuleBase::WARNING_QUIT("ESolver_SDFT_PW<T, Device>::cal_force", "DEVICE_GPU is not supported");
-}
-
-template <>
-void ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_CPU>::cal_stress(ModuleBase::matrix& stress)
+template <typename T, typename Device>
+void ESolver_SDFT_PW<T, Device>::cal_stress(ModuleBase::matrix& stress)
 {
-    Sto_Stress_PW ss;
+    Sto_Stress_PW<double, Device> ss;
     ss.cal_stress(stress,
                   *this->pelec,
                   this->pw_rho,
@@ -282,19 +282,13 @@ void ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_CPU>::cal_stress(
                   &this->sf,
                   &this->kv,
                   this->pw_wfc,
-                  this->psi,
+                  *this->kspw_psi,
                   this->stowf,
                   this->pelec->charge,
                   &GlobalC::ppcell,
                   GlobalC::ucell);
 }
 
-template <>
-void ESolver_SDFT_PW<std::complex<double>, base_device::DEVICE_GPU>::cal_stress(ModuleBase::matrix& stress)
-{
-    ModuleBase::WARNING_QUIT("ESolver_SDFT_PW<T, Device>::cal_stress", "DEVICE_GPU is not supported");
-}
-
 template <typename T, typename Device>
 void ESolver_SDFT_PW<T, Device>::after_all_runners()
 {
diff --git a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt
index 2cb1ae5247..95636d8959 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt
+++ b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt
@@ -36,6 +36,7 @@ list(APPEND objects
     parallel_grid.cpp
     elecond.cpp
     fs_nonlocal_tools.cpp
+    fs_kin_tools.cpp
     radial_proj.cpp
 )
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index 589fc71009..41fb91da6f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -33,6 +33,7 @@ void Forces<FPTYPE, Device>::cal_force(ModuleBase::matrix& force,
                                        ModulePW::PW_Basis_K* wfc_basis,
                                        const psi::Psi<std::complex<FPTYPE>, Device>* psi_in)
 {
+    ModuleBase::timer::tick("Forces", "cal_force");
     ModuleBase::TITLE("Forces", "init");
     this->device = base_device::get_device_type<Device>(this->ctx);
     const ModuleBase::matrix& wg = elec.wg;
@@ -455,7 +456,7 @@ void Forces<FPTYPE, Device>::cal_force(ModuleBase::matrix& force,
         }
     }
     ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "TOTAL-FORCE (eV/Angstrom)", force, false);
-
+    ModuleBase::timer::tick("Forces", "cal_force");
     return;
 }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h
index 0247fbfae7..712cceac66 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h
@@ -96,10 +96,11 @@ class Forces
                      FPTYPE* drhocg,
                      ModulePW::PW_Basis* rho_basis,
                      const UnitCell& ucell_in); // used in nonlinear core correction stress
-  private:
+  protected:
     Device* ctx = {};
     base_device::DEVICE_CPU* cpu_ctx = {};
     base_device::AbacusDevice_t device = {};
+  private:
     using gemm_op = hsolver::gemm_op<std::complex<FPTYPE>, Device>;
 
     using resmem_complex_op = base_device::memory::resize_memory_op<std::complex<FPTYPE>, Device>;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
index 92733745b4..a8fc6976c6 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
@@ -30,9 +30,10 @@ void Forces<FPTYPE, Device>::cal_force_nl(ModuleBase::matrix& forcenl,
     resmem_var_op()(this->ctx, force, ucell_in.nat * 3);
     base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, force, 0.0, ucell_in.nat * 3);
 
-    hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(nlpp_in, &ucell_in, psi_in, p_kv, wfc_basis, p_sf, wg, ekb);
+    hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(nlpp_in, &ucell_in, p_kv, wfc_basis, p_sf, wg, &ekb);
 
     const int nks = wfc_basis->nks;
+    const int max_nbands = wg.nc;
     for (int ik = 0; ik < nks; ik++) // loop k points
     {
         // skip zero weights to speed up
@@ -46,15 +47,19 @@ void Forces<FPTYPE, Device>::cal_force_nl(ModuleBase::matrix& forcenl,
             }
         }
         const int npm = nbands_occ;
+        nl_tools.cal_vkb(ik, max_nbands);
         // calculate becp = <psi|beta> for all beta functions
-        nl_tools.cal_becp(ik, npm);
+        nl_tools.cal_becp(ik, npm, &psi_in[0](ik,0,0));
+        nl_tools.reduce_pool_becp(max_nbands);
         for (int ipol = 0; ipol < 3; ipol++)
         {
+            nl_tools.cal_vkb_deri_f(ik, max_nbands, ipol);
             // calculate dbecp = <psi|\nabla beta> for all beta functions
-            nl_tools.cal_dbecp_f(ik, npm, ipol);
+            nl_tools.cal_dbecp_f(ik, max_nbands, npm, ipol, &psi_in[0](ik,0,0));
+            nl_tools.revert_vkb(ik, ipol);
         }
         // calculate the force_i = \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_i
-        nl_tools.cal_force(ik, npm, force);
+        nl_tools.cal_force(ik, max_nbands, npm, true, force);
     } // end ik
 
     syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcenl.c, force, forcenl.nr * forcenl.nc);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp
new file mode 100644
index 0000000000..89efb3f879
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp
@@ -0,0 +1,160 @@
+#include "fs_kin_tools.h"
+
+#include "module_base/parallel_reduce.h"
+namespace hamilt
+{
+template <typename FPTYPE, typename Device>
+FS_Kin_tools<FPTYPE, Device>::FS_Kin_tools(const UnitCell& ucell_in,
+                                           const K_Vectors* p_kv,
+                                           const ModulePW::PW_Basis_K* wfc_basis_in,
+                                           const ModuleBase::matrix& wg)
+    : ucell_(ucell_in), nksbands_(wg.nc), wg(wg.c), wk(p_kv->wk.data())
+{
+    this->device = base_device::get_device_type<Device>(this->ctx);
+    this->wfc_basis_ = wfc_basis_in;
+    const int npwk_max = this->wfc_basis_->npwk_max;
+    const int nks = this->wfc_basis_->nks;
+    const int npol = ucell_in.get_npol();
+
+    this->gk3_.resize(3 * npwk_max);
+    this->gk.resize(3);
+    for (int i = 0; i < 3; ++i)
+    {
+        this->gk[i] = &this->gk3_[i * npwk_max];
+    }
+    this->kfac.resize(npwk_max);
+    this->s_kin.resize(9, 0.0);
+
+    if (this->device == base_device::GpuDevice)
+    {
+        resmem_var_op()(this->ctx, d_gk, 3 * npwk_max);
+        resmem_var_op()(this->ctx, d_kfac, npwk_max);
+    }
+    else
+    {
+        d_gk = gk3_.data();
+        d_kfac = kfac.data();
+    }
+}
+
+template <typename FPTYPE, typename Device>
+FS_Kin_tools<FPTYPE, Device>::~FS_Kin_tools()
+{
+    if (this->device == base_device::GpuDevice)
+    {
+        delmem_var_op()(this->ctx, d_gk);
+        delmem_var_op()(this->ctx, d_kfac);
+    }
+}
+
+template <typename FPTYPE, typename Device>
+void FS_Kin_tools<FPTYPE, Device>::cal_gk(const int& ik)
+{
+    const double tpiba = ModuleBase::TWO_PI / this->ucell_.lat0;
+    const double twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI);
+    const int npw = wfc_basis_->npwk[ik];
+    const int npwk_max = wfc_basis_->npwk_max;
+    for (int i = 0; i < npw; ++i)
+    {
+        gk[0][i] = wfc_basis_->getgpluskcar(ik, i)[0] * tpiba;
+        gk[1][i] = wfc_basis_->getgpluskcar(ik, i)[1] * tpiba;
+        gk[2][i] = wfc_basis_->getgpluskcar(ik, i)[2] * tpiba;
+        if (wfc_basis_->erf_height > 0)
+        {
+            double gk2 = gk[0][i] * gk[0][i] + gk[1][i] * gk[1][i] + gk[2][i] * gk[2][i];
+            double arg = (gk2 - wfc_basis_->erf_ecut) / wfc_basis_->erf_sigma;
+            kfac[i] = 1.0 + wfc_basis_->erf_height / wfc_basis_->erf_sigma * twobysqrtpi * std::exp(-arg * arg);
+        }
+        else
+        {
+            kfac[i] = 1.0;
+        }
+    }
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gk, gk[0], 3 * npwk_max);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_kfac, kfac.data(), npwk_max);
+    }
+}
+
+template <typename FPTYPE, typename Device>
+void FS_Kin_tools<FPTYPE, Device>::cal_stress_kin(const int& ik,
+                                                  const int& npm,
+                                                  const bool& occ,
+                                                  const std::complex<FPTYPE>* psi)
+{
+    if (npm == 0)
+    {
+        return;
+    }
+    const int npw = wfc_basis_->npwk[ik];
+    const int npwk_max = wfc_basis_->npwk_max;
+    const int npol = this->ucell_.get_npol();
+    for (int ib = 0; ib < npm; ib++)
+    {
+        const std::complex<FPTYPE>* ppsi = psi + ib * npwk_max * npol;
+        const std::complex<FPTYPE>* ppsi2 = ppsi + npwk_max;
+        FPTYPE fac = 0.0;
+        if (occ)
+        {
+            fac = wg[ik * this->nksbands_ + ib];
+            if (fac == 0.0)
+            {
+                continue;
+            }
+        }
+        else
+        {
+            fac = wk[ik];
+        }
+        for (int l = 0; l < 3; l++)
+        {
+            const FPTYPE* d_gkl = d_gk + l * npwk_max;
+            for (int m = 0; m < l + 1; m++)
+            {
+                const FPTYPE* d_gkm = d_gk + m * npwk_max;
+                FPTYPE sum = 0;
+                sum += cal_multi_dot_op()(npw, fac, d_gkl, d_gkm, d_kfac, ppsi);
+                if (npol == 2)
+                {
+                    sum += cal_multi_dot_op()(npw, fac, d_gkl, d_gkm, d_kfac, ppsi2);
+                }
+                s_kin[l * 3 + m] += sum;
+            }
+        }
+    }
+}
+
+template <typename FPTYPE, typename Device>
+void FS_Kin_tools<FPTYPE, Device>::symmetrize_stress(ModuleSymmetry::Symmetry* p_symm, ModuleBase::matrix& sigma)
+{
+    for (int l = 0; l < 3; ++l)
+    {
+        for (int m = 0; m < l; ++m)
+        {
+            s_kin[m * 3 + l] = s_kin[l * 3 + m];
+        }
+    }
+
+    Parallel_Reduce::reduce_all(s_kin.data(), 9);
+
+    for (int l = 0; l < 3; ++l)
+    {
+        for (int m = 0; m < 3; ++m)
+        {
+            sigma(l, m) = s_kin[l * 3 + m] * ModuleBase::e2 / this->ucell_.omega;
+        }
+    }
+    // do symmetry
+    if (ModuleSymmetry::Symmetry::symm_flag == 1)
+    {
+        p_symm->symmetrize_mat3(sigma, this->ucell_.lat);
+    }
+}
+
+template class FS_Kin_tools<double, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class FS_Kin_tools<double, base_device::DEVICE_GPU>;
+#endif
+
+} // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h
new file mode 100644
index 0000000000..b6327cf47f
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h
@@ -0,0 +1,72 @@
+#ifndef FS_KIN_TOOLS_H
+#define FS_KIN_TOOLS_H
+#include "module_base/module_device/device.h"
+#include "module_basis/module_pw/pw_basis_k.h"
+#include "module_cell/klist.h"
+#include "module_cell/unitcell.h"
+#include "module_cell/module_symmetry/symmetry.h"
+#include "module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h"
+
+#include <complex>
+namespace hamilt
+{
+template <typename FPTYPE, typename Device>
+class FS_Kin_tools
+{
+  public:
+    FS_Kin_tools(const UnitCell& ucell_in,
+                 const K_Vectors* kv_in,
+                 const ModulePW::PW_Basis_K* wfc_basis_in,
+                 const ModuleBase::matrix& wg);
+    ~FS_Kin_tools();
+
+    /**
+     * @brief calculate G+k and store it in gk and also calculate kfac
+     */
+    void cal_gk(const int& ik);
+
+    /**
+     * @brief calculate stress tensor for kinetic energy
+     *        stress = \sum_{G,k,i}  wk(k) * gk_l(G) * gk_m(G) * d_kfac(G) * occ_i*|ppsi_i(G)|^2
+     * 
+     * @param ik k-point index
+     * @param npm number of bands
+     * @param occ if use the occupation of the bands
+     * @param psi wavefunctions
+     */
+    void cal_stress_kin(const int& ik, const int& npm, const bool& occ, const std::complex<FPTYPE>* psi);
+
+    /**
+     * @brief symmetrize the stress tensor
+     */
+    void symmetrize_stress(ModuleSymmetry::Symmetry* p_symm, ModuleBase::matrix& sigma);
+
+  protected:
+    Device* ctx = {};
+    base_device::DEVICE_CPU* cpu_ctx = {};
+    base_device::AbacusDevice_t device = {};
+    std::vector<FPTYPE> gk3_;
+    std::vector<FPTYPE*> gk;
+    std::vector<FPTYPE> kfac;
+    std::vector<FPTYPE> s_kin;
+    FPTYPE* d_gk = nullptr;
+    FPTYPE* d_kfac = nullptr;
+    const FPTYPE* wg = nullptr;
+    const FPTYPE* wk = nullptr;
+    const ModulePW::PW_Basis_K* wfc_basis_ = nullptr;
+    const UnitCell& ucell_;
+    const int nksbands_;
+
+
+  private:
+    using resmem_var_op = base_device::memory::resize_memory_op<FPTYPE, Device>;
+    using setmem_var_op = base_device::memory::set_memory_op<FPTYPE, Device>;
+    using delmem_var_op = base_device::memory::delete_memory_op<FPTYPE, Device>;
+    using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<FPTYPE, Device, base_device::DEVICE_CPU>;
+    using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, Device>;
+    using cal_multi_dot_op = hamilt::cal_multi_dot_op<FPTYPE, Device>;
+};
+
+} // namespace hamilt
+
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index 4dbdbc558f..b219678f4c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -1,12 +1,13 @@
 #include "fs_nonlocal_tools.h"
 
 #include "module_base/math_polyint.h"
-#include "module_parameter/parameter.h"
 #include "module_base/math_ylmreal.h"
 #include "module_base/memory.h"
+#include "module_base/parallel_device.h"
 #include "module_base/timer.h"
 #include "module_base/tool_title.h"
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include "module_parameter/parameter.h"
 #include "nonlocal_maths.hpp"
 
 namespace hamilt
@@ -15,27 +16,26 @@ namespace hamilt
 template <typename FPTYPE, typename Device>
 FS_Nonlocal_tools<FPTYPE, Device>::FS_Nonlocal_tools(const pseudopot_cell_vnl* nlpp_in,
                                                      const UnitCell* ucell_in,
-                                                     const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
                                                      const K_Vectors* kv_in,
                                                      const ModulePW::PW_Basis_K* wfc_basis_in,
                                                      const Structure_Factor* sf_in,
                                                      const ModuleBase::matrix& wg,
-                                                     const ModuleBase::matrix& ekb)
-    : nlpp_(nlpp_in), ucell_(ucell_in), psi_(psi_in), kv_(kv_in), wfc_basis_(wfc_basis_in), sf_(sf_in)
+                                                     const ModuleBase::matrix* p_ekb)
+    : nlpp_(nlpp_in), ucell_(ucell_in), kv_(kv_in), wfc_basis_(wfc_basis_in), sf_(sf_in)
 {
     // get the device context
     this->device = base_device::get_device_type<Device>(this->ctx);
     this->nkb = nlpp_->nkb;
-    this->nbands = psi_->get_nbands();
     this->max_npw = wfc_basis_->npwk_max;
     this->ntype = ucell_->ntype;
+    this->nbands = wg.nc;
 
     // There is a contribution for jh<>ih in US case or multi projectors case
     // Actually, the judge of nondiagonal should be done on every atom type
     this->nondiagonal = (PARAM.globalv.use_uspp || this->nlpp_->multi_proj) ? true : false;
 
     // allocate memory
-    this->allocate_memory(wg, ekb);
+    this->allocate_memory(wg, p_ekb);
 }
 
 template <typename FPTYPE, typename Device>
@@ -46,7 +46,7 @@ FS_Nonlocal_tools<FPTYPE, Device>::~FS_Nonlocal_tools()
 }
 
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix& wg, const ModuleBase::matrix& ekb)
+void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix& wg, const ModuleBase::matrix* p_ekb)
 {
     // allocate memory
 
@@ -81,13 +81,19 @@ void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix
     const int _lmax = this->nlpp_->lmaxkb;
     resmem_var_op()(this->ctx, this->hd_ylm, (_lmax + 1) * (_lmax + 1) * max_npw);
     resmem_var_op()(this->ctx, this->hd_ylm_deri, 3 * (_lmax + 1) * (_lmax + 1) * max_npw);
+    const int nks = this->kv_->get_nks();
+    resmem_var_op()(this->ctx, d_wk, nks);
+    syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wk, this->kv_->wk.data(), nks);
 
     if (this->device == base_device::GpuDevice)
     {
         resmem_var_op()(this->ctx, d_wg, wg.nr * wg.nc);
-        resmem_var_op()(this->ctx, d_ekb, ekb.nr * ekb.nc);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, wg.c, wg.nr * wg.nc);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, ekb.c, ekb.nr * ekb.nc);
+        if (p_ekb != nullptr)
+        {
+            resmem_var_op()(this->ctx, d_ekb, p_ekb->nr * p_ekb->nc);
+            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, p_ekb->c, p_ekb->nr * p_ekb->nc);
+        }
         resmem_int_op()(this->ctx, atom_nh, this->ntype);
         resmem_int_op()(this->ctx, atom_na, this->ntype);
         syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_nh, h_atom_nh.data(), this->ntype);
@@ -103,7 +109,10 @@ void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix
     else
     {
         this->d_wg = wg.c;
-        this->d_ekb = ekb.c;
+        if (p_ekb != nullptr)
+        {
+            this->d_ekb = p_ekb->c;
+        }
         this->atom_nh = h_atom_nh.data();
         this->atom_na = h_atom_na.data();
         this->ppcell_vkb = this->nlpp_->vkb.c;
@@ -119,6 +128,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::delete_memory()
     delmem_var_op()(this->ctx, hd_vq_deri);
     delmem_var_op()(this->ctx, hd_ylm);
     delmem_var_op()(this->ctx, hd_ylm_deri);
+    delmem_var_op()(this->ctx, d_wk);
 
     // delete memory on GPU
     if (this->device == base_device::GpuDevice)
@@ -151,15 +161,13 @@ void FS_Nonlocal_tools<FPTYPE, Device>::delete_memory()
     }
 }
 
-// cal_becp
+// cal_vkb
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb(const int& ik, const int& nbdall)
 {
-    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp");
-    ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
+    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_vkb");
     const int npol = this->ucell_->get_npol();
-    const int size_becp = this->nbands * npol * this->nkb;
-    const int size_becp_act = npm * npol * this->nkb;
+    const int size_becp = nbdall * npol * this->nkb;
     if (this->becp == nullptr)
     {
         resmem_complex_op()(this->ctx, becp, size_becp);
@@ -167,8 +175,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
 
     // prepare math tools
     Nonlocal_maths<FPTYPE, Device> maths(this->nlpp_, this->ucell_);
-
-    const std::complex<FPTYPE>* ppsi = &(this->psi_[0](ik, 0, 0));
     const int npw = this->wfc_basis_->npwk[ik];
 
     std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
@@ -244,50 +250,66 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
             d_sk += npw;
         }
     }
+}
+
+// cal_becp
+template <typename FPTYPE, typename Device>
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(const int& ik,
+                                                 const int& npm,
+                                                 const std::complex<FPTYPE>* ppsi,
+                                                 const int& nbd0)
+{
+    if (npm == 0)
+    {
+        return;
+    }
+    const int npol = this->ucell_->get_npol();
+    const int npw = this->wfc_basis_->npwk[ik];
     const char transa = 'C';
     const char transb = 'N';
-    int npm_npol = npm * npol;
+    const int npm_npol = npm * npol;
+    const int index0 = nbd0 * npol * nkb;
     gemm_op()(this->ctx,
               transa,
               transb,
-              nkb,
+              this->nkb,
               npm_npol,
               npw,
               &ModuleBase::ONE,
-              ppcell_vkb,
+              this->ppcell_vkb,
               npw,
               ppsi,
               this->max_npw,
               &ModuleBase::ZERO,
-              becp,
-              nkb);
+              this->becp + index0,
+              this->nkb);
+    ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
+}
 
+template <typename FPTYPE, typename Device>
+void FS_Nonlocal_tools<FPTYPE, Device>::reduce_pool_becp(const int& npm)
+{
+    const int npol = this->ucell_->get_npol();
+    const int size_becp_act = npm * npol * this->nkb;
     // becp calculate is over , now we should broadcast this data.
-    if (this->device == base_device::GpuDevice)
+#ifdef __MPI
+    if (GlobalV::NPROC_IN_POOL > 1)
     {
-        std::complex<FPTYPE>* h_becp = nullptr;
-        resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act);
-        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act);
-        Parallel_Reduce::reduce_pool(h_becp, size_becp_act);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act);
-        delmem_complex_h_op()(this->cpu_ctx, h_becp);
+        Parallel_Common::reduce_dev(this->ctx, this->becp, size_becp_act, POOL_WORLD);
     }
-    else
-    {
-        Parallel_Reduce::reduce_pool(becp, size_becp_act);
-    }
-    ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
+#endif
 }
 
 // cal_dbecp
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, int jpol, FPTYPE* stress)
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb_deri_s(const int& ik,
+                                                       const int& nbdall,
+                                                       const int& ipol,
+                                                       const int& jpol)
 {
-    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
-    ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
+    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_vkb_deri_s");
     const int npol = this->ucell_->get_npol();
-    const int size_becp = this->nbands * npol * this->nkb;
-    const int npm_npol = npm * npol;
+    const int size_becp = nbdall * npol * this->nkb;
     if (this->dbecp == nullptr)
     {
         resmem_complex_op()(this->ctx, dbecp, size_becp);
@@ -296,7 +318,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
     // prepare math tools
     Nonlocal_maths<FPTYPE, Device> maths(this->nlpp_, this->ucell_);
 
-    const std::complex<FPTYPE>* ppsi = &(this->psi_[0](ik, 0, 0));
     const int npw = this->wfc_basis_->npwk[ik];
     std::complex<FPTYPE>* vkb_deri_ptr = this->ppcell_vkb;
 
@@ -394,51 +415,99 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
             vkb_deri_ptr += nh * npw;
         }
     }
+}
+
+// cal_dbecp
+template <typename FPTYPE, typename Device>
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(const int& ik,
+                                                    const int& npm,
+                                                    const std::complex<FPTYPE>* ppsi,
+                                                    const int& nbd0)
+{
+    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
+    const int npol = this->ucell_->get_npol();
+    const int npm_npol = npm * npol;
+    const int npw = this->wfc_basis_->npwk[ik];
+    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + nbd0 * npol * this->nkb;
+
     // 2.b calculate dbecp = dbecp_noevc * psi
     const char transa = 'C';
     const char transb = 'N';
-
     gemm_op()(this->ctx,
               transa,
               transb,
-              nkb,
+              this->nkb,
               npm_npol,
               npw,
               &ModuleBase::ONE,
-              ppcell_vkb,
+              this->ppcell_vkb,
               npw,
               ppsi,
               this->max_npw,
               &ModuleBase::ZERO,
-              dbecp,
-              nkb);
+              dbecp_ptr,
+              this->nkb);
+}
+
+template <typename FPTYPE, typename Device>
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_stress(const int& ik,
+                                                   const int& npm,
+                                                   const bool& occ,
+                                                   const int& ipol,
+                                                   const int& jpol,
+                                                   FPTYPE* stress,
+                                                   const int& nbd0)
+{
+    const int npol = this->ucell_->get_npol();
+    const int index0 = nbd0 * npol * this->nkb;
     // calculate stress for target (ipol, jpol)
-    if(npol == 1)
+    if (npol == 1)
     {
         const int current_spin = this->kv_->isk[ik];
+        FPTYPE* d_ekb_ik = nullptr;
+        if (d_ekb != nullptr)
+        {
+            d_ekb_ik = d_ekb + this->nbands * ik;
+        }
+        FPTYPE* d_wg_ik = d_wk + ik;
+        if (occ)
+        {
+            d_wg_ik = d_wg + this->nbands * ik;
+        }
         cal_stress_nl_op()(this->ctx,
-                        nondiagonal,
-                        ipol,
-                        jpol,
-                        nkb,
-                        npm,
-                        this->ntype,
-                        current_spin, // uspp only
-                        this->nlpp_->deeq.getBound2(),
-                        this->nlpp_->deeq.getBound3(),
-                        this->nlpp_->deeq.getBound4(),
-                        atom_nh,
-                        atom_na,
-                        d_wg + this->nbands * ik,
-                        d_ekb + this->nbands * ik,
-                        qq_nt,
-                        deeq,
-                        becp,
-                        dbecp,
-                        stress);
+                           nondiagonal,
+                           ipol,
+                           jpol,
+                           nkb,
+                           npm,
+                           this->ntype,
+                           current_spin, // uspp only
+                           this->nlpp_->deeq.getBound2(),
+                           this->nlpp_->deeq.getBound3(),
+                           this->nlpp_->deeq.getBound4(),
+                           atom_nh,
+                           atom_na,
+                           d_wg_ik,
+                           occ,
+                           d_ekb_ik,
+                           qq_nt,
+                           deeq,
+                           becp + index0,
+                           dbecp + index0,
+                           stress);
     }
     else
     {
+        FPTYPE* d_ekb_ik = nullptr;
+        if (d_ekb != nullptr)
+        {
+            d_ekb_ik = d_ekb + this->nbands * ik;
+        }
+        FPTYPE* d_wg_ik = d_wk + ik;
+        if (occ)
+        {
+            d_wg_ik = d_wg + this->nbands * ik;
+        }
         cal_stress_nl_op()(this->ctx,
                            ipol,
                            jpol,
@@ -450,35 +519,31 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
                            this->nlpp_->deeq_nc.getBound4(),
                            atom_nh,
                            atom_na,
-                           d_wg + this->nbands * ik,
-                           d_ekb + this->nbands * ik,
+                           d_wg_ik,
+                           occ,
+                           d_ekb_ik,
                            qq_nt,
                            this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
-                           becp,
-                           dbecp,
+                           becp + index0,
+                           dbecp + index0,
                            stress);
     }
-    ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
 }
 
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb_deri_f(const int& ik, const int& nbdall, const int& ipol)
 {
-    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f");
-    ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
+    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_vkb_deri");
     const int npol = this->ucell_->get_npol();
-    const int npm_npol = npm * npol;
-    const int size_becp = this->nbands * npol * this->nkb;
+    const int size_becp = nbdall * npol * this->nkb;
     if (this->dbecp == nullptr)
     {
         resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
     }
 
-    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * size_becp;
     const std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
     std::complex<FPTYPE>* vkb_deri_ptr = this->ppcell_vkb;
 
-    const std::complex<FPTYPE>* ppsi = &(this->psi_[0](ik, 0, 0));
     const int npw = this->wfc_basis_->npwk[ik];
     if (this->pre_ik_f == -1)
     {
@@ -493,13 +558,32 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
                             &(this->wfc_basis_->gcar[ik * this->wfc_basis_->npwk_max].x));
     }
 
-    this->save_vkb(npw, ipol);
+    this->save_vkb(ik, ipol);
 
     const std::complex<double> coeff = ipol == 0 ? ModuleBase::NEG_IMAG_UNIT : ModuleBase::ONE;
 
     // calculate the vkb_deri for ipol with the memory of ppcell_vkb
     cal_vkb1_nl_op<FPTYPE, Device>()(this->ctx, nkb, npw, npw, npw, ipol, coeff, vkb_ptr, gcar, vkb_deri_ptr);
 
+}
+
+template <typename FPTYPE, typename Device>
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(const int& ik,
+                                                    const int& nbdall,
+                                                    const int& npm,
+                                                    const int& ipol,
+                                                    const std::complex<FPTYPE>* ppsi,
+                                                    const int& nbd0)
+{
+    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f");
+    const int npol = this->ucell_->get_npol();
+    const int size_becp = nbdall * npol * this->nkb;
+    const int index0 = nbd0 * npol * this->nkb;
+    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * size_becp + index0;
+    std::complex<FPTYPE>* vkb_deri_ptr = this->ppcell_vkb;
+    const int npm_npol = npm * npol;
+    const int npw = this->wfc_basis_->npwk[ik];
+
     // do gemm to get dbecp and revert the ppcell_vkb for next ipol
     const char transa = 'C';
     const char transb = 'N';
@@ -516,16 +600,14 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
               this->max_npw,
               &ModuleBase::ZERO,
               dbecp_ptr,
-              nkb);
-    this->revert_vkb(npw, ipol);
-    this->pre_ik_f = ik;
-    ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
+              this->nkb);
 }
 
 // save_vkb
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::save_vkb(int npw, int ipol)
+void FS_Nonlocal_tools<FPTYPE, Device>::save_vkb(const int& ik, const int& ipol)
 {
+    const int npw = this->wfc_basis_->npwk[ik];
     if (this->device == base_device::CpuDevice)
     {
         const int gcar_zero_count = this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max];
@@ -560,8 +642,9 @@ void FS_Nonlocal_tools<FPTYPE, Device>::save_vkb(int npw, int ipol)
 
 // revert_vkb
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::revert_vkb(int npw, int ipol)
+void FS_Nonlocal_tools<FPTYPE, Device>::revert_vkb(const int& ik, const int& ipol)
 {
+    const int npw = this->wfc_basis_->npwk[ik];
     const std::complex<FPTYPE> coeff = ipol == 0 ? ModuleBase::NEG_IMAG_UNIT : ModuleBase::ONE;
     if (this->device == base_device::CpuDevice)
     {
@@ -594,10 +677,11 @@ void FS_Nonlocal_tools<FPTYPE, Device>::revert_vkb(int npw, int ipol)
                                 coeff);
 #endif
     }
+    this->pre_ik_f = ik;
 }
 
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in)
+void FS_Nonlocal_tools<FPTYPE, Device>::transfer_gcar(const int& npw, const int& npw_max, const FPTYPE* gcar_in)
 {
     std::vector<FPTYPE> gcar_tmp(3 * npw_max);
     gcar_tmp.assign(gcar_in, gcar_in + 3 * npw_max);
@@ -654,37 +738,66 @@ void FS_Nonlocal_tools<FPTYPE, Device>::transfer_gcar(int npw, int npw_max, cons
 
 // cal_force
 template <typename FPTYPE, typename Device>
-void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force)
+void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(const int& ik,
+                                                  const int& nbdall,
+                                                  const int& npm,
+                                                  const bool& occ,
+                                                  FPTYPE* force,
+                                                  const int& nbd0)
 {
     const int current_spin = this->kv_->isk[ik];
     const int force_nc = 3;
+    const int npol = this->ucell_->get_npol();
+    const int index0 = nbd0 * npol * this->nkb;
     // calculate the force
-    if(this->ucell_->get_npol() == 1)
+    if (npol == 1)
     {
+        FPTYPE* d_ekb_ik = nullptr;
+        if (d_ekb != nullptr)
+        {
+            d_ekb_ik = d_ekb + this->nbands * ik;
+        }
+        FPTYPE* d_wg_ik = d_wk + ik;
+        if (occ)
+        {
+            d_wg_ik = d_wg + this->nbands * ik;
+        }
+        
         cal_force_nl_op<FPTYPE, Device>()(this->ctx,
-                                        nondiagonal,
-                                        npm,
-                                        this->ntype,
-                                        current_spin,
-                                        this->nlpp_->deeq.getBound2(),
-                                        this->nlpp_->deeq.getBound3(),
-                                        this->nlpp_->deeq.getBound4(),
-                                        force_nc,
-                                        this->nbands,
-                                        nkb,
-                                        atom_nh,
-                                        atom_na,
-                                        this->ucell_->tpiba,
-                                        d_wg + this->nbands * ik,
-                                        d_ekb + this->nbands * ik,
-                                        qq_nt,
-                                        deeq,
-                                        becp,
-                                        dbecp,
-                                        force);
+                                          nondiagonal,
+                                          npm,
+                                          this->ntype,
+                                          current_spin,
+                                          this->nlpp_->deeq.getBound2(),
+                                          this->nlpp_->deeq.getBound3(),
+                                          this->nlpp_->deeq.getBound4(),
+                                          force_nc,
+                                          nbdall,
+                                          nkb,
+                                          atom_nh,
+                                          atom_na,
+                                          this->ucell_->tpiba,
+                                          d_wg_ik,
+                                          occ,
+                                          d_ekb_ik,
+                                          qq_nt,
+                                          deeq,
+                                          becp + index0,
+                                          dbecp + index0,
+                                          force);
     }
     else
     {
+        FPTYPE* d_ekb_ik = nullptr;
+        if (d_ekb != nullptr)
+        {
+            d_ekb_ik = d_ekb + this->nbands * ik;
+        }
+        FPTYPE* d_wg_ik = d_wk + ik;
+        if (occ)
+        {
+            d_wg_ik = d_wg + this->nbands * ik;
+        }
         cal_force_nl_op<FPTYPE, Device>()(this->ctx,
                                           npm,
                                           this->ntype,
@@ -692,17 +805,18 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
                                           this->nlpp_->deeq_nc.getBound3(),
                                           this->nlpp_->deeq_nc.getBound4(),
                                           force_nc,
-                                          this->nbands,
+                                          nbdall,
                                           nkb,
                                           atom_nh,
                                           atom_na,
                                           this->ucell_->tpiba,
-                                          d_wg + this->nbands * ik,
-                                          d_ekb + this->nbands * ik,
+                                          d_wg_ik,
+                                          occ,
+                                          d_ekb_ik,
                                           qq_nt,
                                           this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
-                                          becp,
-                                          dbecp,
+                                          becp + index0,
+                                          dbecp + index0,
                                           force);
     }
 }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h
index 6680edde05..0cc640f27c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h
@@ -32,37 +32,111 @@ class FS_Nonlocal_tools
   public:
     FS_Nonlocal_tools(const pseudopot_cell_vnl* nlpp_in,
                       const UnitCell* ucell_in,
-                      const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
                       const K_Vectors* kv_in,
                       const ModulePW::PW_Basis_K* wfc_basis_in,
                       const Structure_Factor* sf_in,
                       const ModuleBase::matrix& wg,
-                      const ModuleBase::matrix& ekb);
+                      const ModuleBase::matrix* p_ekb);
     ~FS_Nonlocal_tools();
 
+    /**
+     * @brief calculate the projectors |beta>
+     * 
+     */
+    void cal_vkb(const int& ik, const int& nbdall);
+
     /**
      * @brief calculate the becp = <psi|beta> for all beta functions
+     * 
+     * @param ik the index of k point
+     * @param npm the number of bands
+     * @param ppsi the wave functions
+     * @param nbd0 the start index of the bands
      */
-    void cal_becp(int ik, int npm);
+    void cal_becp(const int& ik, const int& npm, const std::complex<FPTYPE>* ppsi, const int& nbd0 = 0);
+    
+    /// @brief mpi_allreduce the becp in the pool
+    void reduce_pool_becp(const int& npm);
+
+    /**
+     * @brief calculate vkb_deri
+     * 
+     * @param ik the index of k point
+     * @param nbdall the number of all bands, it decides the size of vkb_deri
+     * @param ipol the i index of the direction
+     * @param jpol the j index of the direction
+     */
+    void cal_vkb_deri_s(const int& ik, const int& nbdall, const int& ipol, const int& jpol);
+
     /**
      * @brief calculate the dbecp_{ij} = <psi|\partial beta/\partial varepsilon_{ij}> for all beta functions
      *       stress_{ij} = -1/omega \sum_{n,k}f_{nk} \sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_{ij} also calculated
+     * 
+     * @param ik the index of k point
+     * @param npm the number of bands
+     * @param ppsi the wave functions
+     * @param nbd0 the start index of the bands
      */
-    void cal_dbecp_s(int ik, int npm, int ipol, int jpol, FPTYPE* stress);
+    void cal_dbecp_s(const int& ik, const int& npm, const std::complex<FPTYPE>* ppsi, const int& nbd0 = 0);
+
+    /**
+     * @brief calculate stress
+     * 
+     * @param ik the index of k point
+     * @param npm the number of bands
+     * @param occ if use the occupation of the bands
+     * @param ipol the i index of the direction
+     * @param jpol the j index of the direction
+     * @param stress [out] the stress tensor
+     * @param nbd0 the start index of the bands
+     */
+    void cal_stress(const int& ik,
+                    const int& npm,
+                    const bool& occ,
+                    const int& ipol,
+                    const int& jpol,
+                    FPTYPE* stress,
+                    const int& nbd0 = 0);
+
+        /**
+         * @brief calculate vkb_deri
+         *
+         * @param ik the index of k point
+         * @param nbdall the number of all bands, it decides the size of vkb_deri
+         * @param ipol the index of the polar
+         */
+        void cal_vkb_deri_f(const int& ik, const int& nbdall, const int& ipol);
     /**
      * @brief calculate the dbecp_i = <psi|\partial beta/\partial \tau^I_i> for all beta functions
+     * 
+     * @param ik the index of k point
+     * @param nbdall the number of all bands, which is the dimension of dbecp and becp
+     * @param npm the number of bands
+     * @param ipol the index of the polar
+     * @param ppsi the wave functions
+     * @param nbd0 the start index of the bands
      */
-    void cal_dbecp_f(int ik, int npm, int ipol);
+    void cal_dbecp_f(const int& ik, const int& nbdall, const int& npm, const int& ipol, const std::complex<FPTYPE>* ppsi, const int& nbd0 = 0);
     /**
      * @brief calculate the force^I_i = - \sum_{n,k}f_{nk} \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_i
+     * 
+     * @param ik the index of k point
+     * @param npm the number of bands
+     * @param nbdall the number of all bands, which is the dimension of dbecp and becp
+     * @param nbd0 start band index for dbecp and becp
+     * @param occ if use the occupation of the bands
+     * @param force [out] the force
      */
-    void cal_force(int ik, int npm, FPTYPE* force);
+    void cal_force(const int& ik, const int& nbdall, const int& npm, const bool& occ, FPTYPE* force, const int& nbd0 = 0);
+
+    /// @brief revert the 0-value dvkbs for calculating the dbecp_i in the force calculation
+    void revert_vkb(const int& ik, const int& ipol);
 
   private:
     /**
      * @brief allocate the memory for the variables
      */
-    void allocate_memory(const ModuleBase::matrix& wg, const ModuleBase::matrix& ekb);
+    void allocate_memory(const ModuleBase::matrix& wg, const ModuleBase::matrix* p_ekb);
     /**
      * @brief delete the memory for the variables
      */
@@ -73,7 +147,6 @@ class FS_Nonlocal_tools
     const Structure_Factor* sf_;
     const pseudopot_cell_vnl* nlpp_;
     const UnitCell* ucell_;
-    const psi::Psi<std::complex<FPTYPE>, Device>* psi_;
     const K_Vectors* kv_;
     const ModulePW::PW_Basis_K* wfc_basis_;
 
@@ -102,15 +175,14 @@ class FS_Nonlocal_tools
     int gcar_zero_counts[3] = {0, 0, 0};
     std::complex<FPTYPE>* vkb_save = nullptr;
     /// @brief count zero gcar indexes and prepare zero_indexes, do gcar_y /= gcar_x, gcar_z /= gcar_y
-    void transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in);
+    void transfer_gcar(const int& npw, const int& npw_max, const FPTYPE* gcar_in);
     /// @brief save the 0-value dvkbs for calculating the dbecp_i in the force calculation
-    void save_vkb(int npw, int ipol);
-    /// @brief revert the 0-value dvkbs for calculating the dbecp_i in the force calculation
-    void revert_vkb(int npw, int ipol);
+    void save_vkb(const int& npw, const int& ipol);
     /// ---------------------------------------------------------------------
 
     /// pointers to access the data without memory arrangement
     FPTYPE* d_wg = nullptr;
+    FPTYPE* d_wk = nullptr;
     FPTYPE* d_ekb = nullptr;
     FPTYPE* gcar = nullptr;
     FPTYPE* deeq = nullptr;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index dddf889de1..991a81e746 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -47,6 +47,7 @@ __global__ void cal_force_nl(
         const int *atom_na,
         const FPTYPE tpiba,
         const FPTYPE *d_wg,
+        const bool occ,
         const FPTYPE* d_ekb,
         const FPTYPE* qq_nt,
         const FPTYPE *deeq,
@@ -64,13 +65,29 @@ __global__ void cal_force_nl(
     }
 
     int nproj = atom_nh[it];
-    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ib];
+    FPTYPE fac;
+    if(occ)
+    {
+        fac = d_wg[ib] * 2.0 * tpiba;
+    }
+    else
+    {
+        fac = d_wg[0] * 2.0 * tpiba;
+    }
+    FPTYPE ekb_now = 0.0;
+    if (d_ekb != nullptr)
+    {
+        ekb_now = d_ekb[ib];
+    }
     for (int ia = 0; ia < atom_na[it]; ia++) {
         for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
+            FPTYPE ps_qq = 0;
+            if(ekb_now != 0)
+            {
+                ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
+            }
             // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip];
-            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]
-                        - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
+            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] + ps_qq;
             const int inkb = sum + ip;
             //out<<"\n ps = "<<ps;
 
@@ -87,8 +104,12 @@ __global__ void cal_force_nl(
                 for (int ip2 = 0; ip2 < nproj; ip2++) {
                     if (ip != ip2) {
                         const int jnkb = sum + ip2;
-                        ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]
-                             - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                        FPTYPE ps_qq = 0;
+                        if(ekb_now != 0)
+                        {
+                            ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                        }
+                        ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
                         for (int ipol = 0; ipol < 3; ipol++) {
                             const FPTYPE dbb = (conj(dbecp[ipol * nbands * nkb + ib * nkb + inkb]) *
                                                 becp[ib * nkb + jnkb]).real();
@@ -144,6 +165,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                                                                   const int* atom_na,
                                                                   const FPTYPE& tpiba,
                                                                   const FPTYPE* d_wg,
+                                                                  const bool& occ,
                                                                   const FPTYPE* d_ekb,
                                                                   const FPTYPE* qq_nt,
                                                                   const FPTYPE* deeq,
@@ -158,7 +180,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
             forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
-            d_wg, d_ekb, qq_nt, deeq,
+            d_wg, occ, d_ekb, qq_nt, deeq,
             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
             force);// array of data
@@ -179,6 +201,7 @@ __global__ void cal_force_nl(
         const int *atom_na,
         const FPTYPE tpiba,
         const FPTYPE *d_wg,
+        const bool occ,
         const FPTYPE* d_ekb,
         const FPTYPE* qq_nt,
         const thrust::complex<FPTYPE> *deeq_nc,
@@ -197,15 +220,31 @@ __global__ void cal_force_nl(
     }
 
     int nproj = atom_nh[it];
-    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ib];
+    FPTYPE fac;
+    if(occ)
+    {
+        fac = d_wg[ib] * 2.0 * tpiba;
+    }
+    else
+    {
+        fac = d_wg[0] * 2.0 * tpiba;
+    }
+    FPTYPE ekb_now = 0.0;
+    if (d_ekb != nullptr)
+    {
+        ekb_now = d_ekb[ib];
+    }
     for (int ia = 0; ia < atom_na[it]; ia++) {
         for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
             const int inkb = sum + ip;
             for (int ip2 = 0; ip2 < nproj; ip2++) 
             {
                 // Effective values of the D-eS coefficients
-                const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                thrust::complex<FPTYPE> ps_qq = 0;
+                if (ekb_now)
+                {
+                    ps_qq = thrust::complex<FPTYPE>(-ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2], 0.0);
+                }
                 const int jnkb = sum + ip2;
                 const thrust::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
                 const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
@@ -244,6 +283,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                     const int* atom_na,
                     const FPTYPE& tpiba,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const std::complex<FPTYPE>* deeq_nc,
@@ -257,7 +297,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
             forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
-            d_wg, d_ekb, qq_nt, 
+            d_wg, occ, d_ekb, qq_nt, 
             reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index e566126876..36e0aac37a 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -113,6 +113,7 @@ __global__ void cal_stress_nl(
         const int *atom_nh,
         const int *atom_na,
         const FPTYPE *d_wg,
+        const bool occ,
         const FPTYPE* d_ekb,
         const FPTYPE* qq_nt,
         const FPTYPE *deeq,
@@ -131,8 +132,20 @@ __global__ void cal_stress_nl(
     }
 
     FPTYPE stress_var = 0;
-    const FPTYPE fac = d_wg[ib];
-    const FPTYPE ekb_now = d_ekb[ib];
+    FPTYPE fac;
+    if (occ)
+    {
+        fac = d_wg[ib];
+    }
+    else
+    {
+        fac = d_wg[0];
+    }
+    FPTYPE ekb_now = 0.0;
+    if (d_ekb != nullptr)
+    {
+        ekb_now = d_ekb[ib];
+    }
     const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
@@ -141,8 +154,12 @@ __global__ void cal_stress_nl(
             if(!nondiagonal && ip1 != ip2) {
                 continue;
             }
-            const FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]
-                        - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            FPTYPE ps_qq = 0;
+            if (ekb_now != 0)
+            {
+                ps_qq = -ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            }
+            const FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
             const int inkb1 = sum + ip1;
             const int inkb2 = sum + ip2;
             //out<<"\n ps = "<<ps;
@@ -159,6 +176,21 @@ __global__ void cal_stress_nl(
     }
 }
 
+template <typename FPTYPE>
+__global__ void cal_multi_dot(const int npw,
+                              const FPTYPE fac,
+                              const FPTYPE* gk1,
+                              const FPTYPE* gk2,
+                              const FPTYPE* d_kfac,
+                              const thrust::complex<FPTYPE>* psi,
+                              FPTYPE* sum)
+{
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < npw) {
+        atomicAdd(sum, fac * gk1[idx] * gk2[idx] * d_kfac[idx] * thrust::norm(psi[idx]));
+    }
+}
+
 template <typename FPTYPE>
 void cal_dbecp_noevc_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
                                                                         const int& ipol,
@@ -211,6 +243,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int* atom_nh,
                                                                    const int* atom_na,
                                                                    const FPTYPE* d_wg,
+                                                                   const bool& occ,
                                                                    const FPTYPE* d_ekb,
                                                                    const FPTYPE* qq_nt,
                                                                    const FPTYPE* deeq,
@@ -231,6 +264,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              atom_nh,
              atom_na,
              d_wg,
+             occ,
              d_ekb,
              qq_nt,
              deeq,
@@ -253,6 +287,7 @@ __global__ void cal_stress_nl(
         const int *atom_nh,
         const int *atom_na,
         const FPTYPE *d_wg,
+        const bool occ,
         const FPTYPE* d_ekb,
         const FPTYPE* qq_nt,
         const thrust::complex<FPTYPE> *deeq_nc,
@@ -272,15 +307,31 @@ __global__ void cal_stress_nl(
     }
 
     FPTYPE stress_var = 0;
-    const FPTYPE fac = d_wg[ib];
-    const FPTYPE ekb_now = d_ekb[ib];
+    FPTYPE fac;
+    if (occ)
+    {
+        fac = d_wg[ib];
+    }
+    else
+    {
+        fac = d_wg[0];
+    }
+    FPTYPE ekb_now = 0.0;
+    if (d_ekb != nullptr)
+    {
+        ekb_now = d_ekb[ib];
+    }
     const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
         for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
             const int ip1 = ii / nproj;
 	        const int ip2 = ii % nproj;
-            const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            thrust::complex<FPTYPE> ps_qq = 0;
+            if(ekb_now != 0)
+            {
+                ps_qq = thrust::complex<FPTYPE>(- ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2], 0.0);
+            }
             const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
             const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
             const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
@@ -317,6 +368,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int* atom_nh,
                                                                    const int* atom_na,
                                                                    const FPTYPE* d_wg,
+                                                                   const bool& occ,
                                                                    const FPTYPE* d_ekb,
                                                                    const FPTYPE* qq_nt,
                                                                    const std::complex<FPTYPE>* deeq_nc,
@@ -335,6 +387,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              atom_nh,
              atom_na,
              d_wg,
+             occ,
              d_ekb,
              qq_nt,
              reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
@@ -345,6 +398,28 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+FPTYPE cal_multi_dot_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const int& npw,
+                                                                   const FPTYPE& fac,
+                                                                   const FPTYPE* gk1,
+                                                                   const FPTYPE* gk2,
+                                                                   const FPTYPE* d_kfac,
+                                                                   const std::complex<FPTYPE>* psi)
+{
+    FPTYPE* d_sum = nullptr;
+    cudaMalloc(&d_sum, sizeof(FPTYPE) * 1);
+    cudaMemset(d_sum, 0, sizeof(FPTYPE) * 1);
+    int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    cal_multi_dot<FPTYPE><<<block, THREADS_PER_BLOCK>>>(
+        npw, fac, gk1, gk2, d_kfac, reinterpret_cast<const thrust::complex<FPTYPE>*>(psi), d_sum);
+    FPTYPE sum;
+    cudaMemcpy(&sum, d_sum, sizeof(FPTYPE) * 1, cudaMemcpyDeviceToHost);
+    cudaFree(d_sum);
+
+    cudaCheckOnDebug();
+    return sum;
+}
+
 template <typename T, typename Device>
 void cal_stress_mgga_op<T, Device>::operator()(
     const int& spin,
@@ -861,6 +936,9 @@ template struct cal_stress_drhoc_aux_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_npw_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
 
+template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
+template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
+
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_GPU>;
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_GPU>;
 }  // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 2ee9cb4530..261c510efc 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -54,6 +54,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int* atom_na,
                     const FPTYPE& tpiba,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const FPTYPE* deeq,
@@ -78,15 +79,31 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     for (int ib = 0; ib < nbands_occ; ib++)
                     {
                         FPTYPE local_force[3] = {0, 0, 0};
-                        FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
-                        FPTYPE ekb_now = d_ekb[ib];
+                        FPTYPE fac;
+                        if(occ)
+                        {
+                            fac = d_wg[ib] * 2.0 * tpiba;
+                        }
+                        else
+                        {
+                            fac = d_wg[0] * 2.0 * tpiba;
+                        }
+                        FPTYPE ekb_now = 0.0;
+                        if (d_ekb != nullptr)
+                        {
+                            ekb_now = d_ekb[ib];
+                        }
                         int iat = iat0 + ia;
                         int sum = sum0 + ia * nproj;
                         for (int ip = 0; ip < nproj; ip++)
                         {
+                            FPTYPE ps_qq = 0;
+                            if(ekb_now != 0)
+                            {
+                                ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
+                            }
                             // Effective values of the D-eS coefficients
-                            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]
-                                        - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
+                            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] + ps_qq;
                             const int inkb = sum + ip;
                             // out<<"\n ps = "<<ps;
 
@@ -105,9 +122,12 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                                     if (ip != ip2)
                                     {
                                         const int jnkb = sum + ip2;
-                                        FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]
-                                                    - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
-
+                                        FPTYPE ps_qq = 0;
+                                        if (ekb_now != 0)
+                                        {
+                                            ps_qq = -ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                                        }
+                                        FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
                                         for (int ipol = 0; ipol < 3; ipol++)
                                         {
                                             const FPTYPE dbb = (conj(dbecp[ipol * nbands * nkb + ib * nkb + inkb])
@@ -159,6 +179,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int* atom_na,
                     const FPTYPE& tpiba,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const std::complex<FPTYPE>* deeq_nc,
@@ -184,8 +205,20 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     {
                         const int ib2 = ib*2;
                         FPTYPE local_force[3] = {0, 0, 0};
-                        FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
-                        FPTYPE ekb_now = d_ekb[ib];
+                        FPTYPE fac;
+                        if(occ)
+                        {
+                            fac = d_wg[ib] * 2.0 * tpiba;
+                        }
+                        else
+                        {
+                            fac = d_wg[0] * 2.0 * tpiba;
+                        }
+                        FPTYPE ekb_now = 0.0;
+                        if (d_ekb != nullptr)
+                        {
+                            ekb_now = d_ekb[ib];
+                        }
                         int iat = iat0 + ia;
                         int sum = sum0 + ia * nprojs;
                         for (int ip = 0; ip < nprojs; ip++)
@@ -195,7 +228,11 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                             for (int ip2 = 0; ip2 < nprojs; ip2++)
                             {
                                 // Effective values of the D-eS coefficients
-                                std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps_qq = 0;
+                                if(ekb_now)
+                                {
+                                    ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                                }
                                 const int jnkb = sum + ip2;
                                 std::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
                                 std::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index c68df7bd0f..ce6ae7afb3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -60,6 +60,7 @@ struct cal_force_nl_op
     /// @param atom_na - GlobalC::ucell.atoms[ii].na
     /// @param tpiba - GlobalC::ucell.tpiba
     /// @param d_wg - input parameter wg
+    /// @param occ - if use the occupation of the bands
     /// @param d_ekb - input parameter ekb
     /// @param qq_nt - GlobalC::ppcell.qq_nt
     /// @param deeq - GlobalC::ppcell.deeq
@@ -83,6 +84,7 @@ struct cal_force_nl_op
                     const int* atom_na,
                     const FPTYPE& tpiba,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const FPTYPE* deeq,
@@ -103,6 +105,7 @@ struct cal_force_nl_op
                     const int* atom_na,
                     const FPTYPE& tpiba,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const std::complex<FPTYPE>* deeq_nc,
@@ -145,6 +148,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int* atom_na,
                     const FPTYPE& tpiba,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const FPTYPE* deeq,
@@ -165,6 +169,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int* atom_na,
                     const FPTYPE& tpiba,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const std::complex<FPTYPE>* deeq_nc,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index da45a375e7..b89a380133 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -43,6 +43,7 @@ __global__ void cal_force_nl(
         const int *atom_na,
         const FPTYPE tpiba,
         const FPTYPE *d_wg,
+        const bool occ,
         const FPTYPE* d_ekb,
         const FPTYPE* qq_nt,
         const FPTYPE *deeq,
@@ -61,13 +62,29 @@ __global__ void cal_force_nl(
     }
 
     int nproj = atom_nh[it];
-    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ib];
+    FPTYPE fac;
+    if(occ)
+    {
+        fac = d_wg[ib] * 2.0 * tpiba;
+    }
+    else
+    {
+        fac = d_wg[0] * 2.0 * tpiba;
+    }
+    FPTYPE ekb_now = 0.0;
+    if (d_ekb != nullptr)
+    {
+        ekb_now = d_ekb[ib];
+    }
     for (int ia = 0; ia < atom_na[it]; ia++) {
         for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
+            FPTYPE ps_qq = 0;
+            if(ekb_now != 0)
+            {
+                ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
+            }
             // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip];
-            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip]
-                        - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip];
+            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] + ps_qq;
             const int inkb = sum + ip;
             //out<<"\n ps = "<<ps;
 
@@ -84,8 +101,12 @@ __global__ void cal_force_nl(
                 for (int ip2 = 0; ip2 < nproj; ip2++) {
                     if (ip != ip2) {
                         const int jnkb = sum + ip2;
-                        ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]
-                             - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                        FPTYPE ps_qq = 0;
+                        if(ekb_now != 0)
+                        {
+                            ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                        }
+                        ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
                         for (int ipol = 0; ipol < 3; ipol++) {
                             const FPTYPE dbb = (conj(dbecp[ipol * nbands * nkb + ib * nkb + inkb]) *
                                                 becp[ib * nkb + jnkb]).real();
@@ -141,6 +162,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                                                                   const int* atom_na,
                                                                   const FPTYPE& tpiba,
                                                                   const FPTYPE* d_wg,
+                                                                  const bool& occ,
                                                                   const FPTYPE* d_ekb,
                                                                   const FPTYPE* qq_nt,
                                                                   const FPTYPE* deeq,
@@ -155,7 +177,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
             forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
-            d_wg, d_ekb, qq_nt, deeq,
+            d_wg, occ, d_ekb, qq_nt, deeq,
             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
             force);// array of data
@@ -176,6 +198,7 @@ __global__ void cal_force_nl(
         const int *atom_na,
         const FPTYPE tpiba,
         const FPTYPE *d_wg,
+        const bool occ,
         const FPTYPE* d_ekb,
         const FPTYPE* qq_nt,
         const thrust::complex<FPTYPE> *deeq_nc,
@@ -195,15 +218,31 @@ __global__ void cal_force_nl(
     }
 
     int nproj = atom_nh[it];
-    FPTYPE fac = d_wg[ib] * 2.0 * tpiba;
-    FPTYPE ekb_now = d_ekb[ib];
+    FPTYPE fac;
+    if(occ)
+    {
+        fac = d_wg[ib] * 2.0 * tpiba;
+    }
+    else
+    {
+        fac = d_wg[0] * 2.0 * tpiba;
+    }
+    FPTYPE ekb_now = 0.0;
+    if (d_ekb != nullptr)
+    {
+        ekb_now = d_ekb[ib];
+    }
     for (int ia = 0; ia < atom_na[it]; ia++) {
         for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) {
             const int inkb = sum + ip;
             for (int ip2 = 0; ip2 < nproj; ip2++) 
             {
                 // Effective values of the D-eS coefficients
-                const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                thrust::complex<FPTYPE> ps_qq = 0;
+                if (ekb_now)
+                {
+                    ps_qq = thrust::complex<FPTYPE>(-ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2], 0.0);
+                }
                 const int jnkb = sum + ip2;
                 const thrust::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
                 const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
@@ -242,6 +281,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                     const int* atom_na,
                     const FPTYPE& tpiba,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const std::complex<FPTYPE>* deeq_nc,
@@ -255,7 +295,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
             forcenl_nc, nbands, nkb,
             atom_nh, atom_na,
             tpiba,
-            d_wg, d_ekb, qq_nt, 
+            d_wg, occ, d_ekb, qq_nt, 
             reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index b3e7b0a34a..cd26e312a5 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -108,6 +108,7 @@ __global__ void cal_stress_nl(
         const int *atom_nh,
         const int *atom_na,
         const FPTYPE *d_wg,
+        const bool occ,
         const FPTYPE* d_ekb,
         const FPTYPE* qq_nt,
         const FPTYPE *deeq,
@@ -126,8 +127,20 @@ __global__ void cal_stress_nl(
     }
 
     FPTYPE stress_var = 0;
-    FPTYPE fac = d_wg[ib];
-    FPTYPE ekb_now = d_ekb[ib];
+    FPTYPE fac;
+    if (occ)
+    {
+        fac = d_wg[ib];
+    }
+    else
+    {
+        fac = d_wg[0];
+    }
+    FPTYPE ekb_now = 0.0;
+    if (d_ekb != nullptr)
+    {
+        ekb_now = d_ekb[ib];
+    }
     const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
@@ -136,8 +149,12 @@ __global__ void cal_stress_nl(
             if(!nondiagonal && ip1 != ip2) {
                 continue;
             }
-            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2]
-                        - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            FPTYPE ps_qq = 0;
+            if (ekb_now != 0)
+            {
+                ps_qq = -ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            }
+            FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
             const int inkb1 = sum + ip1;
             const int inkb2 = sum + ip2;
             //out<<"\n ps = "<<ps;
@@ -205,6 +222,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int* atom_nh,
                                                                    const int* atom_na,
                                                                    const FPTYPE* d_wg,
+                                                                   const bool& occ,
                                                                    const FPTYPE* d_ekb,
                                                                    const FPTYPE* qq_nt,
                                                                    const FPTYPE* deeq,
@@ -225,6 +243,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              atom_nh,
              atom_na,
              d_wg,
+             occ,
              d_ekb,
              qq_nt,
              deeq,
@@ -247,6 +266,7 @@ __global__ void cal_stress_nl(
         const int *atom_nh,
         const int *atom_na,
         const FPTYPE *d_wg,
+        const bool occ,
         const FPTYPE* d_ekb,
         const FPTYPE* qq_nt,
         const thrust::complex<FPTYPE> *deeq_nc,
@@ -266,15 +286,31 @@ __global__ void cal_stress_nl(
     }
 
     FPTYPE stress_var = 0;
-    FPTYPE fac = d_wg[ib];
-    FPTYPE ekb_now = d_ekb[ib];
+    FPTYPE fac;
+    if (occ)
+    {
+        fac = d_wg[ib];
+    }
+    else
+    {
+        fac = d_wg[0];
+    }
+    FPTYPE ekb_now = 0.0;
+    if (d_ekb != nullptr)
+    {
+        ekb_now = d_ekb[ib];
+    }
     const int nproj = atom_nh[it];
     for (int ia = 0; ia < atom_na[it]; ia++)
     {
         for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) {
             const int ip1 = ii / nproj;
             const int ip2 = ii % nproj;
-            const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            thrust::complex<FPTYPE> ps_qq = 0;
+            if (ekb_now != 0)
+            {
+                ps_qq = thrust::complex<FPTYPE>(- ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2], 0.0);
+            }
             const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
             const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
             const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
@@ -297,6 +333,21 @@ __global__ void cal_stress_nl(
     }
 }
 
+template <typename FPTYPE>
+__global__ void cal_multi_dot(const int npw,
+                              const FPTYPE fac,
+                              const FPTYPE* gk1,
+                              const FPTYPE* gk2,
+                              const FPTYPE* d_kfac,
+                              const thrust::complex<FPTYPE>* psi,
+                              FPTYPE* sum)
+{
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < npw) {
+        atomicAdd(sum, fac * gk1[idx] * gk2[idx] * d_kfac[idx] * thrust::norm(psi[idx]));
+    }
+}
+
 template <typename FPTYPE>
 void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
                                                                    const int& ipol,
@@ -310,6 +361,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                                                                    const int* atom_nh,
                                                                    const int* atom_na,
                                                                    const FPTYPE* d_wg,
+                                                                   const bool& occ,
                                                                    const FPTYPE* d_ekb,
                                                                    const FPTYPE* qq_nt,
                                                                    const std::complex<FPTYPE>* deeq_nc,
@@ -328,6 +380,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
              atom_nh,
              atom_na,
              d_wg,
+             occ,
              d_ekb,
              qq_nt,
              reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
@@ -798,6 +851,27 @@ void cal_force_npw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     return ;
 }
 
+template <typename FPTYPE>
+FPTYPE cal_multi_dot_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const int& npw,
+                                                                   const FPTYPE& fac,
+                                                                   const FPTYPE* gk1,
+                                                                   const FPTYPE* gk2,
+                                                                   const FPTYPE* d_kfac,
+                                                                   const std::complex<FPTYPE>* psi)
+{
+    FPTYPE* d_sum = nullptr;
+    hipMalloc(&d_sum, sizeof(FPTYPE) * 1);
+    hipMemset(d_sum, 0, sizeof(FPTYPE) * 1);
+    int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_multi_dot<FPTYPE>), dim3(block), dim3(THREADS_PER_BLOCK), 0, 0,
+        npw, fac, gk1, gk2, d_kfac, reinterpret_cast<const thrust::complex<FPTYPE>*>(psi), d_sum);
+    FPTYPE sum;
+    hipMemcpy(&sum, d_sum, sizeof(FPTYPE) * 1, hipMemcpyDeviceToHost);
+    hipFree(d_sum);
+    hipCheckOnDebug();
+    return sum;
+}
+
 template struct cal_vq_op<double, base_device::DEVICE_GPU>;
 template struct cal_vq_op<float, base_device::DEVICE_GPU>;
 
@@ -846,4 +920,6 @@ template struct cal_stress_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_npw_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
 
+template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
+template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
 }  // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index f36f2e8316..979955d3e8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -9,7 +9,6 @@
 #include "module_base/libm/libm.h"
 namespace hamilt
 {
-
 template <typename FPTYPE>
 struct cal_dbecp_noevc_nl_op<FPTYPE, base_device::DEVICE_CPU>
 {
@@ -83,6 +82,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const FPTYPE* deeq,
@@ -101,10 +101,15 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
             {
                 const int nproj = atom_nh[it];
 #ifdef _OPENMP
-#pragma omp for collapse(4)
+#pragma omp for
 #endif
                 for (int ib = 0; ib < nbands_occ; ib++)
                 {
+                    FPTYPE ekb_now = 0.0;
+                    if(d_ekb != nullptr)
+                    {
+                        ekb_now = d_ekb[ib];
+                    }
                     for (int ia = 0; ia < atom_na[it]; ia++)
                     {
                         for (int ip1 = 0; ip1 < nproj; ip1++)
@@ -115,10 +120,22 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                                 {
                                     continue;
                                 }
-                                FPTYPE fac = d_wg[ib] * 1.0;
-                                FPTYPE ekb_now = d_ekb[ib];
-                                FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]
-                                            - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+
+                                FPTYPE fac;
+                                if (occ)
+                                {
+                                    fac = d_wg[ib];
+                                }
+                                else
+                                {
+                                    fac = d_wg[0];
+                                }
+                                FPTYPE ps_qq = 0;
+                                if(ekb_now != 0)
+                                {
+                                    ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+                                }
+                                FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
                                 const int inkb1 = sum + ia * nproj + ip1;
                                 const int inkb2 = sum + ia * nproj + ip2;
                                 // out<<"\n ps = "<<ps;
@@ -150,6 +167,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const std::complex<FPTYPE>* deeq_nc,
@@ -167,10 +185,15 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
             {
                 const int nproj = atom_nh[it];
 #ifdef _OPENMP
-#pragma omp for collapse(4)
+#pragma omp for
 #endif
                 for (int ib = 0; ib < nbands_occ; ib++)
                 {
+                    FPTYPE ekb_now = 0.0;
+                    if(d_ekb != nullptr)
+                    {
+                        ekb_now = d_ekb[ib];
+                    }
                     for (int ia = 0; ia < atom_na[it]; ia++)
                     {
                         for (int ip1 = 0; ip1 < nproj; ip1++)
@@ -178,9 +201,20 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                             for (int ip2 = 0; ip2 < nproj; ip2++)
                             {
                                 const int ib2 = ib*2;
-                                FPTYPE fac = d_wg[ib] * 1.0;
-                                FPTYPE ekb_now = d_ekb[ib];
-                                std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+                                FPTYPE fac;
+                                if (occ)
+                                {
+                                    fac = d_wg[ib];
+                                }
+                                else
+                                {
+                                    fac = d_wg[0];
+                                }
+                                std::complex<FPTYPE> ps_qq = 0;
+                                if(ekb_now != 0)
+                                {
+                                    ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+                                }
                                 std::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
                                 std::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
                                 std::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
@@ -508,6 +542,27 @@ struct cal_force_npw_op<FPTYPE, base_device::DEVICE_CPU> {
     }
 };
 
+template <typename FPTYPE>
+struct cal_multi_dot_op<FPTYPE, base_device::DEVICE_CPU> {
+    FPTYPE operator()(const int& npw,
+                      const FPTYPE& fac,
+                      const FPTYPE* gk1,
+                      const FPTYPE* gk2,
+                      const FPTYPE* d_kfac,
+                      const std::complex<FPTYPE>* psi)
+    {
+        FPTYPE sum = 0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : sum)
+#endif
+        for (int i = 0; i < npw; i++)
+        {
+            sum += fac * gk1[i] * gk2[i] * d_kfac[i] * std::norm(psi[i]);
+        }
+        return sum;
+    }
+};
+
 // // cpu version first, gpu version later
 // template <typename FPTYPE>
 // struct prepare_vkb_deri_ptr_op<FPTYPE, base_device::DEVICE_CPU>{
@@ -586,6 +641,9 @@ template struct cal_stress_drhoc_aux_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_npw_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_npw_op<double, base_device::DEVICE_CPU>;
 
+template struct cal_multi_dot_op<float, base_device::DEVICE_CPU>;
+template struct cal_multi_dot_op<double, base_device::DEVICE_CPU>;
+
 
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_CPU>;
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_CPU>;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index 2f4bbd0989..800bb3eba3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -72,6 +72,7 @@ struct cal_stress_nl_op
     /// @param atom_nh - GlobalC::ucell.atoms[ii].ncpp.nh
     /// @param atom_na - GlobalC::ucell.atoms[ii].na
     /// @param d_wg - input parameter wg
+    /// @param occ - if use the occupation of the bands
     /// @param d_ekb - input parameter ekb
     /// @param qq_nt - GlobalC::ppcell.qq_nt
     /// @param deeq - GlobalC::ppcell.deeq
@@ -94,6 +95,7 @@ struct cal_stress_nl_op
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const FPTYPE* deeq,
@@ -113,6 +115,7 @@ struct cal_stress_nl_op
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const std::complex<FPTYPE>* deeq_nc,
@@ -218,6 +221,16 @@ struct cal_force_npw_op{
     );
 };
 
+template <typename FPTYPE, typename Device>
+struct cal_multi_dot_op{
+    FPTYPE operator()(const int& npw,
+                    const FPTYPE& fac,
+                    const FPTYPE* gk1,
+                    const FPTYPE* gk2,
+                    const FPTYPE* d_kfac,
+                    const std::complex<FPTYPE>* psi);
+};
+
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 template <typename FPTYPE>
@@ -258,6 +271,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const FPTYPE* deeq,
@@ -277,6 +291,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
+                    const bool& occ,
                     const FPTYPE* d_ekb,
                     const FPTYPE* qq_nt,
                     const std::complex<FPTYPE>* deeq_nc,
@@ -351,6 +366,15 @@ struct cal_vq_deri_op<FPTYPE, base_device::DEVICE_GPU>
                     FPTYPE* vq);
 };
 
+template <typename FPTYPE>
+struct cal_multi_dot_op<FPTYPE, base_device::DEVICE_GPU>{
+    FPTYPE operator()(const int& npw,
+                    const FPTYPE& fac,
+                    const FPTYPE* gk1,
+                    const FPTYPE* gk2,
+                    const FPTYPE* d_kfac,
+                    const std::complex<FPTYPE>* psi);
+};
 
 /**
  * The operator is used to compute the auxiliary amount of stress /force 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
index 3da3ca3b19..0507ff3358 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
@@ -2899,6 +2899,7 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu)
                                                                atom_na.data(),
                                                                tpiba,
                                                                wg.data(),
+                                                               true,
                                                                ekb.data(),
                                                                qq_nt.data(),
                                                                deeq.data(),
@@ -2990,6 +2991,7 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu)
                                                                d_atom_na,
                                                                tpiba,
                                                                d_wg,
+                                                               true,
                                                                d_ekb,
                                                                d_qq_nt,
                                                                d_deeq,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
index 4f84fa098d..cbf434da0c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
@@ -102,6 +102,7 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_cpu)
                                                                 atom_nh.data(),
                                                                 atom_na.data(),
                                                                 wg.data(),
+                                                                true,
                                                                 ekb.data(),
                                                                 qq_nt.data(),
                                                                 deeq.data(),
@@ -275,6 +276,7 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu)
                                                                 d_atom_nh,
                                                                 d_atom_na,
                                                                 d_wg,
+                                                                true,
                                                                 d_ekb,
                                                                 d_qq_nt,
                                                                 d_deeq,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
index 73cf31bf2d..ddd1c6ca44 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
@@ -66,7 +66,8 @@ class Stress_Func
                     ModuleSymmetry::Symmetry* p_symm,
                     K_Vectors* p_kv,
                     ModulePW::PW_Basis_K* wfc_basis,
-                    const psi::Psi<complex<FPTYPE>>* psi_in = nullptr); // electron kinetic part in PW basis
+                    const UnitCell& ucell_in,
+                    const psi::Psi<complex<FPTYPE>, Device>* psi_in = nullptr); // electron kinetic part in PW basis
 
     // 2) the stress from the Hartree term
     void stress_har(ModuleBase::matrix& sigma,
@@ -207,10 +208,11 @@ class Stress_Func
                 const ModuleBase::matrix& dylmk0,
                 std::complex<FPTYPE>* dqg);
 
-  private:
+  protected:
     Device* ctx = {};
     base_device::DEVICE_CPU* cpu_ctx = {};
     base_device::AbacusDevice_t device = {};
+  private:
     using gemm_op = hsolver::gemm_op<std::complex<FPTYPE>, Device>;
     using cal_stress_nl_op = hamilt::cal_stress_nl_op<FPTYPE, Device>;
     using cal_dbecp_noevc_nl_op = hamilt::cal_dbecp_noevc_nl_op<FPTYPE, Device>;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
index 489e1fe0f8..a5afae2340 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
@@ -2,6 +2,7 @@
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_parameter/parameter.h"
 #include "module_base/timer.h"
+#include "module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h"
 
 //calculate the kinetic stress in PW base
 template <typename FPTYPE, typename Device>
@@ -10,168 +11,29 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
                                              ModuleSymmetry::Symmetry* p_symm,
                                              K_Vectors* p_kv,
                                              ModulePW::PW_Basis_K* wfc_basis,
-                                             const psi::Psi<complex<FPTYPE>>* psi_in)
+                                             const UnitCell& ucell_in,
+                                             const psi::Psi<complex<FPTYPE>, Device>* psi_in)
 {
     ModuleBase::TITLE("Stress_Func","stress_kin");
 	ModuleBase::timer::tick("Stress_Func","stress_kin");
-	
-	FPTYPE **gk;
-	gk=new FPTYPE* [3];
-	int npw;
-	FPTYPE s_kin[3][3];
-	for(int l=0;l<3;l++)
-	{
-		for(int m=0;m<3;m++)
-		{
-			s_kin[l][m]=0.0;
-		}
-	}
-		
-	int npwx=0;
-    for (int ik = 0; ik < p_kv->get_nks(); ik++)
+	this->ucell = &ucell_in;
+	hamilt::FS_Kin_tools<FPTYPE, Device> kin_tool(*this->ucell, p_kv, wfc_basis, wg);
+    for (int ik = 0; ik < wfc_basis->nks; ++ik)
     {
-        if (npwx < p_kv->ngk[ik]) {
-            npwx = p_kv->ngk[ik];
-}
-    }
-
-    gk[0]= new FPTYPE[npwx];
-	gk[1]= new FPTYPE[npwx];
-	gk[2]= new FPTYPE[npwx];
-    FPTYPE tpiba = ModuleBase::TWO_PI / GlobalC::ucell.lat0;
-    FPTYPE twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI);
-    FPTYPE* kfac = new FPTYPE[npwx];
-    const int npol = GlobalC::ucell.get_npol();
-
-    for (int ik = 0; ik < p_kv->get_nks(); ik++)
-    {
-        npw = p_kv->ngk[ik];
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-        for (int i = 0; i < npw; i++)
-        {
-            gk[0][i] = wfc_basis->getgpluskcar(ik, i)[0] * tpiba;
-            gk[1][i] = wfc_basis->getgpluskcar(ik, i)[1] * tpiba;
-            gk[2][i] = wfc_basis->getgpluskcar(ik, i)[2] * tpiba;
-            if (wfc_basis->erf_height > 0)
-            {
-                FPTYPE gk2 = gk[0][i] * gk[0][i] + gk[1][i] * gk[1][i] + gk[2][i] * gk[2][i];
-                FPTYPE arg = (gk2 - wfc_basis->erf_ecut) / wfc_basis->erf_sigma;
-                kfac[i] = 1.0 + wfc_basis->erf_height / wfc_basis->erf_sigma * twobysqrtpi * std::exp(-arg * arg);
-            }
-            else
-            {
-                kfac[i] = 1.0;
-            }
-        }
-
-        // kinetic contribution
-
-        for (int l = 0; l < 3; l++)
+        int nbands_occ = wg.nc;
+        while (wg(ik, nbands_occ - 1) == 0.0)
         {
-            for (int m = 0; m < l + 1; m++)
+            nbands_occ--;
+            if (nbands_occ == 0)
             {
-                for (int ibnd = 0; ibnd < PARAM.inp.nbands; ibnd++)
-                {
-                    if (wg(ik, ibnd) == 0.0) { continue;
-}
-                    const std::complex<FPTYPE>* ppsi = nullptr;
-                    ppsi = &(psi_in[0](ik, ibnd, 0));
-
-                    FPTYPE sum = 0;
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+ : sum)
-#endif
-                    for (int i = 0; i < npw; i++)
-                    {
-                        sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i]
-                               * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real()));
-                    }
-                    if(npol == 2)
-                    {
-                        ppsi = &(psi_in[0](ik, ibnd, npwx));
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+ : sum)
-#endif
-                        for (int i = 0; i < npw; i++)
-                        {
-                            sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i]
-                                   * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real()));
-                        }
-                    }
-                    s_kin[l][m] += sum;
-                }
+                break;
             }
         }
-
-        //contribution from the nonlocal part
-		   
-		//stres_us(ik, gk, npw);
+        const int npm = nbands_occ;
+        kin_tool.cal_gk(ik);
+        kin_tool.cal_stress_kin(ik, npm, true, &psi_in[0](ik, 0, 0));
     }
-
-    // add the US term from augmentation charge derivatives
-
-    // addussstres(sigmanlc);
-	
-	//mp_cast
-		
-	for(int l=0;l<3;l++)
-	{
-		for(int m=0;m<l;m++)
-		{
-			s_kin[m][l]=s_kin[l][m];
-		}
-	}
-
-	if(PARAM.globalv.gamma_only_pw)
-	{
-		for(int l=0;l<3;l++)
-		{
-			for(int m=0;m<3;m++)
-			{
-				s_kin[l][m] *= 2.0*ModuleBase::e2/GlobalC::ucell.omega;
-			}
-		}
-	}
-	else 
-	{
-		for(int l=0;l<3;l++)
-		{
-			for(int m=0;m<3;m++)
-			{
-				s_kin[l][m] *= ModuleBase::e2/GlobalC::ucell.omega;
-			}
-		}
-	}
-
-	for(int l=0;l<3;l++)
-	{
-		for(int m=0;m<3;m++)
-		{
-            Parallel_Reduce::reduce_all(s_kin[l][m]); //qianrui fix a bug for kpar > 1
-		}
-	}
-
-
-	for(int l=0;l<3;l++)
-	{
-		for(int m=0;m<3;m++)
-		{
-			sigma(l,m) = s_kin[l][m];
-		}
-	}
-	//do symmetry
-    if (ModuleSymmetry::Symmetry::symm_flag == 1)
-    {
-        p_symm->symmetrize_mat3(sigma, GlobalC::ucell.lat);
-    } // end symmetry
-
-    delete[] gk[0];
-    delete[] gk[1];
-    delete[] gk[2];
-	delete[] gk;
-    delete[] kfac;
+    kin_tool.symmetrize_stress(p_symm, sigma);
 		
 	ModuleBase::timer::tick("Stress_Func","stress_kin");
 	return;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
index 5b177b37b6..2a761d8118 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
@@ -34,9 +34,10 @@ void Stress_Func<FPTYPE, Device>::stress_nl(ModuleBase::matrix& sigma,
     setmem_var_op()(this->ctx, stress_device, 0, 9);
     std::vector<FPTYPE> sigmanlc(9, 0.0);
 
-    hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(nlpp_in, &ucell_in, psi_in, p_kv, wfc_basis, p_sf, wg, ekb);
+    hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(nlpp_in, &ucell_in, p_kv, wfc_basis, p_sf, wg, &ekb);
 
     const int nks = p_kv->get_nks();
+    const int max_nbands = wg.nc;
     for (int ik = 0; ik < nks; ik++) // loop k points
     {
         // skip zero weights to speed up
@@ -51,15 +52,19 @@ void Stress_Func<FPTYPE, Device>::stress_nl(ModuleBase::matrix& sigma,
         }
         const int npm = nbands_occ;
 
+        nl_tools.cal_vkb(ik, max_nbands);
         // calculate becp = <psi|beta> for all beta functions
-        nl_tools.cal_becp(ik, npm);
+        nl_tools.cal_becp(ik, npm, &psi_in[0](ik,0,0));
+        nl_tools.reduce_pool_becp(max_nbands);
         // calculate dbecp = <psi|d(beta)/dR> for all beta functions
         // calculate stress = \sum <psi|d(beta_j)/dR> * <psi|beta_i> * D_{ij}
         for (int ipol = 0; ipol < 3; ipol++)
         {
             for (int jpol = 0; jpol <= ipol; jpol++)
             {
-                nl_tools.cal_dbecp_s(ik, npm, ipol, jpol, stress_device);
+                nl_tools.cal_vkb_deri_s(ik, max_nbands, ipol, jpol);
+                nl_tools.cal_dbecp_s(ik, npm, &psi_in[0](ik,0,0));
+                nl_tools.cal_stress(ik, npm, true, ipol, jpol, stress_device);
             }
         }
     }
@@ -75,23 +80,16 @@ void Stress_Func<FPTYPE, Device>::stress_nl(ModuleBase::matrix& sigma,
             {
                 sigmanlc[l * 3 + m] = sigmanlc[m * 3 + l];
             }
-            Parallel_Reduce::reduce_all(sigmanlc[l * 3 + m]); // qianrui fix a bug for kpar > 1
-        }
-    }
-    // rescale the stress with 1/omega
-    for (int ipol = 0; ipol < 3; ipol++)
-    {
-        for (int jpol = 0; jpol < 3; jpol++)
-        {
-            sigmanlc[ipol * 3 + jpol] *= 1.0 / ucell_in.omega;
         }
     }
+    // sum up forcenl from all processors
+    Parallel_Reduce::reduce_all(sigmanlc.data(), 9);
 
     for (int ipol = 0; ipol < 3; ipol++)
     {
         for (int jpol = 0; jpol < 3; jpol++)
         {
-            sigma(ipol, jpol) = sigmanlc[ipol * 3 + jpol];
+            sigma(ipol, jpol) = sigmanlc[ipol * 3 + jpol] / ucell_in.omega;
         }
     }
     // do symmetry
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
index b56553c708..f32f0067dc 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
@@ -14,7 +14,6 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
                                            Structure_Factor* p_sf,
                                            K_Vectors* p_kv,
                                            ModulePW::PW_Basis_K* wfc_basis,
-                                           const psi::Psi<complex<FPTYPE>>* psi_in,
                                            const psi::Psi<complex<FPTYPE>, Device>* d_psi_in)
 {
     ModuleBase::TITLE("Stress_PW", "cal_stress");
@@ -64,7 +63,7 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
     }
 
     // kinetic contribution
-    this->stress_kin(sigmakin, this->pelec->wg, p_symm, p_kv, wfc_basis, psi_in);
+    this->stress_kin(sigmakin, this->pelec->wg, p_symm, p_kv, wfc_basis, ucell, d_psi_in);
 
     // hartree contribution
     this->stress_har(sigmahar, rho_basis, 1, pelec->charge);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.h b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.h
index 454f5e5280..4671d25567 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.h
@@ -19,7 +19,6 @@ class Stress_PW : public Stress_Func<FPTYPE, Device>
                     Structure_Factor* p_sf,
                     K_Vectors* p_kv,
                     ModulePW::PW_Basis_K* wfc_basis,
-                    const psi::Psi<complex<FPTYPE>>* psi_in = nullptr,
                     const psi::Psi<complex<FPTYPE>, Device>* d_psi_in = nullptr);
 
   protected:
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp
index 8cb543b026..c652b38635 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp
@@ -8,6 +8,7 @@
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_io/output_log.h"
 #include "module_parameter/parameter.h"
+#include "module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h"
 
 // new
 #include "module_base/math_integral.h"
@@ -15,59 +16,61 @@
 #include "module_base/timer.h"
 #include "module_hamilt_general/module_xc/xc_functional.h"
 
-void Sto_Forces::cal_stoforce(ModuleBase::matrix& force,
-                              const elecstate::ElecState& elec,
-                              ModulePW::PW_Basis* rho_basis,
-                              ModuleSymmetry::Symmetry* p_symm,
-                              Structure_Factor* p_sf,
-                              K_Vectors* pkv,
-                              ModulePW::PW_Basis_K* wfc_basis,
-                              const psi::Psi<std::complex<double>>* psi_in,
-                              Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf)
+template <typename FPTYPE, typename Device>
+void Sto_Forces<FPTYPE, Device>::cal_stoforce(ModuleBase::matrix& force,
+                                              const elecstate::ElecState& elec,
+                                              ModulePW::PW_Basis* rho_basis,
+                                              ModuleSymmetry::Symmetry* p_symm,
+                                              const Structure_Factor* p_sf,
+                                              K_Vectors* pkv,
+                                              ModulePW::PW_Basis_K* wfc_basis,
+                                              const pseudopot_cell_vnl& nlpp,
+                                              UnitCell& ucell,
+                                              const psi::Psi<std::complex<FPTYPE>, Device>& psi,
+                                              const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf)
 {
-    ModuleBase::timer::tick("Sto_Force", "cal_force");
+    ModuleBase::timer::tick("Sto_Forces", "cal_force");
     ModuleBase::TITLE("Sto_Forces", "init");
-    this->nat = GlobalC::ucell.nat;
+    this->device = base_device::get_device_type<Device>(this->ctx);
     const ModuleBase::matrix& wg = elec.wg;
     const Charge* chr = elec.charge;
-    force.create(nat, 3);
+    force.create(this->nat, 3);
 
-    ModuleBase::matrix forcelc(nat, 3);
-    ModuleBase::matrix forceion(nat, 3);
-    ModuleBase::matrix forcecc(nat, 3);
-    ModuleBase::matrix forcenl(nat, 3);
-    ModuleBase::matrix forcescc(nat, 3);
+    ModuleBase::matrix forcelc(this->nat, 3);
+    ModuleBase::matrix forceion(this->nat, 3);
+    ModuleBase::matrix forcecc(this->nat, 3);
+    ModuleBase::matrix forcenl(this->nat, 3);
+    ModuleBase::matrix forcescc(this->nat, 3);
     this->cal_force_loc(forcelc, rho_basis, chr);
     this->cal_force_ew(forceion, rho_basis, p_sf);
-    this->cal_sto_force_nl(forcenl, wg, pkv, wfc_basis, psi_in, stowf);
-    this->cal_force_cc(forcecc, rho_basis, chr, GlobalC::ucell);
-    this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, GlobalC::ucell);
+    this->cal_sto_force_nl(forcenl, wg, pkv, wfc_basis, p_sf, nlpp, ucell, psi, stowf);
+    this->cal_force_cc(forcecc, rho_basis, chr, ucell);
+    this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, ucell);
 
     // impose total force = 0
-    int iat = 0;
-
     ModuleBase::matrix force_e;
     if (PARAM.inp.efield_flag)
     {
-        force_e.create(GlobalC::ucell.nat, 3);
-        elecstate::Efield::compute_force(GlobalC::ucell, force_e);
+        force_e.create(this->nat, 3);
+        elecstate::Efield::compute_force(ucell, force_e);
     }
 
     ModuleBase::matrix force_gate;
     if (PARAM.inp.gate_flag)
     {
-        force_gate.create(GlobalC::ucell.nat, 3);
-        elecstate::Gatefield::compute_force(GlobalC::ucell, force_gate);
+        force_gate.create(this->nat, 3);
+        elecstate::Gatefield::compute_force(ucell, force_gate);
     }
 
+    int iat = 0;
     for (int ipol = 0; ipol < 3; ipol++)
     {
         double sum = 0.0;
         iat = 0;
 
-        for (int it = 0; it < GlobalC::ucell.ntype; it++)
+        for (int it = 0; it < ucell.ntype; it++)
         {
-            for (int ia = 0; ia < GlobalC::ucell.atoms[it].na; ia++)
+            for (int ia = 0; ia < ucell.atoms[it].na; ia++)
             {
                 force(iat, ipol) = forcelc(iat, ipol) + forceion(iat, ipol) + forcenl(iat, ipol) + forcecc(iat, ipol)
                                    + forcescc(iat, ipol);
@@ -90,8 +93,8 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force,
 
         if (!(PARAM.inp.gate_flag || PARAM.inp.efield_flag))
         {
-            double compen = sum / GlobalC::ucell.nat;
-            for (int iat = 0; iat < GlobalC::ucell.nat; ++iat)
+            double compen = sum / ucell.nat;
+            for (int iat = 0; iat < ucell.nat; ++iat)
             {
                 force(iat, ipol) = force(iat, ipol) - compen;
             }
@@ -106,20 +109,20 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force,
     if (ModuleSymmetry::Symmetry::symm_flag == 1)
     {
         double d1, d2, d3;
-        for (int iat = 0; iat < GlobalC::ucell.nat; iat++)
+        for (int iat = 0; iat < ucell.nat; iat++)
         {
             ModuleBase::Mathzone::Cartesian_to_Direct(force(iat, 0),
                                                       force(iat, 1),
                                                       force(iat, 2),
-                                                      GlobalC::ucell.a1.x,
-                                                      GlobalC::ucell.a1.y,
-                                                      GlobalC::ucell.a1.z,
-                                                      GlobalC::ucell.a2.x,
-                                                      GlobalC::ucell.a2.y,
-                                                      GlobalC::ucell.a2.z,
-                                                      GlobalC::ucell.a3.x,
-                                                      GlobalC::ucell.a3.y,
-                                                      GlobalC::ucell.a3.z,
+                                                      ucell.a1.x,
+                                                      ucell.a1.y,
+                                                      ucell.a1.z,
+                                                      ucell.a2.x,
+                                                      ucell.a2.y,
+                                                      ucell.a2.z,
+                                                      ucell.a3.x,
+                                                      ucell.a3.y,
+                                                      ucell.a3.z,
                                                       d1,
                                                       d2,
                                                       d3);
@@ -129,20 +132,20 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force,
             force(iat, 2) = d3;
         }
         p_symm->symmetrize_vec3_nat(force.c);
-        for (int iat = 0; iat < GlobalC::ucell.nat; iat++)
+        for (int iat = 0; iat < ucell.nat; iat++)
         {
             ModuleBase::Mathzone::Direct_to_Cartesian(force(iat, 0),
                                                       force(iat, 1),
                                                       force(iat, 2),
-                                                      GlobalC::ucell.a1.x,
-                                                      GlobalC::ucell.a1.y,
-                                                      GlobalC::ucell.a1.z,
-                                                      GlobalC::ucell.a2.x,
-                                                      GlobalC::ucell.a2.y,
-                                                      GlobalC::ucell.a2.z,
-                                                      GlobalC::ucell.a3.x,
-                                                      GlobalC::ucell.a3.y,
-                                                      GlobalC::ucell.a3.z,
+                                                      ucell.a1.x,
+                                                      ucell.a1.y,
+                                                      ucell.a1.z,
+                                                      ucell.a2.x,
+                                                      ucell.a2.y,
+                                                      ucell.a2.z,
+                                                      ucell.a3.x,
+                                                      ucell.a3.y,
+                                                      ucell.a3.z,
                                                       d1,
                                                       d2,
                                                       d3);
@@ -153,271 +156,110 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force,
     }
 
     GlobalV::ofs_running << setiosflags(std::ios::fixed) << std::setprecision(6) << std::endl;
-    if (PARAM.inp.test_force)
-    {
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "LOCAL    FORCE (Ry/Bohr)", forcelc);
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "NONLOCAL FORCE (Ry/Bohr)", forcenl);
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "NLCC     FORCE (Ry/Bohr)", forcecc);
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "ION      FORCE (Ry/Bohr)", forceion);
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "SCC      FORCE (Ry/Bohr)", forcescc);
-        if (PARAM.inp.efield_flag)
-        {
-            ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "EFIELD   FORCE (Ry/Bohr)", force_e);
-        }
-        if (PARAM.inp.gate_flag)
-        {
-            ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "GATEFIELD   FORCE (Ry/Bohr)", force_gate);
-        }
-    }
 
     // output force in unit eV/Angstrom
     GlobalV::ofs_running << std::endl;
 
     if (PARAM.inp.test_force)
     {
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "LOCAL    FORCE (eV/Angstrom)", forcelc, false);
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "NONLOCAL FORCE (eV/Angstrom)", forcenl, false);
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "NLCC     FORCE (eV/Angstrom)", forcecc, false);
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "ION      FORCE (eV/Angstrom)", forceion, false);
-        ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "SCC      FORCE (eV/Angstrom)", forcescc, false);
+        ModuleIO::print_force(GlobalV::ofs_running, ucell, "LOCAL    FORCE (eV/Angstrom)", forcelc, false);
+        ModuleIO::print_force(GlobalV::ofs_running, ucell, "NONLOCAL FORCE (eV/Angstrom)", forcenl, false);
+        ModuleIO::print_force(GlobalV::ofs_running, ucell, "NLCC     FORCE (eV/Angstrom)", forcecc, false);
+        ModuleIO::print_force(GlobalV::ofs_running, ucell, "ION      FORCE (eV/Angstrom)", forceion, false);
+        ModuleIO::print_force(GlobalV::ofs_running, ucell, "SCC      FORCE (eV/Angstrom)", forcescc, false);
         if (PARAM.inp.efield_flag)
         {
-            ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "EFIELD   FORCE (eV/Angstrom)", force_e, false);
+            ModuleIO::print_force(GlobalV::ofs_running, ucell, "EFIELD   FORCE (eV/Angstrom)", force_e, false);
         }
         if (PARAM.inp.gate_flag)
         {
             ModuleIO::print_force(GlobalV::ofs_running,
-                                  GlobalC::ucell,
+                                  ucell,
                                   "GATEFIELD   FORCE (eV/Angstrom)",
                                   force_gate,
                                   false);
         }
     }
-    ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "TOTAL-FORCE (eV/Angstrom)", force, false);
-    ModuleBase::timer::tick("Sto_Force", "cal_force");
+    ModuleIO::print_force(GlobalV::ofs_running, ucell, "TOTAL-FORCE (eV/Angstrom)", force, false);
+    ModuleBase::timer::tick("Sto_Forces", "cal_force");
     return;
 }
 
-void Sto_Forces::cal_sto_force_nl(ModuleBase::matrix& forcenl,
-                                  const ModuleBase::matrix& wg,
-                                  K_Vectors* p_kv,
-                                  ModulePW::PW_Basis_K* wfc_basis,
-                                  const psi::Psi<std::complex<double>>* psi_in,
-                                  Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf)
+template <typename FPTYPE, typename Device>
+void Sto_Forces<FPTYPE, Device>::cal_sto_force_nl(
+    ModuleBase::matrix& forcenl,
+    const ModuleBase::matrix& wg,
+    K_Vectors* p_kv,
+    ModulePW::PW_Basis_K* wfc_basis,
+    const Structure_Factor* p_sf,
+    const pseudopot_cell_vnl& nlpp,
+    const UnitCell& ucell,
+    const psi::Psi<std::complex<FPTYPE>, Device>& psi_in,
+    const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf)
 {
     ModuleBase::TITLE("Sto_Forces", "cal_force_nl");
-    ModuleBase::timer::tick("Sto_Forces", "cal_force_nl");
-
-    const int nkb = GlobalC::ppcell.nkb;
-    int* nchip = stowf.nchip;
+    const int nkb = nlpp.nkb;
     if (nkb == 0)
     {
-        return; // mohan add 2010-07-25
+        return;
     }
 
+    ModuleBase::timer::tick("Sto_Forces", "cal_force_nl");
+
+    const int* nchip = stowf.nchip;
     const int npwx = wfc_basis->npwk_max;
-    // vkb1: |Beta(nkb,npw)><Beta(nkb,npw)|psi(nbnd,npw)>
-    ModuleBase::ComplexMatrix vkb1(nkb, npwx);
-    int nksbands = psi_in->get_nbands();
+    int nksbands = psi_in.get_nbands();
     if (GlobalV::MY_STOGROUP != 0)
     {
         nksbands = 0;
     }
 
+   // allocate memory for the force
+    FPTYPE* force = nullptr;
+    resmem_var_op()(this->ctx, force, ucell.nat * 3);
+    base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, force, 0.0, ucell.nat * 3);
+
+    hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(&nlpp, &ucell, p_kv, wfc_basis, p_sf, wg, nullptr);
+
+
     for (int ik = 0; ik < wfc_basis->nks; ik++)
     {
         const int nstobands = nchip[ik];
-        const int nbandstot = nstobands + nksbands;
+        const int max_nbands = stowf.shchi->get_nbands() + nksbands;
         const int npw = wfc_basis->npwk[ik];
+        psi_in.fix_k(ik);
+        stowf.shchi->fix_k(ik);
 
-        // dbecp: conj( -iG * <Beta(nkb,npw)|psi(nbnd,npw)> )
-        ModuleBase::ComplexArray dbecp(3, nbandstot, nkb);
-        ModuleBase::ComplexMatrix becp(nbandstot, nkb);
-        const int current_spin = p_kv->isk[ik];
+        nl_tools.cal_vkb(ik, max_nbands); // vkb has dimension of nkb * max_nbands * npol
 
-        // generate vkb
-        if (GlobalC::ppcell.nkb > 0)
-        {
-            GlobalC::ppcell.getvnl(ik, GlobalC::ppcell.vkb);
-        }
+        // calculate becp = <psi|beta> for all beta functions
+        nl_tools.cal_becp(ik, nksbands, psi_in.get_pointer(), 0);
+        nl_tools.cal_becp(ik, nstobands, stowf.shchi->get_pointer(), nksbands);
+        nl_tools.reduce_pool_becp(max_nbands);
 
-        // get becp according to wave functions and vkb
-        // important here ! becp must set zero!!
-        // vkb: Beta(nkb,npw)
-        // becp(nkb,nbnd): <Beta(nkb,npw)|psi(nbnd,npw)>
-        becp.zero_out();
-        char transa = 'C';
-        char transb = 'N';
-        psi_in->fix_k(ik);
-        stowf.shchi->fix_k(ik);
-        // KS orbitals
-        int npmks = PARAM.globalv.npol * nksbands;
-        zgemm_(&transa,
-               &transb,
-               &nkb,
-               &npmks,
-               &npw,
-               &ModuleBase::ONE,
-               GlobalC::ppcell.vkb.c,
-               &npwx,
-               psi_in->get_pointer(),
-               &npwx,
-               &ModuleBase::ZERO,
-               becp.c,
-               &nkb);
-        // stochastic orbitals
-        int npmsto = PARAM.globalv.npol * nstobands;
-        zgemm_(&transa,
-               &transb,
-               &nkb,
-               &npmsto,
-               &npw,
-               &ModuleBase::ONE,
-               GlobalC::ppcell.vkb.c,
-               &npwx,
-               stowf.shchi->get_pointer(),
-               &npwx,
-               &ModuleBase::ZERO,
-               &becp(nksbands, 0),
-               &nkb);
-
-        Parallel_Reduce::reduce_pool(becp.c, becp.size);
-
-        // out.printcm_real("becp",becp,1.0e-4);
-        //  Calculate the derivative of beta,
-        //  |dbeta> =  -ig * |beta>
-        dbecp.zero_out();
         for (int ipol = 0; ipol < 3; ipol++)
         {
-            for (int i = 0; i < nkb; i++)
-            {
-                std::complex<double>* pvkb1 = &vkb1(i, 0);
-                std::complex<double>* pvkb = &GlobalC::ppcell.vkb(i, 0);
-                if (ipol == 0)
-                {
-                    for (int ig = 0; ig < npw; ig++)
-                    {
-                        pvkb1[ig] = pvkb[ig] * ModuleBase::NEG_IMAG_UNIT * wfc_basis->getgcar(ik, ig)[0];
-                    }
-                }
-                if (ipol == 1)
-                {
-                    for (int ig = 0; ig < npw; ig++)
-                    {
-                        pvkb1[ig] = pvkb[ig] * ModuleBase::NEG_IMAG_UNIT * wfc_basis->getgcar(ik, ig)[1];
-                    }
-                }
-                if (ipol == 2)
-                {
-                    for (int ig = 0; ig < npw; ig++)
-                    {
-                        pvkb1[ig] = pvkb[ig] * ModuleBase::NEG_IMAG_UNIT * wfc_basis->getgcar(ik, ig)[2];
-                    }
-                }
-            }
-            // KS orbitals
-            zgemm_(&transa,
-                   &transb,
-                   &nkb,
-                   &npmks,
-                   &npw,
-                   &ModuleBase::ONE,
-                   vkb1.c,
-                   &npwx,
-                   psi_in->get_pointer(),
-                   &npwx,
-                   &ModuleBase::ZERO,
-                   &dbecp(ipol, 0, 0),
-                   &nkb);
-            // stochastic orbitals
-            zgemm_(&transa,
-                   &transb,
-                   &nkb,
-                   &npmsto,
-                   &npw,
-                   &ModuleBase::ONE,
-                   vkb1.c,
-                   &npwx,
-                   stowf.shchi->get_pointer(),
-                   &npwx,
-                   &ModuleBase::ZERO,
-                   &dbecp(ipol, nksbands, 0),
-                   &nkb);
-        } // end ipol
-
-        //		don't need to reduce here, keep dbecp different in each processor,
-        //		and at last sum up all the forces.
-        //		Parallel_Reduce::reduce_complex_double_pool( dbecp.ptr, dbecp.ndata);
-
-        //		double *cf = new double[ucell.nat*3];
-        //		ZEROS(cf, ucell.nat);
-        for (int ib = 0; ib < nbandstot; ib++)
-        {
-            double fac;
-            if (ib < nksbands)
-            {
-                fac = wg(ik, ib) * 2.0 * GlobalC::ucell.tpiba;
-            }
-            else
-            {
-                fac = p_kv->wk[ik] * 2.0 * GlobalC::ucell.tpiba;
-            }
-            int iat = 0;
-            int sum = 0;
-            for (int it = 0; it < GlobalC::ucell.ntype; it++)
-            {
-                const int Nprojs = GlobalC::ucell.atoms[it].ncpp.nh;
-                for (int ia = 0; ia < GlobalC::ucell.atoms[it].na; ia++)
-                {
-                    for (int ip = 0; ip < Nprojs; ip++)
-                    {
-                        double ps = GlobalC::ppcell.deeq(current_spin, iat, ip, ip);
-                        const int inkb = sum + ip;
-                        // out<<"\n ps = "<<ps;
-
-                        for (int ipol = 0; ipol < 3; ipol++)
-                        {
-                            const double dbb = (conj(dbecp(ipol, ib, inkb)) * becp(ib, inkb)).real();
-                            forcenl(iat, ipol) = forcenl(iat, ipol) - ps * fac * dbb;
-                            // cf[iat*3+ipol] += ps * fac * dbb;
-                        }
-                    }
-
-                    // if ( ucell.atoms[it].nbeta > ucell.atoms[it].lmax+1 )    //{zws add 20160110
-                    //{
-                    // cout << " \n multi-projector force calculation ... " << endl;
-                    for (int ip = 0; ip < Nprojs; ip++)
-                    {
-                        const int inkb = sum + ip;
-                        // for (int ip2=0; ip2<Nprojs; ip2++)
-                        for (int ip2 = ip + 1; ip2 < Nprojs; ip2++)
-                        {
-                            // if ( ip != ip2 )
-                            //{
-                            const int jnkb = sum + ip2;
-                            double ps = GlobalC::ppcell.deeq(current_spin, iat, ip2, ip);
-                            for (int ipol = 0; ipol < 3; ipol++)
-                            {
-                                const double dbb = 2.0 * (conj(dbecp(ipol, ib, inkb)) * becp(ib, jnkb)).real();
-                                // const double dbb = ( conj( dbecp( inkb, ib, ipol) ) * becp( jnkb, ib) ).real();
-                                forcenl(iat, ipol) = forcenl(iat, ipol) - ps * fac * dbb;
-                                // cf[iat*3+ipol] += ps * fac * dbb;
-                            }
-                            //}
-                        }
-                    }
-                    //}    //}zws add 20160110
-
-                    ++iat;
-                    sum += Nprojs;
-                }
-            } // end it
-        }     // end band
-    }         // end ik
+            nl_tools.cal_vkb_deri_f(ik, max_nbands, ipol); // vkb_deri has dimension of nkb * max_nbands * npol
+            // calculate dbecp = <psi|\nabla beta> for all beta functions
+            nl_tools.cal_dbecp_f(ik, max_nbands, nksbands, ipol, psi_in.get_pointer(), 0);
+            nl_tools.cal_dbecp_f(ik, max_nbands, nstobands, ipol, stowf.shchi->get_pointer(), nksbands);
+            nl_tools.revert_vkb(ik, ipol);
+        }
+        nl_tools.cal_force(ik, max_nbands, nksbands, true, force, 0);
+        nl_tools.cal_force(ik, max_nbands, nstobands, false, force, nksbands);
+    } // end ik
 
+    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcenl.c, force, forcenl.nr * forcenl.nc);
+    delmem_var_op()(this->ctx, force);
     // sum up forcenl from all processors
     Parallel_Reduce::reduce_all(forcenl.c, forcenl.nr * forcenl.nc);
 
+    
     ModuleBase::timer::tick("Sto_Forces", "cal_force_nl");
     return;
 }
+
+template class Sto_Forces<double, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class Sto_Forces<double, base_device::DEVICE_GPU>;
+#endif
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_forces.h b/source/module_hamilt_pw/hamilt_stodft/sto_forces.h
index 2655821f0b..e6620a7791 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_forces.h
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_forces.h
@@ -5,7 +5,8 @@
 #include "module_psi/psi.h"
 #include "sto_wf.h"
 
-class Sto_Forces : public Forces<double>
+template <typename FPTYPE, typename Device = base_device::DEVICE_CPU>
+class Sto_Forces : public Forces<FPTYPE, Device>
 {
   public:
     /* This routine is a driver routine which compute the forces
@@ -17,26 +18,36 @@ class Sto_Forces : public Forces<double>
      * (4) cal_nl: contribution due to the non-local pseudopotential.
      * (4) cal_scc: contributino due to incomplete SCF calculation.
      */
-    Sto_Forces(const int nat_in) : Forces(nat_in){};
+    Sto_Forces(const int nat_in) : Forces<FPTYPE, Device>(nat_in){};
     ~Sto_Forces(){};
 
     void cal_stoforce(ModuleBase::matrix& force,
                       const elecstate::ElecState& elec,
                       ModulePW::PW_Basis* rho_basis,
                       ModuleSymmetry::Symmetry* p_symm,
-                      Structure_Factor* p_sf,
+                      const Structure_Factor* p_sf,
                       K_Vectors* pkv,
                       ModulePW::PW_Basis_K* wfc_basis,
-                      const psi::Psi<std::complex<double>>* psi_in,
-                      Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf);
+                      const pseudopot_cell_vnl& nlpp,
+                      UnitCell& ucell,
+                      const psi::Psi<std::complex<FPTYPE>, Device>& psi_in,
+                      const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf);
 
   private:
     void cal_sto_force_nl(ModuleBase::matrix& forcenl,
                           const ModuleBase::matrix& wg,
                           K_Vectors* p_kv,
                           ModulePW::PW_Basis_K* wfc_basis,
-                          const psi::Psi<complex<double>>* psi_in,
-                          Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf);
+                          const Structure_Factor* p_sf,
+                          const pseudopot_cell_vnl& nlpp,
+                          const UnitCell& ucell,
+                          const psi::Psi<std::complex<FPTYPE>, Device>& psi_in,
+                          const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf);
+
+    using resmem_var_op = base_device::memory::resize_memory_op<FPTYPE, Device>;
+    using delmem_var_op = base_device::memory::delete_memory_op<FPTYPE, Device>;
+    using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<FPTYPE, Device, base_device::DEVICE_CPU>;
+    using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, Device>;
 };
 
 #endif
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp
index 4983ab1a6f..0f450bdc31 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp
@@ -55,6 +55,7 @@ template <typename T, typename Device>
 void Stochastic_Iter<T, Device>::orthog(const int& ik, psi::Psi<T, Device>& psi, Stochastic_WF<T, Device>& stowf)
 {
     ModuleBase::TITLE("Stochastic_Iter", "orthog");
+    ModuleBase::timer::tick("Stochastic_Iter", "orthog");
     // orthogonal part
     if (PARAM.inp.nbands > 0)
     {
@@ -110,6 +111,7 @@ void Stochastic_Iter<T, Device>::orthog(const int& ik, psi::Psi<T, Device>& psi,
                                       npwx);
         delmem_complex_op()(this->ctx, sum);
     }
+    ModuleBase::timer::tick("Stochastic_Iter", "orthog");
 }
 
 template <typename T, typename Device>
@@ -119,6 +121,7 @@ void Stochastic_Iter<T, Device>::checkemm(const int& ik,
                                           Stochastic_WF<T, Device>& stowf)
 {
     ModuleBase::TITLE("Stochastic_Iter", "checkemm");
+    ModuleBase::timer::tick("Stochastic_Iter", "checkemm");
     // iter = 1,2,...   istep = 0,1,2,...
     //  if( istep%PARAM.inp.initsto_freq != 0 )    return;
     const int npw = stowf.ngk[ik];
@@ -192,6 +195,7 @@ void Stochastic_Iter<T, Device>::checkemm(const int& ik,
         }
         change = false;
     }
+    ModuleBase::timer::tick("Stochastic_Iter", "checkemm");
 }
 
 template <typename T, typename Device>
@@ -412,6 +416,7 @@ void Stochastic_Iter<T, Device>::calPn(const int& ik, Stochastic_WF<T, Device>&
 template <typename T, typename Device>
 double Stochastic_Iter<T, Device>::calne(elecstate::ElecState* pes)
 {
+    ModuleBase::TITLE("Stochastic_Iter", "calne");
     ModuleBase::timer::tick("Stochastic_Iter", "calne");
     double totne = 0;
     KS_ne = 0;
@@ -455,12 +460,15 @@ double Stochastic_Iter<T, Device>::calne(elecstate::ElecState* pes)
 template <typename T, typename Device>
 void Stochastic_Iter<T, Device>::calHsqrtchi(Stochastic_WF<T, Device>& stowf)
 {
+    ModuleBase::TITLE("Stochastic_Iter", "calHsqrtchi");
+    ModuleBase::timer::tick("Stochastic_Iter", "calHsqrtchi");
     auto nroot_fd = std::bind(&Sto_Func<double>::nroot_fd, &this->stofunc, std::placeholders::_1);
     p_che->calcoef_real(nroot_fd);
     for (int ik = 0; ik < this->pkv->get_nks(); ++ik)
     {
         this->calTnchi_ik(ik, stowf);
     }
+    ModuleBase::timer::tick("Stochastic_Iter", "calHsqrtchi");
 }
 
 template <typename T, typename Device>
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp
index 73205edecd..1ea3b51321 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp
@@ -5,19 +5,22 @@
 #include "module_hamilt_pw/hamilt_pwdft/structure_factor.h"
 #include "module_io/output_log.h"
 #include "module_parameter/parameter.h"
-
-void Sto_Stress_PW::cal_stress(ModuleBase::matrix& sigmatot,
-                               const elecstate::ElecState& elec,
-                               ModulePW::PW_Basis* rho_basis,
-                               ModuleSymmetry::Symmetry* p_symm,
-                               Structure_Factor* p_sf,
-                               K_Vectors* p_kv,
-                               ModulePW::PW_Basis_K* wfc_basis,
-                               const psi::Psi<complex<double>>* psi_in,
-                               Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf,
-                               const Charge* const chr,
-                               pseudopot_cell_vnl* nlpp_in,
-                               const UnitCell& ucell_in)
+#include "module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h"
+#include "module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h"
+
+template <typename FPTYPE, typename Device>
+void Sto_Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
+                                               const elecstate::ElecState& elec,
+                                               ModulePW::PW_Basis* rho_basis,
+                                               ModuleSymmetry::Symmetry* p_symm,
+                                               Structure_Factor* p_sf,
+                                               K_Vectors* p_kv,
+                                               ModulePW::PW_Basis_K* wfc_basis,
+                                               const psi::Psi<complex<FPTYPE>, Device>& psi_in,
+                                               const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf,
+                                               const Charge* const chr,
+                                               pseudopot_cell_vnl* nlpp,
+                                               const UnitCell& ucell_in)
 {
     ModuleBase::TITLE("Sto_Stress_PW", "cal_stress");
     ModuleBase::timer::tick("Sto_Stress_PW", "cal_stress");
@@ -33,29 +36,29 @@ void Sto_Stress_PW::cal_stress(ModuleBase::matrix& sigmatot,
     ModuleBase::matrix sigmaxcc(3, 3);
 
     // kinetic contribution
-    sto_stress_kin(sigmakin, wg, p_symm, p_kv, wfc_basis, psi_in, stowf);
+    this->sto_stress_kin(sigmakin, wg, p_symm, p_kv, wfc_basis, psi_in, stowf);
 
     // hartree contribution
-    stress_har(sigmahar, rho_basis, true, chr);
+    this->stress_har(sigmahar, rho_basis, true, chr);
 
     // ewald contribution
-    stress_ewa(sigmaewa, rho_basis, true);
+    this->stress_ewa(sigmaewa, rho_basis, true);
 
     // xc contribution: add gradient corrections(non diagonal)
     for (int i = 0; i < 3; ++i)
     {
         sigmaxc(i, i) = -(elec.f_en.etxc - elec.f_en.vtxc) / this->ucell->omega;
     }
-    stress_gga(sigmaxc, rho_basis, chr);
+    this->stress_gga(sigmaxc, rho_basis, chr);
 
     // local contribution
-    stress_loc(sigmaloc, rho_basis, p_sf, true, chr);
+    this->stress_loc(sigmaloc, rho_basis, p_sf, true, chr);
 
     // nlcc
-    stress_cc(sigmaxcc, rho_basis, p_sf, true, chr);
+    this->stress_cc(sigmaxcc, rho_basis, p_sf, true, chr);
 
     // nonlocal
-    sto_stress_nl(sigmanl, wg, p_sf, p_symm, p_kv, wfc_basis, psi_in, stowf, nlpp_in);
+    this->sto_stress_nl(sigmanl, wg, p_sf, p_symm, p_kv, wfc_basis, *nlpp, ucell_in, psi_in, stowf);
 
     for (int ipol = 0; ipol < 3; ++ipol)
     {
@@ -77,9 +80,10 @@ void Sto_Stress_PW::cal_stress(ModuleBase::matrix& sigmatot,
 
     if (PARAM.inp.test_stress)
     {
+        ry = true;
         GlobalV::ofs_running << "\n PARTS OF STRESS: " << std::endl;
-        GlobalV::ofs_running << setiosflags(std::ios::showpos);
-        GlobalV::ofs_running << setiosflags(std::ios::fixed) << std::setprecision(10) << std::endl;
+        GlobalV::ofs_running << std::setiosflags(std::ios::showpos);
+        GlobalV::ofs_running << std::setiosflags(std::ios::fixed) << std::setprecision(10) << std::endl;
         ModuleIO::print_stress("KINETIC    STRESS", sigmakin, PARAM.inp.test_stress, ry);
         ModuleIO::print_stress("LOCAL    STRESS", sigmaloc, PARAM.inp.test_stress, ry);
         ModuleIO::print_stress("HARTREE    STRESS", sigmahar, PARAM.inp.test_stress, ry);
@@ -93,394 +97,145 @@ void Sto_Stress_PW::cal_stress(ModuleBase::matrix& sigmatot,
     return;
 }
 
-void Sto_Stress_PW::sto_stress_kin(ModuleBase::matrix& sigma,
-                                   const ModuleBase::matrix& wg,
-                                   ModuleSymmetry::Symmetry* p_symm,
-                                   K_Vectors* p_kv,
-                                   ModulePW::PW_Basis_K* wfc_basis,
-                                   const psi::Psi<complex<double>>* psi_in,
-                                   Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf)
+template <typename FPTYPE, typename Device>
+void Sto_Stress_PW<FPTYPE, Device>::sto_stress_kin(ModuleBase::matrix& sigma,
+                                                   const ModuleBase::matrix& wg,
+                                                   ModuleSymmetry::Symmetry* p_symm,
+                                                   K_Vectors* p_kv,
+                                                   ModulePW::PW_Basis_K* wfc_basis,
+                                                   const psi::Psi<std::complex<FPTYPE>, Device>& psi,
+                                                   const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf)
 {
-    ModuleBase::TITLE("Sto_Stress_PW", "cal_stress");
-    ModuleBase::timer::tick("Sto_Stress_PW", "cal_stress");
-    double** gk;
-    gk = new double*[3];
-    ModuleBase::matrix s_kin(3, 3, true);
+    ModuleBase::TITLE("Sto_Stress_PW", "stress_kin");
+    ModuleBase::timer::tick("Sto_Stress_PW", "stress_kin");
 
-    const int npwx = wfc_basis->npwk_max;
-    const int nks = wfc_basis->nks;
-    gk[0] = new double[npwx];
-    gk[1] = new double[npwx];
-    gk[2] = new double[npwx];
-    double tpiba = ModuleBase::TWO_PI / this->ucell->lat0;
-    double twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI);
-    double* kfac = new double[npwx];
-    int nksbands = psi_in->get_nbands();
+    int nksbands = psi.get_nbands();
     if (GlobalV::MY_STOGROUP != 0)
     {
         nksbands = 0;
     }
+    
+    hamilt::FS_Kin_tools<FPTYPE, Device> kin_tool(*this->ucell, p_kv, wfc_basis, wg);
 
-    for (int ik = 0; ik < nks; ++ik)
+    for (int ik = 0; ik < wfc_basis->nks; ++ik)
     {
-        const int nstobands = stowf.nchip[ik];
-        const int nbandstot = nstobands + nksbands;
-        const int npw = wfc_basis->npwk[ik];
-        for (int i = 0; i < npw; ++i)
-        {
-            gk[0][i] = wfc_basis->getgpluskcar(ik, i)[0] * tpiba;
-            gk[1][i] = wfc_basis->getgpluskcar(ik, i)[1] * tpiba;
-            gk[2][i] = wfc_basis->getgpluskcar(ik, i)[2] * tpiba;
-            if (wfc_basis->erf_height > 0)
-            {
-                double gk2 = gk[0][i] * gk[0][i] + gk[1][i] * gk[1][i] + gk[2][i] * gk[2][i];
-                double arg = (gk2 - wfc_basis->erf_ecut) / wfc_basis->erf_sigma;
-                kfac[i] = 1.0 + wfc_basis->erf_height / wfc_basis->erf_sigma * twobysqrtpi * std::exp(-arg * arg);
-            }
-            else
-            {
-                kfac[i] = 1.0;
-            }
-        }
-
-        // kinetic contribution
-
-        for (int l = 0; l < 3; ++l)
-        {
-            for (int m = 0; m < l + 1; ++m)
-            {
-                for (int ibnd = 0; ibnd < nbandstot; ++ibnd)
-                {
-                    if (ibnd < nksbands)
-                    {
-                        for (int i = 0; i < npw; ++i)
-                        {
-                            std::complex<double> p = psi_in->operator()(ik, ibnd, i);
-                            double np = p.real() * p.real() + p.imag() * p.imag();
-                            s_kin(l, m) += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] * np;
-                        }
-                    }
-                    else
-                    {
-                        for (int i = 0; i < npw; ++i)
-                        {
-                            std::complex<double> p = stowf.shchi->operator()(ik, ibnd - nksbands, i);
-                            double np = p.real() * p.real() + p.imag() * p.imag();
-                            s_kin(l, m) += p_kv->wk[ik] * gk[l][i] * gk[m][i] * kfac[i] * np;
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    for (int l = 0; l < 3; ++l)
-    {
-        for (int m = 0; m < l; ++m)
-        {
-            s_kin(m, l) = s_kin(l, m);
-        }
-    }
+        const int stobands = stowf.nchip[ik];
+        psi.fix_k(ik);
+        stowf.shchi->fix_k(ik);
+        
+        kin_tool.cal_gk(ik);
 
-    for (int l = 0; l < 3; ++l)
-    {
-        for (int m = 0; m < 3; ++m)
-        {
-            s_kin(l, m) *= ModuleBase::e2 / this->ucell->omega;
-        }
+        kin_tool.cal_stress_kin(ik, nksbands, true, psi.get_pointer());
+        kin_tool.cal_stress_kin(ik, stobands, false, stowf.shchi->get_pointer());
     }
 
-    Parallel_Reduce::reduce_all(s_kin.c, 9);
-
-    for (int l = 0; l < 3; ++l)
-    {
-        for (int m = 0; m < 3; ++m)
-        {
-            sigma(l, m) = s_kin(l, m);
-        }
-    }
-    // do symmetry
-    if (ModuleSymmetry::Symmetry::symm_flag == 1)
-    {
-        p_symm->symmetrize_mat3(sigma, this->ucell->lat);
-    }
-    delete[] gk[0];
-    delete[] gk[1];
-    delete[] gk[2];
-    delete[] gk;
-    delete[] kfac;
-    ModuleBase::timer::tick("Sto_Stress_PW", "cal_stress");
+    kin_tool.symmetrize_stress(p_symm, sigma);
+    ModuleBase::timer::tick("Sto_Stress_PW", "stress_kin");
 
     return;
 }
 
-void Sto_Stress_PW::sto_stress_nl(ModuleBase::matrix& sigma,
-                                  const ModuleBase::matrix& wg,
-                                  Structure_Factor* p_sf,
-                                  ModuleSymmetry::Symmetry* p_symm,
-                                  K_Vectors* p_kv,
-                                  ModulePW::PW_Basis_K* wfc_basis,
-                                  const psi::Psi<complex<double>>* psi_in,
-                                  Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf,
-                                  pseudopot_cell_vnl* nlpp_in)
+template <typename FPTYPE, typename Device>
+void Sto_Stress_PW<FPTYPE, Device>::sto_stress_nl(ModuleBase::matrix& sigma,
+                                                  const ModuleBase::matrix& wg,
+                                                  Structure_Factor* p_sf,
+                                                  ModuleSymmetry::Symmetry* p_symm,
+                                                  K_Vectors* p_kv,
+                                                  ModulePW::PW_Basis_K* wfc_basis,
+                                                  const pseudopot_cell_vnl& nlpp,
+                                                  const UnitCell& ucell,
+                                                  const psi::Psi<std::complex<FPTYPE>, Device>& psi_in,
+                                                  const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf)
 {
     ModuleBase::TITLE("Sto_Stress_Func", "stres_nl");
-    ModuleBase::timer::tick("Sto_Stress_Func", "stres_nl");
-
-    this->nlpp = nlpp_in;
-    const int nkb = this->nlpp->nkb;
+    const int nkb = nlpp.nkb;
     if (nkb == 0)
     {
-        ModuleBase::timer::tick("Stress_Func", "stres_nl");
         return;
     }
 
-    ModuleBase::matrix sigmanlc(3, 3, true);
+    ModuleBase::timer::tick("Sto_Stress_Func", "stres_nl");
+
     int* nchip = stowf.nchip;
     const int npwx = wfc_basis->npwk_max;
-    int nksbands = psi_in->get_nbands();
+    int nksbands = psi_in.get_nbands();
     if (GlobalV::MY_STOGROUP != 0)
     {
         nksbands = 0;
     }
 
-    // vkb1: |Beta(nkb,npw)><Beta(nkb,npw)|psi(nbnd,npw)>
-    ModuleBase::ComplexMatrix vkb1(nkb, npwx);
-    ModuleBase::ComplexMatrix vkb0[3];
-    for (int i = 0; i < 3; ++i)
-    {
-        vkb0[i].create(nkb, npwx);
-    }
-    ModuleBase::ComplexMatrix vkb2(nkb, npwx);
+    // allocate memory for the stress
+    FPTYPE* stress_device = nullptr;
+    resmem_var_op()(this->ctx, stress_device, 9);
+    setmem_var_op()(this->ctx, stress_device, 0, 9);
+    std::vector<FPTYPE> sigmanlc(9, 0.0);
+
+    hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(&nlpp, &ucell, p_kv, wfc_basis, p_sf, wg, nullptr);
 
     for (int ik = 0; ik < p_kv->get_nks(); ik++)
     {
-        const int nstobands = stowf.nchip[ik];
-        const int nbandstot = nstobands + nksbands;
-        const int npw = p_kv->ngk[ik];
-        // dbecp: conj( -iG * <Beta(nkb,npw)|psi(nbnd,npw)> )
-        ModuleBase::ComplexMatrix dbecp(nbandstot, nkb);
-        ModuleBase::ComplexMatrix becp(nbandstot, nkb);
-
-        const int current_spin = p_kv->isk[ik];
-        // generate vkb
-        if (this->nlpp->nkb > 0)
-        {
-            this->nlpp->getvnl(ik, this->nlpp->vkb);
-        }
-
-        // get becp according to wave functions and vkb
-        // important here ! becp must set zero!!
-        // vkb: Beta(nkb,npw)
-        // becp(nkb,nbnd): <Beta(nkb,npw)|psi(nbnd,npw)>
-        becp.zero_out();
-        char transa = 'C';
-        char transb = 'N';
-        psi_in[0].fix_k(ik);
+        const int nstobands = nchip[ik];
+        const int max_nbands = stowf.shchi->get_nbands() + nksbands;
+        const int npw = wfc_basis->npwk[ik];
+        psi_in.fix_k(ik);
         stowf.shchi->fix_k(ik);
-        // KS orbitals
-        int npmks = PARAM.globalv.npol * nksbands;
-        zgemm_(&transa,
-               &transb,
-               &nkb,
-               &npmks,
-               &npw,
-               &ModuleBase::ONE,
-               this->nlpp->vkb.c,
-               &npwx,
-               psi_in->get_pointer(),
-               &npwx,
-               &ModuleBase::ZERO,
-               becp.c,
-               &nkb);
-        // stochastic orbitals
-        int npmsto = PARAM.globalv.npol * nstobands;
-        zgemm_(&transa,
-               &transb,
-               &nkb,
-               &npmsto,
-               &npw,
-               &ModuleBase::ONE,
-               this->nlpp->vkb.c,
-               &npwx,
-               stowf.shchi->get_pointer(),
-               &npwx,
-               &ModuleBase::ZERO,
-               &becp(nksbands, 0),
-               &nkb);
-
-        Parallel_Reduce::reduce_pool(becp.c, becp.size);
-
-        for (int i = 0; i < 3; ++i)
+        nl_tools.cal_vkb(ik, max_nbands);
+        // calculate becp = <psi|beta> for all beta functions
+        nl_tools.cal_becp(ik, nksbands, psi_in.get_pointer(), 0);
+        nl_tools.cal_becp(ik, nstobands, stowf.shchi->get_pointer(), nksbands);
+        nl_tools.reduce_pool_becp(max_nbands);
+        // calculate dbecp = <psi|d(beta)/dR> for all beta functions
+        // calculate stress = \sum <psi|d(beta_j)/dR> * <psi|beta_i> * D_{ij}
+        for (int ipol = 0; ipol < 3; ipol++)
         {
-            get_dvnl1(vkb0[i], ik, i, p_sf, wfc_basis);
-        }
-
-        get_dvnl2(vkb2, ik, p_sf, wfc_basis);
-
-        ModuleBase::Vector3<double> qvec;
-        double* qvec0[3];
-        qvec0[0] = &(qvec.x);
-        qvec0[1] = &(qvec.y);
-        qvec0[2] = &(qvec.z);
-
-        for (int ipol = 0; ipol < 3; ++ipol)
-        {
-            for (int jpol = 0; jpol < ipol + 1; ++jpol)
+            for (int jpol = 0; jpol <= ipol; jpol++)
             {
-                dbecp.zero_out();
-                vkb1.zero_out();
-                for (int i = 0; i < nkb; ++i)
-                {
-                    std::complex<double>* pvkb0i = &vkb0[ipol](i, 0);
-                    std::complex<double>* pvkb0j = &vkb0[jpol](i, 0);
-                    std::complex<double>* pvkb1 = &vkb1(i, 0);
-                    // third term of dbecp_noevc
-                    for (int ig = 0; ig < npw; ++ig)
-                    {
-                        qvec = wfc_basis->getgpluskcar(ik, ig);
-                        pvkb1[ig] += 0.5 * qvec0[ipol][0] * pvkb0j[ig] + 0.5 * qvec0[jpol][0] * pvkb0i[ig];
-                    } // end ig
-                }     // end nkb
-
-                ModuleBase::ComplexMatrix dbecp_noevc(nkb, npwx, true);
-                for (int i = 0; i < nkb; ++i)
-                {
-                    std::complex<double>* pdbecp_noevc = &dbecp_noevc(i, 0);
-                    std::complex<double>* pvkb = &vkb1(i, 0);
-                    // first term
-                    for (int ig = 0; ig < npw; ++ig)
-                    {
-                        pdbecp_noevc[ig] -= 2.0 * pvkb[ig];
-                    }
-                    // second termi
-                    if (ipol == jpol)
-                    {
-                        pvkb = &this->nlpp->vkb(i, 0);
-                        for (int ig = 0; ig < npw; ++ig)
-                        {
-                            pdbecp_noevc[ig] -= pvkb[ig];
-                        }
-                    }
-                    // third term
-                    pvkb = &vkb2(i, 0);
-                    for (int ig = 0; ig < npw; ++ig)
-                    {
-                        qvec = wfc_basis->getgpluskcar(ik, ig);
-                        double qm1;
-                        if (qvec.norm2() > 1e-16)
-                        {
-                            qm1 = 1.0 / qvec.norm();
-                        }
-                        else
-                        {
-                            qm1 = 0;
-                        }
-                        pdbecp_noevc[ig] -= 2.0 * pvkb[ig] * qvec0[ipol][0] * qvec0[jpol][0] * qm1 * this->ucell->tpiba;
-                    } // end ig
-                }     // end i
-
-                //              //KS orbitals
-                zgemm_(&transa,
-                       &transb,
-                       &nkb,
-                       &npmks,
-                       &npw,
-                       &ModuleBase::ONE,
-                       dbecp_noevc.c,
-                       &npwx,
-                       psi_in->get_pointer(),
-                       &npwx,
-                       &ModuleBase::ZERO,
-                       dbecp.c,
-                       &nkb);
-                // stochastic orbitals
-                zgemm_(&transa,
-                       &transb,
-                       &nkb,
-                       &npmsto,
-                       &npw,
-                       &ModuleBase::ONE,
-                       dbecp_noevc.c,
-                       &npwx,
-                       stowf.shchi->get_pointer(),
-                       &npwx,
-                       &ModuleBase::ZERO,
-                       &dbecp(nksbands, 0),
-                       &nkb);
-
-                for (int ib = 0; ib < nbandstot; ++ib)
-                {
-                    double fac;
-                    if (ib < nksbands)
-                    {
-                        fac = wg(ik, ib);
-                    }
-                    else
-                    {
-                        fac = p_kv->wk[ik];
-                    }
-                    int iat = 0;
-                    int sum = 0;
-                    for (int it = 0; it < this->ucell->ntype; ++it)
-                    {
-                        const int Nprojs = this->ucell->atoms[it].ncpp.nh;
-                        for (int ia = 0; ia < this->ucell->atoms[it].na; ++ia)
-                        {
-                            for (int ip = 0; ip < Nprojs; ++ip)
-                            {
-                                double ps = this->nlpp->deeq(current_spin, iat, ip, ip);
-                                const int inkb = sum + ip;
-                                // out<<"\n ps = "<<ps;
-
-                                const double dbb = (conj(dbecp(ib, inkb)) * becp(ib, inkb)).real();
-                                sigmanlc(ipol, jpol) -= ps * fac * dbb;
+                nl_tools.cal_vkb_deri_s(ik, max_nbands, ipol, jpol);
+                nl_tools.cal_dbecp_s(ik, nksbands, psi_in.get_pointer(), 0);
+                nl_tools.cal_dbecp_s(ik, nstobands, stowf.shchi->get_pointer(), nksbands);
+                nl_tools.cal_stress(ik, nksbands, true, ipol, jpol, stress_device, 0);
+                nl_tools.cal_stress(ik, nstobands, false, ipol, jpol, stress_device, nksbands);
+            }
+        }
 
-                            } // end ip
-                            ++iat;
-                            sum += Nprojs;
-                        } // ia
-                    }     // end it
-                }         // end band
-            }             // end jpol
-        }                 // end ipol
-    }                     // end ik
+    }
 
-    for (int l = 0; l < 3; ++l)
+    // transfer stress from device to host
+    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigmanlc.data(), stress_device, 9);
+    delmem_var_op()(this->ctx, stress_device);
+    // sum up forcenl from all processors
+    for (int l = 0; l < 3; l++)
     {
-        for (int m = 0; m < 3; ++m)
+        for (int m = 0; m < 3; m++)
         {
             if (m > l)
             {
-                sigmanlc(l, m) = sigmanlc(m, l);
+                sigmanlc[l * 3 + m] = sigmanlc[m * 3 + l];
             }
         }
     }
     // sum up forcenl from all processors
-    Parallel_Reduce::reduce_all(sigmanlc.c, 9);
+    Parallel_Reduce::reduce_all(sigmanlc.data(), 9);
 
     for (int ipol = 0; ipol < 3; ++ipol)
     {
         for (int jpol = 0; jpol < 3; ++jpol)
         {
-            sigmanlc(ipol, jpol) *= 1.0 / this->ucell->omega;
-        }
-    }
-
-    for (int ipol = 0; ipol < 3; ++ipol)
-    {
-        for (int jpol = 0; jpol < 3; ++jpol)
-        {
-            sigma(ipol, jpol) = sigmanlc(ipol, jpol);
+            sigma(ipol, jpol) = sigmanlc[ipol * 3 + jpol] / ucell.omega;
         }
     }
     // do symmetry
     if (ModuleSymmetry::Symmetry::symm_flag == 1)
     {
-        p_symm->symmetrize_mat3(sigma, this->ucell->lat);
+        p_symm->symmetrize_mat3(sigma, ucell.lat);
     }
 
-    //  this->print(ofs_running, "nonlocal stress", stresnl);
     ModuleBase::timer::tick("Sto_Stress_Func", "stres_nl");
     return;
 }
+
+
+template class Sto_Stress_PW<double, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class Sto_Stress_PW<double, base_device::DEVICE_GPU>;
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.h b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.h
index e3fc3eadd6..7b25166dac 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.h
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.h
@@ -7,7 +7,8 @@
 #include "sto_wf.h"
 // qianrui create 2021-6-4
 
-class Sto_Stress_PW : public Stress_Func<double>
+template <typename FPTYPE, typename Device = base_device::DEVICE_CPU>
+class Sto_Stress_PW : public Stress_Func<FPTYPE, Device>
 {
   public:
     Sto_Stress_PW(){};
@@ -21,8 +22,8 @@ class Sto_Stress_PW : public Stress_Func<double>
                     Structure_Factor* p_sf,
                     K_Vectors* p_kv,
                     ModulePW::PW_Basis_K* wfc_basis,
-                    const psi::Psi<complex<double>>* psi_in,
-                    Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf,
+                    const psi::Psi<std::complex<FPTYPE>, Device>& psi_in,
+                    const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf,
                     const Charge* const chr,
                     pseudopot_cell_vnl* nlpp_in,
                     const UnitCell& ucell_in);
@@ -33,8 +34,8 @@ class Sto_Stress_PW : public Stress_Func<double>
                         ModuleSymmetry::Symmetry* p_symm,
                         K_Vectors* p_kv,
                         ModulePW::PW_Basis_K* wfc_basis,
-                        const psi::Psi<complex<double>>* psi_in,
-                        Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf);
+                        const psi::Psi<std::complex<FPTYPE>, Device>& psi_in,
+                        const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf);
 
     void sto_stress_nl(ModuleBase::matrix& sigma,
                        const ModuleBase::matrix& wg,
@@ -42,8 +43,14 @@ class Sto_Stress_PW : public Stress_Func<double>
                        ModuleSymmetry::Symmetry* p_symm,
                        K_Vectors* p_kv,
                        ModulePW::PW_Basis_K* wfc_basis,
-                       const psi::Psi<complex<double>>* psi_in,
-                       Stochastic_WF<std::complex<double>, base_device::DEVICE_CPU>& stowf,
-                       pseudopot_cell_vnl* nlpp_in);
+                       const pseudopot_cell_vnl& nlpp,
+                       const UnitCell& ucell,
+                       const psi::Psi<std::complex<FPTYPE>, Device>& psi,
+                       const Stochastic_WF<std::complex<FPTYPE>, Device>& stowf);
+  private:
+    using resmem_var_op = base_device::memory::resize_memory_op<FPTYPE, Device>;
+    using setmem_var_op = base_device::memory::set_memory_op<FPTYPE, Device>;
+    using delmem_var_op = base_device::memory::delete_memory_op<FPTYPE, Device>;
+    using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, Device>;
 };
 #endif
diff --git a/source/module_hsolver/hsolver_pw_sdft.cpp b/source/module_hsolver/hsolver_pw_sdft.cpp
index 8698324f79..971873a13b 100644
--- a/source/module_hsolver/hsolver_pw_sdft.cpp
+++ b/source/module_hsolver/hsolver_pw_sdft.cpp
@@ -41,6 +41,7 @@ void HSolverPW_SDFT<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     // part of KSDFT to get KS orbitals
     for (int ik = 0; ik < nks; ++ik)
     {
+        ModuleBase::timer::tick("HSolverPW_SDFT", "solve_KS");
         pHamilt->updateHk(ik);
         if (nbands > 0 && GlobalV::MY_STOGROUP == 0)
         {
@@ -55,10 +56,11 @@ void HSolverPW_SDFT<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
 #ifdef __MPI
         if (nbands > 0 && PARAM.inp.bndpar > 1)
         {
-            Parallel_Common::bcast_complex(this->ctx, &psi(ik, 0, 0), npwx * nbands, PARAPW_WORLD, &psi_cpu(ik, 0, 0));
+            Parallel_Common::bcast_dev(this->ctx, &psi(ik, 0, 0), npwx * nbands, PARAPW_WORLD, &psi_cpu(ik, 0, 0));
             MPI_Bcast(&pes->ekb(ik, 0), nbands, MPI_DOUBLE, 0, PARAPW_WORLD);
         }
 #endif
+        ModuleBase::timer::tick("HSolverPW_SDFT", "solve_KS");
         stoiter.orthog(ik, psi, stowf);
         stoiter.checkemm(ik, istep, iter, stowf); // check and reset emax & emin
     }
diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp
index 7c3997bc22..badf2e4b98 100644
--- a/source/module_io/input_conv.cpp
+++ b/source/module_io/input_conv.cpp
@@ -183,7 +183,7 @@ void Input_Conv::Convert()
 
     if (PARAM.inp.device  == "gpu" && PARAM.inp.basis_type == "pw")
     {
-        GlobalV::KPAR = base_device::information::get_device_kpar(PARAM.inp.kpar);
+        GlobalV::KPAR = base_device::information::get_device_kpar(PARAM.inp.kpar, PARAM.inp.bndpar);
     }
 #ifdef __LCAO
     else if (PARAM.inp.basis_type == "lcao") {
diff --git a/source/module_io/read_input_item_system.cpp b/source/module_io/read_input_item_system.cpp
index fb698ae877..1ceae0b79c 100644
--- a/source/module_io/read_input_item_system.cpp
+++ b/source/module_io/read_input_item_system.cpp
@@ -256,6 +256,12 @@ void ReadInput::item_system()
                 para.input.bndpar = GlobalV::NPROC;
             }
         };
+        item.check_value = [](const Input_Item& item, const Parameter& para) {
+            if (GlobalV::NPROC % para.input.bndpar != 0)
+            {
+                ModuleBase::WARNING_QUIT("ReadInput", "The number of processors can not be divided by bndpar");
+            }
+        };
         this->add_item(item);
     }
     {
diff --git a/source/module_psi/psi.cpp b/source/module_psi/psi.cpp
index a164da46a8..f129d3e422 100644
--- a/source/module_psi/psi.cpp
+++ b/source/module_psi/psi.cpp
@@ -227,7 +227,7 @@ template <typename T, typename Device> T* Psi<T, Device>::get_pointer(const int&
 {
     assert(ikb >= 0);
     assert(this->k_first ? ikb < this->nbands : ikb < this->nk);
-    return &this->psi_current[ikb * this->nbasis];
+    return this->psi_current + ikb * this->nbasis;
 }
 
 template <typename T, typename Device> const int* Psi<T, Device>::get_ngk_pointer() const
diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT
new file mode 100644
index 0000000000..beef5ef8b4
--- /dev/null
+++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT
@@ -0,0 +1,43 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix			autotest
+calculation     md
+esolver_type    sdft
+method_sto      2
+device          gpu
+
+symmetry		0
+pseudo_dir	../../PP_ORB
+
+nbands			0
+nbands_sto		all
+
+nche_sto		90
+seed_sto        20000
+kpar            1
+bndpar          2
+cal_force       1
+cal_stress      1
+
+#Parameters (2.Iteration)
+ecutwfc			30
+scf_thr			1e-4
+scf_nmax		20
+
+
+#Parameters (3.Basis)
+basis_type		pw
+
+#Parameters (4.Smearing)
+smearing_method		fd
+smearing_sigma		0.6
+
+#Parameters (5.Mixing)
+mixing_type		broyden
+mixing_beta		0.4
+
+#MD
+md_tfirst      10
+md_tfreq       0.1
+md_nstep       2
+init_vel       1
diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT
new file mode 100644
index 0000000000..c289c0158a
--- /dev/null
+++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/README b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/README
new file mode 100644
index 0000000000..cea120ea70
--- /dev/null
+++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/README
@@ -0,0 +1,11 @@
+This test for:
+*SDFT
+*MD
+*Si
+*kpoints 1*1*1
+*All complete basis
+*mixing_type broyden
+*mixing_beta 0.4
+*seed_sto > 0
+*sto_method 2
+*bndpar     2
diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU
new file mode 100644
index 0000000000..408b4bd8fa
--- /dev/null
+++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU
@@ -0,0 +1,19 @@
+ATOMIC_SPECIES
+Si 28 Si.pz-vbc.UPF
+
+LATTICE_CONSTANT
+4
+
+LATTICE_VECTORS
+1 0 0 #latvec1
+0 1 0 #latvec2
+0 0 1 #latvec3
+
+ATOMIC_POSITIONS
+Cartesian
+
+Si #label
+0 #magnetism
+2 #number of atoms
+0  0  0  m  1  1  1  v  1.62777250569e-06  -2.01685158301e-05  2.2830936097e-05
+0.5  0.5  0.5  m  1  1  1  v  -1.62777250569e-06  2.01685158301e-05  -2.2830936097e-05
diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd
new file mode 100644
index 0000000000..ad033a1631
--- /dev/null
+++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd
@@ -0,0 +1 @@
+test sdft MD, nstep = 2
diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/result.ref b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/result.ref
new file mode 100644
index 0000000000..ef2e7fcc80
--- /dev/null
+++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/result.ref
@@ -0,0 +1,5 @@
+etotref -228.9492279657028
+etotperatomref -114.4746139829
+totalforceref 0.510260
+totalstressref 42801.474893
+totaltimeref 7.82
diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/INPUT b/tests/integrate/187_PW_SDFT_ALL_GPU/INPUT
new file mode 100644
index 0000000000..a6804500a7
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_ALL_GPU/INPUT
@@ -0,0 +1,38 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix			autotest
+calculation     scf
+esolver_type    sdft
+method_sto      1
+
+symmetry		0
+pseudo_dir	../../PP_ORB
+device          gpu
+
+kpar            2
+bndpar          1
+
+nbands			0
+nbands_sto		all
+
+nche_sto		120
+seed_sto        20000
+cal_force       1
+cal_stress      1
+
+#Parameters (2.Iteration)
+ecutwfc			20
+scf_thr			1e-6
+scf_nmax		30
+
+
+#Parameters (3.Basis)
+basis_type		pw
+
+#Parameters (4.Smearing)
+smearing_method		fd
+smearing_sigma		0.6
+
+#Parameters (5.Mixing)
+mixing_type		plain
+mixing_beta		0.7
diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/KPT b/tests/integrate/187_PW_SDFT_ALL_GPU/KPT
new file mode 100644
index 0000000000..24c3230814
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_ALL_GPU/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+2 2 2 0.5 0.5 0.5
diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/README b/tests/integrate/187_PW_SDFT_ALL_GPU/README
new file mode 100644
index 0000000000..9cd65665cb
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_ALL_GPU/README
@@ -0,0 +1,11 @@
+This test for:
+*SDFT
+*Si
+*kpoints 2*2*2 shift
+*complete orbitals
+*mixing_type plain
+*mixing_beta 0.7
+*method_sto 1
+*seed_sto > 0
+*bndpar  1
+*kpar    2
diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/STRU b/tests/integrate/187_PW_SDFT_ALL_GPU/STRU
new file mode 100644
index 0000000000..28a94e8c0c
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_ALL_GPU/STRU
@@ -0,0 +1,19 @@
+ATOMIC_SPECIES
+Si 28 Si.pz-vbc.UPF
+
+LATTICE_CONSTANT
+5  // add lattice constant
+
+LATTICE_VECTORS
+0.5 0 0.5
+0.5 0.5 0
+0 0.5 0.5
+
+ATOMIC_POSITIONS
+Direct 
+
+Si	// Element type	
+0.0	// magnetism
+2	
+0.10 0.00 0.20 1 1 1
+0.5 0.5 0.5 1 1 1
diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/jd b/tests/integrate/187_PW_SDFT_ALL_GPU/jd
new file mode 100644
index 0000000000..1fb7d0f4b9
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_ALL_GPU/jd
@@ -0,0 +1 @@
+test parallel method kpar and GPU
\ No newline at end of file
diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/result.ref b/tests/integrate/187_PW_SDFT_ALL_GPU/result.ref
new file mode 100644
index 0000000000..dd61868257
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_ALL_GPU/result.ref
@@ -0,0 +1,5 @@
+etotref -105.2612355454176907
+etotperatomref -52.6306177727
+totalforceref 197.906706
+totalstressref 254537.682905
+totaltimeref 20.82
diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/INPUT b/tests/integrate/187_PW_SDFT_MALL_GPU/INPUT
new file mode 100644
index 0000000000..a0855d63f9
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_MALL_GPU/INPUT
@@ -0,0 +1,38 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix			autotest
+calculation     scf
+esolver_type    sdft
+method_sto      2
+device          gpu
+
+symmetry		0
+pseudo_dir	../../PP_ORB
+
+kpar            1
+bndpar          2
+
+nbands			10
+nbands_sto		all
+
+nche_sto		120
+seed_sto        20000
+cal_force       1
+cal_stress      1
+
+#Parameters (2.Iteration)
+ecutwfc			30
+scf_thr			1e-6
+scf_nmax		20
+
+
+#Parameters (3.Basis)
+basis_type		pw
+
+#Parameters (4.Smearing)
+smearing_method		fd
+smearing_sigma		0.6
+
+#Parameters (5.Mixing)
+mixing_type		plain
+mixing_beta		0.7
diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/KPT b/tests/integrate/187_PW_SDFT_MALL_GPU/KPT
new file mode 100644
index 0000000000..c289c0158a
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_MALL_GPU/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/README b/tests/integrate/187_PW_SDFT_MALL_GPU/README
new file mode 100644
index 0000000000..b0fc10b3f9
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_MALL_GPU/README
@@ -0,0 +1,11 @@
+This test for:
+*SDFT
+*Si
+*kpoints 1*1*1
+*10 KS + complete orbitals
+*mixing_type plain
+*mixing_beta 0.7
+*sto_method  2
+*seed_sto > 0
+*bndpar  2
+*kpar    1
diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/STRU b/tests/integrate/187_PW_SDFT_MALL_GPU/STRU
new file mode 100644
index 0000000000..28a94e8c0c
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_MALL_GPU/STRU
@@ -0,0 +1,19 @@
+ATOMIC_SPECIES
+Si 28 Si.pz-vbc.UPF
+
+LATTICE_CONSTANT
+5  // add lattice constant
+
+LATTICE_VECTORS
+0.5 0 0.5
+0.5 0.5 0
+0 0.5 0.5
+
+ATOMIC_POSITIONS
+Direct 
+
+Si	// Element type	
+0.0	// magnetism
+2	
+0.10 0.00 0.20 1 1 1
+0.5 0.5 0.5 1 1 1
diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/jd b/tests/integrate/187_PW_SDFT_MALL_GPU/jd
new file mode 100644
index 0000000000..7356c219e1
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_MALL_GPU/jd
@@ -0,0 +1 @@
+test parallel method bndpar and GPU
diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/result.ref b/tests/integrate/187_PW_SDFT_MALL_GPU/result.ref
new file mode 100644
index 0000000000..dafd960f93
--- /dev/null
+++ b/tests/integrate/187_PW_SDFT_MALL_GPU/result.ref
@@ -0,0 +1,5 @@
+etotref -96.9361115889958853
+etotperatomref -48.4680557945
+totalforceref 248.975546
+totalstressref 230454.805813
+totaltimeref 5.17
diff --git a/tests/integrate/CASES_GPU.txt b/tests/integrate/CASES_GPU.txt
index 641c3c0a36..00de10ccb6 100644
--- a/tests/integrate/CASES_GPU.txt
+++ b/tests/integrate/CASES_GPU.txt
@@ -1,6 +1,9 @@
 102_PW_CG_GPU
 102_PW_DA_davidson_GPU
 102_PW_BPCG_GPU
+187_PW_SDFT_ALL_GPU
+187_PW_SDFT_MALL_GPU
+187_PW_MD_SDFT_ALL_GPU
 930_NO_BI2SE2CU2O2_GPU
 930_NO_BI2SE2CU2O2_k_GPU
 931_NO_H20_GPU