diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 781886e65b..af0cbdf4ac 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -143,6 +143,7 @@
     - [out\_level](#out_level)
     - [out\_alllog](#out_alllog)
     - [out\_mat\_hs](#out_mat_hs)
+    - [out\_mat\_tk](#out_mat_tk)
     - [out\_mat\_r](#out_mat_r)
     - [out\_mat\_hs2](#out_mat_hs2)
     - [out\_mat\_t](#out_mat_t)
@@ -1667,6 +1668,13 @@ These variables are used to control the output of properties.
 - **Description**: Whether to print the upper triangular part of the Hamiltonian matrices (in Ry) and overlap matrices for each k point into files in the directory `OUT.${suffix}`. The second number controls precision. For more information, please refer to [hs_matrix.md](../elec_properties/hs_matrix.md#out_mat_hs). Also controled by [out_interval](#out_interval) and [out_app_flag](#out_app_flag).
 - **Default**: False 8
 
+### out_mat_tk
+
+- **Type**: Boolean \[Integer\](optional)
+- **Availability**: Numerical atomic orbital basis
+- **Description**: Whether to print the upper triangular part of the kinetic matrices (in Ry) for each k point into `OUT.${suffix}/data-i-T`, where i is the index of k points (see `OUT.${suffix}/kpoints`). One may optionally provide a second parameter to specify the precision. 
+- **Default**: False \[8\]
+
 ### out_mat_r
 
 - **Type**: Boolean
diff --git a/source/module_esolver/esolver_ks.h b/source/module_esolver/esolver_ks.h
index feb4d5ee17..36a85eac68 100644
--- a/source/module_esolver/esolver_ks.h
+++ b/source/module_esolver/esolver_ks.h
@@ -70,7 +70,7 @@ class ESolver_KS : public ESolver_FP
         virtual void iter_finish(int& iter);
 
         //! Something to do after SCF iterations when SCF is converged or comes to the max iter step.
-        virtual void after_scf(const int istep);
+        virtual void after_scf(const int istep) override;
 
         //! <Temporary> It should be replaced by a function in Hamilt Class
 		virtual void update_pot(const int istep, const int iter) {};
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index 07d8936fb8..177b791b98 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -26,6 +26,7 @@
 #include "module_elecstate/module_charge/symmetry_rho.h"
 #include "module_elecstate/occupy.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h" // need divide_HS_in_frag
+#include "module_hamilt_lcao/hamilt_lcaodft/hs_matrix_k.hpp"
 #include "module_hamilt_lcao/module_dftu/dftu.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_io/print_info.h"
@@ -994,7 +995,10 @@ void ESolver_KS_LCAO<TK, TR>::iter_finish(int& iter)
                 *this->p_hamilt,
                 *dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM(),
                 this->kv,
-                iter);
+                PARAM.inp.nspin,
+                iter,
+                this->pelec->f_en.etot,
+                this->scf_ene_thr);
         }
         else
         {
@@ -1002,7 +1006,10 @@ void ESolver_KS_LCAO<TK, TR>::iter_finish(int& iter)
                 *this->p_hamilt,
                 *dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM(),
                 this->kv,
-                iter);
+                PARAM.inp.nspin,
+                iter,
+                this->pelec->f_en.etot,
+                this->scf_ene_thr);
         }
     }
 #endif
@@ -1130,11 +1137,10 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(const int istep)
     }
 
 #ifdef __EXX
-    // 4) write Exx matrix
-    if (GlobalC::exx_info.info_global.cal_exx) // Peize Lin add if 2022.11.14
+    // 4) write Hexx matrix for NSCF (see `out_chg` in docs/advanced/input_files/input-main.md)
+    if (GlobalC::exx_info.info_global.cal_exx && PARAM.inp.out_chg[0] && istep % PARAM.inp.out_interval == 0) // Peize Lin add if 2022.11.14
     {
-        const std::string file_name_exx = GlobalV::global_out_dir + "HexxR"
-                                          + std::to_string(GlobalV::MY_RANK);
+        const std::string file_name_exx = GlobalV::global_out_dir + "HexxR" + std::to_string(GlobalV::MY_RANK);
         if (GlobalC::exx_info.info_ri.real_number) {
             ModuleIO::write_Hexxs_csr(file_name_exx, GlobalC::ucell, this->exd->get_Hexxs());
         } else {
@@ -1255,6 +1261,38 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(const int istep)
                        GlobalV::NPROC);
         tqo.calculate();
     }
+
+    if (PARAM.inp.out_mat_tk[0])
+    {
+        hamilt::HS_Matrix_K<TK> hsk(&pv, true);
+        hamilt::HContainer<TR> hR(&pv);
+        hamilt::Operator<TK>* ekinetic = 
+            new hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>(&hsk,
+                                                                  this->kv.kvec_d,
+                                                                  &hR,
+                                                                  &GlobalC::ucell,
+                                                                  &GlobalC::GridD,
+                                                                  two_center_bundle_.kinetic_orb.get());
+
+        const int nspin_k = (GlobalV::NSPIN == 2 ? 2 : 1);
+        for (int ik = 0; ik < this->kv.get_nks() / nspin_k; ++ik)
+        {
+            ekinetic->init(ik);
+            ModuleIO::save_mat(0,
+                               hsk.get_hk(),
+                               GlobalV::NLOCAL,
+                               false,
+                               PARAM.inp.out_mat_tk[1],
+                               1,
+                               GlobalV::out_app_flag,
+                               "T",
+                               "data-" + std::to_string(ik),
+                               this->pv,
+                               GlobalV::DRANK);
+        }
+
+        delete ekinetic;
+    }
 }
 
 //------------------------------------------------------------------------------
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
index 9efc766902..fc996fad52 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
@@ -760,7 +760,7 @@ void Force_Stress_LCAO<T>::calForcePwPart(ModuleBase::matrix& fvl_dvl,
     //--------------------------------------------------------
     // force due to core correlation.
     //--------------------------------------------------------
-    f_pw.cal_force_cc(fcc, rhopw, chr);
+    f_pw.cal_force_cc(fcc, rhopw, chr,GlobalC::ucell);
     //--------------------------------------------------------
     // force due to self-consistent charge.
     //--------------------------------------------------------
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index cb84387efa..6ea8e894cd 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -157,7 +157,7 @@ void Forces<FPTYPE, Device>::cal_force(ModuleBase::matrix& force,
     // not relevant for PAW
     if (!GlobalV::use_paw)
     {
-        Forces::cal_force_cc(forcecc, rho_basis, chr);
+        Forces::cal_force_cc(forcecc, rho_basis, chr,GlobalC::ucell);
     }
     else
     {
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h
index 8c734932a1..0247fbfae7 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h
@@ -48,7 +48,7 @@ class Forces
 
     void cal_force_loc(ModuleBase::matrix& forcelc, ModulePW::PW_Basis* rho_basis, const Charge* const chr);
     void cal_force_ew(ModuleBase::matrix& forceion, ModulePW::PW_Basis* rho_basis, const Structure_Factor* p_sf);
-    void cal_force_cc(ModuleBase::matrix& forcecc, ModulePW::PW_Basis* rho_basis, const Charge* const chr);
+    void cal_force_cc(ModuleBase::matrix& forcecc, ModulePW::PW_Basis* rho_basis, const Charge* const chr, UnitCell& ucell_in);
     /**
      * @brief This routine computes the atomic force of non-local pseudopotential
      *    F^{NL}_i = \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}D_{l,l'}^{I} [
@@ -86,7 +86,8 @@ class Forces
                      const FPTYPE* rhoc,
                      FPTYPE* drhocg,
                      ModulePW::PW_Basis* rho_basis,
-                     int type); // used in nonlinear core correction stress
+                     int type,
+                     const UnitCell& ucell_in); // used in nonlinear core correction stress
     void deriv_drhoc_scc(const bool& numeric,
                      const int mesh,
                      const FPTYPE* r,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
index c111a78938..b4b09c768a 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
@@ -27,7 +27,8 @@
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
                                           ModulePW::PW_Basis* rho_basis,
-                                          const Charge* const chr)
+                                          const Charge* const chr,
+                                           UnitCell& ucell_in)
 {
     ModuleBase::TITLE("Forces", "cal_force_cc");
     // recalculate the exchange-correlation potential.
@@ -35,11 +36,11 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
 
     int total_works = 0;
     // cal total works for skipping preprocess
-    for (int it = 0; it < GlobalC::ucell.ntype; ++it)
+    for (int it = 0; it < ucell_in.ntype; ++it)
     {
-        if (GlobalC::ucell.atoms[it].ncpp.nlcc)
+        if (ucell_in.atoms[it].ncpp.nlcc)
         {
-            total_works += GlobalC::ucell.atoms[it].na;
+            total_works += ucell_in.atoms[it].na;
         }
     }
     if (total_works == 0)
@@ -54,7 +55,7 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
     {
 #ifdef USE_LIBXC
         const auto etxc_vtxc_v
-            = XC_Functional::v_xc_meta(rho_basis->nrxx, GlobalC::ucell.omega, GlobalC::ucell.tpiba, chr);
+            = XC_Functional::v_xc_meta(rho_basis->nrxx, ucell_in.omega, ucell_in.tpiba, chr);
 
         // etxc = std::get<0>(etxc_vtxc_v);
         // vtxc = std::get<1>(etxc_vtxc_v);
@@ -66,9 +67,9 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
     else
     {
         if (GlobalV::NSPIN == 4) {
-            GlobalC::ucell.cal_ux();
+            ucell_in.cal_ux();
 }
-        const auto etxc_vtxc_v = XC_Functional::v_xc(rho_basis->nrxx, chr, &GlobalC::ucell);
+        const auto etxc_vtxc_v = XC_Functional::v_xc(rho_basis->nrxx, chr, &ucell_in);
 
         // etxc = std::get<0>(etxc_vtxc_v);
         // vtxc = std::get<1>(etxc_vtxc_v);
@@ -105,83 +106,116 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
     double* rhocg = new double[rho_basis->ngg];
     ModuleBase::GlobalFunc::ZEROS(rhocg, rho_basis->ngg);
 
-    for (int it = 0; it < GlobalC::ucell.ntype; ++it)
+    std::vector<double> gv_x(rho_basis->npw);
+    std::vector<double> gv_y(rho_basis->npw);
+    std::vector<double> gv_z(rho_basis->npw);
+    std::vector<double> rhocgigg_vec(rho_basis->npw);
+    double *gv_x_d = nullptr;
+    double *gv_y_d = nullptr;
+    double *gv_z_d = nullptr;
+    double *force_d = nullptr;
+    double *rhocgigg_vec_d = nullptr;
+    std::complex<FPTYPE>* psiv_d = nullptr;
+    this->device = base_device::get_device_type<Device>(this->ctx);
+
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int ig = 0; ig < rho_basis->npw; ig++)
     {
-        if (GlobalC::ucell.atoms[it].ncpp.nlcc)
+        gv_x[ig] = rho_basis->gcar[ig].x;
+        gv_y[ig] = rho_basis->gcar[ig].y;
+        gv_z[ig] = rho_basis->gcar[ig].z;
+    }
+
+	if(this->device == base_device::GpuDevice ) {
+		resmem_var_op()(this->ctx, gv_x_d, rho_basis->npw);
+        resmem_var_op()(this->ctx, gv_y_d, rho_basis->npw);
+        resmem_var_op()(this->ctx, gv_z_d, rho_basis->npw);
+        resmem_var_op()(this->ctx, rhocgigg_vec_d, rho_basis->npw);
+        resmem_complex_op()(this->ctx, psiv_d, rho_basis->nmaxgr);
+        resmem_var_op()(this->ctx, force_d, 3);
+
+		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_x_d, gv_x.data(), rho_basis->npw);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_y_d, gv_y.data(), rho_basis->npw);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_z_d, gv_z.data(), rho_basis->npw);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, psiv_d, psiv, rho_basis->nmaxgr);
+	}
+
+
+    for (int it = 0; it < ucell_in.ntype; ++it)
+    {
+        if (ucell_in.atoms[it].ncpp.nlcc)
         {
 
             // chr->non_linear_core_correction(GlobalC::ppcell.numeric,
-            //                                 GlobalC::ucell.atoms[it].ncpp.msh,
-            //                                 GlobalC::ucell.atoms[it].ncpp.r,
-            //                                 GlobalC::ucell.atoms[it].ncpp.rab,
-            //                                 GlobalC::ucell.atoms[it].ncpp.rho_atc,
+            //                                 ucell_in.atoms[it].ncpp.msh,
+            //                                 ucell_in.atoms[it].ncpp.r,
+            //                                 ucell_in.atoms[it].ncpp.rab,
+            //                                 ucell_in.atoms[it].ncpp.rho_atc,
             //                                 rhocg);
             this->deriv_drhoc(GlobalC::ppcell.numeric,
-                              GlobalC::ucell.atoms[it].ncpp.msh,
-                              GlobalC::ucell.atoms[it].ncpp.r.data(),
-                              GlobalC::ucell.atoms[it].ncpp.rab.data(),
-                              GlobalC::ucell.atoms[it].ncpp.rho_atc.data(),
+                              ucell_in.atoms[it].ncpp.msh,
+                              ucell_in.atoms[it].ncpp.r.data(),
+                              ucell_in.atoms[it].ncpp.rab.data(),
+                              ucell_in.atoms[it].ncpp.rho_atc.data(),
                               rhocg,
                               rho_basis,
-                              1);
+                              1,
+                              ucell_in);
+
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel for
+#endif                              
+            for (int ig = 0; ig < rho_basis->npw; ig++)
             {
-#endif
-                for (int ia = 0; ia < GlobalC::ucell.atoms[it].na; ++ia)
+                rhocgigg_vec[ig] = rhocg[rho_basis->ig2igg[ig]];
+            }
+
+            if(this->device == base_device::GpuDevice ) {
+                syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhocgigg_vec_d, rhocgigg_vec.data(), rho_basis->npw);
+            }
+            for (int ia = 0; ia < ucell_in.atoms[it].na; ++ia)
+            {
+                const ModuleBase::Vector3<double> pos = ucell_in.atoms[it].tau[ia];
+                // get iat form table
+                int iat = ucell_in.itia2iat(it, ia);
+                double force[3] = {0, 0, 0};
+
+                if(this->device == base_device::GpuDevice ) {
+                    syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, force_d, force, 3);
+                    hamilt::cal_force_npw_op<FPTYPE, Device>()(
+                        psiv_d, gv_x_d, gv_y_d, gv_z_d, rhocgigg_vec_d, force_d, pos.x, pos.y, pos.z, 
+                        rho_basis->npw, ucell_in.omega, ucell_in.tpiba
+                    );      
+                    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force, force_d, 3);	          
+                
+                } else {
+                    hamilt::cal_force_npw_op<FPTYPE, Device>()(
+                        psiv, gv_x.data(), gv_y.data(), gv_z.data(), rhocgigg_vec.data(), force, pos.x, pos.y, pos.z, 
+                        rho_basis->npw, ucell_in.omega, ucell_in.tpiba
+                    );  
+                }
+
                 {
-                    // get iat form table
-                    int iat = GlobalC::ucell.itia2iat(it, ia);
-                    double force[3] = {0, 0, 0};
-#ifdef _OPENMP
-#pragma omp for nowait
-#endif
-                    for (int ig = 0; ig < rho_basis->npw; ig++)
-                    {
-                        const ModuleBase::Vector3<double> gv = rho_basis->gcar[ig];
-                        const ModuleBase::Vector3<double> pos = GlobalC::ucell.atoms[it].tau[ia];
-                        const double rhocgigg = rhocg[rho_basis->ig2igg[ig]];
-                        const std::complex<double> psiv_conj = conj(psiv[ig]);
-
-                        const double arg = ModuleBase::TWO_PI * (gv.x * pos.x + gv.y * pos.y + gv.z * pos.z);
-                        double sinp, cosp;
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        const std::complex<double> expiarg = std::complex<double>(sinp, cosp);
-
-                        auto ipol0
-                            = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.x * psiv_conj * expiarg;
-                        force[0] += ipol0.real();
-
-                        auto ipol1
-                            = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.y * psiv_conj * expiarg;
-                        force[1] += ipol1.real();
-
-                        auto ipol2
-                            = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.z * psiv_conj * expiarg;
-                        force[2] += ipol2.real();
-                    }
-#ifdef _OPENMP
-#pragma omp critical  
-                    if (omp_get_num_threads() > 1)
-                    {
-                        forcecc(iat, 0) += force[0];
-                        forcecc(iat, 1) += force[1];
-                        forcecc(iat, 2) += force[2];
-                    }
-                    else
-#endif
-                    {
-                        forcecc(iat, 0) += force[0];
-                        forcecc(iat, 1) += force[1];
-                        forcecc(iat, 2) += force[2];
-                    }
+                    forcecc(iat, 0) += force[0];
+                    forcecc(iat, 1) += force[1];
+                    forcecc(iat, 2) += force[2];
                 }
-#ifdef _OPENMP
-            } // omp parallel
-#endif
+            }
+
         }
     }
-
+    if (this->device == base_device::GpuDevice)
+    {
+        delmem_var_op()(this->ctx, gv_x_d);
+        delmem_var_op()(this->ctx, gv_y_d);
+        delmem_var_op()(this->ctx, gv_z_d);
+        delmem_var_op()(this->ctx, force_d);
+        delmem_var_op()(this->ctx, rhocgigg_vec_d);
+        delmem_complex_op()(this->ctx, psiv_d);
+    }
     delete[] rhocg;
 
     delete[] psiv;                                                           // mohan fix bug 2012-03-22
@@ -202,7 +236,8 @@ void Forces<FPTYPE, Device>::deriv_drhoc
 	const FPTYPE *rhoc,
 	FPTYPE *drhocg,
 	ModulePW::PW_Basis* rho_basis,
-	int type
+	int type,
+    const UnitCell& ucell_in
 )
 {
 	int  igl0;
@@ -241,7 +276,7 @@ void Forces<FPTYPE, Device>::deriv_drhoc
 				aux [ir] = r [ir] * r [ir] * rhoc [ir];
 			}
 			ModuleBase::Integral::Simpson_Integral(mesh, aux.data(), rab, rhocg1);
-			drhocg [0] = ModuleBase::FOUR_PI * rhocg1 / GlobalC::ucell.omega;
+			drhocg [0] = ModuleBase::FOUR_PI * rhocg1 / ucell_in.omega;
 			igl0 = 1;
 		} 
 		else
@@ -260,7 +295,7 @@ void Forces<FPTYPE, Device>::deriv_drhoc
 #endif
 	for(int igl = igl0;igl< rho_basis->ngg;igl++)
 	{
-		gx_arr[igl] = sqrt(rho_basis->gg_uniq[igl] * GlobalC::ucell.tpiba2);
+		gx_arr[igl] = sqrt(rho_basis->gg_uniq[igl] * ucell_in.tpiba2);
 	}
 
 	double *r_d = nullptr;
@@ -285,14 +320,14 @@ void Forces<FPTYPE, Device>::deriv_drhoc
 
 	if(this->device == base_device::GpuDevice) {
 		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
-			r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,GlobalC::ucell.omega,type);
+			r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,type);
 		syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
 
 
 
 	} else {
 		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
-			r,rhoc,gx_arr.data()+igl0,rab,drhocg+igl0,mesh,igl0,rho_basis->ngg-igl0,GlobalC::ucell.omega,type);
+			r,rhoc,gx_arr.data()+igl0,rab,drhocg+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,type);
     }
 
     delmem_var_op()(this->ctx, r_d);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index 769d6351e9..e965446d77 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -23,7 +23,11 @@ void warp_reduce(FPTYPE & val) {
         val += __shfl_down_sync(FULL_MASK, val, offset);
     }
 }
-
+template <typename T>
+__device__ static inline 
+thrust::complex<T> conj(thrust::complex<T>& in) {
+    return thrust::conj(in);
+}
 template <typename T>
 __global__ void cal_stress_mgga(
     const int spin,
@@ -504,6 +508,48 @@ __global__ void cal_stress_drhoc_aux3(
 
 
 
+template <typename FPTYPE>
+__global__ void cal_force_npw(
+        const thrust::complex<FPTYPE> *psiv,
+        const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
+        const FPTYPE* rhocgigg_vec,
+        FPTYPE* force,
+        const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z,
+        const int npw,
+        const FPTYPE omega, const FPTYPE tpiba
+){
+    const double TWO_PI = 2.0 * 3.14159265358979323846;
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int begin_idx = tid * 1024;
+    if(begin_idx > npw) return;
+
+    FPTYPE t_force0 = 0;
+    FPTYPE t_force1 = 0;
+    FPTYPE t_force2 = 0;
+    for(int ig = begin_idx; ig<begin_idx+1024 && ig<npw;ig++) {
+        const thrust::complex<FPTYPE> psiv_conj = conj(psiv[ig]);
+
+        const FPTYPE arg = TWO_PI * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z);
+        const FPTYPE sinp = sin(arg);
+        const FPTYPE cosp = cos(arg);
+        const thrust::complex<FPTYPE> expiarg = thrust::complex<FPTYPE>(sinp, cosp);
+
+        const thrust::complex<FPTYPE> tmp_var = psiv_conj * expiarg * tpiba * omega * rhocgigg_vec[ig];
+
+        const thrust::complex<FPTYPE> ipol0 = tmp_var * gv_x[ig];
+        t_force0 += ipol0.real();
+
+        const thrust::complex<FPTYPE> ipol1 = tmp_var * gv_y[ig];
+        t_force1 += ipol1.real();
+
+        const thrust::complex<FPTYPE> ipol2 = tmp_var * gv_z[ig];
+        t_force2 += ipol2.real();
+    }
+    atomicAdd(&force[0], t_force0);
+    atomicAdd(&force[1], t_force1);
+    atomicAdd(&force[2], t_force2);
+}
+
 template <typename FPTYPE>
 void cal_vkb_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
         const base_device::DEVICE_GPU* ctx,
@@ -638,6 +684,32 @@ void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     return ;
 }
 
+
+template <typename FPTYPE>
+void cal_force_npw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+        const std::complex<FPTYPE> *psiv,
+        const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
+        const FPTYPE* rhocgigg_vec,
+        FPTYPE* force,
+        const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z,
+        const int npw,
+        const FPTYPE omega, const FPTYPE tpiba
+    )
+{
+    // Divide the npw size range into blocksize 1024 blocks
+    int t_size = 1024;
+    int t_num = (npw%t_size) ? (npw/t_size + 1) : (npw/t_size);
+    dim3 npwgrid(((t_num%THREADS_PER_BLOCK) ? (t_num/THREADS_PER_BLOCK + 1) : (t_num/THREADS_PER_BLOCK)));
+
+    cal_force_npw << < npwgrid, THREADS_PER_BLOCK >> > (
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(psiv),
+        gv_x, gv_y, gv_z, rhocgigg_vec, force, pos_x, pos_y, pos_z,
+        npw, omega, tpiba
+    ); 
+    return ;
+}
+
+
 // template <typename FPTYPE>
 // void prepare_vkb_deri_ptr_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
 //         const base_device::DEVICE_GPU* ctx,
@@ -713,6 +785,9 @@ template struct cal_vkb_deri_op<float, base_device::DEVICE_GPU>;
 template struct cal_stress_drhoc_aux_op<double, base_device::DEVICE_GPU>;
 template struct cal_stress_drhoc_aux_op<float, base_device::DEVICE_GPU>;
 
+template struct cal_force_npw_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
+
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_GPU>;
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_GPU>;
 }  // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index d7db3b3eea..17cfae404a 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -502,6 +502,49 @@ __global__ void cal_stress_drhoc_aux3(
 }
 
 
+template <typename FPTYPE>
+__global__ void cal_force_npw(
+        const thrust::complex<FPTYPE> *psiv,
+        const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
+        const FPTYPE* rhocgigg_vec,
+        FPTYPE* force,
+        const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z,
+        const int npw,
+        const FPTYPE omega, const FPTYPE tpiba
+){
+    const double TWO_PI = 2.0 * 3.14159265358979323846;
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int begin_idx = tid * 1024;
+    if(begin_idx > npw) return;
+
+    FPTYPE t_force0 = 0;
+    FPTYPE t_force1 = 0;
+    FPTYPE t_force2 = 0;
+    for(int ig = begin_idx; ig<begin_idx+1024 && ig<npw;ig++) {
+        const thrust::complex<FPTYPE> psiv_conj = conj(psiv[ig]);
+
+        const FPTYPE arg = TWO_PI * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z);
+        const FPTYPE sinp = sin(arg);
+        const FPTYPE cosp = cos(arg);
+        const thrust::complex<FPTYPE> expiarg = thrust::complex<FPTYPE>(sinp, cosp);
+
+        const thrust::complex<FPTYPE> tmp_var = psiv_conj * expiarg * tpiba * omega * rhocgigg_vec[ig];
+
+        const thrust::complex<FPTYPE> ipol0 = tmp_var * gv_x[ig];
+        t_force0 += ipol0.real();
+
+        const thrust::complex<FPTYPE> ipol1 = tmp_var * gv_y[ig];
+        t_force1 += ipol1.real();
+
+        const thrust::complex<FPTYPE> ipol2 = tmp_var * gv_z[ig];
+        t_force2 += ipol2.real();
+    }
+    atomicAdd(&force[0], t_force0);
+    atomicAdd(&force[1], t_force1);
+    atomicAdd(&force[2], t_force2);
+}
+
+
 template <typename FPTYPE>
 void cal_vkb_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
         const base_device::DEVICE_GPU* ctx,
@@ -629,8 +672,32 @@ void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
 
     return ;
 }
+template <typename FPTYPE>
+void cal_force_npw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+        const std::complex<FPTYPE> *psiv,
+        const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
+        const FPTYPE* rhocgigg_vec,
+        FPTYPE* force,
+        const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z,
+        const int npw,
+        const FPTYPE omega, const FPTYPE tpiba
+    )
+{
+    int t_size = 1024;
+    int t_num = (npw%t_size) ? (npw/t_size + 1) : (npw/t_size);
+
+    dim3 npwgrid(((t_num%THREADS_PER_BLOCK) ? (t_num/THREADS_PER_BLOCK + 1) : (t_num/THREADS_PER_BLOCK)));
 
 
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_npw<FPTYPE>), npwgrid, THREADS_PER_BLOCK,0,0,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(psiv),
+        gv_x, gv_y, gv_z, rhocgigg_vec, force, pos_x, pos_y, pos_z,
+        npw, omega, tpiba
+    );
+
+    return ;
+}
+
 template struct cal_vq_op<double, base_device::DEVICE_GPU>;
 template struct cal_vq_op<float, base_device::DEVICE_GPU>;
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index be03cee401..c3a8008e04 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -401,6 +401,45 @@ struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_CPU> {
 };
 
 
+template <typename FPTYPE>
+struct cal_force_npw_op<FPTYPE, base_device::DEVICE_CPU> {
+    void operator()(const std::complex<FPTYPE> *psiv,
+                    const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
+                    const FPTYPE* rhocgigg_vec,
+                    FPTYPE* force,
+                    const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z,
+                    const int npw,
+                    const FPTYPE omega, const FPTYPE tpiba) {
+        const double TWO_PI = 2.0 * 3.14159265358979323846;
+        // printf("%d,%d,%lf\n",ngg,mesh,omega);
+
+        // printf("iminininininin\n\n\n\n");
+#ifdef _OPENMP
+#pragma omp for nowait
+#endif
+        for (int ig = 0; ig < npw; ig++)
+        {
+            const std::complex<FPTYPE> psiv_conj = conj(psiv[ig]);
+
+            const FPTYPE arg = TWO_PI * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z);
+            FPTYPE sinp, cosp;
+            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+            const std::complex<FPTYPE> expiarg = std::complex<FPTYPE>(sinp, cosp);
+
+            const std::complex<FPTYPE> tmp_var = psiv_conj * expiarg * tpiba * omega * rhocgigg_vec[ig];
+
+            const std::complex<FPTYPE> ipol0 = tmp_var * gv_x[ig];
+            force[0] += ipol0.real();
+
+            const std::complex<FPTYPE> ipol1 = tmp_var * gv_y[ig];
+            force[1] += ipol1.real();
+
+            const std::complex<FPTYPE> ipol2 = tmp_var * gv_z[ig];
+            force[2] += ipol2.real();
+        }
+    }
+};
+
 // // cpu version first, gpu version later
 // template <typename FPTYPE>
 // struct prepare_vkb_deri_ptr_op<FPTYPE, base_device::DEVICE_CPU>{
@@ -476,6 +515,10 @@ template struct cal_vq_deri_op<double, base_device::DEVICE_CPU>;
 template struct cal_stress_drhoc_aux_op<float, base_device::DEVICE_CPU>;
 template struct cal_stress_drhoc_aux_op<double, base_device::DEVICE_CPU>;
 
+template struct cal_force_npw_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_npw_op<double, base_device::DEVICE_CPU>;
+
+
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_CPU>;
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_CPU>;
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index b4dfc35ff8..62e373f1ff 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -190,6 +190,19 @@ struct cal_stress_drhoc_aux_op{
     );
 };
 
+template <typename FPTYPE, typename Device>
+struct cal_force_npw_op{
+    void operator()(const std::complex<FPTYPE> *psiv,
+                    const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
+                    const FPTYPE* rhocgigg_vec,
+                    FPTYPE* force,
+                    const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_xz,
+                    const int npw,
+                    const FPTYPE omega, const FPTYPE tpiba
+    );
+};
+
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 template <typename FPTYPE>
 struct cal_dbecp_noevc_nl_op<FPTYPE, base_device::DEVICE_GPU>
@@ -340,6 +353,27 @@ struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>{
 };
 
 
+/**
+ * This operator is used to compute the force force in three directions for each atom in force_cc 
+ * in parallel on GPU [0~3], which is:
+ * Force_p =    (2* pi * tpiba * omega * rhocg[ig] * gv_p[ig] 
+ *              * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z)
+ *              * complex(sinp, cosp) * psiv[ig]).real()
+ *
+ * The operator splits NPW into blocks on the GPU in parallel, and the block size is t_size = 1024.
+ */
+template <typename FPTYPE>
+struct cal_force_npw_op<FPTYPE, base_device::DEVICE_GPU>{
+    void operator()(const std::complex<FPTYPE> *psiv,
+                    const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z,
+                    const FPTYPE* rhocgigg_vec,
+                    FPTYPE* force,
+                    const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_xz,
+                    const int npw,
+                    const FPTYPE omega, const FPTYPE tpiba
+    );
+};
+
 
 
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp
index 5567f51ef9..e4974afbeb 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp
@@ -39,8 +39,8 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force,
     this->cal_force_loc(forcelc, rho_basis, chr);
     this->cal_force_ew(forceion, rho_basis, p_sf);
     this->cal_sto_force_nl(forcenl, wg, pkv, wfc_basis, psi_in, stowf);
-    this->cal_force_cc(forcecc, rho_basis, chr);
-    this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist,GlobalC::ucell);
+    this->cal_force_cc(forcecc, rho_basis, chr, GlobalC::ucell);
+    this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, GlobalC::ucell);
 
     //impose total force = 0
     int iat = 0;
diff --git a/source/module_io/read_input_item_output.cpp b/source/module_io/read_input_item_output.cpp
index 1f38e9fe69..b22cebec7c 100644
--- a/source/module_io/read_input_item_output.cpp
+++ b/source/module_io/read_input_item_output.cpp
@@ -254,6 +254,29 @@ void ReadInput::item_output()
         sync_intvec(input.out_mat_hs, 2, 0);
         this->add_item(item);
     }
+    {
+        Input_Item item("out_mat_tk");
+        item.annotation = "output T(k)";
+        item.read_value = [](const Input_Item& item, Parameter& para) {
+            size_t count = item.get_size();
+            if (count == 1)
+            {
+                para.input.out_mat_tk[0] = std::stoi(item.str_values[0]);
+                para.input.out_mat_tk[1] = 8;
+            }
+            else if (count == 2)
+            {
+                para.input.out_mat_tk[0] = std::stoi(item.str_values[0]);
+                para.input.out_mat_tk[1] = std::stoi(item.str_values[1]);
+            }
+            else
+            {
+                ModuleBase::WARNING_QUIT("ReadInput", "out_mat_tk should have 1 or 2 values");
+            }
+        };
+        sync_intvec(input.out_mat_tk, 2, 0);
+        this->add_item(item);
+    }
     {
         Input_Item item("out_mat_hs2");
         item.annotation = "output H(R) and S(R) matrix";
@@ -453,4 +476,4 @@ void ReadInput::item_output()
         this->add_item(item);
     }
 }
-} // namespace ModuleIO
\ No newline at end of file
+} // namespace ModuleIO
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index d1d417cb96..bd326cd3b6 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -324,6 +324,7 @@ struct Input_para
     bool out_dm1 = false;                 ///< output density matrix. (multi-k points)
     bool out_bandgap = false;             ///< QO added for bandgap printing
     std::vector<int> out_mat_hs = {0, 8}; ///< output H matrix and S matrix in local basis.
+    std::vector<int> out_mat_tk = {0, 8}; ///< output T(k) matrix in local basis.
     bool out_mat_hs2 = false;             ///< LiuXh add 2019-07-16, output H(R) matrix and
                                           ///< S(R) matrix in local basis.
     bool out_mat_dh = false;
@@ -567,4 +568,4 @@ struct Input_para
     bool test_stress = false;     ///< test the stress.
     bool test_skip_ewald = false; ///< variables for test only
 };
-#endif
\ No newline at end of file
+#endif
diff --git a/source/module_ri/Exx_LRI_interface.h b/source/module_ri/Exx_LRI_interface.h
index cf30babb77..9f16e95acc 100644
--- a/source/module_ri/Exx_LRI_interface.h
+++ b/source/module_ri/Exx_LRI_interface.h
@@ -50,8 +50,12 @@ class Exx_LRI_Interface
         hamilt::Hamilt<T>& hamilt,
         const elecstate::DensityMatrix<T, double>& dm/**< double should be Tdata if complex-PBE-DM is supported*/,
         const K_Vectors& kv,
-        int& iter);
+        const int& nspin,
+        int& iter,
+        const double& etot,
+        const double& scf_ene_thr);
     int two_level_step = 0;
+    double etot_last_outer_loop = 0.0;
 private:
     std::shared_ptr<Exx_LRI<Tdata>> exx_ptr;
     Mix_DMk_2D mix_DMk_2D;
diff --git a/source/module_ri/Exx_LRI_interface.hpp b/source/module_ri/Exx_LRI_interface.hpp
index 2da1fd43f2..aaa338845d 100644
--- a/source/module_ri/Exx_LRI_interface.hpp
+++ b/source/module_ri/Exx_LRI_interface.hpp
@@ -7,6 +7,7 @@
 #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.h"
 #include "module_base/parallel_common.h"
+#include "module_base/formatter.h"
 
 #include <sys/time.h>
 #include "module_io/csr_reader.h"
@@ -126,7 +127,10 @@ bool Exx_LRI_Interface<T, Tdata>::exx_after_converge(
     hamilt::Hamilt<T>& hamilt,
     const elecstate::DensityMatrix<T, double>& dm,
     const K_Vectors& kv,
-    int& iter)
+    const int& nspin,
+    int& iter,
+    const double& etot,
+    const double& scf_ene_thr)
 {   // only called if (GlobalC::exx_info.info_global.cal_exx)
     auto restart_reset = [this]()
         { // avoid calling restart related procedure in the subsequent ion steps
@@ -158,45 +162,50 @@ bool Exx_LRI_Interface<T, Tdata>::exx_after_converge(
                 return false;
             }
         }
-        // has separate_loop case
-        // exx converged or get max exx steps
-        else if (this->two_level_step == GlobalC::exx_info.info_global.hybrid_step
-            || (iter == 1 && this->two_level_step != 0))
-        {
-            restart_reset();
-            return true;
-        }
         else
-        {
-            // update exx and redo scf
-            if (this->two_level_step == 0)
+        { // has separate_loop case
+            const double ediff = std::abs(etot - etot_last_outer_loop) * ModuleBase::Ry_to_eV;
+            if (two_level_step) { std::cout << FmtCore::format("EDIFF/eV (outer loop): %.8e \n", ediff); }
+            // exx converged or get max exx steps
+            if (this->two_level_step == GlobalC::exx_info.info_global.hybrid_step
+                || (iter == 1 && this->two_level_step != 0) // density convergence of outer loop
+                || (ediff < scf_ene_thr && this->two_level_step != 0))   //energy convergence of outer loop
             {
-                XC_Functional::set_xc_type(GlobalC::ucell.atoms[0].ncpp.xc_func);
+                restart_reset();
+                return true;
             }
+            else
+            {
+                this->etot_last_outer_loop = etot;
+                // update exx and redo scf
+                if (this->two_level_step == 0)
+                {
+                    XC_Functional::set_xc_type(GlobalC::ucell.atoms[0].ncpp.xc_func);
+                }
+
+                std::cout << " Updating EXX " << std::flush;
+                timeval t_start;       gettimeofday(&t_start, nullptr);
+
+                const bool flag_restart = (this->two_level_step == 0) ? true : false;
+                this->mix_DMk_2D.mix(dm.get_DMK_vector(), flag_restart);
+
+                // GlobalC::exx_lcao.cal_exx_elec(p_esolver->LOC, p_esolver->LOWF.wfc_k_grid);
+                const std::vector<std::map<int, std::map<std::pair<int, std::array<int, 3>>, RI::Tensor<Tdata>>>>
+                    Ds = std::is_same<T, double>::value //gamma_only_local
+                    ? RI_2D_Comm::split_m2D_ktoR<Tdata>(*this->exx_ptr->p_kv, this->mix_DMk_2D.get_DMk_gamma_out(), *dm.get_paraV_pointer(), nspin)
+                    : RI_2D_Comm::split_m2D_ktoR<Tdata>(*this->exx_ptr->p_kv, this->mix_DMk_2D.get_DMk_k_out(), *dm.get_paraV_pointer(), nspin);
+                this->exx_ptr->cal_exx_elec(Ds, *dm.get_paraV_pointer());
+                iter = 0;
+                this->two_level_step++;
 
-            std::cout << " Updating EXX " << std::flush;
-            timeval t_start;       gettimeofday(&t_start, nullptr);
-
-            const bool flag_restart = (this->two_level_step == 0) ? true : false;
-            this->mix_DMk_2D.mix(dm.get_DMK_vector(), flag_restart);
-
-            // GlobalC::exx_lcao.cal_exx_elec(p_esolver->LOC, p_esolver->LOWF.wfc_k_grid);
-			const std::vector<std::map<int,std::map<std::pair<int, std::array<int, 3>>,RI::Tensor<Tdata>>>>
-				Ds = GlobalV::GAMMA_ONLY_LOCAL
-                ? RI_2D_Comm::split_m2D_ktoR<Tdata>(*this->exx_ptr->p_kv, this->mix_DMk_2D.get_DMk_gamma_out(), *dm.get_paraV_pointer(), GlobalV::NSPIN)
-                : RI_2D_Comm::split_m2D_ktoR<Tdata>(*this->exx_ptr->p_kv, this->mix_DMk_2D.get_DMk_k_out(), *dm.get_paraV_pointer(), GlobalV::NSPIN);
-            this->exx_ptr->cal_exx_elec(Ds, *dm.get_paraV_pointer());
-            iter = 0;
-            this->two_level_step++;
-            
-            timeval t_end;       gettimeofday(&t_end, nullptr);
-            std::cout << "and rerun SCF\t"
-                << std::setprecision(3) << std::setiosflags(std::ios::scientific)
-                << (double)(t_end.tv_sec-t_start.tv_sec) + (double)(t_end.tv_usec-t_start.tv_usec)/1000000.0 
-                << std::defaultfloat << " (s)" << std::endl;
-            return false;
+                timeval t_end;       gettimeofday(&t_end, nullptr);
+                std::cout << "and rerun SCF\t"
+                    << std::setprecision(3) << std::setiosflags(std::ios::scientific)
+                    << (double)(t_end.tv_sec - t_start.tv_sec) + (double)(t_end.tv_usec - t_start.tv_usec) / 1000000.0
+                    << std::defaultfloat << " (s)" << std::endl;
+                return false;
+            }
         }
-
     restart_reset();
     return true;
 }