diff --git a/source/module_cell/module_symmetry/symmetry_basic.cpp b/source/module_cell/module_symmetry/symmetry_basic.cpp
index 8a4cadacf..664e0c5b7 100644
--- a/source/module_cell/module_symmetry/symmetry_basic.cpp
+++ b/source/module_cell/module_symmetry/symmetry_basic.cpp
@@ -1068,11 +1068,9 @@ void Symmetry_Basic::atom_ordering_new(double *posi, const int natom, int *subin
             ModuleBase::heapsort(nxequal, weighted_func, tmpindex.data());
             this->order_atoms(&posi[i * 3], nxequal, tmpindex.data());
             //rearange subindex using tmpindex
-            for (int j = 0; j < nxequal; ++j)
-            {
-                tmpindex[j] = subindex[i + tmpindex[j]];
-                subindex[i + j] = tmpindex[j];
-            }
+            for (int j = 0; j < nxequal; ++j)tmpindex[j] = subindex[i + tmpindex[j]];
+            for (int j = 0; j < nxequal; ++j)subindex[i + j] = tmpindex[j];
+
         }
 		i=ix_right;
 	}
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index b9c4f7aeb..c274c394a 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -485,10 +485,13 @@ void ESolver_KS_LCAO::eachiterinit(const int istep, const int iter)
 
 #ifdef __EXX
     // calculate exact-exchange
+    if (GlobalC::exx_info.info_global.cal_exx && !GlobalC::exx_info.info_global.separate_loop// && exx_lri_complex->two_level_step
+        && !GlobalV::GAMMA_ONLY_LOCAL && ModuleSymmetry::Symmetry::symm_flag == 1)
+        this->exc->restore_dm(this->UHM, this->LOC, kv, GlobalC::ucell, *psi, symm, pelec->wg);
     if (GlobalC::exx_info.info_ri.real_number)
-        this->exd->exx_eachiterinit(this->LOC, *(this->p_chgmix), iter);
+        this->exd->exx_eachiterinit(this->LOC, *(this->p_chgmix), this->symm, iter);
     else
-        this->exc->exx_eachiterinit(this->LOC, *(this->p_chgmix), iter);
+        this->exc->exx_eachiterinit(this->LOC, *(this->p_chgmix), this->symm, iter);
 #endif
 
     if (GlobalV::dft_plus_u)
@@ -551,9 +554,9 @@ void ESolver_KS_LCAO::hamilt2density(int istep, int iter, double ethr)
 
 #ifdef __EXX
     if (GlobalC::exx_info.info_ri.real_number)
-        this->exd->exx_hamilt2density(*this->pelec, *this->LOWF.ParaV);
+        this->exd->exx_hamilt2density(*this->pelec, *this->LOWF.ParaV, symm);
     else
-        this->exc->exx_hamilt2density(*this->pelec, *this->LOWF.ParaV);
+        this->exc->exx_hamilt2density(*this->pelec, *this->LOWF.ParaV, symm);
 #endif
 
     // if DFT+U calculation is needed, this function will calculate
@@ -822,7 +825,7 @@ void ESolver_KS_LCAO::afterscf(const int istep)
         // rpa_interface.rpa_exx_lcao().info.files_abfs = GlobalV::rpa_orbitals;
         // rpa_interface.out_for_RPA(*(this->LOWF.ParaV), *(this->psi), this->LOC, this->pelec);
         RPA_LRI<double> rpa_lri_double(GlobalC::exx_info.info_ri);
-        rpa_lri_double.cal_postSCF_exx(this->LOC, MPI_COMM_WORLD, kv, *this->LOWF.ParaV);
+        rpa_lri_double.cal_postSCF_exx(this->LOC, MPI_COMM_WORLD, kv, *this->LOWF.ParaV, symm);
         rpa_lri_double.init(MPI_COMM_WORLD, kv);
         rpa_lri_double.out_for_RPA(*(this->LOWF.ParaV), *(this->psi), this->pelec);
     }
@@ -847,10 +850,12 @@ void ESolver_KS_LCAO::afterscf(const int istep)
 bool ESolver_KS_LCAO::do_after_converge(int& iter)
 {
 #ifdef __EXX
+    if (!GlobalV::GAMMA_ONLY_LOCAL && ModuleSymmetry::Symmetry::symm_flag == 1)
+        this->exc->restore_dm(this->UHM, this->LOC, kv, GlobalC::ucell, *psi, symm, pelec->wg);
     if (GlobalC::exx_info.info_ri.real_number)
-        return this->exd->exx_after_converge(*this->p_hamilt, this->LM, this->LOC, kv, iter);
+        return this->exd->exx_after_converge(*this->p_hamilt, this->LM, this->LOC, kv, symm, iter);
     else
-        return this->exc->exx_after_converge(*this->p_hamilt, this->LM, this->LOC, kv, iter);
+        return this->exc->exx_after_converge(*this->p_hamilt, this->LM, this->LOC, kv, symm, iter);
 #endif // __EXX
     return true;
 }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
index 27af5e95d..dd884748e 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
@@ -132,7 +132,7 @@ void Force_LCAO_k::ftable_k(const bool isforce,
             GlobalC::ld.check_v_delta_k(pv->nnr);
             for (int ik = 0; ik < kv.nks; ik++)
             {
-                uhm.LM->folding_fixedH(ik, kv.kvec_d);
+                uhm.LM->folding_fixedH(kv.kvec_d[ik]);
             }
             GlobalC::ld.cal_e_delta_band_k(loc.dm_k,
                 kv.nks);
@@ -265,7 +265,7 @@ void Force_LCAO_k::allocate_k(const Parallel_Orbitals& pv,
         for (int ik = 0; ik < nks; ik++)
         {
             this->UHM->genH.LM->zeros_HSk('S');
-            this->UHM->genH.LM->folding_fixedH(ik, kvec_d, 1);
+            this->UHM->genH.LM->folding_fixedH(kvec_d[ik], 1);
             bool bit = false; // LiuXh, 2017-03-21
             ModuleIO::saving_HS(0,
                                 this->UHM->genH.LM->Hloc2.data(),
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h
index 3d5ec6578..af26b7d69 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h
@@ -24,9 +24,7 @@ class LCAO_Matrix
     
     // folding the fixed Hamiltonian (T+Vnl) if
 	// k-point algorithm is used.
-	void folding_fixedH(const int &ik, 
-                        const std::vector<ModuleBase::Vector3<double>>& kvec_d, 
-                        bool cal_syns = false);
+    void folding_fixedH(const ModuleBase::Vector3<double>& kvec_d, bool cal_syns = false);
 
     Parallel_Orbitals *ParaV;
     
@@ -124,6 +122,9 @@ class LCAO_Matrix
     // Records the R direct coordinates of HR and SR output, This variable will be filled with data when HR and SR files are output.
     std::set<Abfs::Vector3_Order<int>> output_R_coor;
 
+    /// for restoring DM(R) when symmetry == 1 and multi - k in EXX calculation.
+    /// $S^{-1}'(k)S(gk)$ for each kstar. size: [nks_ibz][nsymm][nbasis*nbasis(local)]
+    std::vector<std::vector<std::vector<std::complex<double>>>> invSkrot_Sgk;
 
     //========================================
     // FORCE
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp
index 9344cdd08..f82171807 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp
@@ -318,9 +318,8 @@ int Grid_Technique::binary_search_find_R2_offset(int val, int iat) const
 
 // be called in LCAO_Hamilt::calculate_Hk.
 void LCAO_Matrix::folding_fixedH(
-						const int &ik, 
-						const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-						bool cal_syns)
+    const ModuleBase::Vector3<double>& kvec_d,
+    bool cal_syns)
 {
 	ModuleBase::TITLE("LCAO_nnr","folding_fixedH");
     ModuleBase::timer::tick("LCAO_nnr", "folding_fixedH");
@@ -432,8 +431,8 @@ void LCAO_Matrix::folding_fixedH(
 					// dR is the index of box in Crystal coordinates
 					//------------------------------------------------
 					ModuleBase::Vector3<double> dR(adjs.box[ad].x, adjs.box[ad].y, adjs.box[ad].z); 
-					const double arg = ( kvec_d[ik] * dR ) * ModuleBase::TWO_PI;
-					//const double arg = ( kvec_d[ik] * GlobalC::GridD.getBox(ad) ) * ModuleBase::TWO_PI;
+                    const double arg = (kvec_d * dR) * ModuleBase::TWO_PI;
+                    //const double arg = ( kvec_d * GlobalC::GridD.getBox(ad) ) * ModuleBase::TWO_PI;
 					double sinp, cosp;
 					ModuleBase::libm::sincos(arg, &sinp, &cosp);
 					const std::complex<double> kphase = std::complex <double> ( cosp,  sinp );
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
index eb1ff80bc..93f60b6ac 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
@@ -76,13 +76,42 @@ void OperatorLCAO<std::complex<double>>::folding_fixed(const int ik, const std::
     //-----------------------------------------
     this->LM->zeros_HSk('S');
     this->LM->zeros_HSk('T');
-    this->LM->folding_fixedH(ik, kvec_d);
+    this->LM->folding_fixedH(kvec_d[ik]);
 
     //------------------------------------------
     // Add T(k)+Vnl(k)+Vlocal(k)
     // (Hloc2 += Hloc_fixed2), (std::complex matrix)
     //------------------------------------------
-	this->LM->update_Hloc2(ik);
+    this->LM->update_Hloc2(ik);
+    // test: output HSk
+    GlobalV::ofs_running << "ik=" << ik << std::endl;
+    // GlobalV::ofs_running << "Hloc2 " << std::endl;
+    // for (int ir = 0;ir < GlobalV::NLOCAL;++ir)
+    // {
+    //     for (int ic = 0;ic < GlobalV::NLOCAL;++ic)
+    //     {
+    //         GlobalV::ofs_running << this->LM->Hloc2[ir * GlobalV::NLOCAL + ic] << " ";
+    //     }
+    //     GlobalV::ofs_running << std::endl;
+    // }
+    // GlobalV::ofs_running << "Hloc_fixed2 " << std::endl;
+    // for (int ir = 0;ir < GlobalV::NLOCAL;++ir)
+    // {
+    //     for (int ic = 0;ic < GlobalV::NLOCAL;++ic)
+    //     {
+    //         GlobalV::ofs_running << this->LM->Hloc_fixed2[ir * GlobalV::NLOCAL + ic] << " ";
+    //     }
+    //     GlobalV::ofs_running << std::endl;
+    // }
+    GlobalV::ofs_running << "Sloc2 with kvec_d=" << kvec_d[ik].x << " " << kvec_d[ik].y << " " << kvec_d[ik].z << std::endl;
+    for (int ir = 0;ir < GlobalV::NLOCAL;++ir)
+    {
+        for (int ic = 0;ic < GlobalV::NLOCAL;++ic)
+        {
+            GlobalV::ofs_running << this->LM->Sloc2[ir * GlobalV::NLOCAL + ic] << " ";
+        }
+        GlobalV::ofs_running << std::endl;
+    }
     ModuleBase::timer::tick("OperatorLCAO", "folding_fixed");
 }
 
diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt
index 07b4fcddb..bf267b64a 100644
--- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt
+++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt
@@ -7,6 +7,7 @@ list(APPEND objects
     gint_rho.cpp
     gint_tau.cpp
     gint_vl.cpp
+    gint_Srot.cpp
     gint_k_env.cpp
     gint_k_sparse.cpp
     gint_k_sparse1.cpp
diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp
index 53e34600e..f8022c8c8 100644
--- a/source/module_hamilt_lcao/module_gint/gint.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint.cpp
@@ -23,7 +23,9 @@ void Gint::cal_gint(Gint_inout *inout)
 	if(inout->job==Gint_Tools::job_type::rho) ModuleBase::TITLE("Gint_interface","cal_gint_rho");
 	if(inout->job==Gint_Tools::job_type::tau) ModuleBase::TITLE("Gint_interface","cal_gint_tau");
 	if(inout->job==Gint_Tools::job_type::force) ModuleBase::TITLE("Gint_interface","cal_gint_force");
-	if(inout->job==Gint_Tools::job_type::force_meta) ModuleBase::TITLE("Gint_interface","cal_gint_force_meta");
+    if (inout->job == Gint_Tools::job_type::force_meta) ModuleBase::TITLE("Gint_interface", "cal_gint_force_meta");
+    if (inout->job == Gint_Tools::job_type::srot) ModuleBase::TITLE("Gint_interface", "cal_gint_srot");
+    if (inout->job == Gint_Tools::job_type::srotk) ModuleBase::TITLE("Gint_interface", "cal_gint_srotk");
 
 	if(inout->job==Gint_Tools::job_type::vlocal) ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal");
 	if(inout->job==Gint_Tools::job_type::vlocal_meta) ModuleBase::timer::tick("Gint_interface","cal_gint_vlocal_meta");
@@ -31,8 +33,10 @@ void Gint::cal_gint(Gint_inout *inout)
 	if(inout->job==Gint_Tools::job_type::tau) ModuleBase::timer::tick("Gint_interface","cal_gint_tau");
 	if(inout->job==Gint_Tools::job_type::force) ModuleBase::timer::tick("Gint_interface","cal_gint_force");
 	if(inout->job==Gint_Tools::job_type::force_meta) ModuleBase::timer::tick("Gint_interface","cal_gint_force_meta");
+    if (inout->job == Gint_Tools::job_type::srot) ModuleBase::timer::tick("Gint_interface", "cal_gint_srot");
+    if (inout->job == Gint_Tools::job_type::srotk) ModuleBase::timer::tick("Gint_interface", "cal_gint_srotk");
 
-	const int max_size = this->gridt->max_atom;
+    const int max_size = this->gridt->max_atom;
 	const int LD_pool = max_size*GlobalC::ucell.nwmax;
     const int lgd = this->gridt->lgd;
     const int nnrg = this->gridt->nnrg;
@@ -55,7 +59,8 @@ void Gint::cal_gint(Gint_inout *inout)
 			// it's a uniform grid to save orbital values, so the delta_r is a constant.
 			const double delta_r = GlobalC::ORB.dr_uniform;
 
-            if((inout->job==Gint_Tools::job_type::vlocal || inout->job==Gint_Tools::job_type::vlocal_meta) && !GlobalV::GAMMA_ONLY_LOCAL)
+            if ((inout->job == Gint_Tools::job_type::vlocal || inout->job == Gint_Tools::job_type::vlocal_meta || inout->job == Gint_Tools::job_type::srot)
+                && !GlobalV::GAMMA_ONLY_LOCAL)
             {
                 if(!pvpR_alloc_flag)
                 {
@@ -81,8 +86,9 @@ void Gint::cal_gint(Gint_inout *inout)
             //perpare auxiliary arrays to store thread-specific values
 #ifdef _OPENMP
 			double* pvpR_thread;
-			if(inout->job==Gint_Tools::job_type::vlocal || inout->job==Gint_Tools::job_type::vlocal_meta)
-			{
+            if (inout->job == Gint_Tools::job_type::vlocal || inout->job == Gint_Tools::job_type::vlocal_meta
+                || inout->job == Gint_Tools::job_type::srot)
+            {
                 if(!GlobalV::GAMMA_ONLY_LOCAL)
                 {
                     pvpR_thread = new double[nnrg];
@@ -247,8 +253,21 @@ void Gint::cal_gint(Gint_inout *inout)
 					#endif
 					delete[] vldr3;
 					delete[] vkdr3;
-				}
-			} // int grid_index
+                }
+                else if (inout->job == Gint_Tools::job_type::srot)
+                {
+#ifdef _OPENMP
+                    this->gint_kernel_Srot(inout->ginv, na_grid, grid_index, delta_r, dv, LD_pool, pvpR_thread);
+#else
+                    this->gint_kernel_Srot(inout->ginv, na_grid, grid_index, delta_r, dv, LD_pool, this->pvpR_reduced[0]);
+#endif
+                }
+                else if (inout->job == Gint_Tools::job_type::srotk)
+                {
+                    this->gint_kernel_Srot(na_grid, grid_index, delta_r, dv, LD_pool, inout);
+                }
+
+            } // int grid_index
 
 #ifdef _OPENMP
 			if(inout->job==Gint_Tools::job_type::vlocal || inout->job==Gint_Tools::job_type::vlocal_meta)
@@ -284,7 +303,17 @@ void Gint::cal_gint(Gint_inout *inout)
 				{
 					inout->svl_dphi[0]+=svl_dphi_thread;
 				}
-			}
+            }
+
+            if (inout->job == Gint_Tools::job_type::srot)
+            {
+#pragma omp critical(gint_k)
+                for (int innrg = 0; innrg < nnrg; innrg++)
+                {
+                    pvpR_reduced[0][innrg] += pvpR_thread[innrg];
+                }
+                delete[] pvpR_thread;
+            }
 #endif
 		} // end of #pragma omp parallel
 
@@ -300,8 +329,11 @@ void Gint::cal_gint(Gint_inout *inout)
 	if(inout->job==Gint_Tools::job_type::rho) ModuleBase::timer::tick("Gint_interface","cal_gint_rho");
 	if(inout->job==Gint_Tools::job_type::tau) ModuleBase::timer::tick("Gint_interface","cal_gint_tau");
 	if(inout->job==Gint_Tools::job_type::force) ModuleBase::timer::tick("Gint_interface","cal_gint_force");
-	if(inout->job==Gint_Tools::job_type::force_meta) ModuleBase::timer::tick("Gint_interface","cal_gint_force_meta");
-	return;
+    if (inout->job == Gint_Tools::job_type::force_meta) ModuleBase::timer::tick("Gint_interface", "cal_gint_force_meta");
+    if (inout->job == Gint_Tools::job_type::srot) ModuleBase::timer::tick("Gint_interface", "cal_gint_srot");
+    if (inout->job == Gint_Tools::job_type::srotk) ModuleBase::timer::tick("Gint_interface", "cal_gint_srotk");
+
+    return;
 }
 
 void Gint::prep_grid(
diff --git a/source/module_hamilt_lcao/module_gint/gint.h b/source/module_hamilt_lcao/module_gint/gint.h
index 87ca5bae3..a838107f8 100644
--- a/source/module_hamilt_lcao/module_gint/gint.h
+++ b/source/module_hamilt_lcao/module_gint/gint.h
@@ -196,6 +196,28 @@ class Gint
         double** dpsiz_dm,
         double* rho);
 
+    //------------------------------------------------------
+    // in gint_Skrot.cpp 
+    //------------------------------------------------------
+    ///calculate < phi_0|g^{-1}phi_R > for exx-symmetry
+    void gint_kernel_Srot(
+        const ModuleBase::Matrix3& ginv,
+        const int na_grid,
+        const int grid_index,
+        const double delta_r,
+        const double dv,
+        const int LD_pool,
+        double* pvpR_reduced);
+
+    /// calculate $<\phi_{k_1}|\phi_{k_2}> = \int{dr} \sum_{R_1}\phi_{R_1}(r)exp(-ik_1*R_1) \sum_{R_2}\phi_{R_2}(r)exp(ik_2*R_2)$
+    void gint_kernel_Srot(
+        const int na_grid,
+        const int grid_index,
+        const double delta_r,
+        const double dv,
+        const int LD_pool,
+        Gint_inout* inout);
+
     // dimension: [GlobalC::LNNR.nnrg] 
     // save the < phi_0i | V | phi_Rj > in sparse H matrix.
     bool pvpR_alloc_flag = false;
diff --git a/source/module_hamilt_lcao/module_gint/gint_Srot.cpp b/source/module_hamilt_lcao/module_gint/gint_Srot.cpp
new file mode 100644
index 000000000..e9f8001d1
--- /dev/null
+++ b/source/module_hamilt_lcao/module_gint/gint_Srot.cpp
@@ -0,0 +1,394 @@
+#include "module_base/global_function.h"
+#include "module_base/global_variable.h"
+#include "gint_k.h"
+#include "module_basis/module_ao/ORB_read.h"
+#include "grid_technique.h"
+#include "module_base/ylm.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "module_base/parallel_reduce.h"
+#include "module_base/timer.h"
+#include "module_base/tool_threading.h"
+#include "module_cell/module_neighbor/sltk_grid_driver.h"
+#include "module_base/libm/libm.h"
+
+void Gint::gint_kernel_Srot(
+    const ModuleBase::Matrix3& ginv,
+    const int na_grid,
+    const int grid_index,
+    const double delta_r,
+    const double dv,
+    const int LD_pool,
+    double* pvpR_in)
+{
+    //prepare block information
+    int* block_iw, * block_index, * block_size;
+    bool** cal_flag;
+    Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, block_iw, block_index, block_size, cal_flag);
+
+    //evaluate phi and phirot on grids
+    Gint_Tools::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
+    Gint_Tools::Array_Pool<double> psir_ylmrot(this->bxyz, LD_pool);
+    Gint_Tools::cal_psir_ylm(*this->gridt,
+        this->bxyz, na_grid, grid_index, delta_r,
+        block_index, block_size,
+        cal_flag,
+        psir_ylm.ptr_2D);
+    //phirot(r) = f(r)*Ylm(g^{-1}r), here calculate phirot(r)*dv
+    Gint_Tools::cal_psir_ylmrot_dv(*this->gridt, ginv,
+        this->bxyz, na_grid, grid_index, delta_r, dv,
+        block_index, block_size,
+        cal_flag,
+        psir_ylmrot.ptr_2D);
+
+
+    //integrate (psi_mu*v(r)*dv) * psi_nu on grid
+    //and accumulates to the corresponding element in Hamiltonian
+    this->cal_meshball_vlocal_k(
+        na_grid, LD_pool, grid_index, block_size, block_index, block_iw, cal_flag,
+        psir_ylm.ptr_2D, psir_ylmrot.ptr_2D, pvpR_in);
+
+    //release memories
+    delete[] block_iw;
+    delete[] block_index;
+    delete[] block_size;
+    for (int ib = 0; ib < this->bxyz; ++ib)
+    {
+        delete[] cal_flag[ib];
+    }
+    delete[] cal_flag;
+
+    return;
+}
+
+
+void Gint::gint_kernel_Srot(
+    const int na_grid,
+    const int grid_index,
+    const double delta_r,
+    const double dv,
+    const int LD_pool,
+    Gint_inout* inout)
+{
+    const int& lgd = this->gridt->lgd;
+    // ModuleBase::GlobalFunc::ZEROS(inout->Srotk_grid->data(), lgd * lgd);
+    //prepare block information
+    // block_index[ia]: start orbital index of atom ia in na_grid
+    int* block_iw, * block_index, * block_size;
+    bool** cal_flag;
+    Gint_Tools::get_block_info(*this->gridt, this->bxyz, na_grid, grid_index, block_iw, block_index, block_size, cal_flag);
+
+    //evaluate psi on grids
+    Gint_Tools::Array_Pool<double> psir_ylm(this->bxyz, LD_pool);
+    Gint_Tools::cal_psir_ylm(*this->gridt,
+        this->bxyz, na_grid, grid_index, delta_r,
+        block_index, block_size,
+        cal_flag,
+        psir_ylm.ptr_2D);
+
+    for (int ia1 = 0; ia1 < na_grid; ++ia1)
+    {
+        const int mcell_index1 = this->gridt->bcell_start[grid_index] + ia1;
+        const int iat1 = this->gridt->which_atom[mcell_index1];
+        const int T1 = GlobalC::ucell.iat2it[iat1];
+        Atom* atom1 = &GlobalC::ucell.atoms[T1];
+        const int I1 = GlobalC::ucell.iat2ia[iat1];
+        //find R by which_unitcell and cal kphase
+        const int id1_ucell = this->gridt->which_unitcell[mcell_index1];
+        const int R1x = this->gridt->ucell_index2x[id1_ucell] + this->gridt->minu1;
+        const int R1y = this->gridt->ucell_index2y[id1_ucell] + this->gridt->minu2;
+        const int R1z = this->gridt->ucell_index2z[id1_ucell] + this->gridt->minu3;
+        ModuleBase::Vector3<double> R1((double)R1x, (double)R1y, (double)R1z);
+        const double arg1 = -(inout->kd1 * R1) * ModuleBase::TWO_PI;
+        const std::complex<double> kphase1 = std::complex <double>(cos(arg1), sin(arg1));
+        // get the start index of local orbitals.
+        const int start1 = GlobalC::ucell.itiaiw2iwt(T1, I1, 0);
+        const int* iw1_lo = &this->gridt->trace_lo[start1];
+        for (int ia2 = 0;ia2 < na_grid;++ia2)
+        {
+            const int mcell_index2 = this->gridt->bcell_start[grid_index] + ia2;
+            const int iat2 = this->gridt->which_atom[mcell_index2];
+            const int T2 = GlobalC::ucell.iat2it[iat2];
+            Atom* atom2 = &GlobalC::ucell.atoms[T2];
+            const int I2 = GlobalC::ucell.iat2ia[iat2];
+            //find R by which_unitcell and cal kphase
+            const int id2_ucell = this->gridt->which_unitcell[mcell_index2];
+            const int R2x = this->gridt->ucell_index2x[id2_ucell] + this->gridt->minu1;
+            const int R2y = this->gridt->ucell_index2y[id2_ucell] + this->gridt->minu2;
+            const int R2z = this->gridt->ucell_index2z[id2_ucell] + this->gridt->minu3;
+            ModuleBase::Vector3<double> R2((double)R2x, (double)R2y, (double)R2z);
+            if (grid_index == 0 && inout->kd1.norm() < 1e-5 && inout->kd2.norm() < 1e-5)
+            {
+                GlobalV::ofs_running << "R1_int=" << R1x << " " << R1y << " " << R1z << std::endl;
+                GlobalV::ofs_running << "R2_int=" << R2x << " " << R2y << " " << R2z << std::endl;
+            }
+            const double arg2 = (inout->kd2 * R2) * ModuleBase::TWO_PI;
+            const std::complex<double> kphase2 = std::complex <double>(cos(arg2), sin(arg2));
+            // get the start index of local orbitals.
+            const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0);
+            const int* iw2_lo = &this->gridt->trace_lo[start2];
+            for (int iw1 = 0; iw1 < atom1->nw; ++iw1)
+            {
+                for (int iw2 = 0; iw2 < atom2->nw; ++iw2)
+                {
+                    for (int ib = 0; ib < this->bxyz; ib++)
+                    {
+                        if (cal_flag[ib][ia1] && cal_flag[ib][ia2])
+                        {
+                            double* psi1 = &psir_ylm.ptr_2D[ib][block_index[ia1]];
+                            double* psi2 = &psir_ylm.ptr_2D[ib][block_index[ia2]];
+                            inout->Srotk_grid->at(iw1_lo[iw1] * lgd + iw2_lo[iw2]) += std::complex<double>(psi1[iw1], 0.0) * psi2[iw2] * kphase1 * kphase2 * dv;
+                        }// cal_flag
+                    }//ib
+                }//iw2
+            }//iw1
+        }//ia2
+    }// ia1
+    delete[] block_iw;
+    delete[] block_index;
+    delete[] block_size;
+    for (int ib = 0; ib < this->bxyz; ++ib)
+    {
+        delete[] cal_flag[ib];
+    }
+    delete[] cal_flag;
+}
+
+std::vector<std::complex<double>> Gint_k::grid_to_2d(
+    const Parallel_2D& p2d,
+    const std::vector<std::complex<double>>& mat_grid)const
+{
+    ModuleBase::TITLE("Gint_k", "grid_to_2d");
+    std::cout << "lgd=" << this->gridt->lgd << std::endl;
+    const int& lgd = this->gridt->lgd;
+    const int& nlocal = GlobalV::NLOCAL;
+    assert(mat_grid.size() == lgd * lgd);
+
+    std::vector<std::complex<double>> mat_2d(p2d.get_local_size(), 0);
+    for (int i = 0; i < nlocal; ++i)
+    {
+        std::vector<std::complex<double>> tmp(nlocal, 0);
+        int mug = this->gridt->trace_lo[i];
+        if (mug > 0)
+            for (int j = 0; j < nlocal; ++j)
+            {
+                int nug = this->gridt->trace_lo[j];
+                if (nug > 0) tmp[j] = mat_grid[mug * lgd + nug];
+            }
+        Parallel_Reduce::reduce_complex_double_pool(tmp.data(), tmp.size());
+        for (int j = 0; j < nlocal; j++)
+        {
+            if (p2d.in_this_processor(i, j))
+            {   //set the value
+                const int li = p2d.global2local_row(i);
+                const int lj = p2d.global2local_col(j);
+                long index = ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER() ?
+                    lj * p2d.get_row_size() + li : lj * p2d.get_row_size() + li;
+                mat_2d[index] += tmp[j];
+            }
+        }
+    }
+    return mat_2d;
+}
+
+std::vector<std::complex<double>> Gint_k::folding_Srot_k(
+    const ModuleBase::Vector3<double>& kvec_d,
+    const Parallel_2D& p2d,
+    const double* data)
+{
+    ModuleBase::TITLE("Gint_k", "folding_Srot_k");
+    ModuleBase::timer::tick("Gint_k", "folding_Srot_k");
+    if (!pvpR_alloc_flag)
+    {
+        ModuleBase::WARNING_QUIT("Gint_k::destroy_pvpR", "pvpR hasnot been allocated yet!");
+    }
+    int lgd = this->gridt->lgd;
+    std::complex<double>** pvp = new std::complex<double>*[lgd];
+    std::complex<double>* pvp_base = new std::complex<double>[lgd * lgd];
+    for (int i = 0; i < lgd; i++)
+    {
+        pvp[i] = pvp_base + i * lgd;
+    }
+
+    auto init_pvp = [&](int num_threads, int thread_id)
+        {
+            int beg, len;
+            ModuleBase::BLOCK_TASK_DIST_1D(num_threads, thread_id, lgd * lgd, 256, beg, len);
+            ModuleBase::GlobalFunc::ZEROS(pvp_base + beg, len);
+        };
+    ModuleBase::OMP_PARALLEL(init_pvp);
+#ifdef _OPENMP
+#pragma omp parallel
+    {
+#endif
+        ModuleBase::Vector3<double> tau1, dtau, dR;
+#ifdef _OPENMP
+#pragma omp for schedule(dynamic)
+#endif
+        for (int iat = 0; iat < GlobalC::ucell.nat; ++iat)
+        {
+            const int T1 = GlobalC::ucell.iat2it[iat];
+            const int I1 = GlobalC::ucell.iat2ia[iat];
+            {
+                // atom in this grid piece.
+                if (this->gridt->in_this_processor[iat])
+                {
+                    Atom* atom1 = &GlobalC::ucell.atoms[T1];
+                    const int start1 = GlobalC::ucell.itiaiw2iwt(T1, I1, 0);
+
+                    // get the start positions of elements.
+                    const int DM_start = this->gridt->nlocstartg[iat];
+
+                    // get the coordinates of adjacent atoms.
+                    tau1 = GlobalC::ucell.atoms[T1].tau[I1];
+                    //GlobalC::GridD.Find_atom(tau1);	
+                    AdjacentAtomInfo adjs;
+                    GlobalC::GridD.Find_atom(GlobalC::ucell, tau1, T1, I1, &adjs);
+                    // search for the adjacent atoms.
+                    int nad = 0;
+
+                    for (int ad = 0; ad < adjs.adj_num + 1; ad++)
+                    {
+                        // get iat2
+                        const int T2 = adjs.ntype[ad];
+                        const int I2 = adjs.natom[ad];
+                        const int iat2 = GlobalC::ucell.itia2iat(T2, I2);
+
+
+                        // adjacent atom is also on the grid.
+                        if (this->gridt->in_this_processor[iat2])
+                        {
+                            Atom* atom2 = &GlobalC::ucell.atoms[T2];
+                            dtau = adjs.adjacent_tau[ad] - tau1;
+                            double distance = dtau.norm() * GlobalC::ucell.lat0;
+                            double rcut = GlobalC::ORB.Phi[T1].getRcut() + GlobalC::ORB.Phi[T2].getRcut();
+
+                            // for the local part, only need to calculate <phi_i | phi_j> within range
+                            // mohan note 2012-07-06
+                            if (distance < rcut)
+                            {
+                                const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0);
+
+                                // calculate the distance between iat1 and iat2.
+                                dR.x = adjs.box[ad].x;
+                                dR.y = adjs.box[ad].y;
+                                dR.z = adjs.box[ad].z;
+
+                                // calculate the phase factor exp(ikR).
+                                const double arg = (kvec_d * dR) * ModuleBase::TWO_PI;
+                                double sinp, cosp;
+                                ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                                const std::complex<double> phase = std::complex<double>(cosp, sinp);
+                                int ixxx = DM_start + this->gridt->find_R2st[iat][nad];
+
+                                for (int iw = 0; iw < atom1->nw; iw++)
+                                {
+                                    std::complex<double>* vij = pvp[this->gridt->trace_lo[start1 + iw]];
+                                    const int* iw2_lo = &this->gridt->trace_lo[start2];
+                                    // get the <phi | V | phi>(R) Hamiltonian.
+                                    // const double* vijR = &pvpR_reduced[0][ixxx];
+                                    const double* vijR = &data[ixxx];
+                                    for (int iw2 = 0; iw2 < atom2->nw; ++iw2)
+                                    {
+                                        vij[iw2_lo[iw2]] += vijR[iw2] * phase;
+                                    }
+                                    ixxx += atom2->nw;
+                                }
+                                ++nad;
+                            }// end distance<rcut
+                        }
+                    }// end ad
+                }
+            }// end ia
+        }// end it
+#ifdef _OPENMP
+    }
+#endif
+
+    // Distribution of data.
+    ModuleBase::timer::tick("Gint_k", "Distri");
+    const int nlocal = GlobalV::NLOCAL;
+    std::vector<std::complex<double>> tmp(nlocal);
+    std::vector<std::complex<double>> Srotk(p2d.get_local_size(), 0);  //result
+
+#ifdef _OPENMP
+#pragma omp parallel
+    {
+#endif
+        //loop each row with index i, than loop each col with index j 
+        for (int i = 0; i < nlocal; i++)
+        {
+#ifdef _OPENMP
+#pragma omp for
+#endif
+            for (int j = 0; j < nlocal; j++)
+            {
+                tmp[j] = std::complex<double>(0.0, 0.0);
+            }
+            int i_flag = i & 1; // i % 2 == 0
+            const int mug = this->gridt->trace_lo[i];
+            // if the row element is on this processor.
+            if (mug >= 0)
+            {
+#ifdef _OPENMP
+#pragma omp for
+#endif
+                for (int j = 0; j < nlocal; j++)
+                {
+                    const int nug = this->gridt->trace_lo[j];
+                    // if the col element is on this processor.
+                    if (nug >= 0)
+                    {
+                        tmp[j] = pvp[mug][nug];
+                    }
+                }
+            }
+#ifdef _OPENMP
+#pragma omp single
+            {
+#endif
+                // collect the matrix after folding.
+                Parallel_Reduce::reduce_complex_double_pool(tmp.data(), tmp.size());
+#ifdef _OPENMP
+            }
+#endif
+
+            //-----------------------------------------------------
+            // NOW! Redistribute the Hamiltonian matrix elements
+            // according to the HPSEPS's 2D distribution methods.
+            //-----------------------------------------------------
+#ifdef _OPENMP
+#pragma omp for
+#endif
+            for (int j = 0; j < nlocal; j++)
+            {
+                if (p2d.in_this_processor(i, j))
+                {   //set the value
+                    const int li = p2d.global2local_row(i);
+                    const int lj = p2d.global2local_col(j);
+                    long index;
+                    if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
+                    {
+                        index = lj * p2d.get_row_size() + li;
+                    }
+                    else
+                    {
+                        index = li * p2d.get_col_size() + lj;
+                    }
+                    Srotk[index] += tmp[j];
+                }
+            }
+        }
+#ifdef _OPENMP
+    }
+#endif
+
+    // delete the tmp matrix.
+    delete[] pvp;
+    delete[] pvp_base;
+    ModuleBase::timer::tick("Gint_k", "Distri");
+
+    ModuleBase::timer::tick("Gint_k", "folding_Srot_k");
+
+    return Srotk;
+}
+
diff --git a/source/module_hamilt_lcao/module_gint/gint_k.h b/source/module_hamilt_lcao/module_gint/gint_k.h
index ab739a6d5..b887d51ce 100644
--- a/source/module_hamilt_lcao/module_gint/gint_k.h
+++ b/source/module_hamilt_lcao/module_gint/gint_k.h
@@ -55,6 +55,8 @@ class Gint_k : public Gint
     // destroy the temporary <phi_0 | V | dphi_R> matrix element.
     void destroy_pvdpR(void);
 
+    const double get_pvpR(int ispin, int innrg) { return this->pvpR_reduced[ispin][innrg]; }
+
     // folding the < phi_0 | V | phi_R> matrix to 
     // <phi_0i | V | phi_0j>
     // V is (Vl + Vh + Vxc) if no Vna is used,
@@ -66,10 +68,9 @@ class Gint_k : public Gint
     //------------------------------------------------------
     // calculate the envelop function via grid integrals
     void cal_env_k(int ik,
-                   const std::complex<double>* psi_k,
-                   double* rho,
-                   const std::vector<ModuleBase::Vector3<double>>& kvec_c,
-                   const std::vector<ModuleBase::Vector3<double>>& kvec_d);
+        const std::complex<double>* psi_k,
+        double* rho,
+        const std::vector<ModuleBase::Vector3<double>>& kvec_d);
 
     //------------------------------------------------------
     // in gint_k_sparse.cpp 
@@ -119,7 +120,18 @@ class Gint_k : public Gint
         const double &sparse_threshold,
         LCAO_Matrix *LM);
 
-    private:
+    //------------------------------------------------------
+    // in gint_k_srot.cpp 
+    //------------------------------------------------------  
+    /// folding the S_rot(R) = < phi_0 | g^{-1}phi_R> matrix to Srot(k)=\sum{R} Srot(R)exp(ikR) , for exx-symmetry
+    std::vector<std::complex<double>> folding_Srot_k(
+        const ModuleBase::Vector3<double>& kvec_d,
+        const Parallel_2D& p2d,
+        const double* data);
+    std::vector<std::complex<double>> grid_to_2d(
+        const Parallel_2D& p2d,
+        const std::vector<std::complex<double>>& mat_grid) const;
+private:
 
     //----------------------------
     // key variable 
diff --git a/source/module_hamilt_lcao/module_gint/gint_k_env.cpp b/source/module_hamilt_lcao/module_gint/gint_k_env.cpp
index d3859a730..158911ab9 100644
--- a/source/module_hamilt_lcao/module_gint/gint_k_env.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint_k_env.cpp
@@ -6,10 +6,9 @@
 #include "module_base/timer.h"
 
 void Gint_k::cal_env_k(int ik, 
-                       const std::complex<double>* psi_k, 
-                       double* rho, 
-                       const std::vector<ModuleBase::Vector3<double>>& kvec_c, 
-                       const std::vector<ModuleBase::Vector3<double>>& kvec_d)
+    const std::complex<double>* psi_k,
+    double* rho,
+    const std::vector<ModuleBase::Vector3<double>>& kvec_d)
 {
     ModuleBase::TITLE("Gint_k", "cal_env_k");
     ModuleBase::timer::tick("Gint_k", "cal_env_k");
@@ -63,12 +62,7 @@ void Gint_k::cal_env_k(int ik,
                 const int Ry = this->gridt->ucell_index2y[id_ucell] + this->gridt->minu2;
                 const int Rz = this->gridt->ucell_index2z[id_ucell] + this->gridt->minu3;
                 ModuleBase::Vector3<double> R((double)Rx, (double)Ry, (double)Rz);
-                //std::cout << "kvec_d: " << kvec_d[ik].x << " " << kvec_d[ik].y << " " << kvec_d[ik].z << std::endl;
-                //std::cout << "kvec_c: " << kvec_c[ik].x << " " << kvec_c[ik].y << " " << kvec_c[ik].z << std::endl;
-                //std::cout << "R: " << R.x << " " << R.y << " " << R.z << std::endl;
                 const double arg = (kvec_d[ik] * R) * ModuleBase::TWO_PI;
-                const double arg1 = (kvec_c[ik] * (R.x * GlobalC::ucell.a1 + R.y * GlobalC::ucell.a2 + R.z * GlobalC::ucell.a3)) * ModuleBase::TWO_PI;
-                //std::cout << "arg0=" << arg << ", arg1=" << arg1 << std::endl;
                 const std::complex<double> kphase = std::complex <double>(cos(arg), sin(arg));
 
                 // get the start index of local orbitals.
diff --git a/source/module_hamilt_lcao/module_gint/gint_tools.cpp b/source/module_hamilt_lcao/module_gint/gint_tools.cpp
index 4827563c7..1af24c672 100644
--- a/source/module_hamilt_lcao/module_gint/gint_tools.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint_tools.cpp
@@ -777,9 +777,123 @@ namespace Gint_Tools
 		}
 		ModuleBase::timer::tick("Gint_Tools", "cal_dpsirr_ylm");
 		return;
-	}
-
-	// atomic basis sets
+    }
+    // #ifdef __EXX
+    void cal_psir_ylmrot_dv(
+        const Grid_Technique& gt,
+        const ModuleBase::Matrix3& ginv,
+        const int bxyz,
+        const int na_grid, 					// number of atoms on this grid
+        const int grid_index, 				// 1d index of FFT index (i,j,k)
+        const double delta_r, 				// delta_r of the uniform FFT grid
+        const double dv,                // volume of the FFT grid
+        const int* const block_index,  		// block_index[na_grid+1], count total number of atomis orbitals
+        const int* const block_size, 		// block_size[na_grid],	number of columns of a band
+        const bool* const* const cal_flag,
+        double* const* const psir_ylm) 	// cal_flag[bxyz][na_grid],	whether the atom-grid distance is larger than cutoff
+    {
+        ModuleBase::timer::tick("Gint_Tools", "cal_psir_ylmrot_dv");
+        std::vector<double> ylma;
+        for (int id = 0; id < na_grid; id++)
+        {
+            // there are two parameters we want to know here:
+            // in which bigcell of the meshball the atom is in?
+            // what's the cartesian coordinate of the bigcell?
+            const int mcell_index = gt.bcell_start[grid_index] + id;
+
+            const int iat = gt.which_atom[mcell_index]; // index of atom
+            const int it = GlobalC::ucell.iat2it[iat]; // index of atom type
+            const Atom* const atom = &GlobalC::ucell.atoms[it];
+            auto& OrbPhi = GlobalC::ORB.Phi[it];
+            std::vector<const double*> it_psi_uniform(atom->nw);
+            std::vector<const double*> it_dpsi_uniform(atom->nw);
+            // preprocess index
+            for (int iw = 0; iw < atom->nw; ++iw)
+            {
+                if (atom->iw2_new[iw])
+                {
+                    auto philn = &OrbPhi.PhiLN(atom->iw2l[iw], atom->iw2n[iw]);
+                    it_psi_uniform[iw] = &philn->psi_uniform[0];
+                    it_dpsi_uniform[iw] = &philn->dpsi_uniform[0];
+                }
+            }
+
+            // meshball_positions should be the bigcell position in meshball
+            // mt: from tau to bigcell
+            const int imcell = gt.which_bigcell[mcell_index];
+            const double mt[3] = {
+                gt.meshball_positions[imcell][0] - gt.tau_in_bigcell[iat][0],
+                gt.meshball_positions[imcell][1] - gt.tau_in_bigcell[iat][1],
+                gt.meshball_positions[imcell][2] - gt.tau_in_bigcell[iat][2] };
+
+            // number of grids in each big cell (bxyz)
+            for (int ib = 0; ib < bxyz; ib++)
+            {
+                double* p = &psir_ylm[ib][block_index[id]];
+                if (!cal_flag[ib][id])
+                {
+                    ModuleBase::GlobalFunc::ZEROS(p, block_size[id]);
+                }
+                else
+                {
+                    // dr: from tau to meshcell
+                    const double dr[3] = {
+                        gt.meshcell_pos[ib][0] + mt[0],
+                        gt.meshcell_pos[ib][1] + mt[1],
+                        gt.meshcell_pos[ib][2] + mt[2] };
+                    const double ginv_dr[3] =
+                    {
+                        dr[0] * ginv.e11 + dr[1] * ginv.e21 + dr[2] * ginv.e31,
+                        dr[0] * ginv.e12 + dr[1] * ginv.e22 + dr[2] * ginv.e32,
+                        dr[0] * ginv.e13 + dr[1] * ginv.e23 + dr[2] * ginv.e33 };
+                    double distance = std::sqrt(dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]);	// distance between atom and grid
+                    //if(distance[id] > gt.orbital_rmax) continue;
+                    if (distance < 1.0E-9) distance += 1.0E-9;
+
+                    //------------------------------------------------------
+                    // spherical harmonic functions Ylm
+                    //------------------------------------------------------
+                    //	Ylm::get_ylm_real(this->nnn[it], this->dr[id], ylma);
+                    ModuleBase::Ylm::sph_harm(GlobalC::ucell.atoms[it].nwl,
+                        ginv_dr[0] / distance,
+                        ginv_dr[1] / distance,
+                        ginv_dr[2] / distance,
+                        ylma);
+                    // these parameters are related to interpolation
+                    // because once the distance from atom to grid point is known,
+                    // we can obtain the parameters for interpolation and
+                    // store them first! these operations can save lots of efforts.
+                    const double position = distance / delta_r;
+                    const int ip = static_cast<int>(position);
+                    const double dx = position - ip;
+                    const double dx2 = dx * dx;
+                    const double dx3 = dx2 * dx;
+
+                    const double c3 = 3.0 * dx2 - 2.0 * dx3;
+                    const double c1 = 1.0 - c3;
+                    const double c2 = (dx - 2.0 * dx2 + dx3) * delta_r;
+                    const double c4 = (dx3 - dx2) * delta_r;
+
+                    double phi = 0;
+                    for (int iw = 0; iw < atom->nw; ++iw)
+                    {
+                        if (atom->iw2_new[iw])
+                        {
+                            auto psi_uniform = it_psi_uniform[iw];
+                            auto dpsi_uniform = it_dpsi_uniform[iw];
+                            phi = c1 * psi_uniform[ip] + c2 * dpsi_uniform[ip]			 // radial wave functions
+                                + c3 * psi_uniform[ip + 1] + c4 * dpsi_uniform[ip + 1];
+                        }
+                        p[iw] = phi * ylma[atom->iw2_ylm[iw]] * dv;
+                    } // end iw
+                }// end distance<=(GlobalC::ORB.Phi[it].getRcut()-1.0e-15)
+            }// end ib
+        }// end id
+        ModuleBase::timer::tick("Gint_Tools", "cal_psir_ylmrot_dv");
+        return;
+    }
+    // #endif
+    // atomic basis sets
 	// psir_vlbr3[bxyz][LD_pool]
     Gint_Tools::Array_Pool<double> get_psir_vlbr3(
         const int bxyz,
diff --git a/source/module_hamilt_lcao/module_gint/gint_tools.h b/source/module_hamilt_lcao/module_gint/gint_tools.h
index 5743371d1..7d0c5f60c 100644
--- a/source/module_hamilt_lcao/module_gint/gint_tools.h
+++ b/source/module_hamilt_lcao/module_gint/gint_tools.h
@@ -10,8 +10,8 @@
 
 namespace Gint_Tools
 {
-    enum class job_type{vlocal, rho, force, tau, vlocal_meta, force_meta, dvlocal};
-	//Hamiltonian, electron density, force, kinetic energy density, Hamiltonian for mGGA
+    enum class job_type { vlocal, rho, force, tau, vlocal_meta, force_meta, dvlocal, srot, srotk };
+    //Hamiltonian, electron density, force, kinetic energy density, Hamiltonian for mGGA, <phi(r)|phi(g^{-1}r) >for exx-symmetry
 }
 
 //the class is used to pass input/output variables
@@ -30,7 +30,11 @@ class Gint_inout
         int ispin;
 		LCAO_Matrix *lm;
 
-    //output
+        ModuleBase::Matrix3 ginv;
+        ModuleBase::Vector3<double> kd1, kd2;
+        std::vector<std::complex<double>>* Srotk_grid;//lgd*lgd
+
+        //output
         double** rho;
         ModuleBase::matrix* fvl_dphi;
         ModuleBase::matrix* svl_dphi;
@@ -144,6 +148,16 @@ class Gint_inout
             lm = lm_in;
             job = job_in;
         }
+
+        // srot, multi-k
+        Gint_inout(const ModuleBase::Matrix3& ginv_in, Gint_Tools::job_type job_in) { ginv = ginv_in; job = job_in; }
+        Gint_inout(ModuleBase::Vector3<double> kd1_in, ModuleBase::Vector3<double> kd2_in,
+            std::vector<std::complex<double>>* Srotk_grid_in, Gint_Tools::job_type job_in) {
+            kd1 = kd1_in;
+            kd2 = kd2_in;
+            Srotk_grid = Srotk_grid_in;
+            job = job_in;
+        }
 };
  
 namespace Gint_Tools
@@ -243,9 +257,25 @@ namespace Gint_Tools
 		double*const*const ddpsir_ylm_xz,
 		double*const*const ddpsir_ylm_yy,
 		double*const*const ddpsir_ylm_yz,
-		double*const*const ddpsir_ylm_zz);
+        double* const* const ddpsir_ylm_zz);
+
+    // #ifdef __EXX
+        /// $\phi(r)Ylm(g^{-1}r)$, for exx-symmetry
+    void cal_psir_ylmrot_dv(
+        const Grid_Technique& gt,
+        const ModuleBase::Matrix3& ginv,    ///< inverse rotation matirx
+        const int bxyz,
+        const int na_grid, // number of atoms on this grid 
+        const int grid_index, // 1d index of FFT index (i,j,k) 
+        const double delta_r, // delta_r of the uniform FFT grid
+        const double dv,                // volume of the FFT grid
+        const int* const block_index,  // count total number of atomis orbitals
+        const int* const block_size,
+        const bool* const* const cal_flag,
+        double* const* const psir_ylm); // whether the atom-grid distance is larger than cutoff
+    // #endif
 
-	// psir_ylm * vldr3
+    // psir_ylm * vldr3
     Gint_Tools::Array_Pool<double> get_psir_vlbr3(
         const int bxyz,
         const int na_grid,  					    // how many atoms on this (i,j,k) grid
diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp
index a988d8738..2d6498e6c 100644
--- a/source/module_io/input_conv.cpp
+++ b/source/module_io/input_conv.cpp
@@ -570,8 +570,8 @@ void Input_Conv::Convert(void)
         Exx_Abfs::Jle::tolerence = INPUT.exx_opt_orb_tolerence;
 
         // EXX does not support any symmetry analyse, force symmetry setting to -1
-        if (INPUT.calculation != "nscf")
-            ModuleSymmetry::Symmetry::symm_flag = -1;
+        // if (INPUT.calculation != "nscf")
+        //     ModuleSymmetry::Symmetry::symm_flag = -1;
     }
 #endif // __LCAO
 #endif // __EXX
diff --git a/source/module_io/istate_envelope.cpp b/source/module_io/istate_envelope.cpp
index f142e05e3..30380fea8 100644
--- a/source/module_io/istate_envelope.cpp
+++ b/source/module_io/istate_envelope.cpp
@@ -247,7 +247,7 @@ void IState_Envelope::begin(const psi::Psi<std::complex<double>>* psi,
                 }
 #endif
                 //deal with NSPIN=4
-                gk.cal_env_k(ik, lowf.wfc_k_grid[ik][ib], pes->charge->rho[ispin], kv.kvec_c, kv.kvec_d);
+                gk.cal_env_k(ik, lowf.wfc_k_grid[ik][ib], pes->charge->rho[ispin], kv.kvec_d);
 
                 std::stringstream ss;
                 ss << GlobalV::global_out_dir << "BAND" << ib + 1 << "_k_" << ik / nspin0 + 1 << "_s_" << ispin + 1 << "_ENV.cube";
diff --git a/source/module_io/mulliken_charge.cpp b/source/module_io/mulliken_charge.cpp
index 1ebfbe921..81cdce3c1 100644
--- a/source/module_io/mulliken_charge.cpp
+++ b/source/module_io/mulliken_charge.cpp
@@ -136,7 +136,7 @@ ModuleBase::matrix ModuleIO::cal_mulliken_k(const std::vector<ModuleBase::Comple
     for(size_t ik = 0; ik != kv.nks; ++ik)
     {
         uhm.LM->zeros_HSk('S');
-		uhm.LM->folding_fixedH(ik, kv.kvec_d);
+        uhm.LM->folding_fixedH(kv.kvec_d[ik]);
 
         ModuleBase::ComplexMatrix mud;
         mud.create(uhm.LM->ParaV->ncol, uhm.LM->ParaV->nrow);
diff --git a/source/module_io/write_dos_lcao.cpp b/source/module_io/write_dos_lcao.cpp
index 8bd62eb5c..fec715a2b 100644
--- a/source/module_io/write_dos_lcao.cpp
+++ b/source/module_io/write_dos_lcao.cpp
@@ -204,7 +204,7 @@ void ModuleIO::write_dos_lcao(const psi::Psi<double>* psid,
                 {
                     uhm.LM->allocate_HS_k(pv->nloc);
                     uhm.LM->zeros_HSk('S');
-                    uhm.LM->folding_fixedH(ik, kv.kvec_d);
+                    uhm.LM->folding_fixedH(kv.kvec_d[ik]);
 
                     psi->fix_k(ik);
                     psi::Psi<std::complex<double>> Dwfc(psi[0], 1);
diff --git a/source/module_io/write_proj_band_lcao.cpp b/source/module_io/write_proj_band_lcao.cpp
index 531383a27..848f9674a 100644
--- a/source/module_io/write_proj_band_lcao.cpp
+++ b/source/module_io/write_proj_band_lcao.cpp
@@ -130,7 +130,7 @@ void ModuleIO::write_proj_band_lcao(const psi::Psi<double> *psid,
                 {
                     uhm.LM->allocate_HS_k(pv->nloc);
                     uhm.LM->zeros_HSk('S');
-                    uhm.LM->folding_fixedH(ik,kv.kvec_d);
+                    uhm.LM->folding_fixedH(kv.kvec_d[ik]);
 
                     psi->fix_k(ik);
                     psi::Psi<std::complex<double>> Dwfc(psi[0], 1);
diff --git a/source/module_ri/Exx_LRI.h b/source/module_ri/Exx_LRI.h
index 06fd84d58..af0003043 100644
--- a/source/module_ri/Exx_LRI.h
+++ b/source/module_ri/Exx_LRI.h
@@ -64,7 +64,7 @@ class Exx_LRI
 	RI::Exx<TA,Tcell,Ndim,Tdata> exx_lri;
 
 	void cal_exx_ions();
-	void cal_exx_elec(const Parallel_Orbitals &pv);
+    void cal_exx_elec(const Parallel_Orbitals& pv, const ModuleSymmetry::Symmetry& symm);
 	void post_process_Hexx( std::map<TA, std::map<TAC, RI::Tensor<Tdata>>> &Hexxs_io ) const;
 	Tdata post_process_Eexx( const Tdata &Eexx_in ) const;
 
diff --git a/source/module_ri/Exx_LRI.hpp b/source/module_ri/Exx_LRI.hpp
index 05fd00ffe..efb6f8093 100644
--- a/source/module_ri/Exx_LRI.hpp
+++ b/source/module_ri/Exx_LRI.hpp
@@ -17,6 +17,7 @@
 #include "module_base/timer.h"
 #include "module_ri/serialization_cereal.h"
 #include "module_ri/Mix_DMk_2D.h"
+#include "module_ri/exx_symmetry.h"
 #include "module_basis/module_ao/parallel_orbitals.h"
 
 #include <RI/distribute/Distribute_Equally.h>
@@ -161,7 +162,7 @@ void Exx_LRI<Tdata>::cal_exx_ions()
 }
 
 template<typename Tdata>
-void Exx_LRI<Tdata>::cal_exx_elec(const Parallel_Orbitals &pv)
+void Exx_LRI<Tdata>::cal_exx_elec(const Parallel_Orbitals& pv, const ModuleSymmetry::Symmetry& symm)
 {
 	ModuleBase::TITLE("Exx_LRI","cal_exx_elec");
 	ModuleBase::timer::tick("Exx_LRI", "cal_exx_elec");
@@ -171,7 +172,7 @@ void Exx_LRI<Tdata>::cal_exx_elec(const Parallel_Orbitals &pv)
 	std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>> Ds =
 		GlobalV::GAMMA_ONLY_LOCAL
 		? RI_2D_Comm::split_m2D_ktoR<Tdata>(*p_kv, this->mix_DMk_2D.get_DMk_gamma_out(), pv)
-		: RI_2D_Comm::split_m2D_ktoR<Tdata>(*p_kv, this->mix_DMk_2D.get_DMk_k_out(), pv);
+        : RI_2D_Comm::split_m2D_ktoR<Tdata>(*p_kv, this->mix_DMk_2D.get_DMk_k_out(), pv, symm);
 
 	this->exx_lri.set_csm_threshold(this->info.cauchy_threshold);
 
diff --git a/source/module_ri/Exx_LRI_interface.h b/source/module_ri/Exx_LRI_interface.h
index 4cc34aabd..b35194f3b 100644
--- a/source/module_ri/Exx_LRI_interface.h
+++ b/source/module_ri/Exx_LRI_interface.h
@@ -3,7 +3,7 @@
 #include <memory>
 
 class Local_Orbital_Charge;
-class LCAO_Matrix;
+class LCAO_gen_fixedH;
 class Charge_Mixing;
 namespace elecstate
 {
@@ -32,10 +32,14 @@ class Exx_LRI_Interface
     void exx_beforescf(const K_Vectors& kv, const Charge_Mixing& chgmix);
 
     /// @brief in eachiterinit:  do DM mixing and calculate Hexx when entering 2nd SCF
-    void exx_eachiterinit(const Local_Orbital_Charge& loc, const Charge_Mixing& chgmix, const int& iter);
+    void exx_eachiterinit(
+        const Local_Orbital_Charge& loc,
+        const Charge_Mixing& chgmix,
+        const ModuleSymmetry::Symmetry& symm,
+        const int& iter);
 
     /// @brief in hamilt2density: calculate Hexx and Eexx
-    void exx_hamilt2density(elecstate::ElecState& elec, const Parallel_Orbitals& pv);
+    void exx_hamilt2density(elecstate::ElecState& elec, const Parallel_Orbitals& pv, const ModuleSymmetry::Symmetry& symm);
 
     /// @brief: in do_after_converge: add exx operators; do DM mixing if seperate loop
     bool exx_after_converge(
@@ -43,8 +47,19 @@ class Exx_LRI_Interface
         LCAO_Matrix& lm,
         const Local_Orbital_Charge& loc,
         const K_Vectors& kv,
+        const ModuleSymmetry::Symmetry& symm,
         int& iter);
-    
+
+    /// @brief calculate dm_k for all k points in kstar when symmetry is on
+    void restore_dm(
+        LCAO_Hamilt& uhm,
+        Local_Orbital_Charge& loc,
+        const K_Vectors& kv,
+        const UnitCell& ucell,
+        const psi::Psi<Tdata, psi::DEVICE_CPU>& psi,
+        const ModuleSymmetry::Symmetry& symm,
+        const ModuleBase::matrix& wg);
+
 private:
     std::shared_ptr<Exx_LRI<Tdata>> exx_lri;
 };
diff --git a/source/module_ri/Exx_LRI_interface.hpp b/source/module_ri/Exx_LRI_interface.hpp
index 3a677165c..318f257d8 100644
--- a/source/module_ri/Exx_LRI_interface.hpp
+++ b/source/module_ri/Exx_LRI_interface.hpp
@@ -3,6 +3,9 @@
 #include "module_ri/exx_opt_orb.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.h"
+#include "module_ri/exx_symmetry.h"
+#include "module_elecstate/cal_dm.h"
+#include "module_base/libm/libm.h"
 
 template<typename Tdata>
 void Exx_LRI_Interface<Tdata>::write_Hexxs(const std::string &file_name) const
@@ -54,8 +57,11 @@ void Exx_LRI_Interface<Tdata>::exx_beforescf(const K_Vectors& kv, const Charge_M
 		
 		// set initial parameter for mix_DMk_2D
 		if(GlobalC::exx_info.info_global.cal_exx)
-		{
-			exx_lri->mix_DMk_2D.set_nks(kv.nks, GlobalV::GAMMA_ONLY_LOCAL);
+        {
+            if (!GlobalV::GAMMA_ONLY_LOCAL && ModuleSymmetry::Symmetry::symm_flag == 1)
+                exx_lri->mix_DMk_2D.set_nks(kv.nkstot_full, GlobalV::GAMMA_ONLY_LOCAL);
+            else
+                exx_lri->mix_DMk_2D.set_nks(kv.nks, GlobalV::GAMMA_ONLY_LOCAL);
 			if(GlobalC::exx_info.info_global.separate_loop)
 			{
 				if(GlobalC::exx_info.info_global.mixing_beta_for_loop1==1.0)
@@ -81,14 +87,284 @@ void Exx_LRI_Interface<Tdata>::exx_beforescf(const K_Vectors& kv, const Charge_M
 #endif // __MPI
 }
 
+///adjust S'(k) acording to orbital parity of the column
+inline void orb_parity_Sk(
+    const Parallel_2D& p2d,
+    const ModuleSymmetry::Symmetry& symm,
+    const std::map<int, ModuleBase::Vector3<double>>& kstar_ibz,
+    std::vector<std::vector<std::complex<double>>>& sloc_kstar,
+    const UnitCell& ucell,
+    const LCAO_Orbitals& orb,
+    bool& col_inside)
+{
+    int iks = 0;
+    for (auto ks : kstar_ibz)
+    {
+        int isym = ks.first;
+        if (static_cast<int>(symm.gmatrix[isym].Det()) == -1)// inverse
+        {
+            // S'(k)_ij *= (-1)^{l_j}
+            for (int i = 0;i < p2d.get_row_size();++i)
+            {
+                int gj = 0; // global index of j
+                for (int it_j = 0;it_j < GlobalC::ucell.ntype;++it_j)
+                {
+                    for (int ia_j = 0;ia_j < GlobalC::ucell.atoms[it_j].na;++ia_j)
+                    {
+                        std::cout << "lmax:" << GlobalC::ORB.Phi[it_j].getLmax() << std::endl;
+                        std::cout << "ia=" << ia_j << std::endl;
+                        for (int l = 0;l <= GlobalC::ORB.Phi[it_j].getLmax();++l)
+                        {
+                            for (int n = 0;n < GlobalC::ORB.Phi[it_j].getNchi(l);++n)
+                            {
+                                for (int m = 0;m < 2 * l + 1;++m)
+                                {
+                                    std::cout << "l=" << l << ",m=" << m << ",n=" << n << std::endl;
+                                    int j = p2d.global2local_col(gj);
+                                    if (l % 2 == 1 && j >= 0)
+                                    {
+                                        std::cout << "p-orbs: global_index=" << gj << ", local_index=" << j << std::endl;
+                                        if (col_inside)
+                                            sloc_kstar[iks][i * p2d.get_col_size() + j] *= -1;
+                                        else
+                                            sloc_kstar[iks][i + j * p2d.get_row_size()] *= -1;
+                                    }
+                                    ++gj;
+                                }// end m
+                            }//end n
+                        }//end l
+                    }//end ia_j
+                } //end it_j
+            }//end i
+        }//end if inverse
+        ++iks;
+    }//end kstar
+}
+
+inline std::vector<std::complex<double>> folding_Srotk(
+    const ModuleBase::Vector3<double>& gk,
+    const ModuleBase::Vector3<double>& k,
+    const LCAO_Hamilt& uhm,
+    const UnitCell& ucell)
+{
+    ModuleBase::TITLE("Exx_LRI_Interface", "folding_Srotk");
+    bool row_inside = ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER();
+    auto kphase = [k](const double& arg) -> std::complex<double> {
+        double sinp, cosp;
+        ModuleBase::libm::sincos(arg, &sinp, &cosp);
+        return std::complex<double>(cosp, sinp);
+        };
+    std::vector<std::complex<double>> sloc_rot(uhm.LM->ParaV->get_local_size(), 0);
+    for (int T1 = 0; T1 < ucell.ntype; ++T1)
+    {
+        Atom* atom1 = &ucell.atoms[T1];
+        for (int I1 = 0; I1 < atom1->na; ++I1)
+        {
+            const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+            ModuleBase::Vector3<double> tau1 = atom1->tau[I1];
+            for (int T2 = 0;T2 < ucell.ntype;++T2)
+            {
+                Atom* atom2 = &ucell.atoms[T2];
+                double rcut = GlobalC::ORB.Phi[T1].getRcut() + GlobalC::ORB.Phi[T2].getRcut();
+                for (int I2 = 0;I2 < atom2->na;++I2)
+                {
+                    ModuleBase::Vector3<double> tau2 = atom2->tau[I2];
+                    const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+                    for (auto R1_int : uhm.LM->all_R_coor)
+                    {
+                        for (auto R2_int : uhm.LM->all_R_coor)
+                        {
+                            ModuleBase::Vector3<double> R1_d(static_cast<double>(R1_int.x), static_cast<double>(R1_int.y), static_cast<double>(R1_int.z));
+                            ModuleBase::Vector3<double> R2_d(static_cast<double>(R2_int.x), static_cast<double>(R2_int.y), static_cast<double>(R2_int.z));
+                            auto dr_c = (R1_d - R2_d) * ucell.latvec + tau1 - tau2;
+                            double distance = dr_c.norm() * ucell.lat0;
+                            if (distance < rcut)
+                            {
+                                const double arg1 = -(gk * R1_d) * ModuleBase::TWO_PI;
+                                const double arg2 = (k * R2_d) * ModuleBase::TWO_PI;
+                                auto dR_int = R1_int - R2_int;
+                                for (int ii = 0; ii < atom1->nw * GlobalV::NPOL; ii++)
+                                {
+                                    const int iw1_all = start1 + ii;
+                                    const int mu = uhm.LM->ParaV->global2local_row(iw1_all);
+                                    if (mu < 0)continue;
+                                    for (int jj = 0; jj < atom2->nw * GlobalV::NPOL; jj++)
+                                    {
+                                        int iw2_all = start2 + jj;
+                                        const int nu = uhm.LM->ParaV->global2local_col(iw2_all);
+                                        if (nu < 0)continue;
+                                        long index = row_inside ? nu * uhm.LM->ParaV->get_row_size() + mu : mu * uhm.LM->ParaV->get_col_size() + nu;
+                                        sloc_rot[index] += kphase(arg1) * kphase(arg2) * uhm.LM->SR_sparse[dR_int][iw1_all][iw2_all];
+                                    }
+                                }
+                            }
+                        }
+                    }//R1_int
+                }
+            }
+        }
+    }
+    for (auto& item : sloc_rot) item /= static_cast<double>(uhm.LM->all_R_coor.size());
+    return sloc_rot;
+}
+template<typename Tdata>
+void Exx_LRI_Interface<Tdata>::restore_dm(
+    LCAO_Hamilt& uhm,
+    Local_Orbital_Charge& loc,
+    const K_Vectors& kv,
+    const UnitCell& ucell,
+    const psi::Psi<Tdata, psi::DEVICE_CPU>& psi,
+    const ModuleSymmetry::Symmetry& symm,
+    const ModuleBase::matrix& wg)
+{
+    ModuleBase::TITLE("Exx_LRI_Interface", "restore_dm");
+    bool col_inside = !ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER();
+    // prepare: calculate the index of inverse matirx of each symmetry operation
+    std::vector<int> invgmat(symm.nrotk);
+    symm.gmatrix_invmap(symm.gmatrix, symm.nrotk, invgmat.data());
+
+
+    // if symmetry-on and multi-k, restore psi for all k-points in kstars before cal_dm
+    // calculate =S'^{-1}(k)S(gk) from S(gk) (once for all)
+    if (uhm.LM->invSkrot_Sgk.size() == 0)
+    {
+        uhm.calculate_SR_sparse(0);
+        // std::map<int, std::vector<double>> SrotR; // store Srot(R) for subsequent kstars' use
+        for (int ikibz = 0; ikibz < kv.nkstot; ++ikibz)
+        {
+
+            std::vector<std::vector<std::complex<double>>> sloc_kstar; //Srot(k) for each k current kstar
+
+            // first trail: calculate Srot(k) from Srot(g, R)  (wrong)
+            // for (auto& isym_kvecd : kv.kstars[ikibz])
+            // {
+            //     int isym = isym_kvecd.first;
+            //     ModuleBase::Vector3<double> kvec_d = isym_kvecd.second;
+            //     auto haveisym = SrotR.find(isym);
+            //     if (haveisym == SrotR.end())
+            //     {   //if SrotR[isym] is not calculated, calculate it. g^{-1} should be transformed to Cartesian coordinate
+            //         Gint_inout inout(ucell.latvec.Inverse() * symm.gmatrix[invgmat[0]] * ucell.latvec, Gint_Tools::job_type::srot);
+            //         uhm.GK.cal_gint(&inout);
+            //         // store SrotR[isym] for other kstars' use
+            //         std::vector<double> SrotR_isym(uhm.GK.gridt->nnrg);
+            //         for (int i = 0;i < uhm.GK.gridt->nnrg;++i) SrotR_isym[i] = uhm.GK.get_pvpR(0, i);
+            //         SrotR.insert(std::make_pair(isym, SrotR_isym));
+            //     }
+            //     sloc_kstar.push_back(ExxSym::rearrange_col(GlobalV::NLOCAL, *loc.ParaV, ucell, col_inside, symm.isym_rotiat_iat[isym],
+            //         uhm.GK.folding_Srot_k(kvec_d, *loc.ParaV, SrotR[isym].data())/*Srot(R) -> Srot(k) whose col to be rearranged*/));
+            // }
+
+            //second trail: calculate S(k) from S(gk) (wrong)
+            //S(gk) (S(R) have been calculated)
+            // uhm.LM->folding_fixedH(ikibz, kv.kvec_d);
+            //S'(k)
+            // for (auto& isym_kvecd : kv.kstars[ikibz])
+            // {
+            //     uhm.LM->zeros_HSk('S');
+            //     // uhm.LM->folding_fixedH(isym_kvecd.second);
+            //     uhm.LM->folding_fixedH(kv.kvec_d[ikibz] + kv.kvec_d[ikibz] - isym_kvecd.second);
+            //     //S'(k)
+            //     sloc_kstar.push_back(ExxSym::rearrange_col(GlobalV::NLOCAL, *uhm.LM->ParaV, GlobalC::ucell, col_inside, symm.isym_rotiat_iat[isym_kvecd.first],
+            //         uhm.LM->Sloc2));
+            // }
+            // orb_parity_Sk(*loc.ParaV, symm, kv.kstars[ikibz], sloc_kstar, GlobalC::ucell, GlobalC::ORB, col_inside);
+
+
+            //third trail: calculate gint of 2 envelopes
+            for (auto& isym_kvecd : kv.kstars[ikibz])
+            {
+                std::vector<std::complex<double>> sloc_ik_grid(uhm.GK.gridt->lgd * uhm.GK.gridt->lgd, 0);
+                Gint_inout inout(kv.kvec_d[ikibz], isym_kvecd.second, &sloc_ik_grid, Gint_Tools::job_type::srotk);
+                uhm.GK.cal_gint(&inout);
+                // output sloc_ik
+                GlobalV::ofs_running << "sloc_ik_grid of isym=" << isym_kvecd.first << ", kvec=" << isym_kvecd.second.x << " " << isym_kvecd.second.y << " " << isym_kvecd.second.z << std::endl;
+                for (int i = 0;i < GlobalV::NLOCAL;++i)
+                {
+                    for (int j = 0;j < GlobalV::NLOCAL;++j)
+                    {
+                        GlobalV::ofs_running << sloc_ik_grid[j * GlobalV::NLOCAL + i] << " ";
+                    }
+                    GlobalV::ofs_running << std::endl;
+                }
+                sloc_kstar.push_back(ExxSym::rearrange_col(GlobalV::NLOCAL, *loc.ParaV, GlobalC::ucell, col_inside, symm.isym_rotiat_iat[isym_kvecd.first],
+                    sloc_ik_grid));
+                // uhm.GK.grid_to_2d(*loc.ParaV, sloc_ik_grid)));
+            // }
+
+            // //forth trail: calculate S(g, k) using 2-center integral 
+            // for (auto& isym_kvecd : kv.kstars[ikibz])
+            // {
+                std::vector<std::complex<double>> sloc_ik = folding_Srotk(kv.kvec_d[ikibz], isym_kvecd.second, uhm, GlobalC::ucell);
+                GlobalV::ofs_running << "sloc_ik_center2 of isym=" << isym_kvecd.first << ", kvec=" << isym_kvecd.second.x << " " << isym_kvecd.second.y << " " << isym_kvecd.second.z << std::endl;
+                for (int i = 0;i < GlobalV::NLOCAL;++i)
+                {
+                    for (int j = 0;j < GlobalV::NLOCAL;++j)
+                    {
+                        GlobalV::ofs_running << sloc_ik[j * GlobalV::NLOCAL + i] << " ";
+                    }
+                    GlobalV::ofs_running << std::endl;
+                }
+                // sloc_kstar.push_back(ExxSym::rearrange_col(GlobalV::NLOCAL, *loc.ParaV, GlobalC::ucell, col_inside, symm.isym_rotiat_iat[isym_kvecd.first],
+                //     sloc_ik));
+            }
+            //calculate S'^{-1}(k)S(gk)
+#ifdef __MPI
+            uhm.LM->invSkrot_Sgk.push_back(ExxSym::cal_invSkrot_Sgk_scalapack(kv.nkstot, uhm.LM->Sloc2, sloc_kstar, GlobalV::NLOCAL, *uhm.LM->ParaV));
+#else
+            uhm.LM->invSkrot_Sgk.push_back(ExxSym::cal_invSkrot_Sgk_lapack(kv.nkstot, uhm.LM->Sloc2, sloc_kstar, GlobalV::NLOCAL));
+#endif
+        }//end ikibz
+        uhm.destroy_all_HSR_sparse();
+    }
+
+    //c(k)=S^{-1}(k)S(gk)c(gk)
+    psi::Psi<std::complex<double >, psi::DEVICE_CPU> psi_full =
+        ExxSym::restore_psik(kv.nkstot_full, psi, uhm.LM->invSkrot_Sgk,
+            GlobalV::NLOCAL, GlobalV::NBANDS, *uhm.LM->ParaV);
+    // test: output psifull
+    GlobalV::ofs_running << "restored psi_full" << std::endl;
+    for (int ik = 0;ik < kv.nkstot_full;++ik)
+    {
+        GlobalV::ofs_running << "ik=" << ik << std::endl;
+        psi_full.fix_k(ik);
+        for (int ir = 0;ir < GlobalV::NLOCAL;++ir)
+        {
+            for (int ic = 0;ic < GlobalV::NBANDS;++ic)
+            {
+                GlobalV::ofs_running << psi_full(ic, ir) << " ";
+            }
+            GlobalV::ofs_running << std::endl;
+        }
+    }
+    // set wg_full
+    ModuleBase::matrix wg_full(kv.nkstot_full, wg.nc);
+    int ik_full = 0;
+    for (int ikibz = 0; ikibz < kv.nkstot; ++ikibz)
+    {
+        int nkstar_ibz = kv.kstars[ikibz].size();
+        for (int ikstar = 0;ikstar < nkstar_ibz;++ikstar)
+        {
+            for (int ib = 0; ib < GlobalV::NBANDS; ++ib) wg_full(ik_full, ib) = wg(ikibz, ib) / nkstar_ibz;
+            ++ik_full;
+        }
+    }
+    assert(ik_full = kv.nkstot_full);
+
+    loc.dm_k.resize(kv.nkstot_full);
+    elecstate::cal_dm(loc.ParaV, wg_full, psi_full, loc.dm_k);
+}
 template<typename Tdata>
-void Exx_LRI_Interface<Tdata>::exx_eachiterinit(const Local_Orbital_Charge& loc, const Charge_Mixing& chgmix, const int& iter)
+void Exx_LRI_Interface<Tdata>::exx_eachiterinit(
+    const Local_Orbital_Charge& loc,
+    const Charge_Mixing& chgmix,
+    const ModuleSymmetry::Symmetry& symm,
+    const int& iter)
 {
     if (GlobalC::exx_info.info_global.cal_exx)
     {
         if (!GlobalC::exx_info.info_global.separate_loop && exx_lri->two_level_step)
         {
-			exx_lri->mix_DMk_2D.set_mixing_beta(chgmix.get_mixing_beta());
+            exx_lri->mix_DMk_2D.set_mixing_beta(chgmix.get_mixing_beta());
 			if(chgmix.get_mixing_mode() == "pulay")
 				exx_lri->mix_DMk_2D.set_coef_pulay(iter, chgmix);
 			const bool flag_restart = (iter==1) ? true : false;
@@ -97,13 +373,13 @@ void Exx_LRI_Interface<Tdata>::exx_eachiterinit(const Local_Orbital_Charge& loc,
 			else
 				exx_lri->mix_DMk_2D.mix(loc.dm_k, flag_restart);
 
-            exx_lri->cal_exx_elec(*loc.LOWF->ParaV);
+            exx_lri->cal_exx_elec(*loc.LOWF->ParaV, symm);
         }
     }
 }
 
 template<typename Tdata>
-void Exx_LRI_Interface<Tdata>::exx_hamilt2density(elecstate::ElecState& elec, const Parallel_Orbitals& pv)
+void Exx_LRI_Interface<Tdata>::exx_hamilt2density(elecstate::ElecState& elec, const Parallel_Orbitals& pv, const ModuleSymmetry::Symmetry& symm)
 {
     // Peize Lin add 2020.04.04
     if (XC_Functional::get_func_type() == 4 || XC_Functional::get_func_type() == 5)
@@ -117,7 +393,7 @@ void Exx_LRI_Interface<Tdata>::exx_hamilt2density(elecstate::ElecState& elec, co
         {
             XC_Functional::set_xc_type(GlobalC::ucell.atoms[0].ncpp.xc_func);
 
-            exx_lri->cal_exx_elec(pv);
+            exx_lri->cal_exx_elec(pv, symm);
             GlobalC::restart.info_load.restart_exx = true;
         }
     }
@@ -133,6 +409,7 @@ bool Exx_LRI_Interface<Tdata>::exx_after_converge(
     LCAO_Matrix& lm,
     const Local_Orbital_Charge& loc,
     const K_Vectors& kv,
+    const ModuleSymmetry::Symmetry& symm,
     int& iter)
 {
     // Add EXX operator
@@ -208,7 +485,7 @@ bool Exx_LRI_Interface<Tdata>::exx_after_converge(
 				exx_lri->mix_DMk_2D.mix(loc.dm_k, flag_restart);
 
             // GlobalC::exx_lcao.cal_exx_elec(p_esolver->LOC, p_esolver->LOWF.wfc_k_grid);
-            exx_lri->cal_exx_elec(*loc.LOWF->ParaV);
+            exx_lri->cal_exx_elec(*loc.LOWF->ParaV, symm);
             iter = 0;
             std::cout << " Updating EXX and rerun SCF" << std::endl;
             exx_lri->two_level_step++;
diff --git a/source/module_ri/Mix_DMk_2D.cpp b/source/module_ri/Mix_DMk_2D.cpp
index ad56cdfdc..d3c50d78d 100644
--- a/source/module_ri/Mix_DMk_2D.cpp
+++ b/source/module_ri/Mix_DMk_2D.cpp
@@ -51,7 +51,9 @@ void Mix_DMk_2D::mix(const std::vector<ModuleBase::matrix> &dm, const bool flag_
 void Mix_DMk_2D::mix(const std::vector<ModuleBase::ComplexMatrix> &dm, const bool flag_restart)
 {
 	ModuleBase::TITLE("Mix_DMk_2D","mix");
-	assert(this->mix_DMk_k.size() == dm.size());
+    std::cout << "mix_DMk_k.size() = " << this->mix_DMk_k.size() << std::endl;
+    std::cout << "dm.size() = " << dm.size() << std::endl;
+    assert(this->mix_DMk_k.size() == dm.size());
 	for(int ik=0; ik<dm.size(); ++ik)
 		this->mix_DMk_k[ik].mix(dm[ik], flag_restart);
 }
diff --git a/source/module_ri/RI_2D_Comm.cpp b/source/module_ri/RI_2D_Comm.cpp
index 336511813..8f7baa2e1 100644
--- a/source/module_ri/RI_2D_Comm.cpp
+++ b/source/module_ri/RI_2D_Comm.cpp
@@ -10,6 +10,17 @@
 #include <string>
 #include <stdexcept>
 
+RI::Tensor<double> RI_2D_Comm::tensor_conj(const RI::Tensor<double>& t)
+{
+    return t;
+}
+RI::Tensor<std::complex<double>> RI_2D_Comm::tensor_conj(const RI::Tensor<std::complex<double>>& t)
+{
+    RI::Tensor<std::complex<double>> r(t.shape);
+    for (int i = 0;i < t.data->size();++i)
+        (*r.data)[i] = std::conj((*t.data)[i]);
+    return r;
+}
 // judge[is] = {s0, s1}
 auto RI_2D_Comm::get_2D_judge(const Parallel_Orbitals &pv)
 -> std::vector<std::tuple<std::set<TA>, std::set<TA>>>
diff --git a/source/module_ri/RI_2D_Comm.h b/source/module_ri/RI_2D_Comm.h
index 719403527..1532ed049 100644
--- a/source/module_ri/RI_2D_Comm.h
+++ b/source/module_ri/RI_2D_Comm.h
@@ -30,7 +30,7 @@ namespace RI_2D_Comm
 //public:
 	template<typename Tdata, typename Tmatrix>
 	extern std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>>
-	split_m2D_ktoR(const K_Vectors &kv, const std::vector<const Tmatrix*> &mks_2D, const Parallel_Orbitals &pv);
+        split_m2D_ktoR(const K_Vectors& kv, const std::vector<const Tmatrix*>& mks_2D, const Parallel_Orbitals& pv, const ModuleSymmetry::Symmetry& symm = ModuleSymmetry::Symmetry());
 
 	// judge[is] = {s0, s1}
 	extern std::vector<std::tuple<std::set<TA>, std::set<TA>>>
@@ -64,6 +64,9 @@ namespace RI_2D_Comm
 	extern inline int get_is_block(const int is_k, const int is_row_b, const int is_col_b);
 	extern inline std::tuple<int,int> split_is_block(const int is_b);
 	extern inline int get_iwt(const int iat, const int iw_b, const int is_b);
+
+    RI::Tensor<double> tensor_conj(const RI::Tensor<double>& t);
+    RI::Tensor<std::complex<double>> tensor_conj(const RI::Tensor<std::complex<double>>& t);
 }
 
 #include "RI_2D_Comm.hpp"
diff --git a/source/module_ri/RI_2D_Comm.hpp b/source/module_ri/RI_2D_Comm.hpp
index 4a5746272..550089967 100644
--- a/source/module_ri/RI_2D_Comm.hpp
+++ b/source/module_ri/RI_2D_Comm.hpp
@@ -1,199 +1,285 @@
-//=======================
-// AUTHOR : Peize Lin
-// DATE :   2022-08-17
-//=======================
-
-#ifndef RI_2D_COMM_HPP
-#define RI_2D_COMM_HPP
-
-#include "RI_2D_Comm.h"
-#include "RI_Util.h"
-#include "module_hamilt_pw/hamilt_pwdft/global.h"
-#include "module_base/tool_title.h"
-#include "module_base/timer.h"
-
-#include <RI/global/Global_Func-2.h>
-
-#include <cmath>
-#include <string>
-#include <stdexcept>
-
-template<typename Tdata, typename Tmatrix>
-auto RI_2D_Comm::split_m2D_ktoR(const K_Vectors &kv, const std::vector<const Tmatrix*> &mks_2D, const Parallel_Orbitals &pv)
--> std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>>
-{
-	ModuleBase::TITLE("RI_2D_Comm","split_m2D_ktoR");
-	ModuleBase::timer::tick("RI_2D_Comm", "split_m2D_ktoR");
-
-	const TC period = RI_Util::get_Born_vonKarmen_period(kv);
-	const std::map<int,int> nspin_k = {{1,1}, {2,2}, {4,1}};
-	const double SPIN_multiple = std::map<int,double>{{1,0.5}, {2,1}, {4,1}}.at(GlobalV::NSPIN);							// why?
-
-	std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>> mRs_a2D(GlobalV::NSPIN);
-	for(int is_k=0; is_k<nspin_k.at(GlobalV::NSPIN); ++is_k)
-	{
-		const std::vector<int> ik_list = RI_2D_Comm::get_ik_list(kv, is_k);
-		for(const TC &cell : RI_Util::get_Born_von_Karmen_cells(period))
-		{
-			RI::Tensor<Tdata> mR_2D;
-			for(const int ik : ik_list)
-			{
-				using Tdata_m = typename Tmatrix::type;
-				RI::Tensor<Tdata_m> mk_2D = RI_Util::Matrix_to_Tensor<Tdata_m>(*mks_2D[ik]);
-				const Tdata_m frac = SPIN_multiple
-					* RI::Global_Func::convert<Tdata_m>( std::exp(
-						- ModuleBase::TWO_PI*ModuleBase::IMAG_UNIT * (kv.kvec_c[ik] * (RI_Util::array3_to_Vector3(cell)*GlobalC::ucell.latvec))));
-				if(mR_2D.empty())
-					mR_2D = RI::Global_Func::convert<Tdata>(mk_2D * frac);
-				else
-					mR_2D = mR_2D + RI::Global_Func::convert<Tdata>(mk_2D * frac);
-			}
-
-			for(int iwt0_2D=0; iwt0_2D!=mR_2D.shape[0]; ++iwt0_2D)
-			{
-				const int iwt0 =
-					ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER()
-                    ? pv.local2global_col(iwt0_2D)
-                    : pv.local2global_row(iwt0_2D);
-				int iat0, iw0_b, is0_b;
-				std::tie(iat0,iw0_b,is0_b) = RI_2D_Comm::get_iat_iw_is_block(iwt0);
-				const int it0 = GlobalC::ucell.iat2it[iat0];
-				for(int iwt1_2D=0; iwt1_2D!=mR_2D.shape[1]; ++iwt1_2D)
-				{
-					const int iwt1 =
-						ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER()
-                        ? pv.local2global_row(iwt1_2D)
-                        : pv.local2global_col(iwt1_2D);
-					int iat1, iw1_b, is1_b;
-					std::tie(iat1,iw1_b,is1_b) = RI_2D_Comm::get_iat_iw_is_block(iwt1);
-					const int it1 = GlobalC::ucell.iat2it[iat1];
-
-					const int is_b = RI_2D_Comm::get_is_block(is_k, is0_b, is1_b);
-					RI::Tensor<Tdata> &mR_a2D = mRs_a2D[is_b][iat0][{iat1,cell}];
-					if(mR_a2D.empty())
-						mR_a2D = RI::Tensor<Tdata>({static_cast<size_t>(GlobalC::ucell.atoms[it0].nw), static_cast<size_t>(GlobalC::ucell.atoms[it1].nw)});
-					mR_a2D(iw0_b,iw1_b) = mR_2D(iwt0_2D, iwt1_2D);
-				}
-			}
-		}
-	}
-	ModuleBase::timer::tick("RI_2D_Comm", "split_m2D_ktoR");
-	return mRs_a2D;
-}
-
-
-template<typename Tdata>
-void RI_2D_Comm::add_Hexx(
-	const K_Vectors &kv,
-	const int ik,
-	const double alpha,
-	const std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>> &Hs,
-	LCAO_Matrix &lm)
-{
-	ModuleBase::TITLE("RI_2D_Comm","add_Hexx");
-	ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx");
-  
-    const Parallel_Orbitals& pv = *lm.ParaV;
-
-	const std::map<int, std::vector<int>> is_list = {{1,{0}}, {2,{kv.isk[ik]}}, {4,{0,1,2,3}}};
-	for(const int is_b : is_list.at(GlobalV::NSPIN))
-	{
-		int is0_b, is1_b;
-		std::tie(is0_b,is1_b) = RI_2D_Comm::split_is_block(is_b);
-		for(const auto &Hs_tmpA : Hs[is_b])
-		{
-			const TA &iat0 = Hs_tmpA.first;
-			for(const auto &Hs_tmpB : Hs_tmpA.second)
-			{
-				const TA &iat1 = Hs_tmpB.first.first;
-				const TC &cell1 = Hs_tmpB.first.second;
-				const std::complex<double> frac = alpha
-					* std::exp( ModuleBase::TWO_PI*ModuleBase::IMAG_UNIT * (kv.kvec_c[ik] * (RI_Util::array3_to_Vector3(cell1)*GlobalC::ucell.latvec)) );
-				const RI::Tensor<Tdata> &H = Hs_tmpB.second;
-				for(size_t iw0_b=0; iw0_b<H.shape[0]; ++iw0_b)
-				{
-					const int iwt0 = RI_2D_Comm::get_iwt(iat0, iw0_b, is0_b);
-                    if (pv.global2local_row(iwt0) < 0)	continue;
-					for(size_t iw1_b=0; iw1_b<H.shape[1]; ++iw1_b)
-					{
-						const int iwt1 = RI_2D_Comm::get_iwt(iat1, iw1_b, is1_b);
-                        if (pv.global2local_col(iwt1) < 0)	continue;
-
-						if(GlobalV::GAMMA_ONLY_LOCAL)
-							lm.set_HSgamma(iwt0, iwt1,
-								RI::Global_Func::convert<double>(H(iw0_b, iw1_b)) * RI::Global_Func::convert<double>(frac),
-								'L', lm.Hloc.data());
-						else
-							lm.set_HSk(iwt0, iwt1,
-								RI::Global_Func::convert<std::complex<double>>(H(iw0_b, iw1_b)) * frac,
-								'L', -1);
-					}
-				}
-			}
-		}
-	}
-	ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx");
-}
-
-std::tuple<int,int,int>
-RI_2D_Comm::get_iat_iw_is_block(const int iwt)
-{
-	const int iat = GlobalC::ucell.iwt2iat[iwt];
-	const int iw = GlobalC::ucell.iwt2iw[iwt];
-	switch(GlobalV::NSPIN)
-	{
-		case 1: case 2:
-			return std::make_tuple(iat, iw, 0);
-		case 4:
-			return std::make_tuple(iat, iw/2, iw%2);
-		default:
-			throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
-	}
-}
-
-int RI_2D_Comm::get_is_block(const int is_k, const int is_row_b, const int is_col_b)
-{
-	switch(GlobalV::NSPIN)
-	{
-		case 1:		return 0;
-		case 2:		return is_k;
-		case 4:		return is_row_b*2+is_col_b;
-		default:	throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
-	}
-}
-
-std::tuple<int,int>
-RI_2D_Comm::split_is_block(const int is_b)
-{
-	switch(GlobalV::NSPIN)
-	{
-		case 1:	case 2:
-			return std::make_tuple(0, 0);
-		case 4:
-			return std::make_tuple(is_b/2, is_b%2);
-		default:
-			throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
-	}
-}
-
-
-
-int RI_2D_Comm::get_iwt(const int iat, const int iw_b, const int is_b)
-{
-	const int it = GlobalC::ucell.iat2it[iat];
-	const int ia = GlobalC::ucell.iat2ia[iat];
-	int iw=-1;
-	switch(GlobalV::NSPIN)
-	{
-		case 1: case 2:
-			iw = iw_b;			break;
-		case 4:
-			iw = iw_b*2+is_b;	break;
-		default:
-			throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
-	}
-	const int iwt = GlobalC::ucell.itiaiw2iwt(it,ia,iw);
-	return iwt;
-}
-
+//=======================
+// AUTHOR : Peize Lin
+// DATE :   2022-08-17
+//=======================
+
+#ifndef RI_2D_COMM_HPP
+#define RI_2D_COMM_HPP
+
+#include "RI_2D_Comm.h"
+#include "RI_Util.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "module_base/tool_title.h"
+#include "module_base/timer.h"
+
+#include <RI/global/Global_Func-2.h>
+
+#include <cmath>
+#include <string>
+#include <stdexcept>
+template<typename Tdata, typename Tmatrix>
+auto RI_2D_Comm::split_m2D_ktoR(const K_Vectors& kv, const std::vector<const Tmatrix*>& mks_2D, const Parallel_Orbitals& pv, const ModuleSymmetry::Symmetry& symm)
+-> std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>>
+{
+    GlobalV::ofs_running << "nkstot_full=" << kv.nkstot_full << std::endl;
+	ModuleBase::TITLE("RI_2D_Comm","split_m2D_ktoR");
+	ModuleBase::timer::tick("RI_2D_Comm", "split_m2D_ktoR");
+
+	const TC period = RI_Util::get_Born_vonKarmen_period(kv);
+	const std::map<int,int> nspin_k = {{1,1}, {2,2}, {4,1}};
+	const double SPIN_multiple = std::map<int,double>{{1,0.5}, {2,1}, {4,1}}.at(GlobalV::NSPIN);							// why?
+
+	std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>> mRs_a2D(GlobalV::NSPIN);
+	for(int is_k=0; is_k<nspin_k.at(GlobalV::NSPIN); ++is_k)
+	{
+		const std::vector<int> ik_list = RI_2D_Comm::get_ik_list(kv, is_k);
+		for(const TC &cell : RI_Util::get_Born_von_Karmen_cells(period))
+		{
+            RI::Tensor<Tdata> mR_2D;
+            int ik = 0;
+            for (const int ik_ibz : ik_list)
+            {
+                using Tdata_m = typename Tmatrix::type;
+                if (ModuleSymmetry::Symmetry::symm_flag == 1)
+                    for (auto ik_star : kv.kstars[ik_ibz])
+                    {
+                        RI::Tensor<Tdata_m> mk_2D = RI_Util::Matrix_to_Tensor<Tdata_m>(*mks_2D[ik]);
+                        //test: output mks_2D
+                        GlobalV::ofs_running << "ik=" << ik << ", kvec_d[ik]=" << ik_star.second.x << " " << ik_star.second.y << " " << ik_star.second.z << std::endl;
+                        GlobalV::ofs_running << "mk_2D=" << std::endl;
+                        for (int i = 0; i < mk_2D.shape[0]; ++i)
+                        {
+                            for (int j = 0; j < mk_2D.shape[1]; ++j)
+                            {
+                                GlobalV::ofs_running << mk_2D(i, j) << " ";
+                            }
+                            GlobalV::ofs_running << std::endl;
+                        }
+                        Tdata_m frac = SPIN_multiple
+                            * RI::Global_Func::convert<Tdata_m>(std::exp(
+                                -ModuleBase::TWO_PI * ModuleBase::IMAG_UNIT * (ik_star.second * GlobalC::ucell.G * (RI_Util::array3_to_Vector3(cell) * GlobalC::ucell.latvec))));
+                        if (static_cast<int>(std::round(SPIN_multiple * kv.wk[ik_ibz] / kv.kstars[ik_ibz].size() * kv.nkstot_full)) == 2) //time reversal symmetry
+                        {
+                            if (mR_2D.empty())
+                                mR_2D = RI::Global_Func::convert<Tdata>((mk_2D * 0.5 * frac) + tensor_conj(mk_2D * 0.5 * frac));
+                            else
+                                mR_2D = mR_2D + RI::Global_Func::convert<Tdata>((mk_2D * 0.5 * frac) + tensor_conj(mk_2D * 0.5 * frac));
+                        }
+                        else
+                        {
+                            if (mR_2D.empty())
+                                mR_2D = RI::Global_Func::convert<Tdata>(mk_2D * frac);
+                            else
+                                mR_2D = mR_2D + RI::Global_Func::convert<Tdata>(mk_2D * frac);
+                        }
+                        ++ik;
+                        //test: output mR_2D
+                        if (ik == kv.nkstot_full)
+                        {
+                            GlobalV::ofs_running << "mR_2D=" << std::endl;
+                            for (int i = 0; i < mR_2D.shape[0]; ++i)
+                            {
+                                for (int j = 0; j < mR_2D.shape[1]; ++j)
+                                {
+                                    GlobalV::ofs_running << mR_2D(i, j) << " ";
+                                }
+                                GlobalV::ofs_running << std::endl;
+                            }
+                        }
+                    }
+                else
+                {
+                    RI::Tensor<Tdata_m> mk_2D = RI_Util::Matrix_to_Tensor<Tdata_m>(*mks_2D[ik_ibz]);
+                    //test: output mks_2D
+                    GlobalV::ofs_running << "ik=" << ik << ", kvec_d[ik]=" << kv.kvec_d[ik_ibz].x << " " << kv.kvec_d[ik_ibz].y << " " << kv.kvec_d[ik_ibz].z << std::endl;
+                    GlobalV::ofs_running << "mk_2D=" << std::endl;
+                    for (int i = 0; i < mk_2D.shape[0]; ++i)
+                    {
+                        for (int j = 0; j < mk_2D.shape[1]; ++j)
+                        {
+                            GlobalV::ofs_running << mk_2D(i, j) << " ";
+                        }
+                        GlobalV::ofs_running << std::endl;
+                    }
+                    Tdata_m frac = SPIN_multiple
+                        * RI::Global_Func::convert<Tdata_m>(std::exp(
+                            -ModuleBase::TWO_PI * ModuleBase::IMAG_UNIT * (kv.kvec_c[ik_ibz] * (RI_Util::array3_to_Vector3(cell) * GlobalC::ucell.latvec))));
+                    if (static_cast<int>(std::round(SPIN_multiple * kv.wk[ik_ibz] * kv.nkstot_full)) == 2) //time reversal symmetry
+                    {
+                        if (mR_2D.empty())
+                            mR_2D = RI::Global_Func::convert<Tdata>((mk_2D * 0.5 * frac) + tensor_conj(mk_2D * 0.5 * frac));
+                        else
+                            mR_2D = mR_2D + RI::Global_Func::convert<Tdata>((mk_2D * 0.5 * frac) + tensor_conj(mk_2D * 0.5 * frac));
+                    }
+                    else
+                    {
+                        if (mR_2D.empty())
+                            mR_2D = RI::Global_Func::convert<Tdata>(mk_2D * frac);
+                        else
+                            mR_2D = mR_2D + RI::Global_Func::convert<Tdata>(mk_2D * frac);
+                    }
+                    ++ik;
+                    //test: output mR_2D
+                    if (ik == kv.nkstot)
+                    {
+                        GlobalV::ofs_running << "mR_2D=" << std::endl;
+                        for (int i = 0; i < mR_2D.shape[0]; ++i)
+                        {
+                            for (int j = 0; j < mR_2D.shape[1]; ++j)
+                            {
+                                GlobalV::ofs_running << mR_2D(i, j) << " ";
+                            }
+                            GlobalV::ofs_running << std::endl;
+                        }
+                    }
+                }
+            }
+
+			for(int iwt0_2D=0; iwt0_2D!=mR_2D.shape[0]; ++iwt0_2D)
+			{
+				const int iwt0 =
+					ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER()
+                    ? pv.local2global_col(iwt0_2D)
+                    : pv.local2global_row(iwt0_2D);
+				int iat0, iw0_b, is0_b;
+				std::tie(iat0,iw0_b,is0_b) = RI_2D_Comm::get_iat_iw_is_block(iwt0);
+				const int it0 = GlobalC::ucell.iat2it[iat0];
+				for(int iwt1_2D=0; iwt1_2D!=mR_2D.shape[1]; ++iwt1_2D)
+				{
+					const int iwt1 =
+						ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER()
+                        ? pv.local2global_row(iwt1_2D)
+                        : pv.local2global_col(iwt1_2D);
+					int iat1, iw1_b, is1_b;
+					std::tie(iat1,iw1_b,is1_b) = RI_2D_Comm::get_iat_iw_is_block(iwt1);
+					const int it1 = GlobalC::ucell.iat2it[iat1];
+
+					const int is_b = RI_2D_Comm::get_is_block(is_k, is0_b, is1_b);
+					RI::Tensor<Tdata> &mR_a2D = mRs_a2D[is_b][iat0][{iat1,cell}];
+					if(mR_a2D.empty())
+						mR_a2D = RI::Tensor<Tdata>({static_cast<size_t>(GlobalC::ucell.atoms[it0].nw), static_cast<size_t>(GlobalC::ucell.atoms[it1].nw)});
+					mR_a2D(iw0_b,iw1_b) = mR_2D(iwt0_2D, iwt1_2D);
+				}
+			}
+		}
+	}
+	ModuleBase::timer::tick("RI_2D_Comm", "split_m2D_ktoR");
+	return mRs_a2D;
+}
+
+
+template<typename Tdata>
+void RI_2D_Comm::add_Hexx(
+	const K_Vectors &kv,
+	const int ik,
+	const double alpha,
+	const std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>> &Hs,
+	LCAO_Matrix &lm)
+{
+	ModuleBase::TITLE("RI_2D_Comm","add_Hexx");
+	ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx");
+  
+    const Parallel_Orbitals& pv = *lm.ParaV;
+
+	const std::map<int, std::vector<int>> is_list = {{1,{0}}, {2,{kv.isk[ik]}}, {4,{0,1,2,3}}};
+	for(const int is_b : is_list.at(GlobalV::NSPIN))
+	{
+		int is0_b, is1_b;
+		std::tie(is0_b,is1_b) = RI_2D_Comm::split_is_block(is_b);
+		for(const auto &Hs_tmpA : Hs[is_b])
+		{
+			const TA &iat0 = Hs_tmpA.first;
+			for(const auto &Hs_tmpB : Hs_tmpA.second)
+			{
+				const TA &iat1 = Hs_tmpB.first.first;
+				const TC &cell1 = Hs_tmpB.first.second;
+				const std::complex<double> frac = alpha
+					* std::exp( ModuleBase::TWO_PI*ModuleBase::IMAG_UNIT * (kv.kvec_c[ik] * (RI_Util::array3_to_Vector3(cell1)*GlobalC::ucell.latvec)) );
+				const RI::Tensor<Tdata> &H = Hs_tmpB.second;
+				for(size_t iw0_b=0; iw0_b<H.shape[0]; ++iw0_b)
+				{
+					const int iwt0 = RI_2D_Comm::get_iwt(iat0, iw0_b, is0_b);
+                    if (pv.global2local_row(iwt0) < 0)	continue;
+					for(size_t iw1_b=0; iw1_b<H.shape[1]; ++iw1_b)
+					{
+						const int iwt1 = RI_2D_Comm::get_iwt(iat1, iw1_b, is1_b);
+                        if (pv.global2local_col(iwt1) < 0)	continue;
+
+						if(GlobalV::GAMMA_ONLY_LOCAL)
+							lm.set_HSgamma(iwt0, iwt1,
+								RI::Global_Func::convert<double>(H(iw0_b, iw1_b)) * RI::Global_Func::convert<double>(frac),
+								'L', lm.Hloc.data());
+						else
+							lm.set_HSk(iwt0, iwt1,
+								RI::Global_Func::convert<std::complex<double>>(H(iw0_b, iw1_b)) * frac,
+								'L', -1);
+					}
+				}
+			}
+		}
+	}
+	ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx");
+}
+
+std::tuple<int,int,int>
+RI_2D_Comm::get_iat_iw_is_block(const int iwt)
+{
+	const int iat = GlobalC::ucell.iwt2iat[iwt];
+	const int iw = GlobalC::ucell.iwt2iw[iwt];
+	switch(GlobalV::NSPIN)
+	{
+		case 1: case 2:
+			return std::make_tuple(iat, iw, 0);
+		case 4:
+			return std::make_tuple(iat, iw/2, iw%2);
+		default:
+			throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
+	}
+}
+
+int RI_2D_Comm::get_is_block(const int is_k, const int is_row_b, const int is_col_b)
+{
+	switch(GlobalV::NSPIN)
+	{
+		case 1:		return 0;
+		case 2:		return is_k;
+		case 4:		return is_row_b*2+is_col_b;
+		default:	throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
+	}
+}
+
+std::tuple<int,int>
+RI_2D_Comm::split_is_block(const int is_b)
+{
+	switch(GlobalV::NSPIN)
+	{
+		case 1:	case 2:
+			return std::make_tuple(0, 0);
+		case 4:
+			return std::make_tuple(is_b/2, is_b%2);
+		default:
+			throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
+	}
+}
+
+
+
+int RI_2D_Comm::get_iwt(const int iat, const int iw_b, const int is_b)
+{
+	const int it = GlobalC::ucell.iat2it[iat];
+	const int ia = GlobalC::ucell.iat2ia[iat];
+	int iw=-1;
+	switch(GlobalV::NSPIN)
+	{
+		case 1: case 2:
+			iw = iw_b;			break;
+		case 4:
+			iw = iw_b*2+is_b;	break;
+		default:
+			throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
+	}
+	const int iwt = GlobalC::ucell.itiaiw2iwt(it,ia,iw);
+	return iwt;
+}
+
 #endif
\ No newline at end of file
diff --git a/source/module_ri/RPA_LRI.h b/source/module_ri/RPA_LRI.h
index 5ada94e5d..f7c19d00f 100644
--- a/source/module_ri/RPA_LRI.h
+++ b/source/module_ri/RPA_LRI.h
@@ -41,9 +41,10 @@ template <typename Tdata> class RPA_LRI
     void init(const MPI_Comm &mpi_comm_in, const K_Vectors &kv_in);
     void cal_rpa_cv();
     void cal_postSCF_exx(const Local_Orbital_Charge& loc,
-                    const MPI_Comm& mpi_comm_in,
-                    const K_Vectors& kv,
-                    const Parallel_Orbitals& pv);
+        const MPI_Comm& mpi_comm_in,
+        const K_Vectors& kv,
+        const Parallel_Orbitals& pv,
+        const ModuleSymmetry::Symmetry& symm);
     void out_for_RPA(const Parallel_Orbitals& parav,
                      const psi::Psi<std::complex<double>> &psi,
                      const elecstate::ElecState *pelec);
diff --git a/source/module_ri/RPA_LRI.hpp b/source/module_ri/RPA_LRI.hpp
index a794b9b8f..d1ba2e3c3 100644
--- a/source/module_ri/RPA_LRI.hpp
+++ b/source/module_ri/RPA_LRI.hpp
@@ -59,11 +59,15 @@ template <typename Tdata> void RPA_LRI<Tdata>::cal_rpa_cv()
 
 template <typename Tdata>
 void RPA_LRI<Tdata>::cal_postSCF_exx(const Local_Orbital_Charge& loc,
-                const MPI_Comm& mpi_comm_in,
-                const K_Vectors& kv,
-                const Parallel_Orbitals& pv)
+    const MPI_Comm& mpi_comm_in,
+    const K_Vectors& kv,
+    const Parallel_Orbitals& pv,
+    const ModuleSymmetry::Symmetry& symm)
 {
-    exx_lri_rpa.mix_DMk_2D.set_nks(kv.nks, GlobalV::GAMMA_ONLY_LOCAL);
+    if (!GlobalV::GAMMA_ONLY_LOCAL && ModuleSymmetry::Symmetry::symm_flag == 1)
+        exx_lri_rpa.mix_DMk_2D.set_nks(kv.nkstot_full, GlobalV::GAMMA_ONLY_LOCAL);
+    else
+        exx_lri_rpa.mix_DMk_2D.set_nks(kv.nks, GlobalV::GAMMA_ONLY_LOCAL);
     exx_lri_rpa.mix_DMk_2D.set_mixing_mode(Mixing_Mode::No);
     if(GlobalV::GAMMA_ONLY_LOCAL)
         exx_lri_rpa.mix_DMk_2D.mix(loc.dm_gamma, true);
@@ -71,7 +75,7 @@ void RPA_LRI<Tdata>::cal_postSCF_exx(const Local_Orbital_Charge& loc,
         exx_lri_rpa.mix_DMk_2D.mix(loc.dm_k, true);
     exx_lri_rpa.init(mpi_comm_in, kv);
     exx_lri_rpa.cal_exx_ions();
-    exx_lri_rpa.cal_exx_elec(pv);
+    exx_lri_rpa.cal_exx_elec(pv, symm);
     // cout<<"postSCF_Eexx: "<<exx_lri_rpa.Eexx<<endl;
 }
 
diff --git a/source/module_ri/exx_symmetry.cpp b/source/module_ri/exx_symmetry.cpp
index 56f250e12..f30342ade 100644
--- a/source/module_ri/exx_symmetry.cpp
+++ b/source/module_ri/exx_symmetry.cpp
@@ -362,4 +362,14 @@ namespace ExxSym
         }
     }
 #endif
+
+    std::vector<ModuleBase::Vector3<double>> matfunc(
+        const std::vector<ModuleBase::Vector3<double>>& vec,
+        const ModuleBase::Matrix3& gmat,
+        const std::function<ModuleBase::Vector3<double>(const ModuleBase::Vector3<double>&, const ModuleBase::Matrix3&)>& f)
+    {
+        std::vector<ModuleBase::Vector3<double>> res;
+        for (auto& v : vec) res.push_back(f(v, gmat));
+        return res;
+    };
 }
\ No newline at end of file
diff --git a/source/module_ri/exx_symmetry.h b/source/module_ri/exx_symmetry.h
index e766a4e9e..b021ccc28 100644
--- a/source/module_ri/exx_symmetry.h
+++ b/source/module_ri/exx_symmetry.h
@@ -133,4 +133,10 @@ namespace ExxSym
         const int& nbasis,
         const Parallel_2D& p2d,
         const bool col_inside);
+
+
+    std::vector<ModuleBase::Vector3<double>> matfunc(
+        const std::vector<ModuleBase::Vector3<double>>& vec,
+        const ModuleBase::Matrix3& gmat,
+        const std::function<ModuleBase::Vector3<double>(const ModuleBase::Vector3<double>&, const ModuleBase::Matrix3&)>& f);
 }
\ No newline at end of file
diff --git a/source/module_ri/test/exx_symmetry_test.cpp b/source/module_ri/test/exx_symmetry_test.cpp
index 0dd0162af..beaa5066f 100644
--- a/source/module_ri/test/exx_symmetry_test.cpp
+++ b/source/module_ri/test/exx_symmetry_test.cpp
@@ -294,4 +294,37 @@ int main(int argc, char** argv)
 #ifdef __MPI
     MPI_Finalize();
 #endif
-}
\ No newline at end of file
+}
+
+TEST_F(SymExxTest, matfunc)
+{
+    // supercells
+    int nR = 1;
+    std::vector<ModuleBase::Vector3<double>> supercells;
+
+    for (int i = -nR;i <= nR;++i)
+        for (int j = -nR;j <= nR;++j)
+            for (int k = -nR;k <= nR;++k)
+                supercells.push_back(ModuleBase::Vector3<double>(i, j, k));
+
+    ModuleBase::Matrix3 inv2 = ModuleBase::Matrix3(-1, 1, 0, -1, 0, 0, -1, 0, 1);
+    ModuleBase::Matrix3 latvec(0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0);
+    ModuleBase::Matrix3 inv2_cart = latvec.Inverse() * inv2 * latvec;
+
+
+    // auto f = [](const ModuleBase::Vector3<double>& R, const ModuleBase::Matrix3& g) {return R - R * g; };
+    auto f = [](const ModuleBase::Vector3<double>& R, const ModuleBase::Matrix3& g) {return R * g; };
+    // std::vector<ModuleBase::Vector3<double>> R_minus_gR = ExxSym::matfunc(supercells, inv2, f);
+    std::vector<ModuleBase::Vector3<double>> R_minus_gR_cart = ExxSym::matfunc(supercells, inv2_cart, f);
+
+    // output 
+    // std::cout << "R_minus_gR(Direct)" << std::endl;
+    // for (auto& v : R_minus_gR) std::cout << v.x << " " << v.y << " " << v.z << std::endl;
+    //cartesian
+    // std::cout << "R_minus_gR (Cartesian)" << std::endl;
+    // for (auto& v : R_minus_gR) std::cout << (v * latvec).x << " " << (v * latvec).y << " " << (v * latvec).z << std::endl;
+    std::cout << "R_minus_gR_cart (Cartesian)" << std::endl;
+    for (auto& v : R_minus_gR_cart) std::cout << v.x << " " << v.y << " " << v.z << std::endl;
+    std::cout << "R_minus_gR(Direct)" << std::endl;
+    for (auto& v : R_minus_gR_cart) std::cout << (v * latvec.Inverse()).x << " " << (v * latvec.Inverse()).y << " " << (v * latvec.Inverse()).z << std::endl;
+}