diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index f72cc9ae6c..5416eb69e7 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -286,6 +286,7 @@ OBJS_HCONTAINER=base_matrix.o\
 
 OBJS_HSOLVER=diago_cg.o\
     diago_david.o\
+    diago_dav_subspace.o\
     diagh_consts.o\
     diago_bpcg.o\
     hsolver_pw.o\
diff --git a/source/module_base/lapack_connector.h b/source/module_base/lapack_connector.h
index 8dd312b177..1142ef93c5 100644
--- a/source/module_base/lapack_connector.h
+++ b/source/module_base/lapack_connector.h
@@ -22,6 +22,9 @@
 extern "C"
 {
 
+    int ilaenv_(int* ispec,const char* name,const char* opts,
+                const int* n1,const int* n2,const int* n3,const int* n4);
+
     // solve the generalized eigenproblem Ax=eBx, where A is Hermitian and complex couble
     // zhegv_ & zhegvd_ returns all eigenvalues while zhegvx_ returns selected ones
     void dsygvd_(const int* itype, const char* jobz, const char* uplo, const int* n,
@@ -60,9 +63,12 @@ extern "C"
                  const int* m, double* w, std::complex<double> *z, const int *ldz, 
                  std::complex<double> *work, const int* lwork, double* rwork, int* iwork, int* ifail, int* info);
 
-    void zhegv_(const int* itype,const char* jobz,const char* uplo,const int* n,
-                std::complex<double>* a,const int* lda,std::complex<double>* b,const int* ldb,
-                double* w,std::complex<double>* work,int* lwork,double* rwork,int* info);
+
+    void dsygvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
+                const int* n, double* A, const int* lda, double* B, const int* ldb,
+                const double* vl, const double* vu, const int* il, const int* iu,
+                const double* abstol, const int* m, double* w, double* Z, const int* ldz,
+                double* work, int* lwork, int*iwork, int* ifail, int* info);
 
     void chegvx_(const int* itype,const char* jobz,const char* range,const char* uplo,
              const int* n,std::complex<float> *a,const int* lda,std::complex<float> *b,
@@ -78,6 +84,16 @@ extern "C"
                  std::complex<double> *z,const int *ldz,std::complex<double> *work,const int* lwork,
                  double* rwork,int* iwork,int* ifail,int* info);
 
+    void zhegv_(const int* itype,const char* jobz,const char* uplo,const int* n,
+                std::complex<double>* a,const int* lda,std::complex<double>* b,const int* ldb,
+                double* w,std::complex<double>* work,int* lwork,double* rwork,int* info);
+    void chegv_(const int* itype,const char* jobz,const char* uplo,const int* n,
+                std::complex<float>* a,const int* lda,std::complex<float>* b,const int* ldb,
+                float* w,std::complex<float>* work,int* lwork,float* rwork,int* info);
+	void dsygv_(const int* itype, const char* jobz,const char* uplo, const int* n,
+				double* a,const int* lda,double* b,const int* ldb,
+	 			double* w,double* work,int* lwork,int* info);
+
     // solve the eigenproblem Ax=ex, where A is Hermitian and complex couble
     // zheev_ returns all eigenvalues while zheevx_ returns selected ones
     void zheev_(const char* jobz,const char* uplo,const int* n,std::complex<double> *a,
@@ -86,18 +102,6 @@ extern "C"
     void cheev_(const char* jobz,const char* uplo,const int* n,std::complex<float> *a,
                 const int* lda,float* w,std::complex<float >* work,const int* lwork,
                 float* rwork,int* info);
-
-	// solve the generalized eigenproblem Ax=eBx, where A is Symmetric and real couble
-    // dsygv_ returns all eigenvalues while dsygvx_ returns selected ones
-	void dsygv_(const int* itype, const char* jobz,const char* uplo, const int* n,
-				double* a,const int* lda,double* b,const int* ldb,
-	 			double* w,double* work,int* lwork,int* info);
-	void dsygvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
-                const int* n, double* A, const int* lda, double* B, const int* ldb,
-                const double* vl, const double* vu, const int* il, const int* iu,
-                const double* abstol, int* m, double* w, double* Z, const int* ldz,
-                double* work, int* lwork, int*iwork, int* ifail, int* info);
-    // solve the eigenproblem Ax=ex, where A is Symmetric and real double
 	void dsyev_(const char* jobz,const char* uplo,const int* n,double *a,
                 const int* lda,double* w,double* work,const int* lwork, int* info);
 
@@ -314,23 +318,19 @@ class LapackConnector
 	}
 
 public:
-    // wrap function of fortran lapack routine zhegvd.
     static inline
-    void zhegvd(const int itype, const char jobz, const char uplo, const int n, 
-                std::complex<double>* a, const int lda, 
-                const std::complex<double>* b, const int ldb, double* w, 
-                std::complex<double>* work, int lwork, double* rwork, int lrwork,
-                int* iwork, int liwork, int& info)
+    int ilaenv( int ispec, const char *name,const char *opts,const int n1,const int n2,
+                const int n3,const int n4)
     {
-        zhegvd_(&itype, &jobz, &uplo, &n, 
-                a, &lda, b, &ldb, w,
-                work, &lwork, rwork, &lrwork,
-                iwork, &liwork, &info);
+        const int nb = ilaenv_(&ispec, name, opts, &n1, &n2, &n3, &n4);
+        return nb;
     }
 
+
+
     // wrap function of fortran lapack routine zhegvd. (pointer version)
     static inline
-        void xhegvd(const int itype, const char jobz, const char uplo, const int n,
+    void xhegvd(const int itype, const char jobz, const char uplo, const int n,
             double* a, const int lda,
             const double* b, const int ldb, double* w,
             double* work, int lwork, double* rwork, int lrwork,
@@ -373,23 +373,9 @@ class LapackConnector
                 iwork, &liwork, &info);
     }
 
-    // wrap function of fortran lapack routine zheevx.
-    static inline
-    void zheevx( const int itype, const char jobz, const char range, const char uplo, const int n, 
-                 std::complex<double>* a, const int lda, 
-                 const double vl, const double vu, const int il, const int iu, const double abstol, 
-                 const int m, double* w, std::complex<double>* z, const int ldz,
-                 std::complex<double>* work, const int lwork, double* rwork, int* iwork, int* ifail, int& info)
-    {
-        zheevx_(&jobz, &range, &uplo, &n, 
-                a, &lda, &vl, &vu, &il, &iu, 
-                &abstol, &m, w, z, &ldz, 
-                work, &lwork, rwork, iwork, ifail, &info);
-    }
-
     // wrap function of fortran lapack routine dsyevx.
     static inline
-        void xheevx(const int itype, const char jobz, const char range, const char uplo, const int n,
+    void xheevx(const int itype, const char jobz, const char range, const char uplo, const int n,
             double* a, const int lda,
             const double vl, const double vu, const int il, const int iu, const double abstol,
             const int m, double* w, double* z, const int ldz,
@@ -428,6 +414,98 @@ class LapackConnector
                 &abstol, &m, w, z, &ldz,
                 work, &lwork, rwork, iwork, ifail, &info);
     }
+    
+    // wrap function of fortran lapack routine xhegvx ( pointer version ).
+    static inline
+    void xhegvx( const int itype, const char jobz, const char range, const char uplo,
+                 const int n, std::complex<float>* a, const int lda, std::complex<float>* b,
+                 const int ldb, const float vl, const float vu, const int il, const int iu,
+                 const float abstol, const int m, float* w, std::complex<float>* z, const int ldz,
+                 std::complex<float>* work, const int lwork, float* rwork, int* iwork,
+                 int* ifail, int& info)
+    {
+        chegvx_(&itype, &jobz, &range, &uplo, &n, a, &lda, b, &ldb, &vl,
+                &vu, &il,&iu, &abstol, &m, w, z, &ldz, work, &lwork, rwork, iwork, ifail, &info);
+    }
+
+    // wrap function of fortran lapack routine xhegvx ( pointer version ).
+    static inline
+    void xhegvx( const int itype, const char jobz, const char range, const char uplo,
+                 const int n, std::complex<double>* a, const int lda, std::complex<double>* b,
+                 const int ldb, const double vl, const double vu, const int il, const int iu,
+                 const double abstol, const int m, double* w, std::complex<double>* z, const int ldz,
+                 std::complex<double>* work, const int lwork, double* rwork, int* iwork,
+                 int* ifail, int& info)
+    {
+        zhegvx_(&itype, &jobz, &range, &uplo, &n, a, &lda, b, &ldb, &vl,
+                &vu, &il,&iu, &abstol, &m, w, z, &ldz, work, &lwork, rwork, iwork, ifail, &info);
+    }
+    // wrap function of fortran lapack routine xhegvx ( pointer version ).
+    static inline
+    void xhegvx( const int itype, const char jobz, const char range, const char uplo,
+                 const int n, double* a, const int lda, double* b,
+                 const int ldb, const double vl, const double vu, const int il, const int iu,
+                 const double abstol, const int m, double* w, double* z, const int ldz,
+                 double* work, const int lwork, double* rwork, int* iwork,
+                 int* ifail, int& info)
+    {
+        // dsygvx_(&itype, &jobz, &range, &uplo, &n, a, &lda, b, &ldb, &vl,
+        //         &vu, &il,&iu, &abstol, &m, w, z, &ldz, work, &lwork, rwork, iwork, ifail, &info);
+    }
+
+
+    // wrap function of fortran lapack routine xhegvx ( pointer version ).
+    static inline
+    void xhegv( const int itype, const char jobz, const char uplo,
+                const int n, 
+                double* a, const int lda, 
+                double* b, const int ldb, 
+                double* w, 
+                double* work, int lwork, 
+                double* rwork, int& info)
+    {
+        // TODO
+    }
+
+    // wrap function of fortran lapack routine xhegvx ( pointer version ).
+    static inline
+    void xhegv( const int itype, const char jobz, const char uplo,
+                const int n, 
+                std::complex<float>* a, const int lda, 
+                std::complex<float>* b, const int ldb, 
+                float* w, 
+                std::complex<float>* work, int lwork, 
+                float* rwork, int& info)
+    {
+        // TODO
+    }
+    // wrap function of fortran lapack routine xhegvx ( pointer version ).
+    static inline
+    void xhegv( const int itype, const char jobz, const char uplo,
+                const int n, 
+                std::complex<double>* a, const int lda, 
+                std::complex<double>* b, const int ldb, 
+                double* w, 
+                std::complex<double>* work, int lwork, 
+                double* rwork, int& info)
+    {
+        zhegv_(&itype, &jobz, &uplo, &n, a, &lda, b, &ldb, w, work, &lwork, rwork, &info);
+    }
+
+
+    // wrap function of fortran lapack routine zhegvd.
+    static inline
+    void zhegvd(const int itype, const char jobz, const char uplo, const int n, 
+                std::complex<double>* a, const int lda, 
+                const std::complex<double>* b, const int ldb, double* w, 
+                std::complex<double>* work, int lwork, double* rwork, int lrwork,
+                int* iwork, int liwork, int& info)
+    {
+        zhegvd_(&itype, &jobz, &uplo, &n, 
+                a, &lda, b, &ldb, w,
+                work, &lwork, rwork, &lrwork,
+                iwork, &liwork, &info);
+    }
 
     // wrap function of fortran lapack routine zhegv ( ModuleBase::ComplexMatrix version ).
     static inline
@@ -543,30 +621,6 @@ class LapackConnector
         delete[] zux;
     }
 
-    // wrap function of fortran lapack routine xhegvx ( pointer version ).
-    static inline
-    void xhegvx( const int itype, const char jobz, const char range, const char uplo,
-                 const int n, const std::complex<float>* a, const int lda, const std::complex<float>* b,
-                 const int ldb, const float vl, const float vu, const int il, const int iu,
-                 const float abstol, const int m, float* w, std::complex<float>* z, const int ldz,
-                 std::complex<float>* work, const int lwork, float* rwork, int* iwork,
-                 int* ifail, int& info, int nbase_x)
-    {
-        chegvx(itype, jobz, range, uplo, n, a, lda, b, ldb, vl, vu, il, iu, abstol, m, w, z, ldz, work, lwork, rwork, iwork, ifail, info, nbase_x);
-    }
-
-    // wrap function of fortran lapack routine xhegvx ( pointer version ).
-    static inline
-    void xhegvx( const int itype, const char jobz, const char range, const char uplo,
-                 const int n, const std::complex<double>* a, const int lda, const std::complex<double>* b,
-                 const int ldb, const double vl, const double vu, const int il, const int iu,
-                 const double abstol, const int m, double* w, std::complex<double>* z, const int ldz,
-                 std::complex<double>* work, const int lwork, double* rwork, int* iwork,
-                 int* ifail, int& info, int nbase_x)
-    {
-        zhegvx(itype, jobz, range, uplo, n, a, lda, b, ldb, vl, vu, il, iu, abstol, m, w, z, ldz, work, lwork, rwork, iwork, ifail, info, nbase_x);
-    }
-
 	// calculate the eigenvalues and eigenfunctions of a real symmetric matrix.
     static inline
     void dsygv(	const int itype,const char jobz,const char uplo,const int n,ModuleBase::matrix& a,
diff --git a/source/module_basis/module_ao/ORB_control.cpp b/source/module_basis/module_ao/ORB_control.cpp
index 1435bf8e93..6966ca99cf 100644
--- a/source/module_basis/module_ao/ORB_control.cpp
+++ b/source/module_basis/module_ao/ORB_control.cpp
@@ -203,7 +203,7 @@ void ORB_control::setup_2d_division(std::ofstream& ofs_running,
 
     // determine whether 2d-division or not according to ks_solver
     bool div_2d;
-    if (ks_solver == "lapack" || ks_solver == "cg" || ks_solver == "dav") div_2d = false;
+    if (ks_solver == "lapack" || ks_solver == "cg" || ks_solver == "dav" || ks_solver == "dav_subspace") div_2d = false;
 #ifdef __MPI
     else if (ks_solver == "genelpa" || ks_solver == "scalapack_gvx" || ks_solver == "cusolver" || ks_solver == "cg_in_lcao") div_2d = true;
 #endif
diff --git a/source/module_elecstate/elecstate_print.cpp b/source/module_elecstate/elecstate_print.cpp
index 1a20566aa0..78be828c66 100644
--- a/source/module_elecstate/elecstate_print.cpp
+++ b/source/module_elecstate/elecstate_print.cpp
@@ -289,6 +289,10 @@ void ElecState::print_etot(const bool converged,
     {
         label = "DA";
     }
+    else if (ks_solver_type == "dav_subspace")
+    {
+        label = "DS";
+    }
     else if (ks_solver_type == "scalapack_gvx")
     {
         label = "GV";
diff --git a/source/module_hamilt_pw/hamilt_pwdft/wavefunc.cpp b/source/module_hamilt_pw/hamilt_pwdft/wavefunc.cpp
index ac16819967..c10bb386e4 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/wavefunc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/wavefunc.cpp
@@ -574,7 +574,7 @@ void diago_PAO_in_pw_k2(const psi::DEVICE_GPU *ctx,
             //GlobalC::hm.diagH_subspace(ik ,starting_nw, nbands, wfcatom, wfcatom, etatom.data());
         }
     }
-    else if(GlobalV::KS_SOLVER=="dav")
+    else if (GlobalV::KS_SOLVER == "dav" || GlobalV::KS_SOLVER == "dav_subspace")
     {
         assert(nbands <= wfcatom.nr);
         // replace by haozhihan 2022-11-23
@@ -685,8 +685,8 @@ void diago_PAO_in_pw_k2(const psi::DEVICE_GPU *ctx,
 			//GlobalC::hm.diagH_subspace(ik ,starting_nw, nbands, wfcatom, wfcatom, etatom.data());
 		}
 	}
-	else if(GlobalV::KS_SOLVER=="dav")
-	{
+    else if (GlobalV::KS_SOLVER == "dav" || GlobalV::KS_SOLVER == "dav_subspace")
+    {
 		assert(nbands <= wfcatom.nr);
 		// replace by haozhihan 2022-11-23
         hsolver::matrixSetToAnother<std::complex<double>, psi::DEVICE_GPU>()(
diff --git a/source/module_hsolver/CMakeLists.txt b/source/module_hsolver/CMakeLists.txt
index bfb50f21ea..ec64e49f5f 100644
--- a/source/module_hsolver/CMakeLists.txt
+++ b/source/module_hsolver/CMakeLists.txt
@@ -2,6 +2,7 @@ list(APPEND objects
     diagh_consts.cpp
     diago_cg.cpp
     diago_david.cpp
+    diago_dav_subspace.cpp
     diago_bpcg.cpp
     hsolver_pw.cpp
     hsolver_pw_sdft.cpp
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
new file mode 100644
index 0000000000..5ee48e60bf
--- /dev/null
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -0,0 +1,992 @@
+#include "diago_dav_subspace.h"
+
+#include <algorithm>
+#include <type_traits>
+
+#include "diago_iter_assist.h"
+#include "module_base/blas_connector.h"
+#include "module_base/constants.h"
+#include "module_base/lapack_connector.h"
+#include "module_base/memory.h"
+#include "module_base/parallel_common.h"
+#include "module_base/parallel_reduce.h"
+#include "module_base/timer.h"
+#include "module_hsolver/kernels/dngvd_op.h"
+#include "module_hsolver/kernels/math_kernel_op.h"
+
+using namespace hsolver;
+
+template <typename T, typename Device>
+Diago_DavSubspace<T, Device>::Diago_DavSubspace(const Real* precondition_in)
+{
+    this->device = psi::device::get_device_type<Device>(this->ctx);
+    this->precondition = precondition_in;
+
+    test_david = 2;
+    // 1: check which function is called and which step is executed
+    // 2: check the eigenvalues of the result of each iteration
+    // 3: check the eigenvalues and errors of the last result
+    // default: no check
+
+    this->one = &this->cs.one;
+    this->zero = &this->cs.zero;
+    this->neg_one = &this->cs.neg_one;
+}
+
+template <typename T, typename Device>
+Diago_DavSubspace<T, Device>::~Diago_DavSubspace()
+{
+    delmem_complex_op()(this->ctx, this->hphi);
+    delmem_complex_op()(this->ctx, this->hcc);
+    delmem_complex_op()(this->ctx, this->scc);
+    delmem_complex_op()(this->ctx, this->vcc);
+
+    delmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav);
+
+    if (this->device == psi::GpuDevice)
+    {
+        delmem_real_op()(this->ctx, this->d_precondition);
+    }
+}
+
+template <typename T, typename Device>
+void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
+                                        psi::Psi<T, Device>& psi,
+                                        Real* eigenvalue_in_hsolver,
+                                        std::vector<bool>& is_occupied)
+{
+    if (test_david == 1)
+    {
+        ModuleBase::TITLE("Diago_DavSubspace", "diag_once");
+    }
+    ModuleBase::timer::tick("Diago_DavSubspace", "diag_once");
+
+    assert(Diago_DavSubspace::PW_DIAG_NDIM > 1);
+    assert(Diago_DavSubspace::PW_DIAG_NDIM * psi.get_nbands() < psi.get_current_nbas() * GlobalV::NPROC_IN_POOL);
+
+    // qianrui change it 2021-7-25.
+    // In strictly speaking, it shoule be PW_DIAG_NDIM*nband < npw sum of all pools. We roughly estimate it here.
+    // However, in most cases, total number of plane waves should be much larger than nband * PW_DIAG_NDIM
+
+    this->dim = psi.get_k_first() ? psi.get_current_nbas() : psi.get_nk() * psi.get_nbasis();
+    this->n_band = psi.get_nbands();
+
+    // maximum dimension of the reduced basis set
+    this->nbase_x = 2 * this->n_band;
+
+    psi::Psi<T, Device> basis(1, this->nbase_x, this->dim, &(psi.get_ngk(0)));
+    ModuleBase::Memory::record("DAV::basis", this->nbase_x * this->dim * sizeof(T));
+
+    // the eigenvalues in dav iter
+    resmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav, this->nbase_x, "DAV::eig");
+    setmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav, 0, this->nbase_x);
+
+    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+    // the product of H and psi in the reduced basis set
+    resmem_complex_op()(this->ctx, this->hphi, this->nbase_x * this->dim, "DAV::hphi");
+    setmem_complex_op()(this->ctx, this->hphi, 0, this->nbase_x * this->dim);
+
+    // Hamiltonian on the reduced basis set
+    resmem_complex_op()(this->ctx, this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc");
+    setmem_complex_op()(this->ctx, this->hcc, 0, this->nbase_x * this->nbase_x);
+
+    // Overlap on the reduced basis set
+    resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
+    setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x);
+
+    // Eigenvectors
+    resmem_complex_op()(this->ctx, this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc");
+    setmem_complex_op()(this->ctx, this->vcc, 0, this->nbase_x * this->nbase_x);
+    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+    // convflag[m] = true if the m th band is convergent
+    std::vector<bool> convflag(this->n_band, false);
+
+    // unconv[m] store the number of the m th unconvergent band
+    std::vector<int> unconv(this->n_band);
+
+    // the dimension of the reduced basis set
+    int nbase = 0;
+
+    // the number of the unconvergent bands
+    this->notconv = this->n_band;
+
+    ModuleBase::timer::tick("Diago_DavSubspace", "first");
+
+    for (int m = 0; m < this->n_band; m++)
+    {
+        unconv[m] = m;
+
+        syncmem_complex_op()(this->ctx,
+                             this->ctx,
+                             &basis(m, 0),
+                             psi.get_k_first() ? &psi(m, 0) : &psi(m, 0, 0),
+                             this->dim);
+    }
+
+    // calculate H|psi>
+    hpsi_info dav_hpsi_in(&basis, psi::Range(1, 0, 0, this->n_band - 1), this->hphi);
+    phm_in->ops->hPsi(dav_hpsi_in);
+
+    this->cal_elem(this->dim, 
+                   nbase, 
+                   this->notconv, 
+                   basis, 
+                   this->hphi, 
+                   this->hcc, 
+                   this->scc, 
+                   true);
+
+    this->diag_zhegvx(nbase,
+                      this->n_band,
+                      this->hcc,
+                      this->scc,
+                      this->nbase_x,
+                      this->eigenvalue_in_dav,
+                      this->vcc,
+                      true,
+                      this->is_subspace);
+
+    for (size_t m = 0; m < this->n_band; m++)
+    {
+        eigenvalue_in_hsolver[m] = this->eigenvalue_in_dav[m];
+    }
+
+    ModuleBase::timer::tick("Diago_DavSubspace", "first");
+
+    int dav_iter = 0;
+    do
+    {
+        dav_iter++;
+
+        this->cal_grad(phm_in,
+                       this->dim,
+                       nbase,
+                       this->notconv,
+                       basis,
+                       this->hphi,
+                       this->vcc,
+                       unconv.data(),
+                       this->eigenvalue_in_dav);
+
+        this->cal_elem(this->dim, 
+                       nbase, 
+                       this->notconv, 
+                       basis, 
+                       this->hphi, 
+                       this->hcc, 
+                       this->scc, 
+                       false);
+
+        this->diag_zhegvx(nbase,
+                          this->n_band,
+                          this->hcc,
+                          this->scc,
+                          this->nbase_x,
+                          this->eigenvalue_in_dav,
+                          this->vcc,
+                          false,
+                          false);
+
+        // check convergence and update eigenvalues
+        ModuleBase::timer::tick("Diago_DavSubspace", "check_update");
+
+        this->notconv = 0;
+        for (int m = 0; m < this->n_band; m++)
+        {
+            if (is_occupied[m])
+            {
+                convflag[m] = (std::abs(this->eigenvalue_in_dav[m] - eigenvalue_in_hsolver[m])
+                               < DiagoIterAssist<T, Device>::PW_DIAG_THR);
+            }
+            else
+            {
+                double empty_ethr = std::max(DiagoIterAssist<T, Device>::PW_DIAG_THR * 5.0, 1e-5);
+                convflag[m] = (std::abs(this->eigenvalue_in_dav[m] - eigenvalue_in_hsolver[m]) < empty_ethr);
+            }
+
+            if (!convflag[m])
+            {
+                unconv[this->notconv] = m;
+                this->notconv++;
+            }
+
+            eigenvalue_in_hsolver[m] = this->eigenvalue_in_dav[m];
+        }
+
+        ModuleBase::timer::tick("Diago_DavSubspace", "check_update");
+
+        if ((this->notconv == 0) || 
+            (nbase + this->notconv + 1 > this->nbase_x) || 
+            (dav_iter == DiagoIterAssist<T, Device>::PW_DIAG_NMAX))
+        {
+            ModuleBase::timer::tick("Diago_DavSubspace", "last");
+
+            // updata eigenvectors of Hamiltonian
+            setmem_complex_op()(this->ctx, psi.get_pointer(), 0, n_band * psi.get_nbasis());
+            //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+            // haozhihan repalce 2022-10-18
+            gemm_op<T, Device>()(this->ctx,
+                                 'N',
+                                 'N',
+                                 this->dim,    // m: row of A,C
+                                 this->n_band, // n: col of B,C
+                                 nbase,        // k: col of A, row of B
+                                 this->one,
+                                 basis.get_pointer(), // A dim * nbase
+                                 this->dim,
+                                 this->vcc, // B nbase * n_band
+                                 this->nbase_x,
+                                 this->zero,
+                                 psi.get_pointer(), // C dim * n_band
+                                 psi.get_nbasis());
+
+            if (!this->notconv || (dav_iter == DiagoIterAssist<T, Device>::PW_DIAG_NMAX))
+            {
+                // overall convergence or last iteration: exit the iteration
+
+                ModuleBase::timer::tick("Diago_DavSubspace", "last");
+                break;
+            }
+            else
+            {
+                // if the dimension of the reduced basis set is becoming too large,
+                // then replace the first N (=nband) basis vectors with the current
+                // estimate of the eigenvectors and set the basis dimension to N;
+
+                this->refresh(this->dim,
+                              this->n_band,
+                              nbase,
+                              eigenvalue_in_hsolver,
+                              psi,
+                              basis,
+                              this->hphi,
+                              this->hcc,
+                              this->scc,
+                              this->vcc);
+                ModuleBase::timer::tick("Diago_DavSubspace", "last");
+            }
+        }
+
+    } while (1);
+
+    // std::cout << "dav_iter == " << dav_iter << std::endl;
+
+    DiagoIterAssist<T, Device>::avg_iter += static_cast<double>(dav_iter);
+
+    ModuleBase::timer::tick("Diago_DavSubspace", "diag_once");
+
+    return;
+}
+
+template <typename T, typename Device>
+void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
+                                       const int& dim,
+                                       const int& nbase,
+                                       const int& notconv,
+                                       psi::Psi<T, Device>& basis,
+                                       T* hphi,
+                                       T* vcc,
+                                       const int* unconv,
+                                       Real* eigenvalue_in_dav)
+{
+    ModuleBase::timer::tick("Diago_DavSubspace", "cal_grad");
+
+    for (size_t i = 0; i < notconv; i++)
+    {
+        if (unconv[i] != i)
+        {
+            syncmem_complex_op()(
+                this->ctx, 
+                this->ctx, 
+                vcc + i * this->nbase_x, 
+                vcc + unconv[i] * this->nbase_x, 
+                nbase
+            );
+            this->eigenvalue_in_dav[i] = this->eigenvalue_in_dav[unconv[i]];
+        }
+    }
+
+    gemm_op<T, Device>()(this->ctx,
+                         'N',
+                         'N',
+                         this->dim,        // m: row of A,C
+                         notconv,          // n: col of B,C
+                         nbase,            // k: col of A, row of B
+                         this->one,        // alpha
+                         &basis(0, 0),     // A
+                         this->dim,        // LDA
+                         vcc,              // B
+                         this->nbase_x,    // LDB
+                         this->zero,       // belta
+                         &basis(nbase, 0), // C dim * notconv
+                         this->dim         // LDC
+    );
+
+    for (int m = 0; m < notconv; m++)
+    {
+
+        std::vector<Real> e_temp_cpu(this->dim, (-this->eigenvalue_in_dav[m]));
+
+        vector_mul_vector_op<T, Device>()(this->ctx,
+                                          this->dim,
+                                          &basis(nbase + m, 0),
+                                          &basis(nbase + m, 0),
+                                          e_temp_cpu.data());
+    }
+
+    gemm_op<T, Device>()(this->ctx,
+                         'N',
+                         'N',
+                         this->dim,        // m: row of A,C
+                         notconv,          // n: col of B,C
+                         nbase,            // k: col of A, row of B
+                         this->one,        // alpha
+                         hphi,             // A dim * nbase
+                         this->dim,        // LDA
+                         vcc,              // B nbase * notconv
+                         this->nbase_x,    // LDB
+                         this->one,        // belta
+                         &basis(nbase, 0), // C dim * notconv
+                         this->dim         // LDC
+    );
+
+    // "precondition!!!"
+    std::vector<Real> pre(this->dim, 0.0);
+    for (int m = 0; m < notconv; m++)
+    {
+        for (size_t i = 0; i < this->dim; i++)
+        {
+            double x = this->precondition[i] - this->eigenvalue_in_dav[m];
+            pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+        }
+        vector_div_vector_op<T, Device>()(
+            this->ctx, 
+            this->dim, 
+            &basis(nbase + m, 0), 
+            &basis(nbase + m, 0), 
+            pre.data()
+        );
+    }
+
+    // "normalize!!!" in order to improve numerical stability of subspace diagonalization
+    std::vector<Real> psi_norm(notconv, 0.0);
+    for (size_t i = 0; i < notconv; i++)
+    {
+        psi_norm[i] = dot_real_op<T, Device>()(this->ctx, this->dim, &basis(nbase + i, 0), &basis(nbase + i, 0), false);
+        assert(psi_norm[i] > 0.0);
+        psi_norm[i] = sqrt(psi_norm[i]);
+
+        vector_div_constant_op<T, Device>()(this->ctx,
+                                            this->dim,
+                                            &basis(nbase + i, 0),
+                                            &basis(nbase + i, 0),
+                                            psi_norm[i]);
+    }
+
+    // "calculate H|psi>" for not convergence bands
+    hpsi_info dav_hpsi_in(&basis, psi::Range(1, 0, nbase, nbase + notconv - 1), &hphi[nbase * this->dim]);
+    phm_in->ops->hPsi(dav_hpsi_in);
+
+    ModuleBase::timer::tick("Diago_DavSubspace", "cal_grad");
+    return;
+}
+
+template <typename T, typename Device>
+void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
+                                       int& nbase,
+                                       const int& notconv,
+                                       const psi::Psi<T, Device>& basis,
+                                       const T* hphi,
+                                       T* hcc,
+                                       T* scc,
+                                       bool init)
+{
+    ModuleBase::timer::tick("Diago_DavSubspace", "cal_elem");
+
+    if (init)
+    {
+        assert(nbase == 0);
+        assert(this->n_band == notconv);
+        gemm_op<T, Device>()(this->ctx,
+                             'C',
+                             'N',
+                             notconv,
+                             notconv,
+                             this->dim,
+                             this->one,
+                             &basis(0, 0),
+                             this->dim,
+                             hphi,
+                             this->dim,
+                             this->zero,
+                             hcc,
+                             this->nbase_x);
+
+        gemm_op<T, Device>()(this->ctx,
+                             'C',
+                             'N',
+                             notconv,
+                             notconv,
+                             this->dim,
+                             this->one,
+                             &basis(0, 0),
+                             this->dim,
+                             &basis(0, 0),
+                             this->dim,
+                             this->zero,
+                             scc,
+                             this->nbase_x);
+    }
+    else
+    {
+        gemm_op<T, Device>()(this->ctx,
+                             'C',
+                             'N',
+                             notconv,
+                             nbase + notconv,
+                             this->dim,
+                             this->one,
+                             &hphi[nbase * this->dim],
+                             this->dim,
+                             &basis(0, 0),
+                             this->dim,
+                             this->zero,
+                             hcc + nbase,
+                             this->nbase_x);
+
+        gemm_op<T, Device>()(this->ctx,
+                             'C',
+                             'N',
+                             notconv,
+                             nbase + notconv,
+                             this->dim,
+                             this->one,
+                             &basis(nbase, 0),
+                             this->dim,
+                             &basis(0, 0),
+                             this->dim,
+                             this->zero,
+                             scc + nbase,
+                             this->nbase_x);
+    }
+
+#ifdef __MPI
+    if (GlobalV::NPROC_IN_POOL > 1)
+    {
+        matrixTranspose_op<T, Device>()(this->ctx, this->nbase_x, this->nbase_x, hcc, hcc);
+        matrixTranspose_op<T, Device>()(this->ctx, this->nbase_x, this->nbase_x, scc, scc);
+
+        auto* swap = new T[notconv * this->nbase_x];
+        syncmem_complex_op()(this->ctx, this->ctx, swap, hcc + nbase * this->nbase_x, notconv * this->nbase_x);
+
+        if (std::is_same<T, double>::value)
+        {
+            Parallel_Reduce::reduce_pool(hcc + nbase * this->nbase_x, notconv * this->nbase_x);
+            Parallel_Reduce::reduce_pool(scc + nbase * this->nbase_x, notconv * this->nbase_x);
+        }
+        else
+        {
+            if (psi::device::get_current_precision(swap) == "single")
+            {
+                MPI_Reduce(swap,
+                           hcc + nbase * this->nbase_x,
+                           notconv * this->nbase_x,
+                           MPI_COMPLEX,
+                           MPI_SUM,
+                           0,
+                           POOL_WORLD);
+            }
+            else
+            {
+                MPI_Reduce(swap,
+                           hcc + nbase * this->nbase_x,
+                           notconv * this->nbase_x,
+                           MPI_DOUBLE_COMPLEX,
+                           MPI_SUM,
+                           0,
+                           POOL_WORLD);
+            }
+
+            syncmem_complex_op()(this->ctx, this->ctx, swap, scc + nbase * this->nbase_x, notconv * this->nbase_x);
+
+            if (psi::device::get_current_precision(swap) == "single")
+            {
+                MPI_Reduce(swap,
+                           scc + nbase * this->nbase_x,
+                           notconv * this->nbase_x,
+                           MPI_COMPLEX,
+                           MPI_SUM,
+                           0,
+                           POOL_WORLD);
+            }
+            else
+            {
+                MPI_Reduce(swap,
+                           scc + nbase * this->nbase_x,
+                           notconv * this->nbase_x,
+                           MPI_DOUBLE_COMPLEX,
+                           MPI_SUM,
+                           0,
+                           POOL_WORLD);
+            }
+        }
+        delete[] swap;
+
+        matrixTranspose_op<T, Device>()(this->ctx, this->nbase_x, this->nbase_x, hcc, hcc);
+        matrixTranspose_op<T, Device>()(this->ctx, this->nbase_x, this->nbase_x, scc, scc);
+    }
+#endif
+
+    nbase += notconv;
+    int nb1 = nbase - notconv;
+    // reset:
+    if (init)
+    {
+        for (size_t i = 0; i < nbase; i++)
+        {
+
+            hcc[i * this->nbase_x + i] = set_real_tocomplex(hcc[i * this->nbase_x + i]);
+            scc[i * this->nbase_x + i] = set_real_tocomplex(scc[i * this->nbase_x + i]);
+
+            for (size_t j = i + 1; j < nbase; j++)
+            {
+                hcc[j * this->nbase_x + i] = get_conj(hcc[i * this->nbase_x + j]);
+                scc[j * this->nbase_x + i] = get_conj(scc[i * this->nbase_x + j]);
+            }
+        }
+        for (size_t i = nbase; i < this->nbase_x; i++)
+        {
+            for (size_t j = nbase; j < this->nbase_x; j++)
+            {
+                hcc[i * this->nbase_x + j] = cs.zero;
+                scc[i * this->nbase_x + j] = cs.zero;
+                hcc[j * this->nbase_x + i] = cs.zero;
+                scc[j * this->nbase_x + i] = cs.zero;
+            }
+        }
+    }
+    else
+    {
+        for (size_t i = 0; i < nbase; i++)
+        {
+            if (i >= nb1)
+            {
+                hcc[i * this->nbase_x + i] = set_real_tocomplex(hcc[i * this->nbase_x + i]);
+                scc[i * this->nbase_x + i] = set_real_tocomplex(scc[i * this->nbase_x + i]);
+            }
+            for (size_t j = std::max(i + 1, (size_t)nb1); j < nbase; j++)
+            {
+                hcc[j * this->nbase_x + i] = get_conj(hcc[i * this->nbase_x + j]);
+                scc[j * this->nbase_x + i] = get_conj(scc[i * this->nbase_x + j]);
+            }
+        }
+        for (size_t i = nbase; i < this->nbase_x; i++)
+        {
+            for (size_t j = nbase; j < this->nbase_x; j++)
+            {
+                hcc[i * this->nbase_x + j] = cs.zero;
+                scc[i * this->nbase_x + j] = cs.zero;
+                hcc[j * this->nbase_x + i] = cs.zero;
+                scc[j * this->nbase_x + i] = cs.zero;
+            }
+        }
+    }
+
+    ModuleBase::timer::tick("Diago_DavSubspace", "cal_elem");
+    return;
+}
+
+template <typename T, typename Device>
+void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
+                                          const int& nband,
+                                          T* hcc,
+                                          T* scc,
+                                          const int& nbase_x,
+                                          Real* eigenvalue_in_dav,
+                                          T* vcc,
+                                          bool init,
+                                          bool is_subspace)
+{
+    ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx");
+
+    if (is_subspace == false)
+    {
+        if (GlobalV::RANK_IN_POOL == 0)
+        {
+            assert(nbase_x >= std::max(1, nbase));
+
+            std::vector<std::vector<T>> h_diag(nbase, std::vector<T>(nbase, cs.zero));
+            std::vector<std::vector<T>> s_diag(nbase, std::vector<T>(nbase, cs.zero));
+
+            for (size_t i = 0; i < nbase; i++)
+            {
+                for (size_t j = 0; j < nbase; j++)
+                {
+                    h_diag[i][j] = hcc[i * this->nbase_x + j];
+                    s_diag[i][j] = scc[i * this->nbase_x + j];
+                }
+            }
+
+            if (this->device == psi::GpuDevice)
+            {
+#if defined(__CUDA) || defined(__ROCM)
+                Real* eigenvalue_gpu = nullptr;
+                resmem_real_op()(this->ctx, eigenvalue_gpu, this->nbase_x);
+                
+                syncmem_var_h2d_op()(
+                    this->ctx, 
+                    this->cpu_ctx, 
+                    eigenvalue_gpu, 
+                    this->eigenvalue_in_dav, 
+                    this->nbase_x
+                );
+
+                dnevx_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->hcc, nband, eigenvalue_gpu, this->vcc);
+
+                syncmem_var_d2h_op()(
+                    this->cpu_ctx, 
+                    this->ctx, 
+                    this->eigenvalue_in_dav, 
+                    eigenvalue_gpu, 
+                    this->nbase_x
+                );
+                
+                delmem_real_op()(this->ctx, eigenvalue_gpu);
+#endif
+            }
+            else
+            {
+                if (init)
+                {
+                    dnevx_op<T, Device>()(this->ctx,
+                                          nbase,
+                                          this->nbase_x,
+                                          this->hcc,
+                                          nband,
+                                          this->eigenvalue_in_dav,
+                                          this->vcc);
+                }
+                else
+                {
+
+                    dngvx_op<T, Device>()(this->ctx,
+                                          nbase,
+                                          this->nbase_x,
+                                          this->hcc,
+                                          this->scc,
+                                          nband,
+                                          this->eigenvalue_in_dav,
+                                          this->vcc);
+                }
+            }
+
+            // reset:
+            for (size_t i = 0; i < nbase; i++)
+            {
+                for (size_t j = 0; j < nbase; j++)
+                {
+                    hcc[i * this->nbase_x + j] = h_diag[i][j];
+                    scc[i * this->nbase_x + j] = s_diag[i][j];
+                }
+
+                for (size_t j = nbase; j < this->nbase_x; j++)
+                {
+                    hcc[i * this->nbase_x + j] = cs.zero;
+                    hcc[j * this->nbase_x + i] = cs.zero;
+                    scc[i * this->nbase_x + j] = cs.zero;
+                    scc[j * this->nbase_x + i] = cs.zero;
+                }
+            }
+        }
+
+#ifdef __MPI
+        if (GlobalV::NPROC_IN_POOL > 1)
+        {
+            // vcc: nbase * nband
+            for (int i = 0; i < nband; i++)
+            {
+                MPI_Bcast(&vcc[i * this->nbase_x], nbase, MPI_DOUBLE_COMPLEX, 0, POOL_WORLD);
+            }
+            MPI_Bcast(this->eigenvalue_in_dav, nband, MPI_DOUBLE, 0, POOL_WORLD);
+        }
+#endif
+
+    }
+    else if (is_subspace == true)
+    {
+        for (size_t m = 0; m < nband; m++)
+        {
+            this->eigenvalue_in_dav[m] = get_real(hcc[m * this->nbase_x + m]);
+
+            vcc[m * this->nbase_x + m] = set_real_tocomplex(1.0);
+        }
+        
+#ifdef __MPI
+        if (GlobalV::NPROC_IN_POOL > 1)
+        {
+            MPI_Bcast(this->eigenvalue_in_dav, this->n_band, MPI_DOUBLE, 0, POOL_WORLD);
+        }
+#endif
+
+    }
+
+    ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx");
+    return;
+}
+
+template <typename T, typename Device>
+void Diago_DavSubspace<T, Device>::refresh(const int& dim,
+                                      const int& nband,
+                                      int& nbase,
+                                      const Real* eigenvalue_in_hsolver,
+                                      const psi::Psi<T, Device>& psi,
+                                      psi::Psi<T, Device>& basis,
+                                      T* hp,
+                                      T* sp,
+                                      T* hc,
+                                      T* vc)
+{
+    ModuleBase::timer::tick("Diago_DavSubspace", "refresh");
+
+    // update basis
+    for (size_t i = 0; i < nband; i++)
+    {
+        syncmem_complex_op()(this->ctx, this->ctx, &basis(i, 0), &psi(i, 0), this->dim);
+    }
+    gemm_op<T, Device>()(this->ctx,
+                         'N',
+                         'N',
+                         this->dim,
+                         nband,
+                         nbase,
+                         this->one,
+                         this->hphi,
+                         this->dim,
+                         this->vcc,
+                         this->nbase_x,
+                         this->zero,
+                         &basis(nband, 0),
+                         this->dim);
+
+    // update hphi
+    syncmem_complex_op()(this->ctx, this->ctx, hphi, &basis(nband, 0), this->dim * nband);
+
+    nbase = nband;
+
+    // set hcc/scc/vcc to 0
+    for (size_t i = 0; i < nbase; i++)
+    {
+        setmem_complex_op()(this->ctx, &hcc[this->nbase_x * i], 0, nbase);
+        setmem_complex_op()(this->ctx, &scc[this->nbase_x * i], 0, nbase);
+        setmem_complex_op()(this->ctx, &vcc[this->nbase_x * i], 0, nbase);
+    }
+
+    if (this->device == psi::GpuDevice)
+    {
+#if defined(__CUDA) || defined(__ROCM)
+        T* hcc_cpu = nullptr;
+        T* scc_cpu = nullptr;
+        T* vcc_cpu = nullptr;
+        psi::memory::resize_memory_op<T, psi::DEVICE_CPU>()(this->cpu_ctx,
+                                                            hcc_cpu,
+                                                            this->nbase_x * this->nbase_x,
+                                                            "DAV::hcc");
+        psi::memory::resize_memory_op<T, psi::DEVICE_CPU>()(this->cpu_ctx,
+                                                            scc_cpu,
+                                                            this->nbase_x * this->nbase_x,
+                                                            "DAV::scc");
+        psi::memory::resize_memory_op<T, psi::DEVICE_CPU>()(this->cpu_ctx,
+                                                            vcc_cpu,
+                                                            this->nbase_x * this->nbase_x,
+                                                            "DAV::vcc");
+
+        syncmem_d2h_op()(this->cpu_ctx, this->ctx, hcc_cpu, hcc, this->nbase_x * this->nbase_x);
+        syncmem_d2h_op()(this->cpu_ctx, this->ctx, scc_cpu, scc, this->nbase_x * this->nbase_x);
+        syncmem_d2h_op()(this->cpu_ctx, this->ctx, vcc_cpu, vcc, this->nbase_x * this->nbase_x);
+
+        for (int i = 0; i < nbase; i++)
+        {
+            hcc_cpu[i * this->nbase_x + i] = eigenvalue_in_hsolver[i];
+            scc_cpu[i * this->nbase_x + i] = this->one[0];
+            vcc_cpu[i * this->nbase_x + i] = this->one[0];
+        }
+
+        syncmem_h2d_op()(this->ctx, this->cpu_ctx, hcc, hcc_cpu, this->nbase_x * this->nbase_x);
+        syncmem_h2d_op()(this->ctx, this->cpu_ctx, scc, scc_cpu, this->nbase_x * this->nbase_x);
+        syncmem_h2d_op()(this->ctx, this->cpu_ctx, vcc, vcc_cpu, this->nbase_x * this->nbase_x);
+
+        psi::memory::delete_memory_op<T, psi::DEVICE_CPU>()(this->cpu_ctx, hcc_cpu);
+        psi::memory::delete_memory_op<T, psi::DEVICE_CPU>()(this->cpu_ctx, scc_cpu);
+        psi::memory::delete_memory_op<T, psi::DEVICE_CPU>()(this->cpu_ctx, vcc_cpu);
+#endif
+    }
+    else
+    {
+        for (int i = 0; i < nbase; i++)
+        {
+            hcc[i * this->nbase_x + i] = eigenvalue_in_hsolver[i];
+            scc[i * this->nbase_x + i] = this->one[0];
+            vcc[i * this->nbase_x + i] = this->one[0];
+        }
+    }
+    ModuleBase::timer::tick("Diago_DavSubspace", "refresh");
+
+    return;
+}
+
+template <typename T, typename Device>
+void Diago_DavSubspace<T, Device>::diag(hamilt::Hamilt<T, Device>* phm_in,
+                                   psi::Psi<T, Device>& psi,
+                                   Real* eigenvalue_in_hsolver,
+                                   std::vector<bool>& is_occupied)
+{
+    /// record the times of trying iterative diagonalization
+    int ntry = 0;
+    this->notconv = 0;
+
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == psi::GpuDevice)
+    {
+        resmem_real_op()(this->ctx, this->d_precondition, psi.get_nbasis());
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition, psi.get_nbasis());
+    }
+#endif
+
+    if (DiagoIterAssist<T, Device>::SCF_ITER == 1)
+    {
+        // std::cout << "diagH_subspace" << std::endl;
+        DiagoIterAssist<T, Device>::diagH_subspace(phm_in, psi, psi, eigenvalue_in_hsolver, psi.get_nbands());
+
+        this->is_subspace = true;
+    }
+    else
+    {
+        this->is_subspace = false;
+    }
+
+    bool outputscc = false;
+    bool outputeigenvalue = false;
+
+    if (outputscc)
+    {
+        this->nbase_x = 2 * psi.get_nbands();
+        resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
+        setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x);
+
+        std::cout << "before dav 111" << std::endl;
+        gemm_op<T, Device>()(this->ctx,
+                             'C',
+                             'N',
+                             psi.get_nbands(),
+                             psi.get_nbands(),
+                             psi.get_current_nbas(),
+                             this->one,
+                             &psi(0, 0),
+                             psi.get_current_nbas(),
+                             &psi(0, 0),
+                             psi.get_current_nbas(),
+                             this->zero,
+                             this->scc,
+                             this->nbase_x);
+        // output scc
+        for (size_t i = 0; i < psi.get_nbands(); i++)
+        {
+            for (size_t j = 0; j < psi.get_nbands(); j++)
+            {
+                std::cout << this->scc[i * this->nbase_x + j] << "\t";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+    }
+
+    if (outputeigenvalue)
+    {
+        // output: eigenvalue_in_hsolver
+        std::cout << "before dav, output eigenvalue_in_hsolver" << std::endl;
+        for (size_t i = 0; i < psi.get_nbands(); i++)
+        {
+            std::cout << eigenvalue_in_hsolver[i] << "\t";
+        }
+        std::cout << std::endl;
+    }
+
+    do
+    {
+
+        this->diag_once(phm_in, psi, eigenvalue_in_hsolver, is_occupied);
+
+        ++ntry;
+
+    } while (DiagoIterAssist<T, Device>::test_exit_cond(ntry, this->notconv));
+
+    if (notconv > std::max(5, psi.get_nbands() / 4))
+    {
+        std::cout << "\n notconv = " << this->notconv;
+        std::cout << "\n Diago_DavSubspace::diag', too many bands are not converged! \n";
+    }
+
+    if (outputeigenvalue)
+    {
+        // output: eigenvalue_in_hsolver
+        std::cout << "after dav, output eigenvalue_in_hsolver" << std::endl;
+        for (size_t i = 0; i < psi.get_nbands(); i++)
+        {
+            std::cout << eigenvalue_in_hsolver[i] << "\t";
+        }
+        std::cout << std::endl;
+    }
+
+    if (outputscc)
+    {
+        std::cout << "after dav 222 " << std::endl;
+        gemm_op<T, Device>()(this->ctx,
+                             'C',
+                             'N',
+                             psi.get_nbands(),
+                             psi.get_nbands(),
+                             psi.get_current_nbas(),
+                             this->one,
+                             &psi(0, 0),
+                             psi.get_current_nbas(),
+                             &psi(0, 0),
+                             psi.get_current_nbas(),
+                             this->zero,
+                             this->scc,
+                             this->nbase_x);
+        // output scc
+        for (size_t i = 0; i < psi.get_nbands(); i++)
+        {
+            for (size_t j = 0; j < psi.get_nbands(); j++)
+            {
+                std::cout << this->scc[i * this->nbase_x + j] << "\t";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+    }
+
+    return;
+}
+
+namespace hsolver
+{
+
+template class Diago_DavSubspace<std::complex<float>, psi::DEVICE_CPU>;
+template class Diago_DavSubspace<std::complex<double>, psi::DEVICE_CPU>;
+
+#if ((defined __CUDA) || (defined __ROCM))
+template class Diago_DavSubspace<std::complex<float>, psi::DEVICE_GPU>;
+template class Diago_DavSubspace<std::complex<double>, psi::DEVICE_GPU>;
+#endif
+
+#ifdef __LCAO
+template class Diago_DavSubspace<double, psi::DEVICE_CPU>;
+
+#if ((defined __CUDA) || (defined __ROCM))
+template class Diago_DavSubspace<double, psi::DEVICE_GPU>;
+#endif
+
+#endif
+} // namespace hsolver
\ No newline at end of file
diff --git a/source/module_hsolver/diago_dav_subspace.h b/source/module_hsolver/diago_dav_subspace.h
new file mode 100644
index 0000000000..e55a536608
--- /dev/null
+++ b/source/module_hsolver/diago_dav_subspace.h
@@ -0,0 +1,143 @@
+#ifndef DIAGO_NEW_DAV_H
+#define DIAGO_NEW_DAV_H
+
+#include "diagh.h"
+#include "module_base/complexmatrix.h"
+#include "module_base/macros.h"
+#include "module_hamilt_pw/hamilt_pwdft/structure_factor.h"
+#include "module_psi/kernels/device.h"
+
+namespace hsolver
+{
+
+template <typename T = std::complex<double>, typename Device = psi::DEVICE_CPU>
+class Diago_DavSubspace : public DiagH<T, Device>
+{
+  private:
+    // Note GetTypeReal<T>::type will
+    // return T if T is real type(float, double),
+    // otherwise return the real type of T(complex<float>, complex<double>)
+    using Real = typename GetTypeReal<T>::type;
+
+  public:
+    Diago_DavSubspace(const Real* precondition_in);
+    ~Diago_DavSubspace();
+
+    // this is the override function diag() for CG method
+    void diag(hamilt::Hamilt<T, Device>* phm_in,
+              psi::Psi<T, Device>& phi,
+              Real* eigenvalue_in,
+              std::vector<bool>& is_occupied);
+
+    static int PW_DIAG_NDIM;
+
+  private:
+    bool is_subspace = false;
+
+    int test_david = 0;
+
+    /// record for how many bands not have convergence eigenvalues
+    int notconv = 0;
+
+    /// row size for input psi matrix
+    int n_band = 0;
+    /// non-zero col size for inputted psi matrix
+    int dim = 0;
+    // maximum dimension of the reduced basis set
+    int nbase_x = 0;
+    /// precondition for cg diag
+    const Real* precondition = nullptr;
+    Real* d_precondition = nullptr;
+
+    /// eigenvalue results
+    Real* eigenvalue_in_dav = nullptr;
+
+    T* hphi = nullptr; // the product of H and psi in the reduced basis set
+
+    T* hcc = nullptr; // Hamiltonian on the reduced basis
+
+    T* scc = nullptr; // Overlap on the reduced basis
+
+    T* vcc = nullptr; // Eigenvectors on the reduced basis
+
+    /// device type of psi
+    Device* ctx = {};
+    psi::DEVICE_CPU* cpu_ctx = {};
+    psi::AbacusDevice_t device = {};
+
+    void cal_grad(hamilt::Hamilt<T, Device>* phm_in,
+                  const int& dim,
+                  const int& nbase,
+                  const int& notconv,
+                  psi::Psi<T, Device>& basis,
+                  T* hphi,
+                  T* vcc,
+                  const int* unconv,
+                  Real* eigenvalue);
+
+    void cal_elem(const int& dim,
+                  int& nbase,
+                  const int& notconv,
+                  const psi::Psi<T, Device>& basis,
+                  const T* hphi,
+                  T* hcc,
+                  T* scc,
+                  bool init);
+
+    void refresh(const int& dim,
+                 const int& nband,
+                 int& nbase,
+                 const Real* eigenvalue,
+                 const psi::Psi<T, Device>& psi,
+                 psi::Psi<T, Device>& basis,
+                 T* hphi,
+                 T* hcc,
+                 T* scc,
+                 T* vcc);
+
+    void diag_zhegvx(const int& nbase,
+                     const int& nband,
+                     T* hcc,
+                     T* scc,
+                     const int& nbase_x,
+                     Real* eigenvalue,
+                     T* vcc,
+                     bool init,
+                     bool is_subspace);
+
+    void diag_once(hamilt::Hamilt<T, Device>* phm_in,
+                   psi::Psi<T, Device>& psi,
+                   Real* eigenvalue_in,
+                   std::vector<bool>& is_occupied);
+
+    using resmem_complex_op = psi::memory::resize_memory_op<T, Device>;
+    using delmem_complex_op = psi::memory::delete_memory_op<T, Device>;
+    using setmem_complex_op = psi::memory::set_memory_op<T, Device>;
+
+    using resmem_real_op = psi::memory::resize_memory_op<Real, Device>;
+    using delmem_real_op = psi::memory::delete_memory_op<Real, Device>;
+    using setmem_real_op = psi::memory::set_memory_op<Real, Device>;
+
+    using resmem_real_h_op = psi::memory::resize_memory_op<Real, psi::DEVICE_CPU>;
+    using delmem_real_h_op = psi::memory::delete_memory_op<Real, psi::DEVICE_CPU>;
+    using setmem_real_h_op = psi::memory::set_memory_op<Real, psi::DEVICE_CPU>;
+
+    using syncmem_var_h2d_op = psi::memory::synchronize_memory_op<Real, Device, psi::DEVICE_CPU>;
+    using syncmem_var_d2h_op = psi::memory::synchronize_memory_op<Real, psi::DEVICE_CPU, Device>;
+    using syncmem_complex_op = psi::memory::synchronize_memory_op<T, Device, Device>;
+    using castmem_complex_op = psi::memory::cast_memory_op<std::complex<double>, T, Device, Device>;
+    using syncmem_h2d_op = psi::memory::synchronize_memory_op<T, Device, psi::DEVICE_CPU>;
+    using syncmem_d2h_op = psi::memory::synchronize_memory_op<T, psi::DEVICE_CPU, Device>;
+
+    using hpsi_info = typename hamilt::Operator<T, Device>::hpsi_info;
+
+    consts<T> cs;
+    const T *one = nullptr, *zero = nullptr, *neg_one = nullptr;
+};
+
+template <typename Real, typename Device>
+int Diago_DavSubspace<Real, Device>::PW_DIAG_NDIM = 4;
+
+} // namespace hsolver
+
+#endif
\ No newline at end of file
diff --git a/source/module_hsolver/diago_david.cpp b/source/module_hsolver/diago_david.cpp
index 926f8e4e15..c3865724a3 100644
--- a/source/module_hsolver/diago_david.cpp
+++ b/source/module_hsolver/diago_david.cpp
@@ -17,19 +17,6 @@
 
 using namespace hsolver;
 
-inline double get_real(const double& x)
-{
-    return x;
-}
-inline double get_real(const std::complex<double>& x)
-{
-    return x.real();
-}
-inline float get_real(const std::complex<float>& x)
-{
-    return x.real();
-}
-
 template <typename T, typename Device> DiagoDavid<T, Device>::DiagoDavid(const Real* precondition_in)
 {
     this->device = psi::device::get_device_type<Device>(this->ctx);
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index d1502d1106..759306c4ee 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -5,17 +5,19 @@
 #include "diago_bpcg.h"
 #include "diago_cg.h"
 #include "diago_david.h"
+#include "diago_dav_subspace.h"
 #include "module_base/timer.h"
 #include "module_base/tool_quit.h"
 #include "module_elecstate/elecstate_pw.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_hamilt_pw/hamilt_pwdft/hamilt_pw.h"
 #include "module_hamilt_pw/hamilt_pwdft/wavefunc.h"
 #include "module_hsolver/diago_iter_assist.h"
-#include "module_hamilt_pw/hamilt_pwdft/global.h"
 #ifdef USE_PAW
 #include "module_cell/module_paw/paw_cell.h"
 #endif
-namespace hsolver {
+namespace hsolver
+{
 
 template <typename T, typename Device>
 HSolverPW<T, Device>::HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in, wavefunc* pwf_in)
@@ -26,17 +28,8 @@ HSolverPW<T, Device>::HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in, wavefunc* pw
     this->diag_ethr = GlobalV::PW_DIAG_THR;
     /*this->init(pbas_in);*/
 }
-/*void HSolverPW::init(const PW_Basis* pbas_in)
-{
-    this->pbas = pbas_in;
-    return;
-}
 
-void HSolverPW::update()
-{
-    return;
-}*/
-template<typename T, typename Device>
+template <typename T, typename Device>
 void HSolverPW<T, Device>::initDiagh(const psi::Psi<T, Device>& psi)
 {
     if (this->method == "cg")
@@ -47,7 +40,8 @@ void HSolverPW<T, Device>::initDiagh(const psi::Psi<T, Device>& psi)
             {
                 delete reinterpret_cast<DiagoCG<T, Device>*>(this->pdiagh);
             }
-            else {
+            else
+            {
                 return;
             }
         }
@@ -55,36 +49,34 @@ void HSolverPW<T, Device>::initDiagh(const psi::Psi<T, Device>& psi)
         // warp the subspace_func into a lambda function
         auto ngk_pointer = psi.get_ngk_pointer();
         auto subspace_func = [this, ngk_pointer](const ct::Tensor& psi_in, ct::Tensor& psi_out) {
-            // psi_in should be a 2D tensor: 
+            // psi_in should be a 2D tensor:
             // psi_in.shape() = [nbands, nbasis]
             const auto ndim = psi_in.shape().ndim();
             REQUIRES_OK(ndim == 2, "dims of psi_in should be less than or equal to 2");
             // Convert a Tensor object to a psi::Psi object
-            auto psi_in_wrapper = psi::Psi<T, Device>(
-                psi_in.data<T>(), 1, 
-                psi_in.shape().dim_size(0), 
-                psi_in.shape().dim_size(1),
-                ngk_pointer);
-            auto psi_out_wrapper = psi::Psi<T, Device>(
-                psi_out.data<T>(), 1, 
-                psi_out.shape().dim_size(0), 
-                psi_out.shape().dim_size(1),
-                ngk_pointer);
-            auto eigen = ct::Tensor(
-                ct::DataTypeToEnum<Real>::value,
-                ct::DeviceType::CpuDevice, 
-                ct::TensorShape({psi_in.shape().dim_size(0)}));
-            
+            auto psi_in_wrapper = psi::Psi<T, Device>(psi_in.data<T>(),
+                                                      1,
+                                                      psi_in.shape().dim_size(0),
+                                                      psi_in.shape().dim_size(1),
+                                                      ngk_pointer);
+            auto psi_out_wrapper = psi::Psi<T, Device>(psi_out.data<T>(),
+                                                       1,
+                                                       psi_out.shape().dim_size(0),
+                                                       psi_out.shape().dim_size(1),
+                                                       ngk_pointer);
+            auto eigen = ct::Tensor(ct::DataTypeToEnum<Real>::value,
+                                    ct::DeviceType::CpuDevice,
+                                    ct::TensorShape({psi_in.shape().dim_size(0)}));
+
             DiagoIterAssist<T, Device>::diagH_subspace(hamilt_, psi_in_wrapper, psi_out_wrapper, eigen.data<Real>());
         };
-        this->pdiagh = new DiagoCG<T, Device>(
-            GlobalV::BASIS_TYPE,
-            GlobalV::CALCULATION,
-            DiagoIterAssist<T, Device>::need_subspace,
-            subspace_func,
-            DiagoIterAssist<T, Device>::PW_DIAG_THR,
-            DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
-            GlobalV::NPROC_IN_POOL);
+        this->pdiagh = new DiagoCG<T, Device>(GlobalV::BASIS_TYPE,
+                                              GlobalV::CALCULATION,
+                                              DiagoIterAssist<T, Device>::need_subspace,
+                                              subspace_func,
+                                              DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                              DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
+                                              GlobalV::NPROC_IN_POOL);
         this->pdiagh->method = this->method;
     }
     else if (this->method == "dav")
@@ -101,20 +93,42 @@ void HSolverPW<T, Device>::initDiagh(const psi::Psi<T, Device>& psi)
         }
         else
         {
-            this->pdiagh = new DiagoDavid<T, Device>( precondition.data());
+            this->pdiagh = new DiagoDavid<T, Device>(precondition.data());
+            this->pdiagh->method = this->method;
+        }
+    }
+    else if (this->method == "dav_subspace")
+    {
+        Diago_DavSubspace<T>::PW_DIAG_NDIM = GlobalV::PW_DIAG_NDIM;
+        if (this->pdiagh != nullptr)
+        {
+            if (this->pdiagh->method != this->method)
+            {
+                delete (Diago_DavSubspace<T, Device>*)this->pdiagh;
+                this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data());
+                this->pdiagh->method = this->method;
+            }
+        }
+        else
+        {
+            this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data());
             this->pdiagh->method = this->method;
         }
     }
-    else if (this->method == "bpcg") {
-        if(this->pdiagh!=nullptr) {
-            if(this->pdiagh->method != this->method) {
+    else if (this->method == "bpcg")
+    {
+        if (this->pdiagh != nullptr)
+        {
+            if (this->pdiagh->method != this->method)
+            {
                 delete (DiagoBPCG<T, Device>*)this->pdiagh;
                 this->pdiagh = new DiagoBPCG<T, Device>(precondition.data());
                 this->pdiagh->method = this->method;
                 reinterpret_cast<DiagoBPCG<T, Device>*>(this->pdiagh)->init_iter(psi);
             }
         }
-        else {
+        else
+        {
             this->pdiagh = new DiagoBPCG<T, Device>(precondition.data());
             this->pdiagh->method = this->method;
             reinterpret_cast<DiagoBPCG<T, Device>*>(this->pdiagh)->init_iter(psi);
@@ -128,10 +142,10 @@ void HSolverPW<T, Device>::initDiagh(const psi::Psi<T, Device>& psi)
 
 template <typename T, typename Device>
 void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
-                                      psi::Psi<T, Device>& psi,
-                                      elecstate::ElecState* pes,
-                                      const std::string method_in,
-                                      const bool skip_charge)
+                                 psi::Psi<T, Device>& psi,
+                                 elecstate::ElecState* pes,
+                                 const std::string method_in,
+                                 const bool skip_charge)
 {
     ModuleBase::TITLE("HSolverPW", "solve");
     ModuleBase::timer::tick("HSolverPW", "solve");
@@ -141,7 +155,27 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     // select the method of diagonalization
     this->method = method_in;
     this->initDiagh(psi);
+
     std::vector<Real> eigenvalues(pes->ekb.nr * pes->ekb.nc, 0);
+
+    if (this->is_first_scf == true)
+    {
+        is_occupied.resize(psi.get_nk() * psi.get_nbands(), true);
+    }
+    else
+    {
+        for (size_t i = 0; i < psi.get_nk(); i++)
+        {
+            for (size_t j = 0; j < psi.get_nbands(); j++)
+            {
+                if (pes->wg(i, j) < 1.0)
+                {
+                    is_occupied[i * psi.get_nbands() + j] = false;
+                }
+            }
+        }
+    }
+
     /// Loop over k points for solve Hamiltonian to charge density
     for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
     {
@@ -149,13 +183,13 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
         pHamilt->updateHk(ik);
 
 #ifdef USE_PAW
-	    if(GlobalV::use_paw)
+        if (GlobalV::use_paw)
         {
             const int npw = this->wfc_basis->npwk[ik];
-            ModuleBase::Vector3<double> *_gk = new ModuleBase::Vector3<double>[npw];
-            for (int ig = 0;ig < npw; ig++)
+            ModuleBase::Vector3<double>* _gk = new ModuleBase::Vector3<double>[npw];
+            for (int ig = 0; ig < npw; ig++)
             {
-                _gk[ig] = this->wfc_basis->getgpluskcar(ik,ig);
+                _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
             double* kpt;
@@ -164,11 +198,11 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
 
-            double ** kpg;
-            double ** gcar;
+            double** kpg;
+            double** gcar;
             kpg = new double*[npw];
             gcar = new double*[npw];
-            for(int ipw=0;ipw<npw;ipw++)
+            for (int ipw = 0; ipw < npw; ipw++)
             {
                 kpg[ipw] = new double[3];
                 kpg[ipw][0] = _gk[ipw].x;
@@ -176,19 +210,23 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
                 kpg[ipw][2] = _gk[ipw].z;
 
                 gcar[ipw] = new double[3];
-                gcar[ipw][0] = this->wfc_basis->getgcar(ik,ipw).x;
-                gcar[ipw][1] = this->wfc_basis->getgcar(ik,ipw).y;
-                gcar[ipw][2] = this->wfc_basis->getgcar(ik,ipw).z;
+                gcar[ipw][0] = this->wfc_basis->getgcar(ik, ipw).x;
+                gcar[ipw][1] = this->wfc_basis->getgcar(ik, ipw).y;
+                gcar[ipw][2] = this->wfc_basis->getgcar(ik, ipw).z;
             }
 
-            GlobalC::paw_cell.set_paw_k(npw,wfc_basis->npwk_max,kpt,
-                this->wfc_basis->get_ig2ix(ik).data(),
-                this->wfc_basis->get_ig2iy(ik).data(),
-                this->wfc_basis->get_ig2iz(ik).data(),
-                (const double **) kpg,GlobalC::ucell.tpiba,(const double **) gcar);
+            GlobalC::paw_cell.set_paw_k(npw,
+                                        wfc_basis->npwk_max,
+                                        kpt,
+                                        this->wfc_basis->get_ig2ix(ik).data(),
+                                        this->wfc_basis->get_ig2iy(ik).data(),
+                                        this->wfc_basis->get_ig2iz(ik).data(),
+                                        (const double**)kpg,
+                                        GlobalC::ucell.tpiba,
+                                        (const double**)gcar);
 
             delete[] kpt;
-            for(int ipw = 0; ipw < npw; ipw++)
+            for (int ipw = 0; ipw < npw; ipw++)
             {
                 delete[] kpg[ipw];
                 delete[] gcar[ipw];
@@ -213,19 +251,24 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
 
         /// solve eigenvector and eigenvalue for H(k)
         this->hamiltSolvePsiK(pHamilt, psi, eigenvalues.data() + ik * pes->ekb.nc);
-        if(skip_charge)
+
+        if (skip_charge)
         {
-            GlobalV::ofs_running<< "Average iterative diagonalization steps for k-points "<<ik<<" is: "<<DiagoIterAssist<T, Device>::avg_iter
-                <<" ; where current threshold is: "<<DiagoIterAssist<T, Device>::PW_DIAG_THR<<" . "<<std::endl;
+            GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+                                 << " is: " << DiagoIterAssist<T, Device>::avg_iter
+                                 << " ; where current threshold is: " << DiagoIterAssist<T, Device>::PW_DIAG_THR
+                                 << " . " << std::endl;
             DiagoIterAssist<T, Device>::avg_iter = 0.0;
         }
         /// calculate the contribution of Psi for charge density rho
     }
     castmem_2d_2h_op()(cpu_ctx, cpu_ctx, pes->ekb.c, eigenvalues.data(), pes->ekb.nr * pes->ekb.nc);
 
+    this->is_first_scf = false;
+
     this->endDiagh();
 
-    if(skip_charge)
+    if (skip_charge)
     {
         ModuleBase::timer::tick("HSolverPW", "solve");
         return;
@@ -233,9 +276,9 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     reinterpret_cast<elecstate::ElecStatePW<T, Device>*>(pes)->psiToRho(psi);
 
 #ifdef USE_PAW
-    if(GlobalV::use_paw)
+    if (GlobalV::use_paw)
     {
-        if(typeid(Real) != typeid(double))
+        if (typeid(Real) != typeid(double))
         {
             ModuleBase::WARNING_QUIT("HSolverPW::solve", "PAW is only supported for double precision!");
         }
@@ -244,10 +287,10 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
         for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
         {
             const int npw = this->wfc_basis->npwk[ik];
-            ModuleBase::Vector3<double> *_gk = new ModuleBase::Vector3<double>[npw];
-            for (int ig = 0;ig < npw; ig++)
+            ModuleBase::Vector3<double>* _gk = new ModuleBase::Vector3<double>[npw];
+            for (int ig = 0; ig < npw; ig++)
             {
-                _gk[ig] = this->wfc_basis->getgpluskcar(ik,ig);
+                _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
             double* kpt;
@@ -256,11 +299,11 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
 
-            double ** kpg;
-            double ** gcar;
+            double** kpg;
+            double** gcar;
             kpg = new double*[npw];
             gcar = new double*[npw];
-            for(int ipw=0;ipw<npw;ipw++)
+            for (int ipw = 0; ipw < npw; ipw++)
             {
                 kpg[ipw] = new double[3];
                 kpg[ipw][0] = _gk[ipw].x;
@@ -268,19 +311,23 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
                 kpg[ipw][2] = _gk[ipw].z;
 
                 gcar[ipw] = new double[3];
-                gcar[ipw][0] = this->wfc_basis->getgcar(ik,ipw).x;
-                gcar[ipw][1] = this->wfc_basis->getgcar(ik,ipw).y;
-                gcar[ipw][2] = this->wfc_basis->getgcar(ik,ipw).z;
+                gcar[ipw][0] = this->wfc_basis->getgcar(ik, ipw).x;
+                gcar[ipw][1] = this->wfc_basis->getgcar(ik, ipw).y;
+                gcar[ipw][2] = this->wfc_basis->getgcar(ik, ipw).z;
             }
 
-            GlobalC::paw_cell.set_paw_k(npw,wfc_basis->npwk_max,kpt,
-                this->wfc_basis->get_ig2ix(ik).data(),
-                this->wfc_basis->get_ig2iy(ik).data(),
-                this->wfc_basis->get_ig2iz(ik).data(),
-                (const double **) kpg,GlobalC::ucell.tpiba,(const double **) gcar);
+            GlobalC::paw_cell.set_paw_k(npw,
+                                        wfc_basis->npwk_max,
+                                        kpt,
+                                        this->wfc_basis->get_ig2ix(ik).data(),
+                                        this->wfc_basis->get_ig2iy(ik).data(),
+                                        this->wfc_basis->get_ig2iz(ik).data(),
+                                        (const double**)kpg,
+                                        GlobalC::ucell.tpiba,
+                                        (const double**)gcar);
 
             delete[] kpt;
-            for(int ipw = 0; ipw < npw; ipw++)
+            for (int ipw = 0; ipw < npw; ipw++)
             {
                 delete[] kpg[ipw];
                 delete[] gcar[ipw];
@@ -289,13 +336,14 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
             delete[] gcar;
 
             GlobalC::paw_cell.get_vkb();
-            
+
             psi.fix_k(ik);
             GlobalC::paw_cell.set_currentk(ik);
             int nbands = psi.get_nbands();
-            for(int ib = 0; ib < nbands; ib ++)
+            for (int ib = 0; ib < nbands; ib++)
             {
-                GlobalC::paw_cell.accumulate_rhoij(reinterpret_cast<std::complex<double>*> (psi.get_pointer(ib)), pes->wg(ik,ib));
+                GlobalC::paw_cell.accumulate_rhoij(reinterpret_cast<std::complex<double>*>(psi.get_pointer(ib)),
+                                                   pes->wg(ik, ib));
             }
         }
 
@@ -304,41 +352,49 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
         std::vector<int> nrhoijsel;
 
 #ifdef __MPI
-        if(GlobalV::RANK_IN_POOL == 0)
+        if (GlobalV::RANK_IN_POOL == 0)
         {
             GlobalC::paw_cell.get_rhoijp(rhoijp, rhoijselect, nrhoijsel);
 
-            for(int iat = 0; iat < GlobalC::ucell.nat; iat ++)
+            for (int iat = 0; iat < GlobalC::ucell.nat; iat++)
             {
-                GlobalC::paw_cell.set_rhoij(iat,nrhoijsel[iat],rhoijselect[iat].size(),rhoijselect[iat].data(),rhoijp[iat].data());
-            }  
+                GlobalC::paw_cell.set_rhoij(iat,
+                                            nrhoijsel[iat],
+                                            rhoijselect[iat].size(),
+                                            rhoijselect[iat].data(),
+                                            rhoijp[iat].data());
+            }
         }
 #else
         GlobalC::paw_cell.get_rhoijp(rhoijp, rhoijselect, nrhoijsel);
 
-        for(int iat = 0; iat < GlobalC::ucell.nat; iat ++)
+        for (int iat = 0; iat < GlobalC::ucell.nat; iat++)
         {
-            GlobalC::paw_cell.set_rhoij(iat,nrhoijsel[iat],rhoijselect[iat].size(),rhoijselect[iat].data(),rhoijp[iat].data());
+            GlobalC::paw_cell.set_rhoij(iat,
+                                        nrhoijsel[iat],
+                                        rhoijselect[iat].size(),
+                                        rhoijselect[iat].data(),
+                                        rhoijp[iat].data());
         }
 
 #endif
         double* nhatgr;
-        GlobalC::paw_cell.get_nhat(pes->charge->nhat,nhatgr);
+        GlobalC::paw_cell.get_nhat(pes->charge->nhat, nhatgr);
     }
 #endif
     ModuleBase::timer::tick("HSolverPW", "solve");
     return;
 }
 
-/* 
+/*
     lcao_in_pw
 */
 template <typename T, typename Device>
 void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_KS_PW::p_hamilt
-                                      psi::Psi<T, Device>& psi,      // ESolver_KS_PW::kspw_psi
-                                      elecstate::ElecState* pes,     // ESolver_KS_PW::pes
-                                      psi::Psi<T, Device>& transform,
-                                      const bool skip_charge)
+                                 psi::Psi<T, Device>& psi,           // ESolver_KS_PW::kspw_psi
+                                 elecstate::ElecState* pes,          // ESolver_KS_PW::pes
+                                 psi::Psi<T, Device>& transform,
+                                 const bool skip_charge)
 {
     ModuleBase::TITLE("HSolverPW", "solve");
     ModuleBase::timer::tick("HSolverPW", "solve");
@@ -348,13 +404,13 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
         /// update H(k) for each k point
         pHamilt->updateHk(ik);
 #ifdef USE_PAW
-	    if(GlobalV::use_paw)
+        if (GlobalV::use_paw)
         {
             const int npw = this->wfc_basis->npwk[ik];
-            ModuleBase::Vector3<double> *_gk = new ModuleBase::Vector3<double>[npw];
-            for (int ig = 0;ig < npw; ig++)
+            ModuleBase::Vector3<double>* _gk = new ModuleBase::Vector3<double>[npw];
+            for (int ig = 0; ig < npw; ig++)
             {
-                _gk[ig] = this->wfc_basis->getgpluskcar(ik,ig);
+                _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
             double* kpt;
@@ -363,11 +419,11 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
 
-            double ** kpg;
-            double ** gcar;
+            double** kpg;
+            double** gcar;
             kpg = new double*[npw];
             gcar = new double*[npw];
-            for(int ipw=0;ipw<npw;ipw++)
+            for (int ipw = 0; ipw < npw; ipw++)
             {
                 kpg[ipw] = new double[3];
                 kpg[ipw][0] = _gk[ipw].x;
@@ -375,19 +431,23 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
                 kpg[ipw][2] = _gk[ipw].z;
 
                 gcar[ipw] = new double[3];
-                gcar[ipw][0] = this->wfc_basis->getgcar(ik,ipw).x;
-                gcar[ipw][1] = this->wfc_basis->getgcar(ik,ipw).y;
-                gcar[ipw][2] = this->wfc_basis->getgcar(ik,ipw).z;
+                gcar[ipw][0] = this->wfc_basis->getgcar(ik, ipw).x;
+                gcar[ipw][1] = this->wfc_basis->getgcar(ik, ipw).y;
+                gcar[ipw][2] = this->wfc_basis->getgcar(ik, ipw).z;
             }
 
-            GlobalC::paw_cell.set_paw_k(npw,wfc_basis->npwk_max,kpt,
-                this->wfc_basis->get_ig2ix(ik).data(),
-                this->wfc_basis->get_ig2iy(ik).data(),
-                this->wfc_basis->get_ig2iz(ik).data(),
-                (const double **) kpg,GlobalC::ucell.tpiba,(const double **) gcar);
+            GlobalC::paw_cell.set_paw_k(npw,
+                                        wfc_basis->npwk_max,
+                                        kpt,
+                                        this->wfc_basis->get_ig2ix(ik).data(),
+                                        this->wfc_basis->get_ig2iy(ik).data(),
+                                        this->wfc_basis->get_ig2iz(ik).data(),
+                                        (const double**)kpg,
+                                        GlobalC::ucell.tpiba,
+                                        (const double**)gcar);
 
             delete[] kpt;
-            for(int ipw = 0; ipw < npw; ipw++)
+            for (int ipw = 0; ipw < npw; ipw++)
             {
                 delete[] kpg[ipw];
                 delete[] gcar[ipw];
@@ -404,35 +464,36 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
         transform.fix_k(ik);
         /// solve eigenvector and eigenvalue for H(k)
 
-        //hsolver::DiagoIterAssist<T, Device>::diagH_subspace(
-        //    pHamilt,                                // interface to hamilt
-        //    transform,                              // transform matrix between lcao and pw
-        //    psi,                                    // psi in pw basis
-        //    eigenvalues.data() + ik * pes->ekb.nc,  // eigenvalues
-        //    psi.get_nbands()                        // number of the lowest energies bands
-        //    );
+        // hsolver::DiagoIterAssist<T, Device>::diagH_subspace(
+        //     pHamilt,                                // interface to hamilt
+        //     transform,                              // transform matrix between lcao and pw
+        //     psi,                                    // psi in pw basis
+        //     eigenvalues.data() + ik * pes->ekb.nc,  // eigenvalues
+        //     psi.get_nbands()                        // number of the lowest energies bands
+        //     );
 
-        
         hsolver::DiagoIterAssist<T, Device>::diagH_subspace_init(
-            pHamilt,                                // interface to hamilt
-            transform.get_pointer(),                // transform matrix between lcao and pw
+            pHamilt,                 // interface to hamilt
+            transform.get_pointer(), // transform matrix between lcao and pw
             transform.get_nbands(),
             transform.get_nbasis(),
-            psi,                                    // psi in pw basis
-            eigenvalues.data() + ik * pes->ekb.nc   // eigenvalues
-            );
-        
-        if(skip_charge)
+            psi,                                  // psi in pw basis
+            eigenvalues.data() + ik * pes->ekb.nc // eigenvalues
+        );
+
+        if (skip_charge)
         {
-            GlobalV::ofs_running<< "Average iterative diagonalization steps for k-points "<<ik<<" is: "<<DiagoIterAssist<T, Device>::avg_iter
-                <<" ; where current threshold is: "<<DiagoIterAssist<T, Device>::PW_DIAG_THR<<" . "<<std::endl;
+            GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+                                 << " is: " << DiagoIterAssist<T, Device>::avg_iter
+                                 << " ; where current threshold is: " << DiagoIterAssist<T, Device>::PW_DIAG_THR
+                                 << " . " << std::endl;
             DiagoIterAssist<T, Device>::avg_iter = 0.0;
         }
         /// calculate the contribution of Psi for charge density rho
-     }
+    }
     castmem_2d_2h_op()(cpu_ctx, cpu_ctx, pes->ekb.c, eigenvalues.data(), pes->ekb.nr * pes->ekb.nc);
 
-    if(skip_charge)
+    if (skip_charge)
     {
         ModuleBase::timer::tick("HSolverPW", "solve");
         return;
@@ -440,9 +501,9 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
     reinterpret_cast<elecstate::ElecStatePW<T, Device>*>(pes)->psiToRho(psi);
 
 #ifdef USE_PAW
-    if(GlobalV::use_paw)
+    if (GlobalV::use_paw)
     {
-        if(typeid(Real) != typeid(double))
+        if (typeid(Real) != typeid(double))
         {
             ModuleBase::WARNING_QUIT("HSolverPW::solve", "PAW is only supported for double precision!");
         }
@@ -451,10 +512,10 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
         for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
         {
             const int npw = this->wfc_basis->npwk[ik];
-            ModuleBase::Vector3<double> *_gk = new ModuleBase::Vector3<double>[npw];
-            for (int ig = 0;ig < npw; ig++)
+            ModuleBase::Vector3<double>* _gk = new ModuleBase::Vector3<double>[npw];
+            for (int ig = 0; ig < npw; ig++)
             {
-                _gk[ig] = this->wfc_basis->getgpluskcar(ik,ig);
+                _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
             double* kpt;
@@ -463,11 +524,11 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
 
-            double ** kpg;
-            double ** gcar;
+            double** kpg;
+            double** gcar;
             kpg = new double*[npw];
             gcar = new double*[npw];
-            for(int ipw=0;ipw<npw;ipw++)
+            for (int ipw = 0; ipw < npw; ipw++)
             {
                 kpg[ipw] = new double[3];
                 kpg[ipw][0] = _gk[ipw].x;
@@ -475,19 +536,23 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
                 kpg[ipw][2] = _gk[ipw].z;
 
                 gcar[ipw] = new double[3];
-                gcar[ipw][0] = this->wfc_basis->getgcar(ik,ipw).x;
-                gcar[ipw][1] = this->wfc_basis->getgcar(ik,ipw).y;
-                gcar[ipw][2] = this->wfc_basis->getgcar(ik,ipw).z;
+                gcar[ipw][0] = this->wfc_basis->getgcar(ik, ipw).x;
+                gcar[ipw][1] = this->wfc_basis->getgcar(ik, ipw).y;
+                gcar[ipw][2] = this->wfc_basis->getgcar(ik, ipw).z;
             }
 
-            GlobalC::paw_cell.set_paw_k(npw,wfc_basis->npwk_max,kpt,
-                this->wfc_basis->get_ig2ix(ik).data(),
-                this->wfc_basis->get_ig2iy(ik).data(),
-                this->wfc_basis->get_ig2iz(ik).data(),
-                (const double **) kpg,GlobalC::ucell.tpiba,(const double **) gcar);
+            GlobalC::paw_cell.set_paw_k(npw,
+                                        wfc_basis->npwk_max,
+                                        kpt,
+                                        this->wfc_basis->get_ig2ix(ik).data(),
+                                        this->wfc_basis->get_ig2iy(ik).data(),
+                                        this->wfc_basis->get_ig2iz(ik).data(),
+                                        (const double**)kpg,
+                                        GlobalC::ucell.tpiba,
+                                        (const double**)gcar);
 
             delete[] kpt;
-            for(int ipw = 0; ipw < npw; ipw++)
+            for (int ipw = 0; ipw < npw; ipw++)
             {
                 delete[] kpg[ipw];
                 delete[] gcar[ipw];
@@ -496,13 +561,14 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
             delete[] gcar;
 
             GlobalC::paw_cell.get_vkb();
-            
+
             psi.fix_k(ik);
             GlobalC::paw_cell.set_currentk(ik);
             int nbands = psi.get_nbands();
-            for(int ib = 0; ib < nbands; ib ++)
+            for (int ib = 0; ib < nbands; ib++)
             {
-                GlobalC::paw_cell.accumulate_rhoij(reinterpret_cast<std::complex<double>*> (psi.get_pointer(ib)), pes->wg(ik,ib));
+                GlobalC::paw_cell.accumulate_rhoij(reinterpret_cast<std::complex<double>*>(psi.get_pointer(ib)),
+                                                   pes->wg(ik, ib));
             }
         }
 
@@ -511,82 +577,98 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
         std::vector<int> nrhoijsel;
 
 #ifdef __MPI
-        if(GlobalV::RANK_IN_POOL == 0)
+        if (GlobalV::RANK_IN_POOL == 0)
         {
             GlobalC::paw_cell.get_rhoijp(rhoijp, rhoijselect, nrhoijsel);
 
-            for(int iat = 0; iat < GlobalC::ucell.nat; iat ++)
+            for (int iat = 0; iat < GlobalC::ucell.nat; iat++)
             {
-                GlobalC::paw_cell.set_rhoij(iat,nrhoijsel[iat],rhoijselect[iat].size(),rhoijselect[iat].data(),rhoijp[iat].data());
-            }  
+                GlobalC::paw_cell.set_rhoij(iat,
+                                            nrhoijsel[iat],
+                                            rhoijselect[iat].size(),
+                                            rhoijselect[iat].data(),
+                                            rhoijp[iat].data());
+            }
         }
 #else
         GlobalC::paw_cell.get_rhoijp(rhoijp, rhoijselect, nrhoijsel);
 
-        for(int iat = 0; iat < GlobalC::ucell.nat; iat ++)
+        for (int iat = 0; iat < GlobalC::ucell.nat; iat++)
         {
-            GlobalC::paw_cell.set_rhoij(iat,nrhoijsel[iat],rhoijselect[iat].size(),rhoijselect[iat].data(),rhoijp[iat].data());
+            GlobalC::paw_cell.set_rhoij(iat,
+                                        nrhoijsel[iat],
+                                        rhoijselect[iat].size(),
+                                        rhoijselect[iat].data(),
+                                        rhoijp[iat].data());
         }
 
 #endif
         double* nhatgr;
-        GlobalC::paw_cell.get_nhat(pes->charge->nhat,nhatgr);
+        GlobalC::paw_cell.get_nhat(pes->charge->nhat, nhatgr);
     }
 #endif
     ModuleBase::timer::tick("HSolverPW", "solve");
     return;
 }
 
-template<typename T, typename Device>
+template <typename T, typename Device>
 void HSolverPW<T, Device>::endDiagh()
 {
     // DiagoCG would keep 9*nbasis memory in cache during loop-k
     // it should be deleted before calculating charge
-    if(this->method == "cg")
+    if (this->method == "cg")
     {
         delete reinterpret_cast<DiagoCG<T, Device>*>(this->pdiagh);
         this->pdiagh = nullptr;
     }
-    if(this->method == "dav")
+    if (this->method == "dav")
     {
         delete (DiagoDavid<T, Device>*)this->pdiagh;
         this->pdiagh = nullptr;
     }
-    if(this->method == "bpcg")
+    if (this->method == "dav_subspace")
+    {
+        delete (Diago_DavSubspace<T, Device>*)this->pdiagh;
+        this->pdiagh = nullptr;
+    }
+    if (this->method == "bpcg")
     {
         delete (DiagoBPCG<T, Device>*)this->pdiagh;
         this->pdiagh = nullptr;
     }
 
-    //in PW base, average iteration steps for each band and k-point should be printing
-    if(DiagoIterAssist<T, Device>::avg_iter > 0.0)
+    // in PW base, average iteration steps for each band and k-point should be printing
+    if (DiagoIterAssist<T, Device>::avg_iter > 0.0)
     {
-        GlobalV::ofs_running<< "Average iterative diagonalization steps: "<<DiagoIterAssist<T, Device>::avg_iter / this->wfc_basis->nks
-            <<" ; where current threshold is: "<<DiagoIterAssist<T, Device>::PW_DIAG_THR<<" . "<<std::endl;
-        //reset avg_iter
+        GlobalV::ofs_running << "Average iterative diagonalization steps: "
+                             << DiagoIterAssist<T, Device>::avg_iter / this->wfc_basis->nks
+                             << " ; where current threshold is: " << DiagoIterAssist<T, Device>::PW_DIAG_THR << " . "
+                             << std::endl;
+
+        // std::cout << "avg_iter == " << DiagoIterAssist<T, Device>::avg_iter << std::endl;
+
+        // reset avg_iter
         DiagoIterAssist<T, Device>::avg_iter = 0.0;
     }
-    //psi only should be initialed once for PW
-    if(!this->initialed_psi)
+    // psi only should be initialed once for PW
+    if (!this->initialed_psi)
     {
         this->initialed_psi = true;
     }
 }
 
 template <typename T, typename Device>
-void HSolverPW<T, Device>::updatePsiK(hamilt::Hamilt<T, Device>* pHamilt,
-                                           psi::Psi<T, Device>& psi,
-                                           const int ik)
+void HSolverPW<T, Device>::updatePsiK(hamilt::Hamilt<T, Device>* pHamilt, psi::Psi<T, Device>& psi, const int ik)
 {
     psi.fix_k(ik);
-    if(GlobalV::psi_initializer) // new psi initialization method branch
+    if (GlobalV::psi_initializer) // new psi initialization method branch
     {
         // do nothing here, because we have already initialize, allocate and make initial guess
         // basis_type lcao_in_pw function may be inserted here
     }
-    else if(!this->initialed_psi) // old psi initialization method branch
+    else if (!this->initialed_psi) // old psi initialization method branch
     {
-        if(GlobalV::BASIS_TYPE=="pw")
+        if (GlobalV::BASIS_TYPE == "pw")
         {
             hamilt::diago_PAO_in_pw_k2(this->ctx, ik, psi, this->wfc_basis, this->pwf, pHamilt);
         }
@@ -594,11 +676,20 @@ void HSolverPW<T, Device>::updatePsiK(hamilt::Hamilt<T, Device>* pHamilt,
     }
 }
 
-template<typename T, typename Device>
+template <typename T, typename Device>
 void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm, psi::Psi<T, Device>& psi, Real* eigenvalue)
 {
-    if (this->method != "cg") {
-        this->pdiagh->diag(hm, psi, eigenvalue);
+    if (this->method != "cg")
+    {
+        if (this->method == "dav_subspace")
+        {
+
+            ((Diago_DavSubspace<T, Device>*)this->pdiagh)->diag(hm, psi, eigenvalue, is_occupied);
+        }
+        else
+        {
+            this->pdiagh->diag(hm, psi, eigenvalue);
+        }
         return;
     }
     // warp the hpsi_func and spsi_func into a lambda function
@@ -608,16 +699,16 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm, psi::P
     auto ngk_pointer = psi.get_ngk_pointer();
     auto hpsi_func = [hm, ngk_pointer](const ct::Tensor& psi_in, ct::Tensor& hpsi_out) {
         ModuleBase::timer::tick("DiagoCG_New", "hpsi_func");
-        // psi_in should be a 2D tensor: 
+        // psi_in should be a 2D tensor:
         // psi_in.shape() = [nbands, nbasis]
         const auto ndim = psi_in.shape().ndim();
         REQUIRES_OK(ndim <= 2, "dims of psi_in should be less than or equal to 2");
         // Convert a Tensor object to a psi::Psi object
-        auto psi_wrapper = psi::Psi<T, Device>(
-            psi_in.data<T>(), 1, 
-            ndim == 1 ? 1 : psi_in.shape().dim_size(0), 
-            ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1),
-            ngk_pointer);
+        auto psi_wrapper = psi::Psi<T, Device>(psi_in.data<T>(),
+                                               1,
+                                               ndim == 1 ? 1 : psi_in.shape().dim_size(0),
+                                               ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1),
+                                               ngk_pointer);
         psi::Range all_bands_range(true, psi_wrapper.get_current_k(), 0, psi_wrapper.get_nbands() - 1);
         using hpsi_info = typename hamilt::Operator<T, Device>::hpsi_info;
         hpsi_info info(&psi_wrapper, all_bands_range, hpsi_out.data<T>());
@@ -626,7 +717,7 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm, psi::P
     };
     auto spsi_func = [this, hm](const ct::Tensor& psi_in, ct::Tensor& spsi_out) {
         ModuleBase::timer::tick("DiagoCG_New", "spsi_func");
-        // psi_in should be a 2D tensor: 
+        // psi_in should be a 2D tensor:
         // psi_in.shape() = [nbands, nbasis]
         const auto ndim = psi_in.shape().ndim();
         REQUIRES_OK(ndim <= 2, "dims of psi_in should be less than or equal to 2");
@@ -634,11 +725,13 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm, psi::P
         if (GlobalV::use_uspp)
         {
             // Convert a Tensor object to a psi::Psi object
-            hm->sPsi(psi_in.data<T>(), spsi_out.data<T>(), 
-            ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1), 
-            ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1), 
-            ndim == 1 ? 1 : psi_in.shape().dim_size(0));
-        } else
+            hm->sPsi(psi_in.data<T>(),
+                     spsi_out.data<T>(),
+                     ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1),
+                     ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1),
+                     ndim == 1 ? 1 : psi_in.shape().dim_size(0));
+        }
+        else
         {
             psi::memory::synchronize_memory_op<T, Device, Device>()(
                 this->ctx,
@@ -648,32 +741,32 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm, psi::P
                 static_cast<size_t>((ndim == 1 ? 1 : psi_in.shape().dim_size(0))
                                     * (ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1))));
         }
-        
+
         ModuleBase::timer::tick("DiagoCG_New", "spsi_func");
     };
-    auto psi_tensor = ct::TensorMap(
-        psi.get_pointer(), 
-        ct::DataTypeToEnum<T>::value, 
-        ct::DeviceTypeToEnum<ct_Device>::value,
-        ct::TensorShape({psi.get_nbands(), psi.get_nbasis()})).slice({0, 0}, {psi.get_nbands(), psi.get_current_nbas()});
-    auto eigen_tensor = ct::TensorMap(
-        eigenvalue,
-        ct::DataTypeToEnum<Real>::value,
-        ct::DeviceTypeToEnum<ct::DEVICE_CPU>::value,
-        ct::TensorShape({psi.get_nbands()}));
-    auto prec_tensor = ct::TensorMap(
-        precondition.data(),
-        ct::DataTypeToEnum<Real>::value, 
-        ct::DeviceTypeToEnum<ct::DEVICE_CPU>::value,
-        ct::TensorShape({static_cast<int>(precondition.size())})).to_device<ct_Device>().slice({0}, {psi.get_current_nbas()});
-    
+    auto psi_tensor = ct::TensorMap(psi.get_pointer(),
+                                    ct::DataTypeToEnum<T>::value,
+                                    ct::DeviceTypeToEnum<ct_Device>::value,
+                                    ct::TensorShape({psi.get_nbands(), psi.get_nbasis()}))
+                          .slice({0, 0}, {psi.get_nbands(), psi.get_current_nbas()});
+    auto eigen_tensor = ct::TensorMap(eigenvalue,
+                                      ct::DataTypeToEnum<Real>::value,
+                                      ct::DeviceTypeToEnum<ct::DEVICE_CPU>::value,
+                                      ct::TensorShape({psi.get_nbands()}));
+    auto prec_tensor = ct::TensorMap(precondition.data(),
+                                     ct::DataTypeToEnum<Real>::value,
+                                     ct::DeviceTypeToEnum<ct::DEVICE_CPU>::value,
+                                     ct::TensorShape({static_cast<int>(precondition.size())}))
+                           .to_device<ct_Device>()
+                           .slice({0}, {psi.get_current_nbas()});
+
     cg->diag(hpsi_func, spsi_func, psi_tensor, eigen_tensor, prec_tensor);
     // TODO: Double check tensormap's potential problem
     ct::TensorMap(psi.get_pointer(), psi_tensor, {psi.get_nbands(), psi.get_nbasis()}).sync(psi_tensor);
 }
 
-template<typename T, typename Device>
-void HSolverPW<T, Device>::update_precondition(std::vector<Real> &h_diag, const int ik, const int npw)
+template <typename T, typename Device>
+void HSolverPW<T, Device>::update_precondition(std::vector<Real>& h_diag, const int ik, const int npw)
 {
     h_diag.assign(h_diag.size(), 1.0);
     int precondition_type = 2;
@@ -688,7 +781,7 @@ void HSolverPW<T, Device>::update_precondition(std::vector<Real> &h_diag, const
     {
         for (int ig = 0; ig < npw; ig++)
         {
-            Real g2kin = static_cast<Real>(this->wfc_basis->getgk2(ik,ig)) * tpiba2;
+            Real g2kin = static_cast<Real>(this->wfc_basis->getgk2(ik, ig)) * tpiba2;
             h_diag[ig] = std::max(static_cast<Real>(1.0), g2kin);
         }
     }
@@ -696,30 +789,38 @@ void HSolverPW<T, Device>::update_precondition(std::vector<Real> &h_diag, const
     {
         for (int ig = 0; ig < npw; ig++)
         {
-            Real g2kin = static_cast<Real>(this->wfc_basis->getgk2(ik,ig)) * tpiba2;
-            h_diag[ig] = 1 + g2kin + sqrt(1 + (g2kin - 1) * (g2kin - 1));
+            Real g2kin = static_cast<Real>(this->wfc_basis->getgk2(ik, ig)) * tpiba2;
+
+            if (this->method == "dav_subspace")
+            {
+                h_diag[ig] = g2kin;
+            }
+            else
+            {
+                h_diag[ig] = 1 + g2kin + sqrt(1 + (g2kin - 1) * (g2kin - 1));
+            }
         }
     }
-    if(GlobalV::NSPIN==4)
+    if (GlobalV::NSPIN == 4)
     {
         const int size = h_diag.size();
         for (int ig = 0; ig < npw; ig++)
         {
-            h_diag[ig+size/2] = h_diag[ig];
+            h_diag[ig + size / 2] = h_diag[ig];
         }
     }
 }
 
-template<typename T, typename Device>
+template <typename T, typename Device>
 typename HSolverPW<T, Device>::Real HSolverPW<T, Device>::cal_hsolerror()
 {
     return this->diag_ethr * static_cast<Real>(std::max(1.0, GlobalV::nelec));
 }
 
-template<typename T, typename Device>
+template <typename T, typename Device>
 typename HSolverPW<T, Device>::Real HSolverPW<T, Device>::set_diagethr(const int istep, const int iter, const Real drho)
 {
-    //It is too complex now and should be modified.
+    // It is too complex now and should be modified.
     if (iter == 1)
     {
         if (std::abs(this->diag_ethr - 1.0e-2) < 1.0e-6)
@@ -754,19 +855,24 @@ typename HSolverPW<T, Device>::Real HSolverPW<T, Device>::set_diagethr(const int
         {
             this->diag_ethr = 1.e-2;
         }
-        this->diag_ethr = std::min(this->diag_ethr, static_cast<Real>(0.1) * drho / std::max(static_cast<Real>(1.0), static_cast<Real>(GlobalV::nelec)));
+        this->diag_ethr = std::min(this->diag_ethr,
+                                   static_cast<Real>(0.1) * drho
+                                       / std::max(static_cast<Real>(1.0), static_cast<Real>(GlobalV::nelec)));
     }
     // It is essential for single precision implementation to keep the diag_ethr value
     // less or equal to the single-precision limit of convergence(0.5e-4).
     // modified by denghuilu at 2023-05-15
-    if (GlobalV::precision_flag == "single") {
+    if (GlobalV::precision_flag == "single")
+    {
         this->diag_ethr = std::max(this->diag_ethr, static_cast<Real>(0.5e-4));
     }
     return this->diag_ethr;
 }
 
-template<typename T, typename Device>
-typename HSolverPW<T, Device>::Real HSolverPW<T, Device>::reset_diagethr(std::ofstream& ofs_running, const Real hsover_error, const Real drho)
+template <typename T, typename Device>
+typename HSolverPW<T, Device>::Real HSolverPW<T, Device>::reset_diagethr(std::ofstream& ofs_running,
+                                                                         const Real hsover_error,
+                                                                         const Real drho)
 {
     ofs_running << " Notice: Threshold on eigenvalues was too large.\n";
     ModuleBase::WARNING("scf", "Threshold on eigenvalues was too large.");
@@ -776,7 +882,8 @@ typename HSolverPW<T, Device>::Real HSolverPW<T, Device>::reset_diagethr(std::of
     // It is essential for single precision implementation to keep the diag_ethr value
     // less or equal to the single-precision limit of convergence(0.5e-4).
     // modified by denghuilu at 2023-05-15
-    if (GlobalV::precision_flag == "single") {
+    if (GlobalV::precision_flag == "single")
+    {
         this->diag_ethr = std::max(this->diag_ethr, static_cast<Real>(0.5e-4));
     }
     ofs_running << " New    diag_ethr = " << this->diag_ethr << std::endl;
diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h
index a6aedf1cff..afc19ac65c 100644
--- a/source/module_hsolver/hsolver_pw.h
+++ b/source/module_hsolver/hsolver_pw.h
@@ -12,6 +12,7 @@ template<typename T, typename Device = psi::DEVICE_CPU>
 class HSolverPW: public HSolver<T, Device>
 {
   private:
+    bool is_first_scf = true;
     // Note GetTypeReal<T>::type will 
     // return T if T is real type(float, double), 
     // otherwise return the real type of T(complex<float>, complex<double>)
@@ -67,6 +68,8 @@ class HSolverPW: public HSolver<T, Device>
     void update_precondition(std::vector<Real> &h_diag, const int ik, const int npw);
 
     std::vector<Real> precondition;
+    std::vector<Real> eigenvalues;
+    std::vector<bool> is_occupied;
 
     bool initialed_psi = false;
 
diff --git a/source/module_hsolver/kernels/cuda/dngvd_op.cu b/source/module_hsolver/kernels/cuda/dngvd_op.cu
index f22fe4c7f4..cc9d606dd0 100644
--- a/source/module_hsolver/kernels/cuda/dngvd_op.cu
+++ b/source/module_hsolver/kernels/cuda/dngvd_op.cu
@@ -243,14 +243,36 @@ struct dnevx_op<T, psi::DEVICE_GPU> {
     }
 };
 
+template <typename T>
+struct dngvx_op<T, psi::DEVICE_GPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const psi::DEVICE_GPU* d,
+                    const int nbase,
+                    const int ldh,
+                    T* hcc,
+                    T* scc,
+                    const int m,
+                    Real* eigenvalue,
+                    T* vcc)
+    {
+
+    }
+};
+
+
 template struct dngvd_op<std::complex<float>, psi::DEVICE_GPU>;
 template struct dnevx_op<std::complex<float>, psi::DEVICE_GPU>;
+template struct dngvx_op<std::complex<float>, psi::DEVICE_GPU>;
+
 template struct dngvd_op<std::complex<double>, psi::DEVICE_GPU>;
 template struct dnevx_op<std::complex<double>, psi::DEVICE_GPU>;
+template struct dngvx_op<std::complex<double>, psi::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dngvd_op<double, psi::DEVICE_GPU>;
 template struct dnevx_op<double, psi::DEVICE_GPU>;
+template struct dngvx_op<double, psi::DEVICE_GPU>;
 #endif
 
 } // namespace hsolver
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/dngvd_op.cpp b/source/module_hsolver/kernels/dngvd_op.cpp
index c4d8193c6e..3280794ee2 100644
--- a/source/module_hsolver/kernels/dngvd_op.cpp
+++ b/source/module_hsolver/kernels/dngvd_op.cpp
@@ -2,34 +2,23 @@
 
 #include <algorithm>
 
-namespace hsolver {
+namespace hsolver
+{
 
-    inline double get_real(const double& x)
+template <typename T>
+struct dngvd_op<T, psi::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const psi::DEVICE_CPU* d,
+                    const int nstart,
+                    const int ldh,
+                    const T* hcc,
+                    const T* scc,
+                    Real* eigenvalue,
+                    T* vcc)
     {
-        return x;
-    }
-    inline double get_real(const std::complex<double>& x)
-    {
-        return x.real();
-    }
-    inline float get_real(const std::complex<float>& x)
-    {
-        return x.real();
-    }
-
-    template <typename T>
-    struct dngvd_op<T, psi::DEVICE_CPU> {
-        using Real = typename GetTypeReal<T>::type;
-        void operator()(
-            const psi::DEVICE_CPU* d,
-            const int nstart,
-            const int ldh,
-            const T* hcc,
-            const T* scc,
-            Real* eigenvalue,
-            T* vcc)
-    {
-        for (int i = 0; i < nstart * ldh; i++) {
+        for (int i = 0; i < nstart * ldh; i++)
+        {
             vcc[i] = hcc[i];
         }
         int info = 0;
@@ -42,23 +31,41 @@ namespace hsolver {
         ModuleBase::GlobalFunc::ZEROS(rwork, lrwork);
 
         int liwork = 3 + 5 * nstart;
-        int *iwork = new int[liwork];
+        int* iwork = new int[liwork];
         ModuleBase::GlobalFunc::ZEROS(iwork, liwork);
 
         //===========================
         // calculate all eigenvalues
         //===========================
-        LapackConnector::xhegvd(1, 'V', 'U', nstart, vcc, ldh, scc, ldh, eigenvalue, work, lwork, rwork, lrwork, iwork, liwork, info);
+        LapackConnector::xhegvd(1,
+                                'V',
+                                'U',
+                                nstart,
+                                vcc,
+                                ldh,
+                                scc,
+                                ldh,
+                                eigenvalue,
+                                work,
+                                lwork,
+                                rwork,
+                                lrwork,
+                                iwork,
+                                liwork,
+                                info);
 
-        if (info != 0) {
-            std::cout << "Error: xhegvd failed, linear dependent basis functions\n" 
+        if (info != 0)
+        {
+            std::cout << "Error: xhegvd failed, linear dependent basis functions\n"
                       << ", wrong initialization of wavefunction, or wavefunction information loss\n"
                       << ", output overlap matrix scc.txt to check\n"
                       << std::endl;
             // print scc to file scc.txt
             std::ofstream ofs("scc.txt");
-            for (int i = 0; i < nstart; i++) {
-                for (int j = 0; j < nstart; j++) {
+            for (int i = 0; i < nstart; i++)
+            {
+                for (int j = 0; j < nstart; j++)
+                {
                     ofs << scc[i * ldh + j] << " ";
                 }
                 ofs << std::endl;
@@ -73,21 +80,78 @@ namespace hsolver {
     }
 };
 
+template <typename T>
+struct dngv_op<T, psi::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const psi::DEVICE_CPU* d,
+                    const int nbase,
+                    const int ldh,
+                    const T* hcc,
+                    T* scc,
+                    Real* eigenvalue,
+                    T* vcc)
+    {
+        for (int i = 0; i < nbase * ldh; i++)
+        {
+            vcc[i] = hcc[i];
+        }
+
+        int info = 0;
+
+        int lwork = 2 * nbase - 1;
+        T* work = new T[lwork];
+        ModuleBase::GlobalFunc::ZEROS(work, lwork);
+
+        int lrwork = 3 * nbase - 2;
+        Real* rwork = new Real[lrwork];
+        ModuleBase::GlobalFunc::ZEROS(rwork, lrwork);
+
+        //===========================
+        // calculate all eigenvalues
+        //===========================
+        LapackConnector::xhegv(1, 'V', 'U', nbase, vcc, ldh, scc, ldh, eigenvalue, work, lwork, rwork, info);
+
+        if (info != 0)
+        {
+            std::cout << "Error: xhegv failed, linear dependent basis functions\n"
+                      << ", wrong initialization of wavefunction, or wavefunction information loss\n"
+                      << ", output overlap matrix scc.txt to check\n"
+                      << std::endl;
+            // print scc to file scc.txt
+            std::ofstream ofs("scc.txt");
+            for (int i = 0; i < nbase; i++)
+            {
+                for (int j = 0; j < nbase; j++)
+                {
+                    ofs << scc[i * ldh + j] << " ";
+                }
+                ofs << std::endl;
+            }
+            ofs.close();
+        }
+        assert(0 == info);
+
+        delete[] work;
+        delete[] rwork;
+    }
+};
 
-    template <typename T>
-    struct dnevx_op<T, psi::DEVICE_CPU> {
-        using Real = typename GetTypeReal<T>::type;
-    void operator()(
-        const psi::DEVICE_CPU* /*ctx*/,
-        const int nstart,
-        const int ldh,
-        const T* hcc, // hcc
-        const int nbands, // nbands
-        Real* eigenvalue, // eigenvalue
-        T* vcc) // vcc
+template <typename T>
+struct dnevx_op<T, psi::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const psi::DEVICE_CPU* /*ctx*/,
+                    const int nstart,
+                    const int ldh,
+                    const T* hcc,     // hcc
+                    const int nbands, // nbands
+                    Real* eigenvalue, // eigenvalue
+                    T* vcc)           // vcc
     {
         T* aux = new T[nstart * ldh];
-        for (int ii = 0; ii < nstart * ldh; ii++) {
+        for (int ii = 0; ii < nstart * ldh; ii++)
+        {
             aux[ii] = hcc[ii];
         }
 
@@ -95,38 +159,38 @@ namespace hsolver {
         int lwork = -1;
         T* work = new T[1];
         Real* rwork = new Real[7 * nstart];
-        int *iwork = new int[5 * nstart];
-        int *ifail = new int[nstart];
-        
+        int* iwork = new int[5 * nstart];
+        int* ifail = new int[nstart];
+
         // When lwork = -1, the demension of work will be assumed
         // Assume the denmension of work by output work[0]
         LapackConnector::xheevx(
-            1, // ITYPE = 1:  A*x = (lambda)*B*x
-            'V', // JOBZ = 'V':  Compute eigenvalues and eigenvectors.
-            'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
-            'L', // UPLO = 'L':  Lower triangles of A and B are stored.
-            nstart, // N = base
-            aux, // A is COMPLEX*16 array  dimension (LDA, N)
-            ldh, // LDA = base
-            0.0, // Not referenced if RANGE = 'A' or 'I'.
-            0.0, // Not referenced if RANGE = 'A' or 'I'.
-            1, // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N,
-            nbands, // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N,
-            0.0, // ABSTOL
-            nbands, // M: The total number of eigenvalues found.  0 <= M <= N. if RANGE = 'I', M = IU-IL+1.
+            1,          // ITYPE = 1:  A*x = (lambda)*B*x
+            'V',        // JOBZ = 'V':  Compute eigenvalues and eigenvectors.
+            'I',        // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
+            'L',        // UPLO = 'L':  Lower triangles of A and B are stored.
+            nstart,     // N = base
+            aux,        // A is COMPLEX*16 array  dimension (LDA, N)
+            ldh,        // LDA = base
+            0.0,        // Not referenced if RANGE = 'A' or 'I'.
+            0.0,        // Not referenced if RANGE = 'A' or 'I'.
+            1,          // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N,
+            nbands,     // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N,
+            0.0,        // ABSTOL
+            nbands,     // M: The total number of eigenvalues found.  0 <= M <= N. if RANGE = 'I', M = IU-IL+1.
             eigenvalue, // W store eigenvalues
-            vcc, // store eigenvector
-            ldh, // LDZ: The leading dimension of the array Z.
+            vcc,        // store eigenvector
+            ldh,        // LDZ: The leading dimension of the array Z.
             work,
             lwork,
             rwork,
             iwork,
             ifail,
             info);
-       
-    
+
         lwork = int(get_real(work[0]));
-        delete[] work; work = new T[lwork];
+        delete[] work;
+        work = new T[lwork];
 
         // The A and B storage space is (nstart * ldh), and the data that really participates in the zhegvx
         // operation is (nstart * nstart). In this function, the data that A and B participate in the operation will
@@ -135,22 +199,22 @@ namespace hsolver {
         // obtained by the zhegvx operation is (nstart * nstart) and stored in zux (internal to the function). When
         // the function is output, the data of zux will be mapped to the corresponding position of V.
         LapackConnector::xheevx(
-            1, // ITYPE = 1:  A*x = (lambda)*B*x
-            'V', // JOBZ = 'V':  Compute eigenvalues and eigenvectors.
-            'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
-            'L', // UPLO = 'L':  Lower triangles of A and B are stored.
-            nstart, // N = base
-            aux, // A is COMPLEX*16 array  dimension (LDA, N)
-            ldh, // LDA = base
-            0.0, // Not referenced if RANGE = 'A' or 'I'.
-            0.0, // Not referenced if RANGE = 'A' or 'I'.
-            1, // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N,
-            nbands, // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N,
-            0.0, // ABSTOL
-            nbands, // M: The total number of eigenvalues found.  0 <= M <= N. if RANGE = 'I', M = IU-IL+1.
+            1,          // ITYPE = 1:  A*x = (lambda)*B*x
+            'V',        // JOBZ = 'V':  Compute eigenvalues and eigenvectors.
+            'I',        // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
+            'L',        // UPLO = 'L':  Lower triangles of A and B are stored.
+            nstart,     // N = base
+            aux,        // A is COMPLEX*16 array  dimension (LDA, N)
+            ldh,        // LDA = base
+            0.0,        // Not referenced if RANGE = 'A' or 'I'.
+            0.0,        // Not referenced if RANGE = 'A' or 'I'.
+            1,          // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N,
+            nbands,     // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N,
+            0.0,        // ABSTOL
+            nbands,     // M: The total number of eigenvalues found.  0 <= M <= N. if RANGE = 'I', M = IU-IL+1.
             eigenvalue, // W store eigenvalues
-            vcc, // store eigenvector
-            ldh, // LDZ: The leading dimension of the array Z.
+            vcc,        // store eigenvector
+            ldh,        // LDZ: The leading dimension of the array Z.
             work,
             lwork,
             rwork,
@@ -168,14 +232,108 @@ namespace hsolver {
     }
 };
 
+template <typename T>
+struct dngvx_op<T, psi::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const psi::DEVICE_CPU* d,
+                    const int nbase,
+                    const int ldh,
+                    T* hcc,
+                    T* scc,
+                    const int m,
+                    Real* eigenvalue,
+                    T* vcc)
+    {
+
+        int info = 0;
+
+        int mm = m;
+
+        int lwork = -1;
+
+        T* work = new T[1];
+        Real* rwork = new Real[7 * nbase];
+        int* iwork = new int[5 * nbase];
+        int* ifail = new int[nbase];
+
+        LapackConnector::xhegvx(
+            1,     // ITYPE = 1:  A*x = (lambda)*B*x
+            'V',   // JOBZ = 'V':  Compute eigenvalues and eigenvectors.
+            'I',   // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
+            'U',   // UPLO = 'L':  Lower triangles of A and B are stored.
+            nbase, // N = base
+            hcc,   // A is COMPLEX*16 array  dimension (LDA, N)
+            ldh,   // LDA = base
+            scc,
+            ldh,
+            0.0,        // Not referenced if RANGE = 'A' or 'I'.
+            0.0,        // Not referenced if RANGE = 'A' or 'I'.
+            1,          // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N,
+            m,          // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N,
+            0.0,        // ABSTOL
+            mm,         // M: The total number of eigenvalues found.  0 <= M <= N. if RANGE = 'I', M = IU-IL+1.
+            eigenvalue, // W store eigenvalues
+            vcc,        // store eigenvector
+            ldh,        // LDZ: The leading dimension of the array Z.
+            work,
+            lwork,
+            rwork,
+            iwork,
+            ifail,
+            info);
+
+        lwork = int(get_real(work[0]));
+        delete[] work;
+        work = new T[lwork];
+
+        LapackConnector::xhegvx(1,
+                                'V',
+                                'I',
+                                'U',
+                                nbase,
+                                hcc,
+                                ldh,
+                                scc,
+                                ldh,
+                                0.0,
+                                0.0,
+                                1,
+                                m,
+                                0.0,
+                                mm,
+                                eigenvalue,
+                                vcc,
+                                ldh,
+                                work,
+                                lwork,
+                                rwork,
+                                iwork,
+                                ifail,
+                                info);
+
+        delete[] work;
+        delete[] rwork;
+        delete[] iwork;
+        delete[] ifail;
+    }
+};
+
+template struct dngvd_op<std::complex<float>, psi::DEVICE_CPU>;
+template struct dngvd_op<std::complex<double>, psi::DEVICE_CPU>;
+
+template struct dnevx_op<std::complex<float>, psi::DEVICE_CPU>;
+template struct dnevx_op<std::complex<double>, psi::DEVICE_CPU>;
 
-    template struct dngvd_op<std::complex<float>, psi::DEVICE_CPU>;
-    template struct dngvd_op<std::complex<double>, psi::DEVICE_CPU>;
+template struct dngvx_op<std::complex<float>, psi::DEVICE_CPU>;
+template struct dngvx_op<std::complex<double>, psi::DEVICE_CPU>;
 
-    template struct dnevx_op <std::complex<float>, psi::DEVICE_CPU >;
-    template struct dnevx_op<std::complex<double>, psi::DEVICE_CPU>;
+template struct dngv_op<std::complex<float>, psi::DEVICE_CPU>;
+template struct dngv_op<std::complex<double>, psi::DEVICE_CPU>;
 #ifdef __LCAO
-    template struct dngvd_op<double, psi::DEVICE_CPU>;
-    template struct dnevx_op <double, psi::DEVICE_CPU >;
+template struct dngvd_op<double, psi::DEVICE_CPU>;
+template struct dnevx_op<double, psi::DEVICE_CPU>;
+template struct dngvx_op<double, psi::DEVICE_CPU>;
+template struct dngv_op<double, psi::DEVICE_CPU>;
 #endif
 } // namespace hsolver
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/dngvd_op.h b/source/module_hsolver/kernels/dngvd_op.h
index 23d5855087..72e0ae1521 100644
--- a/source/module_hsolver/kernels/dngvd_op.h
+++ b/source/module_hsolver/kernels/dngvd_op.h
@@ -3,24 +3,24 @@
 #ifndef MODULE_HSOLVER_DNGVD_H
 #define MODULE_HSOLVER_DNGVD_H
 
-#include "module_base/complexmatrix.h"
-#include "module_base/lapack_connector.h"
 #include "math_kernel_op.h"
-#include "module_psi/kernels/memory_op.h"
+#include "module_base/lapack_connector.h"
 
 namespace hsolver
 {
 
-    template <typename T, typename Device> struct dngvd_op
-    {
-        using Real = typename GetTypeReal<T>::type;
-        /// @brief DNGVD computes all the eigenvalues and eigenvectors of a complex generalized
+template <typename T, typename Device>
+struct dngvd_op
+{
+    using Real = typename GetTypeReal<T>::type;
+    /// @brief DNGVD computes all the eigenvalues and eigenvectors of a complex generalized
     /// Hermitian-definite eigenproblem. If eigenvectors are desired, it uses a divide and conquer algorithm.
     ///
     /// In this op, the CPU version is implemented through the `gvd` interface, and the CUDA version
     /// is implemented through the `gvd` interface.
     /// API doc:
-    /// 1. zhegvd: https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_ga74fdf9b5a16c90d8b7a589dec5ca058a.html
+    /// 1. zhegvd:
+    /// https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_ga74fdf9b5a16c90d8b7a589dec5ca058a.html
     /// 2. cusolverDnZhegvd: https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-sygvd
     ///
     /// Input Parameters
@@ -32,26 +32,56 @@ namespace hsolver
     /// Output Parameter
     ///     @param W : calculated eigenvalues
     ///     @param V : calculated eigenvectors (col major)
-        void operator()(const Device* d,
-            const int nstart,
-            const int ldh,
-            const T* A,
-            const T* B,
-            Real* W,
-            T* V);
+    void operator()(const Device* d, const int nstart, const int ldh, const T* A, const T* B, Real* W, T* V);
 };
 
+template <typename T, typename Device>
+struct dngv_op
+{
+    using Real = typename GetTypeReal<T>::type;
+    /// @brief DNGVX computes first m eigenvalues and eigenvectors of a complex generalized
+    /// Input Parameters
+    ///     @param d : the type of device
+    ///     @param nbase : the number of dim of the matrix
+    ///     @param ldh : the number of dmx of the matrix
+    ///     @param A : the hermitian matrix A in A x=lambda B x (col major)
+    ///     @param B : the overlap matrix B in A x=lambda B x (col major)
+    /// Output Parameter
+    ///     @param W : calculated eigenvalues
+    ///     @param V : calculated eigenvectors (col major)
+    void operator()(const Device* d, const int nstart, const int ldh, const T* A, T* B, Real* W, T* V);
+};
 
-    template <typename T, typename Device> struct dnevx_op
+template <typename T, typename Device>
+struct dngvx_op
 {
-        using Real = typename GetTypeReal<T>::type;
+    using Real = typename GetTypeReal<T>::type;
+    /// @brief DNGVX computes first m eigenvalues and eigenvectors of a complex generalized
+    /// Input Parameters
+    ///     @param d : the type of device
+    ///     @param nbase : the number of dim of the matrix
+    ///     @param ldh : the number of dmx of the matrix
+    ///     @param A : the hermitian matrix A in A x=lambda B x (col major)
+    ///     @param B : the overlap matrix B in A x=lambda B x (col major)
+    ///     @param m : the number of eigenpair
+    /// Output Parameter
+    ///     @param W : calculated eigenvalues
+    ///     @param V : calculated eigenvectors (col major)
+    void operator()(const Device* d, const int nstart, const int ldh, T* A, T* B, const int m, Real* W, T* V);
+};
+
+template <typename T, typename Device>
+struct dnevx_op
+{
+    using Real = typename GetTypeReal<T>::type;
     /// @brief DNEVX computes the first m eigenvalues and their corresponding eigenvectors of
     /// a complex generalized Hermitian-definite eigenproblem
     ///
     /// In this op, the CPU version is implemented through the `evx` interface, and the CUDA version
     /// is implemented through the `evd` interface and acquires the first m eigenpairs.
     /// API doc:
-    /// 1. zheevx: https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_gaabef68a9c7b10df7aef8f4fec89fddbe.html
+    /// 1. zheevx:
+    /// https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_gaabef68a9c7b10df7aef8f4fec89fddbe.html
     /// 2. cusolverDnZheevd: https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-syevd
     ///
     /// Input Parameters
@@ -62,16 +92,9 @@ namespace hsolver
     /// Output Parameter
     ///     @param W : calculated eigenvalues
     ///     @param V : calculated eigenvectors (row major)
-    void operator()(const Device* d,
-        const int nstart,
-        const int ldh,
-        const T* A,
-        const int m,
-        Real* W,
-        T* V);
+    void operator()(const Device* d, const int nstart, const int ldh, const T* A, const int m, Real* W, T* V);
 };
 
-
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 void createGpuSolverHandle();
diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp
index 597d9cd5ea..b494f57890 100644
--- a/source/module_hsolver/kernels/math_kernel_op.cpp
+++ b/source/module_hsolver/kernels/math_kernel_op.cpp
@@ -7,24 +7,26 @@ namespace hsolver
 {
 
 template <typename T>
-struct line_minimize_with_block_op<T, psi::DEVICE_CPU> {
+struct line_minimize_with_block_op<T, psi::DEVICE_CPU>
+{
     using Real = typename GetTypeReal<T>::type;
-    void operator() (
-        T* grad_out,
-        T* hgrad_out,
-        T* psi_out,
-        T* hpsi_out,
-        const int &n_basis,
-        const int &n_basis_max,
-        const int &n_band)
+    void operator()(T* grad_out,
+                    T* hgrad_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band)
     {
-        for (int band_idx = 0; band_idx < n_band; band_idx++) {
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
             Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0;
             Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
-            auto A = reinterpret_cast<const Real *>(grad_out + band_idx * n_basis_max);
+            auto A = reinterpret_cast<const Real*>(grad_out + band_idx * n_basis_max);
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
             norm = 1.0 / sqrt(norm);
-            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) {
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
                 auto item = band_idx * n_basis_max + basis_idx;
                 grad_out[item] *= norm;
                 hgrad_out[item] *= norm;
@@ -32,12 +34,13 @@ struct line_minimize_with_block_op<T, psi::DEVICE_CPU> {
                 epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
                 epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
             }
-            theta = 0.5 * std::abs(std::atan(2 * epsilo_1/(epsilo_0 - epsilo_2)));
+            theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
             cos_theta = std::cos(theta);
             sin_theta = std::sin(theta);
-            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) {
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
                 auto item = band_idx * n_basis_max + basis_idx;
-                psi_out [item] = psi_out [item] * cos_theta + grad_out [item] * sin_theta;
+                psi_out[item] = psi_out[item] * cos_theta + grad_out[item] * sin_theta;
                 hpsi_out[item] = hpsi_out[item] * cos_theta + hgrad_out[item] * sin_theta;
             }
         }
@@ -45,43 +48,47 @@ struct line_minimize_with_block_op<T, psi::DEVICE_CPU> {
 };
 
 template <typename T>
-struct calc_grad_with_block_op<T, psi::DEVICE_CPU> {
+struct calc_grad_with_block_op<T, psi::DEVICE_CPU>
+{
     using Real = typename GetTypeReal<T>::type;
-    void operator() (
-        const Real* prec_in,
-        Real* err_out,
-        Real* beta_out,
-        T* psi_out,
-        T* hpsi_out,
-        T* grad_out,
-        T* grad_old_out,
-        const int &n_basis,
-        const int &n_basis_max,
-        const int &n_band)
+    void operator()(const Real* prec_in,
+                    Real* err_out,
+                    Real* beta_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    T* grad_out,
+                    T* grad_old_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band)
     {
-        for (int band_idx = 0; band_idx < n_band; band_idx++) {
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
             Real err = 0.0;
             Real beta = 0.0;
             Real epsilo = 0.0;
             Real grad_2 = {0.0};
             T grad_1 = {0.0, 0.0};
-            auto A = reinterpret_cast<const Real *>(psi_out + band_idx * n_basis_max);
+            auto A = reinterpret_cast<const Real*>(psi_out + band_idx * n_basis_max);
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
             norm = 1.0 / sqrt(norm);
-            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) {
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
                 auto item = band_idx * n_basis_max + basis_idx;
                 psi_out[item] *= norm;
                 hpsi_out[item] *= norm;
                 epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
             }
-            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) {
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
                 auto item = band_idx * n_basis_max + basis_idx;
                 grad_1 = hpsi_out[item] - epsilo * psi_out[item];
                 grad_2 = std::norm(grad_1);
                 err += grad_2;
                 beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec?
             }
-            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) {
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
                 auto item = band_idx * n_basis_max + basis_idx;
                 grad_1 = hpsi_out[item] - epsilo * psi_out[item];
                 grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item];
@@ -93,16 +100,17 @@ struct calc_grad_with_block_op<T, psi::DEVICE_CPU> {
 };
 
 template <typename FPTYPE>
-struct dot_real_op<FPTYPE, psi::DEVICE_CPU> {
-    FPTYPE operator() (
-        const psi::DEVICE_CPU* d,
-        const int& dim,
-        const FPTYPE* psi_L,
-        const FPTYPE* psi_R,
-        const bool reduce)
+struct dot_real_op<FPTYPE, psi::DEVICE_CPU>
+{
+    FPTYPE operator()(const psi::DEVICE_CPU* d,
+                      const int& dim,
+                      const FPTYPE* psi_L,
+                      const FPTYPE* psi_R,
+                      const bool reduce)
     {
         FPTYPE result = BlasConnector::dot(dim, psi_L, 1, psi_R, 1);
-        if (reduce) {
+        if (reduce)
+        {
             Parallel_Reduce::reduce_pool(result);
         }
         return result;
@@ -110,39 +118,37 @@ struct dot_real_op<FPTYPE, psi::DEVICE_CPU> {
 };
 
 // CPU specialization of actual computation.
-template <typename FPTYPE> 
-struct dot_real_op<std::complex<FPTYPE>, psi::DEVICE_CPU> {
-  FPTYPE operator() (
-      const psi::DEVICE_CPU* d,
-      const int& dim,
-      const std::complex<FPTYPE>* psi_L,
-      const std::complex<FPTYPE>* psi_R,
-      const bool reduce) 
-  {
-    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-    // qianrui modify 2021-3-14
-    // Note that  ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) )
-    const FPTYPE* pL = reinterpret_cast<const FPTYPE*>(psi_L);
-    const FPTYPE* pR = reinterpret_cast<const FPTYPE*>(psi_R);
-    FPTYPE result = BlasConnector::dot(2 * dim, pL, 1, pR, 1);
-    if (reduce) {
-        Parallel_Reduce::reduce_pool(result);
+template <typename FPTYPE>
+struct dot_real_op<std::complex<FPTYPE>, psi::DEVICE_CPU>
+{
+    FPTYPE operator()(const psi::DEVICE_CPU* d,
+                      const int& dim,
+                      const std::complex<FPTYPE>* psi_L,
+                      const std::complex<FPTYPE>* psi_R,
+                      const bool reduce)
+    {
+        //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+        // qianrui modify 2021-3-14
+        // Note that  ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) )
+        const FPTYPE* pL = reinterpret_cast<const FPTYPE*>(psi_L);
+        const FPTYPE* pR = reinterpret_cast<const FPTYPE*>(psi_R);
+        FPTYPE result = BlasConnector::dot(2 * dim, pL, 1, pR, 1);
+        if (reduce)
+        {
+            Parallel_Reduce::reduce_pool(result);
+        }
+        return result;
     }
-    return result;
-  }
 };
 
-template <typename T> struct vector_div_constant_op<T, psi::DEVICE_CPU>
+template <typename T>
+struct vector_div_constant_op<T, psi::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const psi::DEVICE_CPU* d,
-        const int dim,
-        T* result,
-        const T* vector,
-        const Real constant)
+    void operator()(const psi::DEVICE_CPU* d, const int dim, T* result, const T* vector, const Real constant)
     {
 #ifdef _OPENMP
-#pragma omp parallel for schedule(static, 4096/sizeof(Real))
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
 #endif
         for (int i = 0; i < dim; i++)
         {
@@ -151,17 +157,14 @@ template <typename T> struct vector_div_constant_op<T, psi::DEVICE_CPU>
     }
 };
 
-template <typename T> struct vector_mul_vector_op<T, psi::DEVICE_CPU>
+template <typename T>
+struct vector_mul_vector_op<T, psi::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const psi::DEVICE_CPU* d,
-        const int& dim,
-        T* result,
-        const T* vector1,
-        const Real* vector2)
+    void operator()(const psi::DEVICE_CPU* d, const int& dim, T* result, const T* vector1, const Real* vector2)
     {
 #ifdef _OPENMP
-#pragma omp parallel for schedule(static, 4096/sizeof(Real))
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
 #endif
         for (int i = 0; i < dim; i++)
         {
@@ -170,17 +173,14 @@ template <typename T> struct vector_mul_vector_op<T, psi::DEVICE_CPU>
     }
 };
 
-template <typename T> struct vector_div_vector_op<T, psi::DEVICE_CPU>
+template <typename T>
+struct vector_div_vector_op<T, psi::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const psi::DEVICE_CPU* d,
-                    const int& dim,
-        T* result,
-        const T* vector1,
-        const Real* vector2)
+    void operator()(const psi::DEVICE_CPU* d, const int& dim, T* result, const T* vector1, const Real* vector2)
     {
 #ifdef _OPENMP
-#pragma omp parallel for schedule(static, 4096/sizeof(Real))
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
 #endif
         for (int i = 0; i < dim; i++)
         {
@@ -189,19 +189,20 @@ template <typename T> struct vector_div_vector_op<T, psi::DEVICE_CPU>
     }
 };
 
-template <typename T> struct constantvector_addORsub_constantVector_op<T, psi::DEVICE_CPU>
+template <typename T>
+struct constantvector_addORsub_constantVector_op<T, psi::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
     void operator()(const psi::DEVICE_CPU* d,
-        const int& dim,
-        T* result,
-        const T* vector1,
-        const Real constant1,
-        const T* vector2,
-        const Real constant2)
+                    const int& dim,
+                    T* result,
+                    const T* vector1,
+                    const Real constant1,
+                    const T* vector2,
+                    const Real constant2)
     {
 #ifdef _OPENMP
-#pragma omp parallel for schedule(static, 8192/sizeof(T))
+#pragma omp parallel for schedule(static, 8192 / sizeof(T))
 #endif
         for (int i = 0; i < dim; i++)
         {
@@ -211,87 +212,84 @@ template <typename T> struct constantvector_addORsub_constantVector_op<T, psi::D
 };
 
 template <typename FPTYPE>
-struct scal_op<FPTYPE, psi::DEVICE_CPU> {
-    void operator()(
-            const psi::DEVICE_CPU * /*ctx*/,
-            const int &N,
-            const std::complex<FPTYPE> *alpha,
-            std::complex<FPTYPE> *X,
-            const int &incx)
+struct scal_op<FPTYPE, psi::DEVICE_CPU>
+{
+    void operator()(const psi::DEVICE_CPU* /*ctx*/,
+                    const int& N,
+                    const std::complex<FPTYPE>* alpha,
+                    std::complex<FPTYPE>* X,
+                    const int& incx)
     {
         BlasConnector::scal(N, *alpha, X, incx);
     }
 };
 
 template <typename T>
-struct gemv_op<T, psi::DEVICE_CPU> {
-    void operator()(
-        const psi::DEVICE_CPU* d,
-        const char& trans,
-        const int& m,
-        const int& n,
-        const T* alpha,
-        const T* A,
-        const int& lda,
-        const T* X,
-        const int& incx,
-        const T* beta,
-        T* Y,
-        const int& incy)
+struct gemv_op<T, psi::DEVICE_CPU>
+{
+    void operator()(const psi::DEVICE_CPU* d,
+                    const char& trans,
+                    const int& m,
+                    const int& n,
+                    const T* alpha,
+                    const T* A,
+                    const int& lda,
+                    const T* X,
+                    const int& incx,
+                    const T* beta,
+                    T* Y,
+                    const int& incy)
     {
         BlasConnector::gemv(trans, m, n, *alpha, A, lda, X, incx, *beta, Y, incy);
     }
 };
 
 template <typename T>
-struct axpy_op<T, psi::DEVICE_CPU> {
-    void operator()(
-        const psi::DEVICE_CPU* /*ctx*/,
-        const int& dim,
-        const T* alpha,
-        const T* X,
-        const int& incX,
-        T* Y,
-        const int& incY)
+struct axpy_op<T, psi::DEVICE_CPU>
+{
+    void operator()(const psi::DEVICE_CPU* /*ctx*/,
+                    const int& dim,
+                    const T* alpha,
+                    const T* X,
+                    const int& incX,
+                    T* Y,
+                    const int& incY)
     {
         BlasConnector::axpy(dim, *alpha, X, incX, Y, incY);
     }
 };
 
 template <typename T>
-struct gemm_op<T, psi::DEVICE_CPU> {
-    void operator()(
-        const psi::DEVICE_CPU* /*ctx*/,
-        const char& transa,
-        const char& transb,
-        const int& m,
-        const int& n,
-        const int& k,
-        const T* alpha,
-        const T* a,
-        const int& lda,
-        const T* b,
-        const int& ldb,
-        const T* beta,
-        T* c,
-        const int& ldc)
+struct gemm_op<T, psi::DEVICE_CPU>
+{
+    void operator()(const psi::DEVICE_CPU* /*ctx*/,
+                    const char& transa,
+                    const char& transb,
+                    const int& m,
+                    const int& n,
+                    const int& k,
+                    const T* alpha,
+                    const T* a,
+                    const int& lda,
+                    const T* b,
+                    const int& ldb,
+                    const T* beta,
+                    T* c,
+                    const int& ldc)
     {
         BlasConnector::gemm(transb, transa, n, m, k, *alpha, b, ldb, a, lda, *beta, c, ldc);
     }
 };
 
-template <typename T> struct matrixTranspose_op<T, psi::DEVICE_CPU>
+template <typename T>
+struct matrixTranspose_op<T, psi::DEVICE_CPU>
 {
-    void operator()(const psi::DEVICE_CPU* d,
-        const int& row,
-        const int& col,
-        const T* input_matrix,
-        T* output_matrix)
+    void operator()(const psi::DEVICE_CPU* d, const int& row, const int& col, const T* input_matrix, T* output_matrix)
     {
         T* temp = nullptr;
         psi::memory::resize_memory_op<T, psi::DEVICE_CPU>()(d, temp, row * col, "MTransOp");
 #ifdef _OPENMP
-#pragma omp parallel for collapse(2) schedule(static, 8192/sizeof(T))
+#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
 #endif
         for (int j = 0; j < col; j++)
         {
@@ -301,7 +299,7 @@ template <typename T> struct matrixTranspose_op<T, psi::DEVICE_CPU>
             }
         }
 #ifdef _OPENMP
-#pragma omp parallel for schedule(static, 8192/sizeof(T))
+#pragma omp parallel for schedule(static, 8192 / sizeof(T))
 #endif
         for (int i = 0; i < row * col; i++)
         {
@@ -311,17 +309,13 @@ template <typename T> struct matrixTranspose_op<T, psi::DEVICE_CPU>
     }
 };
 
-template <typename T> struct matrixSetToAnother<T, psi::DEVICE_CPU>
+template <typename T>
+struct matrixSetToAnother<T, psi::DEVICE_CPU>
 {
-    void operator()(const psi::DEVICE_CPU* d,
-        const int& n,
-        const T* A,
-        const int& LDA,
-        T* B,
-        const int& LDB)
+    void operator()(const psi::DEVICE_CPU* d, const int& n, const T* A, const int& LDA, T* B, const int& LDB)
     {
 #ifdef _OPENMP
-#pragma omp parallel for collapse(2) schedule(static, 8192/sizeof(T))
+#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
 #endif
         for (int i = 0; i < n; i++)
         {
@@ -333,8 +327,6 @@ template <typename T> struct matrixSetToAnother<T, psi::DEVICE_CPU>
     }
 };
 
-
-
 // Explicitly instantiate functors for the types of functor registered.
 template struct scal_op<float, psi::DEVICE_CPU>;
 template struct axpy_op<std::complex<float>, psi::DEVICE_CPU>;
diff --git a/source/module_hsolver/kernels/math_kernel_op.h b/source/module_hsolver/kernels/math_kernel_op.h
index 1f062431e9..774d16cb04 100644
--- a/source/module_hsolver/kernels/math_kernel_op.h
+++ b/source/module_hsolver/kernels/math_kernel_op.h
@@ -3,23 +3,84 @@
 #ifndef MODULE_HSOLVER_MATH_KERNEL_H
 #define MODULE_HSOLVER_MATH_KERNEL_H
 
-#include "module_base/blas_connector.h"
-#include "module_psi/psi.h"
-#include "module_base/parallel_reduce.h"
-#include "module_psi/kernels/memory_op.h"
 #include <module_base/macros.h>
 
+#include "module_base/blas_connector.h"
+#include "module_base/parallel_reduce.h"
+#include "module_psi/psi.h"
 
 #if defined(__CUDA) || defined(__UT_USE_CUDA)
 #include <cuda_runtime.h>
+
 #include "cublas_v2.h"
 #endif //__CUDA || __UT_USE_CUDA
 
 namespace hsolver
 {
 
+inline std::complex<double> set_real_tocomplex(const std::complex<double>& x)
+{
+    return {x.real(), 0.0};
+}
+
+inline std::complex<float> set_real_tocomplex(const std::complex<float>& x)
+{
+    return {x.real(), 0.0};
+}
+
+inline double set_real_tocomplex(const double& x)
+{
+    return x;
+}
+
+inline float set_real_tocomplex(const float& x)
+{
+    return x;
+}
+
+inline std::complex<double> get_conj(const std::complex<double>& x)
+{
+    return {x.real(), -x.imag()};
+}
+
+inline std::complex<float> get_conj(const std::complex<float>& x)
+{
+    return {x.real(), -x.imag()};
+}
+
+inline double get_conj(const double& x)
+{
+    return x;
+}
+
+inline float get_conj(const float& x)
+{
+    return x;
+}
+
+inline double get_real(const std::complex<double>& x)
+{
+    return x.real();
+}
+
+inline float get_real(const std::complex<float>& x)
+{
+    return x.real();
+}
+
+inline double get_real(const double& x)
+{
+    return x;
+}
+
+inline float get_real(const float& x)
+{
+    return x;
+}
+
 template <typename T, typename Device>
-struct line_minimize_with_block_op {
+struct line_minimize_with_block_op
+{
     /// @brief dot_real_op computes the dot product of the given complex arrays(treated as float arrays).
     /// And there's may have MPI communications while enabling planewave parallization strategy.
     ///
@@ -32,19 +93,18 @@ struct line_minimize_with_block_op {
     ///
     /// \return res : the result vector
     /// T : dot product result
-    void operator() (
-        T* grad_out,
-        T* hgrad_out,
-        T* psi_out,
-        T* hpsi_out,
-        const int &n_basis,
-        const int &n_basis_max,
-        const int &n_band);
+    void operator()(T* grad_out,
+                    T* hgrad_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band);
 };
 
-
 template <typename T, typename Device>
-struct calc_grad_with_block_op {
+struct calc_grad_with_block_op
+{
     /// @brief dot_real_op computes the dot product of the given complex arrays(treated as float arrays).
     /// And there's may have MPI communications while enabling planewave parallization strategy.
     ///
@@ -58,44 +118,40 @@ struct calc_grad_with_block_op {
     /// \return res : the result vector
     /// T : dot product result
     using Real = typename GetTypeReal<T>::type;
-    void operator() (
-        const Real* prec_in,
-        Real* err_out,
-        Real* beta_out,
-        T* psi_out,
-        T* hpsi_out,
-        T* grad_out,
-        T* grad_old_out,
-        const int &n_basis,
-        const int &n_basis_max,
-        const int &n_band);
+    void operator()(const Real* prec_in,
+                    Real* err_out,
+                    Real* beta_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    T* grad_out,
+                    T* grad_old_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band);
 };
 
 template <typename T, typename Device>
-struct dot_real_op {
+struct dot_real_op
+{
     using Real = typename GetTypeReal<T>::type;
     /// @brief dot_real_op computes the dot product of the given complex arrays(treated as float arrays).
-  /// And there's may have MPI communications while enabling planewave parallization strategy.
-  ///
-  /// Input Parameters
-  /// \param d : the type of computing device
-  /// \param dim : array size
-  /// \param psi_L : input array A
-  /// \param psi_R : input array B
-  /// \param reduce : flag to control whether to perform the MPI communications
-  ///
-  /// \return
-  /// FPTYPE : dot product result
-    Real operator() (
-      const Device* d,
-      const int& dim,
-        const T* psi_L,
-        const T* psi_R,
-      const bool reduce = true);
+    /// And there's may have MPI communications while enabling planewave parallization strategy.
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param dim : array size
+    /// \param psi_L : input array A
+    /// \param psi_R : input array B
+    /// \param reduce : flag to control whether to perform the MPI communications
+    ///
+    /// \return
+    /// FPTYPE : dot product result
+    Real operator()(const Device* d, const int& dim, const T* psi_L, const T* psi_R, const bool reduce = true);
 };
 
 // vector operator: result[i] = vector[i] / constant
-template <typename T, typename Device> struct vector_div_constant_op
+template <typename T, typename Device>
+struct vector_div_constant_op
 {
     using Real = typename GetTypeReal<T>::type;
     /// @brief result[i] = vector[i] / constant
@@ -108,15 +164,12 @@ template <typename T, typename Device> struct vector_div_constant_op
     ///
     /// Output Parameters
     /// \param result : output array
-    void operator()(const Device* d,
-                    const int dim,
-        T* result,
-        const T* vector,
-        const Real constant);
+    void operator()(const Device* d, const int dim, T* result, const T* vector, const Real constant);
 };
 
 // replace vector_div_constant_op : x = alpha * x
-template <typename FPTYPE, typename Device> struct scal_op
+template <typename FPTYPE, typename Device>
+struct scal_op
 {
     /// @brief x = alpha * x
     ///
@@ -137,7 +190,8 @@ template <typename FPTYPE, typename Device> struct scal_op
 };
 
 // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
-template <typename T, typename Device> struct vector_mul_vector_op
+template <typename T, typename Device>
+struct vector_mul_vector_op
 {
     using Real = typename GetTypeReal<T>::type;
     /// @brief result[i] = vector1[i](complex) * vector2[i](not complex)
@@ -150,15 +204,12 @@ template <typename T, typename Device> struct vector_mul_vector_op
     ///
     /// Output Parameters
     /// \param result : output array
-    void operator()(const Device* d,
-        const int& dim,
-        T* result,
-        const T* vector1,
-        const Real* vector2);
+    void operator()(const Device* d, const int& dim, T* result, const T* vector1, const Real* vector2);
 };
 
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
-template <typename T, typename Device> struct vector_div_vector_op
+template <typename T, typename Device>
+struct vector_div_vector_op
 {
     using Real = typename GetTypeReal<T>::type;
     /// @brief result[i] = vector1[i](complex) / vector2[i](not complex)
@@ -171,15 +222,12 @@ template <typename T, typename Device> struct vector_div_vector_op
     ///
     /// Output Parameters
     /// \param result : output array
-    void operator()(const Device* d,
-        const int& dim,
-        T* result,
-        const T* vector1,
-        const Real* vector2);
+    void operator()(const Device* d, const int& dim, T* result, const T* vector1, const Real* vector2);
 };
 
 // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
-template <typename T, typename Device> struct constantvector_addORsub_constantVector_op
+template <typename T, typename Device>
+struct constantvector_addORsub_constantVector_op
 {
     using Real = typename GetTypeReal<T>::type;
     /// @brief result[i] = vector1[i] * constant1 + vector2[i] * constant2
@@ -195,16 +243,17 @@ template <typename T, typename Device> struct constantvector_addORsub_constantVe
     /// Output Parameters
     /// \param result : output array
     void operator()(const Device* d,
-        const int& dim,
-        T* result,
-        const T* vector1,
-        const Real constant1,
-        const T* vector2,
-        const Real constant2);
+                    const int& dim,
+                    T* result,
+                    const T* vector1,
+                    const Real constant1,
+                    const T* vector2,
+                    const Real constant2);
 };
 
 //  compute Y = alpha * X + Y
-template <typename T, typename Device> struct axpy_op
+template <typename T, typename Device>
+struct axpy_op
 {
     /// @brief Y = alpha * X + Y
     ///
@@ -219,17 +268,12 @@ template <typename T, typename Device> struct axpy_op
     ///
     /// Output Parameters
     /// \param Y : output array Y
-    void operator()(const Device* d,
-        const int& N,
-        const T* alpha,
-        const T* X,
-        const int& incX,
-        T* Y,
-        const int& incY);
+    void operator()(const Device* d, const int& N, const T* alpha, const T* X, const int& incX, T* Y, const int& incY);
 };
 
 // compute y = alpha * op(A) * x + beta * y
-template <typename T, typename Device> struct gemv_op
+template <typename T, typename Device>
+struct gemv_op
 {
     /// @brief y = alpha * op(A) * x + beta * y
     ///
@@ -253,19 +297,19 @@ template <typename T, typename Device> struct gemv_op
                     const char& trans,
                     const int& m,
                     const int& n,
-        const T* alpha,
-        const T* A,
+                    const T* alpha,
+                    const T* A,
                     const int& lda,
-        const T* X,
+                    const T* X,
                     const int& incx,
-        const T* beta,
-        T* Y,
+                    const T* beta,
+                    T* Y,
                     const int& incy);
 };
 
-
 // compute C = alpha * op(A) * op(B) + beta * C
-template <typename T, typename Device> struct gemm_op
+template <typename T, typename Device>
+struct gemm_op
 {
     /// @brief C = alpha * op(A) * op(B) + beta * C
     ///
@@ -288,22 +332,23 @@ template <typename T, typename Device> struct gemm_op
     /// Output Parameters
     /// \param c : output matrix C
     void operator()(const Device* d,
-        const char& transa,
-        const char& transb,
-        const int& m,
-        const int& n,
-        const int& k,
-        const T* alpha,
-        const T* a,
-        const int& lda,
-        const T* b,
-        const int& ldb,
-        const T* beta,
-        T* c,
-        const int& ldc);
+                    const char& transa,
+                    const char& transb,
+                    const int& m,
+                    const int& n,
+                    const int& k,
+                    const T* alpha,
+                    const T* a,
+                    const int& lda,
+                    const T* b,
+                    const int& ldb,
+                    const T* beta,
+                    T* c,
+                    const int& ldc);
 };
 
-template <typename T, typename Device> struct matrixTranspose_op
+template <typename T, typename Device>
+struct matrixTranspose_op
 {
     /// @brief transpose the input matrix
     ///
@@ -315,14 +360,11 @@ template <typename T, typename Device> struct matrixTranspose_op
     ///
     /// Output Parameters
     /// \param output_matrix : output matrix
-    void operator()(const Device* d,
-        const int& row,
-        const int& col,
-        const T* input_matrix,
-        T* output_matrix);
+    void operator()(const Device* d, const int& row, const int& col, const T* input_matrix, T* output_matrix);
 };
 
-template <typename T, typename Device> struct matrixSetToAnother
+template <typename T, typename Device>
+struct matrixSetToAnother
 {
     /// @brief initialize matrix B with A
     ///
@@ -335,114 +377,97 @@ template <typename T, typename Device> struct matrixSetToAnother
     ///
     /// Output Parameters
     /// \param B : output matrix B
-    void operator()(const Device* d,
-        const int& n,
-        const T* A,
-        const int& LDA,
-        T* B,
-        const int& LDB);
+    void operator()(const Device* d, const int& n, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 template <typename T>
-struct line_minimize_with_block_op<T, psi::DEVICE_GPU> {
-  using Real = typename GetTypeReal<T>::type;
-  void operator()(
-    T* grad_out,
-    T* hgrad_out,
-    T* psi_out,
-    T* hpsi_out,
-    const int &n_basis,
-    const int &n_basis_max,
-    const int &n_band);
+struct line_minimize_with_block_op<T, psi::DEVICE_GPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(T* grad_out,
+                    T* hgrad_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band);
 };
 
 template <typename T>
-struct calc_grad_with_block_op<T, psi::DEVICE_GPU> {
-  using Real = typename GetTypeReal<T>::type;
-  void operator()(
-    const Real* prec_in,
-    Real* err_out,
-    Real* beta_out,
-    T* psi_out,
-    T* hpsi_out,
-    T* grad_out,
-    T* grad_old_out,
-    const int &n_basis,
-    const int &n_basis_max,
-    const int &n_band);
+struct calc_grad_with_block_op<T, psi::DEVICE_GPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const Real* prec_in,
+                    Real* err_out,
+                    Real* beta_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    T* grad_out,
+                    T* grad_old_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band);
 };
 
 // Partially specialize functor for psi::GpuDevice.
 template <typename T>
-struct dot_real_op<T, psi::DEVICE_GPU> {
+struct dot_real_op<T, psi::DEVICE_GPU>
+{
     using Real = typename GetTypeReal<T>::type;
-    Real operator()(
-        const psi::DEVICE_GPU* d,
-        const int& dim,
-        const T* psi_L,
-        const T* psi_R,
-        const bool reduce = true);
+    Real operator()(const psi::DEVICE_GPU* d, const int& dim, const T* psi_L, const T* psi_R, const bool reduce = true);
 };
 
 // vector operator: result[i] = vector[i] / constant
-template <typename T> struct vector_div_constant_op<T, psi::DEVICE_GPU>
+template <typename T>
+struct vector_div_constant_op<T, psi::DEVICE_GPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const psi::DEVICE_GPU* d,
-        const int dim,
-        T* result,
-        const T* vector,
-        const Real constant);
+    void operator()(const psi::DEVICE_GPU* d, const int dim, T* result, const T* vector, const Real constant);
 };
 
 // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
-template <typename T> struct vector_mul_vector_op<T, psi::DEVICE_GPU>
+template <typename T>
+struct vector_mul_vector_op<T, psi::DEVICE_GPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const psi::DEVICE_GPU* d,
-        const int& dim,
-        T* result,
-        const T* vector1,
-        const  Real* vector2);
+    void operator()(const psi::DEVICE_GPU* d, const int& dim, T* result, const T* vector1, const Real* vector2);
 };
 
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
-template <typename T> struct vector_div_vector_op<T, psi::DEVICE_GPU>
+template <typename T>
+struct vector_div_vector_op<T, psi::DEVICE_GPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const psi::DEVICE_GPU* d,
-        const int& dim,
-        T* result,
-        const T* vector1,
-        const Real* vector2);
+    void operator()(const psi::DEVICE_GPU* d, const int& dim, T* result, const T* vector1, const Real* vector2);
 };
 
 // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
-template <typename T> struct constantvector_addORsub_constantVector_op<T, psi::DEVICE_GPU>
+template <typename T>
+struct constantvector_addORsub_constantVector_op<T, psi::DEVICE_GPU>
 {
     using Real = typename GetTypeReal<T>::type;
     void operator()(const psi::DEVICE_GPU* d,
-        const int& dim,
-        T* result,
-        const T* vector1,
-        const Real constant1,
-        const T* vector2,
-        const Real constant2);
+                    const int& dim,
+                    T* result,
+                    const T* vector1,
+                    const Real constant1,
+                    const T* vector2,
+                    const Real constant2);
 };
 
-template <typename T> struct matrixSetToAnother<T, psi::DEVICE_GPU>
+template <typename T>
+struct matrixSetToAnother<T, psi::DEVICE_GPU>
 {
     void operator()(const psi::DEVICE_GPU* d,
-        const int& n,
-        const T* A, // input
-        const int& LDA,
-        T* B, // output
-        const int& LDB);
+                    const int& n,
+                    const T* A, // input
+                    const int& LDA,
+                    T* B, // output
+                    const int& LDB);
 };
 
-
 void createGpuBlasHandle();
 void destoryBLAShandle();
 
diff --git a/source/module_hsolver/test/CMakeLists.txt b/source/module_hsolver/test/CMakeLists.txt
index 5dd16d10ee..90dcddbacd 100644
--- a/source/module_hsolver/test/CMakeLists.txt
+++ b/source/module_hsolver/test/CMakeLists.txt
@@ -70,13 +70,13 @@ AddTest(
 AddTest(
   TARGET HSolver_pw
   LIBS ${math_libs} psi device base container
-  SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_iter_assist.cpp
+  SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diagh_consts.cpp ../diago_iter_assist.cpp
 )
 
 AddTest(
   TARGET HSolver_sdft
   LIBS ${math_libs} psi device base container
-  SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_iter_assist.cpp
+  SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diagh_consts.cpp ../diago_iter_assist.cpp
 )
 
 if(ENABLE_LCAO)
diff --git a/source/module_io/input.cpp b/source/module_io/input.cpp
index 8e65755cad..59dd6fff81 100644
--- a/source/module_io/input.cpp
+++ b/source/module_io/input.cpp
@@ -3009,7 +3009,7 @@ void Input::Default_2(void) // jiyy add 2019-08-04
                 diago_proc = GlobalV::NPROC;
             }
         }
-        else if (ks_solver == "dav")
+        else if (ks_solver == "dav" || ks_solver == "dav_subspace")
         {
             GlobalV::ofs_warning << " It's ok to use dav." << std::endl;
         }
@@ -3959,7 +3959,11 @@ void Input::Check(void)
         {
             ModuleBase::WARNING_QUIT("Input", "lapack can not be used with plane wave basis.");
         }
-        else if (ks_solver != "default" && ks_solver != "cg" && ks_solver != "dav" && ks_solver != "bpcg")
+        else if (ks_solver != "default" && 
+                 ks_solver != "cg" && 
+                 ks_solver != "dav" && 
+                 ks_solver != "dav_subspace" && 
+                 ks_solver != "bpcg")
         {
             ModuleBase::WARNING_QUIT("Input", "please check the ks_solver parameter!");
         }
diff --git a/source/module_psi/kernels/device.cpp b/source/module_psi/kernels/device.cpp
index 3bba66f2a6..7e5bea7705 100644
--- a/source/module_psi/kernels/device.cpp
+++ b/source/module_psi/kernels/device.cpp
@@ -567,7 +567,11 @@ std::string get_device_flag(const std::string& device, const std::string& ks_sol
 #else
     str = "cpu";
 #endif
-    if (ks_solver != "cg" && ks_solver != "dav" && ks_solver != "bpcg") {
+    if (ks_solver != "cg" && 
+        ks_solver != "dav" && 
+        ks_solver != "dav_subspace" && 
+        ks_solver != "bpcg")
+    {
         str = "cpu";
     }
     if (basis_type != "pw") {