diff --git a/source/Makefile.Objects b/source/Makefile.Objects index f72cc9ae6c..5416eb69e7 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -286,6 +286,7 @@ OBJS_HCONTAINER=base_matrix.o\ OBJS_HSOLVER=diago_cg.o\ diago_david.o\ + diago_dav_subspace.o\ diagh_consts.o\ diago_bpcg.o\ hsolver_pw.o\ diff --git a/source/module_base/lapack_connector.h b/source/module_base/lapack_connector.h index 8dd312b177..1142ef93c5 100644 --- a/source/module_base/lapack_connector.h +++ b/source/module_base/lapack_connector.h @@ -22,6 +22,9 @@ extern "C" { + int ilaenv_(int* ispec,const char* name,const char* opts, + const int* n1,const int* n2,const int* n3,const int* n4); + // solve the generalized eigenproblem Ax=eBx, where A is Hermitian and complex couble // zhegv_ & zhegvd_ returns all eigenvalues while zhegvx_ returns selected ones void dsygvd_(const int* itype, const char* jobz, const char* uplo, const int* n, @@ -60,9 +63,12 @@ extern "C" const int* m, double* w, std::complex *z, const int *ldz, std::complex *work, const int* lwork, double* rwork, int* iwork, int* ifail, int* info); - void zhegv_(const int* itype,const char* jobz,const char* uplo,const int* n, - std::complex* a,const int* lda,std::complex* b,const int* ldb, - double* w,std::complex* work,int* lwork,double* rwork,int* info); + + void dsygvx_(const int* itype, const char* jobz, const char* range, const char* uplo, + const int* n, double* A, const int* lda, double* B, const int* ldb, + const double* vl, const double* vu, const int* il, const int* iu, + const double* abstol, const int* m, double* w, double* Z, const int* ldz, + double* work, int* lwork, int*iwork, int* ifail, int* info); void chegvx_(const int* itype,const char* jobz,const char* range,const char* uplo, const int* n,std::complex *a,const int* lda,std::complex *b, @@ -78,6 +84,16 @@ extern "C" std::complex *z,const int *ldz,std::complex *work,const int* lwork, double* rwork,int* iwork,int* ifail,int* info); + void zhegv_(const int* itype,const char* jobz,const char* uplo,const int* n, + std::complex* a,const int* lda,std::complex* b,const int* ldb, + double* w,std::complex* work,int* lwork,double* rwork,int* info); + void chegv_(const int* itype,const char* jobz,const char* uplo,const int* n, + std::complex* a,const int* lda,std::complex* b,const int* ldb, + float* w,std::complex* work,int* lwork,float* rwork,int* info); + void dsygv_(const int* itype, const char* jobz,const char* uplo, const int* n, + double* a,const int* lda,double* b,const int* ldb, + double* w,double* work,int* lwork,int* info); + // solve the eigenproblem Ax=ex, where A is Hermitian and complex couble // zheev_ returns all eigenvalues while zheevx_ returns selected ones void zheev_(const char* jobz,const char* uplo,const int* n,std::complex *a, @@ -86,18 +102,6 @@ extern "C" void cheev_(const char* jobz,const char* uplo,const int* n,std::complex *a, const int* lda,float* w,std::complex* work,const int* lwork, float* rwork,int* info); - - // solve the generalized eigenproblem Ax=eBx, where A is Symmetric and real couble - // dsygv_ returns all eigenvalues while dsygvx_ returns selected ones - void dsygv_(const int* itype, const char* jobz,const char* uplo, const int* n, - double* a,const int* lda,double* b,const int* ldb, - double* w,double* work,int* lwork,int* info); - void dsygvx_(const int* itype, const char* jobz, const char* range, const char* uplo, - const int* n, double* A, const int* lda, double* B, const int* ldb, - const double* vl, const double* vu, const int* il, const int* iu, - const double* abstol, int* m, double* w, double* Z, const int* ldz, - double* work, int* lwork, int*iwork, int* ifail, int* info); - // solve the eigenproblem Ax=ex, where A is Symmetric and real double void dsyev_(const char* jobz,const char* uplo,const int* n,double *a, const int* lda,double* w,double* work,const int* lwork, int* info); @@ -314,23 +318,19 @@ class LapackConnector } public: - // wrap function of fortran lapack routine zhegvd. static inline - void zhegvd(const int itype, const char jobz, const char uplo, const int n, - std::complex* a, const int lda, - const std::complex* b, const int ldb, double* w, - std::complex* work, int lwork, double* rwork, int lrwork, - int* iwork, int liwork, int& info) + int ilaenv( int ispec, const char *name,const char *opts,const int n1,const int n2, + const int n3,const int n4) { - zhegvd_(&itype, &jobz, &uplo, &n, - a, &lda, b, &ldb, w, - work, &lwork, rwork, &lrwork, - iwork, &liwork, &info); + const int nb = ilaenv_(&ispec, name, opts, &n1, &n2, &n3, &n4); + return nb; } + + // wrap function of fortran lapack routine zhegvd. (pointer version) static inline - void xhegvd(const int itype, const char jobz, const char uplo, const int n, + void xhegvd(const int itype, const char jobz, const char uplo, const int n, double* a, const int lda, const double* b, const int ldb, double* w, double* work, int lwork, double* rwork, int lrwork, @@ -373,23 +373,9 @@ class LapackConnector iwork, &liwork, &info); } - // wrap function of fortran lapack routine zheevx. - static inline - void zheevx( const int itype, const char jobz, const char range, const char uplo, const int n, - std::complex* a, const int lda, - const double vl, const double vu, const int il, const int iu, const double abstol, - const int m, double* w, std::complex* z, const int ldz, - std::complex* work, const int lwork, double* rwork, int* iwork, int* ifail, int& info) - { - zheevx_(&jobz, &range, &uplo, &n, - a, &lda, &vl, &vu, &il, &iu, - &abstol, &m, w, z, &ldz, - work, &lwork, rwork, iwork, ifail, &info); - } - // wrap function of fortran lapack routine dsyevx. static inline - void xheevx(const int itype, const char jobz, const char range, const char uplo, const int n, + void xheevx(const int itype, const char jobz, const char range, const char uplo, const int n, double* a, const int lda, const double vl, const double vu, const int il, const int iu, const double abstol, const int m, double* w, double* z, const int ldz, @@ -428,6 +414,98 @@ class LapackConnector &abstol, &m, w, z, &ldz, work, &lwork, rwork, iwork, ifail, &info); } + + // wrap function of fortran lapack routine xhegvx ( pointer version ). + static inline + void xhegvx( const int itype, const char jobz, const char range, const char uplo, + const int n, std::complex* a, const int lda, std::complex* b, + const int ldb, const float vl, const float vu, const int il, const int iu, + const float abstol, const int m, float* w, std::complex* z, const int ldz, + std::complex* work, const int lwork, float* rwork, int* iwork, + int* ifail, int& info) + { + chegvx_(&itype, &jobz, &range, &uplo, &n, a, &lda, b, &ldb, &vl, + &vu, &il,&iu, &abstol, &m, w, z, &ldz, work, &lwork, rwork, iwork, ifail, &info); + } + + // wrap function of fortran lapack routine xhegvx ( pointer version ). + static inline + void xhegvx( const int itype, const char jobz, const char range, const char uplo, + const int n, std::complex* a, const int lda, std::complex* b, + const int ldb, const double vl, const double vu, const int il, const int iu, + const double abstol, const int m, double* w, std::complex* z, const int ldz, + std::complex* work, const int lwork, double* rwork, int* iwork, + int* ifail, int& info) + { + zhegvx_(&itype, &jobz, &range, &uplo, &n, a, &lda, b, &ldb, &vl, + &vu, &il,&iu, &abstol, &m, w, z, &ldz, work, &lwork, rwork, iwork, ifail, &info); + } + // wrap function of fortran lapack routine xhegvx ( pointer version ). + static inline + void xhegvx( const int itype, const char jobz, const char range, const char uplo, + const int n, double* a, const int lda, double* b, + const int ldb, const double vl, const double vu, const int il, const int iu, + const double abstol, const int m, double* w, double* z, const int ldz, + double* work, const int lwork, double* rwork, int* iwork, + int* ifail, int& info) + { + // dsygvx_(&itype, &jobz, &range, &uplo, &n, a, &lda, b, &ldb, &vl, + // &vu, &il,&iu, &abstol, &m, w, z, &ldz, work, &lwork, rwork, iwork, ifail, &info); + } + + + // wrap function of fortran lapack routine xhegvx ( pointer version ). + static inline + void xhegv( const int itype, const char jobz, const char uplo, + const int n, + double* a, const int lda, + double* b, const int ldb, + double* w, + double* work, int lwork, + double* rwork, int& info) + { + // TODO + } + + // wrap function of fortran lapack routine xhegvx ( pointer version ). + static inline + void xhegv( const int itype, const char jobz, const char uplo, + const int n, + std::complex* a, const int lda, + std::complex* b, const int ldb, + float* w, + std::complex* work, int lwork, + float* rwork, int& info) + { + // TODO + } + // wrap function of fortran lapack routine xhegvx ( pointer version ). + static inline + void xhegv( const int itype, const char jobz, const char uplo, + const int n, + std::complex* a, const int lda, + std::complex* b, const int ldb, + double* w, + std::complex* work, int lwork, + double* rwork, int& info) + { + zhegv_(&itype, &jobz, &uplo, &n, a, &lda, b, &ldb, w, work, &lwork, rwork, &info); + } + + + // wrap function of fortran lapack routine zhegvd. + static inline + void zhegvd(const int itype, const char jobz, const char uplo, const int n, + std::complex* a, const int lda, + const std::complex* b, const int ldb, double* w, + std::complex* work, int lwork, double* rwork, int lrwork, + int* iwork, int liwork, int& info) + { + zhegvd_(&itype, &jobz, &uplo, &n, + a, &lda, b, &ldb, w, + work, &lwork, rwork, &lrwork, + iwork, &liwork, &info); + } // wrap function of fortran lapack routine zhegv ( ModuleBase::ComplexMatrix version ). static inline @@ -543,30 +621,6 @@ class LapackConnector delete[] zux; } - // wrap function of fortran lapack routine xhegvx ( pointer version ). - static inline - void xhegvx( const int itype, const char jobz, const char range, const char uplo, - const int n, const std::complex* a, const int lda, const std::complex* b, - const int ldb, const float vl, const float vu, const int il, const int iu, - const float abstol, const int m, float* w, std::complex* z, const int ldz, - std::complex* work, const int lwork, float* rwork, int* iwork, - int* ifail, int& info, int nbase_x) - { - chegvx(itype, jobz, range, uplo, n, a, lda, b, ldb, vl, vu, il, iu, abstol, m, w, z, ldz, work, lwork, rwork, iwork, ifail, info, nbase_x); - } - - // wrap function of fortran lapack routine xhegvx ( pointer version ). - static inline - void xhegvx( const int itype, const char jobz, const char range, const char uplo, - const int n, const std::complex* a, const int lda, const std::complex* b, - const int ldb, const double vl, const double vu, const int il, const int iu, - const double abstol, const int m, double* w, std::complex* z, const int ldz, - std::complex* work, const int lwork, double* rwork, int* iwork, - int* ifail, int& info, int nbase_x) - { - zhegvx(itype, jobz, range, uplo, n, a, lda, b, ldb, vl, vu, il, iu, abstol, m, w, z, ldz, work, lwork, rwork, iwork, ifail, info, nbase_x); - } - // calculate the eigenvalues and eigenfunctions of a real symmetric matrix. static inline void dsygv( const int itype,const char jobz,const char uplo,const int n,ModuleBase::matrix& a, diff --git a/source/module_basis/module_ao/ORB_control.cpp b/source/module_basis/module_ao/ORB_control.cpp index 1435bf8e93..6966ca99cf 100644 --- a/source/module_basis/module_ao/ORB_control.cpp +++ b/source/module_basis/module_ao/ORB_control.cpp @@ -203,7 +203,7 @@ void ORB_control::setup_2d_division(std::ofstream& ofs_running, // determine whether 2d-division or not according to ks_solver bool div_2d; - if (ks_solver == "lapack" || ks_solver == "cg" || ks_solver == "dav") div_2d = false; + if (ks_solver == "lapack" || ks_solver == "cg" || ks_solver == "dav" || ks_solver == "dav_subspace") div_2d = false; #ifdef __MPI else if (ks_solver == "genelpa" || ks_solver == "scalapack_gvx" || ks_solver == "cusolver" || ks_solver == "cg_in_lcao") div_2d = true; #endif diff --git a/source/module_elecstate/elecstate_print.cpp b/source/module_elecstate/elecstate_print.cpp index 1a20566aa0..78be828c66 100644 --- a/source/module_elecstate/elecstate_print.cpp +++ b/source/module_elecstate/elecstate_print.cpp @@ -289,6 +289,10 @@ void ElecState::print_etot(const bool converged, { label = "DA"; } + else if (ks_solver_type == "dav_subspace") + { + label = "DS"; + } else if (ks_solver_type == "scalapack_gvx") { label = "GV"; diff --git a/source/module_hamilt_pw/hamilt_pwdft/wavefunc.cpp b/source/module_hamilt_pw/hamilt_pwdft/wavefunc.cpp index ac16819967..c10bb386e4 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/wavefunc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/wavefunc.cpp @@ -574,7 +574,7 @@ void diago_PAO_in_pw_k2(const psi::DEVICE_GPU *ctx, //GlobalC::hm.diagH_subspace(ik ,starting_nw, nbands, wfcatom, wfcatom, etatom.data()); } } - else if(GlobalV::KS_SOLVER=="dav") + else if (GlobalV::KS_SOLVER == "dav" || GlobalV::KS_SOLVER == "dav_subspace") { assert(nbands <= wfcatom.nr); // replace by haozhihan 2022-11-23 @@ -685,8 +685,8 @@ void diago_PAO_in_pw_k2(const psi::DEVICE_GPU *ctx, //GlobalC::hm.diagH_subspace(ik ,starting_nw, nbands, wfcatom, wfcatom, etatom.data()); } } - else if(GlobalV::KS_SOLVER=="dav") - { + else if (GlobalV::KS_SOLVER == "dav" || GlobalV::KS_SOLVER == "dav_subspace") + { assert(nbands <= wfcatom.nr); // replace by haozhihan 2022-11-23 hsolver::matrixSetToAnother, psi::DEVICE_GPU>()( diff --git a/source/module_hsolver/CMakeLists.txt b/source/module_hsolver/CMakeLists.txt index bfb50f21ea..ec64e49f5f 100644 --- a/source/module_hsolver/CMakeLists.txt +++ b/source/module_hsolver/CMakeLists.txt @@ -2,6 +2,7 @@ list(APPEND objects diagh_consts.cpp diago_cg.cpp diago_david.cpp + diago_dav_subspace.cpp diago_bpcg.cpp hsolver_pw.cpp hsolver_pw_sdft.cpp diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp new file mode 100644 index 0000000000..5ee48e60bf --- /dev/null +++ b/source/module_hsolver/diago_dav_subspace.cpp @@ -0,0 +1,992 @@ +#include "diago_dav_subspace.h" + +#include +#include + +#include "diago_iter_assist.h" +#include "module_base/blas_connector.h" +#include "module_base/constants.h" +#include "module_base/lapack_connector.h" +#include "module_base/memory.h" +#include "module_base/parallel_common.h" +#include "module_base/parallel_reduce.h" +#include "module_base/timer.h" +#include "module_hsolver/kernels/dngvd_op.h" +#include "module_hsolver/kernels/math_kernel_op.h" + +using namespace hsolver; + +template +Diago_DavSubspace::Diago_DavSubspace(const Real* precondition_in) +{ + this->device = psi::device::get_device_type(this->ctx); + this->precondition = precondition_in; + + test_david = 2; + // 1: check which function is called and which step is executed + // 2: check the eigenvalues of the result of each iteration + // 3: check the eigenvalues and errors of the last result + // default: no check + + this->one = &this->cs.one; + this->zero = &this->cs.zero; + this->neg_one = &this->cs.neg_one; +} + +template +Diago_DavSubspace::~Diago_DavSubspace() +{ + delmem_complex_op()(this->ctx, this->hphi); + delmem_complex_op()(this->ctx, this->hcc); + delmem_complex_op()(this->ctx, this->scc); + delmem_complex_op()(this->ctx, this->vcc); + + delmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav); + + if (this->device == psi::GpuDevice) + { + delmem_real_op()(this->ctx, this->d_precondition); + } +} + +template +void Diago_DavSubspace::diag_once(hamilt::Hamilt* phm_in, + psi::Psi& psi, + Real* eigenvalue_in_hsolver, + std::vector& is_occupied) +{ + if (test_david == 1) + { + ModuleBase::TITLE("Diago_DavSubspace", "diag_once"); + } + ModuleBase::timer::tick("Diago_DavSubspace", "diag_once"); + + assert(Diago_DavSubspace::PW_DIAG_NDIM > 1); + assert(Diago_DavSubspace::PW_DIAG_NDIM * psi.get_nbands() < psi.get_current_nbas() * GlobalV::NPROC_IN_POOL); + + // qianrui change it 2021-7-25. + // In strictly speaking, it shoule be PW_DIAG_NDIM*nband < npw sum of all pools. We roughly estimate it here. + // However, in most cases, total number of plane waves should be much larger than nband * PW_DIAG_NDIM + + this->dim = psi.get_k_first() ? psi.get_current_nbas() : psi.get_nk() * psi.get_nbasis(); + this->n_band = psi.get_nbands(); + + // maximum dimension of the reduced basis set + this->nbase_x = 2 * this->n_band; + + psi::Psi basis(1, this->nbase_x, this->dim, &(psi.get_ngk(0))); + ModuleBase::Memory::record("DAV::basis", this->nbase_x * this->dim * sizeof(T)); + + // the eigenvalues in dav iter + resmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav, this->nbase_x, "DAV::eig"); + setmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav, 0, this->nbase_x); + + //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + // the product of H and psi in the reduced basis set + resmem_complex_op()(this->ctx, this->hphi, this->nbase_x * this->dim, "DAV::hphi"); + setmem_complex_op()(this->ctx, this->hphi, 0, this->nbase_x * this->dim); + + // Hamiltonian on the reduced basis set + resmem_complex_op()(this->ctx, this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc"); + setmem_complex_op()(this->ctx, this->hcc, 0, this->nbase_x * this->nbase_x); + + // Overlap on the reduced basis set + resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc"); + setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x); + + // Eigenvectors + resmem_complex_op()(this->ctx, this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc"); + setmem_complex_op()(this->ctx, this->vcc, 0, this->nbase_x * this->nbase_x); + //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + // convflag[m] = true if the m th band is convergent + std::vector convflag(this->n_band, false); + + // unconv[m] store the number of the m th unconvergent band + std::vector unconv(this->n_band); + + // the dimension of the reduced basis set + int nbase = 0; + + // the number of the unconvergent bands + this->notconv = this->n_band; + + ModuleBase::timer::tick("Diago_DavSubspace", "first"); + + for (int m = 0; m < this->n_band; m++) + { + unconv[m] = m; + + syncmem_complex_op()(this->ctx, + this->ctx, + &basis(m, 0), + psi.get_k_first() ? &psi(m, 0) : &psi(m, 0, 0), + this->dim); + } + + // calculate H|psi> + hpsi_info dav_hpsi_in(&basis, psi::Range(1, 0, 0, this->n_band - 1), this->hphi); + phm_in->ops->hPsi(dav_hpsi_in); + + this->cal_elem(this->dim, + nbase, + this->notconv, + basis, + this->hphi, + this->hcc, + this->scc, + true); + + this->diag_zhegvx(nbase, + this->n_band, + this->hcc, + this->scc, + this->nbase_x, + this->eigenvalue_in_dav, + this->vcc, + true, + this->is_subspace); + + for (size_t m = 0; m < this->n_band; m++) + { + eigenvalue_in_hsolver[m] = this->eigenvalue_in_dav[m]; + } + + ModuleBase::timer::tick("Diago_DavSubspace", "first"); + + int dav_iter = 0; + do + { + dav_iter++; + + this->cal_grad(phm_in, + this->dim, + nbase, + this->notconv, + basis, + this->hphi, + this->vcc, + unconv.data(), + this->eigenvalue_in_dav); + + this->cal_elem(this->dim, + nbase, + this->notconv, + basis, + this->hphi, + this->hcc, + this->scc, + false); + + this->diag_zhegvx(nbase, + this->n_band, + this->hcc, + this->scc, + this->nbase_x, + this->eigenvalue_in_dav, + this->vcc, + false, + false); + + // check convergence and update eigenvalues + ModuleBase::timer::tick("Diago_DavSubspace", "check_update"); + + this->notconv = 0; + for (int m = 0; m < this->n_band; m++) + { + if (is_occupied[m]) + { + convflag[m] = (std::abs(this->eigenvalue_in_dav[m] - eigenvalue_in_hsolver[m]) + < DiagoIterAssist::PW_DIAG_THR); + } + else + { + double empty_ethr = std::max(DiagoIterAssist::PW_DIAG_THR * 5.0, 1e-5); + convflag[m] = (std::abs(this->eigenvalue_in_dav[m] - eigenvalue_in_hsolver[m]) < empty_ethr); + } + + if (!convflag[m]) + { + unconv[this->notconv] = m; + this->notconv++; + } + + eigenvalue_in_hsolver[m] = this->eigenvalue_in_dav[m]; + } + + ModuleBase::timer::tick("Diago_DavSubspace", "check_update"); + + if ((this->notconv == 0) || + (nbase + this->notconv + 1 > this->nbase_x) || + (dav_iter == DiagoIterAssist::PW_DIAG_NMAX)) + { + ModuleBase::timer::tick("Diago_DavSubspace", "last"); + + // updata eigenvectors of Hamiltonian + setmem_complex_op()(this->ctx, psi.get_pointer(), 0, n_band * psi.get_nbasis()); + //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + // haozhihan repalce 2022-10-18 + gemm_op()(this->ctx, + 'N', + 'N', + this->dim, // m: row of A,C + this->n_band, // n: col of B,C + nbase, // k: col of A, row of B + this->one, + basis.get_pointer(), // A dim * nbase + this->dim, + this->vcc, // B nbase * n_band + this->nbase_x, + this->zero, + psi.get_pointer(), // C dim * n_band + psi.get_nbasis()); + + if (!this->notconv || (dav_iter == DiagoIterAssist::PW_DIAG_NMAX)) + { + // overall convergence or last iteration: exit the iteration + + ModuleBase::timer::tick("Diago_DavSubspace", "last"); + break; + } + else + { + // if the dimension of the reduced basis set is becoming too large, + // then replace the first N (=nband) basis vectors with the current + // estimate of the eigenvectors and set the basis dimension to N; + + this->refresh(this->dim, + this->n_band, + nbase, + eigenvalue_in_hsolver, + psi, + basis, + this->hphi, + this->hcc, + this->scc, + this->vcc); + ModuleBase::timer::tick("Diago_DavSubspace", "last"); + } + } + + } while (1); + + // std::cout << "dav_iter == " << dav_iter << std::endl; + + DiagoIterAssist::avg_iter += static_cast(dav_iter); + + ModuleBase::timer::tick("Diago_DavSubspace", "diag_once"); + + return; +} + +template +void Diago_DavSubspace::cal_grad(hamilt::Hamilt* phm_in, + const int& dim, + const int& nbase, + const int& notconv, + psi::Psi& basis, + T* hphi, + T* vcc, + const int* unconv, + Real* eigenvalue_in_dav) +{ + ModuleBase::timer::tick("Diago_DavSubspace", "cal_grad"); + + for (size_t i = 0; i < notconv; i++) + { + if (unconv[i] != i) + { + syncmem_complex_op()( + this->ctx, + this->ctx, + vcc + i * this->nbase_x, + vcc + unconv[i] * this->nbase_x, + nbase + ); + this->eigenvalue_in_dav[i] = this->eigenvalue_in_dav[unconv[i]]; + } + } + + gemm_op()(this->ctx, + 'N', + 'N', + this->dim, // m: row of A,C + notconv, // n: col of B,C + nbase, // k: col of A, row of B + this->one, // alpha + &basis(0, 0), // A + this->dim, // LDA + vcc, // B + this->nbase_x, // LDB + this->zero, // belta + &basis(nbase, 0), // C dim * notconv + this->dim // LDC + ); + + for (int m = 0; m < notconv; m++) + { + + std::vector e_temp_cpu(this->dim, (-this->eigenvalue_in_dav[m])); + + vector_mul_vector_op()(this->ctx, + this->dim, + &basis(nbase + m, 0), + &basis(nbase + m, 0), + e_temp_cpu.data()); + } + + gemm_op()(this->ctx, + 'N', + 'N', + this->dim, // m: row of A,C + notconv, // n: col of B,C + nbase, // k: col of A, row of B + this->one, // alpha + hphi, // A dim * nbase + this->dim, // LDA + vcc, // B nbase * notconv + this->nbase_x, // LDB + this->one, // belta + &basis(nbase, 0), // C dim * notconv + this->dim // LDC + ); + + // "precondition!!!" + std::vector pre(this->dim, 0.0); + for (int m = 0; m < notconv; m++) + { + for (size_t i = 0; i < this->dim; i++) + { + double x = this->precondition[i] - this->eigenvalue_in_dav[m]; + pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); + } + vector_div_vector_op()( + this->ctx, + this->dim, + &basis(nbase + m, 0), + &basis(nbase + m, 0), + pre.data() + ); + } + + // "normalize!!!" in order to improve numerical stability of subspace diagonalization + std::vector psi_norm(notconv, 0.0); + for (size_t i = 0; i < notconv; i++) + { + psi_norm[i] = dot_real_op()(this->ctx, this->dim, &basis(nbase + i, 0), &basis(nbase + i, 0), false); + assert(psi_norm[i] > 0.0); + psi_norm[i] = sqrt(psi_norm[i]); + + vector_div_constant_op()(this->ctx, + this->dim, + &basis(nbase + i, 0), + &basis(nbase + i, 0), + psi_norm[i]); + } + + // "calculate H|psi>" for not convergence bands + hpsi_info dav_hpsi_in(&basis, psi::Range(1, 0, nbase, nbase + notconv - 1), &hphi[nbase * this->dim]); + phm_in->ops->hPsi(dav_hpsi_in); + + ModuleBase::timer::tick("Diago_DavSubspace", "cal_grad"); + return; +} + +template +void Diago_DavSubspace::cal_elem(const int& dim, + int& nbase, + const int& notconv, + const psi::Psi& basis, + const T* hphi, + T* hcc, + T* scc, + bool init) +{ + ModuleBase::timer::tick("Diago_DavSubspace", "cal_elem"); + + if (init) + { + assert(nbase == 0); + assert(this->n_band == notconv); + gemm_op()(this->ctx, + 'C', + 'N', + notconv, + notconv, + this->dim, + this->one, + &basis(0, 0), + this->dim, + hphi, + this->dim, + this->zero, + hcc, + this->nbase_x); + + gemm_op()(this->ctx, + 'C', + 'N', + notconv, + notconv, + this->dim, + this->one, + &basis(0, 0), + this->dim, + &basis(0, 0), + this->dim, + this->zero, + scc, + this->nbase_x); + } + else + { + gemm_op()(this->ctx, + 'C', + 'N', + notconv, + nbase + notconv, + this->dim, + this->one, + &hphi[nbase * this->dim], + this->dim, + &basis(0, 0), + this->dim, + this->zero, + hcc + nbase, + this->nbase_x); + + gemm_op()(this->ctx, + 'C', + 'N', + notconv, + nbase + notconv, + this->dim, + this->one, + &basis(nbase, 0), + this->dim, + &basis(0, 0), + this->dim, + this->zero, + scc + nbase, + this->nbase_x); + } + +#ifdef __MPI + if (GlobalV::NPROC_IN_POOL > 1) + { + matrixTranspose_op()(this->ctx, this->nbase_x, this->nbase_x, hcc, hcc); + matrixTranspose_op()(this->ctx, this->nbase_x, this->nbase_x, scc, scc); + + auto* swap = new T[notconv * this->nbase_x]; + syncmem_complex_op()(this->ctx, this->ctx, swap, hcc + nbase * this->nbase_x, notconv * this->nbase_x); + + if (std::is_same::value) + { + Parallel_Reduce::reduce_pool(hcc + nbase * this->nbase_x, notconv * this->nbase_x); + Parallel_Reduce::reduce_pool(scc + nbase * this->nbase_x, notconv * this->nbase_x); + } + else + { + if (psi::device::get_current_precision(swap) == "single") + { + MPI_Reduce(swap, + hcc + nbase * this->nbase_x, + notconv * this->nbase_x, + MPI_COMPLEX, + MPI_SUM, + 0, + POOL_WORLD); + } + else + { + MPI_Reduce(swap, + hcc + nbase * this->nbase_x, + notconv * this->nbase_x, + MPI_DOUBLE_COMPLEX, + MPI_SUM, + 0, + POOL_WORLD); + } + + syncmem_complex_op()(this->ctx, this->ctx, swap, scc + nbase * this->nbase_x, notconv * this->nbase_x); + + if (psi::device::get_current_precision(swap) == "single") + { + MPI_Reduce(swap, + scc + nbase * this->nbase_x, + notconv * this->nbase_x, + MPI_COMPLEX, + MPI_SUM, + 0, + POOL_WORLD); + } + else + { + MPI_Reduce(swap, + scc + nbase * this->nbase_x, + notconv * this->nbase_x, + MPI_DOUBLE_COMPLEX, + MPI_SUM, + 0, + POOL_WORLD); + } + } + delete[] swap; + + matrixTranspose_op()(this->ctx, this->nbase_x, this->nbase_x, hcc, hcc); + matrixTranspose_op()(this->ctx, this->nbase_x, this->nbase_x, scc, scc); + } +#endif + + nbase += notconv; + int nb1 = nbase - notconv; + // reset: + if (init) + { + for (size_t i = 0; i < nbase; i++) + { + + hcc[i * this->nbase_x + i] = set_real_tocomplex(hcc[i * this->nbase_x + i]); + scc[i * this->nbase_x + i] = set_real_tocomplex(scc[i * this->nbase_x + i]); + + for (size_t j = i + 1; j < nbase; j++) + { + hcc[j * this->nbase_x + i] = get_conj(hcc[i * this->nbase_x + j]); + scc[j * this->nbase_x + i] = get_conj(scc[i * this->nbase_x + j]); + } + } + for (size_t i = nbase; i < this->nbase_x; i++) + { + for (size_t j = nbase; j < this->nbase_x; j++) + { + hcc[i * this->nbase_x + j] = cs.zero; + scc[i * this->nbase_x + j] = cs.zero; + hcc[j * this->nbase_x + i] = cs.zero; + scc[j * this->nbase_x + i] = cs.zero; + } + } + } + else + { + for (size_t i = 0; i < nbase; i++) + { + if (i >= nb1) + { + hcc[i * this->nbase_x + i] = set_real_tocomplex(hcc[i * this->nbase_x + i]); + scc[i * this->nbase_x + i] = set_real_tocomplex(scc[i * this->nbase_x + i]); + } + for (size_t j = std::max(i + 1, (size_t)nb1); j < nbase; j++) + { + hcc[j * this->nbase_x + i] = get_conj(hcc[i * this->nbase_x + j]); + scc[j * this->nbase_x + i] = get_conj(scc[i * this->nbase_x + j]); + } + } + for (size_t i = nbase; i < this->nbase_x; i++) + { + for (size_t j = nbase; j < this->nbase_x; j++) + { + hcc[i * this->nbase_x + j] = cs.zero; + scc[i * this->nbase_x + j] = cs.zero; + hcc[j * this->nbase_x + i] = cs.zero; + scc[j * this->nbase_x + i] = cs.zero; + } + } + } + + ModuleBase::timer::tick("Diago_DavSubspace", "cal_elem"); + return; +} + +template +void Diago_DavSubspace::diag_zhegvx(const int& nbase, + const int& nband, + T* hcc, + T* scc, + const int& nbase_x, + Real* eigenvalue_in_dav, + T* vcc, + bool init, + bool is_subspace) +{ + ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx"); + + if (is_subspace == false) + { + if (GlobalV::RANK_IN_POOL == 0) + { + assert(nbase_x >= std::max(1, nbase)); + + std::vector> h_diag(nbase, std::vector(nbase, cs.zero)); + std::vector> s_diag(nbase, std::vector(nbase, cs.zero)); + + for (size_t i = 0; i < nbase; i++) + { + for (size_t j = 0; j < nbase; j++) + { + h_diag[i][j] = hcc[i * this->nbase_x + j]; + s_diag[i][j] = scc[i * this->nbase_x + j]; + } + } + + if (this->device == psi::GpuDevice) + { +#if defined(__CUDA) || defined(__ROCM) + Real* eigenvalue_gpu = nullptr; + resmem_real_op()(this->ctx, eigenvalue_gpu, this->nbase_x); + + syncmem_var_h2d_op()( + this->ctx, + this->cpu_ctx, + eigenvalue_gpu, + this->eigenvalue_in_dav, + this->nbase_x + ); + + dnevx_op()(this->ctx, nbase, this->nbase_x, this->hcc, nband, eigenvalue_gpu, this->vcc); + + syncmem_var_d2h_op()( + this->cpu_ctx, + this->ctx, + this->eigenvalue_in_dav, + eigenvalue_gpu, + this->nbase_x + ); + + delmem_real_op()(this->ctx, eigenvalue_gpu); +#endif + } + else + { + if (init) + { + dnevx_op()(this->ctx, + nbase, + this->nbase_x, + this->hcc, + nband, + this->eigenvalue_in_dav, + this->vcc); + } + else + { + + dngvx_op()(this->ctx, + nbase, + this->nbase_x, + this->hcc, + this->scc, + nband, + this->eigenvalue_in_dav, + this->vcc); + } + } + + // reset: + for (size_t i = 0; i < nbase; i++) + { + for (size_t j = 0; j < nbase; j++) + { + hcc[i * this->nbase_x + j] = h_diag[i][j]; + scc[i * this->nbase_x + j] = s_diag[i][j]; + } + + for (size_t j = nbase; j < this->nbase_x; j++) + { + hcc[i * this->nbase_x + j] = cs.zero; + hcc[j * this->nbase_x + i] = cs.zero; + scc[i * this->nbase_x + j] = cs.zero; + scc[j * this->nbase_x + i] = cs.zero; + } + } + } + +#ifdef __MPI + if (GlobalV::NPROC_IN_POOL > 1) + { + // vcc: nbase * nband + for (int i = 0; i < nband; i++) + { + MPI_Bcast(&vcc[i * this->nbase_x], nbase, MPI_DOUBLE_COMPLEX, 0, POOL_WORLD); + } + MPI_Bcast(this->eigenvalue_in_dav, nband, MPI_DOUBLE, 0, POOL_WORLD); + } +#endif + + } + else if (is_subspace == true) + { + for (size_t m = 0; m < nband; m++) + { + this->eigenvalue_in_dav[m] = get_real(hcc[m * this->nbase_x + m]); + + vcc[m * this->nbase_x + m] = set_real_tocomplex(1.0); + } + +#ifdef __MPI + if (GlobalV::NPROC_IN_POOL > 1) + { + MPI_Bcast(this->eigenvalue_in_dav, this->n_band, MPI_DOUBLE, 0, POOL_WORLD); + } +#endif + + } + + ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx"); + return; +} + +template +void Diago_DavSubspace::refresh(const int& dim, + const int& nband, + int& nbase, + const Real* eigenvalue_in_hsolver, + const psi::Psi& psi, + psi::Psi& basis, + T* hp, + T* sp, + T* hc, + T* vc) +{ + ModuleBase::timer::tick("Diago_DavSubspace", "refresh"); + + // update basis + for (size_t i = 0; i < nband; i++) + { + syncmem_complex_op()(this->ctx, this->ctx, &basis(i, 0), &psi(i, 0), this->dim); + } + gemm_op()(this->ctx, + 'N', + 'N', + this->dim, + nband, + nbase, + this->one, + this->hphi, + this->dim, + this->vcc, + this->nbase_x, + this->zero, + &basis(nband, 0), + this->dim); + + // update hphi + syncmem_complex_op()(this->ctx, this->ctx, hphi, &basis(nband, 0), this->dim * nband); + + nbase = nband; + + // set hcc/scc/vcc to 0 + for (size_t i = 0; i < nbase; i++) + { + setmem_complex_op()(this->ctx, &hcc[this->nbase_x * i], 0, nbase); + setmem_complex_op()(this->ctx, &scc[this->nbase_x * i], 0, nbase); + setmem_complex_op()(this->ctx, &vcc[this->nbase_x * i], 0, nbase); + } + + if (this->device == psi::GpuDevice) + { +#if defined(__CUDA) || defined(__ROCM) + T* hcc_cpu = nullptr; + T* scc_cpu = nullptr; + T* vcc_cpu = nullptr; + psi::memory::resize_memory_op()(this->cpu_ctx, + hcc_cpu, + this->nbase_x * this->nbase_x, + "DAV::hcc"); + psi::memory::resize_memory_op()(this->cpu_ctx, + scc_cpu, + this->nbase_x * this->nbase_x, + "DAV::scc"); + psi::memory::resize_memory_op()(this->cpu_ctx, + vcc_cpu, + this->nbase_x * this->nbase_x, + "DAV::vcc"); + + syncmem_d2h_op()(this->cpu_ctx, this->ctx, hcc_cpu, hcc, this->nbase_x * this->nbase_x); + syncmem_d2h_op()(this->cpu_ctx, this->ctx, scc_cpu, scc, this->nbase_x * this->nbase_x); + syncmem_d2h_op()(this->cpu_ctx, this->ctx, vcc_cpu, vcc, this->nbase_x * this->nbase_x); + + for (int i = 0; i < nbase; i++) + { + hcc_cpu[i * this->nbase_x + i] = eigenvalue_in_hsolver[i]; + scc_cpu[i * this->nbase_x + i] = this->one[0]; + vcc_cpu[i * this->nbase_x + i] = this->one[0]; + } + + syncmem_h2d_op()(this->ctx, this->cpu_ctx, hcc, hcc_cpu, this->nbase_x * this->nbase_x); + syncmem_h2d_op()(this->ctx, this->cpu_ctx, scc, scc_cpu, this->nbase_x * this->nbase_x); + syncmem_h2d_op()(this->ctx, this->cpu_ctx, vcc, vcc_cpu, this->nbase_x * this->nbase_x); + + psi::memory::delete_memory_op()(this->cpu_ctx, hcc_cpu); + psi::memory::delete_memory_op()(this->cpu_ctx, scc_cpu); + psi::memory::delete_memory_op()(this->cpu_ctx, vcc_cpu); +#endif + } + else + { + for (int i = 0; i < nbase; i++) + { + hcc[i * this->nbase_x + i] = eigenvalue_in_hsolver[i]; + scc[i * this->nbase_x + i] = this->one[0]; + vcc[i * this->nbase_x + i] = this->one[0]; + } + } + ModuleBase::timer::tick("Diago_DavSubspace", "refresh"); + + return; +} + +template +void Diago_DavSubspace::diag(hamilt::Hamilt* phm_in, + psi::Psi& psi, + Real* eigenvalue_in_hsolver, + std::vector& is_occupied) +{ + /// record the times of trying iterative diagonalization + int ntry = 0; + this->notconv = 0; + +#if defined(__CUDA) || defined(__ROCM) + if (this->device == psi::GpuDevice) + { + resmem_real_op()(this->ctx, this->d_precondition, psi.get_nbasis()); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition, psi.get_nbasis()); + } +#endif + + if (DiagoIterAssist::SCF_ITER == 1) + { + // std::cout << "diagH_subspace" << std::endl; + DiagoIterAssist::diagH_subspace(phm_in, psi, psi, eigenvalue_in_hsolver, psi.get_nbands()); + + this->is_subspace = true; + } + else + { + this->is_subspace = false; + } + + bool outputscc = false; + bool outputeigenvalue = false; + + if (outputscc) + { + this->nbase_x = 2 * psi.get_nbands(); + resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc"); + setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x); + + std::cout << "before dav 111" << std::endl; + gemm_op()(this->ctx, + 'C', + 'N', + psi.get_nbands(), + psi.get_nbands(), + psi.get_current_nbas(), + this->one, + &psi(0, 0), + psi.get_current_nbas(), + &psi(0, 0), + psi.get_current_nbas(), + this->zero, + this->scc, + this->nbase_x); + // output scc + for (size_t i = 0; i < psi.get_nbands(); i++) + { + for (size_t j = 0; j < psi.get_nbands(); j++) + { + std::cout << this->scc[i * this->nbase_x + j] << "\t"; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + + if (outputeigenvalue) + { + // output: eigenvalue_in_hsolver + std::cout << "before dav, output eigenvalue_in_hsolver" << std::endl; + for (size_t i = 0; i < psi.get_nbands(); i++) + { + std::cout << eigenvalue_in_hsolver[i] << "\t"; + } + std::cout << std::endl; + } + + do + { + + this->diag_once(phm_in, psi, eigenvalue_in_hsolver, is_occupied); + + ++ntry; + + } while (DiagoIterAssist::test_exit_cond(ntry, this->notconv)); + + if (notconv > std::max(5, psi.get_nbands() / 4)) + { + std::cout << "\n notconv = " << this->notconv; + std::cout << "\n Diago_DavSubspace::diag', too many bands are not converged! \n"; + } + + if (outputeigenvalue) + { + // output: eigenvalue_in_hsolver + std::cout << "after dav, output eigenvalue_in_hsolver" << std::endl; + for (size_t i = 0; i < psi.get_nbands(); i++) + { + std::cout << eigenvalue_in_hsolver[i] << "\t"; + } + std::cout << std::endl; + } + + if (outputscc) + { + std::cout << "after dav 222 " << std::endl; + gemm_op()(this->ctx, + 'C', + 'N', + psi.get_nbands(), + psi.get_nbands(), + psi.get_current_nbas(), + this->one, + &psi(0, 0), + psi.get_current_nbas(), + &psi(0, 0), + psi.get_current_nbas(), + this->zero, + this->scc, + this->nbase_x); + // output scc + for (size_t i = 0; i < psi.get_nbands(); i++) + { + for (size_t j = 0; j < psi.get_nbands(); j++) + { + std::cout << this->scc[i * this->nbase_x + j] << "\t"; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + + return; +} + +namespace hsolver +{ + +template class Diago_DavSubspace, psi::DEVICE_CPU>; +template class Diago_DavSubspace, psi::DEVICE_CPU>; + +#if ((defined __CUDA) || (defined __ROCM)) +template class Diago_DavSubspace, psi::DEVICE_GPU>; +template class Diago_DavSubspace, psi::DEVICE_GPU>; +#endif + +#ifdef __LCAO +template class Diago_DavSubspace; + +#if ((defined __CUDA) || (defined __ROCM)) +template class Diago_DavSubspace; +#endif + +#endif +} // namespace hsolver \ No newline at end of file diff --git a/source/module_hsolver/diago_dav_subspace.h b/source/module_hsolver/diago_dav_subspace.h new file mode 100644 index 0000000000..e55a536608 --- /dev/null +++ b/source/module_hsolver/diago_dav_subspace.h @@ -0,0 +1,143 @@ +#ifndef DIAGO_NEW_DAV_H +#define DIAGO_NEW_DAV_H + +#include "diagh.h" +#include "module_base/complexmatrix.h" +#include "module_base/macros.h" +#include "module_hamilt_pw/hamilt_pwdft/structure_factor.h" +#include "module_psi/kernels/device.h" + +namespace hsolver +{ + +template , typename Device = psi::DEVICE_CPU> +class Diago_DavSubspace : public DiagH +{ + private: + // Note GetTypeReal::type will + // return T if T is real type(float, double), + // otherwise return the real type of T(complex, complex) + using Real = typename GetTypeReal::type; + + public: + Diago_DavSubspace(const Real* precondition_in); + ~Diago_DavSubspace(); + + // this is the override function diag() for CG method + void diag(hamilt::Hamilt* phm_in, + psi::Psi& phi, + Real* eigenvalue_in, + std::vector& is_occupied); + + static int PW_DIAG_NDIM; + + private: + bool is_subspace = false; + + int test_david = 0; + + /// record for how many bands not have convergence eigenvalues + int notconv = 0; + + /// row size for input psi matrix + int n_band = 0; + /// non-zero col size for inputted psi matrix + int dim = 0; + // maximum dimension of the reduced basis set + int nbase_x = 0; + /// precondition for cg diag + const Real* precondition = nullptr; + Real* d_precondition = nullptr; + + /// eigenvalue results + Real* eigenvalue_in_dav = nullptr; + + T* hphi = nullptr; // the product of H and psi in the reduced basis set + + T* hcc = nullptr; // Hamiltonian on the reduced basis + + T* scc = nullptr; // Overlap on the reduced basis + + T* vcc = nullptr; // Eigenvectors on the reduced basis + + /// device type of psi + Device* ctx = {}; + psi::DEVICE_CPU* cpu_ctx = {}; + psi::AbacusDevice_t device = {}; + + void cal_grad(hamilt::Hamilt* phm_in, + const int& dim, + const int& nbase, + const int& notconv, + psi::Psi& basis, + T* hphi, + T* vcc, + const int* unconv, + Real* eigenvalue); + + void cal_elem(const int& dim, + int& nbase, + const int& notconv, + const psi::Psi& basis, + const T* hphi, + T* hcc, + T* scc, + bool init); + + void refresh(const int& dim, + const int& nband, + int& nbase, + const Real* eigenvalue, + const psi::Psi& psi, + psi::Psi& basis, + T* hphi, + T* hcc, + T* scc, + T* vcc); + + void diag_zhegvx(const int& nbase, + const int& nband, + T* hcc, + T* scc, + const int& nbase_x, + Real* eigenvalue, + T* vcc, + bool init, + bool is_subspace); + + void diag_once(hamilt::Hamilt* phm_in, + psi::Psi& psi, + Real* eigenvalue_in, + std::vector& is_occupied); + + using resmem_complex_op = psi::memory::resize_memory_op; + using delmem_complex_op = psi::memory::delete_memory_op; + using setmem_complex_op = psi::memory::set_memory_op; + + using resmem_real_op = psi::memory::resize_memory_op; + using delmem_real_op = psi::memory::delete_memory_op; + using setmem_real_op = psi::memory::set_memory_op; + + using resmem_real_h_op = psi::memory::resize_memory_op; + using delmem_real_h_op = psi::memory::delete_memory_op; + using setmem_real_h_op = psi::memory::set_memory_op; + + using syncmem_var_h2d_op = psi::memory::synchronize_memory_op; + using syncmem_var_d2h_op = psi::memory::synchronize_memory_op; + using syncmem_complex_op = psi::memory::synchronize_memory_op; + using castmem_complex_op = psi::memory::cast_memory_op, T, Device, Device>; + using syncmem_h2d_op = psi::memory::synchronize_memory_op; + using syncmem_d2h_op = psi::memory::synchronize_memory_op; + + using hpsi_info = typename hamilt::Operator::hpsi_info; + + consts cs; + const T *one = nullptr, *zero = nullptr, *neg_one = nullptr; +}; + +template +int Diago_DavSubspace::PW_DIAG_NDIM = 4; + +} // namespace hsolver + +#endif \ No newline at end of file diff --git a/source/module_hsolver/diago_david.cpp b/source/module_hsolver/diago_david.cpp index 926f8e4e15..c3865724a3 100644 --- a/source/module_hsolver/diago_david.cpp +++ b/source/module_hsolver/diago_david.cpp @@ -17,19 +17,6 @@ using namespace hsolver; -inline double get_real(const double& x) -{ - return x; -} -inline double get_real(const std::complex& x) -{ - return x.real(); -} -inline float get_real(const std::complex& x) -{ - return x.real(); -} - template DiagoDavid::DiagoDavid(const Real* precondition_in) { this->device = psi::device::get_device_type(this->ctx); diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp index d1502d1106..759306c4ee 100644 --- a/source/module_hsolver/hsolver_pw.cpp +++ b/source/module_hsolver/hsolver_pw.cpp @@ -5,17 +5,19 @@ #include "diago_bpcg.h" #include "diago_cg.h" #include "diago_david.h" +#include "diago_dav_subspace.h" #include "module_base/timer.h" #include "module_base/tool_quit.h" #include "module_elecstate/elecstate_pw.h" +#include "module_hamilt_pw/hamilt_pwdft/global.h" #include "module_hamilt_pw/hamilt_pwdft/hamilt_pw.h" #include "module_hamilt_pw/hamilt_pwdft/wavefunc.h" #include "module_hsolver/diago_iter_assist.h" -#include "module_hamilt_pw/hamilt_pwdft/global.h" #ifdef USE_PAW #include "module_cell/module_paw/paw_cell.h" #endif -namespace hsolver { +namespace hsolver +{ template HSolverPW::HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in, wavefunc* pwf_in) @@ -26,17 +28,8 @@ HSolverPW::HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in, wavefunc* pw this->diag_ethr = GlobalV::PW_DIAG_THR; /*this->init(pbas_in);*/ } -/*void HSolverPW::init(const PW_Basis* pbas_in) -{ - this->pbas = pbas_in; - return; -} -void HSolverPW::update() -{ - return; -}*/ -template +template void HSolverPW::initDiagh(const psi::Psi& psi) { if (this->method == "cg") @@ -47,7 +40,8 @@ void HSolverPW::initDiagh(const psi::Psi& psi) { delete reinterpret_cast*>(this->pdiagh); } - else { + else + { return; } } @@ -55,36 +49,34 @@ void HSolverPW::initDiagh(const psi::Psi& psi) // warp the subspace_func into a lambda function auto ngk_pointer = psi.get_ngk_pointer(); auto subspace_func = [this, ngk_pointer](const ct::Tensor& psi_in, ct::Tensor& psi_out) { - // psi_in should be a 2D tensor: + // psi_in should be a 2D tensor: // psi_in.shape() = [nbands, nbasis] const auto ndim = psi_in.shape().ndim(); REQUIRES_OK(ndim == 2, "dims of psi_in should be less than or equal to 2"); // Convert a Tensor object to a psi::Psi object - auto psi_in_wrapper = psi::Psi( - psi_in.data(), 1, - psi_in.shape().dim_size(0), - psi_in.shape().dim_size(1), - ngk_pointer); - auto psi_out_wrapper = psi::Psi( - psi_out.data(), 1, - psi_out.shape().dim_size(0), - psi_out.shape().dim_size(1), - ngk_pointer); - auto eigen = ct::Tensor( - ct::DataTypeToEnum::value, - ct::DeviceType::CpuDevice, - ct::TensorShape({psi_in.shape().dim_size(0)})); - + auto psi_in_wrapper = psi::Psi(psi_in.data(), + 1, + psi_in.shape().dim_size(0), + psi_in.shape().dim_size(1), + ngk_pointer); + auto psi_out_wrapper = psi::Psi(psi_out.data(), + 1, + psi_out.shape().dim_size(0), + psi_out.shape().dim_size(1), + ngk_pointer); + auto eigen = ct::Tensor(ct::DataTypeToEnum::value, + ct::DeviceType::CpuDevice, + ct::TensorShape({psi_in.shape().dim_size(0)})); + DiagoIterAssist::diagH_subspace(hamilt_, psi_in_wrapper, psi_out_wrapper, eigen.data()); }; - this->pdiagh = new DiagoCG( - GlobalV::BASIS_TYPE, - GlobalV::CALCULATION, - DiagoIterAssist::need_subspace, - subspace_func, - DiagoIterAssist::PW_DIAG_THR, - DiagoIterAssist::PW_DIAG_NMAX, - GlobalV::NPROC_IN_POOL); + this->pdiagh = new DiagoCG(GlobalV::BASIS_TYPE, + GlobalV::CALCULATION, + DiagoIterAssist::need_subspace, + subspace_func, + DiagoIterAssist::PW_DIAG_THR, + DiagoIterAssist::PW_DIAG_NMAX, + GlobalV::NPROC_IN_POOL); this->pdiagh->method = this->method; } else if (this->method == "dav") @@ -101,20 +93,42 @@ void HSolverPW::initDiagh(const psi::Psi& psi) } else { - this->pdiagh = new DiagoDavid( precondition.data()); + this->pdiagh = new DiagoDavid(precondition.data()); + this->pdiagh->method = this->method; + } + } + else if (this->method == "dav_subspace") + { + Diago_DavSubspace::PW_DIAG_NDIM = GlobalV::PW_DIAG_NDIM; + if (this->pdiagh != nullptr) + { + if (this->pdiagh->method != this->method) + { + delete (Diago_DavSubspace*)this->pdiagh; + this->pdiagh = new Diago_DavSubspace(precondition.data()); + this->pdiagh->method = this->method; + } + } + else + { + this->pdiagh = new Diago_DavSubspace(precondition.data()); this->pdiagh->method = this->method; } } - else if (this->method == "bpcg") { - if(this->pdiagh!=nullptr) { - if(this->pdiagh->method != this->method) { + else if (this->method == "bpcg") + { + if (this->pdiagh != nullptr) + { + if (this->pdiagh->method != this->method) + { delete (DiagoBPCG*)this->pdiagh; this->pdiagh = new DiagoBPCG(precondition.data()); this->pdiagh->method = this->method; reinterpret_cast*>(this->pdiagh)->init_iter(psi); } } - else { + else + { this->pdiagh = new DiagoBPCG(precondition.data()); this->pdiagh->method = this->method; reinterpret_cast*>(this->pdiagh)->init_iter(psi); @@ -128,10 +142,10 @@ void HSolverPW::initDiagh(const psi::Psi& psi) template void HSolverPW::solve(hamilt::Hamilt* pHamilt, - psi::Psi& psi, - elecstate::ElecState* pes, - const std::string method_in, - const bool skip_charge) + psi::Psi& psi, + elecstate::ElecState* pes, + const std::string method_in, + const bool skip_charge) { ModuleBase::TITLE("HSolverPW", "solve"); ModuleBase::timer::tick("HSolverPW", "solve"); @@ -141,7 +155,27 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // select the method of diagonalization this->method = method_in; this->initDiagh(psi); + std::vector eigenvalues(pes->ekb.nr * pes->ekb.nc, 0); + + if (this->is_first_scf == true) + { + is_occupied.resize(psi.get_nk() * psi.get_nbands(), true); + } + else + { + for (size_t i = 0; i < psi.get_nk(); i++) + { + for (size_t j = 0; j < psi.get_nbands(); j++) + { + if (pes->wg(i, j) < 1.0) + { + is_occupied[i * psi.get_nbands() + j] = false; + } + } + } + } + /// Loop over k points for solve Hamiltonian to charge density for (int ik = 0; ik < this->wfc_basis->nks; ++ik) { @@ -149,13 +183,13 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, pHamilt->updateHk(ik); #ifdef USE_PAW - if(GlobalV::use_paw) + if (GlobalV::use_paw) { const int npw = this->wfc_basis->npwk[ik]; - ModuleBase::Vector3 *_gk = new ModuleBase::Vector3[npw]; - for (int ig = 0;ig < npw; ig++) + ModuleBase::Vector3* _gk = new ModuleBase::Vector3[npw]; + for (int ig = 0; ig < npw; ig++) { - _gk[ig] = this->wfc_basis->getgpluskcar(ik,ig); + _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig); } double* kpt; @@ -164,11 +198,11 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, kpt[1] = this->wfc_basis->kvec_c[ik].y; kpt[2] = this->wfc_basis->kvec_c[ik].z; - double ** kpg; - double ** gcar; + double** kpg; + double** gcar; kpg = new double*[npw]; gcar = new double*[npw]; - for(int ipw=0;ipw::solve(hamilt::Hamilt* pHamilt, kpg[ipw][2] = _gk[ipw].z; gcar[ipw] = new double[3]; - gcar[ipw][0] = this->wfc_basis->getgcar(ik,ipw).x; - gcar[ipw][1] = this->wfc_basis->getgcar(ik,ipw).y; - gcar[ipw][2] = this->wfc_basis->getgcar(ik,ipw).z; + gcar[ipw][0] = this->wfc_basis->getgcar(ik, ipw).x; + gcar[ipw][1] = this->wfc_basis->getgcar(ik, ipw).y; + gcar[ipw][2] = this->wfc_basis->getgcar(ik, ipw).z; } - GlobalC::paw_cell.set_paw_k(npw,wfc_basis->npwk_max,kpt, - this->wfc_basis->get_ig2ix(ik).data(), - this->wfc_basis->get_ig2iy(ik).data(), - this->wfc_basis->get_ig2iz(ik).data(), - (const double **) kpg,GlobalC::ucell.tpiba,(const double **) gcar); + GlobalC::paw_cell.set_paw_k(npw, + wfc_basis->npwk_max, + kpt, + this->wfc_basis->get_ig2ix(ik).data(), + this->wfc_basis->get_ig2iy(ik).data(), + this->wfc_basis->get_ig2iz(ik).data(), + (const double**)kpg, + GlobalC::ucell.tpiba, + (const double**)gcar); delete[] kpt; - for(int ipw = 0; ipw < npw; ipw++) + for (int ipw = 0; ipw < npw; ipw++) { delete[] kpg[ipw]; delete[] gcar[ipw]; @@ -213,19 +251,24 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, /// solve eigenvector and eigenvalue for H(k) this->hamiltSolvePsiK(pHamilt, psi, eigenvalues.data() + ik * pes->ekb.nc); - if(skip_charge) + + if (skip_charge) { - GlobalV::ofs_running<< "Average iterative diagonalization steps for k-points "<::avg_iter - <<" ; where current threshold is: "<::PW_DIAG_THR<<" . "<::avg_iter + << " ; where current threshold is: " << DiagoIterAssist::PW_DIAG_THR + << " . " << std::endl; DiagoIterAssist::avg_iter = 0.0; } /// calculate the contribution of Psi for charge density rho } castmem_2d_2h_op()(cpu_ctx, cpu_ctx, pes->ekb.c, eigenvalues.data(), pes->ekb.nr * pes->ekb.nc); + this->is_first_scf = false; + this->endDiagh(); - if(skip_charge) + if (skip_charge) { ModuleBase::timer::tick("HSolverPW", "solve"); return; @@ -233,9 +276,9 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, reinterpret_cast*>(pes)->psiToRho(psi); #ifdef USE_PAW - if(GlobalV::use_paw) + if (GlobalV::use_paw) { - if(typeid(Real) != typeid(double)) + if (typeid(Real) != typeid(double)) { ModuleBase::WARNING_QUIT("HSolverPW::solve", "PAW is only supported for double precision!"); } @@ -244,10 +287,10 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, for (int ik = 0; ik < this->wfc_basis->nks; ++ik) { const int npw = this->wfc_basis->npwk[ik]; - ModuleBase::Vector3 *_gk = new ModuleBase::Vector3[npw]; - for (int ig = 0;ig < npw; ig++) + ModuleBase::Vector3* _gk = new ModuleBase::Vector3[npw]; + for (int ig = 0; ig < npw; ig++) { - _gk[ig] = this->wfc_basis->getgpluskcar(ik,ig); + _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig); } double* kpt; @@ -256,11 +299,11 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, kpt[1] = this->wfc_basis->kvec_c[ik].y; kpt[2] = this->wfc_basis->kvec_c[ik].z; - double ** kpg; - double ** gcar; + double** kpg; + double** gcar; kpg = new double*[npw]; gcar = new double*[npw]; - for(int ipw=0;ipw::solve(hamilt::Hamilt* pHamilt, kpg[ipw][2] = _gk[ipw].z; gcar[ipw] = new double[3]; - gcar[ipw][0] = this->wfc_basis->getgcar(ik,ipw).x; - gcar[ipw][1] = this->wfc_basis->getgcar(ik,ipw).y; - gcar[ipw][2] = this->wfc_basis->getgcar(ik,ipw).z; + gcar[ipw][0] = this->wfc_basis->getgcar(ik, ipw).x; + gcar[ipw][1] = this->wfc_basis->getgcar(ik, ipw).y; + gcar[ipw][2] = this->wfc_basis->getgcar(ik, ipw).z; } - GlobalC::paw_cell.set_paw_k(npw,wfc_basis->npwk_max,kpt, - this->wfc_basis->get_ig2ix(ik).data(), - this->wfc_basis->get_ig2iy(ik).data(), - this->wfc_basis->get_ig2iz(ik).data(), - (const double **) kpg,GlobalC::ucell.tpiba,(const double **) gcar); + GlobalC::paw_cell.set_paw_k(npw, + wfc_basis->npwk_max, + kpt, + this->wfc_basis->get_ig2ix(ik).data(), + this->wfc_basis->get_ig2iy(ik).data(), + this->wfc_basis->get_ig2iz(ik).data(), + (const double**)kpg, + GlobalC::ucell.tpiba, + (const double**)gcar); delete[] kpt; - for(int ipw = 0; ipw < npw; ipw++) + for (int ipw = 0; ipw < npw; ipw++) { delete[] kpg[ipw]; delete[] gcar[ipw]; @@ -289,13 +336,14 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, delete[] gcar; GlobalC::paw_cell.get_vkb(); - + psi.fix_k(ik); GlobalC::paw_cell.set_currentk(ik); int nbands = psi.get_nbands(); - for(int ib = 0; ib < nbands; ib ++) + for (int ib = 0; ib < nbands; ib++) { - GlobalC::paw_cell.accumulate_rhoij(reinterpret_cast*> (psi.get_pointer(ib)), pes->wg(ik,ib)); + GlobalC::paw_cell.accumulate_rhoij(reinterpret_cast*>(psi.get_pointer(ib)), + pes->wg(ik, ib)); } } @@ -304,41 +352,49 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, std::vector nrhoijsel; #ifdef __MPI - if(GlobalV::RANK_IN_POOL == 0) + if (GlobalV::RANK_IN_POOL == 0) { GlobalC::paw_cell.get_rhoijp(rhoijp, rhoijselect, nrhoijsel); - for(int iat = 0; iat < GlobalC::ucell.nat; iat ++) + for (int iat = 0; iat < GlobalC::ucell.nat; iat++) { - GlobalC::paw_cell.set_rhoij(iat,nrhoijsel[iat],rhoijselect[iat].size(),rhoijselect[iat].data(),rhoijp[iat].data()); - } + GlobalC::paw_cell.set_rhoij(iat, + nrhoijsel[iat], + rhoijselect[iat].size(), + rhoijselect[iat].data(), + rhoijp[iat].data()); + } } #else GlobalC::paw_cell.get_rhoijp(rhoijp, rhoijselect, nrhoijsel); - for(int iat = 0; iat < GlobalC::ucell.nat; iat ++) + for (int iat = 0; iat < GlobalC::ucell.nat; iat++) { - GlobalC::paw_cell.set_rhoij(iat,nrhoijsel[iat],rhoijselect[iat].size(),rhoijselect[iat].data(),rhoijp[iat].data()); + GlobalC::paw_cell.set_rhoij(iat, + nrhoijsel[iat], + rhoijselect[iat].size(), + rhoijselect[iat].data(), + rhoijp[iat].data()); } #endif double* nhatgr; - GlobalC::paw_cell.get_nhat(pes->charge->nhat,nhatgr); + GlobalC::paw_cell.get_nhat(pes->charge->nhat, nhatgr); } #endif ModuleBase::timer::tick("HSolverPW", "solve"); return; } -/* +/* lcao_in_pw */ template void HSolverPW::solve(hamilt::Hamilt* pHamilt, // ESolver_KS_PW::p_hamilt - psi::Psi& psi, // ESolver_KS_PW::kspw_psi - elecstate::ElecState* pes, // ESolver_KS_PW::pes - psi::Psi& transform, - const bool skip_charge) + psi::Psi& psi, // ESolver_KS_PW::kspw_psi + elecstate::ElecState* pes, // ESolver_KS_PW::pes + psi::Psi& transform, + const bool skip_charge) { ModuleBase::TITLE("HSolverPW", "solve"); ModuleBase::timer::tick("HSolverPW", "solve"); @@ -348,13 +404,13 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // ESolver_ /// update H(k) for each k point pHamilt->updateHk(ik); #ifdef USE_PAW - if(GlobalV::use_paw) + if (GlobalV::use_paw) { const int npw = this->wfc_basis->npwk[ik]; - ModuleBase::Vector3 *_gk = new ModuleBase::Vector3[npw]; - for (int ig = 0;ig < npw; ig++) + ModuleBase::Vector3* _gk = new ModuleBase::Vector3[npw]; + for (int ig = 0; ig < npw; ig++) { - _gk[ig] = this->wfc_basis->getgpluskcar(ik,ig); + _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig); } double* kpt; @@ -363,11 +419,11 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // ESolver_ kpt[1] = this->wfc_basis->kvec_c[ik].y; kpt[2] = this->wfc_basis->kvec_c[ik].z; - double ** kpg; - double ** gcar; + double** kpg; + double** gcar; kpg = new double*[npw]; gcar = new double*[npw]; - for(int ipw=0;ipw::solve(hamilt::Hamilt* pHamilt, // ESolver_ kpg[ipw][2] = _gk[ipw].z; gcar[ipw] = new double[3]; - gcar[ipw][0] = this->wfc_basis->getgcar(ik,ipw).x; - gcar[ipw][1] = this->wfc_basis->getgcar(ik,ipw).y; - gcar[ipw][2] = this->wfc_basis->getgcar(ik,ipw).z; + gcar[ipw][0] = this->wfc_basis->getgcar(ik, ipw).x; + gcar[ipw][1] = this->wfc_basis->getgcar(ik, ipw).y; + gcar[ipw][2] = this->wfc_basis->getgcar(ik, ipw).z; } - GlobalC::paw_cell.set_paw_k(npw,wfc_basis->npwk_max,kpt, - this->wfc_basis->get_ig2ix(ik).data(), - this->wfc_basis->get_ig2iy(ik).data(), - this->wfc_basis->get_ig2iz(ik).data(), - (const double **) kpg,GlobalC::ucell.tpiba,(const double **) gcar); + GlobalC::paw_cell.set_paw_k(npw, + wfc_basis->npwk_max, + kpt, + this->wfc_basis->get_ig2ix(ik).data(), + this->wfc_basis->get_ig2iy(ik).data(), + this->wfc_basis->get_ig2iz(ik).data(), + (const double**)kpg, + GlobalC::ucell.tpiba, + (const double**)gcar); delete[] kpt; - for(int ipw = 0; ipw < npw; ipw++) + for (int ipw = 0; ipw < npw; ipw++) { delete[] kpg[ipw]; delete[] gcar[ipw]; @@ -404,35 +464,36 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // ESolver_ transform.fix_k(ik); /// solve eigenvector and eigenvalue for H(k) - //hsolver::DiagoIterAssist::diagH_subspace( - // pHamilt, // interface to hamilt - // transform, // transform matrix between lcao and pw - // psi, // psi in pw basis - // eigenvalues.data() + ik * pes->ekb.nc, // eigenvalues - // psi.get_nbands() // number of the lowest energies bands - // ); + // hsolver::DiagoIterAssist::diagH_subspace( + // pHamilt, // interface to hamilt + // transform, // transform matrix between lcao and pw + // psi, // psi in pw basis + // eigenvalues.data() + ik * pes->ekb.nc, // eigenvalues + // psi.get_nbands() // number of the lowest energies bands + // ); - hsolver::DiagoIterAssist::diagH_subspace_init( - pHamilt, // interface to hamilt - transform.get_pointer(), // transform matrix between lcao and pw + pHamilt, // interface to hamilt + transform.get_pointer(), // transform matrix between lcao and pw transform.get_nbands(), transform.get_nbasis(), - psi, // psi in pw basis - eigenvalues.data() + ik * pes->ekb.nc // eigenvalues - ); - - if(skip_charge) + psi, // psi in pw basis + eigenvalues.data() + ik * pes->ekb.nc // eigenvalues + ); + + if (skip_charge) { - GlobalV::ofs_running<< "Average iterative diagonalization steps for k-points "<::avg_iter - <<" ; where current threshold is: "<::PW_DIAG_THR<<" . "<::avg_iter + << " ; where current threshold is: " << DiagoIterAssist::PW_DIAG_THR + << " . " << std::endl; DiagoIterAssist::avg_iter = 0.0; } /// calculate the contribution of Psi for charge density rho - } + } castmem_2d_2h_op()(cpu_ctx, cpu_ctx, pes->ekb.c, eigenvalues.data(), pes->ekb.nr * pes->ekb.nc); - if(skip_charge) + if (skip_charge) { ModuleBase::timer::tick("HSolverPW", "solve"); return; @@ -440,9 +501,9 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // ESolver_ reinterpret_cast*>(pes)->psiToRho(psi); #ifdef USE_PAW - if(GlobalV::use_paw) + if (GlobalV::use_paw) { - if(typeid(Real) != typeid(double)) + if (typeid(Real) != typeid(double)) { ModuleBase::WARNING_QUIT("HSolverPW::solve", "PAW is only supported for double precision!"); } @@ -451,10 +512,10 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // ESolver_ for (int ik = 0; ik < this->wfc_basis->nks; ++ik) { const int npw = this->wfc_basis->npwk[ik]; - ModuleBase::Vector3 *_gk = new ModuleBase::Vector3[npw]; - for (int ig = 0;ig < npw; ig++) + ModuleBase::Vector3* _gk = new ModuleBase::Vector3[npw]; + for (int ig = 0; ig < npw; ig++) { - _gk[ig] = this->wfc_basis->getgpluskcar(ik,ig); + _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig); } double* kpt; @@ -463,11 +524,11 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // ESolver_ kpt[1] = this->wfc_basis->kvec_c[ik].y; kpt[2] = this->wfc_basis->kvec_c[ik].z; - double ** kpg; - double ** gcar; + double** kpg; + double** gcar; kpg = new double*[npw]; gcar = new double*[npw]; - for(int ipw=0;ipw::solve(hamilt::Hamilt* pHamilt, // ESolver_ kpg[ipw][2] = _gk[ipw].z; gcar[ipw] = new double[3]; - gcar[ipw][0] = this->wfc_basis->getgcar(ik,ipw).x; - gcar[ipw][1] = this->wfc_basis->getgcar(ik,ipw).y; - gcar[ipw][2] = this->wfc_basis->getgcar(ik,ipw).z; + gcar[ipw][0] = this->wfc_basis->getgcar(ik, ipw).x; + gcar[ipw][1] = this->wfc_basis->getgcar(ik, ipw).y; + gcar[ipw][2] = this->wfc_basis->getgcar(ik, ipw).z; } - GlobalC::paw_cell.set_paw_k(npw,wfc_basis->npwk_max,kpt, - this->wfc_basis->get_ig2ix(ik).data(), - this->wfc_basis->get_ig2iy(ik).data(), - this->wfc_basis->get_ig2iz(ik).data(), - (const double **) kpg,GlobalC::ucell.tpiba,(const double **) gcar); + GlobalC::paw_cell.set_paw_k(npw, + wfc_basis->npwk_max, + kpt, + this->wfc_basis->get_ig2ix(ik).data(), + this->wfc_basis->get_ig2iy(ik).data(), + this->wfc_basis->get_ig2iz(ik).data(), + (const double**)kpg, + GlobalC::ucell.tpiba, + (const double**)gcar); delete[] kpt; - for(int ipw = 0; ipw < npw; ipw++) + for (int ipw = 0; ipw < npw; ipw++) { delete[] kpg[ipw]; delete[] gcar[ipw]; @@ -496,13 +561,14 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // ESolver_ delete[] gcar; GlobalC::paw_cell.get_vkb(); - + psi.fix_k(ik); GlobalC::paw_cell.set_currentk(ik); int nbands = psi.get_nbands(); - for(int ib = 0; ib < nbands; ib ++) + for (int ib = 0; ib < nbands; ib++) { - GlobalC::paw_cell.accumulate_rhoij(reinterpret_cast*> (psi.get_pointer(ib)), pes->wg(ik,ib)); + GlobalC::paw_cell.accumulate_rhoij(reinterpret_cast*>(psi.get_pointer(ib)), + pes->wg(ik, ib)); } } @@ -511,82 +577,98 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // ESolver_ std::vector nrhoijsel; #ifdef __MPI - if(GlobalV::RANK_IN_POOL == 0) + if (GlobalV::RANK_IN_POOL == 0) { GlobalC::paw_cell.get_rhoijp(rhoijp, rhoijselect, nrhoijsel); - for(int iat = 0; iat < GlobalC::ucell.nat; iat ++) + for (int iat = 0; iat < GlobalC::ucell.nat; iat++) { - GlobalC::paw_cell.set_rhoij(iat,nrhoijsel[iat],rhoijselect[iat].size(),rhoijselect[iat].data(),rhoijp[iat].data()); - } + GlobalC::paw_cell.set_rhoij(iat, + nrhoijsel[iat], + rhoijselect[iat].size(), + rhoijselect[iat].data(), + rhoijp[iat].data()); + } } #else GlobalC::paw_cell.get_rhoijp(rhoijp, rhoijselect, nrhoijsel); - for(int iat = 0; iat < GlobalC::ucell.nat; iat ++) + for (int iat = 0; iat < GlobalC::ucell.nat; iat++) { - GlobalC::paw_cell.set_rhoij(iat,nrhoijsel[iat],rhoijselect[iat].size(),rhoijselect[iat].data(),rhoijp[iat].data()); + GlobalC::paw_cell.set_rhoij(iat, + nrhoijsel[iat], + rhoijselect[iat].size(), + rhoijselect[iat].data(), + rhoijp[iat].data()); } #endif double* nhatgr; - GlobalC::paw_cell.get_nhat(pes->charge->nhat,nhatgr); + GlobalC::paw_cell.get_nhat(pes->charge->nhat, nhatgr); } #endif ModuleBase::timer::tick("HSolverPW", "solve"); return; } -template +template void HSolverPW::endDiagh() { // DiagoCG would keep 9*nbasis memory in cache during loop-k // it should be deleted before calculating charge - if(this->method == "cg") + if (this->method == "cg") { delete reinterpret_cast*>(this->pdiagh); this->pdiagh = nullptr; } - if(this->method == "dav") + if (this->method == "dav") { delete (DiagoDavid*)this->pdiagh; this->pdiagh = nullptr; } - if(this->method == "bpcg") + if (this->method == "dav_subspace") + { + delete (Diago_DavSubspace*)this->pdiagh; + this->pdiagh = nullptr; + } + if (this->method == "bpcg") { delete (DiagoBPCG*)this->pdiagh; this->pdiagh = nullptr; } - //in PW base, average iteration steps for each band and k-point should be printing - if(DiagoIterAssist::avg_iter > 0.0) + // in PW base, average iteration steps for each band and k-point should be printing + if (DiagoIterAssist::avg_iter > 0.0) { - GlobalV::ofs_running<< "Average iterative diagonalization steps: "<::avg_iter / this->wfc_basis->nks - <<" ; where current threshold is: "<::PW_DIAG_THR<<" . "<::avg_iter / this->wfc_basis->nks + << " ; where current threshold is: " << DiagoIterAssist::PW_DIAG_THR << " . " + << std::endl; + + // std::cout << "avg_iter == " << DiagoIterAssist::avg_iter << std::endl; + + // reset avg_iter DiagoIterAssist::avg_iter = 0.0; } - //psi only should be initialed once for PW - if(!this->initialed_psi) + // psi only should be initialed once for PW + if (!this->initialed_psi) { this->initialed_psi = true; } } template -void HSolverPW::updatePsiK(hamilt::Hamilt* pHamilt, - psi::Psi& psi, - const int ik) +void HSolverPW::updatePsiK(hamilt::Hamilt* pHamilt, psi::Psi& psi, const int ik) { psi.fix_k(ik); - if(GlobalV::psi_initializer) // new psi initialization method branch + if (GlobalV::psi_initializer) // new psi initialization method branch { // do nothing here, because we have already initialize, allocate and make initial guess // basis_type lcao_in_pw function may be inserted here } - else if(!this->initialed_psi) // old psi initialization method branch + else if (!this->initialed_psi) // old psi initialization method branch { - if(GlobalV::BASIS_TYPE=="pw") + if (GlobalV::BASIS_TYPE == "pw") { hamilt::diago_PAO_in_pw_k2(this->ctx, ik, psi, this->wfc_basis, this->pwf, pHamilt); } @@ -594,11 +676,20 @@ void HSolverPW::updatePsiK(hamilt::Hamilt* pHamilt, } } -template +template void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, psi::Psi& psi, Real* eigenvalue) { - if (this->method != "cg") { - this->pdiagh->diag(hm, psi, eigenvalue); + if (this->method != "cg") + { + if (this->method == "dav_subspace") + { + + ((Diago_DavSubspace*)this->pdiagh)->diag(hm, psi, eigenvalue, is_occupied); + } + else + { + this->pdiagh->diag(hm, psi, eigenvalue); + } return; } // warp the hpsi_func and spsi_func into a lambda function @@ -608,16 +699,16 @@ void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, psi::P auto ngk_pointer = psi.get_ngk_pointer(); auto hpsi_func = [hm, ngk_pointer](const ct::Tensor& psi_in, ct::Tensor& hpsi_out) { ModuleBase::timer::tick("DiagoCG_New", "hpsi_func"); - // psi_in should be a 2D tensor: + // psi_in should be a 2D tensor: // psi_in.shape() = [nbands, nbasis] const auto ndim = psi_in.shape().ndim(); REQUIRES_OK(ndim <= 2, "dims of psi_in should be less than or equal to 2"); // Convert a Tensor object to a psi::Psi object - auto psi_wrapper = psi::Psi( - psi_in.data(), 1, - ndim == 1 ? 1 : psi_in.shape().dim_size(0), - ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1), - ngk_pointer); + auto psi_wrapper = psi::Psi(psi_in.data(), + 1, + ndim == 1 ? 1 : psi_in.shape().dim_size(0), + ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1), + ngk_pointer); psi::Range all_bands_range(true, psi_wrapper.get_current_k(), 0, psi_wrapper.get_nbands() - 1); using hpsi_info = typename hamilt::Operator::hpsi_info; hpsi_info info(&psi_wrapper, all_bands_range, hpsi_out.data()); @@ -626,7 +717,7 @@ void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, psi::P }; auto spsi_func = [this, hm](const ct::Tensor& psi_in, ct::Tensor& spsi_out) { ModuleBase::timer::tick("DiagoCG_New", "spsi_func"); - // psi_in should be a 2D tensor: + // psi_in should be a 2D tensor: // psi_in.shape() = [nbands, nbasis] const auto ndim = psi_in.shape().ndim(); REQUIRES_OK(ndim <= 2, "dims of psi_in should be less than or equal to 2"); @@ -634,11 +725,13 @@ void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, psi::P if (GlobalV::use_uspp) { // Convert a Tensor object to a psi::Psi object - hm->sPsi(psi_in.data(), spsi_out.data(), - ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1), - ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1), - ndim == 1 ? 1 : psi_in.shape().dim_size(0)); - } else + hm->sPsi(psi_in.data(), + spsi_out.data(), + ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1), + ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1), + ndim == 1 ? 1 : psi_in.shape().dim_size(0)); + } + else { psi::memory::synchronize_memory_op()( this->ctx, @@ -648,32 +741,32 @@ void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, psi::P static_cast((ndim == 1 ? 1 : psi_in.shape().dim_size(0)) * (ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1)))); } - + ModuleBase::timer::tick("DiagoCG_New", "spsi_func"); }; - auto psi_tensor = ct::TensorMap( - psi.get_pointer(), - ct::DataTypeToEnum::value, - ct::DeviceTypeToEnum::value, - ct::TensorShape({psi.get_nbands(), psi.get_nbasis()})).slice({0, 0}, {psi.get_nbands(), psi.get_current_nbas()}); - auto eigen_tensor = ct::TensorMap( - eigenvalue, - ct::DataTypeToEnum::value, - ct::DeviceTypeToEnum::value, - ct::TensorShape({psi.get_nbands()})); - auto prec_tensor = ct::TensorMap( - precondition.data(), - ct::DataTypeToEnum::value, - ct::DeviceTypeToEnum::value, - ct::TensorShape({static_cast(precondition.size())})).to_device().slice({0}, {psi.get_current_nbas()}); - + auto psi_tensor = ct::TensorMap(psi.get_pointer(), + ct::DataTypeToEnum::value, + ct::DeviceTypeToEnum::value, + ct::TensorShape({psi.get_nbands(), psi.get_nbasis()})) + .slice({0, 0}, {psi.get_nbands(), psi.get_current_nbas()}); + auto eigen_tensor = ct::TensorMap(eigenvalue, + ct::DataTypeToEnum::value, + ct::DeviceTypeToEnum::value, + ct::TensorShape({psi.get_nbands()})); + auto prec_tensor = ct::TensorMap(precondition.data(), + ct::DataTypeToEnum::value, + ct::DeviceTypeToEnum::value, + ct::TensorShape({static_cast(precondition.size())})) + .to_device() + .slice({0}, {psi.get_current_nbas()}); + cg->diag(hpsi_func, spsi_func, psi_tensor, eigen_tensor, prec_tensor); // TODO: Double check tensormap's potential problem ct::TensorMap(psi.get_pointer(), psi_tensor, {psi.get_nbands(), psi.get_nbasis()}).sync(psi_tensor); } -template -void HSolverPW::update_precondition(std::vector &h_diag, const int ik, const int npw) +template +void HSolverPW::update_precondition(std::vector& h_diag, const int ik, const int npw) { h_diag.assign(h_diag.size(), 1.0); int precondition_type = 2; @@ -688,7 +781,7 @@ void HSolverPW::update_precondition(std::vector &h_diag, const { for (int ig = 0; ig < npw; ig++) { - Real g2kin = static_cast(this->wfc_basis->getgk2(ik,ig)) * tpiba2; + Real g2kin = static_cast(this->wfc_basis->getgk2(ik, ig)) * tpiba2; h_diag[ig] = std::max(static_cast(1.0), g2kin); } } @@ -696,30 +789,38 @@ void HSolverPW::update_precondition(std::vector &h_diag, const { for (int ig = 0; ig < npw; ig++) { - Real g2kin = static_cast(this->wfc_basis->getgk2(ik,ig)) * tpiba2; - h_diag[ig] = 1 + g2kin + sqrt(1 + (g2kin - 1) * (g2kin - 1)); + Real g2kin = static_cast(this->wfc_basis->getgk2(ik, ig)) * tpiba2; + + if (this->method == "dav_subspace") + { + h_diag[ig] = g2kin; + } + else + { + h_diag[ig] = 1 + g2kin + sqrt(1 + (g2kin - 1) * (g2kin - 1)); + } } } - if(GlobalV::NSPIN==4) + if (GlobalV::NSPIN == 4) { const int size = h_diag.size(); for (int ig = 0; ig < npw; ig++) { - h_diag[ig+size/2] = h_diag[ig]; + h_diag[ig + size / 2] = h_diag[ig]; } } } -template +template typename HSolverPW::Real HSolverPW::cal_hsolerror() { return this->diag_ethr * static_cast(std::max(1.0, GlobalV::nelec)); } -template +template typename HSolverPW::Real HSolverPW::set_diagethr(const int istep, const int iter, const Real drho) { - //It is too complex now and should be modified. + // It is too complex now and should be modified. if (iter == 1) { if (std::abs(this->diag_ethr - 1.0e-2) < 1.0e-6) @@ -754,19 +855,24 @@ typename HSolverPW::Real HSolverPW::set_diagethr(const int { this->diag_ethr = 1.e-2; } - this->diag_ethr = std::min(this->diag_ethr, static_cast(0.1) * drho / std::max(static_cast(1.0), static_cast(GlobalV::nelec))); + this->diag_ethr = std::min(this->diag_ethr, + static_cast(0.1) * drho + / std::max(static_cast(1.0), static_cast(GlobalV::nelec))); } // It is essential for single precision implementation to keep the diag_ethr value // less or equal to the single-precision limit of convergence(0.5e-4). // modified by denghuilu at 2023-05-15 - if (GlobalV::precision_flag == "single") { + if (GlobalV::precision_flag == "single") + { this->diag_ethr = std::max(this->diag_ethr, static_cast(0.5e-4)); } return this->diag_ethr; } -template -typename HSolverPW::Real HSolverPW::reset_diagethr(std::ofstream& ofs_running, const Real hsover_error, const Real drho) +template +typename HSolverPW::Real HSolverPW::reset_diagethr(std::ofstream& ofs_running, + const Real hsover_error, + const Real drho) { ofs_running << " Notice: Threshold on eigenvalues was too large.\n"; ModuleBase::WARNING("scf", "Threshold on eigenvalues was too large."); @@ -776,7 +882,8 @@ typename HSolverPW::Real HSolverPW::reset_diagethr(std::of // It is essential for single precision implementation to keep the diag_ethr value // less or equal to the single-precision limit of convergence(0.5e-4). // modified by denghuilu at 2023-05-15 - if (GlobalV::precision_flag == "single") { + if (GlobalV::precision_flag == "single") + { this->diag_ethr = std::max(this->diag_ethr, static_cast(0.5e-4)); } ofs_running << " New diag_ethr = " << this->diag_ethr << std::endl; diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h index a6aedf1cff..afc19ac65c 100644 --- a/source/module_hsolver/hsolver_pw.h +++ b/source/module_hsolver/hsolver_pw.h @@ -12,6 +12,7 @@ template class HSolverPW: public HSolver { private: + bool is_first_scf = true; // Note GetTypeReal::type will // return T if T is real type(float, double), // otherwise return the real type of T(complex, complex) @@ -67,6 +68,8 @@ class HSolverPW: public HSolver void update_precondition(std::vector &h_diag, const int ik, const int npw); std::vector precondition; + std::vector eigenvalues; + std::vector is_occupied; bool initialed_psi = false; diff --git a/source/module_hsolver/kernels/cuda/dngvd_op.cu b/source/module_hsolver/kernels/cuda/dngvd_op.cu index f22fe4c7f4..cc9d606dd0 100644 --- a/source/module_hsolver/kernels/cuda/dngvd_op.cu +++ b/source/module_hsolver/kernels/cuda/dngvd_op.cu @@ -243,14 +243,36 @@ struct dnevx_op { } }; +template +struct dngvx_op +{ + using Real = typename GetTypeReal::type; + void operator()(const psi::DEVICE_GPU* d, + const int nbase, + const int ldh, + T* hcc, + T* scc, + const int m, + Real* eigenvalue, + T* vcc) + { + + } +}; + + template struct dngvd_op, psi::DEVICE_GPU>; template struct dnevx_op, psi::DEVICE_GPU>; +template struct dngvx_op, psi::DEVICE_GPU>; + template struct dngvd_op, psi::DEVICE_GPU>; template struct dnevx_op, psi::DEVICE_GPU>; +template struct dngvx_op, psi::DEVICE_GPU>; #ifdef __LCAO template struct dngvd_op; template struct dnevx_op; +template struct dngvx_op; #endif } // namespace hsolver \ No newline at end of file diff --git a/source/module_hsolver/kernels/dngvd_op.cpp b/source/module_hsolver/kernels/dngvd_op.cpp index c4d8193c6e..3280794ee2 100644 --- a/source/module_hsolver/kernels/dngvd_op.cpp +++ b/source/module_hsolver/kernels/dngvd_op.cpp @@ -2,34 +2,23 @@ #include -namespace hsolver { +namespace hsolver +{ - inline double get_real(const double& x) +template +struct dngvd_op +{ + using Real = typename GetTypeReal::type; + void operator()(const psi::DEVICE_CPU* d, + const int nstart, + const int ldh, + const T* hcc, + const T* scc, + Real* eigenvalue, + T* vcc) { - return x; - } - inline double get_real(const std::complex& x) - { - return x.real(); - } - inline float get_real(const std::complex& x) - { - return x.real(); - } - - template - struct dngvd_op { - using Real = typename GetTypeReal::type; - void operator()( - const psi::DEVICE_CPU* d, - const int nstart, - const int ldh, - const T* hcc, - const T* scc, - Real* eigenvalue, - T* vcc) - { - for (int i = 0; i < nstart * ldh; i++) { + for (int i = 0; i < nstart * ldh; i++) + { vcc[i] = hcc[i]; } int info = 0; @@ -42,23 +31,41 @@ namespace hsolver { ModuleBase::GlobalFunc::ZEROS(rwork, lrwork); int liwork = 3 + 5 * nstart; - int *iwork = new int[liwork]; + int* iwork = new int[liwork]; ModuleBase::GlobalFunc::ZEROS(iwork, liwork); //=========================== // calculate all eigenvalues //=========================== - LapackConnector::xhegvd(1, 'V', 'U', nstart, vcc, ldh, scc, ldh, eigenvalue, work, lwork, rwork, lrwork, iwork, liwork, info); + LapackConnector::xhegvd(1, + 'V', + 'U', + nstart, + vcc, + ldh, + scc, + ldh, + eigenvalue, + work, + lwork, + rwork, + lrwork, + iwork, + liwork, + info); - if (info != 0) { - std::cout << "Error: xhegvd failed, linear dependent basis functions\n" + if (info != 0) + { + std::cout << "Error: xhegvd failed, linear dependent basis functions\n" << ", wrong initialization of wavefunction, or wavefunction information loss\n" << ", output overlap matrix scc.txt to check\n" << std::endl; // print scc to file scc.txt std::ofstream ofs("scc.txt"); - for (int i = 0; i < nstart; i++) { - for (int j = 0; j < nstart; j++) { + for (int i = 0; i < nstart; i++) + { + for (int j = 0; j < nstart; j++) + { ofs << scc[i * ldh + j] << " "; } ofs << std::endl; @@ -73,21 +80,78 @@ namespace hsolver { } }; +template +struct dngv_op +{ + using Real = typename GetTypeReal::type; + void operator()(const psi::DEVICE_CPU* d, + const int nbase, + const int ldh, + const T* hcc, + T* scc, + Real* eigenvalue, + T* vcc) + { + for (int i = 0; i < nbase * ldh; i++) + { + vcc[i] = hcc[i]; + } + + int info = 0; + + int lwork = 2 * nbase - 1; + T* work = new T[lwork]; + ModuleBase::GlobalFunc::ZEROS(work, lwork); + + int lrwork = 3 * nbase - 2; + Real* rwork = new Real[lrwork]; + ModuleBase::GlobalFunc::ZEROS(rwork, lrwork); + + //=========================== + // calculate all eigenvalues + //=========================== + LapackConnector::xhegv(1, 'V', 'U', nbase, vcc, ldh, scc, ldh, eigenvalue, work, lwork, rwork, info); + + if (info != 0) + { + std::cout << "Error: xhegv failed, linear dependent basis functions\n" + << ", wrong initialization of wavefunction, or wavefunction information loss\n" + << ", output overlap matrix scc.txt to check\n" + << std::endl; + // print scc to file scc.txt + std::ofstream ofs("scc.txt"); + for (int i = 0; i < nbase; i++) + { + for (int j = 0; j < nbase; j++) + { + ofs << scc[i * ldh + j] << " "; + } + ofs << std::endl; + } + ofs.close(); + } + assert(0 == info); + + delete[] work; + delete[] rwork; + } +}; - template - struct dnevx_op { - using Real = typename GetTypeReal::type; - void operator()( - const psi::DEVICE_CPU* /*ctx*/, - const int nstart, - const int ldh, - const T* hcc, // hcc - const int nbands, // nbands - Real* eigenvalue, // eigenvalue - T* vcc) // vcc +template +struct dnevx_op +{ + using Real = typename GetTypeReal::type; + void operator()(const psi::DEVICE_CPU* /*ctx*/, + const int nstart, + const int ldh, + const T* hcc, // hcc + const int nbands, // nbands + Real* eigenvalue, // eigenvalue + T* vcc) // vcc { T* aux = new T[nstart * ldh]; - for (int ii = 0; ii < nstart * ldh; ii++) { + for (int ii = 0; ii < nstart * ldh; ii++) + { aux[ii] = hcc[ii]; } @@ -95,38 +159,38 @@ namespace hsolver { int lwork = -1; T* work = new T[1]; Real* rwork = new Real[7 * nstart]; - int *iwork = new int[5 * nstart]; - int *ifail = new int[nstart]; - + int* iwork = new int[5 * nstart]; + int* ifail = new int[nstart]; + // When lwork = -1, the demension of work will be assumed // Assume the denmension of work by output work[0] LapackConnector::xheevx( - 1, // ITYPE = 1: A*x = (lambda)*B*x - 'V', // JOBZ = 'V': Compute eigenvalues and eigenvectors. - 'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. - 'L', // UPLO = 'L': Lower triangles of A and B are stored. - nstart, // N = base - aux, // A is COMPLEX*16 array dimension (LDA, N) - ldh, // LDA = base - 0.0, // Not referenced if RANGE = 'A' or 'I'. - 0.0, // Not referenced if RANGE = 'A' or 'I'. - 1, // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N, - nbands, // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N, - 0.0, // ABSTOL - nbands, // M: The total number of eigenvalues found. 0 <= M <= N. if RANGE = 'I', M = IU-IL+1. + 1, // ITYPE = 1: A*x = (lambda)*B*x + 'V', // JOBZ = 'V': Compute eigenvalues and eigenvectors. + 'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. + 'L', // UPLO = 'L': Lower triangles of A and B are stored. + nstart, // N = base + aux, // A is COMPLEX*16 array dimension (LDA, N) + ldh, // LDA = base + 0.0, // Not referenced if RANGE = 'A' or 'I'. + 0.0, // Not referenced if RANGE = 'A' or 'I'. + 1, // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N, + nbands, // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N, + 0.0, // ABSTOL + nbands, // M: The total number of eigenvalues found. 0 <= M <= N. if RANGE = 'I', M = IU-IL+1. eigenvalue, // W store eigenvalues - vcc, // store eigenvector - ldh, // LDZ: The leading dimension of the array Z. + vcc, // store eigenvector + ldh, // LDZ: The leading dimension of the array Z. work, lwork, rwork, iwork, ifail, info); - - + lwork = int(get_real(work[0])); - delete[] work; work = new T[lwork]; + delete[] work; + work = new T[lwork]; // The A and B storage space is (nstart * ldh), and the data that really participates in the zhegvx // operation is (nstart * nstart). In this function, the data that A and B participate in the operation will @@ -135,22 +199,22 @@ namespace hsolver { // obtained by the zhegvx operation is (nstart * nstart) and stored in zux (internal to the function). When // the function is output, the data of zux will be mapped to the corresponding position of V. LapackConnector::xheevx( - 1, // ITYPE = 1: A*x = (lambda)*B*x - 'V', // JOBZ = 'V': Compute eigenvalues and eigenvectors. - 'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. - 'L', // UPLO = 'L': Lower triangles of A and B are stored. - nstart, // N = base - aux, // A is COMPLEX*16 array dimension (LDA, N) - ldh, // LDA = base - 0.0, // Not referenced if RANGE = 'A' or 'I'. - 0.0, // Not referenced if RANGE = 'A' or 'I'. - 1, // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N, - nbands, // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N, - 0.0, // ABSTOL - nbands, // M: The total number of eigenvalues found. 0 <= M <= N. if RANGE = 'I', M = IU-IL+1. + 1, // ITYPE = 1: A*x = (lambda)*B*x + 'V', // JOBZ = 'V': Compute eigenvalues and eigenvectors. + 'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. + 'L', // UPLO = 'L': Lower triangles of A and B are stored. + nstart, // N = base + aux, // A is COMPLEX*16 array dimension (LDA, N) + ldh, // LDA = base + 0.0, // Not referenced if RANGE = 'A' or 'I'. + 0.0, // Not referenced if RANGE = 'A' or 'I'. + 1, // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N, + nbands, // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N, + 0.0, // ABSTOL + nbands, // M: The total number of eigenvalues found. 0 <= M <= N. if RANGE = 'I', M = IU-IL+1. eigenvalue, // W store eigenvalues - vcc, // store eigenvector - ldh, // LDZ: The leading dimension of the array Z. + vcc, // store eigenvector + ldh, // LDZ: The leading dimension of the array Z. work, lwork, rwork, @@ -168,14 +232,108 @@ namespace hsolver { } }; +template +struct dngvx_op +{ + using Real = typename GetTypeReal::type; + void operator()(const psi::DEVICE_CPU* d, + const int nbase, + const int ldh, + T* hcc, + T* scc, + const int m, + Real* eigenvalue, + T* vcc) + { + + int info = 0; + + int mm = m; + + int lwork = -1; + + T* work = new T[1]; + Real* rwork = new Real[7 * nbase]; + int* iwork = new int[5 * nbase]; + int* ifail = new int[nbase]; + + LapackConnector::xhegvx( + 1, // ITYPE = 1: A*x = (lambda)*B*x + 'V', // JOBZ = 'V': Compute eigenvalues and eigenvectors. + 'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. + 'U', // UPLO = 'L': Lower triangles of A and B are stored. + nbase, // N = base + hcc, // A is COMPLEX*16 array dimension (LDA, N) + ldh, // LDA = base + scc, + ldh, + 0.0, // Not referenced if RANGE = 'A' or 'I'. + 0.0, // Not referenced if RANGE = 'A' or 'I'. + 1, // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N, + m, // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N, + 0.0, // ABSTOL + mm, // M: The total number of eigenvalues found. 0 <= M <= N. if RANGE = 'I', M = IU-IL+1. + eigenvalue, // W store eigenvalues + vcc, // store eigenvector + ldh, // LDZ: The leading dimension of the array Z. + work, + lwork, + rwork, + iwork, + ifail, + info); + + lwork = int(get_real(work[0])); + delete[] work; + work = new T[lwork]; + + LapackConnector::xhegvx(1, + 'V', + 'I', + 'U', + nbase, + hcc, + ldh, + scc, + ldh, + 0.0, + 0.0, + 1, + m, + 0.0, + mm, + eigenvalue, + vcc, + ldh, + work, + lwork, + rwork, + iwork, + ifail, + info); + + delete[] work; + delete[] rwork; + delete[] iwork; + delete[] ifail; + } +}; + +template struct dngvd_op, psi::DEVICE_CPU>; +template struct dngvd_op, psi::DEVICE_CPU>; + +template struct dnevx_op, psi::DEVICE_CPU>; +template struct dnevx_op, psi::DEVICE_CPU>; - template struct dngvd_op, psi::DEVICE_CPU>; - template struct dngvd_op, psi::DEVICE_CPU>; +template struct dngvx_op, psi::DEVICE_CPU>; +template struct dngvx_op, psi::DEVICE_CPU>; - template struct dnevx_op , psi::DEVICE_CPU >; - template struct dnevx_op, psi::DEVICE_CPU>; +template struct dngv_op, psi::DEVICE_CPU>; +template struct dngv_op, psi::DEVICE_CPU>; #ifdef __LCAO - template struct dngvd_op; - template struct dnevx_op ; +template struct dngvd_op; +template struct dnevx_op; +template struct dngvx_op; +template struct dngv_op; #endif } // namespace hsolver \ No newline at end of file diff --git a/source/module_hsolver/kernels/dngvd_op.h b/source/module_hsolver/kernels/dngvd_op.h index 23d5855087..72e0ae1521 100644 --- a/source/module_hsolver/kernels/dngvd_op.h +++ b/source/module_hsolver/kernels/dngvd_op.h @@ -3,24 +3,24 @@ #ifndef MODULE_HSOLVER_DNGVD_H #define MODULE_HSOLVER_DNGVD_H -#include "module_base/complexmatrix.h" -#include "module_base/lapack_connector.h" #include "math_kernel_op.h" -#include "module_psi/kernels/memory_op.h" +#include "module_base/lapack_connector.h" namespace hsolver { - template struct dngvd_op - { - using Real = typename GetTypeReal::type; - /// @brief DNGVD computes all the eigenvalues and eigenvectors of a complex generalized +template +struct dngvd_op +{ + using Real = typename GetTypeReal::type; + /// @brief DNGVD computes all the eigenvalues and eigenvectors of a complex generalized /// Hermitian-definite eigenproblem. If eigenvectors are desired, it uses a divide and conquer algorithm. /// /// In this op, the CPU version is implemented through the `gvd` interface, and the CUDA version /// is implemented through the `gvd` interface. /// API doc: - /// 1. zhegvd: https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_ga74fdf9b5a16c90d8b7a589dec5ca058a.html + /// 1. zhegvd: + /// https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_ga74fdf9b5a16c90d8b7a589dec5ca058a.html /// 2. cusolverDnZhegvd: https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-sygvd /// /// Input Parameters @@ -32,26 +32,56 @@ namespace hsolver /// Output Parameter /// @param W : calculated eigenvalues /// @param V : calculated eigenvectors (col major) - void operator()(const Device* d, - const int nstart, - const int ldh, - const T* A, - const T* B, - Real* W, - T* V); + void operator()(const Device* d, const int nstart, const int ldh, const T* A, const T* B, Real* W, T* V); }; +template +struct dngv_op +{ + using Real = typename GetTypeReal::type; + /// @brief DNGVX computes first m eigenvalues and eigenvectors of a complex generalized + /// Input Parameters + /// @param d : the type of device + /// @param nbase : the number of dim of the matrix + /// @param ldh : the number of dmx of the matrix + /// @param A : the hermitian matrix A in A x=lambda B x (col major) + /// @param B : the overlap matrix B in A x=lambda B x (col major) + /// Output Parameter + /// @param W : calculated eigenvalues + /// @param V : calculated eigenvectors (col major) + void operator()(const Device* d, const int nstart, const int ldh, const T* A, T* B, Real* W, T* V); +}; - template struct dnevx_op +template +struct dngvx_op { - using Real = typename GetTypeReal::type; + using Real = typename GetTypeReal::type; + /// @brief DNGVX computes first m eigenvalues and eigenvectors of a complex generalized + /// Input Parameters + /// @param d : the type of device + /// @param nbase : the number of dim of the matrix + /// @param ldh : the number of dmx of the matrix + /// @param A : the hermitian matrix A in A x=lambda B x (col major) + /// @param B : the overlap matrix B in A x=lambda B x (col major) + /// @param m : the number of eigenpair + /// Output Parameter + /// @param W : calculated eigenvalues + /// @param V : calculated eigenvectors (col major) + void operator()(const Device* d, const int nstart, const int ldh, T* A, T* B, const int m, Real* W, T* V); +}; + +template +struct dnevx_op +{ + using Real = typename GetTypeReal::type; /// @brief DNEVX computes the first m eigenvalues and their corresponding eigenvectors of /// a complex generalized Hermitian-definite eigenproblem /// /// In this op, the CPU version is implemented through the `evx` interface, and the CUDA version /// is implemented through the `evd` interface and acquires the first m eigenpairs. /// API doc: - /// 1. zheevx: https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_gaabef68a9c7b10df7aef8f4fec89fddbe.html + /// 1. zheevx: + /// https://netlib.org/lapack/explore-html/df/d9a/group__complex16_h_eeigen_gaabef68a9c7b10df7aef8f4fec89fddbe.html /// 2. cusolverDnZheevd: https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-syevd /// /// Input Parameters @@ -62,16 +92,9 @@ namespace hsolver /// Output Parameter /// @param W : calculated eigenvalues /// @param V : calculated eigenvectors (row major) - void operator()(const Device* d, - const int nstart, - const int ldh, - const T* A, - const int m, - Real* W, - T* V); + void operator()(const Device* d, const int nstart, const int ldh, const T* A, const int m, Real* W, T* V); }; - #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM void createGpuSolverHandle(); diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp index 597d9cd5ea..b494f57890 100644 --- a/source/module_hsolver/kernels/math_kernel_op.cpp +++ b/source/module_hsolver/kernels/math_kernel_op.cpp @@ -7,24 +7,26 @@ namespace hsolver { template -struct line_minimize_with_block_op { +struct line_minimize_with_block_op +{ using Real = typename GetTypeReal::type; - void operator() ( - T* grad_out, - T* hgrad_out, - T* psi_out, - T* hpsi_out, - const int &n_basis, - const int &n_basis_max, - const int &n_band) + void operator()(T* grad_out, + T* hgrad_out, + T* psi_out, + T* hpsi_out, + const int& n_basis, + const int& n_basis_max, + const int& n_band) { - for (int band_idx = 0; band_idx < n_band; band_idx++) { + for (int band_idx = 0; band_idx < n_band; band_idx++) + { Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; - auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); + auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); norm = 1.0 / sqrt(norm); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) + { auto item = band_idx * n_basis_max + basis_idx; grad_out[item] *= norm; hgrad_out[item] *= norm; @@ -32,12 +34,13 @@ struct line_minimize_with_block_op { epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item])); epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); } - theta = 0.5 * std::abs(std::atan(2 * epsilo_1/(epsilo_0 - epsilo_2))); + theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); cos_theta = std::cos(theta); sin_theta = std::sin(theta); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) + { auto item = band_idx * n_basis_max + basis_idx; - psi_out [item] = psi_out [item] * cos_theta + grad_out [item] * sin_theta; + psi_out[item] = psi_out[item] * cos_theta + grad_out[item] * sin_theta; hpsi_out[item] = hpsi_out[item] * cos_theta + hgrad_out[item] * sin_theta; } } @@ -45,43 +48,47 @@ struct line_minimize_with_block_op { }; template -struct calc_grad_with_block_op { +struct calc_grad_with_block_op +{ using Real = typename GetTypeReal::type; - void operator() ( - const Real* prec_in, - Real* err_out, - Real* beta_out, - T* psi_out, - T* hpsi_out, - T* grad_out, - T* grad_old_out, - const int &n_basis, - const int &n_basis_max, - const int &n_band) + void operator()(const Real* prec_in, + Real* err_out, + Real* beta_out, + T* psi_out, + T* hpsi_out, + T* grad_out, + T* grad_old_out, + const int& n_basis, + const int& n_basis_max, + const int& n_band) { - for (int band_idx = 0; band_idx < n_band; band_idx++) { + for (int band_idx = 0; band_idx < n_band; band_idx++) + { Real err = 0.0; Real beta = 0.0; Real epsilo = 0.0; Real grad_2 = {0.0}; T grad_1 = {0.0, 0.0}; - auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); + auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); norm = 1.0 / sqrt(norm); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) + { auto item = band_idx * n_basis_max + basis_idx; psi_out[item] *= norm; hpsi_out[item] *= norm; epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); } - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) + { auto item = band_idx * n_basis_max + basis_idx; grad_1 = hpsi_out[item] - epsilo * psi_out[item]; grad_2 = std::norm(grad_1); err += grad_2; beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? } - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) + { auto item = band_idx * n_basis_max + basis_idx; grad_1 = hpsi_out[item] - epsilo * psi_out[item]; grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item]; @@ -93,16 +100,17 @@ struct calc_grad_with_block_op { }; template -struct dot_real_op { - FPTYPE operator() ( - const psi::DEVICE_CPU* d, - const int& dim, - const FPTYPE* psi_L, - const FPTYPE* psi_R, - const bool reduce) +struct dot_real_op +{ + FPTYPE operator()(const psi::DEVICE_CPU* d, + const int& dim, + const FPTYPE* psi_L, + const FPTYPE* psi_R, + const bool reduce) { FPTYPE result = BlasConnector::dot(dim, psi_L, 1, psi_R, 1); - if (reduce) { + if (reduce) + { Parallel_Reduce::reduce_pool(result); } return result; @@ -110,39 +118,37 @@ struct dot_real_op { }; // CPU specialization of actual computation. -template -struct dot_real_op, psi::DEVICE_CPU> { - FPTYPE operator() ( - const psi::DEVICE_CPU* d, - const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) - { - //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - // qianrui modify 2021-3-14 - // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) - const FPTYPE* pL = reinterpret_cast(psi_L); - const FPTYPE* pR = reinterpret_cast(psi_R); - FPTYPE result = BlasConnector::dot(2 * dim, pL, 1, pR, 1); - if (reduce) { - Parallel_Reduce::reduce_pool(result); +template +struct dot_real_op, psi::DEVICE_CPU> +{ + FPTYPE operator()(const psi::DEVICE_CPU* d, + const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) + { + //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + // qianrui modify 2021-3-14 + // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) + const FPTYPE* pL = reinterpret_cast(psi_L); + const FPTYPE* pR = reinterpret_cast(psi_R); + FPTYPE result = BlasConnector::dot(2 * dim, pL, 1, pR, 1); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; } - return result; - } }; -template struct vector_div_constant_op +template +struct vector_div_constant_op { using Real = typename GetTypeReal::type; - void operator()(const psi::DEVICE_CPU* d, - const int dim, - T* result, - const T* vector, - const Real constant) + void operator()(const psi::DEVICE_CPU* d, const int dim, T* result, const T* vector, const Real constant) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096/sizeof(Real)) +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) #endif for (int i = 0; i < dim; i++) { @@ -151,17 +157,14 @@ template struct vector_div_constant_op } }; -template struct vector_mul_vector_op +template +struct vector_mul_vector_op { using Real = typename GetTypeReal::type; - void operator()(const psi::DEVICE_CPU* d, - const int& dim, - T* result, - const T* vector1, - const Real* vector2) + void operator()(const psi::DEVICE_CPU* d, const int& dim, T* result, const T* vector1, const Real* vector2) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096/sizeof(Real)) +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) #endif for (int i = 0; i < dim; i++) { @@ -170,17 +173,14 @@ template struct vector_mul_vector_op } }; -template struct vector_div_vector_op +template +struct vector_div_vector_op { using Real = typename GetTypeReal::type; - void operator()(const psi::DEVICE_CPU* d, - const int& dim, - T* result, - const T* vector1, - const Real* vector2) + void operator()(const psi::DEVICE_CPU* d, const int& dim, T* result, const T* vector1, const Real* vector2) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096/sizeof(Real)) +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) #endif for (int i = 0; i < dim; i++) { @@ -189,19 +189,20 @@ template struct vector_div_vector_op } }; -template struct constantvector_addORsub_constantVector_op +template +struct constantvector_addORsub_constantVector_op { using Real = typename GetTypeReal::type; void operator()(const psi::DEVICE_CPU* d, - const int& dim, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) + const int& dim, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192/sizeof(T)) +#pragma omp parallel for schedule(static, 8192 / sizeof(T)) #endif for (int i = 0; i < dim; i++) { @@ -211,87 +212,84 @@ template struct constantvector_addORsub_constantVector_op -struct scal_op { - void operator()( - const psi::DEVICE_CPU * /*ctx*/, - const int &N, - const std::complex *alpha, - std::complex *X, - const int &incx) +struct scal_op +{ + void operator()(const psi::DEVICE_CPU* /*ctx*/, + const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) { BlasConnector::scal(N, *alpha, X, incx); } }; template -struct gemv_op { - void operator()( - const psi::DEVICE_CPU* d, - const char& trans, - const int& m, - const int& n, - const T* alpha, - const T* A, - const int& lda, - const T* X, - const int& incx, - const T* beta, - T* Y, - const int& incy) +struct gemv_op +{ + void operator()(const psi::DEVICE_CPU* d, + const char& trans, + const int& m, + const int& n, + const T* alpha, + const T* A, + const int& lda, + const T* X, + const int& incx, + const T* beta, + T* Y, + const int& incy) { BlasConnector::gemv(trans, m, n, *alpha, A, lda, X, incx, *beta, Y, incy); } }; template -struct axpy_op { - void operator()( - const psi::DEVICE_CPU* /*ctx*/, - const int& dim, - const T* alpha, - const T* X, - const int& incX, - T* Y, - const int& incY) +struct axpy_op +{ + void operator()(const psi::DEVICE_CPU* /*ctx*/, + const int& dim, + const T* alpha, + const T* X, + const int& incX, + T* Y, + const int& incY) { BlasConnector::axpy(dim, *alpha, X, incX, Y, incY); } }; template -struct gemm_op { - void operator()( - const psi::DEVICE_CPU* /*ctx*/, - const char& transa, - const char& transb, - const int& m, - const int& n, - const int& k, - const T* alpha, - const T* a, - const int& lda, - const T* b, - const int& ldb, - const T* beta, - T* c, - const int& ldc) +struct gemm_op +{ + void operator()(const psi::DEVICE_CPU* /*ctx*/, + const char& transa, + const char& transb, + const int& m, + const int& n, + const int& k, + const T* alpha, + const T* a, + const int& lda, + const T* b, + const int& ldb, + const T* beta, + T* c, + const int& ldc) { BlasConnector::gemm(transb, transa, n, m, k, *alpha, b, ldb, a, lda, *beta, c, ldc); } }; -template struct matrixTranspose_op +template +struct matrixTranspose_op { - void operator()(const psi::DEVICE_CPU* d, - const int& row, - const int& col, - const T* input_matrix, - T* output_matrix) + void operator()(const psi::DEVICE_CPU* d, const int& row, const int& col, const T* input_matrix, T* output_matrix) { T* temp = nullptr; psi::memory::resize_memory_op()(d, temp, row * col, "MTransOp"); #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 8192/sizeof(T)) +#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T)) #endif for (int j = 0; j < col; j++) { @@ -301,7 +299,7 @@ template struct matrixTranspose_op } } #ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192/sizeof(T)) +#pragma omp parallel for schedule(static, 8192 / sizeof(T)) #endif for (int i = 0; i < row * col; i++) { @@ -311,17 +309,13 @@ template struct matrixTranspose_op } }; -template struct matrixSetToAnother +template +struct matrixSetToAnother { - void operator()(const psi::DEVICE_CPU* d, - const int& n, - const T* A, - const int& LDA, - T* B, - const int& LDB) + void operator()(const psi::DEVICE_CPU* d, const int& n, const T* A, const int& LDA, T* B, const int& LDB) { #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 8192/sizeof(T)) +#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T)) #endif for (int i = 0; i < n; i++) { @@ -333,8 +327,6 @@ template struct matrixSetToAnother } }; - - // Explicitly instantiate functors for the types of functor registered. template struct scal_op; template struct axpy_op, psi::DEVICE_CPU>; diff --git a/source/module_hsolver/kernels/math_kernel_op.h b/source/module_hsolver/kernels/math_kernel_op.h index 1f062431e9..774d16cb04 100644 --- a/source/module_hsolver/kernels/math_kernel_op.h +++ b/source/module_hsolver/kernels/math_kernel_op.h @@ -3,23 +3,84 @@ #ifndef MODULE_HSOLVER_MATH_KERNEL_H #define MODULE_HSOLVER_MATH_KERNEL_H -#include "module_base/blas_connector.h" -#include "module_psi/psi.h" -#include "module_base/parallel_reduce.h" -#include "module_psi/kernels/memory_op.h" #include +#include "module_base/blas_connector.h" +#include "module_base/parallel_reduce.h" +#include "module_psi/psi.h" #if defined(__CUDA) || defined(__UT_USE_CUDA) #include + #include "cublas_v2.h" #endif //__CUDA || __UT_USE_CUDA namespace hsolver { +inline std::complex set_real_tocomplex(const std::complex& x) +{ + return {x.real(), 0.0}; +} + +inline std::complex set_real_tocomplex(const std::complex& x) +{ + return {x.real(), 0.0}; +} + +inline double set_real_tocomplex(const double& x) +{ + return x; +} + +inline float set_real_tocomplex(const float& x) +{ + return x; +} + +inline std::complex get_conj(const std::complex& x) +{ + return {x.real(), -x.imag()}; +} + +inline std::complex get_conj(const std::complex& x) +{ + return {x.real(), -x.imag()}; +} + +inline double get_conj(const double& x) +{ + return x; +} + +inline float get_conj(const float& x) +{ + return x; +} + +inline double get_real(const std::complex& x) +{ + return x.real(); +} + +inline float get_real(const std::complex& x) +{ + return x.real(); +} + +inline double get_real(const double& x) +{ + return x; +} + +inline float get_real(const float& x) +{ + return x; +} + template -struct line_minimize_with_block_op { +struct line_minimize_with_block_op +{ /// @brief dot_real_op computes the dot product of the given complex arrays(treated as float arrays). /// And there's may have MPI communications while enabling planewave parallization strategy. /// @@ -32,19 +93,18 @@ struct line_minimize_with_block_op { /// /// \return res : the result vector /// T : dot product result - void operator() ( - T* grad_out, - T* hgrad_out, - T* psi_out, - T* hpsi_out, - const int &n_basis, - const int &n_basis_max, - const int &n_band); + void operator()(T* grad_out, + T* hgrad_out, + T* psi_out, + T* hpsi_out, + const int& n_basis, + const int& n_basis_max, + const int& n_band); }; - template -struct calc_grad_with_block_op { +struct calc_grad_with_block_op +{ /// @brief dot_real_op computes the dot product of the given complex arrays(treated as float arrays). /// And there's may have MPI communications while enabling planewave parallization strategy. /// @@ -58,44 +118,40 @@ struct calc_grad_with_block_op { /// \return res : the result vector /// T : dot product result using Real = typename GetTypeReal::type; - void operator() ( - const Real* prec_in, - Real* err_out, - Real* beta_out, - T* psi_out, - T* hpsi_out, - T* grad_out, - T* grad_old_out, - const int &n_basis, - const int &n_basis_max, - const int &n_band); + void operator()(const Real* prec_in, + Real* err_out, + Real* beta_out, + T* psi_out, + T* hpsi_out, + T* grad_out, + T* grad_old_out, + const int& n_basis, + const int& n_basis_max, + const int& n_band); }; template -struct dot_real_op { +struct dot_real_op +{ using Real = typename GetTypeReal::type; /// @brief dot_real_op computes the dot product of the given complex arrays(treated as float arrays). - /// And there's may have MPI communications while enabling planewave parallization strategy. - /// - /// Input Parameters - /// \param d : the type of computing device - /// \param dim : array size - /// \param psi_L : input array A - /// \param psi_R : input array B - /// \param reduce : flag to control whether to perform the MPI communications - /// - /// \return - /// FPTYPE : dot product result - Real operator() ( - const Device* d, - const int& dim, - const T* psi_L, - const T* psi_R, - const bool reduce = true); + /// And there's may have MPI communications while enabling planewave parallization strategy. + /// + /// Input Parameters + /// \param d : the type of computing device + /// \param dim : array size + /// \param psi_L : input array A + /// \param psi_R : input array B + /// \param reduce : flag to control whether to perform the MPI communications + /// + /// \return + /// FPTYPE : dot product result + Real operator()(const Device* d, const int& dim, const T* psi_L, const T* psi_R, const bool reduce = true); }; // vector operator: result[i] = vector[i] / constant -template struct vector_div_constant_op +template +struct vector_div_constant_op { using Real = typename GetTypeReal::type; /// @brief result[i] = vector[i] / constant @@ -108,15 +164,12 @@ template struct vector_div_constant_op /// /// Output Parameters /// \param result : output array - void operator()(const Device* d, - const int dim, - T* result, - const T* vector, - const Real constant); + void operator()(const Device* d, const int dim, T* result, const T* vector, const Real constant); }; // replace vector_div_constant_op : x = alpha * x -template struct scal_op +template +struct scal_op { /// @brief x = alpha * x /// @@ -137,7 +190,8 @@ template struct scal_op }; // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) -template struct vector_mul_vector_op +template +struct vector_mul_vector_op { using Real = typename GetTypeReal::type; /// @brief result[i] = vector1[i](complex) * vector2[i](not complex) @@ -150,15 +204,12 @@ template struct vector_mul_vector_op /// /// Output Parameters /// \param result : output array - void operator()(const Device* d, - const int& dim, - T* result, - const T* vector1, - const Real* vector2); + void operator()(const Device* d, const int& dim, T* result, const T* vector1, const Real* vector2); }; // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) -template struct vector_div_vector_op +template +struct vector_div_vector_op { using Real = typename GetTypeReal::type; /// @brief result[i] = vector1[i](complex) / vector2[i](not complex) @@ -171,15 +222,12 @@ template struct vector_div_vector_op /// /// Output Parameters /// \param result : output array - void operator()(const Device* d, - const int& dim, - T* result, - const T* vector1, - const Real* vector2); + void operator()(const Device* d, const int& dim, T* result, const T* vector1, const Real* vector2); }; // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template struct constantvector_addORsub_constantVector_op +template +struct constantvector_addORsub_constantVector_op { using Real = typename GetTypeReal::type; /// @brief result[i] = vector1[i] * constant1 + vector2[i] * constant2 @@ -195,16 +243,17 @@ template struct constantvector_addORsub_constantVe /// Output Parameters /// \param result : output array void operator()(const Device* d, - const int& dim, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2); + const int& dim, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2); }; // compute Y = alpha * X + Y -template struct axpy_op +template +struct axpy_op { /// @brief Y = alpha * X + Y /// @@ -219,17 +268,12 @@ template struct axpy_op /// /// Output Parameters /// \param Y : output array Y - void operator()(const Device* d, - const int& N, - const T* alpha, - const T* X, - const int& incX, - T* Y, - const int& incY); + void operator()(const Device* d, const int& N, const T* alpha, const T* X, const int& incX, T* Y, const int& incY); }; // compute y = alpha * op(A) * x + beta * y -template struct gemv_op +template +struct gemv_op { /// @brief y = alpha * op(A) * x + beta * y /// @@ -253,19 +297,19 @@ template struct gemv_op const char& trans, const int& m, const int& n, - const T* alpha, - const T* A, + const T* alpha, + const T* A, const int& lda, - const T* X, + const T* X, const int& incx, - const T* beta, - T* Y, + const T* beta, + T* Y, const int& incy); }; - // compute C = alpha * op(A) * op(B) + beta * C -template struct gemm_op +template +struct gemm_op { /// @brief C = alpha * op(A) * op(B) + beta * C /// @@ -288,22 +332,23 @@ template struct gemm_op /// Output Parameters /// \param c : output matrix C void operator()(const Device* d, - const char& transa, - const char& transb, - const int& m, - const int& n, - const int& k, - const T* alpha, - const T* a, - const int& lda, - const T* b, - const int& ldb, - const T* beta, - T* c, - const int& ldc); + const char& transa, + const char& transb, + const int& m, + const int& n, + const int& k, + const T* alpha, + const T* a, + const int& lda, + const T* b, + const int& ldb, + const T* beta, + T* c, + const int& ldc); }; -template struct matrixTranspose_op +template +struct matrixTranspose_op { /// @brief transpose the input matrix /// @@ -315,14 +360,11 @@ template struct matrixTranspose_op /// /// Output Parameters /// \param output_matrix : output matrix - void operator()(const Device* d, - const int& row, - const int& col, - const T* input_matrix, - T* output_matrix); + void operator()(const Device* d, const int& row, const int& col, const T* input_matrix, T* output_matrix); }; -template struct matrixSetToAnother +template +struct matrixSetToAnother { /// @brief initialize matrix B with A /// @@ -335,114 +377,97 @@ template struct matrixSetToAnother /// /// Output Parameters /// \param B : output matrix B - void operator()(const Device* d, - const int& n, - const T* A, - const int& LDA, - T* B, - const int& LDB); + void operator()(const Device* d, const int& n, const T* A, const int& LDA, T* B, const int& LDB); }; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template -struct line_minimize_with_block_op { - using Real = typename GetTypeReal::type; - void operator()( - T* grad_out, - T* hgrad_out, - T* psi_out, - T* hpsi_out, - const int &n_basis, - const int &n_basis_max, - const int &n_band); +struct line_minimize_with_block_op +{ + using Real = typename GetTypeReal::type; + void operator()(T* grad_out, + T* hgrad_out, + T* psi_out, + T* hpsi_out, + const int& n_basis, + const int& n_basis_max, + const int& n_band); }; template -struct calc_grad_with_block_op { - using Real = typename GetTypeReal::type; - void operator()( - const Real* prec_in, - Real* err_out, - Real* beta_out, - T* psi_out, - T* hpsi_out, - T* grad_out, - T* grad_old_out, - const int &n_basis, - const int &n_basis_max, - const int &n_band); +struct calc_grad_with_block_op +{ + using Real = typename GetTypeReal::type; + void operator()(const Real* prec_in, + Real* err_out, + Real* beta_out, + T* psi_out, + T* hpsi_out, + T* grad_out, + T* grad_old_out, + const int& n_basis, + const int& n_basis_max, + const int& n_band); }; // Partially specialize functor for psi::GpuDevice. template -struct dot_real_op { +struct dot_real_op +{ using Real = typename GetTypeReal::type; - Real operator()( - const psi::DEVICE_GPU* d, - const int& dim, - const T* psi_L, - const T* psi_R, - const bool reduce = true); + Real operator()(const psi::DEVICE_GPU* d, const int& dim, const T* psi_L, const T* psi_R, const bool reduce = true); }; // vector operator: result[i] = vector[i] / constant -template struct vector_div_constant_op +template +struct vector_div_constant_op { using Real = typename GetTypeReal::type; - void operator()(const psi::DEVICE_GPU* d, - const int dim, - T* result, - const T* vector, - const Real constant); + void operator()(const psi::DEVICE_GPU* d, const int dim, T* result, const T* vector, const Real constant); }; // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) -template struct vector_mul_vector_op +template +struct vector_mul_vector_op { using Real = typename GetTypeReal::type; - void operator()(const psi::DEVICE_GPU* d, - const int& dim, - T* result, - const T* vector1, - const Real* vector2); + void operator()(const psi::DEVICE_GPU* d, const int& dim, T* result, const T* vector1, const Real* vector2); }; // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) -template struct vector_div_vector_op +template +struct vector_div_vector_op { using Real = typename GetTypeReal::type; - void operator()(const psi::DEVICE_GPU* d, - const int& dim, - T* result, - const T* vector1, - const Real* vector2); + void operator()(const psi::DEVICE_GPU* d, const int& dim, T* result, const T* vector1, const Real* vector2); }; // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template struct constantvector_addORsub_constantVector_op +template +struct constantvector_addORsub_constantVector_op { using Real = typename GetTypeReal::type; void operator()(const psi::DEVICE_GPU* d, - const int& dim, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2); + const int& dim, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2); }; -template struct matrixSetToAnother +template +struct matrixSetToAnother { void operator()(const psi::DEVICE_GPU* d, - const int& n, - const T* A, // input - const int& LDA, - T* B, // output - const int& LDB); + const int& n, + const T* A, // input + const int& LDA, + T* B, // output + const int& LDB); }; - void createGpuBlasHandle(); void destoryBLAShandle(); diff --git a/source/module_hsolver/test/CMakeLists.txt b/source/module_hsolver/test/CMakeLists.txt index 5dd16d10ee..90dcddbacd 100644 --- a/source/module_hsolver/test/CMakeLists.txt +++ b/source/module_hsolver/test/CMakeLists.txt @@ -70,13 +70,13 @@ AddTest( AddTest( TARGET HSolver_pw LIBS ${math_libs} psi device base container - SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_iter_assist.cpp + SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diagh_consts.cpp ../diago_iter_assist.cpp ) AddTest( TARGET HSolver_sdft LIBS ${math_libs} psi device base container - SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_iter_assist.cpp + SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diagh_consts.cpp ../diago_iter_assist.cpp ) if(ENABLE_LCAO) diff --git a/source/module_io/input.cpp b/source/module_io/input.cpp index 8e65755cad..59dd6fff81 100644 --- a/source/module_io/input.cpp +++ b/source/module_io/input.cpp @@ -3009,7 +3009,7 @@ void Input::Default_2(void) // jiyy add 2019-08-04 diago_proc = GlobalV::NPROC; } } - else if (ks_solver == "dav") + else if (ks_solver == "dav" || ks_solver == "dav_subspace") { GlobalV::ofs_warning << " It's ok to use dav." << std::endl; } @@ -3959,7 +3959,11 @@ void Input::Check(void) { ModuleBase::WARNING_QUIT("Input", "lapack can not be used with plane wave basis."); } - else if (ks_solver != "default" && ks_solver != "cg" && ks_solver != "dav" && ks_solver != "bpcg") + else if (ks_solver != "default" && + ks_solver != "cg" && + ks_solver != "dav" && + ks_solver != "dav_subspace" && + ks_solver != "bpcg") { ModuleBase::WARNING_QUIT("Input", "please check the ks_solver parameter!"); } diff --git a/source/module_psi/kernels/device.cpp b/source/module_psi/kernels/device.cpp index 3bba66f2a6..7e5bea7705 100644 --- a/source/module_psi/kernels/device.cpp +++ b/source/module_psi/kernels/device.cpp @@ -567,7 +567,11 @@ std::string get_device_flag(const std::string& device, const std::string& ks_sol #else str = "cpu"; #endif - if (ks_solver != "cg" && ks_solver != "dav" && ks_solver != "bpcg") { + if (ks_solver != "cg" && + ks_solver != "dav" && + ks_solver != "dav_subspace" && + ks_solver != "bpcg") + { str = "cpu"; } if (basis_type != "pw") {