diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md index 781886e65b..af0cbdf4ac 100644 --- a/docs/advanced/input_files/input-main.md +++ b/docs/advanced/input_files/input-main.md @@ -143,6 +143,7 @@ - [out\_level](#out_level) - [out\_alllog](#out_alllog) - [out\_mat\_hs](#out_mat_hs) + - [out\_mat\_tk](#out_mat_tk) - [out\_mat\_r](#out_mat_r) - [out\_mat\_hs2](#out_mat_hs2) - [out\_mat\_t](#out_mat_t) @@ -1667,6 +1668,13 @@ These variables are used to control the output of properties. - **Description**: Whether to print the upper triangular part of the Hamiltonian matrices (in Ry) and overlap matrices for each k point into files in the directory `OUT.${suffix}`. The second number controls precision. For more information, please refer to [hs_matrix.md](../elec_properties/hs_matrix.md#out_mat_hs). Also controled by [out_interval](#out_interval) and [out_app_flag](#out_app_flag). - **Default**: False 8 +### out_mat_tk + +- **Type**: Boolean \[Integer\](optional) +- **Availability**: Numerical atomic orbital basis +- **Description**: Whether to print the upper triangular part of the kinetic matrices (in Ry) for each k point into `OUT.${suffix}/data-i-T`, where i is the index of k points (see `OUT.${suffix}/kpoints`). One may optionally provide a second parameter to specify the precision. +- **Default**: False \[8\] + ### out_mat_r - **Type**: Boolean diff --git a/source/module_esolver/esolver_ks.h b/source/module_esolver/esolver_ks.h index feb4d5ee17..36a85eac68 100644 --- a/source/module_esolver/esolver_ks.h +++ b/source/module_esolver/esolver_ks.h @@ -70,7 +70,7 @@ class ESolver_KS : public ESolver_FP virtual void iter_finish(int& iter); //! Something to do after SCF iterations when SCF is converged or comes to the max iter step. - virtual void after_scf(const int istep); + virtual void after_scf(const int istep) override; //! It should be replaced by a function in Hamilt Class virtual void update_pot(const int istep, const int iter) {}; diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp index 07d8936fb8..177b791b98 100644 --- a/source/module_esolver/esolver_ks_lcao.cpp +++ b/source/module_esolver/esolver_ks_lcao.cpp @@ -26,6 +26,7 @@ #include "module_elecstate/module_charge/symmetry_rho.h" #include "module_elecstate/occupy.h" #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h" // need divide_HS_in_frag +#include "module_hamilt_lcao/hamilt_lcaodft/hs_matrix_k.hpp" #include "module_hamilt_lcao/module_dftu/dftu.h" #include "module_hamilt_pw/hamilt_pwdft/global.h" #include "module_io/print_info.h" @@ -994,7 +995,10 @@ void ESolver_KS_LCAO::iter_finish(int& iter) *this->p_hamilt, *dynamic_cast*>(this->pelec)->get_DM(), this->kv, - iter); + PARAM.inp.nspin, + iter, + this->pelec->f_en.etot, + this->scf_ene_thr); } else { @@ -1002,7 +1006,10 @@ void ESolver_KS_LCAO::iter_finish(int& iter) *this->p_hamilt, *dynamic_cast*>(this->pelec)->get_DM(), this->kv, - iter); + PARAM.inp.nspin, + iter, + this->pelec->f_en.etot, + this->scf_ene_thr); } } #endif @@ -1130,11 +1137,10 @@ void ESolver_KS_LCAO::after_scf(const int istep) } #ifdef __EXX - // 4) write Exx matrix - if (GlobalC::exx_info.info_global.cal_exx) // Peize Lin add if 2022.11.14 + // 4) write Hexx matrix for NSCF (see `out_chg` in docs/advanced/input_files/input-main.md) + if (GlobalC::exx_info.info_global.cal_exx && PARAM.inp.out_chg[0] && istep % PARAM.inp.out_interval == 0) // Peize Lin add if 2022.11.14 { - const std::string file_name_exx = GlobalV::global_out_dir + "HexxR" - + std::to_string(GlobalV::MY_RANK); + const std::string file_name_exx = GlobalV::global_out_dir + "HexxR" + std::to_string(GlobalV::MY_RANK); if (GlobalC::exx_info.info_ri.real_number) { ModuleIO::write_Hexxs_csr(file_name_exx, GlobalC::ucell, this->exd->get_Hexxs()); } else { @@ -1255,6 +1261,38 @@ void ESolver_KS_LCAO::after_scf(const int istep) GlobalV::NPROC); tqo.calculate(); } + + if (PARAM.inp.out_mat_tk[0]) + { + hamilt::HS_Matrix_K hsk(&pv, true); + hamilt::HContainer hR(&pv); + hamilt::Operator* ekinetic = + new hamilt::EkineticNew>(&hsk, + this->kv.kvec_d, + &hR, + &GlobalC::ucell, + &GlobalC::GridD, + two_center_bundle_.kinetic_orb.get()); + + const int nspin_k = (GlobalV::NSPIN == 2 ? 2 : 1); + for (int ik = 0; ik < this->kv.get_nks() / nspin_k; ++ik) + { + ekinetic->init(ik); + ModuleIO::save_mat(0, + hsk.get_hk(), + GlobalV::NLOCAL, + false, + PARAM.inp.out_mat_tk[1], + 1, + GlobalV::out_app_flag, + "T", + "data-" + std::to_string(ik), + this->pv, + GlobalV::DRANK); + } + + delete ekinetic; + } } //------------------------------------------------------------------------------ diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp index 9efc766902..fc996fad52 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp @@ -760,7 +760,7 @@ void Force_Stress_LCAO::calForcePwPart(ModuleBase::matrix& fvl_dvl, //-------------------------------------------------------- // force due to core correlation. //-------------------------------------------------------- - f_pw.cal_force_cc(fcc, rhopw, chr); + f_pw.cal_force_cc(fcc, rhopw, chr,GlobalC::ucell); //-------------------------------------------------------- // force due to self-consistent charge. //-------------------------------------------------------- diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index cb84387efa..6ea8e894cd 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -157,7 +157,7 @@ void Forces::cal_force(ModuleBase::matrix& force, // not relevant for PAW if (!GlobalV::use_paw) { - Forces::cal_force_cc(forcecc, rho_basis, chr); + Forces::cal_force_cc(forcecc, rho_basis, chr,GlobalC::ucell); } else { diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h index 8c734932a1..0247fbfae7 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.h +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h @@ -48,7 +48,7 @@ class Forces void cal_force_loc(ModuleBase::matrix& forcelc, ModulePW::PW_Basis* rho_basis, const Charge* const chr); void cal_force_ew(ModuleBase::matrix& forceion, ModulePW::PW_Basis* rho_basis, const Structure_Factor* p_sf); - void cal_force_cc(ModuleBase::matrix& forcecc, ModulePW::PW_Basis* rho_basis, const Charge* const chr); + void cal_force_cc(ModuleBase::matrix& forcecc, ModulePW::PW_Basis* rho_basis, const Charge* const chr, UnitCell& ucell_in); /** * @brief This routine computes the atomic force of non-local pseudopotential * F^{NL}_i = \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}D_{l,l'}^{I} [ @@ -86,7 +86,8 @@ class Forces const FPTYPE* rhoc, FPTYPE* drhocg, ModulePW::PW_Basis* rho_basis, - int type); // used in nonlinear core correction stress + int type, + const UnitCell& ucell_in); // used in nonlinear core correction stress void deriv_drhoc_scc(const bool& numeric, const int mesh, const FPTYPE* r, diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp index c111a78938..b4b09c768a 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp @@ -27,7 +27,8 @@ template void Forces::cal_force_cc(ModuleBase::matrix& forcecc, ModulePW::PW_Basis* rho_basis, - const Charge* const chr) + const Charge* const chr, + UnitCell& ucell_in) { ModuleBase::TITLE("Forces", "cal_force_cc"); // recalculate the exchange-correlation potential. @@ -35,11 +36,11 @@ void Forces::cal_force_cc(ModuleBase::matrix& forcecc, int total_works = 0; // cal total works for skipping preprocess - for (int it = 0; it < GlobalC::ucell.ntype; ++it) + for (int it = 0; it < ucell_in.ntype; ++it) { - if (GlobalC::ucell.atoms[it].ncpp.nlcc) + if (ucell_in.atoms[it].ncpp.nlcc) { - total_works += GlobalC::ucell.atoms[it].na; + total_works += ucell_in.atoms[it].na; } } if (total_works == 0) @@ -54,7 +55,7 @@ void Forces::cal_force_cc(ModuleBase::matrix& forcecc, { #ifdef USE_LIBXC const auto etxc_vtxc_v - = XC_Functional::v_xc_meta(rho_basis->nrxx, GlobalC::ucell.omega, GlobalC::ucell.tpiba, chr); + = XC_Functional::v_xc_meta(rho_basis->nrxx, ucell_in.omega, ucell_in.tpiba, chr); // etxc = std::get<0>(etxc_vtxc_v); // vtxc = std::get<1>(etxc_vtxc_v); @@ -66,9 +67,9 @@ void Forces::cal_force_cc(ModuleBase::matrix& forcecc, else { if (GlobalV::NSPIN == 4) { - GlobalC::ucell.cal_ux(); + ucell_in.cal_ux(); } - const auto etxc_vtxc_v = XC_Functional::v_xc(rho_basis->nrxx, chr, &GlobalC::ucell); + const auto etxc_vtxc_v = XC_Functional::v_xc(rho_basis->nrxx, chr, &ucell_in); // etxc = std::get<0>(etxc_vtxc_v); // vtxc = std::get<1>(etxc_vtxc_v); @@ -105,83 +106,116 @@ void Forces::cal_force_cc(ModuleBase::matrix& forcecc, double* rhocg = new double[rho_basis->ngg]; ModuleBase::GlobalFunc::ZEROS(rhocg, rho_basis->ngg); - for (int it = 0; it < GlobalC::ucell.ntype; ++it) + std::vector gv_x(rho_basis->npw); + std::vector gv_y(rho_basis->npw); + std::vector gv_z(rho_basis->npw); + std::vector rhocgigg_vec(rho_basis->npw); + double *gv_x_d = nullptr; + double *gv_y_d = nullptr; + double *gv_z_d = nullptr; + double *force_d = nullptr; + double *rhocgigg_vec_d = nullptr; + std::complex* psiv_d = nullptr; + this->device = base_device::get_device_type(this->ctx); + + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int ig = 0; ig < rho_basis->npw; ig++) { - if (GlobalC::ucell.atoms[it].ncpp.nlcc) + gv_x[ig] = rho_basis->gcar[ig].x; + gv_y[ig] = rho_basis->gcar[ig].y; + gv_z[ig] = rho_basis->gcar[ig].z; + } + + if(this->device == base_device::GpuDevice ) { + resmem_var_op()(this->ctx, gv_x_d, rho_basis->npw); + resmem_var_op()(this->ctx, gv_y_d, rho_basis->npw); + resmem_var_op()(this->ctx, gv_z_d, rho_basis->npw); + resmem_var_op()(this->ctx, rhocgigg_vec_d, rho_basis->npw); + resmem_complex_op()(this->ctx, psiv_d, rho_basis->nmaxgr); + resmem_var_op()(this->ctx, force_d, 3); + + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_x_d, gv_x.data(), rho_basis->npw); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_y_d, gv_y.data(), rho_basis->npw); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_z_d, gv_z.data(), rho_basis->npw); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, psiv_d, psiv, rho_basis->nmaxgr); + } + + + for (int it = 0; it < ucell_in.ntype; ++it) + { + if (ucell_in.atoms[it].ncpp.nlcc) { // chr->non_linear_core_correction(GlobalC::ppcell.numeric, - // GlobalC::ucell.atoms[it].ncpp.msh, - // GlobalC::ucell.atoms[it].ncpp.r, - // GlobalC::ucell.atoms[it].ncpp.rab, - // GlobalC::ucell.atoms[it].ncpp.rho_atc, + // ucell_in.atoms[it].ncpp.msh, + // ucell_in.atoms[it].ncpp.r, + // ucell_in.atoms[it].ncpp.rab, + // ucell_in.atoms[it].ncpp.rho_atc, // rhocg); this->deriv_drhoc(GlobalC::ppcell.numeric, - GlobalC::ucell.atoms[it].ncpp.msh, - GlobalC::ucell.atoms[it].ncpp.r.data(), - GlobalC::ucell.atoms[it].ncpp.rab.data(), - GlobalC::ucell.atoms[it].ncpp.rho_atc.data(), + ucell_in.atoms[it].ncpp.msh, + ucell_in.atoms[it].ncpp.r.data(), + ucell_in.atoms[it].ncpp.rab.data(), + ucell_in.atoms[it].ncpp.rho_atc.data(), rhocg, rho_basis, - 1); + 1, + ucell_in); + #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel for +#endif + for (int ig = 0; ig < rho_basis->npw; ig++) { -#endif - for (int ia = 0; ia < GlobalC::ucell.atoms[it].na; ++ia) + rhocgigg_vec[ig] = rhocg[rho_basis->ig2igg[ig]]; + } + + if(this->device == base_device::GpuDevice ) { + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhocgigg_vec_d, rhocgigg_vec.data(), rho_basis->npw); + } + for (int ia = 0; ia < ucell_in.atoms[it].na; ++ia) + { + const ModuleBase::Vector3 pos = ucell_in.atoms[it].tau[ia]; + // get iat form table + int iat = ucell_in.itia2iat(it, ia); + double force[3] = {0, 0, 0}; + + if(this->device == base_device::GpuDevice ) { + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, force_d, force, 3); + hamilt::cal_force_npw_op()( + psiv_d, gv_x_d, gv_y_d, gv_z_d, rhocgigg_vec_d, force_d, pos.x, pos.y, pos.z, + rho_basis->npw, ucell_in.omega, ucell_in.tpiba + ); + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force, force_d, 3); + + } else { + hamilt::cal_force_npw_op()( + psiv, gv_x.data(), gv_y.data(), gv_z.data(), rhocgigg_vec.data(), force, pos.x, pos.y, pos.z, + rho_basis->npw, ucell_in.omega, ucell_in.tpiba + ); + } + { - // get iat form table - int iat = GlobalC::ucell.itia2iat(it, ia); - double force[3] = {0, 0, 0}; -#ifdef _OPENMP -#pragma omp for nowait -#endif - for (int ig = 0; ig < rho_basis->npw; ig++) - { - const ModuleBase::Vector3 gv = rho_basis->gcar[ig]; - const ModuleBase::Vector3 pos = GlobalC::ucell.atoms[it].tau[ia]; - const double rhocgigg = rhocg[rho_basis->ig2igg[ig]]; - const std::complex psiv_conj = conj(psiv[ig]); - - const double arg = ModuleBase::TWO_PI * (gv.x * pos.x + gv.y * pos.y + gv.z * pos.z); - double sinp, cosp; - ModuleBase::libm::sincos(arg, &sinp, &cosp); - const std::complex expiarg = std::complex(sinp, cosp); - - auto ipol0 - = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.x * psiv_conj * expiarg; - force[0] += ipol0.real(); - - auto ipol1 - = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.y * psiv_conj * expiarg; - force[1] += ipol1.real(); - - auto ipol2 - = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.z * psiv_conj * expiarg; - force[2] += ipol2.real(); - } -#ifdef _OPENMP -#pragma omp critical - if (omp_get_num_threads() > 1) - { - forcecc(iat, 0) += force[0]; - forcecc(iat, 1) += force[1]; - forcecc(iat, 2) += force[2]; - } - else -#endif - { - forcecc(iat, 0) += force[0]; - forcecc(iat, 1) += force[1]; - forcecc(iat, 2) += force[2]; - } + forcecc(iat, 0) += force[0]; + forcecc(iat, 1) += force[1]; + forcecc(iat, 2) += force[2]; } -#ifdef _OPENMP - } // omp parallel -#endif + } + } } - + if (this->device == base_device::GpuDevice) + { + delmem_var_op()(this->ctx, gv_x_d); + delmem_var_op()(this->ctx, gv_y_d); + delmem_var_op()(this->ctx, gv_z_d); + delmem_var_op()(this->ctx, force_d); + delmem_var_op()(this->ctx, rhocgigg_vec_d); + delmem_complex_op()(this->ctx, psiv_d); + } delete[] rhocg; delete[] psiv; // mohan fix bug 2012-03-22 @@ -202,7 +236,8 @@ void Forces::deriv_drhoc const FPTYPE *rhoc, FPTYPE *drhocg, ModulePW::PW_Basis* rho_basis, - int type + int type, + const UnitCell& ucell_in ) { int igl0; @@ -241,7 +276,7 @@ void Forces::deriv_drhoc aux [ir] = r [ir] * r [ir] * rhoc [ir]; } ModuleBase::Integral::Simpson_Integral(mesh, aux.data(), rab, rhocg1); - drhocg [0] = ModuleBase::FOUR_PI * rhocg1 / GlobalC::ucell.omega; + drhocg [0] = ModuleBase::FOUR_PI * rhocg1 / ucell_in.omega; igl0 = 1; } else @@ -260,7 +295,7 @@ void Forces::deriv_drhoc #endif for(int igl = igl0;igl< rho_basis->ngg;igl++) { - gx_arr[igl] = sqrt(rho_basis->gg_uniq[igl] * GlobalC::ucell.tpiba2); + gx_arr[igl] = sqrt(rho_basis->gg_uniq[igl] * ucell_in.tpiba2); } double *r_d = nullptr; @@ -285,14 +320,14 @@ void Forces::deriv_drhoc if(this->device == base_device::GpuDevice) { hamilt::cal_stress_drhoc_aux_op()( - r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,GlobalC::ucell.omega,type); + r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,type); syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); } else { hamilt::cal_stress_drhoc_aux_op()( - r,rhoc,gx_arr.data()+igl0,rab,drhocg+igl0,mesh,igl0,rho_basis->ngg-igl0,GlobalC::ucell.omega,type); + r,rhoc,gx_arr.data()+igl0,rab,drhocg+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,type); } delmem_var_op()(this->ctx, r_d); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index 769d6351e9..e965446d77 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -23,7 +23,11 @@ void warp_reduce(FPTYPE & val) { val += __shfl_down_sync(FULL_MASK, val, offset); } } - +template +__device__ static inline +thrust::complex conj(thrust::complex& in) { + return thrust::conj(in); +} template __global__ void cal_stress_mgga( const int spin, @@ -504,6 +508,48 @@ __global__ void cal_stress_drhoc_aux3( +template +__global__ void cal_force_npw( + const thrust::complex *psiv, + const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z, + const FPTYPE* rhocgigg_vec, + FPTYPE* force, + const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z, + const int npw, + const FPTYPE omega, const FPTYPE tpiba +){ + const double TWO_PI = 2.0 * 3.14159265358979323846; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int begin_idx = tid * 1024; + if(begin_idx > npw) return; + + FPTYPE t_force0 = 0; + FPTYPE t_force1 = 0; + FPTYPE t_force2 = 0; + for(int ig = begin_idx; ig psiv_conj = conj(psiv[ig]); + + const FPTYPE arg = TWO_PI * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z); + const FPTYPE sinp = sin(arg); + const FPTYPE cosp = cos(arg); + const thrust::complex expiarg = thrust::complex(sinp, cosp); + + const thrust::complex tmp_var = psiv_conj * expiarg * tpiba * omega * rhocgigg_vec[ig]; + + const thrust::complex ipol0 = tmp_var * gv_x[ig]; + t_force0 += ipol0.real(); + + const thrust::complex ipol1 = tmp_var * gv_y[ig]; + t_force1 += ipol1.real(); + + const thrust::complex ipol2 = tmp_var * gv_z[ig]; + t_force2 += ipol2.real(); + } + atomicAdd(&force[0], t_force0); + atomicAdd(&force[1], t_force1); + atomicAdd(&force[2], t_force2); +} + template void cal_vkb_op::operator()( const base_device::DEVICE_GPU* ctx, @@ -638,6 +684,32 @@ void cal_stress_drhoc_aux_op::operator()( return ; } + +template +void cal_force_npw_op::operator()( + const std::complex *psiv, + const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z, + const FPTYPE* rhocgigg_vec, + FPTYPE* force, + const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z, + const int npw, + const FPTYPE omega, const FPTYPE tpiba + ) +{ + // Divide the npw size range into blocksize 1024 blocks + int t_size = 1024; + int t_num = (npw%t_size) ? (npw/t_size + 1) : (npw/t_size); + dim3 npwgrid(((t_num%THREADS_PER_BLOCK) ? (t_num/THREADS_PER_BLOCK + 1) : (t_num/THREADS_PER_BLOCK))); + + cal_force_npw << < npwgrid, THREADS_PER_BLOCK >> > ( + reinterpret_cast*>(psiv), + gv_x, gv_y, gv_z, rhocgigg_vec, force, pos_x, pos_y, pos_z, + npw, omega, tpiba + ); + return ; +} + + // template // void prepare_vkb_deri_ptr_op::operator()( // const base_device::DEVICE_GPU* ctx, @@ -713,6 +785,9 @@ template struct cal_vkb_deri_op; template struct cal_stress_drhoc_aux_op; template struct cal_stress_drhoc_aux_op; +template struct cal_force_npw_op; +template struct cal_force_npw_op; + // template struct prepare_vkb_deri_ptr_op; // template struct prepare_vkb_deri_ptr_op; } // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index d7db3b3eea..17cfae404a 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -502,6 +502,49 @@ __global__ void cal_stress_drhoc_aux3( } +template +__global__ void cal_force_npw( + const thrust::complex *psiv, + const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z, + const FPTYPE* rhocgigg_vec, + FPTYPE* force, + const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z, + const int npw, + const FPTYPE omega, const FPTYPE tpiba +){ + const double TWO_PI = 2.0 * 3.14159265358979323846; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int begin_idx = tid * 1024; + if(begin_idx > npw) return; + + FPTYPE t_force0 = 0; + FPTYPE t_force1 = 0; + FPTYPE t_force2 = 0; + for(int ig = begin_idx; ig psiv_conj = conj(psiv[ig]); + + const FPTYPE arg = TWO_PI * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z); + const FPTYPE sinp = sin(arg); + const FPTYPE cosp = cos(arg); + const thrust::complex expiarg = thrust::complex(sinp, cosp); + + const thrust::complex tmp_var = psiv_conj * expiarg * tpiba * omega * rhocgigg_vec[ig]; + + const thrust::complex ipol0 = tmp_var * gv_x[ig]; + t_force0 += ipol0.real(); + + const thrust::complex ipol1 = tmp_var * gv_y[ig]; + t_force1 += ipol1.real(); + + const thrust::complex ipol2 = tmp_var * gv_z[ig]; + t_force2 += ipol2.real(); + } + atomicAdd(&force[0], t_force0); + atomicAdd(&force[1], t_force1); + atomicAdd(&force[2], t_force2); +} + + template void cal_vkb_op::operator()( const base_device::DEVICE_GPU* ctx, @@ -629,8 +672,32 @@ void cal_stress_drhoc_aux_op::operator()( return ; } +template +void cal_force_npw_op::operator()( + const std::complex *psiv, + const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z, + const FPTYPE* rhocgigg_vec, + FPTYPE* force, + const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z, + const int npw, + const FPTYPE omega, const FPTYPE tpiba + ) +{ + int t_size = 1024; + int t_num = (npw%t_size) ? (npw/t_size + 1) : (npw/t_size); + + dim3 npwgrid(((t_num%THREADS_PER_BLOCK) ? (t_num/THREADS_PER_BLOCK + 1) : (t_num/THREADS_PER_BLOCK))); + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_npw), npwgrid, THREADS_PER_BLOCK,0,0, + reinterpret_cast*>(psiv), + gv_x, gv_y, gv_z, rhocgigg_vec, force, pos_x, pos_y, pos_z, + npw, omega, tpiba + ); + + return ; +} + template struct cal_vq_op; template struct cal_vq_op; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index be03cee401..c3a8008e04 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -401,6 +401,45 @@ struct cal_stress_drhoc_aux_op { }; +template +struct cal_force_npw_op { + void operator()(const std::complex *psiv, + const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z, + const FPTYPE* rhocgigg_vec, + FPTYPE* force, + const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_z, + const int npw, + const FPTYPE omega, const FPTYPE tpiba) { + const double TWO_PI = 2.0 * 3.14159265358979323846; + // printf("%d,%d,%lf\n",ngg,mesh,omega); + + // printf("iminininininin\n\n\n\n"); +#ifdef _OPENMP +#pragma omp for nowait +#endif + for (int ig = 0; ig < npw; ig++) + { + const std::complex psiv_conj = conj(psiv[ig]); + + const FPTYPE arg = TWO_PI * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(arg, &sinp, &cosp); + const std::complex expiarg = std::complex(sinp, cosp); + + const std::complex tmp_var = psiv_conj * expiarg * tpiba * omega * rhocgigg_vec[ig]; + + const std::complex ipol0 = tmp_var * gv_x[ig]; + force[0] += ipol0.real(); + + const std::complex ipol1 = tmp_var * gv_y[ig]; + force[1] += ipol1.real(); + + const std::complex ipol2 = tmp_var * gv_z[ig]; + force[2] += ipol2.real(); + } + } +}; + // // cpu version first, gpu version later // template // struct prepare_vkb_deri_ptr_op{ @@ -476,6 +515,10 @@ template struct cal_vq_deri_op; template struct cal_stress_drhoc_aux_op; template struct cal_stress_drhoc_aux_op; +template struct cal_force_npw_op; +template struct cal_force_npw_op; + + // template struct prepare_vkb_deri_ptr_op; // template struct prepare_vkb_deri_ptr_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index b4dfc35ff8..62e373f1ff 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -190,6 +190,19 @@ struct cal_stress_drhoc_aux_op{ ); }; +template +struct cal_force_npw_op{ + void operator()(const std::complex *psiv, + const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z, + const FPTYPE* rhocgigg_vec, + FPTYPE* force, + const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_xz, + const int npw, + const FPTYPE omega, const FPTYPE tpiba + ); +}; + + #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template struct cal_dbecp_noevc_nl_op @@ -340,6 +353,27 @@ struct cal_stress_drhoc_aux_op{ }; +/** + * This operator is used to compute the force force in three directions for each atom in force_cc + * in parallel on GPU [0~3], which is: + * Force_p = (2* pi * tpiba * omega * rhocg[ig] * gv_p[ig] + * * (gv_x[ig] * pos_x + gv_y[ig] * pos_y + gv_z[ig] * pos_z) + * * complex(sinp, cosp) * psiv[ig]).real() + * + * The operator splits NPW into blocks on the GPU in parallel, and the block size is t_size = 1024. + */ +template +struct cal_force_npw_op{ + void operator()(const std::complex *psiv, + const FPTYPE* gv_x, const FPTYPE* gv_y, const FPTYPE* gv_z, + const FPTYPE* rhocgigg_vec, + FPTYPE* force, + const FPTYPE pos_x, const FPTYPE pos_y, const FPTYPE pos_xz, + const int npw, + const FPTYPE omega, const FPTYPE tpiba + ); +}; + #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp index 5567f51ef9..e4974afbeb 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp @@ -39,8 +39,8 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force, this->cal_force_loc(forcelc, rho_basis, chr); this->cal_force_ew(forceion, rho_basis, p_sf); this->cal_sto_force_nl(forcenl, wg, pkv, wfc_basis, psi_in, stowf); - this->cal_force_cc(forcecc, rho_basis, chr); - this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist,GlobalC::ucell); + this->cal_force_cc(forcecc, rho_basis, chr, GlobalC::ucell); + this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, GlobalC::ucell); //impose total force = 0 int iat = 0; diff --git a/source/module_io/read_input_item_output.cpp b/source/module_io/read_input_item_output.cpp index 1f38e9fe69..b22cebec7c 100644 --- a/source/module_io/read_input_item_output.cpp +++ b/source/module_io/read_input_item_output.cpp @@ -254,6 +254,29 @@ void ReadInput::item_output() sync_intvec(input.out_mat_hs, 2, 0); this->add_item(item); } + { + Input_Item item("out_mat_tk"); + item.annotation = "output T(k)"; + item.read_value = [](const Input_Item& item, Parameter& para) { + size_t count = item.get_size(); + if (count == 1) + { + para.input.out_mat_tk[0] = std::stoi(item.str_values[0]); + para.input.out_mat_tk[1] = 8; + } + else if (count == 2) + { + para.input.out_mat_tk[0] = std::stoi(item.str_values[0]); + para.input.out_mat_tk[1] = std::stoi(item.str_values[1]); + } + else + { + ModuleBase::WARNING_QUIT("ReadInput", "out_mat_tk should have 1 or 2 values"); + } + }; + sync_intvec(input.out_mat_tk, 2, 0); + this->add_item(item); + } { Input_Item item("out_mat_hs2"); item.annotation = "output H(R) and S(R) matrix"; @@ -453,4 +476,4 @@ void ReadInput::item_output() this->add_item(item); } } -} // namespace ModuleIO \ No newline at end of file +} // namespace ModuleIO diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h index d1d417cb96..bd326cd3b6 100644 --- a/source/module_parameter/input_parameter.h +++ b/source/module_parameter/input_parameter.h @@ -324,6 +324,7 @@ struct Input_para bool out_dm1 = false; ///< output density matrix. (multi-k points) bool out_bandgap = false; ///< QO added for bandgap printing std::vector out_mat_hs = {0, 8}; ///< output H matrix and S matrix in local basis. + std::vector out_mat_tk = {0, 8}; ///< output T(k) matrix in local basis. bool out_mat_hs2 = false; ///< LiuXh add 2019-07-16, output H(R) matrix and ///< S(R) matrix in local basis. bool out_mat_dh = false; @@ -567,4 +568,4 @@ struct Input_para bool test_stress = false; ///< test the stress. bool test_skip_ewald = false; ///< variables for test only }; -#endif \ No newline at end of file +#endif diff --git a/source/module_ri/Exx_LRI_interface.h b/source/module_ri/Exx_LRI_interface.h index cf30babb77..9f16e95acc 100644 --- a/source/module_ri/Exx_LRI_interface.h +++ b/source/module_ri/Exx_LRI_interface.h @@ -50,8 +50,12 @@ class Exx_LRI_Interface hamilt::Hamilt& hamilt, const elecstate::DensityMatrix& dm/**< double should be Tdata if complex-PBE-DM is supported*/, const K_Vectors& kv, - int& iter); + const int& nspin, + int& iter, + const double& etot, + const double& scf_ene_thr); int two_level_step = 0; + double etot_last_outer_loop = 0.0; private: std::shared_ptr> exx_ptr; Mix_DMk_2D mix_DMk_2D; diff --git a/source/module_ri/Exx_LRI_interface.hpp b/source/module_ri/Exx_LRI_interface.hpp index 2da1fd43f2..aaa338845d 100644 --- a/source/module_ri/Exx_LRI_interface.hpp +++ b/source/module_ri/Exx_LRI_interface.hpp @@ -7,6 +7,7 @@ #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h" #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.h" #include "module_base/parallel_common.h" +#include "module_base/formatter.h" #include #include "module_io/csr_reader.h" @@ -126,7 +127,10 @@ bool Exx_LRI_Interface::exx_after_converge( hamilt::Hamilt& hamilt, const elecstate::DensityMatrix& dm, const K_Vectors& kv, - int& iter) + const int& nspin, + int& iter, + const double& etot, + const double& scf_ene_thr) { // only called if (GlobalC::exx_info.info_global.cal_exx) auto restart_reset = [this]() { // avoid calling restart related procedure in the subsequent ion steps @@ -158,45 +162,50 @@ bool Exx_LRI_Interface::exx_after_converge( return false; } } - // has separate_loop case - // exx converged or get max exx steps - else if (this->two_level_step == GlobalC::exx_info.info_global.hybrid_step - || (iter == 1 && this->two_level_step != 0)) - { - restart_reset(); - return true; - } else - { - // update exx and redo scf - if (this->two_level_step == 0) + { // has separate_loop case + const double ediff = std::abs(etot - etot_last_outer_loop) * ModuleBase::Ry_to_eV; + if (two_level_step) { std::cout << FmtCore::format("EDIFF/eV (outer loop): %.8e \n", ediff); } + // exx converged or get max exx steps + if (this->two_level_step == GlobalC::exx_info.info_global.hybrid_step + || (iter == 1 && this->two_level_step != 0) // density convergence of outer loop + || (ediff < scf_ene_thr && this->two_level_step != 0)) //energy convergence of outer loop { - XC_Functional::set_xc_type(GlobalC::ucell.atoms[0].ncpp.xc_func); + restart_reset(); + return true; } + else + { + this->etot_last_outer_loop = etot; + // update exx and redo scf + if (this->two_level_step == 0) + { + XC_Functional::set_xc_type(GlobalC::ucell.atoms[0].ncpp.xc_func); + } + + std::cout << " Updating EXX " << std::flush; + timeval t_start; gettimeofday(&t_start, nullptr); + + const bool flag_restart = (this->two_level_step == 0) ? true : false; + this->mix_DMk_2D.mix(dm.get_DMK_vector(), flag_restart); + + // GlobalC::exx_lcao.cal_exx_elec(p_esolver->LOC, p_esolver->LOWF.wfc_k_grid); + const std::vector>, RI::Tensor>>> + Ds = std::is_same::value //gamma_only_local + ? RI_2D_Comm::split_m2D_ktoR(*this->exx_ptr->p_kv, this->mix_DMk_2D.get_DMk_gamma_out(), *dm.get_paraV_pointer(), nspin) + : RI_2D_Comm::split_m2D_ktoR(*this->exx_ptr->p_kv, this->mix_DMk_2D.get_DMk_k_out(), *dm.get_paraV_pointer(), nspin); + this->exx_ptr->cal_exx_elec(Ds, *dm.get_paraV_pointer()); + iter = 0; + this->two_level_step++; - std::cout << " Updating EXX " << std::flush; - timeval t_start; gettimeofday(&t_start, nullptr); - - const bool flag_restart = (this->two_level_step == 0) ? true : false; - this->mix_DMk_2D.mix(dm.get_DMK_vector(), flag_restart); - - // GlobalC::exx_lcao.cal_exx_elec(p_esolver->LOC, p_esolver->LOWF.wfc_k_grid); - const std::vector>,RI::Tensor>>> - Ds = GlobalV::GAMMA_ONLY_LOCAL - ? RI_2D_Comm::split_m2D_ktoR(*this->exx_ptr->p_kv, this->mix_DMk_2D.get_DMk_gamma_out(), *dm.get_paraV_pointer(), GlobalV::NSPIN) - : RI_2D_Comm::split_m2D_ktoR(*this->exx_ptr->p_kv, this->mix_DMk_2D.get_DMk_k_out(), *dm.get_paraV_pointer(), GlobalV::NSPIN); - this->exx_ptr->cal_exx_elec(Ds, *dm.get_paraV_pointer()); - iter = 0; - this->two_level_step++; - - timeval t_end; gettimeofday(&t_end, nullptr); - std::cout << "and rerun SCF\t" - << std::setprecision(3) << std::setiosflags(std::ios::scientific) - << (double)(t_end.tv_sec-t_start.tv_sec) + (double)(t_end.tv_usec-t_start.tv_usec)/1000000.0 - << std::defaultfloat << " (s)" << std::endl; - return false; + timeval t_end; gettimeofday(&t_end, nullptr); + std::cout << "and rerun SCF\t" + << std::setprecision(3) << std::setiosflags(std::ios::scientific) + << (double)(t_end.tv_sec - t_start.tv_sec) + (double)(t_end.tv_usec - t_start.tv_usec) / 1000000.0 + << std::defaultfloat << " (s)" << std::endl; + return false; + } } - restart_reset(); return true; }