From f2e91bd5919cb4529170aeef41a4e48e71ec6869 Mon Sep 17 00:00:00 2001 From: Qianrui Liu <76200646+Qianruipku@users.noreply.github.com> Date: Fri, 15 Nov 2024 09:58:18 +0800 Subject: [PATCH] Feature: make force and stress of sDFT support GPU (#5487) * refactor force in sdft * refactor stress in sDFT * make stress_ekin GPU * finish sdft GPU * fix compile * add annotations * fix bug of stress and force * modify --- source/Makefile.Objects | 2 + source/module_base/CMakeLists.txt | 1 + source/module_base/module_device/device.cpp | 31 +- source/module_base/module_device/device.h | 8 +- source/module_base/parallel_common.cpp | 13 +- source/module_base/parallel_device.cpp | 38 ++ source/module_base/parallel_device.h | 45 +- source/module_elecstate/elecstate_pw_sdft.cpp | 9 +- .../module_charge/charge_extra.cpp | 17 +- source/module_esolver/esolver_ks_pw.cpp | 1 - source/module_esolver/esolver_sdft_pw.cpp | 34 +- .../hamilt_pwdft/CMakeLists.txt | 1 + .../module_hamilt_pw/hamilt_pwdft/forces.cpp | 3 +- source/module_hamilt_pw/hamilt_pwdft/forces.h | 3 +- .../hamilt_pwdft/forces_nl.cpp | 13 +- .../hamilt_pwdft/fs_kin_tools.cpp | 160 ++++++ .../hamilt_pwdft/fs_kin_tools.h | 72 +++ .../hamilt_pwdft/fs_nonlocal_tools.cpp | 340 ++++++++----- .../hamilt_pwdft/fs_nonlocal_tools.h | 96 +++- .../hamilt_pwdft/kernels/cuda/force_op.cu | 62 ++- .../hamilt_pwdft/kernels/cuda/stress_op.cu | 92 +++- .../hamilt_pwdft/kernels/force_op.cpp | 57 ++- .../hamilt_pwdft/kernels/force_op.h | 5 + .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 62 ++- .../kernels/rocm/stress_op.hip.cu | 90 +++- .../hamilt_pwdft/kernels/stress_op.cpp | 78 ++- .../hamilt_pwdft/kernels/stress_op.h | 24 + .../kernels/test/force_op_test.cpp | 2 + .../kernels/test/stress_op_test.cpp | 2 + .../hamilt_pwdft/stress_func.h | 6 +- .../hamilt_pwdft/stress_func_kin.cpp | 168 +------ .../hamilt_pwdft/stress_func_nl.cpp | 24 +- .../hamilt_pwdft/stress_pw.cpp | 3 +- .../module_hamilt_pw/hamilt_pwdft/stress_pw.h | 1 - .../hamilt_stodft/sto_forces.cpp | 382 +++++---------- .../hamilt_stodft/sto_forces.h | 25 +- .../hamilt_stodft/sto_iter.cpp | 8 + .../hamilt_stodft/sto_stress_pw.cpp | 461 ++++-------------- .../hamilt_stodft/sto_stress_pw.h | 23 +- source/module_hsolver/hsolver_pw_sdft.cpp | 4 +- source/module_io/input_conv.cpp | 2 +- source/module_io/read_input_item_system.cpp | 6 + source/module_psi/psi.cpp | 2 +- tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT | 43 ++ tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT | 4 + tests/integrate/187_PW_MD_SDFT_ALL_GPU/README | 11 + tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU | 19 + tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd | 1 + .../187_PW_MD_SDFT_ALL_GPU/result.ref | 5 + tests/integrate/187_PW_SDFT_ALL_GPU/INPUT | 38 ++ tests/integrate/187_PW_SDFT_ALL_GPU/KPT | 4 + tests/integrate/187_PW_SDFT_ALL_GPU/README | 11 + tests/integrate/187_PW_SDFT_ALL_GPU/STRU | 19 + tests/integrate/187_PW_SDFT_ALL_GPU/jd | 1 + .../integrate/187_PW_SDFT_ALL_GPU/result.ref | 5 + tests/integrate/187_PW_SDFT_MALL_GPU/INPUT | 38 ++ tests/integrate/187_PW_SDFT_MALL_GPU/KPT | 4 + tests/integrate/187_PW_SDFT_MALL_GPU/README | 11 + tests/integrate/187_PW_SDFT_MALL_GPU/STRU | 19 + tests/integrate/187_PW_SDFT_MALL_GPU/jd | 1 + .../integrate/187_PW_SDFT_MALL_GPU/result.ref | 5 + tests/integrate/CASES_GPU.txt | 3 + 62 files changed, 1645 insertions(+), 1073 deletions(-) create mode 100644 source/module_base/parallel_device.cpp create mode 100644 source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp create mode 100644 source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/README create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd create mode 100644 tests/integrate/187_PW_MD_SDFT_ALL_GPU/result.ref create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/INPUT create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/KPT create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/README create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/STRU create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/jd create mode 100644 tests/integrate/187_PW_SDFT_ALL_GPU/result.ref create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/INPUT create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/KPT create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/README create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/STRU create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/jd create mode 100644 tests/integrate/187_PW_SDFT_MALL_GPU/result.ref diff --git a/source/Makefile.Objects b/source/Makefile.Objects index 78e3080356..430e7b05c9 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -617,6 +617,7 @@ OBJS_PARALLEL=parallel_common.o\ parallel_grid.o\ parallel_kpoints.o\ parallel_reduce.o\ + parallel_device.o OBJS_SRCPW=H_Ewald_pw.o\ dnrm2.o\ @@ -640,6 +641,7 @@ OBJS_SRCPW=H_Ewald_pw.o\ forces_cc.o\ forces_scc.o\ fs_nonlocal_tools.o\ + fs_kin_tools.o\ force_op.o\ stress_op.o\ wf_op.o\ diff --git a/source/module_base/CMakeLists.txt b/source/module_base/CMakeLists.txt index 712cb902ba..b5129f0c6f 100644 --- a/source/module_base/CMakeLists.txt +++ b/source/module_base/CMakeLists.txt @@ -50,6 +50,7 @@ add_library( parallel_global.cpp parallel_comm.cpp parallel_reduce.cpp + parallel_device.cpp spherical_bessel_transformer.cpp cubic_spline.cpp module_mixing/mixing_data.cpp diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp index 4897ef1e8a..b20ea9f3ad 100644 --- a/source/module_base/module_device/device.cpp +++ b/source/module_base/module_device/device.cpp @@ -191,27 +191,30 @@ else { return "cpu"; } } -int get_device_kpar(const int &kpar) { +int get_device_kpar(const int& kpar, const int& bndpar) +{ #if __MPI && (__CUDA || __ROCM) - int temp_nproc; - MPI_Comm_size(MPI_COMM_WORLD, &temp_nproc); - if (temp_nproc != kpar) { - ModuleBase::WARNING("Input_conv", - "None kpar set in INPUT file, auto set kpar value."); - } - // GlobalV::KPAR = temp_nproc; - // band the CPU processor to the devices - int node_rank = base_device::information::get_node_rank(); + int temp_nproc = 0; + int new_kpar = kpar; + MPI_Comm_size(MPI_COMM_WORLD, &temp_nproc); + if (temp_nproc != kpar * bndpar) + { + new_kpar = temp_nproc / bndpar; + ModuleBase::WARNING("Input_conv", "kpar is not compatible with the number of processors, auto set kpar value."); + } + + // get the CPU rank of current node + int node_rank = base_device::information::get_node_rank(); - int device_num = -1; + int device_num = -1; #if defined(__CUDA) - cudaGetDeviceCount(&device_num); - cudaSetDevice(node_rank % device_num); + cudaGetDeviceCount(&device_num); // get the number of GPU devices of current node + cudaSetDevice(node_rank % device_num); // band the CPU processor to the devices #elif defined(__ROCM) hipGetDeviceCount(&device_num); hipSetDevice(node_rank % device_num); #endif - return temp_nproc; + return new_kpar; #endif return kpar; } diff --git a/source/module_base/module_device/device.h b/source/module_base/module_device/device.h index 6be4952f14..c89d3bc9cd 100644 --- a/source/module_base/module_device/device.h +++ b/source/module_base/module_device/device.h @@ -40,7 +40,7 @@ std::string get_device_info(std::string device_flag); * @brief Get the device kpar object * for module_io GlobalV::KPAR */ -int get_device_kpar(const int& kpar); +int get_device_kpar(const int& kpar, const int& bndpar); /** * @brief Get the device flag object @@ -50,6 +50,12 @@ std::string get_device_flag(const std::string& device, const std::string& basis_type); #if __MPI +/** + * @brief Get the rank of current node + * Note that GPU can only be binded with CPU in the same node + * + * @return int + */ int get_node_rank(); int get_node_rank_with_mpi_shared(const MPI_Comm mpi_comm = MPI_COMM_WORLD); int stringCmp(const void* a, const void* b); diff --git a/source/module_base/parallel_common.cpp b/source/module_base/parallel_common.cpp index 1bc3a9e627..6f8ce79fbc 100644 --- a/source/module_base/parallel_common.cpp +++ b/source/module_base/parallel_common.cpp @@ -11,15 +11,16 @@ void Parallel_Common::bcast_string(std::string& object) // Peize Lin fix bug 201 { int size = object.size(); MPI_Bcast(&size, 1, MPI_INT, 0, MPI_COMM_WORLD); - char* swap = new char[size + 1]; + int my_rank; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - if (0 == my_rank) - strcpy(swap, object.c_str()); - MPI_Bcast(swap, size + 1, MPI_CHAR, 0, MPI_COMM_WORLD); + if (0 != my_rank) - object = static_cast(swap); - delete[] swap; + { + object.resize(size); + } + + MPI_Bcast(&object[0], size, MPI_CHAR, 0, MPI_COMM_WORLD); return; } diff --git a/source/module_base/parallel_device.cpp b/source/module_base/parallel_device.cpp new file mode 100644 index 0000000000..269a41821e --- /dev/null +++ b/source/module_base/parallel_device.cpp @@ -0,0 +1,38 @@ +#include "parallel_device.h" +#ifdef __MPI +namespace Parallel_Common +{ +void bcast_data(std::complex* object, const int& n, const MPI_Comm& comm) +{ + MPI_Bcast(object, n * 2, MPI_DOUBLE, 0, comm); +} +void bcast_data(std::complex* object, const int& n, const MPI_Comm& comm) +{ + MPI_Bcast(object, n * 2, MPI_FLOAT, 0, comm); +} +void bcast_data(double* object, const int& n, const MPI_Comm& comm) +{ + MPI_Bcast(object, n, MPI_DOUBLE, 0, comm); +} +void bcast_data(float* object, const int& n, const MPI_Comm& comm) +{ + MPI_Bcast(object, n, MPI_FLOAT, 0, comm); +} +void reduce_data(std::complex* object, const int& n, const MPI_Comm& comm) +{ + MPI_Allreduce(MPI_IN_PLACE, object, n * 2, MPI_DOUBLE, MPI_SUM, comm); +} +void reduce_data(std::complex* object, const int& n, const MPI_Comm& comm) +{ + MPI_Allreduce(MPI_IN_PLACE, object, n * 2, MPI_FLOAT, MPI_SUM, comm); +} +void reduce_data(double* object, const int& n, const MPI_Comm& comm) +{ + MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_DOUBLE, MPI_SUM, comm); +} +void reduce_data(float* object, const int& n, const MPI_Comm& comm) +{ + MPI_Allreduce(MPI_IN_PLACE, object, n, MPI_FLOAT, MPI_SUM, comm); +} +} +#endif \ No newline at end of file diff --git a/source/module_base/parallel_device.h b/source/module_base/parallel_device.h index 8d867ba4fc..09625f6303 100644 --- a/source/module_base/parallel_device.h +++ b/source/module_base/parallel_device.h @@ -1,39 +1,34 @@ +#ifndef __PARALLEL_DEVICE_H__ +#define __PARALLEL_DEVICE_H__ #ifdef __MPI #include "mpi.h" #include "module_base/module_device/device.h" +#include "module_base/module_device/memory_op.h" #include -#include -#include namespace Parallel_Common { -void bcast_complex(std::complex* object, const int& n, const MPI_Comm& comm) -{ - MPI_Bcast(object, n * 2, MPI_DOUBLE, 0, comm); -} -void bcast_complex(std::complex* object, const int& n, const MPI_Comm& comm) -{ - MPI_Bcast(object, n * 2, MPI_FLOAT, 0, comm); -} -void bcast_real(double* object, const int& n, const MPI_Comm& comm) -{ - MPI_Bcast(object, n, MPI_DOUBLE, 0, comm); -} -void bcast_real(float* object, const int& n, const MPI_Comm& comm) -{ - MPI_Bcast(object, n, MPI_FLOAT, 0, comm); -} +void bcast_data(std::complex* object, const int& n, const MPI_Comm& comm); +void bcast_data(std::complex* object, const int& n, const MPI_Comm& comm); +void bcast_data(double* object, const int& n, const MPI_Comm& comm); +void bcast_data(float* object, const int& n, const MPI_Comm& comm); +void reduce_data(std::complex* object, const int& n, const MPI_Comm& comm); +void reduce_data(std::complex* object, const int& n, const MPI_Comm& comm); +void reduce_data(double* object, const int& n, const MPI_Comm& comm); +void reduce_data(float* object, const int& n, const MPI_Comm& comm); -template /** - * @brief bcast complex in Device + * @brief bcast data in Device * + * @tparam T: float, double, std::complex, std::complex + * @tparam Device * @param ctx Device ctx * @param object complex arrays in Device * @param n the size of complex arrays * @param comm MPI_Comm * @param tmp_space tmp space in CPU */ -void bcast_complex(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr) +template +void bcast_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr) { const base_device::DEVICE_CPU* cpu_ctx = {}; T* object_cpu = nullptr; @@ -56,7 +51,7 @@ void bcast_complex(const Device* ctx, T* object, const int& n, const MPI_Comm& c object_cpu = object; } - bcast_complex(object_cpu, n, comm); + bcast_data(object_cpu, n, comm); if (base_device::get_device_type(ctx) == base_device::GpuDevice) { @@ -70,7 +65,7 @@ void bcast_complex(const Device* ctx, T* object, const int& n, const MPI_Comm& c } template -void bcast_real(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr) +void reduce_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr) { const base_device::DEVICE_CPU* cpu_ctx = {}; T* object_cpu = nullptr; @@ -93,7 +88,7 @@ void bcast_real(const Device* ctx, T* object, const int& n, const MPI_Comm& comm object_cpu = object; } - bcast_real(object_cpu, n, comm); + reduce_data(object_cpu, n, comm); if (base_device::get_device_type(ctx) == base_device::GpuDevice) { @@ -105,7 +100,9 @@ void bcast_real(const Device* ctx, T* object, const int& n, const MPI_Comm& comm } return; } + } +#endif #endif \ No newline at end of file diff --git a/source/module_elecstate/elecstate_pw_sdft.cpp b/source/module_elecstate/elecstate_pw_sdft.cpp index 9a3be4a42f..ad6f98c3c3 100644 --- a/source/module_elecstate/elecstate_pw_sdft.cpp +++ b/source/module_elecstate/elecstate_pw_sdft.cpp @@ -14,14 +14,13 @@ void ElecStatePW_SDFT::psiToRho(const psi::Psi& psi) ModuleBase::TITLE(this->classname, "psiToRho"); ModuleBase::timer::tick(this->classname, "psiToRho"); const int nspin = PARAM.inp.nspin; + for (int is = 0; is < nspin; is++) + { + setmem_var_op()(this->ctx, this->rho[is], 0, this->charge->nrxx); + } if (GlobalV::MY_STOGROUP == 0) { - for (int is = 0; is < nspin; is++) - { - setmem_var_op()(this->ctx, this->rho[is], 0, this->charge->nrxx); - } - for (int ik = 0; ik < psi.get_nk(); ++ik) { psi.fix_k(ik); diff --git a/source/module_elecstate/module_charge/charge_extra.cpp b/source/module_elecstate/module_charge/charge_extra.cpp index fc40a2d494..d0e0ff1299 100644 --- a/source/module_elecstate/module_charge/charge_extra.cpp +++ b/source/module_elecstate/module_charge/charge_extra.cpp @@ -47,9 +47,20 @@ void Charge_Extra::Init_CE(const int& nspin, const int& natom, const int& nrxx, if (pot_order > 0) { - delta_rho1.resize(this->nspin, std::vector(nrxx, 0.0)); - delta_rho2.resize(this->nspin, std::vector(nrxx, 0.0)); - delta_rho3.resize(this->nspin, std::vector(nrxx, 0.0)); + // delta_rho1.resize(this->nspin, std::vector(nrxx, 0.0)); + // delta_rho2.resize(this->nspin, std::vector(nrxx, 0.0)); + // delta_rho3.resize(this->nspin, std::vector(nrxx, 0.0)); + // qianrui replace the above code with the following code. + // The above code cannot passed valgrind tests, which has an invalid read of size 32. + delta_rho1.resize(this->nspin); + delta_rho2.resize(this->nspin); + delta_rho3.resize(this->nspin); + for (int is = 0; is < this->nspin; is++) + { + delta_rho1[is].resize(nrxx, 0.0); + delta_rho2[is].resize(nrxx, 0.0); + delta_rho3[is].resize(nrxx, 0.0); + } } if(pot_order == 3) diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp index 8af664799d..f6a4e335dc 100644 --- a/source/module_esolver/esolver_ks_pw.cpp +++ b/source/module_esolver/esolver_ks_pw.cpp @@ -623,7 +623,6 @@ void ESolver_KS_PW::cal_stress(ModuleBase::matrix& stress) &this->sf, &this->kv, this->pw_wfc, - this->psi, this->__kspw_psi); // external stress diff --git a/source/module_esolver/esolver_sdft_pw.cpp b/source/module_esolver/esolver_sdft_pw.cpp index 8ee6098fe2..19ec08137e 100644 --- a/source/module_esolver/esolver_sdft_pw.cpp +++ b/source/module_esolver/esolver_sdft_pw.cpp @@ -169,6 +169,9 @@ void ESolver_SDFT_PW::after_scf(const int istep) template void ESolver_SDFT_PW::hamilt2density_single(int istep, int iter, double ethr) { + ModuleBase::TITLE("ESolver_SDFT_PW", "hamilt2density"); + ModuleBase::timer::tick("ESolver_SDFT_PW", "hamilt2density"); + // reset energy this->pelec->f_en.eband = 0.0; this->pelec->f_en.demet = 0.0; @@ -241,6 +244,7 @@ void ESolver_SDFT_PW::hamilt2density_single(int istep, int iter, doub #ifdef __MPI MPI_Bcast(&(this->pelec->f_en.deband), 1, MPI_DOUBLE, 0, PARAPW_WORLD); #endif + ModuleBase::timer::tick("ESolver_SDFT_PW", "hamilt2density"); } template @@ -249,10 +253,10 @@ double ESolver_SDFT_PW::cal_energy() return this->pelec->f_en.etot; } -template <> -void ESolver_SDFT_PW, base_device::DEVICE_CPU>::cal_force(ModuleBase::matrix& force) +template +void ESolver_SDFT_PW::cal_force(ModuleBase::matrix& force) { - Sto_Forces ff(GlobalC::ucell.nat); + Sto_Forces ff(GlobalC::ucell.nat); ff.cal_stoforce(force, *this->pelec, @@ -261,20 +265,16 @@ void ESolver_SDFT_PW, base_device::DEVICE_CPU>::cal_force(M &this->sf, &this->kv, this->pw_wfc, - this->psi, + GlobalC::ppcell, + GlobalC::ucell, + *this->kspw_psi, this->stowf); } -template <> -void ESolver_SDFT_PW, base_device::DEVICE_GPU>::cal_force(ModuleBase::matrix& force) -{ - ModuleBase::WARNING_QUIT("ESolver_SDFT_PW::cal_force", "DEVICE_GPU is not supported"); -} - -template <> -void ESolver_SDFT_PW, base_device::DEVICE_CPU>::cal_stress(ModuleBase::matrix& stress) +template +void ESolver_SDFT_PW::cal_stress(ModuleBase::matrix& stress) { - Sto_Stress_PW ss; + Sto_Stress_PW ss; ss.cal_stress(stress, *this->pelec, this->pw_rho, @@ -282,19 +282,13 @@ void ESolver_SDFT_PW, base_device::DEVICE_CPU>::cal_stress( &this->sf, &this->kv, this->pw_wfc, - this->psi, + *this->kspw_psi, this->stowf, this->pelec->charge, &GlobalC::ppcell, GlobalC::ucell); } -template <> -void ESolver_SDFT_PW, base_device::DEVICE_GPU>::cal_stress(ModuleBase::matrix& stress) -{ - ModuleBase::WARNING_QUIT("ESolver_SDFT_PW::cal_stress", "DEVICE_GPU is not supported"); -} - template void ESolver_SDFT_PW::after_all_runners() { diff --git a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt index 2cb1ae5247..95636d8959 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt +++ b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt @@ -36,6 +36,7 @@ list(APPEND objects parallel_grid.cpp elecond.cpp fs_nonlocal_tools.cpp + fs_kin_tools.cpp radial_proj.cpp ) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index 589fc71009..41fb91da6f 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -33,6 +33,7 @@ void Forces::cal_force(ModuleBase::matrix& force, ModulePW::PW_Basis_K* wfc_basis, const psi::Psi, Device>* psi_in) { + ModuleBase::timer::tick("Forces", "cal_force"); ModuleBase::TITLE("Forces", "init"); this->device = base_device::get_device_type(this->ctx); const ModuleBase::matrix& wg = elec.wg; @@ -455,7 +456,7 @@ void Forces::cal_force(ModuleBase::matrix& force, } } ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "TOTAL-FORCE (eV/Angstrom)", force, false); - + ModuleBase::timer::tick("Forces", "cal_force"); return; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h index 0247fbfae7..712cceac66 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.h +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h @@ -96,10 +96,11 @@ class Forces FPTYPE* drhocg, ModulePW::PW_Basis* rho_basis, const UnitCell& ucell_in); // used in nonlinear core correction stress - private: + protected: Device* ctx = {}; base_device::DEVICE_CPU* cpu_ctx = {}; base_device::AbacusDevice_t device = {}; + private: using gemm_op = hsolver::gemm_op, Device>; using resmem_complex_op = base_device::memory::resize_memory_op, Device>; diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp index 92733745b4..a8fc6976c6 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp @@ -30,9 +30,10 @@ void Forces::cal_force_nl(ModuleBase::matrix& forcenl, resmem_var_op()(this->ctx, force, ucell_in.nat * 3); base_device::memory::set_memory_op()(this->ctx, force, 0.0, ucell_in.nat * 3); - hamilt::FS_Nonlocal_tools nl_tools(nlpp_in, &ucell_in, psi_in, p_kv, wfc_basis, p_sf, wg, ekb); + hamilt::FS_Nonlocal_tools nl_tools(nlpp_in, &ucell_in, p_kv, wfc_basis, p_sf, wg, &ekb); const int nks = wfc_basis->nks; + const int max_nbands = wg.nc; for (int ik = 0; ik < nks; ik++) // loop k points { // skip zero weights to speed up @@ -46,15 +47,19 @@ void Forces::cal_force_nl(ModuleBase::matrix& forcenl, } } const int npm = nbands_occ; + nl_tools.cal_vkb(ik, max_nbands); // calculate becp = for all beta functions - nl_tools.cal_becp(ik, npm); + nl_tools.cal_becp(ik, npm, &psi_in[0](ik,0,0)); + nl_tools.reduce_pool_becp(max_nbands); for (int ipol = 0; ipol < 3; ipol++) { + nl_tools.cal_vkb_deri_f(ik, max_nbands, ipol); // calculate dbecp = for all beta functions - nl_tools.cal_dbecp_f(ik, npm, ipol); + nl_tools.cal_dbecp_f(ik, max_nbands, npm, ipol, &psi_in[0](ik,0,0)); + nl_tools.revert_vkb(ik, ipol); } // calculate the force_i = \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_i - nl_tools.cal_force(ik, npm, force); + nl_tools.cal_force(ik, max_nbands, npm, true, force); } // end ik syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcenl.c, force, forcenl.nr * forcenl.nc); diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp new file mode 100644 index 0000000000..89efb3f879 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp @@ -0,0 +1,160 @@ +#include "fs_kin_tools.h" + +#include "module_base/parallel_reduce.h" +namespace hamilt +{ +template +FS_Kin_tools::FS_Kin_tools(const UnitCell& ucell_in, + const K_Vectors* p_kv, + const ModulePW::PW_Basis_K* wfc_basis_in, + const ModuleBase::matrix& wg) + : ucell_(ucell_in), nksbands_(wg.nc), wg(wg.c), wk(p_kv->wk.data()) +{ + this->device = base_device::get_device_type(this->ctx); + this->wfc_basis_ = wfc_basis_in; + const int npwk_max = this->wfc_basis_->npwk_max; + const int nks = this->wfc_basis_->nks; + const int npol = ucell_in.get_npol(); + + this->gk3_.resize(3 * npwk_max); + this->gk.resize(3); + for (int i = 0; i < 3; ++i) + { + this->gk[i] = &this->gk3_[i * npwk_max]; + } + this->kfac.resize(npwk_max); + this->s_kin.resize(9, 0.0); + + if (this->device == base_device::GpuDevice) + { + resmem_var_op()(this->ctx, d_gk, 3 * npwk_max); + resmem_var_op()(this->ctx, d_kfac, npwk_max); + } + else + { + d_gk = gk3_.data(); + d_kfac = kfac.data(); + } +} + +template +FS_Kin_tools::~FS_Kin_tools() +{ + if (this->device == base_device::GpuDevice) + { + delmem_var_op()(this->ctx, d_gk); + delmem_var_op()(this->ctx, d_kfac); + } +} + +template +void FS_Kin_tools::cal_gk(const int& ik) +{ + const double tpiba = ModuleBase::TWO_PI / this->ucell_.lat0; + const double twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI); + const int npw = wfc_basis_->npwk[ik]; + const int npwk_max = wfc_basis_->npwk_max; + for (int i = 0; i < npw; ++i) + { + gk[0][i] = wfc_basis_->getgpluskcar(ik, i)[0] * tpiba; + gk[1][i] = wfc_basis_->getgpluskcar(ik, i)[1] * tpiba; + gk[2][i] = wfc_basis_->getgpluskcar(ik, i)[2] * tpiba; + if (wfc_basis_->erf_height > 0) + { + double gk2 = gk[0][i] * gk[0][i] + gk[1][i] * gk[1][i] + gk[2][i] * gk[2][i]; + double arg = (gk2 - wfc_basis_->erf_ecut) / wfc_basis_->erf_sigma; + kfac[i] = 1.0 + wfc_basis_->erf_height / wfc_basis_->erf_sigma * twobysqrtpi * std::exp(-arg * arg); + } + else + { + kfac[i] = 1.0; + } + } + if (this->device == base_device::GpuDevice) + { + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gk, gk[0], 3 * npwk_max); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_kfac, kfac.data(), npwk_max); + } +} + +template +void FS_Kin_tools::cal_stress_kin(const int& ik, + const int& npm, + const bool& occ, + const std::complex* psi) +{ + if (npm == 0) + { + return; + } + const int npw = wfc_basis_->npwk[ik]; + const int npwk_max = wfc_basis_->npwk_max; + const int npol = this->ucell_.get_npol(); + for (int ib = 0; ib < npm; ib++) + { + const std::complex* ppsi = psi + ib * npwk_max * npol; + const std::complex* ppsi2 = ppsi + npwk_max; + FPTYPE fac = 0.0; + if (occ) + { + fac = wg[ik * this->nksbands_ + ib]; + if (fac == 0.0) + { + continue; + } + } + else + { + fac = wk[ik]; + } + for (int l = 0; l < 3; l++) + { + const FPTYPE* d_gkl = d_gk + l * npwk_max; + for (int m = 0; m < l + 1; m++) + { + const FPTYPE* d_gkm = d_gk + m * npwk_max; + FPTYPE sum = 0; + sum += cal_multi_dot_op()(npw, fac, d_gkl, d_gkm, d_kfac, ppsi); + if (npol == 2) + { + sum += cal_multi_dot_op()(npw, fac, d_gkl, d_gkm, d_kfac, ppsi2); + } + s_kin[l * 3 + m] += sum; + } + } + } +} + +template +void FS_Kin_tools::symmetrize_stress(ModuleSymmetry::Symmetry* p_symm, ModuleBase::matrix& sigma) +{ + for (int l = 0; l < 3; ++l) + { + for (int m = 0; m < l; ++m) + { + s_kin[m * 3 + l] = s_kin[l * 3 + m]; + } + } + + Parallel_Reduce::reduce_all(s_kin.data(), 9); + + for (int l = 0; l < 3; ++l) + { + for (int m = 0; m < 3; ++m) + { + sigma(l, m) = s_kin[l * 3 + m] * ModuleBase::e2 / this->ucell_.omega; + } + } + // do symmetry + if (ModuleSymmetry::Symmetry::symm_flag == 1) + { + p_symm->symmetrize_mat3(sigma, this->ucell_.lat); + } +} + +template class FS_Kin_tools; +#if ((defined __CUDA) || (defined __ROCM)) +template class FS_Kin_tools; +#endif + +} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h new file mode 100644 index 0000000000..b6327cf47f --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h @@ -0,0 +1,72 @@ +#ifndef FS_KIN_TOOLS_H +#define FS_KIN_TOOLS_H +#include "module_base/module_device/device.h" +#include "module_basis/module_pw/pw_basis_k.h" +#include "module_cell/klist.h" +#include "module_cell/unitcell.h" +#include "module_cell/module_symmetry/symmetry.h" +#include "module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h" + +#include +namespace hamilt +{ +template +class FS_Kin_tools +{ + public: + FS_Kin_tools(const UnitCell& ucell_in, + const K_Vectors* kv_in, + const ModulePW::PW_Basis_K* wfc_basis_in, + const ModuleBase::matrix& wg); + ~FS_Kin_tools(); + + /** + * @brief calculate G+k and store it in gk and also calculate kfac + */ + void cal_gk(const int& ik); + + /** + * @brief calculate stress tensor for kinetic energy + * stress = \sum_{G,k,i} wk(k) * gk_l(G) * gk_m(G) * d_kfac(G) * occ_i*|ppsi_i(G)|^2 + * + * @param ik k-point index + * @param npm number of bands + * @param occ if use the occupation of the bands + * @param psi wavefunctions + */ + void cal_stress_kin(const int& ik, const int& npm, const bool& occ, const std::complex* psi); + + /** + * @brief symmetrize the stress tensor + */ + void symmetrize_stress(ModuleSymmetry::Symmetry* p_symm, ModuleBase::matrix& sigma); + + protected: + Device* ctx = {}; + base_device::DEVICE_CPU* cpu_ctx = {}; + base_device::AbacusDevice_t device = {}; + std::vector gk3_; + std::vector gk; + std::vector kfac; + std::vector s_kin; + FPTYPE* d_gk = nullptr; + FPTYPE* d_kfac = nullptr; + const FPTYPE* wg = nullptr; + const FPTYPE* wk = nullptr; + const ModulePW::PW_Basis_K* wfc_basis_ = nullptr; + const UnitCell& ucell_; + const int nksbands_; + + + private: + using resmem_var_op = base_device::memory::resize_memory_op; + using setmem_var_op = base_device::memory::set_memory_op; + using delmem_var_op = base_device::memory::delete_memory_op; + using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; + using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op; + using cal_multi_dot_op = hamilt::cal_multi_dot_op; +}; + +} // namespace hamilt + +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index 4dbdbc558f..b219678f4c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -1,12 +1,13 @@ #include "fs_nonlocal_tools.h" #include "module_base/math_polyint.h" -#include "module_parameter/parameter.h" #include "module_base/math_ylmreal.h" #include "module_base/memory.h" +#include "module_base/parallel_device.h" #include "module_base/timer.h" #include "module_base/tool_title.h" #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" +#include "module_parameter/parameter.h" #include "nonlocal_maths.hpp" namespace hamilt @@ -15,27 +16,26 @@ namespace hamilt template FS_Nonlocal_tools::FS_Nonlocal_tools(const pseudopot_cell_vnl* nlpp_in, const UnitCell* ucell_in, - const psi::Psi, Device>* psi_in, const K_Vectors* kv_in, const ModulePW::PW_Basis_K* wfc_basis_in, const Structure_Factor* sf_in, const ModuleBase::matrix& wg, - const ModuleBase::matrix& ekb) - : nlpp_(nlpp_in), ucell_(ucell_in), psi_(psi_in), kv_(kv_in), wfc_basis_(wfc_basis_in), sf_(sf_in) + const ModuleBase::matrix* p_ekb) + : nlpp_(nlpp_in), ucell_(ucell_in), kv_(kv_in), wfc_basis_(wfc_basis_in), sf_(sf_in) { // get the device context this->device = base_device::get_device_type(this->ctx); this->nkb = nlpp_->nkb; - this->nbands = psi_->get_nbands(); this->max_npw = wfc_basis_->npwk_max; this->ntype = ucell_->ntype; + this->nbands = wg.nc; // There is a contribution for jh<>ih in US case or multi projectors case // Actually, the judge of nondiagonal should be done on every atom type this->nondiagonal = (PARAM.globalv.use_uspp || this->nlpp_->multi_proj) ? true : false; // allocate memory - this->allocate_memory(wg, ekb); + this->allocate_memory(wg, p_ekb); } template @@ -46,7 +46,7 @@ FS_Nonlocal_tools::~FS_Nonlocal_tools() } template -void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix& wg, const ModuleBase::matrix& ekb) +void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix& wg, const ModuleBase::matrix* p_ekb) { // allocate memory @@ -81,13 +81,19 @@ void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix const int _lmax = this->nlpp_->lmaxkb; resmem_var_op()(this->ctx, this->hd_ylm, (_lmax + 1) * (_lmax + 1) * max_npw); resmem_var_op()(this->ctx, this->hd_ylm_deri, 3 * (_lmax + 1) * (_lmax + 1) * max_npw); + const int nks = this->kv_->get_nks(); + resmem_var_op()(this->ctx, d_wk, nks); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wk, this->kv_->wk.data(), nks); if (this->device == base_device::GpuDevice) { resmem_var_op()(this->ctx, d_wg, wg.nr * wg.nc); - resmem_var_op()(this->ctx, d_ekb, ekb.nr * ekb.nc); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, wg.c, wg.nr * wg.nc); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, ekb.c, ekb.nr * ekb.nc); + if (p_ekb != nullptr) + { + resmem_var_op()(this->ctx, d_ekb, p_ekb->nr * p_ekb->nc); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, p_ekb->c, p_ekb->nr * p_ekb->nc); + } resmem_int_op()(this->ctx, atom_nh, this->ntype); resmem_int_op()(this->ctx, atom_na, this->ntype); syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_nh, h_atom_nh.data(), this->ntype); @@ -103,7 +109,10 @@ void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix else { this->d_wg = wg.c; - this->d_ekb = ekb.c; + if (p_ekb != nullptr) + { + this->d_ekb = p_ekb->c; + } this->atom_nh = h_atom_nh.data(); this->atom_na = h_atom_na.data(); this->ppcell_vkb = this->nlpp_->vkb.c; @@ -119,6 +128,7 @@ void FS_Nonlocal_tools::delete_memory() delmem_var_op()(this->ctx, hd_vq_deri); delmem_var_op()(this->ctx, hd_ylm); delmem_var_op()(this->ctx, hd_ylm_deri); + delmem_var_op()(this->ctx, d_wk); // delete memory on GPU if (this->device == base_device::GpuDevice) @@ -151,15 +161,13 @@ void FS_Nonlocal_tools::delete_memory() } } -// cal_becp +// cal_vkb template -void FS_Nonlocal_tools::cal_becp(int ik, int npm) +void FS_Nonlocal_tools::cal_vkb(const int& ik, const int& nbdall) { - ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp"); - ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); + ModuleBase::TITLE("FS_Nonlocal_tools", "cal_vkb"); const int npol = this->ucell_->get_npol(); - const int size_becp = this->nbands * npol * this->nkb; - const int size_becp_act = npm * npol * this->nkb; + const int size_becp = nbdall * npol * this->nkb; if (this->becp == nullptr) { resmem_complex_op()(this->ctx, becp, size_becp); @@ -167,8 +175,6 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) // prepare math tools Nonlocal_maths maths(this->nlpp_, this->ucell_); - - const std::complex* ppsi = &(this->psi_[0](ik, 0, 0)); const int npw = this->wfc_basis_->npwk[ik]; std::complex* vkb_ptr = this->ppcell_vkb; @@ -244,50 +250,66 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) d_sk += npw; } } +} + +// cal_becp +template +void FS_Nonlocal_tools::cal_becp(const int& ik, + const int& npm, + const std::complex* ppsi, + const int& nbd0) +{ + if (npm == 0) + { + return; + } + const int npol = this->ucell_->get_npol(); + const int npw = this->wfc_basis_->npwk[ik]; const char transa = 'C'; const char transb = 'N'; - int npm_npol = npm * npol; + const int npm_npol = npm * npol; + const int index0 = nbd0 * npol * nkb; gemm_op()(this->ctx, transa, transb, - nkb, + this->nkb, npm_npol, npw, &ModuleBase::ONE, - ppcell_vkb, + this->ppcell_vkb, npw, ppsi, this->max_npw, &ModuleBase::ZERO, - becp, - nkb); + this->becp + index0, + this->nkb); + ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); +} +template +void FS_Nonlocal_tools::reduce_pool_becp(const int& npm) +{ + const int npol = this->ucell_->get_npol(); + const int size_becp_act = npm * npol * this->nkb; // becp calculate is over , now we should broadcast this data. - if (this->device == base_device::GpuDevice) +#ifdef __MPI + if (GlobalV::NPROC_IN_POOL > 1) { - std::complex* h_becp = nullptr; - resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act); - syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act); - Parallel_Reduce::reduce_pool(h_becp, size_becp_act); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act); - delmem_complex_h_op()(this->cpu_ctx, h_becp); + Parallel_Common::reduce_dev(this->ctx, this->becp, size_becp_act, POOL_WORLD); } - else - { - Parallel_Reduce::reduce_pool(becp, size_becp_act); - } - ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); +#endif } // cal_dbecp template -void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, int jpol, FPTYPE* stress) +void FS_Nonlocal_tools::cal_vkb_deri_s(const int& ik, + const int& nbdall, + const int& ipol, + const int& jpol) { - ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); - ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); + ModuleBase::TITLE("FS_Nonlocal_tools", "cal_vkb_deri_s"); const int npol = this->ucell_->get_npol(); - const int size_becp = this->nbands * npol * this->nkb; - const int npm_npol = npm * npol; + const int size_becp = nbdall * npol * this->nkb; if (this->dbecp == nullptr) { resmem_complex_op()(this->ctx, dbecp, size_becp); @@ -296,7 +318,6 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i // prepare math tools Nonlocal_maths maths(this->nlpp_, this->ucell_); - const std::complex* ppsi = &(this->psi_[0](ik, 0, 0)); const int npw = this->wfc_basis_->npwk[ik]; std::complex* vkb_deri_ptr = this->ppcell_vkb; @@ -394,51 +415,99 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i vkb_deri_ptr += nh * npw; } } +} + +// cal_dbecp +template +void FS_Nonlocal_tools::cal_dbecp_s(const int& ik, + const int& npm, + const std::complex* ppsi, + const int& nbd0) +{ + ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); + const int npol = this->ucell_->get_npol(); + const int npm_npol = npm * npol; + const int npw = this->wfc_basis_->npwk[ik]; + std::complex* dbecp_ptr = this->dbecp + nbd0 * npol * this->nkb; + // 2.b calculate dbecp = dbecp_noevc * psi const char transa = 'C'; const char transb = 'N'; - gemm_op()(this->ctx, transa, transb, - nkb, + this->nkb, npm_npol, npw, &ModuleBase::ONE, - ppcell_vkb, + this->ppcell_vkb, npw, ppsi, this->max_npw, &ModuleBase::ZERO, - dbecp, - nkb); + dbecp_ptr, + this->nkb); +} + +template +void FS_Nonlocal_tools::cal_stress(const int& ik, + const int& npm, + const bool& occ, + const int& ipol, + const int& jpol, + FPTYPE* stress, + const int& nbd0) +{ + const int npol = this->ucell_->get_npol(); + const int index0 = nbd0 * npol * this->nkb; // calculate stress for target (ipol, jpol) - if(npol == 1) + if (npol == 1) { const int current_spin = this->kv_->isk[ik]; + FPTYPE* d_ekb_ik = nullptr; + if (d_ekb != nullptr) + { + d_ekb_ik = d_ekb + this->nbands * ik; + } + FPTYPE* d_wg_ik = d_wk + ik; + if (occ) + { + d_wg_ik = d_wg + this->nbands * ik; + } cal_stress_nl_op()(this->ctx, - nondiagonal, - ipol, - jpol, - nkb, - npm, - this->ntype, - current_spin, // uspp only - this->nlpp_->deeq.getBound2(), - this->nlpp_->deeq.getBound3(), - this->nlpp_->deeq.getBound4(), - atom_nh, - atom_na, - d_wg + this->nbands * ik, - d_ekb + this->nbands * ik, - qq_nt, - deeq, - becp, - dbecp, - stress); + nondiagonal, + ipol, + jpol, + nkb, + npm, + this->ntype, + current_spin, // uspp only + this->nlpp_->deeq.getBound2(), + this->nlpp_->deeq.getBound3(), + this->nlpp_->deeq.getBound4(), + atom_nh, + atom_na, + d_wg_ik, + occ, + d_ekb_ik, + qq_nt, + deeq, + becp + index0, + dbecp + index0, + stress); } else { + FPTYPE* d_ekb_ik = nullptr; + if (d_ekb != nullptr) + { + d_ekb_ik = d_ekb + this->nbands * ik; + } + FPTYPE* d_wg_ik = d_wk + ik; + if (occ) + { + d_wg_ik = d_wg + this->nbands * ik; + } cal_stress_nl_op()(this->ctx, ipol, jpol, @@ -450,35 +519,31 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i this->nlpp_->deeq_nc.getBound4(), atom_nh, atom_na, - d_wg + this->nbands * ik, - d_ekb + this->nbands * ik, + d_wg_ik, + occ, + d_ekb_ik, qq_nt, this->nlpp_->template get_deeq_nc_data(), - becp, - dbecp, + becp + index0, + dbecp + index0, stress); } - ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); } template -void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) +void FS_Nonlocal_tools::cal_vkb_deri_f(const int& ik, const int& nbdall, const int& ipol) { - ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f"); - ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); + ModuleBase::TITLE("FS_Nonlocal_tools", "cal_vkb_deri"); const int npol = this->ucell_->get_npol(); - const int npm_npol = npm * npol; - const int size_becp = this->nbands * npol * this->nkb; + const int size_becp = nbdall * npol * this->nkb; if (this->dbecp == nullptr) { resmem_complex_op()(this->ctx, dbecp, 3 * size_becp); } - std::complex* dbecp_ptr = this->dbecp + ipol * size_becp; const std::complex* vkb_ptr = this->ppcell_vkb; std::complex* vkb_deri_ptr = this->ppcell_vkb; - const std::complex* ppsi = &(this->psi_[0](ik, 0, 0)); const int npw = this->wfc_basis_->npwk[ik]; if (this->pre_ik_f == -1) { @@ -493,13 +558,32 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) &(this->wfc_basis_->gcar[ik * this->wfc_basis_->npwk_max].x)); } - this->save_vkb(npw, ipol); + this->save_vkb(ik, ipol); const std::complex coeff = ipol == 0 ? ModuleBase::NEG_IMAG_UNIT : ModuleBase::ONE; // calculate the vkb_deri for ipol with the memory of ppcell_vkb cal_vkb1_nl_op()(this->ctx, nkb, npw, npw, npw, ipol, coeff, vkb_ptr, gcar, vkb_deri_ptr); +} + +template +void FS_Nonlocal_tools::cal_dbecp_f(const int& ik, + const int& nbdall, + const int& npm, + const int& ipol, + const std::complex* ppsi, + const int& nbd0) +{ + ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f"); + const int npol = this->ucell_->get_npol(); + const int size_becp = nbdall * npol * this->nkb; + const int index0 = nbd0 * npol * this->nkb; + std::complex* dbecp_ptr = this->dbecp + ipol * size_becp + index0; + std::complex* vkb_deri_ptr = this->ppcell_vkb; + const int npm_npol = npm * npol; + const int npw = this->wfc_basis_->npwk[ik]; + // do gemm to get dbecp and revert the ppcell_vkb for next ipol const char transa = 'C'; const char transb = 'N'; @@ -516,16 +600,14 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) this->max_npw, &ModuleBase::ZERO, dbecp_ptr, - nkb); - this->revert_vkb(npw, ipol); - this->pre_ik_f = ik; - ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); + this->nkb); } // save_vkb template -void FS_Nonlocal_tools::save_vkb(int npw, int ipol) +void FS_Nonlocal_tools::save_vkb(const int& ik, const int& ipol) { + const int npw = this->wfc_basis_->npwk[ik]; if (this->device == base_device::CpuDevice) { const int gcar_zero_count = this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max]; @@ -560,8 +642,9 @@ void FS_Nonlocal_tools::save_vkb(int npw, int ipol) // revert_vkb template -void FS_Nonlocal_tools::revert_vkb(int npw, int ipol) +void FS_Nonlocal_tools::revert_vkb(const int& ik, const int& ipol) { + const int npw = this->wfc_basis_->npwk[ik]; const std::complex coeff = ipol == 0 ? ModuleBase::NEG_IMAG_UNIT : ModuleBase::ONE; if (this->device == base_device::CpuDevice) { @@ -594,10 +677,11 @@ void FS_Nonlocal_tools::revert_vkb(int npw, int ipol) coeff); #endif } + this->pre_ik_f = ik; } template -void FS_Nonlocal_tools::transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in) +void FS_Nonlocal_tools::transfer_gcar(const int& npw, const int& npw_max, const FPTYPE* gcar_in) { std::vector gcar_tmp(3 * npw_max); gcar_tmp.assign(gcar_in, gcar_in + 3 * npw_max); @@ -654,37 +738,66 @@ void FS_Nonlocal_tools::transfer_gcar(int npw, int npw_max, cons // cal_force template -void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force) +void FS_Nonlocal_tools::cal_force(const int& ik, + const int& nbdall, + const int& npm, + const bool& occ, + FPTYPE* force, + const int& nbd0) { const int current_spin = this->kv_->isk[ik]; const int force_nc = 3; + const int npol = this->ucell_->get_npol(); + const int index0 = nbd0 * npol * this->nkb; // calculate the force - if(this->ucell_->get_npol() == 1) + if (npol == 1) { + FPTYPE* d_ekb_ik = nullptr; + if (d_ekb != nullptr) + { + d_ekb_ik = d_ekb + this->nbands * ik; + } + FPTYPE* d_wg_ik = d_wk + ik; + if (occ) + { + d_wg_ik = d_wg + this->nbands * ik; + } + cal_force_nl_op()(this->ctx, - nondiagonal, - npm, - this->ntype, - current_spin, - this->nlpp_->deeq.getBound2(), - this->nlpp_->deeq.getBound3(), - this->nlpp_->deeq.getBound4(), - force_nc, - this->nbands, - nkb, - atom_nh, - atom_na, - this->ucell_->tpiba, - d_wg + this->nbands * ik, - d_ekb + this->nbands * ik, - qq_nt, - deeq, - becp, - dbecp, - force); + nondiagonal, + npm, + this->ntype, + current_spin, + this->nlpp_->deeq.getBound2(), + this->nlpp_->deeq.getBound3(), + this->nlpp_->deeq.getBound4(), + force_nc, + nbdall, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg_ik, + occ, + d_ekb_ik, + qq_nt, + deeq, + becp + index0, + dbecp + index0, + force); } else { + FPTYPE* d_ekb_ik = nullptr; + if (d_ekb != nullptr) + { + d_ekb_ik = d_ekb + this->nbands * ik; + } + FPTYPE* d_wg_ik = d_wk + ik; + if (occ) + { + d_wg_ik = d_wg + this->nbands * ik; + } cal_force_nl_op()(this->ctx, npm, this->ntype, @@ -692,17 +805,18 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force this->nlpp_->deeq_nc.getBound3(), this->nlpp_->deeq_nc.getBound4(), force_nc, - this->nbands, + nbdall, nkb, atom_nh, atom_na, this->ucell_->tpiba, - d_wg + this->nbands * ik, - d_ekb + this->nbands * ik, + d_wg_ik, + occ, + d_ekb_ik, qq_nt, this->nlpp_->template get_deeq_nc_data(), - becp, - dbecp, + becp + index0, + dbecp + index0, force); } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h index 6680edde05..0cc640f27c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h @@ -32,37 +32,111 @@ class FS_Nonlocal_tools public: FS_Nonlocal_tools(const pseudopot_cell_vnl* nlpp_in, const UnitCell* ucell_in, - const psi::Psi, Device>* psi_in, const K_Vectors* kv_in, const ModulePW::PW_Basis_K* wfc_basis_in, const Structure_Factor* sf_in, const ModuleBase::matrix& wg, - const ModuleBase::matrix& ekb); + const ModuleBase::matrix* p_ekb); ~FS_Nonlocal_tools(); + /** + * @brief calculate the projectors |beta> + * + */ + void cal_vkb(const int& ik, const int& nbdall); + /** * @brief calculate the becp = for all beta functions + * + * @param ik the index of k point + * @param npm the number of bands + * @param ppsi the wave functions + * @param nbd0 the start index of the bands */ - void cal_becp(int ik, int npm); + void cal_becp(const int& ik, const int& npm, const std::complex* ppsi, const int& nbd0 = 0); + + /// @brief mpi_allreduce the becp in the pool + void reduce_pool_becp(const int& npm); + + /** + * @brief calculate vkb_deri + * + * @param ik the index of k point + * @param nbdall the number of all bands, it decides the size of vkb_deri + * @param ipol the i index of the direction + * @param jpol the j index of the direction + */ + void cal_vkb_deri_s(const int& ik, const int& nbdall, const int& ipol, const int& jpol); + /** * @brief calculate the dbecp_{ij} = for all beta functions * stress_{ij} = -1/omega \sum_{n,k}f_{nk} \sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_{ij} also calculated + * + * @param ik the index of k point + * @param npm the number of bands + * @param ppsi the wave functions + * @param nbd0 the start index of the bands */ - void cal_dbecp_s(int ik, int npm, int ipol, int jpol, FPTYPE* stress); + void cal_dbecp_s(const int& ik, const int& npm, const std::complex* ppsi, const int& nbd0 = 0); + + /** + * @brief calculate stress + * + * @param ik the index of k point + * @param npm the number of bands + * @param occ if use the occupation of the bands + * @param ipol the i index of the direction + * @param jpol the j index of the direction + * @param stress [out] the stress tensor + * @param nbd0 the start index of the bands + */ + void cal_stress(const int& ik, + const int& npm, + const bool& occ, + const int& ipol, + const int& jpol, + FPTYPE* stress, + const int& nbd0 = 0); + + /** + * @brief calculate vkb_deri + * + * @param ik the index of k point + * @param nbdall the number of all bands, it decides the size of vkb_deri + * @param ipol the index of the polar + */ + void cal_vkb_deri_f(const int& ik, const int& nbdall, const int& ipol); /** * @brief calculate the dbecp_i = for all beta functions + * + * @param ik the index of k point + * @param nbdall the number of all bands, which is the dimension of dbecp and becp + * @param npm the number of bands + * @param ipol the index of the polar + * @param ppsi the wave functions + * @param nbd0 the start index of the bands */ - void cal_dbecp_f(int ik, int npm, int ipol); + void cal_dbecp_f(const int& ik, const int& nbdall, const int& npm, const int& ipol, const std::complex* ppsi, const int& nbd0 = 0); /** * @brief calculate the force^I_i = - \sum_{n,k}f_{nk} \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_i + * + * @param ik the index of k point + * @param npm the number of bands + * @param nbdall the number of all bands, which is the dimension of dbecp and becp + * @param nbd0 start band index for dbecp and becp + * @param occ if use the occupation of the bands + * @param force [out] the force */ - void cal_force(int ik, int npm, FPTYPE* force); + void cal_force(const int& ik, const int& nbdall, const int& npm, const bool& occ, FPTYPE* force, const int& nbd0 = 0); + + /// @brief revert the 0-value dvkbs for calculating the dbecp_i in the force calculation + void revert_vkb(const int& ik, const int& ipol); private: /** * @brief allocate the memory for the variables */ - void allocate_memory(const ModuleBase::matrix& wg, const ModuleBase::matrix& ekb); + void allocate_memory(const ModuleBase::matrix& wg, const ModuleBase::matrix* p_ekb); /** * @brief delete the memory for the variables */ @@ -73,7 +147,6 @@ class FS_Nonlocal_tools const Structure_Factor* sf_; const pseudopot_cell_vnl* nlpp_; const UnitCell* ucell_; - const psi::Psi, Device>* psi_; const K_Vectors* kv_; const ModulePW::PW_Basis_K* wfc_basis_; @@ -102,15 +175,14 @@ class FS_Nonlocal_tools int gcar_zero_counts[3] = {0, 0, 0}; std::complex* vkb_save = nullptr; /// @brief count zero gcar indexes and prepare zero_indexes, do gcar_y /= gcar_x, gcar_z /= gcar_y - void transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in); + void transfer_gcar(const int& npw, const int& npw_max, const FPTYPE* gcar_in); /// @brief save the 0-value dvkbs for calculating the dbecp_i in the force calculation - void save_vkb(int npw, int ipol); - /// @brief revert the 0-value dvkbs for calculating the dbecp_i in the force calculation - void revert_vkb(int npw, int ipol); + void save_vkb(const int& npw, const int& ipol); /// --------------------------------------------------------------------- /// pointers to access the data without memory arrangement FPTYPE* d_wg = nullptr; + FPTYPE* d_wk = nullptr; FPTYPE* d_ekb = nullptr; FPTYPE* gcar = nullptr; FPTYPE* deeq = nullptr; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index dddf889de1..991a81e746 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -47,6 +47,7 @@ __global__ void cal_force_nl( const int *atom_na, const FPTYPE tpiba, const FPTYPE *d_wg, + const bool occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE *deeq, @@ -64,13 +65,29 @@ __global__ void cal_force_nl( } int nproj = atom_nh[it]; - FPTYPE fac = d_wg[ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if(occ) + { + fac = d_wg[ib] * 2.0 * tpiba; + } + else + { + fac = d_wg[0] * 2.0 * tpiba; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } for (int ia = 0; ia < atom_na[it]; ia++) { for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { + FPTYPE ps_qq = 0; + if(ekb_now != 0) + { + ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; + } // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip]; - FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] - - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; + FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] + ps_qq; const int inkb = sum + ip; //out<<"\n ps = "<::operator()(const base_dev const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -158,7 +180,7 @@ void cal_force_nl_op::operator()(const base_dev forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, - d_wg, d_ekb, qq_nt, deeq, + d_wg, occ, d_ekb, qq_nt, deeq, reinterpret_cast*>(becp), reinterpret_cast*>(dbecp), force);// array of data @@ -179,6 +201,7 @@ __global__ void cal_force_nl( const int *atom_na, const FPTYPE tpiba, const FPTYPE *d_wg, + const bool occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const thrust::complex *deeq_nc, @@ -197,15 +220,31 @@ __global__ void cal_force_nl( } int nproj = atom_nh[it]; - FPTYPE fac = d_wg[ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if(occ) + { + fac = d_wg[ib] * 2.0 * tpiba; + } + else + { + fac = d_wg[0] * 2.0 * tpiba; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } for (int ia = 0; ia < atom_na[it]; ia++) { for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { const int inkb = sum + ip; for (int ip2 = 0; ip2 < nproj; ip2++) { // Effective values of the D-eS coefficients - const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + thrust::complex ps_qq = 0; + if (ekb_now) + { + ps_qq = thrust::complex(-ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2], 0.0); + } const int jnkb = sum + ip2; const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; @@ -244,6 +283,7 @@ void cal_force_nl_op::operator()(const base_dev const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, @@ -257,7 +297,7 @@ void cal_force_nl_op::operator()(const base_dev forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, - d_wg, d_ekb, qq_nt, + d_wg, occ, d_ekb, qq_nt, reinterpret_cast*>(deeq_nc), reinterpret_cast*>(becp), reinterpret_cast*>(dbecp), diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index e566126876..36e0aac37a 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -113,6 +113,7 @@ __global__ void cal_stress_nl( const int *atom_nh, const int *atom_na, const FPTYPE *d_wg, + const bool occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE *deeq, @@ -131,8 +132,20 @@ __global__ void cal_stress_nl( } FPTYPE stress_var = 0; - const FPTYPE fac = d_wg[ib]; - const FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if (occ) + { + fac = d_wg[ib]; + } + else + { + fac = d_wg[0]; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { @@ -141,8 +154,12 @@ __global__ void cal_stress_nl( if(!nondiagonal && ip1 != ip2) { continue; } - const FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] - - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + FPTYPE ps_qq = 0; + if (ekb_now != 0) + { + ps_qq = -ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + } + const FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; const int inkb1 = sum + ip1; const int inkb2 = sum + ip2; //out<<"\n ps = "< +__global__ void cal_multi_dot(const int npw, + const FPTYPE fac, + const FPTYPE* gk1, + const FPTYPE* gk2, + const FPTYPE* d_kfac, + const thrust::complex* psi, + FPTYPE* sum) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < npw) { + atomicAdd(sum, fac * gk1[idx] * gk2[idx] * d_kfac[idx] * thrust::norm(psi[idx])); + } +} + template void cal_dbecp_noevc_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const int& ipol, @@ -211,6 +243,7 @@ void cal_stress_nl_op::operator()(const base_de const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -231,6 +264,7 @@ void cal_stress_nl_op::operator()(const base_de atom_nh, atom_na, d_wg, + occ, d_ekb, qq_nt, deeq, @@ -253,6 +287,7 @@ __global__ void cal_stress_nl( const int *atom_nh, const int *atom_na, const FPTYPE *d_wg, + const bool occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const thrust::complex *deeq_nc, @@ -272,15 +307,31 @@ __global__ void cal_stress_nl( } FPTYPE stress_var = 0; - const FPTYPE fac = d_wg[ib]; - const FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if (occ) + { + fac = d_wg[ib]; + } + else + { + fac = d_wg[0]; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { const int ip1 = ii / nproj; const int ip2 = ii % nproj; - const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + thrust::complex ps_qq = 0; + if(ekb_now != 0) + { + ps_qq = thrust::complex(- ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2], 0.0); + } const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; @@ -317,6 +368,7 @@ void cal_stress_nl_op::operator()(const base_de const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, @@ -335,6 +387,7 @@ void cal_stress_nl_op::operator()(const base_de atom_nh, atom_na, d_wg, + occ, d_ekb, qq_nt, reinterpret_cast*>(deeq_nc), @@ -345,6 +398,28 @@ void cal_stress_nl_op::operator()(const base_de cudaCheckOnDebug(); } +template +FPTYPE cal_multi_dot_op::operator()(const int& npw, + const FPTYPE& fac, + const FPTYPE* gk1, + const FPTYPE* gk2, + const FPTYPE* d_kfac, + const std::complex* psi) +{ + FPTYPE* d_sum = nullptr; + cudaMalloc(&d_sum, sizeof(FPTYPE) * 1); + cudaMemset(d_sum, 0, sizeof(FPTYPE) * 1); + int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + cal_multi_dot<<>>( + npw, fac, gk1, gk2, d_kfac, reinterpret_cast*>(psi), d_sum); + FPTYPE sum; + cudaMemcpy(&sum, d_sum, sizeof(FPTYPE) * 1, cudaMemcpyDeviceToHost); + cudaFree(d_sum); + + cudaCheckOnDebug(); + return sum; +} + template void cal_stress_mgga_op::operator()( const int& spin, @@ -861,6 +936,9 @@ template struct cal_stress_drhoc_aux_op; template struct cal_force_npw_op; template struct cal_force_npw_op; +template struct cal_multi_dot_op; +template struct cal_multi_dot_op; + // template struct prepare_vkb_deri_ptr_op; // template struct prepare_vkb_deri_ptr_op; } // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 2ee9cb4530..261c510efc 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -54,6 +54,7 @@ struct cal_force_nl_op const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -78,15 +79,31 @@ struct cal_force_nl_op for (int ib = 0; ib < nbands_occ; ib++) { FPTYPE local_force[3] = {0, 0, 0}; - FPTYPE fac = d_wg[ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if(occ) + { + fac = d_wg[ib] * 2.0 * tpiba; + } + else + { + fac = d_wg[0] * 2.0 * tpiba; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } int iat = iat0 + ia; int sum = sum0 + ia * nproj; for (int ip = 0; ip < nproj; ip++) { + FPTYPE ps_qq = 0; + if(ekb_now != 0) + { + ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; + } // Effective values of the D-eS coefficients - FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] - - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; + FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] + ps_qq; const int inkb = sum + ip; // out<<"\n ps = "< if (ip != ip2) { const int jnkb = sum + ip2; - FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] - - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; - + FPTYPE ps_qq = 0; + if (ekb_now != 0) + { + ps_qq = -ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + } + FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; for (int ipol = 0; ipol < 3; ipol++) { const FPTYPE dbb = (conj(dbecp[ipol * nbands * nkb + ib * nkb + inkb]) @@ -159,6 +179,7 @@ struct cal_force_nl_op const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, @@ -184,8 +205,20 @@ struct cal_force_nl_op { const int ib2 = ib*2; FPTYPE local_force[3] = {0, 0, 0}; - FPTYPE fac = d_wg[ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if(occ) + { + fac = d_wg[ib] * 2.0 * tpiba; + } + else + { + fac = d_wg[0] * 2.0 * tpiba; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } int iat = iat0 + ia; int sum = sum0 + ia * nprojs; for (int ip = 0; ip < nprojs; ip++) @@ -195,7 +228,11 @@ struct cal_force_nl_op for (int ip2 = 0; ip2 < nprojs; ip2++) { // Effective values of the D-eS coefficients - std::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + std::complex ps_qq = 0; + if(ekb_now) + { + ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + } const int jnkb = sum + ip2; std::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; std::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index c68df7bd0f..ce6ae7afb3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -60,6 +60,7 @@ struct cal_force_nl_op /// @param atom_na - GlobalC::ucell.atoms[ii].na /// @param tpiba - GlobalC::ucell.tpiba /// @param d_wg - input parameter wg + /// @param occ - if use the occupation of the bands /// @param d_ekb - input parameter ekb /// @param qq_nt - GlobalC::ppcell.qq_nt /// @param deeq - GlobalC::ppcell.deeq @@ -83,6 +84,7 @@ struct cal_force_nl_op const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -103,6 +105,7 @@ struct cal_force_nl_op const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, @@ -145,6 +148,7 @@ struct cal_force_nl_op const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -165,6 +169,7 @@ struct cal_force_nl_op const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index da45a375e7..b89a380133 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -43,6 +43,7 @@ __global__ void cal_force_nl( const int *atom_na, const FPTYPE tpiba, const FPTYPE *d_wg, + const bool occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE *deeq, @@ -61,13 +62,29 @@ __global__ void cal_force_nl( } int nproj = atom_nh[it]; - FPTYPE fac = d_wg[ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if(occ) + { + fac = d_wg[ib] * 2.0 * tpiba; + } + else + { + fac = d_wg[0] * 2.0 * tpiba; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } for (int ia = 0; ia < atom_na[it]; ia++) { for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { + FPTYPE ps_qq = 0; + if(ekb_now != 0) + { + ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; + } // FPTYPE ps = GlobalC::ppcell.deeq[spin, iat, ip, ip]; - FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] - - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip]; + FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip] + ps_qq; const int inkb = sum + ip; //out<<"\n ps = "<::operator()(const base_dev const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -155,7 +177,7 @@ void cal_force_nl_op::operator()(const base_dev forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, - d_wg, d_ekb, qq_nt, deeq, + d_wg, occ, d_ekb, qq_nt, deeq, reinterpret_cast*>(becp), reinterpret_cast*>(dbecp), force);// array of data @@ -176,6 +198,7 @@ __global__ void cal_force_nl( const int *atom_na, const FPTYPE tpiba, const FPTYPE *d_wg, + const bool occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const thrust::complex *deeq_nc, @@ -195,15 +218,31 @@ __global__ void cal_force_nl( } int nproj = atom_nh[it]; - FPTYPE fac = d_wg[ib] * 2.0 * tpiba; - FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if(occ) + { + fac = d_wg[ib] * 2.0 * tpiba; + } + else + { + fac = d_wg[0] * 2.0 * tpiba; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } for (int ia = 0; ia < atom_na[it]; ia++) { for (int ip = threadIdx.x; ip < nproj; ip += blockDim.x) { const int inkb = sum + ip; for (int ip2 = 0; ip2 < nproj; ip2++) { // Effective values of the D-eS coefficients - const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + thrust::complex ps_qq = 0; + if (ekb_now) + { + ps_qq = thrust::complex(-ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2], 0.0); + } const int jnkb = sum + ip2; const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; @@ -242,6 +281,7 @@ void cal_force_nl_op::operator()(const base_dev const int* atom_na, const FPTYPE& tpiba, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, @@ -255,7 +295,7 @@ void cal_force_nl_op::operator()(const base_dev forcenl_nc, nbands, nkb, atom_nh, atom_na, tpiba, - d_wg, d_ekb, qq_nt, + d_wg, occ, d_ekb, qq_nt, reinterpret_cast*>(deeq_nc), reinterpret_cast*>(becp), reinterpret_cast*>(dbecp), diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index b3e7b0a34a..cd26e312a5 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -108,6 +108,7 @@ __global__ void cal_stress_nl( const int *atom_nh, const int *atom_na, const FPTYPE *d_wg, + const bool occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE *deeq, @@ -126,8 +127,20 @@ __global__ void cal_stress_nl( } FPTYPE stress_var = 0; - FPTYPE fac = d_wg[ib]; - FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if (occ) + { + fac = d_wg[ib]; + } + else + { + fac = d_wg[0]; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { @@ -136,8 +149,12 @@ __global__ void cal_stress_nl( if(!nondiagonal && ip1 != ip2) { continue; } - FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] - - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + FPTYPE ps_qq = 0; + if (ekb_now != 0) + { + ps_qq = -ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + } + FPTYPE ps = deeq[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; const int inkb1 = sum + ip1; const int inkb2 = sum + ip2; //out<<"\n ps = "<::operator()(const base_de const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -225,6 +243,7 @@ void cal_stress_nl_op::operator()(const base_de atom_nh, atom_na, d_wg, + occ, d_ekb, qq_nt, deeq, @@ -247,6 +266,7 @@ __global__ void cal_stress_nl( const int *atom_nh, const int *atom_na, const FPTYPE *d_wg, + const bool occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const thrust::complex *deeq_nc, @@ -266,15 +286,31 @@ __global__ void cal_stress_nl( } FPTYPE stress_var = 0; - FPTYPE fac = d_wg[ib]; - FPTYPE ekb_now = d_ekb[ib]; + FPTYPE fac; + if (occ) + { + fac = d_wg[ib]; + } + else + { + fac = d_wg[0]; + } + FPTYPE ekb_now = 0.0; + if (d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } const int nproj = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { for (int ii = threadIdx.x; ii < nproj * nproj; ii += blockDim.x) { const int ip1 = ii / nproj; const int ip2 = ii % nproj; - const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + thrust::complex ps_qq = 0; + if (ekb_now != 0) + { + ps_qq = thrust::complex(- ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2], 0.0); + } const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; @@ -297,6 +333,21 @@ __global__ void cal_stress_nl( } } +template +__global__ void cal_multi_dot(const int npw, + const FPTYPE fac, + const FPTYPE* gk1, + const FPTYPE* gk2, + const FPTYPE* d_kfac, + const thrust::complex* psi, + FPTYPE* sum) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < npw) { + atomicAdd(sum, fac * gk1[idx] * gk2[idx] * d_kfac[idx] * thrust::norm(psi[idx])); + } +} + template void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const int& ipol, @@ -310,6 +361,7 @@ void cal_stress_nl_op::operator()(const base_de const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, @@ -328,6 +380,7 @@ void cal_stress_nl_op::operator()(const base_de atom_nh, atom_na, d_wg, + occ, d_ekb, qq_nt, reinterpret_cast*>(deeq_nc), @@ -798,6 +851,27 @@ void cal_force_npw_op::operator()( return ; } +template +FPTYPE cal_multi_dot_op::operator()(const int& npw, + const FPTYPE& fac, + const FPTYPE* gk1, + const FPTYPE* gk2, + const FPTYPE* d_kfac, + const std::complex* psi) +{ + FPTYPE* d_sum = nullptr; + hipMalloc(&d_sum, sizeof(FPTYPE) * 1); + hipMemset(d_sum, 0, sizeof(FPTYPE) * 1); + int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_multi_dot), dim3(block), dim3(THREADS_PER_BLOCK), 0, 0, + npw, fac, gk1, gk2, d_kfac, reinterpret_cast*>(psi), d_sum); + FPTYPE sum; + hipMemcpy(&sum, d_sum, sizeof(FPTYPE) * 1, hipMemcpyDeviceToHost); + hipFree(d_sum); + hipCheckOnDebug(); + return sum; +} + template struct cal_vq_op; template struct cal_vq_op; @@ -846,4 +920,6 @@ template struct cal_stress_nl_op; template struct cal_force_npw_op; template struct cal_force_npw_op; +template struct cal_multi_dot_op; +template struct cal_multi_dot_op; } // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index f36f2e8316..979955d3e8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -9,7 +9,6 @@ #include "module_base/libm/libm.h" namespace hamilt { - template struct cal_dbecp_noevc_nl_op { @@ -83,6 +82,7 @@ struct cal_stress_nl_op const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -101,10 +101,15 @@ struct cal_stress_nl_op { const int nproj = atom_nh[it]; #ifdef _OPENMP -#pragma omp for collapse(4) +#pragma omp for #endif for (int ib = 0; ib < nbands_occ; ib++) { + FPTYPE ekb_now = 0.0; + if(d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } for (int ia = 0; ia < atom_na[it]; ia++) { for (int ip1 = 0; ip1 < nproj; ip1++) @@ -115,10 +120,22 @@ struct cal_stress_nl_op { continue; } - FPTYPE fac = d_wg[ib] * 1.0; - FPTYPE ekb_now = d_ekb[ib]; - FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] - - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + + FPTYPE fac; + if (occ) + { + fac = d_wg[ib]; + } + else + { + fac = d_wg[0]; + } + FPTYPE ps_qq = 0; + if(ekb_now != 0) + { + ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + } + FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; const int inkb1 = sum + ia * nproj + ip1; const int inkb2 = sum + ia * nproj + ip2; // out<<"\n ps = "< const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, @@ -167,10 +185,15 @@ struct cal_stress_nl_op { const int nproj = atom_nh[it]; #ifdef _OPENMP -#pragma omp for collapse(4) +#pragma omp for #endif for (int ib = 0; ib < nbands_occ; ib++) { + FPTYPE ekb_now = 0.0; + if(d_ekb != nullptr) + { + ekb_now = d_ekb[ib]; + } for (int ia = 0; ia < atom_na[it]; ia++) { for (int ip1 = 0; ip1 < nproj; ip1++) @@ -178,9 +201,20 @@ struct cal_stress_nl_op for (int ip2 = 0; ip2 < nproj; ip2++) { const int ib2 = ib*2; - FPTYPE fac = d_wg[ib] * 1.0; - FPTYPE ekb_now = d_ekb[ib]; - std::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + FPTYPE fac; + if (occ) + { + fac = d_wg[ib]; + } + else + { + fac = d_wg[0]; + } + std::complex ps_qq = 0; + if(ekb_now != 0) + { + ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + } std::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; std::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; std::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; @@ -508,6 +542,27 @@ struct cal_force_npw_op { } }; +template +struct cal_multi_dot_op { + FPTYPE operator()(const int& npw, + const FPTYPE& fac, + const FPTYPE* gk1, + const FPTYPE* gk2, + const FPTYPE* d_kfac, + const std::complex* psi) + { + FPTYPE sum = 0; +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : sum) +#endif + for (int i = 0; i < npw; i++) + { + sum += fac * gk1[i] * gk2[i] * d_kfac[i] * std::norm(psi[i]); + } + return sum; + } +}; + // // cpu version first, gpu version later // template // struct prepare_vkb_deri_ptr_op{ @@ -586,6 +641,9 @@ template struct cal_stress_drhoc_aux_op; template struct cal_force_npw_op; template struct cal_force_npw_op; +template struct cal_multi_dot_op; +template struct cal_multi_dot_op; + // template struct prepare_vkb_deri_ptr_op; // template struct prepare_vkb_deri_ptr_op; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index 2f4bbd0989..800bb3eba3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -72,6 +72,7 @@ struct cal_stress_nl_op /// @param atom_nh - GlobalC::ucell.atoms[ii].ncpp.nh /// @param atom_na - GlobalC::ucell.atoms[ii].na /// @param d_wg - input parameter wg + /// @param occ - if use the occupation of the bands /// @param d_ekb - input parameter ekb /// @param qq_nt - GlobalC::ppcell.qq_nt /// @param deeq - GlobalC::ppcell.deeq @@ -94,6 +95,7 @@ struct cal_stress_nl_op const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -113,6 +115,7 @@ struct cal_stress_nl_op const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, @@ -218,6 +221,16 @@ struct cal_force_npw_op{ ); }; +template +struct cal_multi_dot_op{ + FPTYPE operator()(const int& npw, + const FPTYPE& fac, + const FPTYPE* gk1, + const FPTYPE* gk2, + const FPTYPE* d_kfac, + const std::complex* psi); +}; + #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template @@ -258,6 +271,7 @@ struct cal_stress_nl_op const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const FPTYPE* deeq, @@ -277,6 +291,7 @@ struct cal_stress_nl_op const int* atom_nh, const int* atom_na, const FPTYPE* d_wg, + const bool& occ, const FPTYPE* d_ekb, const FPTYPE* qq_nt, const std::complex* deeq_nc, @@ -351,6 +366,15 @@ struct cal_vq_deri_op FPTYPE* vq); }; +template +struct cal_multi_dot_op{ + FPTYPE operator()(const int& npw, + const FPTYPE& fac, + const FPTYPE* gk1, + const FPTYPE* gk2, + const FPTYPE* d_kfac, + const std::complex* psi); +}; /** * The operator is used to compute the auxiliary amount of stress /force diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp index 3da3ca3b19..0507ff3358 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp @@ -2899,6 +2899,7 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_cpu) atom_na.data(), tpiba, wg.data(), + true, ekb.data(), qq_nt.data(), deeq.data(), @@ -2990,6 +2991,7 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu) d_atom_na, tpiba, d_wg, + true, d_ekb, d_qq_nt, d_deeq, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp index 4f84fa098d..cbf434da0c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp @@ -102,6 +102,7 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_cpu) atom_nh.data(), atom_na.data(), wg.data(), + true, ekb.data(), qq_nt.data(), deeq.data(), @@ -275,6 +276,7 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu) d_atom_nh, d_atom_na, d_wg, + true, d_ekb, d_qq_nt, d_deeq, diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h index 73cf31bf2d..ddd1c6ca44 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h @@ -66,7 +66,8 @@ class Stress_Func ModuleSymmetry::Symmetry* p_symm, K_Vectors* p_kv, ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in = nullptr); // electron kinetic part in PW basis + const UnitCell& ucell_in, + const psi::Psi, Device>* psi_in = nullptr); // electron kinetic part in PW basis // 2) the stress from the Hartree term void stress_har(ModuleBase::matrix& sigma, @@ -207,10 +208,11 @@ class Stress_Func const ModuleBase::matrix& dylmk0, std::complex* dqg); - private: + protected: Device* ctx = {}; base_device::DEVICE_CPU* cpu_ctx = {}; base_device::AbacusDevice_t device = {}; + private: using gemm_op = hsolver::gemm_op, Device>; using cal_stress_nl_op = hamilt::cal_stress_nl_op; using cal_dbecp_noevc_nl_op = hamilt::cal_dbecp_noevc_nl_op; diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp index 489e1fe0f8..a5afae2340 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp @@ -2,6 +2,7 @@ #include "module_hamilt_pw/hamilt_pwdft/global.h" #include "module_parameter/parameter.h" #include "module_base/timer.h" +#include "module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h" //calculate the kinetic stress in PW base template @@ -10,168 +11,29 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, ModuleSymmetry::Symmetry* p_symm, K_Vectors* p_kv, ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in) + const UnitCell& ucell_in, + const psi::Psi, Device>* psi_in) { ModuleBase::TITLE("Stress_Func","stress_kin"); ModuleBase::timer::tick("Stress_Func","stress_kin"); - - FPTYPE **gk; - gk=new FPTYPE* [3]; - int npw; - FPTYPE s_kin[3][3]; - for(int l=0;l<3;l++) - { - for(int m=0;m<3;m++) - { - s_kin[l][m]=0.0; - } - } - - int npwx=0; - for (int ik = 0; ik < p_kv->get_nks(); ik++) + this->ucell = &ucell_in; + hamilt::FS_Kin_tools kin_tool(*this->ucell, p_kv, wfc_basis, wg); + for (int ik = 0; ik < wfc_basis->nks; ++ik) { - if (npwx < p_kv->ngk[ik]) { - npwx = p_kv->ngk[ik]; -} - } - - gk[0]= new FPTYPE[npwx]; - gk[1]= new FPTYPE[npwx]; - gk[2]= new FPTYPE[npwx]; - FPTYPE tpiba = ModuleBase::TWO_PI / GlobalC::ucell.lat0; - FPTYPE twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI); - FPTYPE* kfac = new FPTYPE[npwx]; - const int npol = GlobalC::ucell.get_npol(); - - for (int ik = 0; ik < p_kv->get_nks(); ik++) - { - npw = p_kv->ngk[ik]; -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int i = 0; i < npw; i++) - { - gk[0][i] = wfc_basis->getgpluskcar(ik, i)[0] * tpiba; - gk[1][i] = wfc_basis->getgpluskcar(ik, i)[1] * tpiba; - gk[2][i] = wfc_basis->getgpluskcar(ik, i)[2] * tpiba; - if (wfc_basis->erf_height > 0) - { - FPTYPE gk2 = gk[0][i] * gk[0][i] + gk[1][i] * gk[1][i] + gk[2][i] * gk[2][i]; - FPTYPE arg = (gk2 - wfc_basis->erf_ecut) / wfc_basis->erf_sigma; - kfac[i] = 1.0 + wfc_basis->erf_height / wfc_basis->erf_sigma * twobysqrtpi * std::exp(-arg * arg); - } - else - { - kfac[i] = 1.0; - } - } - - // kinetic contribution - - for (int l = 0; l < 3; l++) + int nbands_occ = wg.nc; + while (wg(ik, nbands_occ - 1) == 0.0) { - for (int m = 0; m < l + 1; m++) + nbands_occ--; + if (nbands_occ == 0) { - for (int ibnd = 0; ibnd < PARAM.inp.nbands; ibnd++) - { - if (wg(ik, ibnd) == 0.0) { continue; -} - const std::complex* ppsi = nullptr; - ppsi = &(psi_in[0](ik, ibnd, 0)); - - FPTYPE sum = 0; -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : sum) -#endif - for (int i = 0; i < npw; i++) - { - sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] - * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); - } - if(npol == 2) - { - ppsi = &(psi_in[0](ik, ibnd, npwx)); -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : sum) -#endif - for (int i = 0; i < npw; i++) - { - sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] - * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); - } - } - s_kin[l][m] += sum; - } + break; } } - - //contribution from the nonlocal part - - //stres_us(ik, gk, npw); + const int npm = nbands_occ; + kin_tool.cal_gk(ik); + kin_tool.cal_stress_kin(ik, npm, true, &psi_in[0](ik, 0, 0)); } - - // add the US term from augmentation charge derivatives - - // addussstres(sigmanlc); - - //mp_cast - - for(int l=0;l<3;l++) - { - for(int m=0;m 1 - } - } - - - for(int l=0;l<3;l++) - { - for(int m=0;m<3;m++) - { - sigma(l,m) = s_kin[l][m]; - } - } - //do symmetry - if (ModuleSymmetry::Symmetry::symm_flag == 1) - { - p_symm->symmetrize_mat3(sigma, GlobalC::ucell.lat); - } // end symmetry - - delete[] gk[0]; - delete[] gk[1]; - delete[] gk[2]; - delete[] gk; - delete[] kfac; + kin_tool.symmetrize_stress(p_symm, sigma); ModuleBase::timer::tick("Stress_Func","stress_kin"); return; diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp index 5b177b37b6..2a761d8118 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp @@ -34,9 +34,10 @@ void Stress_Func::stress_nl(ModuleBase::matrix& sigma, setmem_var_op()(this->ctx, stress_device, 0, 9); std::vector sigmanlc(9, 0.0); - hamilt::FS_Nonlocal_tools nl_tools(nlpp_in, &ucell_in, psi_in, p_kv, wfc_basis, p_sf, wg, ekb); + hamilt::FS_Nonlocal_tools nl_tools(nlpp_in, &ucell_in, p_kv, wfc_basis, p_sf, wg, &ekb); const int nks = p_kv->get_nks(); + const int max_nbands = wg.nc; for (int ik = 0; ik < nks; ik++) // loop k points { // skip zero weights to speed up @@ -51,15 +52,19 @@ void Stress_Func::stress_nl(ModuleBase::matrix& sigma, } const int npm = nbands_occ; + nl_tools.cal_vkb(ik, max_nbands); // calculate becp = for all beta functions - nl_tools.cal_becp(ik, npm); + nl_tools.cal_becp(ik, npm, &psi_in[0](ik,0,0)); + nl_tools.reduce_pool_becp(max_nbands); // calculate dbecp = for all beta functions // calculate stress = \sum * * D_{ij} for (int ipol = 0; ipol < 3; ipol++) { for (int jpol = 0; jpol <= ipol; jpol++) { - nl_tools.cal_dbecp_s(ik, npm, ipol, jpol, stress_device); + nl_tools.cal_vkb_deri_s(ik, max_nbands, ipol, jpol); + nl_tools.cal_dbecp_s(ik, npm, &psi_in[0](ik,0,0)); + nl_tools.cal_stress(ik, npm, true, ipol, jpol, stress_device); } } } @@ -75,23 +80,16 @@ void Stress_Func::stress_nl(ModuleBase::matrix& sigma, { sigmanlc[l * 3 + m] = sigmanlc[m * 3 + l]; } - Parallel_Reduce::reduce_all(sigmanlc[l * 3 + m]); // qianrui fix a bug for kpar > 1 - } - } - // rescale the stress with 1/omega - for (int ipol = 0; ipol < 3; ipol++) - { - for (int jpol = 0; jpol < 3; jpol++) - { - sigmanlc[ipol * 3 + jpol] *= 1.0 / ucell_in.omega; } } + // sum up forcenl from all processors + Parallel_Reduce::reduce_all(sigmanlc.data(), 9); for (int ipol = 0; ipol < 3; ipol++) { for (int jpol = 0; jpol < 3; jpol++) { - sigma(ipol, jpol) = sigmanlc[ipol * 3 + jpol]; + sigma(ipol, jpol) = sigmanlc[ipol * 3 + jpol] / ucell_in.omega; } } // do symmetry diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp index b56553c708..f32f0067dc 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp @@ -14,7 +14,6 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, Structure_Factor* p_sf, K_Vectors* p_kv, ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, const psi::Psi, Device>* d_psi_in) { ModuleBase::TITLE("Stress_PW", "cal_stress"); @@ -64,7 +63,7 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, } // kinetic contribution - this->stress_kin(sigmakin, this->pelec->wg, p_symm, p_kv, wfc_basis, psi_in); + this->stress_kin(sigmakin, this->pelec->wg, p_symm, p_kv, wfc_basis, ucell, d_psi_in); // hartree contribution this->stress_har(sigmahar, rho_basis, 1, pelec->charge); diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.h b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.h index 454f5e5280..4671d25567 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.h +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.h @@ -19,7 +19,6 @@ class Stress_PW : public Stress_Func Structure_Factor* p_sf, K_Vectors* p_kv, ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in = nullptr, const psi::Psi, Device>* d_psi_in = nullptr); protected: diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp index 8cb543b026..c652b38635 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp @@ -8,6 +8,7 @@ #include "module_hamilt_pw/hamilt_pwdft/global.h" #include "module_io/output_log.h" #include "module_parameter/parameter.h" +#include "module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h" // new #include "module_base/math_integral.h" @@ -15,59 +16,61 @@ #include "module_base/timer.h" #include "module_hamilt_general/module_xc/xc_functional.h" -void Sto_Forces::cal_stoforce(ModuleBase::matrix& force, - const elecstate::ElecState& elec, - ModulePW::PW_Basis* rho_basis, - ModuleSymmetry::Symmetry* p_symm, - Structure_Factor* p_sf, - K_Vectors* pkv, - ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf) +template +void Sto_Forces::cal_stoforce(ModuleBase::matrix& force, + const elecstate::ElecState& elec, + ModulePW::PW_Basis* rho_basis, + ModuleSymmetry::Symmetry* p_symm, + const Structure_Factor* p_sf, + K_Vectors* pkv, + ModulePW::PW_Basis_K* wfc_basis, + const pseudopot_cell_vnl& nlpp, + UnitCell& ucell, + const psi::Psi, Device>& psi, + const Stochastic_WF, Device>& stowf) { - ModuleBase::timer::tick("Sto_Force", "cal_force"); + ModuleBase::timer::tick("Sto_Forces", "cal_force"); ModuleBase::TITLE("Sto_Forces", "init"); - this->nat = GlobalC::ucell.nat; + this->device = base_device::get_device_type(this->ctx); const ModuleBase::matrix& wg = elec.wg; const Charge* chr = elec.charge; - force.create(nat, 3); + force.create(this->nat, 3); - ModuleBase::matrix forcelc(nat, 3); - ModuleBase::matrix forceion(nat, 3); - ModuleBase::matrix forcecc(nat, 3); - ModuleBase::matrix forcenl(nat, 3); - ModuleBase::matrix forcescc(nat, 3); + ModuleBase::matrix forcelc(this->nat, 3); + ModuleBase::matrix forceion(this->nat, 3); + ModuleBase::matrix forcecc(this->nat, 3); + ModuleBase::matrix forcenl(this->nat, 3); + ModuleBase::matrix forcescc(this->nat, 3); this->cal_force_loc(forcelc, rho_basis, chr); this->cal_force_ew(forceion, rho_basis, p_sf); - this->cal_sto_force_nl(forcenl, wg, pkv, wfc_basis, psi_in, stowf); - this->cal_force_cc(forcecc, rho_basis, chr, GlobalC::ucell); - this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, GlobalC::ucell); + this->cal_sto_force_nl(forcenl, wg, pkv, wfc_basis, p_sf, nlpp, ucell, psi, stowf); + this->cal_force_cc(forcecc, rho_basis, chr, ucell); + this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, ucell); // impose total force = 0 - int iat = 0; - ModuleBase::matrix force_e; if (PARAM.inp.efield_flag) { - force_e.create(GlobalC::ucell.nat, 3); - elecstate::Efield::compute_force(GlobalC::ucell, force_e); + force_e.create(this->nat, 3); + elecstate::Efield::compute_force(ucell, force_e); } ModuleBase::matrix force_gate; if (PARAM.inp.gate_flag) { - force_gate.create(GlobalC::ucell.nat, 3); - elecstate::Gatefield::compute_force(GlobalC::ucell, force_gate); + force_gate.create(this->nat, 3); + elecstate::Gatefield::compute_force(ucell, force_gate); } + int iat = 0; for (int ipol = 0; ipol < 3; ipol++) { double sum = 0.0; iat = 0; - for (int it = 0; it < GlobalC::ucell.ntype; it++) + for (int it = 0; it < ucell.ntype; it++) { - for (int ia = 0; ia < GlobalC::ucell.atoms[it].na; ia++) + for (int ia = 0; ia < ucell.atoms[it].na; ia++) { force(iat, ipol) = forcelc(iat, ipol) + forceion(iat, ipol) + forcenl(iat, ipol) + forcecc(iat, ipol) + forcescc(iat, ipol); @@ -90,8 +93,8 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force, if (!(PARAM.inp.gate_flag || PARAM.inp.efield_flag)) { - double compen = sum / GlobalC::ucell.nat; - for (int iat = 0; iat < GlobalC::ucell.nat; ++iat) + double compen = sum / ucell.nat; + for (int iat = 0; iat < ucell.nat; ++iat) { force(iat, ipol) = force(iat, ipol) - compen; } @@ -106,20 +109,20 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force, if (ModuleSymmetry::Symmetry::symm_flag == 1) { double d1, d2, d3; - for (int iat = 0; iat < GlobalC::ucell.nat; iat++) + for (int iat = 0; iat < ucell.nat; iat++) { ModuleBase::Mathzone::Cartesian_to_Direct(force(iat, 0), force(iat, 1), force(iat, 2), - GlobalC::ucell.a1.x, - GlobalC::ucell.a1.y, - GlobalC::ucell.a1.z, - GlobalC::ucell.a2.x, - GlobalC::ucell.a2.y, - GlobalC::ucell.a2.z, - GlobalC::ucell.a3.x, - GlobalC::ucell.a3.y, - GlobalC::ucell.a3.z, + ucell.a1.x, + ucell.a1.y, + ucell.a1.z, + ucell.a2.x, + ucell.a2.y, + ucell.a2.z, + ucell.a3.x, + ucell.a3.y, + ucell.a3.z, d1, d2, d3); @@ -129,20 +132,20 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force, force(iat, 2) = d3; } p_symm->symmetrize_vec3_nat(force.c); - for (int iat = 0; iat < GlobalC::ucell.nat; iat++) + for (int iat = 0; iat < ucell.nat; iat++) { ModuleBase::Mathzone::Direct_to_Cartesian(force(iat, 0), force(iat, 1), force(iat, 2), - GlobalC::ucell.a1.x, - GlobalC::ucell.a1.y, - GlobalC::ucell.a1.z, - GlobalC::ucell.a2.x, - GlobalC::ucell.a2.y, - GlobalC::ucell.a2.z, - GlobalC::ucell.a3.x, - GlobalC::ucell.a3.y, - GlobalC::ucell.a3.z, + ucell.a1.x, + ucell.a1.y, + ucell.a1.z, + ucell.a2.x, + ucell.a2.y, + ucell.a2.z, + ucell.a3.x, + ucell.a3.y, + ucell.a3.z, d1, d2, d3); @@ -153,271 +156,110 @@ void Sto_Forces::cal_stoforce(ModuleBase::matrix& force, } GlobalV::ofs_running << setiosflags(std::ios::fixed) << std::setprecision(6) << std::endl; - if (PARAM.inp.test_force) - { - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "LOCAL FORCE (Ry/Bohr)", forcelc); - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "NONLOCAL FORCE (Ry/Bohr)", forcenl); - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "NLCC FORCE (Ry/Bohr)", forcecc); - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "ION FORCE (Ry/Bohr)", forceion); - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "SCC FORCE (Ry/Bohr)", forcescc); - if (PARAM.inp.efield_flag) - { - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "EFIELD FORCE (Ry/Bohr)", force_e); - } - if (PARAM.inp.gate_flag) - { - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "GATEFIELD FORCE (Ry/Bohr)", force_gate); - } - } // output force in unit eV/Angstrom GlobalV::ofs_running << std::endl; if (PARAM.inp.test_force) { - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "LOCAL FORCE (eV/Angstrom)", forcelc, false); - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "NONLOCAL FORCE (eV/Angstrom)", forcenl, false); - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "NLCC FORCE (eV/Angstrom)", forcecc, false); - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "ION FORCE (eV/Angstrom)", forceion, false); - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "SCC FORCE (eV/Angstrom)", forcescc, false); + ModuleIO::print_force(GlobalV::ofs_running, ucell, "LOCAL FORCE (eV/Angstrom)", forcelc, false); + ModuleIO::print_force(GlobalV::ofs_running, ucell, "NONLOCAL FORCE (eV/Angstrom)", forcenl, false); + ModuleIO::print_force(GlobalV::ofs_running, ucell, "NLCC FORCE (eV/Angstrom)", forcecc, false); + ModuleIO::print_force(GlobalV::ofs_running, ucell, "ION FORCE (eV/Angstrom)", forceion, false); + ModuleIO::print_force(GlobalV::ofs_running, ucell, "SCC FORCE (eV/Angstrom)", forcescc, false); if (PARAM.inp.efield_flag) { - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "EFIELD FORCE (eV/Angstrom)", force_e, false); + ModuleIO::print_force(GlobalV::ofs_running, ucell, "EFIELD FORCE (eV/Angstrom)", force_e, false); } if (PARAM.inp.gate_flag) { ModuleIO::print_force(GlobalV::ofs_running, - GlobalC::ucell, + ucell, "GATEFIELD FORCE (eV/Angstrom)", force_gate, false); } } - ModuleIO::print_force(GlobalV::ofs_running, GlobalC::ucell, "TOTAL-FORCE (eV/Angstrom)", force, false); - ModuleBase::timer::tick("Sto_Force", "cal_force"); + ModuleIO::print_force(GlobalV::ofs_running, ucell, "TOTAL-FORCE (eV/Angstrom)", force, false); + ModuleBase::timer::tick("Sto_Forces", "cal_force"); return; } -void Sto_Forces::cal_sto_force_nl(ModuleBase::matrix& forcenl, - const ModuleBase::matrix& wg, - K_Vectors* p_kv, - ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf) +template +void Sto_Forces::cal_sto_force_nl( + ModuleBase::matrix& forcenl, + const ModuleBase::matrix& wg, + K_Vectors* p_kv, + ModulePW::PW_Basis_K* wfc_basis, + const Structure_Factor* p_sf, + const pseudopot_cell_vnl& nlpp, + const UnitCell& ucell, + const psi::Psi, Device>& psi_in, + const Stochastic_WF, Device>& stowf) { ModuleBase::TITLE("Sto_Forces", "cal_force_nl"); - ModuleBase::timer::tick("Sto_Forces", "cal_force_nl"); - - const int nkb = GlobalC::ppcell.nkb; - int* nchip = stowf.nchip; + const int nkb = nlpp.nkb; if (nkb == 0) { - return; // mohan add 2010-07-25 + return; } + ModuleBase::timer::tick("Sto_Forces", "cal_force_nl"); + + const int* nchip = stowf.nchip; const int npwx = wfc_basis->npwk_max; - // vkb1: |Beta(nkb,npw)> - ModuleBase::ComplexMatrix vkb1(nkb, npwx); - int nksbands = psi_in->get_nbands(); + int nksbands = psi_in.get_nbands(); if (GlobalV::MY_STOGROUP != 0) { nksbands = 0; } + // allocate memory for the force + FPTYPE* force = nullptr; + resmem_var_op()(this->ctx, force, ucell.nat * 3); + base_device::memory::set_memory_op()(this->ctx, force, 0.0, ucell.nat * 3); + + hamilt::FS_Nonlocal_tools nl_tools(&nlpp, &ucell, p_kv, wfc_basis, p_sf, wg, nullptr); + + for (int ik = 0; ik < wfc_basis->nks; ik++) { const int nstobands = nchip[ik]; - const int nbandstot = nstobands + nksbands; + const int max_nbands = stowf.shchi->get_nbands() + nksbands; const int npw = wfc_basis->npwk[ik]; + psi_in.fix_k(ik); + stowf.shchi->fix_k(ik); - // dbecp: conj( -iG * ) - ModuleBase::ComplexArray dbecp(3, nbandstot, nkb); - ModuleBase::ComplexMatrix becp(nbandstot, nkb); - const int current_spin = p_kv->isk[ik]; + nl_tools.cal_vkb(ik, max_nbands); // vkb has dimension of nkb * max_nbands * npol - // generate vkb - if (GlobalC::ppcell.nkb > 0) - { - GlobalC::ppcell.getvnl(ik, GlobalC::ppcell.vkb); - } + // calculate becp = for all beta functions + nl_tools.cal_becp(ik, nksbands, psi_in.get_pointer(), 0); + nl_tools.cal_becp(ik, nstobands, stowf.shchi->get_pointer(), nksbands); + nl_tools.reduce_pool_becp(max_nbands); - // get becp according to wave functions and vkb - // important here ! becp must set zero!! - // vkb: Beta(nkb,npw) - // becp(nkb,nbnd): - becp.zero_out(); - char transa = 'C'; - char transb = 'N'; - psi_in->fix_k(ik); - stowf.shchi->fix_k(ik); - // KS orbitals - int npmks = PARAM.globalv.npol * nksbands; - zgemm_(&transa, - &transb, - &nkb, - &npmks, - &npw, - &ModuleBase::ONE, - GlobalC::ppcell.vkb.c, - &npwx, - psi_in->get_pointer(), - &npwx, - &ModuleBase::ZERO, - becp.c, - &nkb); - // stochastic orbitals - int npmsto = PARAM.globalv.npol * nstobands; - zgemm_(&transa, - &transb, - &nkb, - &npmsto, - &npw, - &ModuleBase::ONE, - GlobalC::ppcell.vkb.c, - &npwx, - stowf.shchi->get_pointer(), - &npwx, - &ModuleBase::ZERO, - &becp(nksbands, 0), - &nkb); - - Parallel_Reduce::reduce_pool(becp.c, becp.size); - - // out.printcm_real("becp",becp,1.0e-4); - // Calculate the derivative of beta, - // |dbeta> = -ig * |beta> - dbecp.zero_out(); for (int ipol = 0; ipol < 3; ipol++) { - for (int i = 0; i < nkb; i++) - { - std::complex* pvkb1 = &vkb1(i, 0); - std::complex* pvkb = &GlobalC::ppcell.vkb(i, 0); - if (ipol == 0) - { - for (int ig = 0; ig < npw; ig++) - { - pvkb1[ig] = pvkb[ig] * ModuleBase::NEG_IMAG_UNIT * wfc_basis->getgcar(ik, ig)[0]; - } - } - if (ipol == 1) - { - for (int ig = 0; ig < npw; ig++) - { - pvkb1[ig] = pvkb[ig] * ModuleBase::NEG_IMAG_UNIT * wfc_basis->getgcar(ik, ig)[1]; - } - } - if (ipol == 2) - { - for (int ig = 0; ig < npw; ig++) - { - pvkb1[ig] = pvkb[ig] * ModuleBase::NEG_IMAG_UNIT * wfc_basis->getgcar(ik, ig)[2]; - } - } - } - // KS orbitals - zgemm_(&transa, - &transb, - &nkb, - &npmks, - &npw, - &ModuleBase::ONE, - vkb1.c, - &npwx, - psi_in->get_pointer(), - &npwx, - &ModuleBase::ZERO, - &dbecp(ipol, 0, 0), - &nkb); - // stochastic orbitals - zgemm_(&transa, - &transb, - &nkb, - &npmsto, - &npw, - &ModuleBase::ONE, - vkb1.c, - &npwx, - stowf.shchi->get_pointer(), - &npwx, - &ModuleBase::ZERO, - &dbecp(ipol, nksbands, 0), - &nkb); - } // end ipol - - // don't need to reduce here, keep dbecp different in each processor, - // and at last sum up all the forces. - // Parallel_Reduce::reduce_complex_double_pool( dbecp.ptr, dbecp.ndata); - - // double *cf = new double[ucell.nat*3]; - // ZEROS(cf, ucell.nat); - for (int ib = 0; ib < nbandstot; ib++) - { - double fac; - if (ib < nksbands) - { - fac = wg(ik, ib) * 2.0 * GlobalC::ucell.tpiba; - } - else - { - fac = p_kv->wk[ik] * 2.0 * GlobalC::ucell.tpiba; - } - int iat = 0; - int sum = 0; - for (int it = 0; it < GlobalC::ucell.ntype; it++) - { - const int Nprojs = GlobalC::ucell.atoms[it].ncpp.nh; - for (int ia = 0; ia < GlobalC::ucell.atoms[it].na; ia++) - { - for (int ip = 0; ip < Nprojs; ip++) - { - double ps = GlobalC::ppcell.deeq(current_spin, iat, ip, ip); - const int inkb = sum + ip; - // out<<"\n ps = "< ucell.atoms[it].lmax+1 ) //{zws add 20160110 - //{ - // cout << " \n multi-projector force calculation ... " << endl; - for (int ip = 0; ip < Nprojs; ip++) - { - const int inkb = sum + ip; - // for (int ip2=0; ip2 for all beta functions + nl_tools.cal_dbecp_f(ik, max_nbands, nksbands, ipol, psi_in.get_pointer(), 0); + nl_tools.cal_dbecp_f(ik, max_nbands, nstobands, ipol, stowf.shchi->get_pointer(), nksbands); + nl_tools.revert_vkb(ik, ipol); + } + nl_tools.cal_force(ik, max_nbands, nksbands, true, force, 0); + nl_tools.cal_force(ik, max_nbands, nstobands, false, force, nksbands); + } // end ik + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcenl.c, force, forcenl.nr * forcenl.nc); + delmem_var_op()(this->ctx, force); // sum up forcenl from all processors Parallel_Reduce::reduce_all(forcenl.c, forcenl.nr * forcenl.nc); + ModuleBase::timer::tick("Sto_Forces", "cal_force_nl"); return; } + +template class Sto_Forces; +#if ((defined __CUDA) || (defined __ROCM)) +template class Sto_Forces; +#endif diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_forces.h b/source/module_hamilt_pw/hamilt_stodft/sto_forces.h index 2655821f0b..e6620a7791 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_forces.h +++ b/source/module_hamilt_pw/hamilt_stodft/sto_forces.h @@ -5,7 +5,8 @@ #include "module_psi/psi.h" #include "sto_wf.h" -class Sto_Forces : public Forces +template +class Sto_Forces : public Forces { public: /* This routine is a driver routine which compute the forces @@ -17,26 +18,36 @@ class Sto_Forces : public Forces * (4) cal_nl: contribution due to the non-local pseudopotential. * (4) cal_scc: contributino due to incomplete SCF calculation. */ - Sto_Forces(const int nat_in) : Forces(nat_in){}; + Sto_Forces(const int nat_in) : Forces(nat_in){}; ~Sto_Forces(){}; void cal_stoforce(ModuleBase::matrix& force, const elecstate::ElecState& elec, ModulePW::PW_Basis* rho_basis, ModuleSymmetry::Symmetry* p_symm, - Structure_Factor* p_sf, + const Structure_Factor* p_sf, K_Vectors* pkv, ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf); + const pseudopot_cell_vnl& nlpp, + UnitCell& ucell, + const psi::Psi, Device>& psi_in, + const Stochastic_WF, Device>& stowf); private: void cal_sto_force_nl(ModuleBase::matrix& forcenl, const ModuleBase::matrix& wg, K_Vectors* p_kv, ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf); + const Structure_Factor* p_sf, + const pseudopot_cell_vnl& nlpp, + const UnitCell& ucell, + const psi::Psi, Device>& psi_in, + const Stochastic_WF, Device>& stowf); + + using resmem_var_op = base_device::memory::resize_memory_op; + using delmem_var_op = base_device::memory::delete_memory_op; + using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; + using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op; }; #endif diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp index 4983ab1a6f..0f450bdc31 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp @@ -55,6 +55,7 @@ template void Stochastic_Iter::orthog(const int& ik, psi::Psi& psi, Stochastic_WF& stowf) { ModuleBase::TITLE("Stochastic_Iter", "orthog"); + ModuleBase::timer::tick("Stochastic_Iter", "orthog"); // orthogonal part if (PARAM.inp.nbands > 0) { @@ -110,6 +111,7 @@ void Stochastic_Iter::orthog(const int& ik, psi::Psi& psi, npwx); delmem_complex_op()(this->ctx, sum); } + ModuleBase::timer::tick("Stochastic_Iter", "orthog"); } template @@ -119,6 +121,7 @@ void Stochastic_Iter::checkemm(const int& ik, Stochastic_WF& stowf) { ModuleBase::TITLE("Stochastic_Iter", "checkemm"); + ModuleBase::timer::tick("Stochastic_Iter", "checkemm"); // iter = 1,2,... istep = 0,1,2,... // if( istep%PARAM.inp.initsto_freq != 0 ) return; const int npw = stowf.ngk[ik]; @@ -192,6 +195,7 @@ void Stochastic_Iter::checkemm(const int& ik, } change = false; } + ModuleBase::timer::tick("Stochastic_Iter", "checkemm"); } template @@ -412,6 +416,7 @@ void Stochastic_Iter::calPn(const int& ik, Stochastic_WF& template double Stochastic_Iter::calne(elecstate::ElecState* pes) { + ModuleBase::TITLE("Stochastic_Iter", "calne"); ModuleBase::timer::tick("Stochastic_Iter", "calne"); double totne = 0; KS_ne = 0; @@ -455,12 +460,15 @@ double Stochastic_Iter::calne(elecstate::ElecState* pes) template void Stochastic_Iter::calHsqrtchi(Stochastic_WF& stowf) { + ModuleBase::TITLE("Stochastic_Iter", "calHsqrtchi"); + ModuleBase::timer::tick("Stochastic_Iter", "calHsqrtchi"); auto nroot_fd = std::bind(&Sto_Func::nroot_fd, &this->stofunc, std::placeholders::_1); p_che->calcoef_real(nroot_fd); for (int ik = 0; ik < this->pkv->get_nks(); ++ik) { this->calTnchi_ik(ik, stowf); } + ModuleBase::timer::tick("Stochastic_Iter", "calHsqrtchi"); } template diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp index 73205edecd..1ea3b51321 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp @@ -5,19 +5,22 @@ #include "module_hamilt_pw/hamilt_pwdft/structure_factor.h" #include "module_io/output_log.h" #include "module_parameter/parameter.h" - -void Sto_Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, - const elecstate::ElecState& elec, - ModulePW::PW_Basis* rho_basis, - ModuleSymmetry::Symmetry* p_symm, - Structure_Factor* p_sf, - K_Vectors* p_kv, - ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf, - const Charge* const chr, - pseudopot_cell_vnl* nlpp_in, - const UnitCell& ucell_in) +#include "module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.h" +#include "module_hamilt_pw/hamilt_pwdft/fs_kin_tools.h" + +template +void Sto_Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, + const elecstate::ElecState& elec, + ModulePW::PW_Basis* rho_basis, + ModuleSymmetry::Symmetry* p_symm, + Structure_Factor* p_sf, + K_Vectors* p_kv, + ModulePW::PW_Basis_K* wfc_basis, + const psi::Psi, Device>& psi_in, + const Stochastic_WF, Device>& stowf, + const Charge* const chr, + pseudopot_cell_vnl* nlpp, + const UnitCell& ucell_in) { ModuleBase::TITLE("Sto_Stress_PW", "cal_stress"); ModuleBase::timer::tick("Sto_Stress_PW", "cal_stress"); @@ -33,29 +36,29 @@ void Sto_Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, ModuleBase::matrix sigmaxcc(3, 3); // kinetic contribution - sto_stress_kin(sigmakin, wg, p_symm, p_kv, wfc_basis, psi_in, stowf); + this->sto_stress_kin(sigmakin, wg, p_symm, p_kv, wfc_basis, psi_in, stowf); // hartree contribution - stress_har(sigmahar, rho_basis, true, chr); + this->stress_har(sigmahar, rho_basis, true, chr); // ewald contribution - stress_ewa(sigmaewa, rho_basis, true); + this->stress_ewa(sigmaewa, rho_basis, true); // xc contribution: add gradient corrections(non diagonal) for (int i = 0; i < 3; ++i) { sigmaxc(i, i) = -(elec.f_en.etxc - elec.f_en.vtxc) / this->ucell->omega; } - stress_gga(sigmaxc, rho_basis, chr); + this->stress_gga(sigmaxc, rho_basis, chr); // local contribution - stress_loc(sigmaloc, rho_basis, p_sf, true, chr); + this->stress_loc(sigmaloc, rho_basis, p_sf, true, chr); // nlcc - stress_cc(sigmaxcc, rho_basis, p_sf, true, chr); + this->stress_cc(sigmaxcc, rho_basis, p_sf, true, chr); // nonlocal - sto_stress_nl(sigmanl, wg, p_sf, p_symm, p_kv, wfc_basis, psi_in, stowf, nlpp_in); + this->sto_stress_nl(sigmanl, wg, p_sf, p_symm, p_kv, wfc_basis, *nlpp, ucell_in, psi_in, stowf); for (int ipol = 0; ipol < 3; ++ipol) { @@ -77,9 +80,10 @@ void Sto_Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, if (PARAM.inp.test_stress) { + ry = true; GlobalV::ofs_running << "\n PARTS OF STRESS: " << std::endl; - GlobalV::ofs_running << setiosflags(std::ios::showpos); - GlobalV::ofs_running << setiosflags(std::ios::fixed) << std::setprecision(10) << std::endl; + GlobalV::ofs_running << std::setiosflags(std::ios::showpos); + GlobalV::ofs_running << std::setiosflags(std::ios::fixed) << std::setprecision(10) << std::endl; ModuleIO::print_stress("KINETIC STRESS", sigmakin, PARAM.inp.test_stress, ry); ModuleIO::print_stress("LOCAL STRESS", sigmaloc, PARAM.inp.test_stress, ry); ModuleIO::print_stress("HARTREE STRESS", sigmahar, PARAM.inp.test_stress, ry); @@ -93,394 +97,145 @@ void Sto_Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, return; } -void Sto_Stress_PW::sto_stress_kin(ModuleBase::matrix& sigma, - const ModuleBase::matrix& wg, - ModuleSymmetry::Symmetry* p_symm, - K_Vectors* p_kv, - ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf) +template +void Sto_Stress_PW::sto_stress_kin(ModuleBase::matrix& sigma, + const ModuleBase::matrix& wg, + ModuleSymmetry::Symmetry* p_symm, + K_Vectors* p_kv, + ModulePW::PW_Basis_K* wfc_basis, + const psi::Psi, Device>& psi, + const Stochastic_WF, Device>& stowf) { - ModuleBase::TITLE("Sto_Stress_PW", "cal_stress"); - ModuleBase::timer::tick("Sto_Stress_PW", "cal_stress"); - double** gk; - gk = new double*[3]; - ModuleBase::matrix s_kin(3, 3, true); + ModuleBase::TITLE("Sto_Stress_PW", "stress_kin"); + ModuleBase::timer::tick("Sto_Stress_PW", "stress_kin"); - const int npwx = wfc_basis->npwk_max; - const int nks = wfc_basis->nks; - gk[0] = new double[npwx]; - gk[1] = new double[npwx]; - gk[2] = new double[npwx]; - double tpiba = ModuleBase::TWO_PI / this->ucell->lat0; - double twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI); - double* kfac = new double[npwx]; - int nksbands = psi_in->get_nbands(); + int nksbands = psi.get_nbands(); if (GlobalV::MY_STOGROUP != 0) { nksbands = 0; } + + hamilt::FS_Kin_tools kin_tool(*this->ucell, p_kv, wfc_basis, wg); - for (int ik = 0; ik < nks; ++ik) + for (int ik = 0; ik < wfc_basis->nks; ++ik) { - const int nstobands = stowf.nchip[ik]; - const int nbandstot = nstobands + nksbands; - const int npw = wfc_basis->npwk[ik]; - for (int i = 0; i < npw; ++i) - { - gk[0][i] = wfc_basis->getgpluskcar(ik, i)[0] * tpiba; - gk[1][i] = wfc_basis->getgpluskcar(ik, i)[1] * tpiba; - gk[2][i] = wfc_basis->getgpluskcar(ik, i)[2] * tpiba; - if (wfc_basis->erf_height > 0) - { - double gk2 = gk[0][i] * gk[0][i] + gk[1][i] * gk[1][i] + gk[2][i] * gk[2][i]; - double arg = (gk2 - wfc_basis->erf_ecut) / wfc_basis->erf_sigma; - kfac[i] = 1.0 + wfc_basis->erf_height / wfc_basis->erf_sigma * twobysqrtpi * std::exp(-arg * arg); - } - else - { - kfac[i] = 1.0; - } - } - - // kinetic contribution - - for (int l = 0; l < 3; ++l) - { - for (int m = 0; m < l + 1; ++m) - { - for (int ibnd = 0; ibnd < nbandstot; ++ibnd) - { - if (ibnd < nksbands) - { - for (int i = 0; i < npw; ++i) - { - std::complex p = psi_in->operator()(ik, ibnd, i); - double np = p.real() * p.real() + p.imag() * p.imag(); - s_kin(l, m) += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] * np; - } - } - else - { - for (int i = 0; i < npw; ++i) - { - std::complex p = stowf.shchi->operator()(ik, ibnd - nksbands, i); - double np = p.real() * p.real() + p.imag() * p.imag(); - s_kin(l, m) += p_kv->wk[ik] * gk[l][i] * gk[m][i] * kfac[i] * np; - } - } - } - } - } - } - - for (int l = 0; l < 3; ++l) - { - for (int m = 0; m < l; ++m) - { - s_kin(m, l) = s_kin(l, m); - } - } + const int stobands = stowf.nchip[ik]; + psi.fix_k(ik); + stowf.shchi->fix_k(ik); + + kin_tool.cal_gk(ik); - for (int l = 0; l < 3; ++l) - { - for (int m = 0; m < 3; ++m) - { - s_kin(l, m) *= ModuleBase::e2 / this->ucell->omega; - } + kin_tool.cal_stress_kin(ik, nksbands, true, psi.get_pointer()); + kin_tool.cal_stress_kin(ik, stobands, false, stowf.shchi->get_pointer()); } - Parallel_Reduce::reduce_all(s_kin.c, 9); - - for (int l = 0; l < 3; ++l) - { - for (int m = 0; m < 3; ++m) - { - sigma(l, m) = s_kin(l, m); - } - } - // do symmetry - if (ModuleSymmetry::Symmetry::symm_flag == 1) - { - p_symm->symmetrize_mat3(sigma, this->ucell->lat); - } - delete[] gk[0]; - delete[] gk[1]; - delete[] gk[2]; - delete[] gk; - delete[] kfac; - ModuleBase::timer::tick("Sto_Stress_PW", "cal_stress"); + kin_tool.symmetrize_stress(p_symm, sigma); + ModuleBase::timer::tick("Sto_Stress_PW", "stress_kin"); return; } -void Sto_Stress_PW::sto_stress_nl(ModuleBase::matrix& sigma, - const ModuleBase::matrix& wg, - Structure_Factor* p_sf, - ModuleSymmetry::Symmetry* p_symm, - K_Vectors* p_kv, - ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf, - pseudopot_cell_vnl* nlpp_in) +template +void Sto_Stress_PW::sto_stress_nl(ModuleBase::matrix& sigma, + const ModuleBase::matrix& wg, + Structure_Factor* p_sf, + ModuleSymmetry::Symmetry* p_symm, + K_Vectors* p_kv, + ModulePW::PW_Basis_K* wfc_basis, + const pseudopot_cell_vnl& nlpp, + const UnitCell& ucell, + const psi::Psi, Device>& psi_in, + const Stochastic_WF, Device>& stowf) { ModuleBase::TITLE("Sto_Stress_Func", "stres_nl"); - ModuleBase::timer::tick("Sto_Stress_Func", "stres_nl"); - - this->nlpp = nlpp_in; - const int nkb = this->nlpp->nkb; + const int nkb = nlpp.nkb; if (nkb == 0) { - ModuleBase::timer::tick("Stress_Func", "stres_nl"); return; } - ModuleBase::matrix sigmanlc(3, 3, true); + ModuleBase::timer::tick("Sto_Stress_Func", "stres_nl"); + int* nchip = stowf.nchip; const int npwx = wfc_basis->npwk_max; - int nksbands = psi_in->get_nbands(); + int nksbands = psi_in.get_nbands(); if (GlobalV::MY_STOGROUP != 0) { nksbands = 0; } - // vkb1: |Beta(nkb,npw)> - ModuleBase::ComplexMatrix vkb1(nkb, npwx); - ModuleBase::ComplexMatrix vkb0[3]; - for (int i = 0; i < 3; ++i) - { - vkb0[i].create(nkb, npwx); - } - ModuleBase::ComplexMatrix vkb2(nkb, npwx); + // allocate memory for the stress + FPTYPE* stress_device = nullptr; + resmem_var_op()(this->ctx, stress_device, 9); + setmem_var_op()(this->ctx, stress_device, 0, 9); + std::vector sigmanlc(9, 0.0); + + hamilt::FS_Nonlocal_tools nl_tools(&nlpp, &ucell, p_kv, wfc_basis, p_sf, wg, nullptr); for (int ik = 0; ik < p_kv->get_nks(); ik++) { - const int nstobands = stowf.nchip[ik]; - const int nbandstot = nstobands + nksbands; - const int npw = p_kv->ngk[ik]; - // dbecp: conj( -iG * ) - ModuleBase::ComplexMatrix dbecp(nbandstot, nkb); - ModuleBase::ComplexMatrix becp(nbandstot, nkb); - - const int current_spin = p_kv->isk[ik]; - // generate vkb - if (this->nlpp->nkb > 0) - { - this->nlpp->getvnl(ik, this->nlpp->vkb); - } - - // get becp according to wave functions and vkb - // important here ! becp must set zero!! - // vkb: Beta(nkb,npw) - // becp(nkb,nbnd): - becp.zero_out(); - char transa = 'C'; - char transb = 'N'; - psi_in[0].fix_k(ik); + const int nstobands = nchip[ik]; + const int max_nbands = stowf.shchi->get_nbands() + nksbands; + const int npw = wfc_basis->npwk[ik]; + psi_in.fix_k(ik); stowf.shchi->fix_k(ik); - // KS orbitals - int npmks = PARAM.globalv.npol * nksbands; - zgemm_(&transa, - &transb, - &nkb, - &npmks, - &npw, - &ModuleBase::ONE, - this->nlpp->vkb.c, - &npwx, - psi_in->get_pointer(), - &npwx, - &ModuleBase::ZERO, - becp.c, - &nkb); - // stochastic orbitals - int npmsto = PARAM.globalv.npol * nstobands; - zgemm_(&transa, - &transb, - &nkb, - &npmsto, - &npw, - &ModuleBase::ONE, - this->nlpp->vkb.c, - &npwx, - stowf.shchi->get_pointer(), - &npwx, - &ModuleBase::ZERO, - &becp(nksbands, 0), - &nkb); - - Parallel_Reduce::reduce_pool(becp.c, becp.size); - - for (int i = 0; i < 3; ++i) + nl_tools.cal_vkb(ik, max_nbands); + // calculate becp = for all beta functions + nl_tools.cal_becp(ik, nksbands, psi_in.get_pointer(), 0); + nl_tools.cal_becp(ik, nstobands, stowf.shchi->get_pointer(), nksbands); + nl_tools.reduce_pool_becp(max_nbands); + // calculate dbecp = for all beta functions + // calculate stress = \sum * * D_{ij} + for (int ipol = 0; ipol < 3; ipol++) { - get_dvnl1(vkb0[i], ik, i, p_sf, wfc_basis); - } - - get_dvnl2(vkb2, ik, p_sf, wfc_basis); - - ModuleBase::Vector3 qvec; - double* qvec0[3]; - qvec0[0] = &(qvec.x); - qvec0[1] = &(qvec.y); - qvec0[2] = &(qvec.z); - - for (int ipol = 0; ipol < 3; ++ipol) - { - for (int jpol = 0; jpol < ipol + 1; ++jpol) + for (int jpol = 0; jpol <= ipol; jpol++) { - dbecp.zero_out(); - vkb1.zero_out(); - for (int i = 0; i < nkb; ++i) - { - std::complex* pvkb0i = &vkb0[ipol](i, 0); - std::complex* pvkb0j = &vkb0[jpol](i, 0); - std::complex* pvkb1 = &vkb1(i, 0); - // third term of dbecp_noevc - for (int ig = 0; ig < npw; ++ig) - { - qvec = wfc_basis->getgpluskcar(ik, ig); - pvkb1[ig] += 0.5 * qvec0[ipol][0] * pvkb0j[ig] + 0.5 * qvec0[jpol][0] * pvkb0i[ig]; - } // end ig - } // end nkb - - ModuleBase::ComplexMatrix dbecp_noevc(nkb, npwx, true); - for (int i = 0; i < nkb; ++i) - { - std::complex* pdbecp_noevc = &dbecp_noevc(i, 0); - std::complex* pvkb = &vkb1(i, 0); - // first term - for (int ig = 0; ig < npw; ++ig) - { - pdbecp_noevc[ig] -= 2.0 * pvkb[ig]; - } - // second termi - if (ipol == jpol) - { - pvkb = &this->nlpp->vkb(i, 0); - for (int ig = 0; ig < npw; ++ig) - { - pdbecp_noevc[ig] -= pvkb[ig]; - } - } - // third term - pvkb = &vkb2(i, 0); - for (int ig = 0; ig < npw; ++ig) - { - qvec = wfc_basis->getgpluskcar(ik, ig); - double qm1; - if (qvec.norm2() > 1e-16) - { - qm1 = 1.0 / qvec.norm(); - } - else - { - qm1 = 0; - } - pdbecp_noevc[ig] -= 2.0 * pvkb[ig] * qvec0[ipol][0] * qvec0[jpol][0] * qm1 * this->ucell->tpiba; - } // end ig - } // end i - - // //KS orbitals - zgemm_(&transa, - &transb, - &nkb, - &npmks, - &npw, - &ModuleBase::ONE, - dbecp_noevc.c, - &npwx, - psi_in->get_pointer(), - &npwx, - &ModuleBase::ZERO, - dbecp.c, - &nkb); - // stochastic orbitals - zgemm_(&transa, - &transb, - &nkb, - &npmsto, - &npw, - &ModuleBase::ONE, - dbecp_noevc.c, - &npwx, - stowf.shchi->get_pointer(), - &npwx, - &ModuleBase::ZERO, - &dbecp(nksbands, 0), - &nkb); - - for (int ib = 0; ib < nbandstot; ++ib) - { - double fac; - if (ib < nksbands) - { - fac = wg(ik, ib); - } - else - { - fac = p_kv->wk[ik]; - } - int iat = 0; - int sum = 0; - for (int it = 0; it < this->ucell->ntype; ++it) - { - const int Nprojs = this->ucell->atoms[it].ncpp.nh; - for (int ia = 0; ia < this->ucell->atoms[it].na; ++ia) - { - for (int ip = 0; ip < Nprojs; ++ip) - { - double ps = this->nlpp->deeq(current_spin, iat, ip, ip); - const int inkb = sum + ip; - // out<<"\n ps = "<get_pointer(), nksbands); + nl_tools.cal_stress(ik, nksbands, true, ipol, jpol, stress_device, 0); + nl_tools.cal_stress(ik, nstobands, false, ipol, jpol, stress_device, nksbands); + } + } - } // end ip - ++iat; - sum += Nprojs; - } // ia - } // end it - } // end band - } // end jpol - } // end ipol - } // end ik + } - for (int l = 0; l < 3; ++l) + // transfer stress from device to host + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigmanlc.data(), stress_device, 9); + delmem_var_op()(this->ctx, stress_device); + // sum up forcenl from all processors + for (int l = 0; l < 3; l++) { - for (int m = 0; m < 3; ++m) + for (int m = 0; m < 3; m++) { if (m > l) { - sigmanlc(l, m) = sigmanlc(m, l); + sigmanlc[l * 3 + m] = sigmanlc[m * 3 + l]; } } } // sum up forcenl from all processors - Parallel_Reduce::reduce_all(sigmanlc.c, 9); + Parallel_Reduce::reduce_all(sigmanlc.data(), 9); for (int ipol = 0; ipol < 3; ++ipol) { for (int jpol = 0; jpol < 3; ++jpol) { - sigmanlc(ipol, jpol) *= 1.0 / this->ucell->omega; - } - } - - for (int ipol = 0; ipol < 3; ++ipol) - { - for (int jpol = 0; jpol < 3; ++jpol) - { - sigma(ipol, jpol) = sigmanlc(ipol, jpol); + sigma(ipol, jpol) = sigmanlc[ipol * 3 + jpol] / ucell.omega; } } // do symmetry if (ModuleSymmetry::Symmetry::symm_flag == 1) { - p_symm->symmetrize_mat3(sigma, this->ucell->lat); + p_symm->symmetrize_mat3(sigma, ucell.lat); } - // this->print(ofs_running, "nonlocal stress", stresnl); ModuleBase::timer::tick("Sto_Stress_Func", "stres_nl"); return; } + + +template class Sto_Stress_PW; +#if ((defined __CUDA) || (defined __ROCM)) +template class Sto_Stress_PW; +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.h b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.h index e3fc3eadd6..7b25166dac 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.h +++ b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.h @@ -7,7 +7,8 @@ #include "sto_wf.h" // qianrui create 2021-6-4 -class Sto_Stress_PW : public Stress_Func +template +class Sto_Stress_PW : public Stress_Func { public: Sto_Stress_PW(){}; @@ -21,8 +22,8 @@ class Sto_Stress_PW : public Stress_Func Structure_Factor* p_sf, K_Vectors* p_kv, ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf, + const psi::Psi, Device>& psi_in, + const Stochastic_WF, Device>& stowf, const Charge* const chr, pseudopot_cell_vnl* nlpp_in, const UnitCell& ucell_in); @@ -33,8 +34,8 @@ class Sto_Stress_PW : public Stress_Func ModuleSymmetry::Symmetry* p_symm, K_Vectors* p_kv, ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf); + const psi::Psi, Device>& psi_in, + const Stochastic_WF, Device>& stowf); void sto_stress_nl(ModuleBase::matrix& sigma, const ModuleBase::matrix& wg, @@ -42,8 +43,14 @@ class Sto_Stress_PW : public Stress_Func ModuleSymmetry::Symmetry* p_symm, K_Vectors* p_kv, ModulePW::PW_Basis_K* wfc_basis, - const psi::Psi>* psi_in, - Stochastic_WF, base_device::DEVICE_CPU>& stowf, - pseudopot_cell_vnl* nlpp_in); + const pseudopot_cell_vnl& nlpp, + const UnitCell& ucell, + const psi::Psi, Device>& psi, + const Stochastic_WF, Device>& stowf); + private: + using resmem_var_op = base_device::memory::resize_memory_op; + using setmem_var_op = base_device::memory::set_memory_op; + using delmem_var_op = base_device::memory::delete_memory_op; + using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op; }; #endif diff --git a/source/module_hsolver/hsolver_pw_sdft.cpp b/source/module_hsolver/hsolver_pw_sdft.cpp index 8698324f79..971873a13b 100644 --- a/source/module_hsolver/hsolver_pw_sdft.cpp +++ b/source/module_hsolver/hsolver_pw_sdft.cpp @@ -41,6 +41,7 @@ void HSolverPW_SDFT::solve(hamilt::Hamilt* pHamilt, // part of KSDFT to get KS orbitals for (int ik = 0; ik < nks; ++ik) { + ModuleBase::timer::tick("HSolverPW_SDFT", "solve_KS"); pHamilt->updateHk(ik); if (nbands > 0 && GlobalV::MY_STOGROUP == 0) { @@ -55,10 +56,11 @@ void HSolverPW_SDFT::solve(hamilt::Hamilt* pHamilt, #ifdef __MPI if (nbands > 0 && PARAM.inp.bndpar > 1) { - Parallel_Common::bcast_complex(this->ctx, &psi(ik, 0, 0), npwx * nbands, PARAPW_WORLD, &psi_cpu(ik, 0, 0)); + Parallel_Common::bcast_dev(this->ctx, &psi(ik, 0, 0), npwx * nbands, PARAPW_WORLD, &psi_cpu(ik, 0, 0)); MPI_Bcast(&pes->ekb(ik, 0), nbands, MPI_DOUBLE, 0, PARAPW_WORLD); } #endif + ModuleBase::timer::tick("HSolverPW_SDFT", "solve_KS"); stoiter.orthog(ik, psi, stowf); stoiter.checkemm(ik, istep, iter, stowf); // check and reset emax & emin } diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp index 7c3997bc22..badf2e4b98 100644 --- a/source/module_io/input_conv.cpp +++ b/source/module_io/input_conv.cpp @@ -183,7 +183,7 @@ void Input_Conv::Convert() if (PARAM.inp.device == "gpu" && PARAM.inp.basis_type == "pw") { - GlobalV::KPAR = base_device::information::get_device_kpar(PARAM.inp.kpar); + GlobalV::KPAR = base_device::information::get_device_kpar(PARAM.inp.kpar, PARAM.inp.bndpar); } #ifdef __LCAO else if (PARAM.inp.basis_type == "lcao") { diff --git a/source/module_io/read_input_item_system.cpp b/source/module_io/read_input_item_system.cpp index fb698ae877..1ceae0b79c 100644 --- a/source/module_io/read_input_item_system.cpp +++ b/source/module_io/read_input_item_system.cpp @@ -256,6 +256,12 @@ void ReadInput::item_system() para.input.bndpar = GlobalV::NPROC; } }; + item.check_value = [](const Input_Item& item, const Parameter& para) { + if (GlobalV::NPROC % para.input.bndpar != 0) + { + ModuleBase::WARNING_QUIT("ReadInput", "The number of processors can not be divided by bndpar"); + } + }; this->add_item(item); } { diff --git a/source/module_psi/psi.cpp b/source/module_psi/psi.cpp index a164da46a8..f129d3e422 100644 --- a/source/module_psi/psi.cpp +++ b/source/module_psi/psi.cpp @@ -227,7 +227,7 @@ template T* Psi::get_pointer(const int& { assert(ikb >= 0); assert(this->k_first ? ikb < this->nbands : ikb < this->nk); - return &this->psi_current[ikb * this->nbasis]; + return this->psi_current + ikb * this->nbasis; } template const int* Psi::get_ngk_pointer() const diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT new file mode 100644 index 0000000000..beef5ef8b4 --- /dev/null +++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/INPUT @@ -0,0 +1,43 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation md +esolver_type sdft +method_sto 2 +device gpu + +symmetry 0 +pseudo_dir ../../PP_ORB + +nbands 0 +nbands_sto all + +nche_sto 90 +seed_sto 20000 +kpar 1 +bndpar 2 +cal_force 1 +cal_stress 1 + +#Parameters (2.Iteration) +ecutwfc 30 +scf_thr 1e-4 +scf_nmax 20 + + +#Parameters (3.Basis) +basis_type pw + +#Parameters (4.Smearing) +smearing_method fd +smearing_sigma 0.6 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.4 + +#MD +md_tfirst 10 +md_tfreq 0.1 +md_nstep 2 +init_vel 1 diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/README b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/README new file mode 100644 index 0000000000..cea120ea70 --- /dev/null +++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/README @@ -0,0 +1,11 @@ +This test for: +*SDFT +*MD +*Si +*kpoints 1*1*1 +*All complete basis +*mixing_type broyden +*mixing_beta 0.4 +*seed_sto > 0 +*sto_method 2 +*bndpar 2 diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU new file mode 100644 index 0000000000..408b4bd8fa --- /dev/null +++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/STRU @@ -0,0 +1,19 @@ +ATOMIC_SPECIES +Si 28 Si.pz-vbc.UPF + +LATTICE_CONSTANT +4 + +LATTICE_VECTORS +1 0 0 #latvec1 +0 1 0 #latvec2 +0 0 1 #latvec3 + +ATOMIC_POSITIONS +Cartesian + +Si #label +0 #magnetism +2 #number of atoms +0 0 0 m 1 1 1 v 1.62777250569e-06 -2.01685158301e-05 2.2830936097e-05 +0.5 0.5 0.5 m 1 1 1 v -1.62777250569e-06 2.01685158301e-05 -2.2830936097e-05 diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd new file mode 100644 index 0000000000..ad033a1631 --- /dev/null +++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/jd @@ -0,0 +1 @@ +test sdft MD, nstep = 2 diff --git a/tests/integrate/187_PW_MD_SDFT_ALL_GPU/result.ref b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/result.ref new file mode 100644 index 0000000000..ef2e7fcc80 --- /dev/null +++ b/tests/integrate/187_PW_MD_SDFT_ALL_GPU/result.ref @@ -0,0 +1,5 @@ +etotref -228.9492279657028 +etotperatomref -114.4746139829 +totalforceref 0.510260 +totalstressref 42801.474893 +totaltimeref 7.82 diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/INPUT b/tests/integrate/187_PW_SDFT_ALL_GPU/INPUT new file mode 100644 index 0000000000..a6804500a7 --- /dev/null +++ b/tests/integrate/187_PW_SDFT_ALL_GPU/INPUT @@ -0,0 +1,38 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf +esolver_type sdft +method_sto 1 + +symmetry 0 +pseudo_dir ../../PP_ORB +device gpu + +kpar 2 +bndpar 1 + +nbands 0 +nbands_sto all + +nche_sto 120 +seed_sto 20000 +cal_force 1 +cal_stress 1 + +#Parameters (2.Iteration) +ecutwfc 20 +scf_thr 1e-6 +scf_nmax 30 + + +#Parameters (3.Basis) +basis_type pw + +#Parameters (4.Smearing) +smearing_method fd +smearing_sigma 0.6 + +#Parameters (5.Mixing) +mixing_type plain +mixing_beta 0.7 diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/KPT b/tests/integrate/187_PW_SDFT_ALL_GPU/KPT new file mode 100644 index 0000000000..24c3230814 --- /dev/null +++ b/tests/integrate/187_PW_SDFT_ALL_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +2 2 2 0.5 0.5 0.5 diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/README b/tests/integrate/187_PW_SDFT_ALL_GPU/README new file mode 100644 index 0000000000..9cd65665cb --- /dev/null +++ b/tests/integrate/187_PW_SDFT_ALL_GPU/README @@ -0,0 +1,11 @@ +This test for: +*SDFT +*Si +*kpoints 2*2*2 shift +*complete orbitals +*mixing_type plain +*mixing_beta 0.7 +*method_sto 1 +*seed_sto > 0 +*bndpar 1 +*kpar 2 diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/STRU b/tests/integrate/187_PW_SDFT_ALL_GPU/STRU new file mode 100644 index 0000000000..28a94e8c0c --- /dev/null +++ b/tests/integrate/187_PW_SDFT_ALL_GPU/STRU @@ -0,0 +1,19 @@ +ATOMIC_SPECIES +Si 28 Si.pz-vbc.UPF + +LATTICE_CONSTANT +5 // add lattice constant + +LATTICE_VECTORS +0.5 0 0.5 +0.5 0.5 0 +0 0.5 0.5 + +ATOMIC_POSITIONS +Direct + +Si // Element type +0.0 // magnetism +2 +0.10 0.00 0.20 1 1 1 +0.5 0.5 0.5 1 1 1 diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/jd b/tests/integrate/187_PW_SDFT_ALL_GPU/jd new file mode 100644 index 0000000000..1fb7d0f4b9 --- /dev/null +++ b/tests/integrate/187_PW_SDFT_ALL_GPU/jd @@ -0,0 +1 @@ +test parallel method kpar and GPU \ No newline at end of file diff --git a/tests/integrate/187_PW_SDFT_ALL_GPU/result.ref b/tests/integrate/187_PW_SDFT_ALL_GPU/result.ref new file mode 100644 index 0000000000..dd61868257 --- /dev/null +++ b/tests/integrate/187_PW_SDFT_ALL_GPU/result.ref @@ -0,0 +1,5 @@ +etotref -105.2612355454176907 +etotperatomref -52.6306177727 +totalforceref 197.906706 +totalstressref 254537.682905 +totaltimeref 20.82 diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/INPUT b/tests/integrate/187_PW_SDFT_MALL_GPU/INPUT new file mode 100644 index 0000000000..a0855d63f9 --- /dev/null +++ b/tests/integrate/187_PW_SDFT_MALL_GPU/INPUT @@ -0,0 +1,38 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf +esolver_type sdft +method_sto 2 +device gpu + +symmetry 0 +pseudo_dir ../../PP_ORB + +kpar 1 +bndpar 2 + +nbands 10 +nbands_sto all + +nche_sto 120 +seed_sto 20000 +cal_force 1 +cal_stress 1 + +#Parameters (2.Iteration) +ecutwfc 30 +scf_thr 1e-6 +scf_nmax 20 + + +#Parameters (3.Basis) +basis_type pw + +#Parameters (4.Smearing) +smearing_method fd +smearing_sigma 0.6 + +#Parameters (5.Mixing) +mixing_type plain +mixing_beta 0.7 diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/KPT b/tests/integrate/187_PW_SDFT_MALL_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/187_PW_SDFT_MALL_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/README b/tests/integrate/187_PW_SDFT_MALL_GPU/README new file mode 100644 index 0000000000..b0fc10b3f9 --- /dev/null +++ b/tests/integrate/187_PW_SDFT_MALL_GPU/README @@ -0,0 +1,11 @@ +This test for: +*SDFT +*Si +*kpoints 1*1*1 +*10 KS + complete orbitals +*mixing_type plain +*mixing_beta 0.7 +*sto_method 2 +*seed_sto > 0 +*bndpar 2 +*kpar 1 diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/STRU b/tests/integrate/187_PW_SDFT_MALL_GPU/STRU new file mode 100644 index 0000000000..28a94e8c0c --- /dev/null +++ b/tests/integrate/187_PW_SDFT_MALL_GPU/STRU @@ -0,0 +1,19 @@ +ATOMIC_SPECIES +Si 28 Si.pz-vbc.UPF + +LATTICE_CONSTANT +5 // add lattice constant + +LATTICE_VECTORS +0.5 0 0.5 +0.5 0.5 0 +0 0.5 0.5 + +ATOMIC_POSITIONS +Direct + +Si // Element type +0.0 // magnetism +2 +0.10 0.00 0.20 1 1 1 +0.5 0.5 0.5 1 1 1 diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/jd b/tests/integrate/187_PW_SDFT_MALL_GPU/jd new file mode 100644 index 0000000000..7356c219e1 --- /dev/null +++ b/tests/integrate/187_PW_SDFT_MALL_GPU/jd @@ -0,0 +1 @@ +test parallel method bndpar and GPU diff --git a/tests/integrate/187_PW_SDFT_MALL_GPU/result.ref b/tests/integrate/187_PW_SDFT_MALL_GPU/result.ref new file mode 100644 index 0000000000..dafd960f93 --- /dev/null +++ b/tests/integrate/187_PW_SDFT_MALL_GPU/result.ref @@ -0,0 +1,5 @@ +etotref -96.9361115889958853 +etotperatomref -48.4680557945 +totalforceref 248.975546 +totalstressref 230454.805813 +totaltimeref 5.17 diff --git a/tests/integrate/CASES_GPU.txt b/tests/integrate/CASES_GPU.txt index 641c3c0a36..00de10ccb6 100644 --- a/tests/integrate/CASES_GPU.txt +++ b/tests/integrate/CASES_GPU.txt @@ -1,6 +1,9 @@ 102_PW_CG_GPU 102_PW_DA_davidson_GPU 102_PW_BPCG_GPU +187_PW_SDFT_ALL_GPU +187_PW_SDFT_MALL_GPU +187_PW_MD_SDFT_ALL_GPU 930_NO_BI2SE2CU2O2_GPU 930_NO_BI2SE2CU2O2_k_GPU 931_NO_H20_GPU