diff --git a/source/module_hamilt_lcao/module_tddft/td_current.cpp b/source/module_hamilt_lcao/module_tddft/td_current.cpp index 08491769f1..84ca001195 100644 --- a/source/module_hamilt_lcao/module_tddft/td_current.cpp +++ b/source/module_hamilt_lcao/module_tddft/td_current.cpp @@ -5,6 +5,7 @@ #include "module_hamilt_lcao/module_tddft/snap_psibeta_half_tddft.h" #ifdef _OPENMP #include +#include #endif TD_current::TD_current(const UnitCell* ucell_in, @@ -161,7 +162,7 @@ void TD_current::initialize_grad_term(Grid_Driver* GridD, const Parallel_Orbital this->current_term[dir]->allocate(nullptr, true); } - ModuleBase::timer::tick("EkineticNew", "initialize_HR"); + ModuleBase::timer::tick("TD_current", "initialize_grad_term"); } void TD_current::calculate_vcomm_r() @@ -171,17 +172,8 @@ void TD_current::calculate_vcomm_r() const Parallel_Orbitals* paraV = this->current_term[0]->get_atom_pair(0).get_paraV(); const int npol = this->ucell->get_npol(); + // 1. calculate for each pair of atoms -#ifdef _OPENMP -#pragma omp parallel -{ - std::unordered_set atom_row_list; - #pragma omp for - for (int iat0 = 0; iat0 < this->ucell->nat; iat0++) - { - atom_row_list.insert(iat0); - } -#endif for (int iat0 = 0; iat0 < this->ucell->nat; iat0++) { auto tau0 = ucell->get_tau(iat0); @@ -196,99 +188,120 @@ void TD_current::calculate_vcomm_r() nlm_tot[i].resize(4); } - for (int ad = 0; ad < adjs.adj_num + 1; ++ad) + #pragma omp parallel { - const int T1 = adjs.ntype[ad]; - const int I1 = adjs.natom[ad]; - const int iat1 = ucell->itia2iat(T1, I1); - const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; - const Atom* atom1 = &ucell->atoms[T1]; - - auto all_indexes = paraV->get_indexes_row(iat1); -#ifdef _OPENMP - if(atom_row_list.find(iat1) == atom_row_list.end()) + #pragma omp for schedule(dynamic) + for (int ad = 0; ad < adjs.adj_num + 1; ++ad) { - all_indexes.clear(); - } -#endif - auto col_indexes = paraV->get_indexes_col(iat1); - // insert col_indexes into all_indexes to get universal set with no repeat elements - all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); - std::sort(all_indexes.begin(), all_indexes.end()); - all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); - for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) - { - const int iw1 = all_indexes[iw1l] / npol; - std::vector>> nlm; - // nlm is a vector of vectors, but size of outer vector is only 1 when out_current is false - // and size of outer vector is 4 when out_current is true (3 for , 1 for - // ) inner loop : all projectors (L0,M0) - - // snap_psibeta_half_tddft() are used to calculate - // and as well if current are needed - - module_tddft::snap_psibeta_half_tddft(orb_, - this->ucell->infoNL, - nlm, - tau1 * this->ucell->lat0, - T1, - atom1->iw2l[iw1], - atom1->iw2m[iw1], - atom1->iw2n[iw1], - tau0 * this->ucell->lat0, - T0, - this->cart_At, - true); - for (int dir = 0; dir < 4; dir++) + const int T1 = adjs.ntype[ad]; + const int I1 = adjs.natom[ad]; + const int iat1 = ucell->itia2iat(T1, I1); + const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; + const Atom* atom1 = &ucell->atoms[T1]; + auto all_indexes = paraV->get_indexes_row(iat1); + auto col_indexes = paraV->get_indexes_col(iat1); + // insert col_indexes into all_indexes to get universal set with no repeat elements + all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); + std::sort(all_indexes.begin(), all_indexes.end()); + all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); + for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { - nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + const int iw1 = all_indexes[iw1l] / npol; + std::vector>> nlm; + // nlm is a vector of vectors, but size of outer vector is only 1 when out_current is false + // and size of outer vector is 4 when out_current is true (3 for , 1 for + // ) inner loop : all projectors (L0,M0) + + // snap_psibeta_half_tddft() are used to calculate + // and as well if current are needed + + module_tddft::snap_psibeta_half_tddft(orb_, + this->ucell->infoNL, + nlm, + tau1 * this->ucell->lat0, + T1, + atom1->iw2l[iw1], + atom1->iw2m[iw1], + atom1->iw2n[iw1], + tau0 * this->ucell->lat0, + T0, + this->cart_At, + true); + for (int dir = 0; dir < 4; dir++) + { + nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + } } } - } -// 2. calculate D for each pair of atoms - for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1) - { - const int T1 = adjs.ntype[ad1]; - const int I1 = adjs.natom[ad1]; - const int iat1 = ucell->itia2iat(T1, I1); -#ifdef _OPENMP - if(atom_row_list.find(iat1) == atom_row_list.end()) + + #ifdef _OPENMP + // record the iat number of the adjacent atoms + std::set ad_atom_set; + for (int ad = 0; ad < adjs.adj_num + 1; ++ad) { - continue; + const int T1 = adjs.ntype[ad]; + const int I1 = adjs.natom[ad]; + const int iat1 = ucell->itia2iat(T1, I1); + ad_atom_set.insert(iat1); } -#endif - ModuleBase::Vector3& R_index1 = adjs.box[ad1]; - for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2) + + // split the ad_atom_set into num_threads parts + const int num_threads = omp_get_num_threads(); + const int thread_id = omp_get_thread_num(); + std::set ad_atom_set_thread; + int i = 0; + for(const auto iat1 : ad_atom_set) { - const int T2 = adjs.ntype[ad2]; - const int I2 = adjs.natom[ad2]; - const int iat2 = ucell->itia2iat(T2, I2); - ModuleBase::Vector3& R_index2 = adjs.box[ad2]; - ModuleBase::Vector3 R_vector(R_index2[0] - R_index1[0], - R_index2[1] - R_index1[1], - R_index2[2] - R_index1[2]); - std::complex* tmp_c[3] = {nullptr, nullptr, nullptr}; - for (int i = 0; i < 3; i++) + if (i % num_threads == thread_id) { - tmp_c[i] = this->current_term[i]->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2])->get_pointer(); + ad_atom_set_thread.insert(iat1); } - // if not found , skip this pair of atoms - if (tmp_c[0] != nullptr) + i++; + } + #endif + + // 2. calculate D for each pair of atoms + for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1) + { + const int T1 = adjs.ntype[ad1]; + const int I1 = adjs.natom[ad1]; + const int iat1 = ucell->itia2iat(T1, I1); + #ifdef _OPENMP + if (ad_atom_set_thread.find(iat1) == ad_atom_set_thread.end()) + { + continue; + } + #endif + ModuleBase::Vector3& R_index1 = adjs.box[ad1]; + for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2) { - this->cal_vcomm_r_IJR(iat1, - iat2, - T0, - paraV, - nlm_tot[ad1], - nlm_tot[ad2], - tmp_c); + const int T2 = adjs.ntype[ad2]; + const int I2 = adjs.natom[ad2]; + const int iat2 = ucell->itia2iat(T2, I2); + ModuleBase::Vector3& R_index2 = adjs.box[ad2]; + ModuleBase::Vector3 R_vector(R_index2[0] - R_index1[0], + R_index2[1] - R_index1[1], + R_index2[2] - R_index1[2]); + std::complex* tmp_c[3] = {nullptr, nullptr, nullptr}; + for (int i = 0; i < 3; i++) + { + tmp_c[i] = this->current_term[i]->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2])->get_pointer(); + } + // if not found , skip this pair of atoms + if (tmp_c[0] != nullptr) + { + this->cal_vcomm_r_IJR(iat1, + iat2, + T0, + paraV, + nlm_tot[ad1], + nlm_tot[ad2], + tmp_c); + } } } } } -#ifdef _OPENMP -} -#endif ModuleBase::timer::tick("TD_current", "calculate_vcomm_r"); }