diff --git a/source/module_hsolver/diago_cusolver.cpp b/source/module_hsolver/diago_cusolver.cpp
index 185788c07b..5e0c707bab 100644
--- a/source/module_hsolver/diago_cusolver.cpp
+++ b/source/module_hsolver/diago_cusolver.cpp
@@ -5,12 +5,23 @@
 #include "module_base/scalapack_connector.h"
 
 #include <type_traits>
+#include <memory>
+#include <vector>
 
 using complex = std::complex<double>;
 
 // Namespace for the diagonalization solver
 namespace hsolver
 {
+    // this struct is used for collecting matrices from all processes to root process
+    template <typename T> struct Matrix_g
+    {
+        std::shared_ptr<T> p;
+        size_t row;
+        size_t col;
+        std::shared_ptr<int> desc;
+    };
+
     // Initialize the DecomposedState variable for real and complex numbers
     template <typename T>
     int DiagoCusolver<T>::DecomposedState = 0;
@@ -52,33 +63,34 @@ namespace hsolver
     }
 
     // Use Cpxgemr2d to collect matrices from all processes to root process
-    template <typename mat>
+    template <typename mat, typename matg>
     static void gatherMatrix(const int myid,
                              const int root_proc,
                              const mat& mat_l,
-                             mat& mat_g)
+                             matg& mat_g)
    {
        auto a = mat_l.p;
-        decltype(a) b;
        const int* desca = mat_l.desc;
        int ctxt = desca[1];
        int nrows = desca[2];
        int ncols = desca[3];
 
        if (myid == root_proc)
-            b = new typename std::remove_reference<decltype(*a)>::type[nrows * ncols];
+        {
+            mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[nrows * ncols]);
+        }
        else
-            b = new typename std::remove_reference<decltype(*a)>::type[1];
+        {
+            mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[1]);
+        }
 
        // Set descb, which has all elements in the only block in the root process
-        int descb[9] = {1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows};
+        mat_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows});
 
-        mat_g.desc = descb;
        mat_g.row = nrows;
        mat_g.col = ncols;
-        mat_g.p = b;
 
-        Cpxgemr2d(nrows, ncols, a, 1, 1, desca, b, 1, 1, descb, ctxt);
+        Cpxgemr2d(nrows, ncols, a, 1, 1, desca, mat_g.p.get(), 1, 1, mat_g.desc.get(), ctxt);
    }
 
    // Convert the Psi to a 2D block storage format
@@ -109,23 +121,25 @@ namespace hsolver
        ModuleBase::TITLE("DiagoCusolver", "diag");
 
        // Create matrices for the Hamiltonian and overlap
-        hamilt::MatrixBlock<T> h_mat, s_mat;
+        hamilt::MatrixBlock<T> h_mat;
+        hamilt::MatrixBlock<T> s_mat;
        phm_in->matrix(h_mat, s_mat);
 
 #ifdef __MPI
        // global matrix
-        hamilt::MatrixBlock<T> h_mat_g, s_mat_g;
-
-        // global psi for distribute
-        T* psi_g;
+        Matrix_g<T> h_mat_g;
+        Matrix_g<T> s_mat_g;
 
        // get the context and process information
        int ctxt = ParaV->blacs_ctxt;
-        int nprows, npcols, myprow, mypcol;
+        int nprows = 0;
+        int npcols = 0;
+        int myprow = 0;
+        int mypcol = 0;
        Cblacs_gridinfo(ctxt, &nprows, &npcols, &myprow, &mypcol);
-        int myid = Cblacs_pnum(ctxt, myprow, mypcol);
+        const int num_procs = nprows * npcols;
+        const int myid = Cblacs_pnum(ctxt, myprow, mypcol);
        const int root_proc = Cblacs_pnum(ctxt, ParaV->desc[6], ParaV->desc[7]);
-
 #endif
 
        // Allocate memory for eigenvalues
@@ -135,29 +149,39 @@ namespace hsolver
        ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 
 #ifdef __MPI
-        // gather matrices from processes to root process
-        gatherMatrix(myid, root_proc, h_mat, h_mat_g);
-        gatherMatrix(myid, root_proc, s_mat, s_mat_g);
+        if(num_procs > 1)
+        {
+            // gather matrices from processes to root process
+            gatherMatrix(myid, root_proc, h_mat, h_mat_g);
+            gatherMatrix(myid, root_proc, s_mat, s_mat_g);
+        }
 #endif
 
        // Call the dense diagonalization routine
 #ifdef __MPI
-        MPI_Barrier(MPI_COMM_WORLD);
-        if (myid == root_proc)
+        if(num_procs > 1)
        {
-            psi_g = new T[h_mat_g.row * h_mat_g.col];
-            this->dc.Dngvd(h_mat_g.col, h_mat_g.row, h_mat_g.p, s_mat_g.p, eigen.data(), psi_g);
+            MPI_Barrier(MPI_COMM_WORLD);
+            // global psi for distribute
+            int psi_len = myid == root_proc ? h_mat_g.row * h_mat_g.col : 1;
+            std::vector<T> psi_g(psi_len);
+            if (myid == root_proc)
+            {
+                this->dc.Dngvd(h_mat_g.col, h_mat_g.row, h_mat_g.p.get(), s_mat_g.p.get(), eigen.data(), psi_g.data());
+            }
+
+            MPI_Barrier(MPI_COMM_WORLD);
+
+            // broadcast eigenvalues to all processes
+            MPI_Bcast(eigen.data(), GlobalV::NBANDS, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
+
+            // distribute psi to all processes
+            distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data());
        }
        else
        {
-            psi_g = new T[1];
+            this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), psi.get_pointer());
        }
-        MPI_Barrier(MPI_COMM_WORLD);
-        // broadcast eigenvalues to all processes
-        MPI_Bcast(eigen.data(), GlobalV::NBANDS, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
-
-        // distribute psi to all processes
-        distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g);
 #else
        // Call the dense diagonalization routine
        this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), psi.get_pointer());
@@ -165,16 +189,9 @@ namespace hsolver
        // Stop the timer for the cusolver operation
        ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 
-        // Copy the eigenvalues and eigenvectors to the output arrays
+        // Copy the eigenvalues to the output arrays
        const int inc = 1;
        BlasConnector::copy(GlobalV::NBANDS, eigen.data(), inc, eigenvalue_in, inc);
-
-        // Free allocated memory
-#ifdef __MPI
-        delete[] h_mat_g.p;
-        delete[] s_mat_g.p;
-        delete[] psi_g;
-#endif
    }
 
    // Explicit instantiation of the DiagoCusolver class for real and complex numbers
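
For reference, the multi-process control flow this patch introduces is: gather the block-cyclic H and S matrices to the root rank with Cpxgemr2d, run the dense cuSOLVER routine (Dngvd) only on that rank, broadcast the eigenvalues, and redistribute the eigenvectors via distributePsi; with a single process, the local matrices go to Dngvd directly. The following is only a minimal standalone sketch of that root-solves-then-broadcasts pattern, not code from the patch: it uses plain MPI instead of BLACS/ScaLAPACK, a placeholder loop instead of the cuSOLVER call, and made-up names (`nbands` stands in for GlobalV::NBANDS).

```cpp
// Sketch of the "allocate on root, solve on root, broadcast results" pattern.
// Assumptions: plain MPI replaces BLACS/ScaLAPACK and cuSOLVER; the solve is a stub.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int myid = 0;
    int num_procs = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);

    const int root_proc = 0;
    const int nbands = 4; // stand-in for GlobalV::NBANDS
    std::vector<double> eigen(nbands, 0.0);

    // Allocate the "global" eigenvector buffer only on the root process,
    // mirroring psi_len = myid == root_proc ? row * col : 1 in the patch.
    const int psi_len = (myid == root_proc) ? nbands * nbands : 1;
    std::vector<double> psi_g(psi_len);

    if (myid == root_proc)
    {
        // Placeholder for this->dc.Dngvd(...): the dense solve on the root rank.
        for (int i = 0; i < nbands; ++i)
        {
            eigen[i] = static_cast<double>(i);
        }
    }

    // Broadcast eigenvalues to every process, as the patch does with MPI_Bcast.
    MPI_Bcast(eigen.data(), nbands, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);

    // In the patch the eigenvectors are then redistributed with Cpxgemr2d
    // (distributePsi); here we simply report the broadcast result.
    std::printf("rank %d of %d: eigen[0] = %f\n", myid, num_procs, eigen[0]);

    MPI_Finalize();
    return 0;
}
```

Build with mpicxx and run under mpirun; the sketch only mirrors the allocation and broadcast logic of the patch, not the actual block-cyclic redistribution performed by Cpxgemr2d.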