Perf: optimize single-process performance of cusolver (#4191)
* optimize single-process performance of cusolver (see the control-flow sketch after the diff)

* standardize variable initialization

* replace raw pointers with std::vector and std::shared_ptr (see the ownership sketch before the diff)

* add curly braces

---------

Co-authored-by: Mohan Chen <[email protected]>
dzzz2001 and mohanchen authored May 27, 2024
1 parent f7ac628 commit 70229a0
Showing 1 changed file with 56 additions and 39 deletions.
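The commit-message bullet "replace raw pointers with std::vector and std::shared_ptr" refers to the new Matrix_g struct in the diff below, whose gathered buffer and ScaLAPACK descriptor are owned by std::shared_ptr instead of raw new[]/delete[] pairs. The following standalone sketch only illustrates that ownership pattern; MatrixGSketch and allocate_gathered are made-up names, and an explicit array deleter is supplied here so that delete[] matches the new[] allocation.

#include <cstddef>
#include <memory>

// Simplified stand-in for the Matrix_g struct added in this commit.
template <typename T>
struct MatrixGSketch
{
    std::shared_ptr<T> p;      // gathered matrix elements
    std::size_t row = 0;
    std::size_t col = 0;
    std::shared_ptr<int> desc; // 9-element ScaLAPACK descriptor
};

// Allocate the gathered buffers; the array deleter ensures delete[] is used.
template <typename T>
void allocate_gathered(MatrixGSketch<T>& g, std::size_t nrows, std::size_t ncols)
{
    g.p = std::shared_ptr<T>(new T[nrows * ncols](), std::default_delete<T[]>());
    g.desc = std::shared_ptr<int>(new int[9](), std::default_delete<int[]>());
    g.row = nrows;
    g.col = ncols;
}

int main()
{
    MatrixGSketch<double> h_mat_g;
    allocate_gathered(h_mat_g, 4, 4);
    // No explicit cleanup: both buffers are released automatically when
    // h_mat_g goes out of scope, which is why the delete[] block at the
    // end of diag() disappears in the diff.
    return 0;
}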
95 changes: 56 additions & 39 deletions source/module_hsolver/diago_cusolver.cpp
@@ -5,12 +5,23 @@
 #include "module_base/scalapack_connector.h"
 
 #include <type_traits>
+#include <vector>
+#include <memory>
 
 using complex = std::complex<double>;
 
 // Namespace for the diagonalization solver
 namespace hsolver
 {
+// this struct is used for collecting matrices from all processes to root process
+template <typename T> struct Matrix_g
+{
+    std::shared_ptr<T> p;
+    size_t row;
+    size_t col;
+    std::shared_ptr<int> desc;
+};
+
 // Initialize the DecomposedState variable for real and complex numbers
 template <typename T>
 int DiagoCusolver<T>::DecomposedState = 0;
@@ -52,33 +63,34 @@ namespace hsolver
 }
 
 // Use Cpxgemr2d to collect matrices from all processes to root process
-template <typename mat>
+template <typename mat, typename matg>
 static void gatherMatrix(const int myid,
                          const int root_proc,
                          const mat& mat_l,
-                         mat& mat_g)
+                         matg& mat_g)
 {
     auto a = mat_l.p;
-    decltype(a) b;
     const int* desca = mat_l.desc;
     int ctxt = desca[1];
     int nrows = desca[2];
     int ncols = desca[3];
 
     if (myid == root_proc)
-        b = new typename std::remove_reference<decltype(*a)>::type[nrows * ncols];
+    {
+        mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[nrows * ncols]);
+    }
     else
-        b = new typename std::remove_reference<decltype(*a)>::type[1];
+    {
+        mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[1]);
+    }
 
     // Set descb, which has all elements in the only block in the root process
-    int descb[9] = {1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows};
+    mat_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows});
 
-    mat_g.desc = descb;
     mat_g.row = nrows;
     mat_g.col = ncols;
-    mat_g.p = b;
 
-    Cpxgemr2d(nrows, ncols, a, 1, 1, desca, b, 1, 1, descb, ctxt);
+    Cpxgemr2d(nrows, ncols, a, 1, 1, desca, mat_g.p.get(), 1, 1, mat_g.desc.get(), ctxt);
 }
 
 // Convert the Psi to a 2D block storage format
@@ -109,23 +121,25 @@ namespace hsolver
     ModuleBase::TITLE("DiagoCusolver", "diag");
 
     // Create matrices for the Hamiltonian and overlap
-    hamilt::MatrixBlock<T> h_mat, s_mat;
+    hamilt::MatrixBlock<T> h_mat;
+    hamilt::MatrixBlock<T> s_mat;
     phm_in->matrix(h_mat, s_mat);
 
 #ifdef __MPI
     // global matrix
-    hamilt::MatrixBlock<T> h_mat_g, s_mat_g;
-
-    // global psi for distribute
-    T* psi_g;
+    Matrix_g<T> h_mat_g;
+    Matrix_g<T> s_mat_g;
 
     // get the context and process information
     int ctxt = ParaV->blacs_ctxt;
-    int nprows, npcols, myprow, mypcol;
+    int nprows = 0;
+    int npcols = 0;
+    int myprow = 0;
+    int mypcol = 0;
     Cblacs_gridinfo(ctxt, &nprows, &npcols, &myprow, &mypcol);
-    int myid = Cblacs_pnum(ctxt, myprow, mypcol);
+    const int num_procs = nprows * npcols;
+    const int myid = Cblacs_pnum(ctxt, myprow, mypcol);
     const int root_proc = Cblacs_pnum(ctxt, ParaV->desc[6], ParaV->desc[7]);
-
 #endif
 
     // Allocate memory for eigenvalues
@@ -135,46 +149,49 @@
     ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 
 #ifdef __MPI
-    // gather matrices from processes to root process
-    gatherMatrix(myid, root_proc, h_mat, h_mat_g);
-    gatherMatrix(myid, root_proc, s_mat, s_mat_g);
+    if(num_procs > 1)
+    {
+        // gather matrices from processes to root process
+        gatherMatrix(myid, root_proc, h_mat, h_mat_g);
+        gatherMatrix(myid, root_proc, s_mat, s_mat_g);
+    }
#endif
 
     // Call the dense diagonalization routine
 #ifdef __MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-    if (myid == root_proc)
+    if(num_procs > 1)
     {
-        psi_g = new T[h_mat_g.row * h_mat_g.col];
-        this->dc.Dngvd(h_mat_g.col, h_mat_g.row, h_mat_g.p, s_mat_g.p, eigen.data(), psi_g);
+        MPI_Barrier(MPI_COMM_WORLD);
+        // global psi for distribute
+        int psi_len = myid == root_proc ? h_mat_g.row * h_mat_g.col : 1;
+        std::vector<T> psi_g(psi_len);
+        if (myid == root_proc)
+        {
+            this->dc.Dngvd(h_mat_g.col, h_mat_g.row, h_mat_g.p.get(), s_mat_g.p.get(), eigen.data(), psi_g.data());
+        }
+
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        // broadcast eigenvalues to all processes
+        MPI_Bcast(eigen.data(), GlobalV::NBANDS, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
+
+        // distribute psi to all processes
+        distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data());
     }
     else
     {
-        psi_g = new T[1];
         this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), psi.get_pointer());
     }
-    MPI_Barrier(MPI_COMM_WORLD);
-    // broadcast eigenvalues to all processes
-    MPI_Bcast(eigen.data(), GlobalV::NBANDS, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
-
-    // distribute psi to all processes
-    distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g);
 #else
+    // Call the dense diagonalization routine
     this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), psi.get_pointer());
 #endif
     // Stop the timer for the cusolver operation
     ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 
-    // Copy the eigenvalues and eigenvectors to the output arrays
+    // Copy the eigenvalues to the output arrays
     const int inc = 1;
     BlasConnector::copy(GlobalV::NBANDS, eigen.data(), inc, eigenvalue_in, inc);
-
-    // Free allocated memory
-#ifdef __MPI
-    delete[] h_mat_g.p;
-    delete[] s_mat_g.p;
-    delete[] psi_g;
-#endif
 }
 
 // Explicit instantiation of the DiagoCusolver class for real and complex numbers
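The commit-message bullet "optimize single-process performance of cusolver" corresponds to the num_procs guard added in diag() above: when the BLACS grid holds a single process, the local MatrixBlock already contains the full matrix, so the Cpxgemr2d gather and the eigenvector redistribution are skipped and Dngvd runs on the local data directly. A minimal control-flow sketch of that branch, with illustrative stand-in names rather than the ABACUS API:

#include <cstdio>

// Stand-ins for gatherMatrix / Dngvd / distributePsi, only to show the branch.
struct SolverSketch
{
    void gather()       { std::printf("gather local blocks to the root process\n"); }
    void solve_global() { std::printf("diagonalize the gathered matrix on the root\n"); }
    void distribute()   { std::printf("scatter eigenvectors back to the 2D block layout\n"); }
    void solve_local()  { std::printf("diagonalize the local matrix directly\n"); }
};

// Mirrors the if(num_procs > 1) guard introduced in diag().
void diag_sketch(int num_procs, SolverSketch& s)
{
    if (num_procs > 1)
    {
        s.gather();       // Cpxgemr2d-style collective copy to the root
        s.solve_global(); // root solves the dense generalized eigenproblem
        s.distribute();   // eigenvectors scattered back to all processes
    }
    else
    {
        s.solve_local();  // single process: no gather/scatter overhead
    }
}

int main()
{
    SolverSketch s;
    diag_sketch(1, s); // the single-process path this commit speeds up
    return 0;
}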
