Perf: Incorporate OpenMP in the force and rho computations on the GPU #4280

Merged · 2 commits · Jun 1, 2024
4 changes: 2 additions & 2 deletions source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
@@ -77,7 +77,6 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
     const int cuda_block
         = std::min(64, (gridt.psir_size + cuda_threads - 1) / cuda_threads);
     int iter_num = 0;
-    int pipeline_index = 0;
     DensityMat denstiy_mat;
     frc_strs_iat_gbl f_s_iat_dev;
     grid_para para;
@@ -98,6 +97,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
     }

     /*compute the psi*/
+    #pragma omp parallel for num_threads(gridt.nstreams) private(para,f_s_iat) collapse(2)
     for (int i = 0; i < gridt.nbx; i++)
     {
         for (int j = 0; j < gridt.nby; j++)
@@ -113,7 +113,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
             dim3 grid_dot(cuda_block);
             dim3 block_dot(cuda_threads);

-            pipeline_index = iter_num % gridt.nstreams;
+            int pipeline_index = omp_get_thread_num();
             para_init(para, iter_num, nbz, pipeline_index,gridt);
             cal_init(f_s_iat,
                      pipeline_index,
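Both files apply the same pattern: the serial round-robin stream pick (iter_num % gridt.nstreams) becomes one OpenMP thread per CUDA stream, so independent big-cell columns can be prepared and launched concurrently; in gint_force_gpu.cu the pragma additionally marks para and f_s_iat as private, giving each thread its own working copies. Below is a minimal self-contained sketch of this stream-per-thread pattern; the kernel, buffer layout, and names (scale_kernel, launch_over_grid, chunk) are illustrative assumptions, not code from this PR.

// Sketch: one OpenMP thread drives one CUDA stream (hypothetical names).
#include <omp.h>
#include <cuda_runtime.h>
#include <vector>

__global__ void scale_kernel(double* data, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) { data[idx] *= 2.0; }
}

void launch_over_grid(double* dev_buf, int nbx, int nby, int chunk,
                      const std::vector<cudaStream_t>& streams)
{
    const int nstreams = static_cast<int>(streams.size());
    // collapse(2) flattens the i/j loops into one parallel iteration space;
    // each thread keeps reusing the stream matching its thread id, which
    // replaces the manual iter_num % nstreams round-robin of the old code.
    #pragma omp parallel for num_threads(nstreams) collapse(2)
    for (int i = 0; i < nbx; i++)
    {
        for (int j = 0; j < nby; j++)
        {
            const int sid = omp_get_thread_num();
            double* slice = dev_buf + (static_cast<size_t>(i) * nby + j) * chunk;
            scale_kernel<<<(chunk + 255) / 256, 256, 0, streams[sid]>>>(slice, chunk);
        }
    }
    cudaDeviceSynchronize(); // wait for all streams before using the results
}

Pinning each thread to one stream keeps launches from different threads out of each other's way without any locking, since no stream is ever touched by two threads at once.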
14 changes: 6 additions & 8 deletions source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
@@ -4,6 +4,8 @@
 #include "module_hamilt_lcao/module_gint/gint_tools.h"
 #include "module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh"

+#include <omp.h>
+
 namespace GintKernel
 {

@@ -19,12 +21,11 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
     const int nbz = gridt.nbzp;
     const int lgd = gridt.lgd;
     const int max_size = gridt.max_atom;
-    double* dm_matrix_h = new double[lgd * lgd];
+    std::vector<double> dm_matrix_h(lgd * lgd, 0);

     checkCuda(cudaMemset(gridt.rho_g, 0, gridt.ncxyz * sizeof(double)));

     // retrieve the density matrix on the host
-    ModuleBase::GlobalFunc::ZEROS(dm_matrix_h, lgd * lgd);
     for (int iat1 = 0; iat1 < ucell.nat; iat1++)
     {
         for (int iat2 = 0; iat2 < ucell.nat; iat2++)
@@ -58,7 +59,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
     double* dm_matrix_g;
     checkCuda(cudaMalloc((void**)&dm_matrix_g, lgd * lgd * sizeof(double)));
     checkCuda(cudaMemcpy(dm_matrix_g,
-                         dm_matrix_h,
+                         dm_matrix_h.data(),
                          lgd * lgd * sizeof(double),
                          cudaMemcpyHostToDevice));

@@ -68,13 +69,13 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
     }

     // calculate the rho for every nbz bigcells
-    int iter_num = 0;
+    #pragma omp parallel for num_threads(gridt.nstreams) collapse(2)
     for (int i = 0; i < gridt.nbx; i++)
     {
         for (int j = 0; j < gridt.nby; j++)
         {
             // get stream id
-            int stream_num = iter_num % gridt.nstreams;
+            int stream_num = omp_get_thread_num();

             // psi_input contains data used to generate the psi values.
             // The suffix "_g" indicates that the data is stored in the GPU,
@@ -355,8 +356,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
                         incy,
                         dot_product_g,
                         dot_count);
-
-            iter_num++;
         }
     }

@@ -374,7 +373,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,

     // free the memory
     checkCuda(cudaFree(dm_matrix_g));
-    delete[] dm_matrix_h;
 }

 } // namespace GintKernel
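A secondary cleanup in gint_rho_gpu.cu replaces the manually managed host buffer with a std::vector: construction value-initializes the elements (making the ZEROS call redundant) and the destructor releases the storage (making delete[] unnecessary), while .data() hands cudaMemcpy the same contiguous pointer as before. A hedged sketch of the idiom follows; the function name and the way the buffer is filled are hypothetical, and error checking is elided for brevity.

// Sketch: RAII host staging buffer for a host-to-device copy (illustrative only).
#include <cuda_runtime.h>
#include <vector>

void stage_matrix(double* dm_matrix_g, int lgd)
{
    // value-initialized to 0.0, so no separate zeroing pass is needed
    std::vector<double> dm_matrix_h(static_cast<size_t>(lgd) * lgd, 0.0);

    // ... fill dm_matrix_h from the sparse density matrix ...

    // .data() exposes the contiguous storage that cudaMemcpy expects
    cudaMemcpy(dm_matrix_g, dm_matrix_h.data(),
               dm_matrix_h.size() * sizeof(double),
               cudaMemcpyHostToDevice);
}   // dm_matrix_h releases its storage here; no delete[] required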