Skip to content

Commit

Permalink
Refactor: Add heterogeneous parallel code for the stress loc module. (d…
Browse files Browse the repository at this point in the history
…eepmodeling#4854)

* init stress_loc gpu ops.

* fix a bug.

* add dcu support.

* remove GlobalC and use vectors.

* fix dcu bug.

* [pre-commit.ci lite] apply automatic fixes

* replace gx_arr[idx] to gx to keep consistent.

* fix a bug of gx.

* add comment,

* fix a bug.

---------

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
  • Loading branch information
grysgreat and pre-commit-ci-lite[bot] authored Aug 5, 2024
1 parent 0e2e2c1 commit d893fda
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 50 deletions.
56 changes: 56 additions & 0 deletions source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,49 @@ __global__ void cal_stress_drhoc_aux2(
drhocg [idx] = rhocg1;
}


template <typename FPTYPE>
__global__ void cal_stress_drhoc_aux3(
const FPTYPE* r, const FPTYPE* rhoc,
const FPTYPE *gx_arr, const FPTYPE *rab, FPTYPE *drhocg,
const int mesh, const int igl0, const int ngg, const double omega
){
const double FOUR_PI = 4.0 * 3.14159265358979323846;

int idx = threadIdx.x + blockIdx.x * blockDim.x;

if (idx >= ngg) {return;}

FPTYPE rhocg1=0.0;
FPTYPE gx = gx_arr[idx];
const FPTYPE pow_gx = gx * gx;

auto aux = [](FPTYPE r, FPTYPE rhoc, FPTYPE gx, FPTYPE rab) -> FPTYPE{
return rab * rhoc * (r * cos(gx * r)/gx - sin(gx * r)/(gx * gx));
};

FPTYPE f_0 = r[0] * r[0] * rhoc[0] * rab[0];
for( int ir = 1 ; ir< mesh - 2; ir+=2)
{
rhocg1 += 2 * aux(r[ir],rhoc[ir], gx, rab[ir]) + aux(r[ir+1],rhoc[ir+1], gx, rab[ir+1]);
}//ir
FPTYPE f_2 = aux(r[mesh - 2],rhoc[mesh - 2], gx, rab[mesh - 2]);
FPTYPE f_1 = aux(r[mesh - 1],rhoc[mesh - 1], gx, rab[mesh - 1]);

rhocg1 += f_2+f_2;
rhocg1 += rhocg1;
rhocg1 += f_0 + f_1;
rhocg1/=3.0;

// calculations after Simpson Integral
const double g2a = pow_gx / 4.0;
rhocg1 *= FOUR_PI / omega / 2.0 / gx;
rhocg1 += FOUR_PI / omega * gx_arr[ngg] * exp(-g2a) * (g2a + 1) / (pow_gx*pow_gx);
drhocg [idx] = rhocg1;
}



template <typename FPTYPE>
void cal_vkb_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
const base_device::DEVICE_GPU* ctx,
Expand Down Expand Up @@ -556,6 +599,15 @@ void cal_vq_deri_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
return ;
}


/**
* The implementation of this operator is detailed in stress_op.h.
* The function is called by the module as follows
* Type = 0 -> stress_cc
* Type = 1 -> stress_cc, force_cc
* Type = 2 -> force_scc
* Type = 3 -> stress_loc
*/
template <typename FPTYPE>
void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
const FPTYPE* r, const FPTYPE* rhoc,
Expand All @@ -578,6 +630,10 @@ void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
cal_stress_drhoc_aux2<FPTYPE><<<block,THREADS_PER_BLOCK>>>(
r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega
);
} else if(type == 3 ){
cal_stress_drhoc_aux3<FPTYPE><<<block,THREADS_PER_BLOCK>>>(
r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega
);
}
return ;
}
Expand Down
44 changes: 44 additions & 0 deletions source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,46 @@ __global__ void cal_stress_drhoc_aux2(
}


template <typename FPTYPE>
__global__ void cal_stress_drhoc_aux3(
const FPTYPE* r, const FPTYPE* rhoc,
const FPTYPE *gx_arr, const FPTYPE *rab, FPTYPE *drhocg,
const int mesh, const int igl0, const int ngg, const double omega
){
const double FOUR_PI = 4.0 * 3.14159265358979323846;

int idx = threadIdx.x + blockIdx.x * blockDim.x;

if (idx >= ngg) {return;}

FPTYPE rhocg1=0.0;
FPTYPE gx = gx_arr[idx];
const FPTYPE pow_gx = gx * gx;

auto aux = [](FPTYPE r, FPTYPE rhoc, FPTYPE gx, FPTYPE rab) -> FPTYPE{
return rab * rhoc * (r * cos(gx * r)/gx - sin(gx * r)/(gx * gx));
};

FPTYPE f_0 = r[0] * r[0] * rhoc[0] * rab[0];
for( int ir = 1 ; ir< mesh - 2; ir+=2)
{
rhocg1 += 2 * aux(r[ir],rhoc[ir], gx, rab[ir]) + aux(r[ir+1],rhoc[ir+1], gx, rab[ir+1]);
}//ir
FPTYPE f_2 = aux(r[mesh - 2],rhoc[mesh - 2], gx, rab[mesh - 2]);
FPTYPE f_1 = aux(r[mesh - 1],rhoc[mesh - 1], gx, rab[mesh - 1]);

rhocg1 += f_2+f_2;
rhocg1 += rhocg1;
rhocg1 += f_0 + f_1;
rhocg1/=3.0;

// calculations after Simpson Integral
const double g2a = pow_gx / 4.0;
rhocg1 *= FOUR_PI / omega / 2.0 / gx;
rhocg1 += FOUR_PI / omega * gx_arr[ngg] * exp(-g2a) * (g2a + 1) / (pow_gx*pow_gx);
drhocg [idx] = rhocg1;
}


template <typename FPTYPE>
void cal_vkb_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
Expand Down Expand Up @@ -581,6 +621,10 @@ void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_drhoc_aux2<FPTYPE>),block,THREADS_PER_BLOCK,0,0,
r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega
);
} else if(type == 3 ){
hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_drhoc_aux3<FPTYPE>),block,THREADS_PER_BLOCK,0,0,
r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega
);
}

return ;
Expand Down
21 changes: 16 additions & 5 deletions source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "vnl_tools.hpp"
#include <vector>
#include <iomanip>

#include "module_base/libm/libm.h"
namespace hamilt
{

Expand Down Expand Up @@ -374,14 +374,25 @@ struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_CPU> {
} else if(type == 1) {
aux [ir] = ir!=0 ? std::sin(gx_arr[igl] * r[ir]) / (gx_arr[igl] * r[ir]) : 1.0;
aux [ir] = r[ir] * r[ir] * rhoc [ir] * aux [ir];
} else {
} else if(type == 2) {
aux [ir] = r[ir] < 1.0e-8 ? rhoc [ir] : rhoc [ir] * sin(gx_arr[igl] * r[ir]) / (gx_arr[igl] * r[ir]);
} else if(type == 3) {
FPTYPE sinp, cosp;
sinp = std::sin(gx_arr[igl] * r[ir]);
cosp = std::cos(gx_arr[igl] * r[ir]);
aux[ir] = rhoc [ir] * (r [ir] * cosp / gx_arr[igl] - sinp / pow(gx_arr[igl],2));
}
}//ir
Simpson_Integral<FPTYPE>(mesh, aux.data(), rab, rhocg1);
if(type ==0 ) drhocg [igl] = FOUR_PI / omega * rhocg1;
else if(type == 1) drhocg [igl] = FOUR_PI * rhocg1 / omega;
else drhocg [igl] = rhocg1;
if(type ==0 ) { drhocg [igl] = FOUR_PI / omega * rhocg1;
} else if(type == 1) { drhocg [igl] = FOUR_PI * rhocg1 / omega;
} else if(type == 2) { drhocg [igl] = rhocg1;
} else if(type == 3) {
rhocg1 *= FOUR_PI / omega / 2.0 / gx_arr[igl];
FPTYPE g2a = (gx_arr[igl]*gx_arr[igl]) / 4.0;
rhocg1 += FOUR_PI / omega * gx_arr[ngg] * ModuleBase::libm::exp ( - g2a) * (g2a + 1) / pow(gx_arr[igl]*gx_arr[igl] , 2);
drhocg [igl] = rhocg1;
}
}
#ifdef _OPENMP
}
Expand Down
24 changes: 23 additions & 1 deletion source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,29 @@ struct cal_vq_deri_op<FPTYPE, base_device::DEVICE_GPU>
};



/**
* The operator is used to compute the auxiliary amount of stress /force
* in parallel on the GPU. They identify type with the type provided and
* select different calculation methods,
*
* The function is called by the module as follows
* Type = 0 -> stress_cc
* Type = 1 -> stress_cc, force_cc
* Type = 2 -> force_scc
* Type = 3 -> stress_loc
*
* Int the function aux is obtained by traversing the `ngg` and `mesh` firstly,
* and then aux is processed by Simpson integral method to obtain auxiliary
* quantities drhocg.
*
* In the GPU operator, temporary array space of mesh size is required in order
* not to apply Simpson interpolation (which causes GPU memory overflow).
* The Simpson integral is then reconstructed in the loop body of the mesh,
* using the Simpson integral computed in the loop, rather than executed once
* after the loop. After that, in order to reduce the if condition judgment brought
* by Simpson interpolation in the loop body, lambda expression is used to shift the
* boundary condition out.
*/
template <typename FPTYPE>
struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>{
void operator()(
Expand Down
3 changes: 2 additions & 1 deletion source/module_hamilt_pw/hamilt_pwdft/stress_func.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ class Stress_Func
const FPTYPE* vloc_at,
const FPTYPE& zp,
FPTYPE* dvloc,
ModulePW::PW_Basis* rho_basis); // used in local pseudopotential stress
ModulePW::PW_Basis* rho_basis,
const UnitCell& ucell_in); // used in local pseudopotential stress

/**
* @brief compute the derivative of the coulomb potential in reciprocal space
Expand Down
96 changes: 53 additions & 43 deletions source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
// normal case: dvloc contains dV_loc(G)/dG
//
this->dvloc_of_g ( atom->ncpp.msh, atom->ncpp.rab.data(), atom->ncpp.r.data(),
atom->ncpp.vloc_at.data(), atom->ncpp.zv, dvloc.data(), rho_basis);
atom->ncpp.vloc_at.data(), atom->ncpp.zv, dvloc.data(), rho_basis, GlobalC::ucell);
//
}
#ifndef _OPENMP
Expand Down Expand Up @@ -163,6 +163,7 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
return;
}


template<typename FPTYPE, typename Device>
void Stress_Func<FPTYPE, Device>::dvloc_of_g
(
Expand All @@ -172,7 +173,8 @@ const FPTYPE* r,
const FPTYPE* vloc_at,
const FPTYPE& zp,
FPTYPE* dvloc,
ModulePW::PW_Basis* rho_basis
ModulePW::PW_Basis* rho_basis,
const UnitCell& ucell_in
)
{
//----------------------------------------------------------------------
Expand All @@ -187,11 +189,15 @@ ModulePW::PW_Basis* rho_basis


int igl0;
this->device = base_device::get_device_type<Device>(this->ctx);

std::vector<double> gx_arr(rho_basis->ngg+1);
double* gx_arr_d = nullptr;
// counter on erf functions or gaussians
// counter on g shells vectors
// first shell with g != 0

std::vector<FPTYPE> aux1(msh);
std::vector<FPTYPE> aux(msh);

// the G=0 component is not computed
if (rho_basis->gg_uniq[0] < 1.0e-8)
Expand All @@ -207,57 +213,61 @@ ModulePW::PW_Basis* rho_basis
// Pseudopotentials in numerical form (Vloc contains the local part)
// In order to perform the Fourier transform, a term erf(r)/r is
// subtracted in real space and added again in G space

#ifdef _OPENMP
#pragma omp parallel
{
#pragma omp for
#pragma omp parallel for
#endif
for (int igl = igl0; igl < rho_basis->ngg; igl++) {
gx_arr[igl] = sqrt(rho_basis->gg_uniq[igl]) * ucell_in.tpiba;
}

gx_arr[rho_basis->ngg] = zp * ModuleBase::e2;

//
// This is the part of the integrand function
// indipendent of |G| in real space
//
for(int i = 0;i< msh; i++)
{
aux1[i] = r [i] * vloc_at [i] + zp * ModuleBase::e2 * erf(r[i]);
aux[i] = r [i] * vloc_at [i] + zp * ModuleBase::e2 * erf(r[i]);
}

std::vector<FPTYPE> aux(msh);
aux[0] = 0.0;
#ifdef _OPENMP
#pragma omp for
#endif
for(int igl = igl0;igl< rho_basis->ngg;igl++)
{
const FPTYPE g2 = rho_basis->gg_uniq[igl];
const FPTYPE gx = sqrt (g2 * GlobalC::ucell.tpiba2);
const FPTYPE gx2 = g2 * GlobalC::ucell.tpiba2;
//
// and here we perform the integral, after multiplying for the |G|
// dependent part
//
// DV(g)/Dg = ModuleBase::Integral of r (Dj_0(gr)/Dg) V(r) dr
for(int i = 1;i< msh;i++)
{
FPTYPE sinp, cosp;
ModuleBase::libm::sincos(gx * r [i], &sinp, &cosp);
aux [i] = aux1 [i] * (r [i] * cosp / gx - sinp / pow(gx,2));
}
FPTYPE vlcp=0;
// simpson (msh, aux, rab, vlcp);
ModuleBase::Integral::Simpson_Integral(msh, aux.data(), rab, vlcp );
// DV(g^2)/Dg^2 = (DV(g)/Dg)/2g
vlcp *= ModuleBase::FOUR_PI / GlobalC::ucell.omega / 2.0 / gx;
// subtract the long-range term
FPTYPE g2a = gx2 / 4.0;
vlcp += ModuleBase::FOUR_PI / GlobalC::ucell.omega * zp * ModuleBase::e2 * ModuleBase::libm::exp ( - g2a) * (g2a + 1) / pow(gx2 , 2);
dvloc [igl] = vlcp;
}

#ifdef _OPENMP
}
#endif


double *r_d = nullptr;
double *rhoc_d = nullptr;
double *rab_d = nullptr;
double *aux_d = nullptr;
double *drhocg_d = nullptr;
if (this->device == base_device::GpuDevice) {
resmem_var_op()(this->ctx, r_d, msh);
resmem_var_op()(this->ctx, rhoc_d, msh);
resmem_var_op()(this->ctx, rab_d, msh);

resmem_var_op()(this->ctx, aux_d, msh);
resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg+1);
resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg);

syncmem_var_h2d_op()(this->ctx,
this->cpu_ctx,
gx_arr_d,
gx_arr.data(),
rho_basis->ngg+1);
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, msh);
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, msh);
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, aux.data(), msh);
}



if(this->device == base_device::GpuDevice) {
hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,msh,igl0,rho_basis->ngg-igl0,ucell_in.omega,3);
syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, dvloc+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);

} else {
hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
r,aux.data(),gx_arr.data()+igl0,rab,dvloc+igl0,msh,igl0,rho_basis->ngg-igl0,ucell_in.omega,3);
}

return;
}
Expand Down

0 comments on commit d893fda

Please sign in to comment.