From d893fda0827d02beceb597976a452ecc12e24edc Mon Sep 17 00:00:00 2001 From: GRYS <57103981+grysgreat@users.noreply.github.com> Date: Mon, 5 Aug 2024 16:50:30 +0800 Subject: [PATCH] Refactor: Add heterogeneous parallel code for the stress loc module. (#4854) * init stress_loc gpu ops. * fix a bug. * add dcu support. * remove GlobalC and use vectors. * fix dcu bug. * [pre-commit.ci lite] apply automatic fixes * replace gx_arr[idx] to gx to keep consistent. * fix a bug of gx. * add comment, * fix a bug. --------- Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> --- .../hamilt_pwdft/kernels/cuda/stress_op.cu | 56 +++++++++++ .../kernels/rocm/stress_op.hip.cu | 44 +++++++++ .../hamilt_pwdft/kernels/stress_op.cpp | 21 +++- .../hamilt_pwdft/kernels/stress_op.h | 24 ++++- .../hamilt_pwdft/stress_func.h | 3 +- .../hamilt_pwdft/stress_func_loc.cpp | 96 ++++++++++--------- 6 files changed, 194 insertions(+), 50 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index 7167486fdf..769d6351e9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -461,6 +461,49 @@ __global__ void cal_stress_drhoc_aux2( drhocg [idx] = rhocg1; } + +template +__global__ void cal_stress_drhoc_aux3( + const FPTYPE* r, const FPTYPE* rhoc, + const FPTYPE *gx_arr, const FPTYPE *rab, FPTYPE *drhocg, + const int mesh, const int igl0, const int ngg, const double omega +){ + const double FOUR_PI = 4.0 * 3.14159265358979323846; + + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx >= ngg) {return;} + + FPTYPE rhocg1=0.0; + FPTYPE gx = gx_arr[idx]; + const FPTYPE pow_gx = gx * gx; + + auto aux = [](FPTYPE r, FPTYPE rhoc, FPTYPE gx, FPTYPE rab) -> FPTYPE{ + return rab * rhoc * (r * cos(gx * r)/gx - sin(gx * r)/(gx * gx)); + }; + + FPTYPE f_0 = r[0] * r[0] * rhoc[0] * rab[0]; + for( int ir = 1 ; ir< mesh - 2; ir+=2) + { + rhocg1 += 2 * aux(r[ir],rhoc[ir], gx, rab[ir]) + aux(r[ir+1],rhoc[ir+1], gx, rab[ir+1]); + }//ir + FPTYPE f_2 = aux(r[mesh - 2],rhoc[mesh - 2], gx, rab[mesh - 2]); + FPTYPE f_1 = aux(r[mesh - 1],rhoc[mesh - 1], gx, rab[mesh - 1]); + + rhocg1 += f_2+f_2; + rhocg1 += rhocg1; + rhocg1 += f_0 + f_1; + rhocg1/=3.0; + + // calculations after Simpson Integral + const double g2a = pow_gx / 4.0; + rhocg1 *= FOUR_PI / omega / 2.0 / gx; + rhocg1 += FOUR_PI / omega * gx_arr[ngg] * exp(-g2a) * (g2a + 1) / (pow_gx*pow_gx); + drhocg [idx] = rhocg1; +} + + + template void cal_vkb_op::operator()( const base_device::DEVICE_GPU* ctx, @@ -556,6 +599,15 @@ void cal_vq_deri_op::operator()( return ; } + +/** + * The implementation of this operator is detailed in stress_op.h. + * The function is called by the module as follows + * Type = 0 -> stress_cc + * Type = 1 -> stress_cc, force_cc + * Type = 2 -> force_scc + * Type = 3 -> stress_loc + */ template void cal_stress_drhoc_aux_op::operator()( const FPTYPE* r, const FPTYPE* rhoc, @@ -578,6 +630,10 @@ void cal_stress_drhoc_aux_op::operator()( cal_stress_drhoc_aux2<<>>( r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega ); + } else if(type == 3 ){ + cal_stress_drhoc_aux3<<>>( + r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega + ); } return ; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index e2d3938c05..d7db3b3eea 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -461,6 +461,46 @@ __global__ void cal_stress_drhoc_aux2( } +template +__global__ void cal_stress_drhoc_aux3( + const FPTYPE* r, const FPTYPE* rhoc, + const FPTYPE *gx_arr, const FPTYPE *rab, FPTYPE *drhocg, + const int mesh, const int igl0, const int ngg, const double omega +){ + const double FOUR_PI = 4.0 * 3.14159265358979323846; + + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx >= ngg) {return;} + + FPTYPE rhocg1=0.0; + FPTYPE gx = gx_arr[idx]; + const FPTYPE pow_gx = gx * gx; + + auto aux = [](FPTYPE r, FPTYPE rhoc, FPTYPE gx, FPTYPE rab) -> FPTYPE{ + return rab * rhoc * (r * cos(gx * r)/gx - sin(gx * r)/(gx * gx)); + }; + + FPTYPE f_0 = r[0] * r[0] * rhoc[0] * rab[0]; + for( int ir = 1 ; ir< mesh - 2; ir+=2) + { + rhocg1 += 2 * aux(r[ir],rhoc[ir], gx, rab[ir]) + aux(r[ir+1],rhoc[ir+1], gx, rab[ir+1]); + }//ir + FPTYPE f_2 = aux(r[mesh - 2],rhoc[mesh - 2], gx, rab[mesh - 2]); + FPTYPE f_1 = aux(r[mesh - 1],rhoc[mesh - 1], gx, rab[mesh - 1]); + + rhocg1 += f_2+f_2; + rhocg1 += rhocg1; + rhocg1 += f_0 + f_1; + rhocg1/=3.0; + + // calculations after Simpson Integral + const double g2a = pow_gx / 4.0; + rhocg1 *= FOUR_PI / omega / 2.0 / gx; + rhocg1 += FOUR_PI / omega * gx_arr[ngg] * exp(-g2a) * (g2a + 1) / (pow_gx*pow_gx); + drhocg [idx] = rhocg1; +} + template void cal_vkb_op::operator()( @@ -581,6 +621,10 @@ void cal_stress_drhoc_aux_op::operator()( hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_drhoc_aux2),block,THREADS_PER_BLOCK,0,0, r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega ); + } else if(type == 3 ){ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_drhoc_aux3),block,THREADS_PER_BLOCK,0,0, + r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega + ); } return ; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index f70f21dafa..2f551db297 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -6,7 +6,7 @@ #include "vnl_tools.hpp" #include #include - +#include "module_base/libm/libm.h" namespace hamilt { @@ -374,14 +374,25 @@ struct cal_stress_drhoc_aux_op { } else if(type == 1) { aux [ir] = ir!=0 ? std::sin(gx_arr[igl] * r[ir]) / (gx_arr[igl] * r[ir]) : 1.0; aux [ir] = r[ir] * r[ir] * rhoc [ir] * aux [ir]; - } else { + } else if(type == 2) { aux [ir] = r[ir] < 1.0e-8 ? rhoc [ir] : rhoc [ir] * sin(gx_arr[igl] * r[ir]) / (gx_arr[igl] * r[ir]); + } else if(type == 3) { + FPTYPE sinp, cosp; + sinp = std::sin(gx_arr[igl] * r[ir]); + cosp = std::cos(gx_arr[igl] * r[ir]); + aux[ir] = rhoc [ir] * (r [ir] * cosp / gx_arr[igl] - sinp / pow(gx_arr[igl],2)); } }//ir Simpson_Integral(mesh, aux.data(), rab, rhocg1); - if(type ==0 ) drhocg [igl] = FOUR_PI / omega * rhocg1; - else if(type == 1) drhocg [igl] = FOUR_PI * rhocg1 / omega; - else drhocg [igl] = rhocg1; + if(type ==0 ) { drhocg [igl] = FOUR_PI / omega * rhocg1; + } else if(type == 1) { drhocg [igl] = FOUR_PI * rhocg1 / omega; + } else if(type == 2) { drhocg [igl] = rhocg1; + } else if(type == 3) { + rhocg1 *= FOUR_PI / omega / 2.0 / gx_arr[igl]; + FPTYPE g2a = (gx_arr[igl]*gx_arr[igl]) / 4.0; + rhocg1 += FOUR_PI / omega * gx_arr[ngg] * ModuleBase::libm::exp ( - g2a) * (g2a + 1) / pow(gx_arr[igl]*gx_arr[igl] , 2); + drhocg [igl] = rhocg1; + } } #ifdef _OPENMP } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index 8c67369b31..b4dfc35ff8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -306,7 +306,29 @@ struct cal_vq_deri_op }; - +/** + * The operator is used to compute the auxiliary amount of stress /force + * in parallel on the GPU. They identify type with the type provided and + * select different calculation methods, + * + * The function is called by the module as follows + * Type = 0 -> stress_cc + * Type = 1 -> stress_cc, force_cc + * Type = 2 -> force_scc + * Type = 3 -> stress_loc + * + * Int the function aux is obtained by traversing the `ngg` and `mesh` firstly, + * and then aux is processed by Simpson integral method to obtain auxiliary + * quantities drhocg. + * + * In the GPU operator, temporary array space of mesh size is required in order + * not to apply Simpson interpolation (which causes GPU memory overflow). + * The Simpson integral is then reconstructed in the loop body of the mesh, + * using the Simpson integral computed in the loop, rather than executed once + * after the loop. After that, in order to reduce the if condition judgment brought + * by Simpson interpolation in the loop body, lambda expression is used to shift the + * boundary condition out. + */ template struct cal_stress_drhoc_aux_op{ void operator()( diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h index f455215f63..73cf31bf2d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h @@ -93,7 +93,8 @@ class Stress_Func const FPTYPE* vloc_at, const FPTYPE& zp, FPTYPE* dvloc, - ModulePW::PW_Basis* rho_basis); // used in local pseudopotential stress + ModulePW::PW_Basis* rho_basis, + const UnitCell& ucell_in); // used in local pseudopotential stress /** * @brief compute the derivative of the coulomb potential in reciprocal space diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp index 2b340e2dc1..be64b4e573 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp @@ -96,7 +96,7 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, // normal case: dvloc contains dV_loc(G)/dG // this->dvloc_of_g ( atom->ncpp.msh, atom->ncpp.rab.data(), atom->ncpp.r.data(), - atom->ncpp.vloc_at.data(), atom->ncpp.zv, dvloc.data(), rho_basis); + atom->ncpp.vloc_at.data(), atom->ncpp.zv, dvloc.data(), rho_basis, GlobalC::ucell); // } #ifndef _OPENMP @@ -163,6 +163,7 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, return; } + template void Stress_Func::dvloc_of_g ( @@ -172,7 +173,8 @@ const FPTYPE* r, const FPTYPE* vloc_at, const FPTYPE& zp, FPTYPE* dvloc, -ModulePW::PW_Basis* rho_basis +ModulePW::PW_Basis* rho_basis, +const UnitCell& ucell_in ) { //---------------------------------------------------------------------- @@ -187,11 +189,15 @@ ModulePW::PW_Basis* rho_basis int igl0; + this->device = base_device::get_device_type(this->ctx); + + std::vector gx_arr(rho_basis->ngg+1); + double* gx_arr_d = nullptr; // counter on erf functions or gaussians // counter on g shells vectors // first shell with g != 0 - std::vector aux1(msh); + std::vector aux(msh); // the G=0 component is not computed if (rho_basis->gg_uniq[0] < 1.0e-8) @@ -207,57 +213,61 @@ ModulePW::PW_Basis* rho_basis // Pseudopotentials in numerical form (Vloc contains the local part) // In order to perform the Fourier transform, a term erf(r)/r is // subtracted in real space and added again in G space - #ifdef _OPENMP -#pragma omp parallel -{ - #pragma omp for +#pragma omp parallel for #endif + for (int igl = igl0; igl < rho_basis->ngg; igl++) { + gx_arr[igl] = sqrt(rho_basis->gg_uniq[igl]) * ucell_in.tpiba; + } + + gx_arr[rho_basis->ngg] = zp * ModuleBase::e2; + // // This is the part of the integrand function // indipendent of |G| in real space // for(int i = 0;i< msh; i++) { - aux1[i] = r [i] * vloc_at [i] + zp * ModuleBase::e2 * erf(r[i]); + aux[i] = r [i] * vloc_at [i] + zp * ModuleBase::e2 * erf(r[i]); } - std::vector aux(msh); - aux[0] = 0.0; -#ifdef _OPENMP - #pragma omp for -#endif - for(int igl = igl0;igl< rho_basis->ngg;igl++) - { - const FPTYPE g2 = rho_basis->gg_uniq[igl]; - const FPTYPE gx = sqrt (g2 * GlobalC::ucell.tpiba2); - const FPTYPE gx2 = g2 * GlobalC::ucell.tpiba2; - // - // and here we perform the integral, after multiplying for the |G| - // dependent part - // - // DV(g)/Dg = ModuleBase::Integral of r (Dj_0(gr)/Dg) V(r) dr - for(int i = 1;i< msh;i++) - { - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(gx * r [i], &sinp, &cosp); - aux [i] = aux1 [i] * (r [i] * cosp / gx - sinp / pow(gx,2)); - } - FPTYPE vlcp=0; - // simpson (msh, aux, rab, vlcp); - ModuleBase::Integral::Simpson_Integral(msh, aux.data(), rab, vlcp ); - // DV(g^2)/Dg^2 = (DV(g)/Dg)/2g - vlcp *= ModuleBase::FOUR_PI / GlobalC::ucell.omega / 2.0 / gx; - // subtract the long-range term - FPTYPE g2a = gx2 / 4.0; - vlcp += ModuleBase::FOUR_PI / GlobalC::ucell.omega * zp * ModuleBase::e2 * ModuleBase::libm::exp ( - g2a) * (g2a + 1) / pow(gx2 , 2); - dvloc [igl] = vlcp; - } -#ifdef _OPENMP -} -#endif - + + double *r_d = nullptr; + double *rhoc_d = nullptr; + double *rab_d = nullptr; + double *aux_d = nullptr; + double *drhocg_d = nullptr; + if (this->device == base_device::GpuDevice) { + resmem_var_op()(this->ctx, r_d, msh); + resmem_var_op()(this->ctx, rhoc_d, msh); + resmem_var_op()(this->ctx, rab_d, msh); + + resmem_var_op()(this->ctx, aux_d, msh); + resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg+1); + resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg); + + syncmem_var_h2d_op()(this->ctx, + this->cpu_ctx, + gx_arr_d, + gx_arr.data(), + rho_basis->ngg+1); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, msh); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, msh); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, aux.data(), msh); + } + + + + if(this->device == base_device::GpuDevice) { + hamilt::cal_stress_drhoc_aux_op()( + r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,msh,igl0,rho_basis->ngg-igl0,ucell_in.omega,3); + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, dvloc+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); + + } else { + hamilt::cal_stress_drhoc_aux_op()( + r,aux.data(),gx_arr.data()+igl0,rab,dvloc+igl0,msh,igl0,rho_basis->ngg-igl0,ucell_in.omega,3); + } return; }