Refactor: Add heterogeneous parallel code for the stress loc module. (d…

…eepmodeling#4854) * init stress_loc gpu ops. * fix a bug. * add dcu support. * remove GlobalC and use vectors. * fix dcu bug. * [pre-commit.ci lite] apply automatic fixes * replace gx_arr[idx] to gx to keep consistent. * fix a bug of gx. * add comment, * fix a bug. --------- Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Zhuxuegang2022 · Aug 5, 2024 · d893fda · d893fda
1 parent 0e2e2c1
commit d893fda
Show file tree

Hide file tree

Showing 6 changed files with 194 additions and 50 deletions.
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -461,6 +461,49 @@ __global__ void cal_stress_drhoc_aux2(
     drhocg [idx] = rhocg1;
 }
 
+
+template <typename FPTYPE>
+__global__ void cal_stress_drhoc_aux3(
+        const FPTYPE* r, const FPTYPE* rhoc, 
+        const FPTYPE *gx_arr, const FPTYPE *rab, FPTYPE *drhocg, 
+        const int mesh, const int igl0, const int ngg, const double omega
+){
+    const double FOUR_PI =  4.0 * 3.14159265358979323846;
+
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (idx >= ngg) {return;}
+
+    FPTYPE rhocg1=0.0;
+    FPTYPE gx = gx_arr[idx];    
+    const FPTYPE pow_gx = gx * gx;
+
+    auto aux = [](FPTYPE r, FPTYPE rhoc, FPTYPE gx, FPTYPE rab) -> FPTYPE{
+        return rab * rhoc * (r * cos(gx * r)/gx - sin(gx * r)/(gx * gx));
+    };
+
+    FPTYPE f_0 = r[0] * r[0] * rhoc[0] * rab[0];
+    for( int ir = 1 ; ir< mesh - 2; ir+=2)
+    {
+        rhocg1 += 2 * aux(r[ir],rhoc[ir], gx, rab[ir]) + aux(r[ir+1],rhoc[ir+1], gx, rab[ir+1]);
+    }//ir
+    FPTYPE f_2 = aux(r[mesh - 2],rhoc[mesh - 2], gx, rab[mesh - 2]);
+    FPTYPE f_1 = aux(r[mesh - 1],rhoc[mesh - 1], gx, rab[mesh - 1]);
+
+    rhocg1 += f_2+f_2;
+    rhocg1 += rhocg1;
+    rhocg1 += f_0 + f_1;
+    rhocg1/=3.0;
+
+    // calculations after Simpson Integral
+    const double g2a = pow_gx / 4.0;
+    rhocg1 *= FOUR_PI / omega / 2.0 / gx;
+    rhocg1 += FOUR_PI / omega * gx_arr[ngg] * exp(-g2a) * (g2a + 1) / (pow_gx*pow_gx);
+    drhocg [idx] = rhocg1;
+}
+
+
+
 template <typename FPTYPE>
 void cal_vkb_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
         const base_device::DEVICE_GPU* ctx,
@@ -556,6 +599,15 @@ void cal_vq_deri_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     return ;
 }
 
+
+/**
+ * The implementation of this operator is detailed in stress_op.h.
+ * The function is called by the module as follows
+ *      Type = 0 -> stress_cc
+ *      Type = 1 -> stress_cc, force_cc
+ *      Type = 2 -> force_scc
+ *      Type = 3 -> stress_loc
+ */
 template <typename FPTYPE>
 void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
         const FPTYPE* r, const FPTYPE* rhoc,  
@@ -578,6 +630,10 @@ void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
         cal_stress_drhoc_aux2<FPTYPE><<<block,THREADS_PER_BLOCK>>>(
             r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega
         );        
+    } else if(type == 3 ){
+        cal_stress_drhoc_aux3<FPTYPE><<<block,THREADS_PER_BLOCK>>>(
+            r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega
+        );        
     }
     return ;
 }

diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -461,6 +461,46 @@ __global__ void cal_stress_drhoc_aux2(
 }
 
 
+template <typename FPTYPE>
+__global__ void cal_stress_drhoc_aux3(
+        const FPTYPE* r, const FPTYPE* rhoc, 
+        const FPTYPE *gx_arr, const FPTYPE *rab, FPTYPE *drhocg, 
+        const int mesh, const int igl0, const int ngg, const double omega
+){
+    const double FOUR_PI =  4.0 * 3.14159265358979323846;
+
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (idx >= ngg) {return;}
+
+    FPTYPE rhocg1=0.0;
+    FPTYPE gx = gx_arr[idx];    
+    const FPTYPE pow_gx = gx * gx;
+
+    auto aux = [](FPTYPE r, FPTYPE rhoc, FPTYPE gx, FPTYPE rab) -> FPTYPE{
+        return rab * rhoc * (r * cos(gx * r)/gx - sin(gx * r)/(gx * gx));
+    };
+
+    FPTYPE f_0 = r[0] * r[0] * rhoc[0] * rab[0];
+    for( int ir = 1 ; ir< mesh - 2; ir+=2)
+    {
+        rhocg1 += 2 * aux(r[ir],rhoc[ir], gx, rab[ir]) + aux(r[ir+1],rhoc[ir+1], gx, rab[ir+1]);
+    }//ir
+    FPTYPE f_2 = aux(r[mesh - 2],rhoc[mesh - 2], gx, rab[mesh - 2]);
+    FPTYPE f_1 = aux(r[mesh - 1],rhoc[mesh - 1], gx, rab[mesh - 1]);
+
+    rhocg1 += f_2+f_2;
+    rhocg1 += rhocg1;
+    rhocg1 += f_0 + f_1;
+    rhocg1/=3.0;
+
+    // calculations after Simpson Integral
+    const double g2a = pow_gx / 4.0;
+    rhocg1 *= FOUR_PI / omega / 2.0 / gx;
+    rhocg1 += FOUR_PI / omega * gx_arr[ngg] * exp(-g2a) * (g2a + 1) / (pow_gx*pow_gx);
+    drhocg [idx] = rhocg1;
+}
+
 
 template <typename FPTYPE>
 void cal_vkb_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
@@ -581,6 +621,10 @@ void cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
         hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_drhoc_aux2<FPTYPE>),block,THREADS_PER_BLOCK,0,0,
             r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega
         );     
+    } else if(type == 3 ){
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_drhoc_aux3<FPTYPE>),block,THREADS_PER_BLOCK,0,0,
+            r,rhoc,gx_arr,rab,drhocg,mesh,igl0,ngg,omega
+        );     
     }
 
     return ;

diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -6,7 +6,7 @@
 #include "vnl_tools.hpp"
 #include <vector>
 #include <iomanip>
-
+#include "module_base/libm/libm.h"
 namespace hamilt
 {
 
@@ -374,14 +374,25 @@ struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_CPU> {
                 } else if(type == 1) {
                     aux [ir] = ir!=0 ? std::sin(gx_arr[igl] * r[ir]) / (gx_arr[igl] * r[ir]) : 1.0;
                     aux [ir] = r[ir] * r[ir] * rhoc [ir] * aux [ir];
-                } else {
+                } else if(type == 2) {
                     aux [ir] = r[ir] < 1.0e-8 ? rhoc [ir] : rhoc [ir] * sin(gx_arr[igl] * r[ir]) / (gx_arr[igl] * r[ir]);
+                } else if(type == 3) {
+                    FPTYPE sinp, cosp;
+                    sinp = std::sin(gx_arr[igl] * r[ir]);
+                    cosp = std::cos(gx_arr[igl] * r[ir]);
+                    aux[ir] = rhoc [ir] *  (r [ir] * cosp / gx_arr[igl] - sinp / pow(gx_arr[igl],2));
                 }
             }//ir
             Simpson_Integral<FPTYPE>(mesh, aux.data(), rab, rhocg1);
-            if(type ==0 ) drhocg [igl] = FOUR_PI / omega * rhocg1;
-            else if(type == 1) drhocg [igl] = FOUR_PI * rhocg1 / omega;
-            else drhocg [igl] = rhocg1;
+            if(type ==0 ) { drhocg [igl] = FOUR_PI / omega * rhocg1;
+            } else if(type == 1) { drhocg [igl] = FOUR_PI * rhocg1 / omega;
+            } else if(type == 2) { drhocg [igl] = rhocg1;
+            } else if(type == 3) {
+                rhocg1 *= FOUR_PI / omega / 2.0 / gx_arr[igl];
+                FPTYPE g2a = (gx_arr[igl]*gx_arr[igl]) / 4.0;
+                rhocg1 += FOUR_PI / omega * gx_arr[ngg] * ModuleBase::libm::exp ( - g2a) * (g2a + 1) / pow(gx_arr[igl]*gx_arr[igl] , 2);
+                drhocg [igl] = rhocg1;
+            }
         }
 #ifdef _OPENMP
         }

diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -306,7 +306,29 @@ struct cal_vq_deri_op<FPTYPE, base_device::DEVICE_GPU>
 };
 
 
-
+/**
+ * The operator is used to compute the auxiliary amount of stress /force 
+ * in parallel on the GPU. They identify type with the type provided and 
+ * select different calculation methods,
+ *
+ * The function is called by the module as follows
+ *      Type = 0 -> stress_cc
+ *      Type = 1 -> stress_cc, force_cc
+ *      Type = 2 -> force_scc
+ *      Type = 3 -> stress_loc
+ *
+ *  Int the function aux is obtained by traversing the `ngg` and `mesh` firstly,
+ *  and then aux is processed by Simpson integral method to obtain auxiliary 
+ *  quantities drhocg.
+ *
+ * In the GPU operator, temporary array space of mesh size is required in order 
+ * not to apply Simpson interpolation (which causes GPU memory overflow). 
+ * The Simpson integral is then reconstructed in the loop body of the mesh, 
+ * using the Simpson integral computed in the loop, rather than executed once 
+ * after the loop. After that, in order to reduce the if condition judgment brought 
+ * by Simpson interpolation in the loop body, lambda expression is used to shift the 
+ * boundary condition out.
+ */
 template <typename FPTYPE>
 struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_GPU>{
     void operator()(

diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
@@ -93,7 +93,8 @@ class Stress_Func
                     const FPTYPE* vloc_at,
                     const FPTYPE& zp,
                     FPTYPE* dvloc,
-                    ModulePW::PW_Basis* rho_basis); // used in local pseudopotential stress
+                    ModulePW::PW_Basis* rho_basis,
+                    const UnitCell& ucell_in); // used in local pseudopotential stress
 
     /**
      * @brief compute the derivative of the coulomb potential in reciprocal space

diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
@@ -96,7 +96,7 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 		// normal case: dvloc contains dV_loc(G)/dG
 		//
 			this->dvloc_of_g ( atom->ncpp.msh, atom->ncpp.rab.data(), atom->ncpp.r.data(),
-					atom->ncpp.vloc_at.data(), atom->ncpp.zv, dvloc.data(), rho_basis);
+					atom->ncpp.vloc_at.data(), atom->ncpp.zv, dvloc.data(), rho_basis, GlobalC::ucell);
 		//
 		}
 #ifndef _OPENMP
@@ -163,6 +163,7 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 	return;
 }
 
+
 template<typename FPTYPE, typename Device>
 void Stress_Func<FPTYPE, Device>::dvloc_of_g
 (
@@ -172,7 +173,8 @@ const FPTYPE* r,
 const FPTYPE* vloc_at,
 const FPTYPE& zp,
 FPTYPE*  dvloc,
-ModulePW::PW_Basis* rho_basis
+ModulePW::PW_Basis* rho_basis,
+const UnitCell& ucell_in
 )
 {
   //----------------------------------------------------------------------
@@ -187,11 +189,15 @@ ModulePW::PW_Basis* rho_basis
 
 
 	int igl0;
+	this->device = base_device::get_device_type<Device>(this->ctx);
+
+	std::vector<double> gx_arr(rho_basis->ngg+1);
+    double* gx_arr_d = nullptr;
 	// counter on erf functions or gaussians
 	// counter on g shells vectors
 	// first shell with g != 0
 
-	std::vector<FPTYPE> aux1(msh);
+	std::vector<FPTYPE> aux(msh);
 
 	// the  G=0 component is not computed
 	if (rho_basis->gg_uniq[0] < 1.0e-8)
@@ -207,57 +213,61 @@ ModulePW::PW_Basis* rho_basis
 	// Pseudopotentials in numerical form (Vloc contains the local part)
 	// In order to perform the Fourier transform, a term erf(r)/r is
 	// subtracted in real space and added again in G space
-
 #ifdef _OPENMP
-#pragma omp parallel
-{
-	#pragma omp for
+#pragma omp parallel for
 #endif
+    for (int igl = igl0; igl < rho_basis->ngg; igl++) {
+        gx_arr[igl] = sqrt(rho_basis->gg_uniq[igl]) * ucell_in.tpiba;
+    }
+
+	gx_arr[rho_basis->ngg] = zp * ModuleBase::e2;
+
 	//
 	//   This is the part of the integrand function
 	//   indipendent of |G| in real space
 	//
 	for(int i = 0;i< msh; i++)
 	{
-		aux1[i] = r [i] * vloc_at [i] + zp * ModuleBase::e2 * erf(r[i]);
+		aux[i] = r [i] * vloc_at [i] + zp * ModuleBase::e2 * erf(r[i]);
 	}
 
-	std::vector<FPTYPE> aux(msh);
-	aux[0] = 0.0;
-#ifdef _OPENMP
-	#pragma omp for
-#endif
-	for(int igl = igl0;igl< rho_basis->ngg;igl++)
-	{
-		const FPTYPE g2 = rho_basis->gg_uniq[igl];
-		const FPTYPE gx = sqrt (g2 * GlobalC::ucell.tpiba2);
-		const FPTYPE gx2 = g2 * GlobalC::ucell.tpiba2;
-		//
-		//    and here we perform the integral, after multiplying for the |G|
-		//    dependent  part
-		//
-		// DV(g)/Dg = ModuleBase::Integral of r (Dj_0(gr)/Dg) V(r) dr
-		for(int i = 1;i< msh;i++)
-		{
-			FPTYPE sinp, cosp;
-            ModuleBase::libm::sincos(gx * r [i], &sinp, &cosp);
-			aux [i] = aux1 [i] * (r [i] * cosp / gx - sinp / pow(gx,2));
-		}
-		FPTYPE vlcp=0;
-		// simpson (msh, aux, rab, vlcp);
-		ModuleBase::Integral::Simpson_Integral(msh, aux.data(), rab, vlcp );
-		// DV(g^2)/Dg^2 = (DV(g)/Dg)/2g
-		vlcp *= ModuleBase::FOUR_PI / GlobalC::ucell.omega / 2.0 / gx;
-		// subtract the long-range term
-		FPTYPE g2a = gx2 / 4.0;
-		vlcp += ModuleBase::FOUR_PI / GlobalC::ucell.omega * zp * ModuleBase::e2 * ModuleBase::libm::exp ( - g2a) * (g2a + 1) / pow(gx2 , 2);
-		dvloc [igl] = vlcp;
-	}
 
-#ifdef _OPENMP
-}
-#endif
-
+
+    double *r_d = nullptr;
+	double *rhoc_d = nullptr;
+	double *rab_d = nullptr;
+    double *aux_d = nullptr;
+	double *drhocg_d = nullptr;
+    if (this->device == base_device::GpuDevice) {
+        resmem_var_op()(this->ctx, r_d, msh);
+        resmem_var_op()(this->ctx, rhoc_d, msh);
+        resmem_var_op()(this->ctx, rab_d, msh);
+
+        resmem_var_op()(this->ctx, aux_d, msh);
+        resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg+1);
+        resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg);
+
+        syncmem_var_h2d_op()(this->ctx,
+                             this->cpu_ctx,
+                             gx_arr_d,
+                             gx_arr.data(),
+                             rho_basis->ngg+1);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, msh);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, msh);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, aux.data(), msh);
+    }
+
+
+
+	if(this->device == base_device::GpuDevice) {
+		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
+			r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,msh,igl0,rho_basis->ngg-igl0,ucell_in.omega,3);
+		syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, dvloc+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
+
+	} else {
+		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
+			r,aux.data(),gx_arr.data()+igl0,rab,dvloc+igl0,msh,igl0,rho_basis->ngg-igl0,ucell_in.omega,3);
+	}	
 
 	return;
 }