Merge pull request SPECFEM#1618 from danielpeter/devel

adds GPU support for PML in elastic domains
elifo · Jun 9, 2023 · d3f66cb · d3f66cb
2 parents a73ce9d + 1531b76
commit d3f66cb
Show file tree

Hide file tree

Showing 40 changed files with 3,369 additions and 650 deletions.
diff --git a/src/generate_databases/save_arrays_solver_hdf5.F90 b/src/generate_databases/save_arrays_solver_hdf5.F90
@@ -83,7 +83,7 @@ subroutine save_arrays_solver_mesh_hdf5()
   integer :: info, comm
 
   ! if collective write
-  logical :: if_col= .true.
+  logical, parameter :: if_col = .true.
 
   ! hdf5 valiables
   character(len=64) :: dset_name, tempstr
@@ -92,12 +92,11 @@ subroutine save_arrays_solver_mesh_hdf5()
   ! the node ids are stored after dividing one NGLL*-th order spectral element into NGLLX*NGLLY*NGLLZ elements
   integer, dimension(9,nspec_ab*(NGLLX-1)*(NGLLY-1)*(NGLLZ-1)) :: spec_elm_conn_xdmf
 
-  ! dump dataset size
-  !integer, dimension(NPROC,4)                :: dsize_dump
-  integer, dimension(1,1)                    :: i2d_dummy = reshape((/0/),(/1,1/))
-  integer, dimension(1,1,1)                  :: i3d_dummy = reshape((/0/),(/1,1,1/))
-  real(kind=CUSTOM_REAL), dimension(1,1)     :: r2d_dummy = reshape((/0.0/),(/1,1/))
-  real(kind=CUSTOM_REAL), dimension(1,1,1)   :: r3d_dummy = reshape((/0.0/),(/1,1,1/))
+  ! dummy arrays
+  integer, dimension(1,1), parameter                    :: i2d_dummy = reshape((/0/),(/1,1/))
+  integer, dimension(1,1,1), parameter                  :: i3d_dummy = reshape((/0/),(/1,1,1/))
+  real(kind=CUSTOM_REAL), dimension(1,1), parameter     :: r2d_dummy = reshape((/0.0/),(/1,1/))
+  real(kind=CUSTOM_REAL), dimension(1,1,1), parameter   :: r3d_dummy = reshape((/0.0/),(/1,1,1/))
 
   ! offset arrays
   integer, dimension(0:NPROC-1) :: offset_nglob
@@ -143,6 +142,7 @@ subroutine save_arrays_solver_mesh_hdf5()
     write(IMAIN,*)
     call flush_IMAIN()
   endif
+  call synchronize_all()
 
   !---------------------------.
   ! Setup the values to write |

diff --git a/src/gpu/compute_forces_viscoelastic_cuda.cu b/src/gpu/compute_forces_viscoelastic_cuda.cu
diff --git a/src/gpu/compute_stacey_acoustic_cuda.cu b/src/gpu/compute_stacey_acoustic_cuda.cu
@@ -98,7 +98,6 @@ void FC_FUNC_(compute_stacey_acoustic_cuda,
                                                        mp->simulation_type,
                                                        mp->save_forward,
                                                        mp->d_num_abs_boundary_faces,
-                                                       mp->d_b_potential_dot_acoustic,
                                                        mp->d_b_potential_dot_dot_acoustic,
                                                        mp->d_b_absorb_potential,
                                                        mp->gravity);
@@ -119,7 +118,6 @@ void FC_FUNC_(compute_stacey_acoustic_cuda,
                                                          mp->simulation_type,
                                                          mp->save_forward,
                                                          mp->d_num_abs_boundary_faces,
-                                                         mp->d_b_potential_dot_acoustic,
                                                          mp->d_b_potential_dot_dot_acoustic,
                                                          mp->d_b_absorb_potential,
                                                          mp->gravity);

diff --git a/src/gpu/fault_solver_dynamics.cu b/src/gpu/fault_solver_dynamics.cu
@@ -136,22 +136,22 @@ void FC_FUNC_(transfer_fault_data_to_device,
   if (Flt->allow_opening){ exit_on_error("Fault opening not implemented yet on GPU\n"); }
 
   // copies data to GPU
-  if (*NGLOB_FLT > 0){
-    gpuCreateCopy_todevice_realw((void **)&(Flt->B),B,*NGLOB_FLT);
-    gpuCreateCopy_todevice_realw((void **)&(Flt->R),R,(*NGLOB_FLT)*9);
-    gpuCreateCopy_todevice_realw((void **)&(Flt->Z),Z,(*NGLOB_FLT));
+  if (Flt->NGLOB_FLT > 0){
+    gpuCreateCopy_todevice_realw((void **)&(Flt->B),B,Flt->NGLOB_FLT);
+    gpuCreateCopy_todevice_realw((void **)&(Flt->R),R,(Flt->NGLOB_FLT)*9);
+    gpuCreateCopy_todevice_realw((void **)&(Flt->Z),Z,(Flt->NGLOB_FLT));
 
-    gpuCreateCopy_todevice_realw((void **)&(Flt->D),D,(*NGLOB_FLT)*3);
-    gpuCreateCopy_todevice_realw((void **)&(Flt->V),V0,(*NGLOB_FLT)*3);
+    gpuCreateCopy_todevice_realw((void **)&(Flt->D),D,(Flt->NGLOB_FLT)*3);
+    gpuCreateCopy_todevice_realw((void **)&(Flt->V),V0,(Flt->NGLOB_FLT)*3);
 
-    gpuCreateCopy_todevice_realw((void **)&(Flt->T0),T0,(*NGLOB_FLT)*3);
-    gpuCreateCopy_todevice_realw((void **)&(Flt->T),T,(*NGLOB_FLT)*3);
+    gpuCreateCopy_todevice_realw((void **)&(Flt->T0),T0,(Flt->NGLOB_FLT)*3);
+    gpuCreateCopy_todevice_realw((void **)&(Flt->T),T,(Flt->NGLOB_FLT)*3);
 
-    gpuCreateCopy_todevice_realw((void **)&(Flt->invM1),invM1,*NGLOB_FLT);
-    gpuCreateCopy_todevice_realw((void **)&(Flt->invM2),invM2,*NGLOB_FLT);
+    gpuCreateCopy_todevice_realw((void **)&(Flt->invM1),invM1,Flt->NGLOB_FLT);
+    gpuCreateCopy_todevice_realw((void **)&(Flt->invM2),invM2,Flt->NGLOB_FLT);
 
-    gpuCreateCopy_todevice_int((void **)&(Flt->ibulk1),ibulk1,(*NGLOB_FLT));
-    gpuCreateCopy_todevice_int((void **)&(Flt->ibulk2),ibulk2,(*NGLOB_FLT));
+    gpuCreateCopy_todevice_int((void **)&(Flt->ibulk1),ibulk1,(Flt->NGLOB_FLT));
+    gpuCreateCopy_todevice_int((void **)&(Flt->ibulk2),ibulk2,(Flt->NGLOB_FLT));
   }
 
   GPU_ERROR_CHECKING("transfer_fault_data_to_device");

diff --git a/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu b/src/gpu/kernels/Kernel_2_viscoelastic_impl.cu
@@ -556,7 +556,6 @@ __device__  __forceinline__ void sum_hprimewgll_gamma(int I, int J, int K,
 
 // computes the spatial derivatives
 
-
 __device__  __forceinline__ void get_spatial_derivatives(realw* xixl,realw* xiyl,realw* xizl,
                                                          realw* etaxl,realw* etayl,realw* etazl,
                                                          realw* gammaxl,realw* gammayl,realw* gammazl,
@@ -582,7 +581,7 @@ __device__  __forceinline__ void get_spatial_derivatives(realw* xixl,realw* xiyl
     // local padded index
     int offset = ispec_irreg*NGLL3_PADDED + tx;
 
-    *xixl = get_global_cr( &d_xix[offset]);
+    *xixl = get_global_cr(&d_xix[offset]);
     *xiyl = get_global_cr(&d_xiy[offset]);
     *xizl = get_global_cr(&d_xiz[offset]);
     *etaxl = get_global_cr(&d_etax[offset]);
@@ -739,6 +738,8 @@ Kernel_2_noatt_iso_impl(const int nb_blocks_to_compute,
                         realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
                         realw_const_p d_kappav,
                         realw_const_p d_muv,
+                        const int pml_conditions,
+                        const int* d_is_CPML,
                         const int FORWARD_OR_ADJOINT){
 
 // elastic compute kernel without attenuation for isotropic elements
@@ -820,6 +821,13 @@ Kernel_2_noatt_iso_impl(const int nb_blocks_to_compute,
   // spectral-element id
   // iphase-1 and working_element-1 for Fortran->C array conventions
   working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;
+
+  // PML
+  if (pml_conditions){
+    // PML elements will be computed later
+    if(d_is_CPML[working_element]) return;
+  }
+
   ispec_irreg = d_irregular_element_number[working_element] - 1;
 
   // local padded index
@@ -1042,6 +1050,8 @@ Kernel_2_noatt_iso_strain_impl(int nb_blocks_to_compute,
                               realw_p epsilondev_xz,realw_p epsilondev_yz,
                               realw_p epsilon_trace_over_3,
                               const int SIMULATION_TYPE,
+                              const int pml_conditions,
+                              const int* d_is_CPML,
                               const int FORWARD_OR_ADJOINT){
 
 // elastic compute kernel without attenuation for isotropic elements
@@ -1107,6 +1117,13 @@ Kernel_2_noatt_iso_strain_impl(int nb_blocks_to_compute,
   // spectral-element id
   // iphase-1 and working_element-1 for Fortran->C array conventions
   working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;
+
+  // PML
+  if (pml_conditions){
+    // PML elements will be computed later
+    if(d_is_CPML[working_element]) return;
+  }
+
   ispec_irreg = d_irregular_element_number[working_element] - 1;
 
   // local padded index
@@ -1253,6 +1270,8 @@ Kernel_2_noatt_iso_col_impl(int nb_blocks_to_compute,
                         realw_p epsilondev_xz,realw_p epsilondev_yz,
                         realw_p epsilon_trace_over_3,
                         const int SIMULATION_TYPE,
+                        const int pml_conditions,
+                        const int* d_is_CPML,
                         const int FORWARD_OR_ADJOINT){
 
 // elastic compute kernel without attenuation for isotropic elements
@@ -1327,6 +1346,13 @@ Kernel_2_noatt_iso_col_impl(int nb_blocks_to_compute,
     working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;
   }
 #endif
+
+  // PML
+  if (pml_conditions){
+    // PML elements will be computed later
+    if(d_is_CPML[working_element]) return;
+  }
+
   // local padded index
   offset = working_element*NGLL3_PADDED + tx;
 
@@ -1572,6 +1598,8 @@ Kernel_2_noatt_iso_grav_impl(int nb_blocks_to_compute,
                              realw_const_p d_minus_deriv_gravity,
                              realw_const_p d_rhostore,
                              realw_const_p wgll_cube,
+                             const int pml_conditions,
+                             const int* d_is_CPML,
                              const int FORWARD_OR_ADJOINT){
 
 // elastic compute kernel without attenuation for isotropic elements
@@ -1652,6 +1680,13 @@ Kernel_2_noatt_iso_grav_impl(int nb_blocks_to_compute,
     working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;
   }
 #endif
+
+  // PML
+  if (pml_conditions){
+    // PML elements will be computed later
+    if(d_is_CPML[working_element]) return;
+  }
+
   ispec_irreg = d_irregular_element_number[working_element] - 1;
 
   // local padded index
@@ -1881,6 +1916,8 @@ Kernel_2_noatt_ani_impl(int nb_blocks_to_compute,
                         realw_const_p d_minus_deriv_gravity,
                         realw_const_p d_rhostore,
                         realw_const_p wgll_cube,
+                        const int pml_conditions,
+                        const int* d_is_CPML,
                         const int FORWARD_OR_ADJOINT){
 
 // elastic compute kernel without attenuation for anisotropic elements
@@ -1959,6 +1996,13 @@ Kernel_2_noatt_ani_impl(int nb_blocks_to_compute,
     working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;
   }
 #endif
+
+  // PML
+  if (pml_conditions){
+    // PML elements will be computed later
+    if(d_is_CPML[working_element]) return;
+  }
+
   ispec_irreg = d_irregular_element_number[working_element] - 1;
 
   // local padded index
@@ -2240,6 +2284,8 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
                   realw_const_p d_minus_deriv_gravity,
                   realw_const_p d_rhostore,
                   realw_const_p wgll_cube,
+                  const int pml_conditions,
+                  const int* d_is_CPML,
                   const int FORWARD_OR_ADJOINT){
 
 
@@ -2322,6 +2368,13 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
     working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
   }
 #endif
+
+  // PML
+  if (pml_conditions){
+    // PML elements will be computed later
+    if(d_is_CPML[working_element]) return;
+  }
+
   ispec_irreg = d_irregular_element_number[working_element] - 1;
 
   // local padded index
@@ -2485,7 +2538,7 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
   sigma_zx = sigma_xz;
   sigma_zy = sigma_yz;
 
-  if (gravity ){
+  if (gravity){
     //  computes non-symmetric terms for gravity
     compute_element_gravity(tx,working_element,&iglob,d_minus_g,d_minus_deriv_gravity,
                             d_rhostore,wgll_cube,jacobianl,
@@ -2524,7 +2577,7 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
   sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
 
   // adds gravity term
-  if (gravity ){
+  if (gravity){
     sum_terms1 += rho_s_H1;
     sum_terms2 += rho_s_H2;
     sum_terms3 += rho_s_H3;
@@ -2557,7 +2610,7 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
 #else // MESH_COLORING
 
     //mesh coloring
-    if (use_mesh_coloring_gpu ){
+    if (use_mesh_coloring_gpu){
 
       // no atomic operation needed, colors don't share global points between elements
 #ifdef USE_TEXTURES_FIELDS
@@ -3351,6 +3404,8 @@ Kernel_2_noatt_iso_kelvinvoigt_impl(const int nb_blocks_to_compute,
                                     realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
                                     realw_const_p d_kappav,
                                     realw_const_p d_muv,
+                                    const int pml_conditions,
+                                    const int* d_is_CPML,
                                     const int FORWARD_OR_ADJOINT){
 
 // elastic compute kernel without attenuation for isotropic elements with kelvin voigt damping aroung the fault
@@ -3410,6 +3465,13 @@ Kernel_2_noatt_iso_kelvinvoigt_impl(const int nb_blocks_to_compute,
   // spectral-element id
   // iphase-1 and working_element-1 for Fortran->C array conventions
   working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;
+
+  // PML
+  if (pml_conditions){
+    // PML elements will be computed later
+    if(d_is_CPML[working_element]) return;
+  }
+
   ispec_irreg = d_irregular_element_number[working_element] - 1;
 
   // local padded index

diff --git a/src/gpu/kernels/UpdateDispVeloc_kernel.cu b/src/gpu/kernels/UpdateDispVeloc_kernel.cu
@@ -31,10 +31,10 @@
 __global__ void UpdateDispVeloc_kernel(realw* displ,
                                        realw* veloc,
                                        realw* accel,
-                                       int size,
-                                       realw deltat,
-                                       realw deltatsqover2,
-                                       realw deltatover2) {
+                                       const int size,
+                                       const realw deltat,
+                                       const realw deltatsqover2,
+                                       const realw deltatover2) {
 
   // two dimensional array of blocks on grid where each block has one dimensional array of threads
   int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
@@ -45,8 +45,8 @@ __global__ void UpdateDispVeloc_kernel(realw* displ,
     realw acc = accel[id];
     realw vel = veloc[id];
 
-    displ[id] = displ[id] + deltat*vel + deltatsqover2*acc;
-    veloc[id] = vel + deltatover2*acc;
+    displ[id] = displ[id] + deltat * vel + deltatsqover2 * acc;
+    veloc[id] = vel + deltatover2 * acc;
     accel[id] = 0.0f; // can do this using memset...not sure if faster,probably not
   }
 
@@ -60,4 +60,49 @@ __global__ void UpdateDispVeloc_kernel(realw* displ,
 // nvprof: 24599250 flops for 4099875 threads -> 6 FLOP per thread
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
+
+__global__ void UpdateDispVeloc_PML_kernel(realw* displ,
+                                           realw* veloc,
+                                           realw* accel,
+                                           realw* PML_displ,
+                                           const int NSPEC_CPML,
+                                           const int* d_CPML_to_spec,
+                                           const int* d_ibool,
+                                           const realw deltat,
+                                           const realw deltatsqover2,
+                                           const realw deltatover2) {
+
+  int ispec_cpml = blockIdx.x + blockIdx.y*gridDim.x;
+  int ijk = threadIdx.x;
+
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
+  if (ispec_cpml < NSPEC_CPML) {
+
+    int ispec = d_CPML_to_spec[ispec_cpml] - 1;
+
+    // local and global indices
+    int K = ijk/NGLL2;
+    int J = (ijk - K*NGLL2)/NGLLX;
+    int I = ijk - K*NGLL2 - J*NGLLX;
+
+    int iglob = d_ibool[ijk + NGLL3_PADDED*ispec] - 1;
+
+    const realw theta = 0.125f; // theta = 1.0 / 8.0;
+
+    // updates PML displacement
+    PML_displ[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,I,J,K,ispec_cpml)] = displ[3*iglob]
+                                                                   + deltatover2 * (1.0f - 2.0f * theta) * veloc[3*iglob]
+                                                                   + deltatsqover2 * (1.0f - theta) * accel[3*iglob];
+
+    PML_displ[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,I,J,K,ispec_cpml)] = displ[3*iglob+1]
+                                                                   + deltatover2 * (1.0f - 2.0f * theta) * veloc[3*iglob+1]
+                                                                   + deltatsqover2 * (1.0f - theta) * accel[3*iglob+1];
+
+    PML_displ[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,I,J,K,ispec_cpml)] = displ[3*iglob+2]
+                                                                   + deltatover2 * (1.0f - 2.0f * theta) * veloc[3*iglob+2]
+                                                                   + deltatsqover2 * (1.0f - theta) * accel[3*iglob+2];
+  }
+}
diff --git a/src/gpu/kernels/compute_stacey_acoustic_kernel.cu b/src/gpu/kernels/compute_stacey_acoustic_kernel.cu
@@ -40,7 +40,6 @@ __global__ void compute_stacey_acoustic_kernel(field* potential_dot_acoustic,
                                                int SIMULATION_TYPE,
                                                int SAVE_FORWARD,
                                                int num_abs_boundary_faces,
-                                               field* b_potential_dot_acoustic,
                                                field* b_potential_dot_dot_acoustic,
                                                field* b_absorb_potential,
                                                int gravity) {

diff --git a/src/gpu/kernels/kernel_cuda.mk b/src/gpu/kernels/kernel_cuda.mk
@@ -31,6 +31,8 @@ cuda_kernels_OBJS := \
 	$O/kernel_3_cuda_device.cuda-kernel.o \
 	$O/kernel_3_veloc_cuda_device.cuda-kernel.o \
 	$O/noise_read_add_surface_movie_cuda_kernel.cuda-kernel.o \
+	$O/pml_impose_boundary_condition_cuda_kernel.cuda-kernel.o \
+	$O/pml_kernel_2_viscoelastic_impl.cuda-kernel.o \
 	$O/prepare_boundary_accel_on_device.cuda-kernel.o \
 	$O/prepare_boundary_potential_on_device.cuda-kernel.o \
 	$O/process_smooth.cuda-kernel.o \