Skip to content

Commit

Permalink
Merge pull request SPECFEM#1618 from danielpeter/devel
Browse files Browse the repository at this point in the history
adds GPU support for PML in elastic domains
  • Loading branch information
daniel peter authored Jun 9, 2023
2 parents a73ce9d + 1531b76 commit d3f66cb
Show file tree
Hide file tree
Showing 40 changed files with 3,369 additions and 650 deletions.
14 changes: 7 additions & 7 deletions src/generate_databases/save_arrays_solver_hdf5.F90
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ subroutine save_arrays_solver_mesh_hdf5()
integer :: info, comm

! if collective write
logical :: if_col= .true.
logical, parameter :: if_col = .true.

! hdf5 valiables
character(len=64) :: dset_name, tempstr
Expand All @@ -92,12 +92,11 @@ subroutine save_arrays_solver_mesh_hdf5()
! the node ids are stored after dividing one NGLL*-th order spectral element into NGLLX*NGLLY*NGLLZ elements
integer, dimension(9,nspec_ab*(NGLLX-1)*(NGLLY-1)*(NGLLZ-1)) :: spec_elm_conn_xdmf

! dump dataset size
!integer, dimension(NPROC,4) :: dsize_dump
integer, dimension(1,1) :: i2d_dummy = reshape((/0/),(/1,1/))
integer, dimension(1,1,1) :: i3d_dummy = reshape((/0/),(/1,1,1/))
real(kind=CUSTOM_REAL), dimension(1,1) :: r2d_dummy = reshape((/0.0/),(/1,1/))
real(kind=CUSTOM_REAL), dimension(1,1,1) :: r3d_dummy = reshape((/0.0/),(/1,1,1/))
! dummy arrays
integer, dimension(1,1), parameter :: i2d_dummy = reshape((/0/),(/1,1/))
integer, dimension(1,1,1), parameter :: i3d_dummy = reshape((/0/),(/1,1,1/))
real(kind=CUSTOM_REAL), dimension(1,1), parameter :: r2d_dummy = reshape((/0.0/),(/1,1/))
real(kind=CUSTOM_REAL), dimension(1,1,1), parameter :: r3d_dummy = reshape((/0.0/),(/1,1,1/))

! offset arrays
integer, dimension(0:NPROC-1) :: offset_nglob
Expand Down Expand Up @@ -143,6 +142,7 @@ subroutine save_arrays_solver_mesh_hdf5()
write(IMAIN,*)
call flush_IMAIN()
endif
call synchronize_all()

!---------------------------.
! Setup the values to write |
Expand Down
63 changes: 63 additions & 0 deletions src/gpu/compute_forces_viscoelastic_cuda.cu

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions src/gpu/compute_stacey_acoustic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ void FC_FUNC_(compute_stacey_acoustic_cuda,
mp->simulation_type,
mp->save_forward,
mp->d_num_abs_boundary_faces,
mp->d_b_potential_dot_acoustic,
mp->d_b_potential_dot_dot_acoustic,
mp->d_b_absorb_potential,
mp->gravity);
Expand All @@ -119,7 +118,6 @@ void FC_FUNC_(compute_stacey_acoustic_cuda,
mp->simulation_type,
mp->save_forward,
mp->d_num_abs_boundary_faces,
mp->d_b_potential_dot_acoustic,
mp->d_b_potential_dot_dot_acoustic,
mp->d_b_absorb_potential,
mp->gravity);
Expand Down
24 changes: 12 additions & 12 deletions src/gpu/fault_solver_dynamics.cu
Original file line number Diff line number Diff line change
Expand Up @@ -136,22 +136,22 @@ void FC_FUNC_(transfer_fault_data_to_device,
if (Flt->allow_opening){ exit_on_error("Fault opening not implemented yet on GPU\n"); }

// copies data to GPU
if (*NGLOB_FLT > 0){
gpuCreateCopy_todevice_realw((void **)&(Flt->B),B,*NGLOB_FLT);
gpuCreateCopy_todevice_realw((void **)&(Flt->R),R,(*NGLOB_FLT)*9);
gpuCreateCopy_todevice_realw((void **)&(Flt->Z),Z,(*NGLOB_FLT));
if (Flt->NGLOB_FLT > 0){
gpuCreateCopy_todevice_realw((void **)&(Flt->B),B,Flt->NGLOB_FLT);
gpuCreateCopy_todevice_realw((void **)&(Flt->R),R,(Flt->NGLOB_FLT)*9);
gpuCreateCopy_todevice_realw((void **)&(Flt->Z),Z,(Flt->NGLOB_FLT));

gpuCreateCopy_todevice_realw((void **)&(Flt->D),D,(*NGLOB_FLT)*3);
gpuCreateCopy_todevice_realw((void **)&(Flt->V),V0,(*NGLOB_FLT)*3);
gpuCreateCopy_todevice_realw((void **)&(Flt->D),D,(Flt->NGLOB_FLT)*3);
gpuCreateCopy_todevice_realw((void **)&(Flt->V),V0,(Flt->NGLOB_FLT)*3);

gpuCreateCopy_todevice_realw((void **)&(Flt->T0),T0,(*NGLOB_FLT)*3);
gpuCreateCopy_todevice_realw((void **)&(Flt->T),T,(*NGLOB_FLT)*3);
gpuCreateCopy_todevice_realw((void **)&(Flt->T0),T0,(Flt->NGLOB_FLT)*3);
gpuCreateCopy_todevice_realw((void **)&(Flt->T),T,(Flt->NGLOB_FLT)*3);

gpuCreateCopy_todevice_realw((void **)&(Flt->invM1),invM1,*NGLOB_FLT);
gpuCreateCopy_todevice_realw((void **)&(Flt->invM2),invM2,*NGLOB_FLT);
gpuCreateCopy_todevice_realw((void **)&(Flt->invM1),invM1,Flt->NGLOB_FLT);
gpuCreateCopy_todevice_realw((void **)&(Flt->invM2),invM2,Flt->NGLOB_FLT);

gpuCreateCopy_todevice_int((void **)&(Flt->ibulk1),ibulk1,(*NGLOB_FLT));
gpuCreateCopy_todevice_int((void **)&(Flt->ibulk2),ibulk2,(*NGLOB_FLT));
gpuCreateCopy_todevice_int((void **)&(Flt->ibulk1),ibulk1,(Flt->NGLOB_FLT));
gpuCreateCopy_todevice_int((void **)&(Flt->ibulk2),ibulk2,(Flt->NGLOB_FLT));
}

GPU_ERROR_CHECKING("transfer_fault_data_to_device");
Expand Down
72 changes: 67 additions & 5 deletions src/gpu/kernels/Kernel_2_viscoelastic_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,6 @@ __device__ __forceinline__ void sum_hprimewgll_gamma(int I, int J, int K,

// computes the spatial derivatives


__device__ __forceinline__ void get_spatial_derivatives(realw* xixl,realw* xiyl,realw* xizl,
realw* etaxl,realw* etayl,realw* etazl,
realw* gammaxl,realw* gammayl,realw* gammazl,
Expand All @@ -582,7 +581,7 @@ __device__ __forceinline__ void get_spatial_derivatives(realw* xixl,realw* xiyl
// local padded index
int offset = ispec_irreg*NGLL3_PADDED + tx;

*xixl = get_global_cr( &d_xix[offset]);
*xixl = get_global_cr(&d_xix[offset]);
*xiyl = get_global_cr(&d_xiy[offset]);
*xizl = get_global_cr(&d_xiz[offset]);
*etaxl = get_global_cr(&d_etax[offset]);
Expand Down Expand Up @@ -739,6 +738,8 @@ Kernel_2_noatt_iso_impl(const int nb_blocks_to_compute,
realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
realw_const_p d_kappav,
realw_const_p d_muv,
const int pml_conditions,
const int* d_is_CPML,
const int FORWARD_OR_ADJOINT){

// elastic compute kernel without attenuation for isotropic elements
Expand Down Expand Up @@ -820,6 +821,13 @@ Kernel_2_noatt_iso_impl(const int nb_blocks_to_compute,
// spectral-element id
// iphase-1 and working_element-1 for Fortran->C array conventions
working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;

// PML
if (pml_conditions){
// PML elements will be computed later
if(d_is_CPML[working_element]) return;
}

ispec_irreg = d_irregular_element_number[working_element] - 1;

// local padded index
Expand Down Expand Up @@ -1042,6 +1050,8 @@ Kernel_2_noatt_iso_strain_impl(int nb_blocks_to_compute,
realw_p epsilondev_xz,realw_p epsilondev_yz,
realw_p epsilon_trace_over_3,
const int SIMULATION_TYPE,
const int pml_conditions,
const int* d_is_CPML,
const int FORWARD_OR_ADJOINT){

// elastic compute kernel without attenuation for isotropic elements
Expand Down Expand Up @@ -1107,6 +1117,13 @@ Kernel_2_noatt_iso_strain_impl(int nb_blocks_to_compute,
// spectral-element id
// iphase-1 and working_element-1 for Fortran->C array conventions
working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;

// PML
if (pml_conditions){
// PML elements will be computed later
if(d_is_CPML[working_element]) return;
}

ispec_irreg = d_irregular_element_number[working_element] - 1;

// local padded index
Expand Down Expand Up @@ -1253,6 +1270,8 @@ Kernel_2_noatt_iso_col_impl(int nb_blocks_to_compute,
realw_p epsilondev_xz,realw_p epsilondev_yz,
realw_p epsilon_trace_over_3,
const int SIMULATION_TYPE,
const int pml_conditions,
const int* d_is_CPML,
const int FORWARD_OR_ADJOINT){

// elastic compute kernel without attenuation for isotropic elements
Expand Down Expand Up @@ -1327,6 +1346,13 @@ Kernel_2_noatt_iso_col_impl(int nb_blocks_to_compute,
working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;
}
#endif

// PML
if (pml_conditions){
// PML elements will be computed later
if(d_is_CPML[working_element]) return;
}

// local padded index
offset = working_element*NGLL3_PADDED + tx;

Expand Down Expand Up @@ -1572,6 +1598,8 @@ Kernel_2_noatt_iso_grav_impl(int nb_blocks_to_compute,
realw_const_p d_minus_deriv_gravity,
realw_const_p d_rhostore,
realw_const_p wgll_cube,
const int pml_conditions,
const int* d_is_CPML,
const int FORWARD_OR_ADJOINT){

// elastic compute kernel without attenuation for isotropic elements
Expand Down Expand Up @@ -1652,6 +1680,13 @@ Kernel_2_noatt_iso_grav_impl(int nb_blocks_to_compute,
working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;
}
#endif

// PML
if (pml_conditions){
// PML elements will be computed later
if(d_is_CPML[working_element]) return;
}

ispec_irreg = d_irregular_element_number[working_element] - 1;

// local padded index
Expand Down Expand Up @@ -1881,6 +1916,8 @@ Kernel_2_noatt_ani_impl(int nb_blocks_to_compute,
realw_const_p d_minus_deriv_gravity,
realw_const_p d_rhostore,
realw_const_p wgll_cube,
const int pml_conditions,
const int* d_is_CPML,
const int FORWARD_OR_ADJOINT){

// elastic compute kernel without attenuation for anisotropic elements
Expand Down Expand Up @@ -1959,6 +1996,13 @@ Kernel_2_noatt_ani_impl(int nb_blocks_to_compute,
working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;
}
#endif

// PML
if (pml_conditions){
// PML elements will be computed later
if(d_is_CPML[working_element]) return;
}

ispec_irreg = d_irregular_element_number[working_element] - 1;

// local padded index
Expand Down Expand Up @@ -2240,6 +2284,8 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
realw_const_p d_minus_deriv_gravity,
realw_const_p d_rhostore,
realw_const_p wgll_cube,
const int pml_conditions,
const int* d_is_CPML,
const int FORWARD_OR_ADJOINT){


Expand Down Expand Up @@ -2322,6 +2368,13 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
}
#endif

// PML
if (pml_conditions){
// PML elements will be computed later
if(d_is_CPML[working_element]) return;
}

ispec_irreg = d_irregular_element_number[working_element] - 1;

// local padded index
Expand Down Expand Up @@ -2485,7 +2538,7 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
sigma_zx = sigma_xz;
sigma_zy = sigma_yz;

if (gravity ){
if (gravity){
// computes non-symmetric terms for gravity
compute_element_gravity(tx,working_element,&iglob,d_minus_g,d_minus_deriv_gravity,
d_rhostore,wgll_cube,jacobianl,
Expand Down Expand Up @@ -2524,7 +2577,7 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);

// adds gravity term
if (gravity ){
if (gravity){
sum_terms1 += rho_s_H1;
sum_terms2 += rho_s_H2;
sum_terms3 += rho_s_H3;
Expand Down Expand Up @@ -2557,7 +2610,7 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
#else // MESH_COLORING

//mesh coloring
if (use_mesh_coloring_gpu ){
if (use_mesh_coloring_gpu){

// no atomic operation needed, colors don't share global points between elements
#ifdef USE_TEXTURES_FIELDS
Expand Down Expand Up @@ -3351,6 +3404,8 @@ Kernel_2_noatt_iso_kelvinvoigt_impl(const int nb_blocks_to_compute,
realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
realw_const_p d_kappav,
realw_const_p d_muv,
const int pml_conditions,
const int* d_is_CPML,
const int FORWARD_OR_ADJOINT){

// elastic compute kernel without attenuation for isotropic elements with kelvin voigt damping aroung the fault
Expand Down Expand Up @@ -3410,6 +3465,13 @@ Kernel_2_noatt_iso_kelvinvoigt_impl(const int nb_blocks_to_compute,
// spectral-element id
// iphase-1 and working_element-1 for Fortran->C array conventions
working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)] - 1;

// PML
if (pml_conditions){
// PML elements will be computed later
if(d_is_CPML[working_element]) return;
}

ispec_irreg = d_irregular_element_number[working_element] - 1;

// local padded index
Expand Down
57 changes: 51 additions & 6 deletions src/gpu/kernels/UpdateDispVeloc_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@
__global__ void UpdateDispVeloc_kernel(realw* displ,
realw* veloc,
realw* accel,
int size,
realw deltat,
realw deltatsqover2,
realw deltatover2) {
const int size,
const realw deltat,
const realw deltatsqover2,
const realw deltatover2) {

// two dimensional array of blocks on grid where each block has one dimensional array of threads
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
Expand All @@ -45,8 +45,8 @@ __global__ void UpdateDispVeloc_kernel(realw* displ,
realw acc = accel[id];
realw vel = veloc[id];

displ[id] = displ[id] + deltat*vel + deltatsqover2*acc;
veloc[id] = vel + deltatover2*acc;
displ[id] = displ[id] + deltat * vel + deltatsqover2 * acc;
veloc[id] = vel + deltatover2 * acc;
accel[id] = 0.0f; // can do this using memset...not sure if faster,probably not
}

Expand All @@ -60,4 +60,49 @@ __global__ void UpdateDispVeloc_kernel(realw* displ,
// nvprof: 24599250 flops for 4099875 threads -> 6 FLOP per thread
}

/* ----------------------------------------------------------------------------------------------- */


__global__ void UpdateDispVeloc_PML_kernel(realw* displ,
realw* veloc,
realw* accel,
realw* PML_displ,
const int NSPEC_CPML,
const int* d_CPML_to_spec,
const int* d_ibool,
const realw deltat,
const realw deltatsqover2,
const realw deltatover2) {

int ispec_cpml = blockIdx.x + blockIdx.y*gridDim.x;
int ijk = threadIdx.x;

// because of block and grid sizing problems, there is a small
// amount of buffer at the end of the calculation
if (ispec_cpml < NSPEC_CPML) {

int ispec = d_CPML_to_spec[ispec_cpml] - 1;

// local and global indices
int K = ijk/NGLL2;
int J = (ijk - K*NGLL2)/NGLLX;
int I = ijk - K*NGLL2 - J*NGLLX;

int iglob = d_ibool[ijk + NGLL3_PADDED*ispec] - 1;

const realw theta = 0.125f; // theta = 1.0 / 8.0;

// updates PML displacement
PML_displ[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,I,J,K,ispec_cpml)] = displ[3*iglob]
+ deltatover2 * (1.0f - 2.0f * theta) * veloc[3*iglob]
+ deltatsqover2 * (1.0f - theta) * accel[3*iglob];

PML_displ[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,I,J,K,ispec_cpml)] = displ[3*iglob+1]
+ deltatover2 * (1.0f - 2.0f * theta) * veloc[3*iglob+1]
+ deltatsqover2 * (1.0f - theta) * accel[3*iglob+1];

PML_displ[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,I,J,K,ispec_cpml)] = displ[3*iglob+2]
+ deltatover2 * (1.0f - 2.0f * theta) * veloc[3*iglob+2]
+ deltatsqover2 * (1.0f - theta) * accel[3*iglob+2];
}
}
1 change: 0 additions & 1 deletion src/gpu/kernels/compute_stacey_acoustic_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ __global__ void compute_stacey_acoustic_kernel(field* potential_dot_acoustic,
int SIMULATION_TYPE,
int SAVE_FORWARD,
int num_abs_boundary_faces,
field* b_potential_dot_acoustic,
field* b_potential_dot_dot_acoustic,
field* b_absorb_potential,
int gravity) {
Expand Down
2 changes: 2 additions & 0 deletions src/gpu/kernels/kernel_cuda.mk
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ cuda_kernels_OBJS := \
$O/kernel_3_cuda_device.cuda-kernel.o \
$O/kernel_3_veloc_cuda_device.cuda-kernel.o \
$O/noise_read_add_surface_movie_cuda_kernel.cuda-kernel.o \
$O/pml_impose_boundary_condition_cuda_kernel.cuda-kernel.o \
$O/pml_kernel_2_viscoelastic_impl.cuda-kernel.o \
$O/prepare_boundary_accel_on_device.cuda-kernel.o \
$O/prepare_boundary_potential_on_device.cuda-kernel.o \
$O/process_smooth.cuda-kernel.o \
Expand Down
Loading

0 comments on commit d3f66cb

Please sign in to comment.