From 75e10a78b9407782eb0b82cadb7feae428d40f20 Mon Sep 17 00:00:00 2001 From: liiutao <74701833+A-006@users.noreply.github.com> Date: Thu, 30 May 2024 09:31:37 +0800 Subject: [PATCH] Replace modify the functions form cu files to cpp files (#4245) * replace ParaV in module gint * change PV to pv in module gint * change GlobalC in module gint * fix LCAO_Orbitals in module gint * fix error in compile without abacus * fix error in init_gpu_gint_variables * remove GlobalC in grid_technique and grid_bigcell * remove GlobalC in gint_tools and vbatch matrix * fix relax have compute stress and change GPU force compute to acclerate * fix num stream in input.md and use num_stream in input * fix error in compute force * fix memory error in force compute * use std instead of double * and add const * fix error in vector use * fix error in compile * fix error in compile with force * fix compile error * fix paramter name and function name * add time ticker and fix nspin transport * delete printf in files * fix test bug and fix grid_size * init nstreams * move cpp function from cu file to cpp file --------- Co-authored-by: Mohan Chen --- .../module_gint/gint_force.h | 2 + .../module_gint/gint_force_gpu.cu | 19 +- .../module_gint/gtask_force.cpp | 365 ++++++++++++++++++ .../module_gint/kernels/cuda/gint_force.cu | 364 ----------------- 4 files changed, 378 insertions(+), 372 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint_force.h b/source/module_hamilt_lcao/module_gint/gint_force.h index 51851082de..c4118f4611 100644 --- a/source/module_hamilt_lcao/module_gint/gint_force.h +++ b/source/module_hamilt_lcao/module_gint/gint_force.h @@ -192,6 +192,7 @@ void allocateDm(double* matrix_host, void para_init(grid_para& para, const int iter_num, const int nbz, + const int pipeline_index, const Grid_Technique& gridt); /** * @brief frc_strs_iat on host and device Init @@ -224,6 +225,7 @@ void cal_init(frc_strs_iat& f_s_iat, void para_mem_copy(grid_para& para, const Grid_Technique& gridt, const int nbz, + const int pipeline_index, const int atom_num_grid); /** * @brief Force Stress Force Iat memCpy,from Host to Device diff --git a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu index 3718f48819..5824ba494b 100644 --- a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu @@ -77,6 +77,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, const int cuda_block = std::min(64, (gridt.psir_size + cuda_threads - 1) / cuda_threads); int iter_num = 0; + int pipeline_index = 0; DensityMat denstiy_mat; frc_strs_iat_gbl f_s_iat_dev; grid_para para; @@ -112,9 +113,10 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, dim3 grid_dot(cuda_block); dim3 block_dot(cuda_threads); - para_init(para, iter_num, nbz, gridt); + pipeline_index = iter_num % gridt.nstreams; + para_init(para, iter_num, nbz, pipeline_index,gridt); cal_init(f_s_iat, - para.stream_num, + pipeline_index, cuda_block, atom_num_grid, max_size, @@ -141,19 +143,20 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, para_mem_copy(para, gridt, nbz, + pipeline_index, atom_num_grid); cal_mem_cpy(f_s_iat, gridt, atom_num_grid, cuda_block, - para.stream_num); - checkCuda(cudaStreamSynchronize(gridt.streams[para.stream_num])); + pipeline_index); + checkCuda(cudaStreamSynchronize(gridt.streams[pipeline_index])); /* cuda stream compute and Multiplication of multinomial matrices */ get_psi_force<<>>( + gridt.streams[pipeline_index]>>>( gridt.ylmcoef_g, dr, gridt.bxyz, @@ -192,14 +195,14 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, para.matrix_C_device, para.ldc_device, atom_pair_num, - gridt.streams[para.stream_num], + gridt.streams[pipeline_index], nullptr); /* force compute in GPU */ if (isforce){ dot_product_force<<>>( + gridt.streams[pipeline_index]>>>( para.psir_lx_device, para.psir_ly_device, para.psir_lz_device, @@ -215,7 +218,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, dot_product_stress<<>>( + gridt.streams[pipeline_index]>>>( para.psir_lxx_device, para.psir_lxy_device, para.psir_lxz_device, diff --git a/source/module_hamilt_lcao/module_gint/gtask_force.cpp b/source/module_hamilt_lcao/module_gint/gtask_force.cpp index e785b4192b..9aa7e9c386 100644 --- a/source/module_hamilt_lcao/module_gint/gtask_force.cpp +++ b/source/module_hamilt_lcao/module_gint/gtask_force.cpp @@ -259,5 +259,370 @@ void allocateDm(double* matrixHost, } return; } +void calculateInit(DensityMat& denstiy_mat, + frc_strs_iat_gbl& f_s_iat_dev, + hamilt::HContainer* dm, + const Grid_Technique& gridt, + const UnitCell& ucell, + int lgd, + int cuda_block, + int atom_num_grid) +{ + denstiy_mat.density_mat_h = new double[lgd * lgd]; + allocateDm(denstiy_mat.density_mat_h, dm, gridt, ucell); + + checkCuda(cudaMalloc((void**)&denstiy_mat.density_mat_d, + lgd * lgd * sizeof(double))); + checkCuda(cudaMemcpy(denstiy_mat.density_mat_d, + denstiy_mat.density_mat_h, + lgd * lgd * sizeof(double), + cudaMemcpyHostToDevice)); + + checkCuda(cudaMalloc((void**)&f_s_iat_dev.stress_global, + 6 * cuda_block * gridt.nstreams * sizeof(double))); + checkCuda(cudaMemset(f_s_iat_dev.stress_global, + 0, + 6 * cuda_block * gridt.nstreams * sizeof(double))); + + checkCuda(cudaMalloc((void**)&f_s_iat_dev.force_global, + 3 * atom_num_grid * gridt.nstreams * sizeof(double))); + checkCuda(cudaMemset(f_s_iat_dev.force_global, + 0, + 3 * atom_num_grid * gridt.nstreams * sizeof(double))); + + checkCuda(cudaMalloc((void**)&f_s_iat_dev.iat_global, + atom_num_grid * gridt.nstreams * sizeof(int))); + checkCuda(cudaMemset(f_s_iat_dev.iat_global, + 0, + atom_num_grid * gridt.nstreams * sizeof(int))); +} + +/** + * @brief grid parameter Init + * + * GridParameter init + * + * @param para double *,contained the destiyMatHost + * @param iter_num int , used for calcute the stream + * @param nbz int,stand for the number of Z-axis + * @param gridt Grid_Technique,stored the major method in the the gint. + */ +void para_init(grid_para& para, + const int iter_num, + const int nbz, + const int pipeline_index, + const Grid_Technique& gridt) +{ + + // stream_num stand for nstreams + + //input_dou and input _int used for the Spherical Harmonics + para.input_dou + = &gridt.psi_dbl_gbl[gridt.psi_size_max * pipeline_index * 5]; + para.input_int + = &gridt.psi_int_gbl[gridt.psi_size_max * pipeline_index * 2]; + para.num_psir = &gridt.num_psir_gbl[nbz * pipeline_index]; + //one dimension,record the length and the leading dimension of three matrix + para.atom_pair_A_m + = &gridt.l_info_global[gridt.atom_pair_nbz * pipeline_index]; + para.atom_pair_B_n + = &gridt.r_info_global[gridt.atom_pair_nbz * pipeline_index]; + para.atom_pair_K + = &gridt.k_info_global[gridt.atom_pair_nbz * pipeline_index]; + para.atom_pair_lda + = &gridt.lda_info_global[gridt.atom_pair_nbz * pipeline_index]; + para.atom_pair_ldb + = &gridt.ldb_info_global[gridt.atom_pair_nbz * pipeline_index]; + para.atom_pair_ldc + = &gridt.ldc_info_global[gridt.atom_pair_nbz * pipeline_index]; + //input_double_g and input_int_g used for the Spherical Harmonics on GPU + para.input_double_g + = &gridt.psi_dbl_gbl_g[gridt.psi_size_max * pipeline_index * 5]; + para.input_int_g + = &gridt.psi_int_gbl_g[gridt.psi_size_max * pipeline_index * 2]; + para.num_psir_g = &gridt.num_psir_gbl_g[nbz * pipeline_index]; + para.psir_dm_device = &gridt.dm_global_g[gridt.psir_size * pipeline_index]; + para.psir_r_device + = &gridt.right_global_g[gridt.psir_size * pipeline_index]; + //psi function ,record the force in x y z,and the stress in six dimension + para.psir_lx_device = &gridt.d_left_x_g[gridt.psir_size * pipeline_index]; + para.psir_ly_device = &gridt.d_left_y_g[gridt.psir_size * pipeline_index]; + para.psir_lz_device = &gridt.d_left_z_g[gridt.psir_size * pipeline_index]; + para.psir_lxx_device + = &gridt.dd_left_xx_g[gridt.psir_size * pipeline_index]; + para.psir_lxy_device + = &gridt.dd_left_xy_g[gridt.psir_size * pipeline_index]; + para.psir_lxz_device + = &gridt.dd_left_xz_g[gridt.psir_size * pipeline_index]; + para.psir_lyy_device + = &gridt.dd_left_yy_g[gridt.psir_size * pipeline_index]; + para.psir_lyz_device + = &gridt.dd_left_yz_g[gridt.psir_size * pipeline_index]; + para.psir_lzz_device + = &gridt.dd_left_zz_g[gridt.psir_size * pipeline_index]; + //one dimension,record the length and the leading dimension of three matrix on GPU + para.A_m_device + = &gridt.l_info_global_g[gridt.atom_pair_nbz * pipeline_index]; + para.B_n_device + = &gridt.r_info_global_g[gridt.atom_pair_nbz * pipeline_index]; + para.K_device + = &gridt.k_info_global_g[gridt.atom_pair_nbz * pipeline_index]; + para.lda_device + = &gridt.lda_info_gbl_g[gridt.atom_pair_nbz * pipeline_index]; + para.ldb_device + = &gridt.ldb_info_gbl_g[gridt.atom_pair_nbz * pipeline_index]; + para.ldc_device + = &gridt.ldc_info_gbl_g[gridt.atom_pair_nbz * pipeline_index]; + //two dimension,record number to compute + para.matrix_A = &gridt.ap_left_gbl[gridt.atom_pair_nbz * pipeline_index]; + para.matrix_B = &gridt.ap_right_gbl[gridt.atom_pair_nbz * pipeline_index]; + para.matrix_C = &gridt.ap_output_gbl[gridt.atom_pair_nbz * pipeline_index]; + para.matrix_A_device + = &gridt.ap_left_gbl_g[gridt.atom_pair_nbz * pipeline_index]; + para.matrix_B_device + = &gridt.ap_right_gbl_g[gridt.atom_pair_nbz * pipeline_index]; + para.matrix_C_device + = &gridt.ap_output_gbl_g[gridt.atom_pair_nbz * pipeline_index]; +} +/** + * @brief frc_strs_iat on host and device Init + * + * GridParameter init + * + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Host + * @param stream_num int , record the stream in GPU + * @param cuda_block in stress compute,used for Block nums + * @param atom_num_grid in force calculate,used for Block nums + * @param max_size Maximum size of atoms on a grid. + * @param frc_strs_iat_gbl frc_strs_iat_gbl,contains the Force Stree Iat on Host + */ +void cal_init(frc_strs_iat& f_s_iat, + const int stream_num, + const int cuda_block, + const int atom_num_grid, + const int max_size, + frc_strs_iat_gbl& f_s_iat_dev) +{ + const int iat_min = -max_size - 1; + f_s_iat.stress_host = new double[6 * cuda_block]; + f_s_iat.stress_device + = &f_s_iat_dev.stress_global[6 * cuda_block * stream_num]; + f_s_iat.force_device + = &f_s_iat_dev.force_global[3 * atom_num_grid * stream_num]; + f_s_iat.iat_device + = &f_s_iat_dev.iat_global[atom_num_grid * stream_num]; + f_s_iat.iat_host = new int[atom_num_grid]; + for (int index = 0; index < atom_num_grid; index++) + { + f_s_iat.iat_host[index] = iat_min; + } + f_s_iat.force_host = new double[3 * atom_num_grid]; + ModuleBase::GlobalFunc::ZEROS(f_s_iat.force_host, + 3 * atom_num_grid); +} +/** + * @brief GridParameter memCpy,from Host to Device + * + * parameter init,which contains the gpu task and multi matrix multiplication + * + * @param para Grid parameter in task generator, + * @param gridt Grid_Technique,stored the major method in the the gint. + * @param nbz int,stand for the number of Z-axis + * @param atom_num_grid in force calculate,used for Block nums + */ +void para_mem_copy(grid_para& para, + const Grid_Technique& gridt, + const int nbz, + const int pipeline_index, + const int atom_num_grid) +{ + checkCuda(cudaMemcpyAsync(para.input_double_g, + para.input_dou, + gridt.psi_size_max * 5 * sizeof(double), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.input_int_g, + para.input_int, + gridt.psi_size_max * 2 * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.num_psir_g, + para.num_psir, + nbz * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.A_m_device, + para.atom_pair_A_m, + gridt.atom_pair_nbz * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.B_n_device, + para.atom_pair_B_n, + gridt.atom_pair_nbz * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.K_device, + para.atom_pair_K, + gridt.atom_pair_nbz * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.lda_device, + para.atom_pair_lda, + gridt.atom_pair_nbz * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.ldb_device, + para.atom_pair_ldb, + gridt.atom_pair_nbz * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.ldc_device, + para.atom_pair_ldc, + gridt.atom_pair_nbz * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.matrix_A_device, + para.matrix_A, + gridt.atom_pair_nbz * sizeof(double*), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.matrix_B_device, + para.matrix_B, + gridt.atom_pair_nbz * sizeof(double*), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemcpyAsync(para.matrix_C_device, + para.matrix_C, + gridt.atom_pair_nbz * sizeof(double*), + cudaMemcpyHostToDevice, + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_dm_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_r_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_lx_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_ly_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_lz_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_lxx_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_lxy_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_lxz_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_lyy_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_lyz_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); + checkCuda(cudaMemsetAsync(para.psir_lzz_device, + 0, + gridt.psir_size * sizeof(double), + gridt.streams[pipeline_index])); +} +/** + * @brief Force Stress Force Iat memCpy,from Host to Device + * + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device + * and Host + * @param gridt Grid_Technique,stored the major method in the the gint. + * @param atom_num_grid in force calculate,used for Block nums + * @param cuda_block in stress compute,used for Block nums + * @param stream_num int , record the stream in GPU + */ +void cal_mem_cpy(frc_strs_iat& f_s_iat, + const Grid_Technique& gridt, + const int atom_num_grid, + const int cuda_block, + const int stream_num) +{ + checkCuda(cudaMemcpyAsync(f_s_iat.iat_device, + f_s_iat.iat_host, + atom_num_grid * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[stream_num])); + checkCuda(cudaMemsetAsync(f_s_iat.stress_device, + 0, + 6 * cuda_block * sizeof(double), + gridt.streams[stream_num])); + checkCuda(cudaMemsetAsync(f_s_iat.force_device, + 0, + 3 * atom_num_grid * sizeof(double), + gridt.streams[stream_num])); +} +/* + * @brief Force Calculate on Host + * + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device + * and Host + * @param force stored the force for each atom on each directions + * @param atom_num_grid in force calculate,used for Block nums + */ +void cal_force_add(frc_strs_iat& f_s_iat, + std::vector& force, + const int atom_num_grid) +{ + checkCuda(cudaMemcpy(f_s_iat.force_host, + f_s_iat.force_device, + 3 * atom_num_grid * sizeof(double), + cudaMemcpyDeviceToHost)); + for (int index1 = 0; index1 < atom_num_grid; index1++) + { + int iat1 = f_s_iat.iat_host[index1]; + if (iat1 >= 0) + { + for (int index2 = 0; index2 < 3; index2++) + { + force[iat1 * 3 + index2] + += f_s_iat.force_host[index1 * 3 + index2]; + } + } + } +} +/** + * @brief Stress Calculate on Host + * + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device + * and Host + * @param stress stored the stress for each directions + * @param cuda_block in stress compute,used for Block nums + */ +void cal_stress_add(frc_strs_iat& f_s_iat, + std::vector& stress, + const int cuda_block) +{ + checkCuda(cudaMemcpy(f_s_iat.stress_host, + f_s_iat.stress_device, + 6 * cuda_block * sizeof(double), + cudaMemcpyDeviceToHost)); + for (int i = 0; i < 6; i++) + { + for (int index = 0; index < cuda_block; index++) + { + stress[i] += f_s_iat.stress_host[i * cuda_block + index]; + } + } +} } // namespace GintKernel diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu index a86c16cb5f..49db338235 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu @@ -252,368 +252,4 @@ __global__ void dot_product_force(double* psir_lx, tid += blockDim.x * gridDim.x; } } -void calculateInit(DensityMat& denstiy_mat, - frc_strs_iat_gbl& f_s_iat_dev, - hamilt::HContainer* dm, - const Grid_Technique& gridt, - const UnitCell& ucell, - int lgd, - int cuda_block, - int atom_num_grid) -{ - denstiy_mat.density_mat_h = new double[lgd * lgd]; - allocateDm(denstiy_mat.density_mat_h, dm, gridt, ucell); - - checkCuda(cudaMalloc((void**)&denstiy_mat.density_mat_d, - lgd * lgd * sizeof(double))); - checkCuda(cudaMemcpy(denstiy_mat.density_mat_d, - denstiy_mat.density_mat_h, - lgd * lgd * sizeof(double), - cudaMemcpyHostToDevice)); - - checkCuda(cudaMalloc((void**)&f_s_iat_dev.stress_global, - 6 * cuda_block * gridt.nstreams * sizeof(double))); - checkCuda(cudaMemset(f_s_iat_dev.stress_global, - 0, - 6 * cuda_block * gridt.nstreams * sizeof(double))); - - checkCuda(cudaMalloc((void**)&f_s_iat_dev.force_global, - 3 * atom_num_grid * gridt.nstreams * sizeof(double))); - checkCuda(cudaMemset(f_s_iat_dev.force_global, - 0, - 3 * atom_num_grid * gridt.nstreams * sizeof(double))); - - checkCuda(cudaMalloc((void**)&f_s_iat_dev.iat_global, - atom_num_grid * gridt.nstreams * sizeof(int))); - checkCuda(cudaMemset(f_s_iat_dev.iat_global, - 0, - atom_num_grid * gridt.nstreams * sizeof(int))); -} - -/** - * @brief grid parameter Init - * - * GridParameter init - * - * @param para double *,contained the destiyMatHost - * @param iter_num int , used for calcute the stream - * @param nbz int,stand for the number of Z-axis - * @param gridt Grid_Technique,stored the major method in the the gint. - */ -void para_init(grid_para& para, - int iter_num, - int nbz, - const Grid_Technique& gridt) -{ - - // stream_num stand for nstreams - para.stream_num = iter_num % gridt.nstreams; - //input_dou and input _int used for the Spherical Harmonics - para.input_dou - = &gridt.psi_dbl_gbl[gridt.psi_size_max * para.stream_num * 5]; - para.input_int - = &gridt.psi_int_gbl[gridt.psi_size_max * para.stream_num * 2]; - para.num_psir = &gridt.num_psir_gbl[nbz * para.stream_num]; - //one dimension,record the length and the leading dimension of three matrix - para.atom_pair_A_m - = &gridt.l_info_global[gridt.atom_pair_nbz * para.stream_num]; - para.atom_pair_B_n - = &gridt.r_info_global[gridt.atom_pair_nbz * para.stream_num]; - para.atom_pair_K - = &gridt.k_info_global[gridt.atom_pair_nbz * para.stream_num]; - para.atom_pair_lda - = &gridt.lda_info_global[gridt.atom_pair_nbz * para.stream_num]; - para.atom_pair_ldb - = &gridt.ldb_info_global[gridt.atom_pair_nbz * para.stream_num]; - para.atom_pair_ldc - = &gridt.ldc_info_global[gridt.atom_pair_nbz * para.stream_num]; - //input_double_g and input_int_g used for the Spherical Harmonics on GPU - para.input_double_g - = &gridt.psi_dbl_gbl_g[gridt.psi_size_max * para.stream_num * 5]; - para.input_int_g - = &gridt.psi_int_gbl_g[gridt.psi_size_max * para.stream_num * 2]; - para.num_psir_g = &gridt.num_psir_gbl_g[nbz * para.stream_num]; - para.psir_dm_device = &gridt.dm_global_g[gridt.psir_size * para.stream_num]; - para.psir_r_device - = &gridt.right_global_g[gridt.psir_size * para.stream_num]; - //psi function ,record the force in x y z,and the stress in six dimension - para.psir_lx_device = &gridt.d_left_x_g[gridt.psir_size * para.stream_num]; - para.psir_ly_device = &gridt.d_left_y_g[gridt.psir_size * para.stream_num]; - para.psir_lz_device = &gridt.d_left_z_g[gridt.psir_size * para.stream_num]; - para.psir_lxx_device - = &gridt.dd_left_xx_g[gridt.psir_size * para.stream_num]; - para.psir_lxy_device - = &gridt.dd_left_xy_g[gridt.psir_size * para.stream_num]; - para.psir_lxz_device - = &gridt.dd_left_xz_g[gridt.psir_size * para.stream_num]; - para.psir_lyy_device - = &gridt.dd_left_yy_g[gridt.psir_size * para.stream_num]; - para.psir_lyz_device - = &gridt.dd_left_yz_g[gridt.psir_size * para.stream_num]; - para.psir_lzz_device - = &gridt.dd_left_zz_g[gridt.psir_size * para.stream_num]; - //one dimension,record the length and the leading dimension of three matrix on GPU - para.A_m_device - = &gridt.l_info_global_g[gridt.atom_pair_nbz * para.stream_num]; - para.B_n_device - = &gridt.r_info_global_g[gridt.atom_pair_nbz * para.stream_num]; - para.K_device - = &gridt.k_info_global_g[gridt.atom_pair_nbz * para.stream_num]; - para.lda_device - = &gridt.lda_info_gbl_g[gridt.atom_pair_nbz * para.stream_num]; - para.ldb_device - = &gridt.ldb_info_gbl_g[gridt.atom_pair_nbz * para.stream_num]; - para.ldc_device - = &gridt.ldc_info_gbl_g[gridt.atom_pair_nbz * para.stream_num]; - //two dimension,record number to compute - para.matrix_A = &gridt.ap_left_gbl[gridt.atom_pair_nbz * para.stream_num]; - para.matrix_B = &gridt.ap_right_gbl[gridt.atom_pair_nbz * para.stream_num]; - para.matrix_C = &gridt.ap_output_gbl[gridt.atom_pair_nbz * para.stream_num]; - para.matrix_A_device - = &gridt.ap_left_gbl_g[gridt.atom_pair_nbz * para.stream_num]; - para.matrix_B_device - = &gridt.ap_right_gbl_g[gridt.atom_pair_nbz * para.stream_num]; - para.matrix_C_device - = &gridt.ap_output_gbl_g[gridt.atom_pair_nbz * para.stream_num]; -} -/** - * @brief frc_strs_iat on host and device Init - * - * GridParameter init - * - * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Host - * @param stream_num int , record the stream in GPU - * @param cuda_block in stress compute,used for Block nums - * @param atom_num_grid in force calculate,used for Block nums - * @param max_size Maximum size of atoms on a grid. - * @param frc_strs_iat_gbl frc_strs_iat_gbl,contains the Force Stree Iat on Host - */ -void cal_init(frc_strs_iat& f_s_iat, - const int stream_num, - const int cuda_block, - const int atom_num_grid, - const int max_size, - frc_strs_iat_gbl& f_s_iat_dev) -{ - const int iat_min = -max_size - 1; - f_s_iat.stress_host = new double[6 * cuda_block]; - f_s_iat.stress_device - = &f_s_iat_dev.stress_global[6 * cuda_block * stream_num]; - f_s_iat.force_device - = &f_s_iat_dev.force_global[3 * atom_num_grid * stream_num]; - f_s_iat.iat_device - = &f_s_iat_dev.iat_global[atom_num_grid * stream_num]; - f_s_iat.iat_host = new int[atom_num_grid]; - for (int index = 0; index < atom_num_grid; index++) - { - f_s_iat.iat_host[index] = iat_min; - } - f_s_iat.force_host = new double[3 * atom_num_grid]; - ModuleBase::GlobalFunc::ZEROS(f_s_iat.force_host, - 3 * atom_num_grid); -} - -/** - * @brief GridParameter memCpy,from Host to Device - * - * parameter init,which contains the gpu task and multi matrix multiplication - * - * @param para Grid parameter in task generator, - * @param gridt Grid_Technique,stored the major method in the the gint. - * @param nbz int,stand for the number of Z-axis - * @param atom_num_grid in force calculate,used for Block nums - */ -void para_mem_copy(grid_para& para, - const Grid_Technique& gridt, - const int nbz, - const int atom_num_grid) -{ - checkCuda(cudaMemcpyAsync(para.input_double_g, - para.input_dou, - gridt.psi_size_max * 5 * sizeof(double), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.input_int_g, - para.input_int, - gridt.psi_size_max * 2 * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.num_psir_g, - para.num_psir, - nbz * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.A_m_device, - para.atom_pair_A_m, - gridt.atom_pair_nbz * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.B_n_device, - para.atom_pair_B_n, - gridt.atom_pair_nbz * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.K_device, - para.atom_pair_K, - gridt.atom_pair_nbz * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.lda_device, - para.atom_pair_lda, - gridt.atom_pair_nbz * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.ldb_device, - para.atom_pair_ldb, - gridt.atom_pair_nbz * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.ldc_device, - para.atom_pair_ldc, - gridt.atom_pair_nbz * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.matrix_A_device, - para.matrix_A, - gridt.atom_pair_nbz * sizeof(double*), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.matrix_B_device, - para.matrix_B, - gridt.atom_pair_nbz * sizeof(double*), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemcpyAsync(para.matrix_C_device, - para.matrix_C, - gridt.atom_pair_nbz * sizeof(double*), - cudaMemcpyHostToDevice, - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_dm_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_r_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_lx_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_ly_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_lz_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_lxx_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_lxy_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_lxz_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_lyy_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_lyz_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); - checkCuda(cudaMemsetAsync(para.psir_lzz_device, - 0, - gridt.psir_size * sizeof(double), - gridt.streams[para.stream_num])); -} -/** - * @brief Force Stress Force Iat memCpy,from Host to Device - * - * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device - * and Host - * @param gridt Grid_Technique,stored the major method in the the gint. - * @param atom_num_grid in force calculate,used for Block nums - * @param cuda_block in stress compute,used for Block nums - * @param stream_num int , record the stream in GPU - */ -void cal_mem_cpy(frc_strs_iat& f_s_iat, - const Grid_Technique& gridt, - const int atom_num_grid, - const int cuda_block, - const int stream_num) -{ - checkCuda(cudaMemcpyAsync(f_s_iat.iat_device, - f_s_iat.iat_host, - atom_num_grid * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[stream_num])); - checkCuda(cudaMemsetAsync(f_s_iat.stress_device, - 0, - 6 * cuda_block * sizeof(double), - gridt.streams[stream_num])); - checkCuda(cudaMemsetAsync(f_s_iat.force_device, - 0, - 3 * atom_num_grid * sizeof(double), - gridt.streams[stream_num])); -} -/* - * @brief Force Calculate on Host - * - * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device - * and Host - * @param force stored the force for each atom on each directions - * @param atom_num_grid in force calculate,used for Block nums - */ -void cal_force_add(frc_strs_iat& f_s_iat, - std::vector& force, - const int atom_num_grid) -{ - checkCuda(cudaMemcpy(f_s_iat.force_host, - f_s_iat.force_device, - 3 * atom_num_grid * sizeof(double), - cudaMemcpyDeviceToHost)); - for (int index1 = 0; index1 < atom_num_grid; index1++) - { - int iat1 = f_s_iat.iat_host[index1]; - if (iat1 >= 0) - { - for (int index2 = 0; index2 < 3; index2++) - { - force[iat1 * 3 + index2] - += f_s_iat.force_host[index1 * 3 + index2]; - } - } - } -} -/** - * @brief Stress Calculate on Host - * - * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device - * and Host - * @param stress stored the stress for each directions - * @param cuda_block in stress compute,used for Block nums - */ -void cal_stress_add(frc_strs_iat& f_s_iat, - std::vector& stress, - const int cuda_block) -{ - checkCuda(cudaMemcpy(f_s_iat.stress_host, - f_s_iat.stress_device, - 6 * cuda_block * sizeof(double), - cudaMemcpyDeviceToHost)); - for (int i = 0; i < 6; i++) - { - for (int index = 0; index < cuda_block; index++) - { - stress[i] += f_s_iat.stress_host[i * cuda_block + index]; - } - } -} } // namespace GintKernel