diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md index 9e87b52dcc..970dd4fd72 100644 --- a/docs/advanced/input_files/input-main.md +++ b/docs/advanced/input_files/input-main.md @@ -61,7 +61,7 @@ - [search\_radius](#search_radius) - [search\_pbc](#search_pbc) - [bx, by, bz](#bx-by-bz) - - [num\_stream] (#num_stream) + - [num\_stream](#num_stream) - [Electronic structure](#electronic-structure) - [basis\_type](#basis_type) - [ks\_solver](#ks_solver) @@ -915,6 +915,7 @@ These variables are used to control the numerical atomic orbitals related parame - **Description**: choose the number of streams in GPU when we compute the `LCAO`. According to different devices , we may have different effects.For most devices,the stream is enough when the number is bigger then 2. - **Default** : "4" + [back to top](#full-list-of-input-keywords) ## Electronic structure diff --git a/source/module_esolver/esolver_ks_lcao_elec.cpp b/source/module_esolver/esolver_ks_lcao_elec.cpp index b3d8e975b5..263520e5f1 100644 --- a/source/module_esolver/esolver_ks_lcao_elec.cpp +++ b/source/module_esolver/esolver_ks_lcao_elec.cpp @@ -66,7 +66,8 @@ void ESolver_KS_LCAO::set_matrix_grid(Record_adj& ra) this->pw_rho->nplane, this->pw_rho->startz_current, GlobalC::ucell, - GlobalC::ORB); + GlobalC::ORB, + GlobalV::NUM_STREAM); // (2)For each atom, calculate the adjacent atoms in different cells // and allocate the space for H(R) and S(R). diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index 2b86e881dc..ae3205a187 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -36,7 +36,6 @@ Gint::~Gint() void Gint::cal_gint(Gint_inout* inout) { ModuleBase::timer::tick("Gint_interface", "cal_gint"); - if(inout->job==Gint_Tools::job_type::vlocal) { ModuleBase::TITLE("Gint_interface","cal_gint_vlocal"); @@ -132,45 +131,47 @@ void Gint::cal_gint(Gint_inout* inout) { const int ncyz = this->ny * this->nplane; int nat = ucell.nat; - // for (int is = 0; is < GlobalV::NSPIN; ++is) - // { - double *force = new double[ucell.nat * 3]; - for (int i = 0; i < nat * 3; i++) - { - force[i] = 0.0; - } - double *stress = new double[6]; - for (int i = 0; i < 6; i++) - { - stress[i] = 0.0; - } - GintKernel::gint_gamma_force_gpu(this->DMRGint[inout->ispin], - ucell.omega - / this->ncxyz, - inout->vl, - force, - stress, - this->nplane, - dr, - rcut, - *this->gridt, - ucell); + const int isforce = inout->isforce; + const int isstress =inout->isstress; + ModuleBase::TITLE("Gint_interface","cal_force_gpu"); + ModuleBase::timer::tick("Gint_interface","cal_force_gpu"); + if (isforce || isstress){ + std::vector force(nat * 3, 0.0); + std::vector stress(6, 0.0); + GintKernel::gint_fvl_gamma_gpu(this->DMRGint[inout->ispin], + ucell.omega + / this->ncxyz, + inout->vl, + force, + stress, + this->nplane, + dr, + rcut, + isforce, + isstress, + *this->gridt, + ucell); + if (inout->isforce) + { for (int iat = 0; iat < nat; iat++) { inout->fvl_dphi[0](iat, 0) += force[iat * 3]; inout->fvl_dphi[0](iat, 1) += force[iat * 3 + 1]; inout->fvl_dphi[0](iat, 2) += force[iat * 3 + 2]; } - inout->svl_dphi[0](0, 0) += stress[0]; - inout->svl_dphi[0](0, 1) += stress[1]; - inout->svl_dphi[0](0, 2) += stress[2]; - inout->svl_dphi[0](1, 1) += stress[3]; - inout->svl_dphi[0](1, 2) += stress[4]; - inout->svl_dphi[0](2, 2) += stress[5]; - - delete[] force; - delete[] stress; - // } + } + if (inout->isstress){ + inout->svl_dphi[0](0, 0) += stress[0]; + inout->svl_dphi[0](0, 1) += stress[1]; + inout->svl_dphi[0](0, 2) += stress[2]; + inout->svl_dphi[0](1, 1) += stress[3]; + inout->svl_dphi[0](1, 2) += stress[4]; + inout->svl_dphi[0](2, 2) += stress[5]; + } + force.clear(); + stress.clear(); + } + ModuleBase::timer::tick("Gint_interface","cal_force_gpu"); } } else @@ -310,7 +311,6 @@ void Gint::cal_gint(Gint_inout* inout) this->nplane, this->gridt->start_ind[grid_index], ncyz, dv); double** DM_in; - if(GlobalV::GAMMA_ONLY_LOCAL) { DM_in = inout->DM[GlobalV::CURRENT_SPIN]; diff --git a/source/module_hamilt_lcao/module_gint/gint_force.h b/source/module_hamilt_lcao/module_gint/gint_force.h index 6b6718e142..51851082de 100644 --- a/source/module_hamilt_lcao/module_gint/gint_force.h +++ b/source/module_hamilt_lcao/module_gint/gint_force.h @@ -44,7 +44,7 @@ typedef struct double** matrix_A_device; double** matrix_B_device; double** matrix_C_device; -} SGridParameter; +} grid_para; typedef struct { @@ -55,14 +55,14 @@ typedef struct int* iat_device; int* iat_host; -} ForceStressIat; +} frc_strs_iat; typedef struct { double* stress_global; double* force_global; int* iat_global; -} ForceStressIatGlobal; +} frc_strs_iat_gbl; typedef struct { @@ -84,14 +84,16 @@ typedef struct * @param ylmcoef_now Coefficients for spherical harmonics. * @param gridt Reference to Grid_Technique object. */ -void gint_gamma_force_gpu(hamilt::HContainer* dm, +void gint_fvl_gamma_gpu(hamilt::HContainer* dm, const double vfactor, const double* vlocal, - double* force, - double* stress, + std::vector& force, + std::vector& stress, const int nczp, double dr, double* rcut, + const int isforce, + const int isstress, const Grid_Technique& gridt, const UnitCell& ucell); @@ -136,7 +138,7 @@ void gpu_task_generator_force(const Grid_Technique& gridt, int& max_m, int& max_n, int& atom_pair_num, - SGridParameter& para); + grid_para& para); /** * @brief Density Matrix,force Stress Iat Init * @@ -144,7 +146,7 @@ void gpu_task_generator_force(const Grid_Technique& gridt, * * @param denstiy_mat DensityMat,contained the density_mat_dice and * destiyMatHost - * @param f_s_iat_dev ForceStressIatGlobal,contined the Force Stress and + * @param f_s_iat_dev frc_strs_iat_gbl,contined the Force Stress and * Iat Number * @param dm hamilt::HContainer,denstiy stored in the Hcontainer * @param gridt Grid_Technique,stored the major method in the the gint. @@ -154,7 +156,7 @@ void gpu_task_generator_force(const Grid_Technique& gridt, * @param atom_num_grid in force calculate,used for Block nums */ void calculateInit(DensityMat& denstiy_mat, - ForceStressIatGlobal& f_s_iat_dev, + frc_strs_iat_gbl& f_s_iat_dev, hamilt::HContainer* dm, const Grid_Technique& gridt, const UnitCell& ucell, @@ -187,28 +189,28 @@ void allocateDm(double* matrix_host, * @param nbz int,stand for the number of Z-axis * @param gridt Grid_Technique,stored the major method in the the gint. */ -void para_init(SGridParameter& para, +void para_init(grid_para& para, const int iter_num, const int nbz, const Grid_Technique& gridt); /** - * @brief ForceStressIat on host and device Init + * @brief frc_strs_iat on host and device Init * * GridParameter init * - * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Host + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Host * @param stream_num int , record the stream in GPU * @param cuda_block in stress compute,used for Block nums * @param atom_num_grid in force calculate,used for Block nums * @param max_size Maximum size of atoms on a grid. - * @param ForceStressIatGlobal ForceStressIatGlobal,contains the Force Stree Iat on Host + * @param frc_strs_iat_gbl frc_strs_iat_gbl,contains the Force Stree Iat on Host */ -void cal_init(ForceStressIat& f_s_iat, +void cal_init(frc_strs_iat& f_s_iat, const int stream_num, const int cuda_block, const int atom_num_grid, const int max_size, - const ForceStressIatGlobal& f_s_iatg); + frc_strs_iat_gbl& f_s_iatg); /** * @brief GridParameter memCpy,from Host to Device * @@ -219,21 +221,21 @@ void cal_init(ForceStressIat& f_s_iat, * @param nbz int,stand for the number of Z-axis * @param atom_num_grid in force calculate,used for Block nums */ -void para_mem_copy(SGridParameter& para, +void para_mem_copy(grid_para& para, const Grid_Technique& gridt, const int nbz, const int atom_num_grid); /** * @brief Force Stress Force Iat memCpy,from Host to Device * - * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device * and Host * @param gridt Grid_Technique,stored the major method in the the gint. * @param atom_num_grid in force calculate,used for Block nums * @param cuda_block in stress compute,used for Block nums * @param stream_num int , record the stream in GPU */ -void cal_mem_cpy(ForceStressIat& f_s_iat, +void cal_mem_cpy(frc_strs_iat& f_s_iat, const Grid_Technique& gridt, const int atom_num_grid, const int cuda_block, @@ -241,24 +243,24 @@ void cal_mem_cpy(ForceStressIat& f_s_iat, /** * @brief Force Calculate on Host * - * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device * and Host * @param force stored the force for each atom on each directions * @param atom_num_grid in force calculate,used for Block nums */ -void cal_force_add(ForceStressIat& f_s_iat, - double* force, +void cal_force_add(frc_strs_iat& f_s_iat, + std::vector& force, const int atom_num_grid); /** * @brief Stress Calculate on Host * - * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device * and Host * @param stress stored the stress for each directions * @param cuda_block in stress compute,used for Block nums */ -void cal_stress_add(ForceStressIat& f_s_iat, - double* stress, +void cal_stress_add(frc_strs_iat& f_s_iat, + std::vector& stress, const int cuda_block); } // namespace GintKernel #endif diff --git a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu index 30ea8fa6ab..3718f48819 100644 --- a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu @@ -8,13 +8,12 @@ #include "kernels/cuda/gint_force.cuh" #include "module_base/ylm.h" #include "module_hamilt_lcao/module_gint/gint_tools.h" - namespace GintKernel { // Function to calculate forces using GPU-accelerated gamma point Gint /** - * @brief Calculate forces and stresses for the `gint_gamma_force_gpu` function. + * @brief Calculate forces and stresses for the `gint_fvl_gamma_gpu` function. * * This function calculates forces and stresses based on given parameters. * @@ -30,7 +29,7 @@ namespace GintKernel */ /** * Function to calculate forces using GPU-accelerated gamma point Gint - * @brief Calculate forces and stresses for the `gint_gamma_force_gpu` function. + * @brief Calculate forces and stresses for the `gint_fvl_gamma_gpu` function. * * This function calculates forces and stresses based on given parameters. * @@ -55,14 +54,16 @@ namespace GintKernel * 6. force dot on the GPU. * 7. Copy the results back to the host. */ -void gint_gamma_force_gpu(hamilt::HContainer* dm, +void gint_fvl_gamma_gpu(hamilt::HContainer* dm, const double vfactor, const double* vlocal, - double* force, - double* stress, + std::vector& force, + std::vector& stress, const int nczp, double dr, double* rcut, + const int isforce, + const int isstress, const Grid_Technique& gridt, const UnitCell& ucell) { @@ -77,9 +78,9 @@ void gint_gamma_force_gpu(hamilt::HContainer* dm, = std::min(64, (gridt.psir_size + cuda_threads - 1) / cuda_threads); int iter_num = 0; DensityMat denstiy_mat; - ForceStressIatGlobal f_s_iat_dev; - SGridParameter para; - ForceStressIat f_s_iat; + frc_strs_iat_gbl f_s_iat_dev; + grid_para para; + frc_strs_iat f_s_iat; calculateInit(denstiy_mat, f_s_iat_dev, @@ -104,13 +105,13 @@ void gint_gamma_force_gpu(hamilt::HContainer* dm, int max_m = 0; int max_n = 0; int atom_pair_num = 0; - dim3 grid_psi(nbz, 8); + dim3 grid_psi(nbz, 32); dim3 block_psi(64); dim3 grid_dot_force(cuda_block); dim3 block_dot_force(cuda_threads); dim3 grid_dot(cuda_block); dim3 block_dot(cuda_threads); - + para_init(para, iter_num, nbz, gridt); cal_init(f_s_iat, para.stream_num, @@ -118,8 +119,6 @@ void gint_gamma_force_gpu(hamilt::HContainer* dm, atom_num_grid, max_size, f_s_iat_dev); - checkCuda(cudaStreamSynchronize(gridt.streams[para.stream_num])); - /*gpu task compute in CPU */ gpu_task_generator_force(gridt, ucell, @@ -150,6 +149,7 @@ void gint_gamma_force_gpu(hamilt::HContainer* dm, para.stream_num); checkCuda(cudaStreamSynchronize(gridt.streams[para.stream_num])); /* cuda stream compute and Multiplication of multinomial matrices */ + get_psi_force<<* dm, atom_pair_num, gridt.streams[para.stream_num], nullptr); - - checkCuda(cudaStreamSynchronize(gridt.streams[para.stream_num])); /* force compute in GPU */ + if (isforce){ dot_product_force<<* dm, nwmax, max_size, gridt.psir_size / nwmax); - /* force compute in CPU*/ - cal_force_add(f_s_iat, force, atom_num_grid); - + } /*stress compute in GPU*/ + if (isstress){ dot_product_stress<<* dm, para.psir_dm_device, f_s_iat.stress_device, gridt.psir_size); + } /* stress compute in CPU*/ - cal_stress_add(f_s_iat, stress, cuda_block); + if (isstress){ + cal_stress_add(f_s_iat, stress, cuda_block); + } + if (isforce){ + cal_force_add(f_s_iat, force, atom_num_grid); + } iter_num++; + delete[] f_s_iat.stress_host; + delete[] f_s_iat.force_host; + delete[] f_s_iat.iat_host; } } - // cudaFree(f_s_iat.stress_device); - // cudaFree(f_s_iat.force_device); - // cudaFree(f_s_iat.iat_device); - delete[] f_s_iat.stress_host; - delete[] f_s_iat.force_host; - delete[] f_s_iat.iat_host; + delete[] denstiy_mat.density_mat_h; /*free variables in CPU host*/ for (int i = 0; i < gridt.nstreams; i++) { diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.cpp b/source/module_hamilt_lcao/module_gint/grid_technique.cpp index fdcfdc32eb..2eb74becc1 100644 --- a/source/module_hamilt_lcao/module_gint/grid_technique.cpp +++ b/source/module_hamilt_lcao/module_gint/grid_technique.cpp @@ -127,7 +127,8 @@ void Grid_Technique::set_pbc_grid(const int& ncx_in, const int& nplane, const int& startz_current, const UnitCell& ucell, - const LCAO_Orbitals& orb) + const LCAO_Orbitals& orb, + const int num_stream) { ModuleBase::TITLE("Grid_Technique", "init"); ModuleBase::timer::tick("Grid_Technique", "init"); @@ -184,7 +185,7 @@ void Grid_Technique::set_pbc_grid(const int& ncx_in, #if ((defined __CUDA) /* || (defined __ROCM) */) if(GlobalV::device_flag == "gpu") { - this->init_gpu_gint_variables(ucell,orb); + this->init_gpu_gint_variables(ucell,orb,num_stream); } #endif @@ -638,12 +639,14 @@ void Grid_Technique::cal_trace_lo(const UnitCell& ucell) #if ((defined __CUDA) /* || (defined __ROCM) */) -void Grid_Technique::init_gpu_gint_variables(const UnitCell& ucell,const LCAO_Orbitals &orb) +void Grid_Technique::init_gpu_gint_variables(const UnitCell& ucell,const LCAO_Orbitals &orb,const int num_stream) { if (is_malloced) { free_gpu_gint_variables(this->nat); } + nstreams = num_stream; + streams=new cudaStream_t[nstreams]; double ylmcoef[100]; ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100); for (int i = 0; i < 100; i++) diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.h b/source/module_hamilt_lcao/module_gint/grid_technique.h index 898b0ffb51..b51cb30686 100644 --- a/source/module_hamilt_lcao/module_gint/grid_technique.h +++ b/source/module_hamilt_lcao/module_gint/grid_technique.h @@ -102,7 +102,8 @@ class Grid_Technique : public Grid_MeshBall const int& nplane, const int& startz_current, const UnitCell& ucell, - const LCAO_Orbitals& orb); + const LCAO_Orbitals& orb, + const int num_stream); /// number of elements(basis-pairs) in this processon /// on all adjacent atoms-pairs(Grid division) @@ -162,8 +163,8 @@ class Grid_Technique : public Grid_MeshBall int atom_pair_mesh; int atom_pair_nbz; - const int nstreams = 4; - cudaStream_t streams[4]; + int nstreams=4 ; + cudaStream_t* streams; // streams[nstreams] // TODO it needs to be implemented through configuration files @@ -229,7 +230,7 @@ class Grid_Technique : public Grid_MeshBall matrix_multiple_func_type fastest_matrix_mul; private: - void init_gpu_gint_variables(const UnitCell& ucell,const LCAO_Orbitals &orb); + void init_gpu_gint_variables(const UnitCell& ucell,const LCAO_Orbitals &orb,const int num_stream); void free_gpu_gint_variables(int nat); #endif diff --git a/source/module_hamilt_lcao/module_gint/gtask_force.cpp b/source/module_hamilt_lcao/module_gint/gtask_force.cpp index 7c30d3db83..e785b4192b 100644 --- a/source/module_hamilt_lcao/module_gint/gtask_force.cpp +++ b/source/module_hamilt_lcao/module_gint/gtask_force.cpp @@ -56,7 +56,7 @@ void gpu_task_generator_force(const Grid_Technique& gridt, int& max_m, int& max_n, int& atom_pair_num, - SGridParameter& para) + grid_para& para) { const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp; const int nwmax = ucell.nwmax; diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu index bbc1d7dcb8..a86c16cb5f 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu @@ -253,7 +253,7 @@ __global__ void dot_product_force(double* psir_lx, } } void calculateInit(DensityMat& denstiy_mat, - ForceStressIatGlobal& f_s_iat_dev, + frc_strs_iat_gbl& f_s_iat_dev, hamilt::HContainer* dm, const Grid_Technique& gridt, const UnitCell& ucell, @@ -300,7 +300,7 @@ void calculateInit(DensityMat& denstiy_mat, * @param nbz int,stand for the number of Z-axis * @param gridt Grid_Technique,stored the major method in the the gint. */ -void para_init(SGridParameter& para, +void para_init(grid_para& para, int iter_num, int nbz, const Grid_Technique& gridt) @@ -377,23 +377,23 @@ void para_init(SGridParameter& para, = &gridt.ap_output_gbl_g[gridt.atom_pair_nbz * para.stream_num]; } /** - * @brief ForceStressIat on host and device Init + * @brief frc_strs_iat on host and device Init * * GridParameter init * - * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Host + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Host * @param stream_num int , record the stream in GPU * @param cuda_block in stress compute,used for Block nums * @param atom_num_grid in force calculate,used for Block nums * @param max_size Maximum size of atoms on a grid. - * @param ForceStressIatGlobal ForceStressIatGlobal,contains the Force Stree Iat on Host + * @param frc_strs_iat_gbl frc_strs_iat_gbl,contains the Force Stree Iat on Host */ -void cal_init(ForceStressIat& f_s_iat, +void cal_init(frc_strs_iat& f_s_iat, const int stream_num, const int cuda_block, const int atom_num_grid, const int max_size, - const ForceStressIatGlobal& f_s_iat_dev) + frc_strs_iat_gbl& f_s_iat_dev) { const int iat_min = -max_size - 1; f_s_iat.stress_host = new double[6 * cuda_block]; @@ -423,7 +423,7 @@ void cal_init(ForceStressIat& f_s_iat, * @param nbz int,stand for the number of Z-axis * @param atom_num_grid in force calculate,used for Block nums */ -void para_mem_copy(SGridParameter& para, +void para_mem_copy(grid_para& para, const Grid_Technique& gridt, const int nbz, const int atom_num_grid) @@ -536,14 +536,14 @@ void para_mem_copy(SGridParameter& para, /** * @brief Force Stress Force Iat memCpy,from Host to Device * - * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device * and Host * @param gridt Grid_Technique,stored the major method in the the gint. * @param atom_num_grid in force calculate,used for Block nums * @param cuda_block in stress compute,used for Block nums * @param stream_num int , record the stream in GPU */ -void cal_mem_cpy(ForceStressIat& f_s_iat, +void cal_mem_cpy(frc_strs_iat& f_s_iat, const Grid_Technique& gridt, const int atom_num_grid, const int cuda_block, @@ -566,13 +566,13 @@ void cal_mem_cpy(ForceStressIat& f_s_iat, /* * @brief Force Calculate on Host * - * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device * and Host * @param force stored the force for each atom on each directions * @param atom_num_grid in force calculate,used for Block nums */ -void cal_force_add(ForceStressIat& f_s_iat, - double* force, +void cal_force_add(frc_strs_iat& f_s_iat, + std::vector& force, const int atom_num_grid) { checkCuda(cudaMemcpy(f_s_iat.force_host, @@ -595,13 +595,13 @@ void cal_force_add(ForceStressIat& f_s_iat, /** * @brief Stress Calculate on Host * - * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device + * @param frc_strs_iat frc_strs_iat,contains the Force Stree Iat on Device * and Host * @param stress stored the stress for each directions * @param cuda_block in stress compute,used for Block nums */ -void cal_stress_add(ForceStressIat& f_s_iat, - double* stress, +void cal_stress_add(frc_strs_iat& f_s_iat, + std::vector& stress, const int cuda_block) { checkCuda(cudaMemcpy(f_s_iat.stress_host, @@ -612,7 +612,6 @@ void cal_stress_add(ForceStressIat& f_s_iat, { for (int index = 0; index < cuda_block; index++) { - // printf("the stress is %f\n",stress[i]); stress[i] += f_s_iat.stress_host[i * cuda_block + index]; } }