Perf: reduce data transmission in the GPU code of module_gint (#4356)
* reduce data transmission in gint_rho_gpu

* reduce data transmission in gint_vl_gpu

* reduce data transmission in gint_force_gpu

* add const qualifier

* add const qualifier
dzzz2001 authored Jun 17, 2024
1 parent 82686dc commit bc97512
Showing 17 changed files with 191 additions and 139 deletions.
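The recurring pattern across these files: each pinned staging buffer is sized for the worst case, but after task generation only a known prefix is populated (`atom_per_z` tasks, `atom_pair_num` GEMM descriptors), so the host-to-device copies now pass an explicit element count and transfer just that prefix. A minimal sketch of the idea, assuming a wrapper that keeps one pinned host slice and one device slice per stream; the repository's actual Cuda_Mem_Wrapper layout and default argument are assumptions here, not quoted from the source:

// Hedged sketch, not the repository's Cuda_Mem_Wrapper: per-stream pinned
// host slices mirrored on the device, with a size-limited async copy.
#include <cuda_runtime.h>
#include <cstddef>

template <typename T>
struct MemWrapperSketch
{
    T* host_ptr;                 // pinned host memory, one_stream_size * stream_num elements
    T* device_ptr;               // device memory with the same per-stream layout
    std::size_t one_stream_size;

    // Copy only the first `count` elements of stream `sid`'s slice.
    // count == 0 falls back to the whole slice (the old behaviour).
    void copy_host_to_device_async(cudaStream_t stream, int sid, std::size_t count = 0)
    {
        const std::size_t n = (count == 0) ? one_stream_size : count;
        cudaMemcpyAsync(device_ptr + sid * one_stream_size,
                        host_ptr + sid * one_stream_size,
                        n * sizeof(T),
                        cudaMemcpyHostToDevice,
                        stream);
    }
};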
58 changes: 31 additions & 27 deletions source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
@@ -61,6 +61,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
 Cuda_Mem_Wrapper<double> psi_input_double(5 * max_atom_per_z, num_streams, true);
 Cuda_Mem_Wrapper<int> psi_input_int(2 * max_atom_per_z, num_streams, true);
 Cuda_Mem_Wrapper<int> atom_num_per_bcell(nbzp, num_streams, true);
+Cuda_Mem_Wrapper<int> start_idx_per_bcell(nbzp, num_streams, true);
 
 Cuda_Mem_Wrapper<double> psi(max_phi_per_z, num_streams, false);
 Cuda_Mem_Wrapper<double> psi_dm(max_phi_per_z, num_streams, false);
@@ -131,26 +132,28 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
 int max_m = 0;
 int max_n = 0;
 int atom_pair_num = 0;
+int atom_per_z = 0;
 const int grid_index_ij = i * gridt.nby * nbzp
 + j * nbzp;
 
 std::vector<bool> gpu_mat_cal_flag(max_atom * nbzp, false);
 
-gpu_task_generator_force(gridt,
-ucell,
-grid_index_ij,
-max_atom_per_bcell,
-max_atom,
-nczp,
-vfactor,
-rcut,
-vlocal,
-psi_input_double.get_host_pointer(sid),
-psi_input_int.get_host_pointer(sid),
-atom_num_per_bcell.get_host_pointer(sid),
-iat_per_z.get_host_pointer(sid),
-atom_pair_num,
-gpu_mat_cal_flag);
+gtask_force(gridt,
+ucell,
+grid_index_ij,
+max_atom_per_bcell,
+max_atom,
+nczp,
+vfactor,
+rcut,
+vlocal,
+psi_input_double.get_host_pointer(sid),
+psi_input_int.get_host_pointer(sid),
+atom_num_per_bcell.get_host_pointer(sid),
+start_idx_per_bcell.get_host_pointer(sid),
+iat_per_z.get_host_pointer(sid),
+atom_per_z,
+gpu_mat_cal_flag);
 
 alloc_mult_force(gridt,
 ucell,
@@ -173,19 +176,20 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
 gemm_C.get_host_pointer(sid),
 gpu_mat_cal_flag);
 
-psi_input_double.copy_host_to_device_async(streams[sid], sid);
-psi_input_int.copy_host_to_device_async(streams[sid], sid);
+psi_input_double.copy_host_to_device_async(streams[sid], sid, 5 * atom_per_z);
+psi_input_int.copy_host_to_device_async(streams[sid], sid, 2 * atom_per_z);
 atom_num_per_bcell.copy_host_to_device_async(streams[sid], sid);
+start_idx_per_bcell.copy_host_to_device_async(streams[sid], sid);
 iat_per_z.copy_host_to_device_async(streams[sid], sid);
-gemm_m.copy_host_to_device_async(streams[sid], sid);
-gemm_n.copy_host_to_device_async(streams[sid], sid);
-gemm_k.copy_host_to_device_async(streams[sid], sid);
-gemm_lda.copy_host_to_device_async(streams[sid], sid);
-gemm_ldb.copy_host_to_device_async(streams[sid], sid);
-gemm_ldc.copy_host_to_device_async(streams[sid], sid);
-gemm_A.copy_host_to_device_async(streams[sid], sid);
-gemm_B.copy_host_to_device_async(streams[sid], sid);
-gemm_C.copy_host_to_device_async(streams[sid], sid);
+gemm_m.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_n.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_k.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_lda.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_ldb.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_ldc.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
 
 psi.memset_device_async(streams[sid], sid, 0);
 psi_dm.memset_device_async(streams[sid], sid, 0);
@@ -212,7 +216,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
 psi_input_double.get_device_pointer(sid),
 psi_input_int.get_device_pointer(sid),
 atom_num_per_bcell.get_device_pointer(sid),
-max_atom_per_bcell,
+start_idx_per_bcell.get_device_pointer(sid),
 gridt.atom_nwl_g,
 gridt.atom_new_g,
 gridt.atom_ylm_g,
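Because tasks are now packed back to back instead of padded to `max_atom_per_bcell` slots per big cell, the kernels receive a per-bcell start offset (`start_idx_per_bcell`) in place of the old fixed stride. A hedged sketch of the consuming side; the kernel name, output argument, and trivial body are illustrative only, not the repository's actual force kernel, but the 5-double/2-int task layout matches the buffer sizing above:

// Hypothetical kernel showing dense indexing via the prefix offsets.
__global__ void consume_packed_tasks(const double* psi_input_double,
                                     const int* psi_input_int,
                                     const int* atom_num_per_bcell,
                                     const int* start_idx_per_bcell,
                                     double* out)
{
    const int bcell_id = blockIdx.x;
    // Old layout: const int base = bcell_id * max_atom_per_bcell; (padded slots)
    // New layout: tasks are contiguous; each bcell starts at its exclusive prefix sum.
    const int base = start_idx_per_bcell[bcell_id];
    for (int i = threadIdx.x; i < atom_num_per_bcell[bcell_id]; i += blockDim.x)
    {
        const double* task_d = psi_input_double + 5 * (base + i); // 5 doubles per task
        const int* task_i = psi_input_int + 2 * (base + i);       // 2 ints per task
        out[base + i] = task_d[0] * static_cast<double>(task_i[0]); // placeholder use
    }
}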
33 changes: 17 additions & 16 deletions source/module_hamilt_lcao/module_gint/gint_force_gpu.h
@@ -22,21 +22,22 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
 * This function generates GPU tasks for force calculations.
 */
 
-void gpu_task_generator_force(const Grid_Technique& gridt,
-const UnitCell& ucell,
-const int grid_index_ij,
-const int max_atom_per_bcell,
-const int max_atom,
-const int nczp,
-const double vfactor,
-const double* rcut,
-const double* vlocal_global_value,
-double* psi_input_double,
-int* psi_input_int,
-int* atom_num_per_bcell,
-int* iat_per_z,
-int& atom_pair_num,
-std::vector<bool>& gpu_mat_cal_flag);
+void gtask_force(const Grid_Technique& gridt,
+const UnitCell& ucell,
+const int grid_index_ij,
+const int max_atom_per_bcell,
+const int max_atom,
+const int nczp,
+const double vfactor,
+const double* rcut,
+const double* vlocal_global_value,
+double* psi_input_double,
+int* psi_input_int,
+int* atom_num_per_bcell,
+int* start_idx_per_bcell,
+int* iat_per_z,
+int& atom_per_z,
+std::vector<bool>& gpu_mat_cal_flag);
 
 void alloc_mult_force(const Grid_Technique& gridt,
 const UnitCell& ucell,
@@ -57,7 +58,7 @@ void alloc_mult_force(const Grid_Technique& gridt,
 double** mat_A,
 double** mat_B,
 double** mat_C,
-std::vector<bool>& gpu_mat_cal_flag);
+const std::vector<bool>& gpu_mat_cal_flag);
 
 } // namespace GintKernel
 #endif
33 changes: 19 additions & 14 deletions source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
@@ -37,6 +37,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
 Cuda_Mem_Wrapper<double> psi_input_double(4 * max_atom_per_z, num_streams, true);
 Cuda_Mem_Wrapper<int> psi_input_int(2 * max_atom_per_z, num_streams, true);
 Cuda_Mem_Wrapper<int> atom_num_per_bcell(nbzp, num_streams, true);
+Cuda_Mem_Wrapper<int> start_idx_per_bcell(nbzp, num_streams, true);
 
 Cuda_Mem_Wrapper<double> psi(max_phi_per_z, num_streams, false);
 Cuda_Mem_Wrapper<double> psi_dm(max_phi_per_z, num_streams, false);
@@ -98,6 +99,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
 int max_m = 0;
 int max_n = 0;
 int atom_pair_num = 0;
+int atom_per_z = 0;
 const int grid_index_ij = i * gridt.nby * nbzp + j * nbzp;
 std::vector<bool> gpu_matrix_cal_flag(max_atom * nbzp, false);
 
@@ -111,7 +113,9 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
 rcut,
 psi_input_double.get_host_pointer(sid),
 psi_input_int.get_host_pointer(sid),
-atom_num_per_bcell.get_host_pointer(sid));
+atom_num_per_bcell.get_host_pointer(sid),
+start_idx_per_bcell.get_host_pointer(sid),
+atom_per_z);
 
 alloc_mult_dot_rho(gridt,
 ucell,
@@ -139,19 +143,20 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
 rho_g.get_device_pointer(),
 dot_product.get_host_pointer(sid));
 
-psi_input_double.copy_host_to_device_async(streams[sid], sid);
-psi_input_int.copy_host_to_device_async(streams[sid], sid);
+psi_input_double.copy_host_to_device_async(streams[sid], sid, 4 * atom_per_z);
+psi_input_int.copy_host_to_device_async(streams[sid], sid, 2 * atom_per_z);
 atom_num_per_bcell.copy_host_to_device_async(streams[sid], sid);
-gemm_alpha.copy_host_to_device_async(streams[sid], sid);
-gemm_m.copy_host_to_device_async(streams[sid], sid);
-gemm_n.copy_host_to_device_async(streams[sid], sid);
-gemm_k.copy_host_to_device_async(streams[sid], sid);
-gemm_lda.copy_host_to_device_async(streams[sid], sid);
-gemm_ldb.copy_host_to_device_async(streams[sid], sid);
-gemm_ldc.copy_host_to_device_async(streams[sid], sid);
-gemm_A.copy_host_to_device_async(streams[sid], sid);
-gemm_B.copy_host_to_device_async(streams[sid], sid);
-gemm_C.copy_host_to_device_async(streams[sid], sid);
+start_idx_per_bcell.copy_host_to_device_async(streams[sid], sid);
+gemm_alpha.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_m.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_n.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_k.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_lda.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_ldb.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_ldc.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
 dot_product.copy_host_to_device_async(streams[sid], sid);
 
 psi.memset_device_async(streams[sid], sid, 0);
@@ -168,7 +173,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
 psi_input_double.get_device_pointer(sid),
 psi_input_int.get_device_pointer(sid),
 atom_num_per_bcell.get_device_pointer(sid),
-max_atom_per_bcell,
+start_idx_per_bcell.get_device_pointer(sid),
 gridt.atom_nwl_g,
 gridt.atom_new_g,
 gridt.atom_ylm_g,
6 changes: 4 additions & 2 deletions source/module_hamilt_lcao/module_gint/gint_rho_gpu.h
@@ -55,7 +55,9 @@ void gtask_rho(const Grid_Technique& gridt,
 const double* rcut,
 double* psi_input_double,
 int* psi_input_int,
-int* atom_num_per_bcell);
+int* atom_num_per_bcell,
+int* start_idx_per_bcell,
+int& atom_per_z);
 
 /**
 * Allocate resources and perform matrix multiplication and vector dot products
@@ -89,7 +91,7 @@ void gtask_rho(const Grid_Technique& gridt,
 */
 void alloc_mult_dot_rho(const Grid_Technique& gridt,
 const UnitCell& ucell,
-std::vector<bool>& gpu_mat_cal_flag,
+const std::vector<bool>& gpu_mat_cal_flag,
 const int grid_index_ij,
 const int max_size,
 const int lgd,
33 changes: 19 additions & 14 deletions source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
@@ -96,6 +96,7 @@ void gint_gamma_vl_gpu(hamilt::HContainer<double>* hRGint,
 Cuda_Mem_Wrapper<double> psi_input_double(5 * max_atom_per_z, num_streams, true);
 Cuda_Mem_Wrapper<int> psi_input_int(2 * max_atom_per_z, num_streams, true);
 Cuda_Mem_Wrapper<int> atom_num_per_bcell(nbzp, num_streams, true);
+Cuda_Mem_Wrapper<int> start_idx_per_bcell(nbzp, num_streams, true);
 
 Cuda_Mem_Wrapper<double> psi(max_phi_per_z, num_streams, false);
 Cuda_Mem_Wrapper<double> psi_vldr3(max_phi_per_z, num_streams, false);
@@ -118,9 +119,10 @@ void gint_gamma_vl_gpu(hamilt::HContainer<double>* hRGint,
 const int sid = omp_get_thread_num();
 checkCuda(cudaStreamSynchronize(streams[sid]));
 
-int atom_pair_num = 0;
 int max_m = 0;
 int max_n = 0;
+int atom_pair_num = 0;
+int atom_per_z = 0;
 const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp;
 std::vector<bool> gpu_matrix_calc_flag(max_atom * nbzp, false);
 gtask_vlocal(gridt,
@@ -134,7 +136,9 @@ void gint_gamma_vl_gpu(hamilt::HContainer<double>* hRGint,
 vlocal,
 psi_input_double.get_host_pointer(sid),
 psi_input_int.get_host_pointer(sid),
-atom_num_per_bcell.get_host_pointer(sid));
+atom_num_per_bcell.get_host_pointer(sid),
+start_idx_per_bcell.get_host_pointer(sid),
+atom_per_z);
 
 alloc_mult_vlocal(gridt,
 ucell,
@@ -157,18 +161,19 @@ void gint_gamma_vl_gpu(hamilt::HContainer<double>* hRGint,
 max_m,
 max_n);
 
-psi_input_double.copy_host_to_device_async(streams[sid], sid);
-psi_input_int.copy_host_to_device_async(streams[sid], sid);
+psi_input_double.copy_host_to_device_async(streams[sid], sid, 5 * atom_per_z);
+psi_input_int.copy_host_to_device_async(streams[sid], sid, 2 * atom_per_z);
 atom_num_per_bcell.copy_host_to_device_async(streams[sid], sid);
-gemm_m.copy_host_to_device_async(streams[sid], sid);
-gemm_n.copy_host_to_device_async(streams[sid], sid);
-gemm_k.copy_host_to_device_async(streams[sid], sid);
-gemm_lda.copy_host_to_device_async(streams[sid], sid);
-gemm_ldb.copy_host_to_device_async(streams[sid], sid);
-gemm_ldc.copy_host_to_device_async(streams[sid], sid);
-gemm_A.copy_host_to_device_async(streams[sid], sid);
-gemm_B.copy_host_to_device_async(streams[sid], sid);
-gemm_C.copy_host_to_device_async(streams[sid], sid);
+start_idx_per_bcell.copy_host_to_device_async(streams[sid], sid);
+gemm_m.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_n.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_k.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_lda.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_ldb.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_ldc.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
 
 psi.memset_device_async(streams[sid], sid, 0);
 psi_vldr3.memset_device_async(streams[sid], sid, 0);
@@ -186,7 +191,7 @@ void gint_gamma_vl_gpu(hamilt::HContainer<double>* hRGint,
 psi_input_double.get_device_pointer(sid),
 psi_input_int.get_device_pointer(sid),
 atom_num_per_bcell.get_device_pointer(sid),
-max_atom_per_bcell,
+start_idx_per_bcell.get_device_pointer(sid),
 gridt.atom_nwl_g,
 gridt.atom_new_g,
 gridt.atom_ylm_g,
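The surrounding loop body in these .cu files pairs one CUDA stream with each OpenMP thread and synchronizes the stream before refilling its pinned staging slice, which is what makes the per-stream `sid` slicing above safe. A hedged standalone sketch of that pattern; the stream count and loop bounds are hypothetical, not values from the source:

// Sketch: one CUDA stream per OpenMP thread, per-stream staging buffers.
#include <cuda_runtime.h>
#include <omp.h>
#include <vector>

int main()
{
    const int num_streams = 4; // hypothetical; the real code derives this
    std::vector<cudaStream_t> streams(num_streams);
    for (auto& s : streams) { cudaStreamCreate(&s); }

#pragma omp parallel for num_threads(num_streams)
    for (int task = 0; task < 64; ++task)
    {
        const int sid = omp_get_thread_num();
        // Wait until stream sid's previously issued copies/kernels finish,
        // so its pinned host slice can be safely overwritten.
        cudaStreamSynchronize(streams[sid]);
        // ... refill staging buffers, issue async copies and kernels ...
    }

    for (auto& s : streams) { cudaStreamDestroy(s); }
    return 0;
}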
6 changes: 4 additions & 2 deletions source/module_hamilt_lcao/module_gint/gint_vl_gpu.h
@@ -29,11 +29,13 @@ void gtask_vlocal(const Grid_Technique& gridt,
 const double* vlocal_global_value,
 double* psi_input_double,
 int* psi_input_int,
-int* atom_num_per_bcell);
+int* atom_num_per_bcell,
+int* start_idx_per_bcell,
+int& atom_per_z);
 
 void alloc_mult_vlocal(const Grid_Technique& gridt,
 const UnitCell& ucell,
-std::vector<bool>& gpu_matrix_calc_flag,
+const std::vector<bool>& gpu_matrix_calc_flag,
 const int grid_index_ij,
 const int max_atom,
 double* psi,
41 changes: 22 additions & 19 deletions source/module_hamilt_lcao/module_gint/gtask_force.cpp
@@ -6,23 +6,25 @@
 namespace GintKernel
 {
 
-void gpu_task_generator_force(const Grid_Technique& gridt,
-const UnitCell& ucell,
-const int grid_index_ij,
-const int max_atom_per_bcell,
-const int max_atom,
-const int nczp,
-const double vfactor,
-const double* rcut,
-const double* vlocal_global_value,
-double* psi_input_double,
-int* psi_input_int,
-int* atom_num_per_bcell,
-int* iat_per_z,
-int& atom_pair_num,
-std::vector<bool>& gpu_mat_cal_flag)
+void gtask_force(const Grid_Technique& gridt,
+const UnitCell& ucell,
+const int grid_index_ij,
+const int max_atom_per_bcell,
+const int max_atom,
+const int nczp,
+const double vfactor,
+const double* rcut,
+const double* vlocal_global_value,
+double* psi_input_double,
+int* psi_input_int,
+int* atom_num_per_bcell,
+int* start_idx_per_bcell,
+int* iat_per_z,
+int& atom_per_z,
+std::vector<bool>& gpu_mat_cal_flag)
 {
 const int nwmax = ucell.nwmax;
+atom_per_z = 0;
 // psir generate
 for (int z_index = 0; z_index < gridt.nbzp; z_index++)
 {
@@ -66,9 +68,8 @@ void gpu_task_generator_force(const Grid_Technique& gridt,
 if (distance <= rcut[it_temp])
 {
 gpu_mat_cal_flag[calc_flag_index + id] = true;
-int pos_temp_double = num_psi_pos + num_get_psi;
-int pos_temp_int = pos_temp_double * 2;
-pos_temp_double *= 5;
+const int pos_temp_double = (atom_per_z + num_get_psi) * 5;
+const int pos_temp_int = (atom_per_z + num_get_psi) * 2;
 if (distance < 1.0E-9)
 {
 distance += 1.0E-9;
@@ -98,6 +99,8 @@ void gpu_task_generator_force(const Grid_Technique& gridt,
 }
 }
 atom_num_per_bcell[z_index] = num_get_psi;
+start_idx_per_bcell[z_index] = atom_per_z;
+atom_per_z += num_get_psi;
 }
 }
 
@@ -121,7 +124,7 @@ void alloc_mult_force(const Grid_Technique& gridt,
 double** mat_A,
 double** mat_B,
 double** mat_C,
-std::vector<bool>& gpu_mat_cal_flag)
+const std::vector<bool>& gpu_mat_cal_flag)
 {
 int tid = 0;
 max_m = 0;
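The bookkeeping added at the end of each z iteration above is an exclusive prefix sum: `start_idx_per_bcell[z]` records where bcell z's tasks begin in the packed arrays, and `atom_per_z` accumulates the total so that only `5 * atom_per_z` doubles and `2 * atom_per_z` ints are transferred. A standalone illustration of the same arithmetic, with hypothetical per-bcell counts in place of the real distance test against rcut:

#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical task counts per big cell along z; in gtask_force these
    // come from counting atoms within rcut of each grid point.
    const std::vector<int> num_get_psi = {3, 0, 5, 2};

    std::vector<int> atom_num_per_bcell(num_get_psi.size());
    std::vector<int> start_idx_per_bcell(num_get_psi.size());
    int atom_per_z = 0; // total packed tasks; bounds the host-to-device copy

    for (std::size_t z = 0; z < num_get_psi.size(); ++z)
    {
        atom_num_per_bcell[z] = num_get_psi[z];
        start_idx_per_bcell[z] = atom_per_z; // exclusive prefix sum
        atom_per_z += num_get_psi[z];
    }

    // The staging buffer holds 5 * atom_per_z doubles, and only that prefix
    // is copied: copy_host_to_device_async(stream, sid, 5 * atom_per_z).
    std::printf("total packed tasks: %d\n", atom_per_z); // prints 10
    return 0;
}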
