Skip to content

Commit

Permalink
Perf: optimize the stream strategy in module_gint (#5845)
Browse files Browse the repository at this point in the history
* optimize stream strategy

* limit max threads
  • Loading branch information
dzzz2001 authored Jan 10, 2025
1 parent 1c30f99 commit 16714c6
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 22 deletions.
29 changes: 22 additions & 7 deletions source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,11 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
const int num_streams = gridt.nstreams;

std::vector<cudaStream_t> streams(num_streams);
std::vector<cudaEvent_t> events(num_streams);
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamCreate(&streams[i]));
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
}

Cuda_Mem_Wrapper<double> dr_part(3 * max_atom_per_z, num_streams, true);
Expand Down Expand Up @@ -89,21 +91,32 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
dm->get_wrapper(),
dm->get_nnr() * sizeof(double),
cudaMemcpyHostToDevice));

#ifdef _OPENMP
#pragma omp parallel for num_threads(num_streams) collapse(2)
const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
#endif
#pragma omp parallel num_threads(max_thread_num)
{
#ifdef _OPENMP
const int tid = omp_get_thread_num();
const int num_threads = omp_get_num_threads();
const int sid_start = tid * num_streams / num_threads;
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
#else
const int sid_start = 0;
const int thread_num_streams = num_streams;
#endif
#pragma omp for collapse(2) schedule(dynamic)
for (int i = 0; i < gridt.nbx; i++)
{
for (int j = 0; j < gridt.nby; j++)
{
// 20240620 Note that it must be set again here because
// cuda's device is not safe in a multi-threaded environment.
checkCuda(cudaSetDevice(gridt.dev_id));
#ifdef _OPENMP
const int sid = omp_get_thread_num();
#else
const int sid = 0;
#endif

const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
checkCuda(cudaEventSynchronize(events[sid]));

int max_m = 0;
int max_n = 0;
Expand Down Expand Up @@ -161,6 +174,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
checkCuda(cudaEventRecord(events[sid], streams[sid]));

psi.memset_device_async(streams[sid], sid, 0);
psi_dm.memset_device_async(streams[sid], sid, 0);
Expand Down Expand Up @@ -241,9 +255,9 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
stress.get_device_pointer(sid));
checkCudaLastError();
}
checkCuda(cudaStreamSynchronize(streams[sid]));
}
}
}

for(int i = 0; i < num_streams; i++)
{
Expand All @@ -254,6 +268,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamSynchronize(streams[i]));
checkCuda(cudaEventDestroy(events[i]));
}

if (isstress){
Expand Down
29 changes: 21 additions & 8 deletions source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,11 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
const int max_atompair_per_z = max_atom * max_atom * nbzp;

std::vector<cudaStream_t> streams(num_streams);
std::vector<cudaEvent_t> events(num_streams);
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamCreate(&streams[i]));
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
}

Cuda_Mem_Wrapper<double> dr_part(max_atom_per_z * 3, num_streams, true);
Expand Down Expand Up @@ -71,8 +73,20 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,

// calculate the rho for every nbzp bigcells
#ifdef _OPENMP
#pragma omp parallel for num_threads(num_streams) collapse(2)
const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
#endif
#pragma omp parallel num_threads(max_thread_num)
{
#ifdef _OPENMP
const int tid = omp_get_thread_num();
const int num_threads = omp_get_num_threads();
const int sid_start = tid * num_streams / num_threads;
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
#else
const int sid_start = 0;
const int thread_num_streams = num_streams;
#endif
#pragma omp for collapse(2) schedule(dynamic)
for (int i = 0; i < gridt.nbx; i++)
{
for (int j = 0; j < gridt.nby; j++)
Expand All @@ -81,12 +95,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
// cuda's device is not safe in a multi-threaded environment.

checkCuda(cudaSetDevice(gridt.dev_id));
// get stream id
#ifdef _OPENMP
const int sid = omp_get_thread_num();
#else
const int sid = 0;
#endif

const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
checkCuda(cudaEventSynchronize(events[sid]));

int max_m = 0;
int max_n = 0;
Expand Down Expand Up @@ -147,6 +158,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
dot_product.copy_host_to_device_async(streams[sid], sid);
checkCuda(cudaEventRecord(events[sid], streams[sid]));

psi.memset_device_async(streams[sid], sid, 0);
psi_dm.memset_device_async(streams[sid], sid, 0);
Expand Down Expand Up @@ -203,9 +215,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
psi_dm.get_device_pointer(sid),
dot_product.get_device_pointer(sid));
checkCudaLastError();
checkCuda(cudaStreamSynchronize(streams[sid]));
}
}
}

// Copy rho from device to host
checkCuda(cudaMemcpy(rho,
Expand All @@ -216,6 +228,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamDestroy(streams[i]));
checkCuda(cudaEventDestroy(events[i]));
}
}
} // namespace GintKernel
27 changes: 20 additions & 7 deletions source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,12 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
const double vfactor = ucell.omega / gridt.ncxyz;
const int nczp = nbzp * gridt.bz;
std::vector<cudaStream_t> streams(num_streams);
std::vector<cudaEvent_t> events(num_streams);

for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamCreate(&streams[i]));
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
}

const int nnrg = hRGint->get_nnr();
Expand Down Expand Up @@ -73,21 +75,30 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
Cuda_Mem_Wrapper<double*> gemm_C(max_atompair_per_z, num_streams, true);

#ifdef _OPENMP
#pragma omp parallel for num_threads(num_streams) collapse(2)
const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
#endif
#pragma omp parallel num_threads(max_thread_num)
{
#ifdef _OPENMP
const int tid = omp_get_thread_num();
const int num_threads = omp_get_num_threads();
const int sid_start = tid * num_streams / num_threads;
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
#else
const int sid_start = 0;
const int thread_num_streams = num_streams;
#endif
#pragma omp for collapse(2) schedule(dynamic)
for (int i = 0; i < gridt.nbx; i++)
{
for (int j = 0; j < gridt.nby; j++)
{
// 20240620 Note that it must be set again here because
// cuda's device is not safe in a multi-threaded environment.
checkCuda(cudaSetDevice(gridt.dev_id));
#ifdef _OPENMP
const int sid = omp_get_thread_num();
#else
const int sid = 0;
#endif

const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
checkCuda(cudaEventSynchronize(events[sid]));
int max_m = 0;
int max_n = 0;
int atom_pair_num = 0;
Expand Down Expand Up @@ -141,6 +152,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
checkCuda(cudaEventRecord(events[sid], streams[sid]));

psi.memset_device_async(streams[sid], sid, 0);
psi_vldr3.memset_device_async(streams[sid], sid, 0);
Expand Down Expand Up @@ -187,9 +199,9 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
streams[sid],
nullptr);
checkCudaLastError();
checkCuda(cudaStreamSynchronize(streams[sid]));
}
}
}

checkCuda(cudaMemcpy(
hRGint->get_wrapper(),
Expand All @@ -200,6 +212,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamDestroy(streams[i]));
checkCuda(cudaEventDestroy(events[i]));
}
}

Expand Down

0 comments on commit 16714c6

Please sign in to comment.