From 3d2c7e9a6612f403b47bfa3098bc3f084c763f52 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 9 Oct 2023 12:29:14 +0100 Subject: [PATCH 1/3] feat: Add a distributed tridiagonal kernel for the OpenMP backend. --- src/CMakeLists.txt | 13 ++- src/common.f90 | 2 +- src/cuda/common.f90 | 6 + src/omp/common.f90 | 6 + src/omp/kernels_dist.f90 | 194 +++++++++++++++++++++++++++++++ tests/CMakeLists.txt | 13 ++- tests/cuda/test_cuda_tridiag.f90 | 3 +- tests/omp/test_omp_tridiag.f90 | 194 +++++++++++++++++++++++++++++++ 8 files changed, 427 insertions(+), 4 deletions(-) create mode 100644 src/cuda/common.f90 create mode 100644 src/omp/common.f90 create mode 100644 src/omp/kernels_dist.f90 create mode 100644 tests/omp/test_omp_tridiag.f90 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a955aeb3..73d89ada 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,8 +8,11 @@ set(SRC diffengine.f90 vector3d.f90 vector3d_simd.f90 + omp/common.f90 + omp/kernels_dist.f90 ) set(CUDASRC + cuda/common.f90 cuda/cuda_allocator.f90 cuda/kernels_dist.f90 ) @@ -21,12 +24,20 @@ endif() add_library(x3d2 STATIC ${SRC}) target_include_directories(x3d2 INTERFACE ${CMAKE_CURRENT_BINARY_DIR}) +target_compile_options(x3d2 PRIVATE "-O3") + if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI") target_compile_options(x3d2 PRIVATE "-cuda") - target_compile_options(x3d2 PRIVATE "-O3") target_compile_options(x3d2 PRIVATE "-fast") target_link_options(x3d2 INTERFACE "-cuda") endif() +if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "GNU") + target_compile_options(x3d2 PRIVATE "-ffast-math") +endif() + +find_package(OpenMP REQUIRED) +target_link_libraries(x3d2 PRIVATE OpenMP::OpenMP_Fortran) + find_package(MPI REQUIRED) target_link_libraries(x3d2 PRIVATE MPI::MPI_Fortran) diff --git a/src/common.f90 b/src/common.f90 index d46e3c49..0b90e087 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -1,7 +1,7 @@ module m_common implicit none - integer, parameter :: dp=kind(0.0d0), SZ=32 + integer, parameter :: dp=kind(0.0d0) real(dp), parameter :: pi = 4*atan(1.0_dp) end module m_common diff --git a/src/cuda/common.f90 b/src/cuda/common.f90 new file mode 100644 index 00000000..dc65e886 --- /dev/null +++ b/src/cuda/common.f90 @@ -0,0 +1,6 @@ +module m_cuda_common + implicit none + + integer, parameter :: SZ=32 + +end module m_cuda_common diff --git a/src/omp/common.f90 b/src/omp/common.f90 new file mode 100644 index 00000000..cb1d00b7 --- /dev/null +++ b/src/omp/common.f90 @@ -0,0 +1,6 @@ +module m_omp_common + implicit none + + integer, parameter :: SZ=16 + +end module m_omp_common diff --git a/src/omp/kernels_dist.f90 b/src/omp/kernels_dist.f90 new file mode 100644 index 00000000..b40f5e56 --- /dev/null +++ b/src/omp/kernels_dist.f90 @@ -0,0 +1,194 @@ +module m_omp_kernels_dist + use omp_lib + + use m_common, only: dp + use m_omp_common, only: SZ + + implicit none + +contains + + subroutine der_univ_dist_omp( & + du, send_u_b, send_u_e, u, u_b, u_e, coeffs_b, coeffs_e, coeffs, n, & + alfa, ffr, fbc & + ) + implicit none + + ! Arguments + real(dp), intent(out), dimension(:, :) :: du, send_u_b, send_u_e + real(dp), intent(in), dimension(:, :) :: u, u_b, u_e + real(dp), intent(in), dimension(:) :: ffr, fbc + real(dp), intent(in), dimension(:, :) :: coeffs_b, coeffs_e + real(dp), intent(in), dimension(:) :: coeffs + real(dp), intent(in) :: alfa + integer, intent(in) :: n + + ! Local variables + integer :: i, j!, b + integer :: jm2, jm1, jp1, jp2 + integer :: n_s, n_m, n_b, n_e !stencil, middle, begin, end + + real(dp) :: temp_du, c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 + + !i = threadIdx%x + !b = blockIdx%x + !nblock = size(u, dim=3) + + n_s = (size(coeffs)-1)/2 + n_m = size(coeffs) + n_b = size(coeffs_b, dim=2) + n_e = size(coeffs_e, dim=2) + + ! store bulk coeffs in the registers + c_m4 = coeffs(1); c_m3 = coeffs(2); c_m2 = coeffs(3); c_m1 = coeffs(4) + c_j = coeffs(5) + c_p1 = coeffs(6); c_p2 = coeffs(7); c_p3 = coeffs(8); c_p4 = coeffs(9) + + !$omp simd + do i = 1, SZ + du(i, 1) = coeffs(1)*u_b(i, 1) + coeffs(2)*u_b(i, 2) & + + coeffs(3)*u_b(i, 3) + coeffs(4)*u_b(i, 4) & + + coeffs(5)*u(i, 1) & + + coeffs(6)*u(i, 2) + coeffs(7)*u(i, 3) & + + coeffs(8)*u(i, 4) + coeffs(9)*u(i, 5) + du(i, 2) = coeffs(1)*u_b(i, 2) + coeffs(2)*u_b(i, 3) & + + coeffs(3)*u_b(i, 4) + coeffs(4)*u(i, 1) & + + coeffs(5)*u(i, 2) & + + coeffs(6)*u(i, 3) + coeffs(7)*u(i, 4) & + + coeffs(8)*u(i, 5) + coeffs(9)*u(i, 6) + du(i, 3) = coeffs(1)*u_b(i, 3) + coeffs(2)*u_b(i, 4) & + + coeffs(3)*u(i, 1) + coeffs(4)*u(i, 2) & + + coeffs(5)*u(i, 3) & + + coeffs(6)*u(i, 4) + coeffs(7)*u(i, 5) & + + coeffs(8)*u(i, 6) + coeffs(9)*u(i, 7) + du(i, 3) = ffr(3)*(du(i, 3) - alfa*du(i, 2)) + du(i, 4) = coeffs(1)*u_b(i, 4) + coeffs(2)*u(i, 1) & + + coeffs(3)*u(i, 2) + coeffs(4)*u(i, 3) & + + coeffs(5)*u(i, 4) & + + coeffs(6)*u(i, 5) + coeffs(7)*u(i, 6) & + + coeffs(8)*u(i, 7) + coeffs(9)*u(i, 8) + du(i, 4) = ffr(4)*(du(i, 4) - alfa*du(i, 3)) + end do + !$omp end simd + + do j = n_s+1, n-n_s + !$omp simd + do i = 1, SZ + temp_du = c_m4*u(i, j-4) + c_m3*u(i, j-3) & + + c_m2*u(i, j-2) + c_m1*u(i, j-1) & + + c_j*u(i, j) & + + c_p1*u(i, j+1) + c_p2*u(i, j+2) & + + c_p3*u(i, j+3) + c_p4*u(i, j+4) + du(i, j) = ffr(j)*(temp_du - alfa*du(i, j-1)) + end do + !$omp end simd + end do + + !$omp simd + do i = 1, SZ + j = n-3 + du(i, j) = coeffs(1)*u(i, j-4) + coeffs(2)*u(i, j-3) & + + coeffs(3)*u(i, j-2) + coeffs(4)*u(i, j-1) & + + coeffs(5)*u(i, j) & + + coeffs(6)*u(i, j+1) + coeffs(7)*u(i, j+2) & + + coeffs(8)*u(i, j+3) + coeffs(9)*u_e(i, 1) + du(i, j) = ffr(j)*(du(i, j) - alfa*du(i, j-1)) + j = n-2 + du(i, j) = coeffs(1)*u(i, j-4) + coeffs(2)*u(i, j-3) & + + coeffs(3)*u(i, j-2) + coeffs(4)*u(i, j-1) & + + coeffs(5)*u(i, j) & + + coeffs(6)*u(i, j+1) + coeffs(7)*u(i, j+2) & + + coeffs(8)*u_e(i, 1) + coeffs(9)*u_e(i, 2) + du(i, j) = ffr(j)*(du(i, j) - alfa*du(i, j-1)) + j = n-1 + du(i, j) = coeffs(1)*u(i, j-4) + coeffs(2)*u(i, j-3) & + + coeffs(3)*u(i, j-2) + coeffs(4)*u(i, j-1) & + + coeffs(5)*u(i, j) & + + coeffs(6)*u(i, j+1) + coeffs(7)*u_e(i, 1) & + + coeffs(8)*u_e(i, 2) + coeffs(9)*u_e(i, 3) + du(i, j) = ffr(j)*(du(i, j) - alfa*du(i, j-1)) + j = n + du(i, j) = coeffs(1)*u(i, j-4) + coeffs(2)*u(i, j-3) & + + coeffs(3)*u(i, j-2) + coeffs(4)*u(i, j-1) & + + coeffs(5)*u(i, j) & + + coeffs(6)*u_e(i, 1) + coeffs(7)*u_e(i, 2) & + + coeffs(8)*u_e(i, 3) + coeffs(9)*u_e(i, 4) + du(i, j) = ffr(j)*(du(i, j) - alfa*du(i, j-1)) + end do + !$omp end simd + + !$omp simd + do i = 1, SZ + send_u_e(i, 1) = du(i, n) + end do + !$omp end simd + + ! Backward pass of the hybrid algorithm + do j = n - 2, 2, -1 + !$omp simd + do i = 1, SZ + du(i, j) = du(i, j) - fbc(j)*du(i, j + 1) + end do + !$omp end simd + end do + !$omp simd + do i = 1, SZ + du(i, 1) = ffr(1)*(du(i, 1) - fbc(1)*du(i, 2)) + send_u_b(i, 1) = du(i, 1) + end do + !$omp end simd + + end subroutine der_univ_dist_omp + + subroutine der_univ_subs_omp(du, recv_u_b, recv_u_e, n, alfa, & + dist_sa, dist_sc) + implicit none + + ! Arguments + real(dp), intent(out), dimension(:, :) :: du + real(dp), intent(in), dimension(:, :) :: recv_u_b, recv_u_e + real(dp), intent(in), dimension(:) :: dist_sa, dist_sc + real(dp), intent(in) :: alfa + integer, intent(in) :: n + + ! Local variables + integer :: i, j!, b + real(dp) :: ur, bl, recp, du_1, du_n + + !i = threadIdx%x + !b = blockIdx%x + + bl = dist_sa(1) + ur = dist_sc(n) + recp = 1._dp/(1._dp - ur*bl) + + !$omp simd + do i = 1, SZ + !du(i, 1) = recp*(du(i, 1) - bl*recv_u_b(i, 1)) + !du(i, n) = recp*(du(i, n) - ur*recv_u_e(i, 1)) + du_1 = recp*(du(i, 1) - bl*recv_u_b(i, 1)) + du_n = recp*(du(i, n) - ur*recv_u_e(i, 1)) + end do + !$omp end simd + + !$omp simd + do i = 1, SZ + du(i, 1) = du_1 + end do + !$omp end simd + do j = 2, n-1 + !$omp simd + do i = 1, SZ + du(i, j) = (du(i, j) - dist_sa(j)*du_1 - dist_sc(j)*du_n) + end do + !$omp end simd + end do + !$omp simd + do i = 1, SZ + du(i, n) = du_n + end do + !$omp end simd + + end subroutine der_univ_subs_omp + +end module m_omp_kernels_dist diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 217e236e..b65a1f25 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -3,6 +3,7 @@ set(TESTSRC test_tridiagonal.f90 test_stencil.f90 test_diffengine.f90 + omp/test_omp_tridiag.f90 ) set(CUDATESTSRC cuda/test_cuda_allocator.f90 @@ -28,7 +29,17 @@ foreach(testfile IN LISTS TESTSRC) get_filename_component(test_name ${testfile} NAME_WE) add_executable(${test_name} ${testfile}) + + target_compile_options(${test_name} PRIVATE "-O3") + + if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "GNU") + target_compile_options(${test_name} PRIVATE "-ffast-math") + endif() + target_link_libraries(${test_name} PRIVATE x3d2) - add_test(NAME ${test_name} COMMAND ${test_name}) + find_package(OpenMP REQUIRED) + target_link_libraries(${test_name} PRIVATE OpenMP::OpenMP_Fortran) + + add_test(NAME ${test_name} COMMAND sh -c "mpirun -np 1 ${test_name}") endforeach() diff --git a/tests/cuda/test_cuda_tridiag.f90 b/tests/cuda/test_cuda_tridiag.f90 index 116be9ec..3b494350 100644 --- a/tests/cuda/test_cuda_tridiag.f90 +++ b/tests/cuda/test_cuda_tridiag.f90 @@ -3,7 +3,8 @@ program test_cuda_tridiag use cudafor use mpi - use m_common, only: dp, SZ, pi + use m_common, only: dp, pi + use m_cuda_common, only: SZ use m_cuda_kernels_dist, only: der_univ_dist, der_univ_subs use m_derparams, only: der_1_vv, der_2_vv diff --git a/tests/omp/test_omp_tridiag.f90 b/tests/omp/test_omp_tridiag.f90 new file mode 100644 index 00000000..06ce8ed3 --- /dev/null +++ b/tests/omp/test_omp_tridiag.f90 @@ -0,0 +1,194 @@ +program test_dist_tridiag + use iso_fortran_env, only: stderr => error_unit + use mpi + use omp_lib + + use m_common, only: dp, pi + use m_omp_common, only: SZ + use m_omp_kernels_dist, only: der_univ_dist_omp, der_univ_subs_omp + use m_derparams, only: der_1_vv, der_2_vv + + implicit none + + logical :: allpass = .true. + real(dp), allocatable, dimension(:,:,:) :: u, du, u_b, u_e + real(dp), allocatable, dimension(:,:,:) :: u_recv_b, u_recv_e, & + u_send_b, u_send_e + + real(dp), allocatable, dimension(:,:,:) :: send_b, send_e, & + recv_b, recv_e + + real(dp), allocatable, dimension(:,:) :: coeffs_b, coeffs_e + real(dp), allocatable, dimension(:) :: coeffs, dist_fr, dist_bc, & + dist_sa, dist_sc + + integer :: n, n_block, i, j, k, n_halo, n_stencil, n_iters, iters + integer :: n_glob + integer :: nrank, nproc, pprev, pnext, tag1=1234, tag2=1234 + integer, allocatable :: srerr(:), mpireq(:) + integer :: ierr, ndevs, devnum, memClockRt, memBusWidth + + real(dp) :: dx, dx2, alfa, norm_du, tol = 1d-8, tstart, tend + real(dp) :: achievedBW, deviceBW + + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + + print*, 'I am rank', nrank, 'I am running on device' + pnext = modulo(nrank-nproc+1, nproc) + pprev = modulo(nrank-1, nproc) + print*, 'rank', nrank, 'pnext', pnext, 'pprev', pprev + allocate(srerr(nproc), mpireq(nproc)) + + n_glob = 512 + n = n_glob/nproc + n_block = 512*512/SZ + n_iters = 1000 + + allocate(u(SZ, n, n_block), du(SZ, n, n_block)) + + dx = 2*pi/n_glob + dx2 = dx*dx + + do k = 1, n_block + do j = 1, n + do i = 1, SZ + u(i, j, k) = sin((j-1+nrank*n)*dx) + end do + end do + end do + + ! set up the tridiagonal solver coeffs + call der_2_vv(coeffs, coeffs_b, coeffs_e, dist_fr, dist_bc, & + dist_sa, dist_sc, n_halo, alfa, dx2, n, 'periodic') + + n_stencil = n_halo*2 + 1 + + ! arrays for exchanging data between ranks + allocate(u_send_b(SZ, n_halo, n_block)) + allocate(u_send_e(SZ, n_halo, n_block)) + allocate(u_recv_b(SZ, n_halo, n_block)) + allocate(u_recv_e(SZ, n_halo, n_block)) + + allocate(send_b(SZ, 1, n_block), send_e(SZ, 1, n_block)) + allocate(recv_b(SZ, 1, n_block), recv_e(SZ, 1, n_block)) + + !call cpu_time(tstart) + tstart = omp_get_wtime() + do iters = 1, n_iters + ! first copy halo data into buffers + !$omp parallel do + do k = 1, n_block + do j = 1, 4 + !$omp simd + do i = 1, SZ + u_send_b(i,j,k) = u(i,j,k) + u_send_e(i,j,k) = u(i,n-n_halo+j,k) + end do + !$omp end simd + end do + end do + !$omp end parallel do + + ! halo exchange + if (nproc == 1) then + u_recv_b = u_send_e + u_recv_e = u_send_b + else + ! MPI send/recv for multi-rank simulations + call MPI_Isend(u_send_b, SZ*n_halo*n_block, & + MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, & + mpireq(1), srerr(1)) + call MPI_Irecv(u_recv_e, SZ*n_halo*n_block, & + MPI_DOUBLE_PRECISION, pnext, tag1, MPI_COMM_WORLD, & + mpireq(2), srerr(2)) + call MPI_Isend(u_send_e, SZ*n_halo*n_block, & + MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, & + mpireq(3), srerr(3)) + call MPI_Irecv(u_recv_b, SZ*n_halo*n_block, & + MPI_DOUBLE_PRECISION, pprev, tag2, MPI_COMM_WORLD, & + mpireq(4), srerr(4)) + + call MPI_Waitall(4, mpireq, MPI_STATUSES_IGNORE, ierr) + end if + + + !$omp parallel do + do k = 1, n_block + call der_univ_dist_omp( & + du(:, :, k), send_b(:, :, k), send_e(:, :, k), u(:, :, k), & + u_recv_b(:, :, k), u_recv_e(:, :, k), & + coeffs_b, coeffs_e, coeffs, n, alfa, dist_fr, dist_bc & + ) + end do + !$omp end parallel do + + ! halo exchange for 2x2 systems + if (nproc == 1) then + recv_b = send_e + recv_e = send_b + else + ! MPI send/recv for multi-rank simulations + call MPI_Isend(send_b, SZ*n_block, & + MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, & + mpireq(1), srerr(1)) + call MPI_Irecv(recv_e, SZ*n_block, & + MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, & + mpireq(2), srerr(2)) + call MPI_Isend(send_e, SZ*n_block, & + MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, & + mpireq(3), srerr(3)) + call MPI_Irecv(recv_b, SZ*n_block, & + MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, & + mpireq(4), srerr(4)) + + call MPI_Waitall(4, mpireq, MPI_STATUSES_IGNORE, ierr) + end if + + !$omp parallel do + do k = 1, n_block + call der_univ_subs_omp(du(:, :, k), recv_b(:, :, k), recv_e(:, :, k), & + n, alfa, dist_sa, dist_sc) + end do + !$omp end parallel do + end do + + !call cpu_time(tend) + tend = omp_get_wtime() + print*, 'Total time', tend-tstart + + ! 3 in the first phase, 2 in the second phase, so 5 in total + achievedBW = 5._dp*n_iters*n*n_block*SZ*dp/(tend-tstart) + print'(a, f8.3, a)', 'Achieved BW: ', achievedBW/2**30, ' GiB/s' + + memClockRt = 3200 + memBusWidth = 64 + deviceBW = 2*memBusWidth/8._dp*memClockRt*1000000 + + print'(a, f8.3, a)', 'Device BW: ', deviceBW/2**30, ' GiB/s' + print'(a, f5.2)', 'Effective BW utilization: %', achievedBW/deviceBW*100 + + ! check error + norm_du = norm2(u+du)/n_block + print*, 'error norm', norm_du + + if ( norm_du > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check second derivatives... failed' + else + write(stderr, '(a)') 'Check second derivatives... passed' + end if + + if (allpass) then + write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' + else + error stop 'SOME TESTS FAILED.' + end if + + call MPI_Finalize(ierr) + +end program test_dist_tridiag + From 3378baaf845223b8c52296e48e5b0e47deb12f9a Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 9 Oct 2023 12:29:55 +0100 Subject: [PATCH 2/3] fix: Fix some issues with the OpenMP stuff in old thomas algorithm. --- src/thomas.f90 | 8 -------- src/vector3d_simd.f90 | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/thomas.f90 b/src/thomas.f90 index 9f0073c0..c94e43ce 100644 --- a/src/thomas.f90 +++ b/src/thomas.f90 @@ -186,25 +186,19 @@ pure subroutine solve(self, f, df) n = size(self%fwd) if (size(f, 2) /= n) error stop do j = 2, n - !$omp simd do i = 1, size(f, 1) df(i, j) = df(i, j) - df(i, j - 1) * self%fwd(j) end do - !$omp end simd end do - !$omp simd do i = 1, size(f, 1) df(i, n) = df(i, n) * self%bwd(n) end do - !$omp end simd do j = n - 1, 1, -1 - !$omp simd do i = 1, size(f, 1) df(i, j) = (df(i, j) - df(i, j + 1) * self%updiag(j)) & & * self%bwd(j) end do - !$omp end simd end do end subroutine solve @@ -232,12 +226,10 @@ pure subroutine solve_periodic(self, f, df) alpha = self%updiag(1) select type (self) type is (periodic_tridiagsolv) - !$omp simd do i = 1, size(f, 1) df(i, 1:m) = y(i, :) - ((y(i, 1) - alpha * y(i, m)) & & / (1.+self%q(1) - alpha * self%q(m))) * self%q end do - !$omp end simd class default error stop end select diff --git a/src/vector3d_simd.f90 b/src/vector3d_simd.f90 index 9fb1f481..e6265612 100644 --- a/src/vector3d_simd.f90 +++ b/src/vector3d_simd.f90 @@ -86,8 +86,8 @@ subroutine transport_dir(self, dim, rslt) rslt(i, j, k) = -0.5 * & (u(i, j, k) * du(i, j) + dusq(i, j)) & & + self%xnu * d2u(i, j) - !$omp end simd end do + !$omp end simd end do end do layers end associate From e6d41a485f8cc9aa88206e95cfcedf86f97c8ab3 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 11 Oct 2023 11:54:49 +0100 Subject: [PATCH 3/3] fix: Finalize the MPI communicator in cuda tests. --- tests/cuda/test_cuda_tridiag.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/cuda/test_cuda_tridiag.f90 b/tests/cuda/test_cuda_tridiag.f90 index 3b494350..5f44c44e 100644 --- a/tests/cuda/test_cuda_tridiag.f90 +++ b/tests/cuda/test_cuda_tridiag.f90 @@ -194,5 +194,7 @@ program test_cuda_tridiag error stop 'SOME TESTS FAILED.' end if + call MPI_Finalize(ierr) + end program test_cuda_tridiag