Skip to content

Commit

Permalink
Merge pull request #9 from semi-h/distributed
Browse files Browse the repository at this point in the history
Add a kernel for the OpenMP backend
  • Loading branch information
semi-h authored Oct 11, 2023
2 parents d826250 + e6d41a4 commit 0937998
Show file tree
Hide file tree
Showing 10 changed files with 430 additions and 13 deletions.
13 changes: 12 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ set(SRC
diffengine.f90
vector3d.f90
vector3d_simd.f90
omp/common.f90
omp/kernels_dist.f90
)
set(CUDASRC
cuda/common.f90
cuda/cuda_allocator.f90
cuda/kernels_dist.f90
)
Expand All @@ -21,12 +24,20 @@ endif()
# Static library built from the Fortran sources collected in SRC.
add_library(x3d2 STATIC ${SRC})
# Expose the binary directory to consumers of the library.
# NOTE(review): presumably so dependants can find the compiled .mod files --
# confirm against the project's Fortran module output directory.
target_include_directories(x3d2 INTERFACE ${CMAKE_CURRENT_BINARY_DIR})

# Optimisation level applied for every compiler, so compiler-specific
# branches below do not need to repeat it.
target_compile_options(x3d2 PRIVATE "-O3")

# The compiler ID is compared as a quoted string: an unquoted ${...} inside
# if() breaks when the variable is empty and may be dereferenced twice.
# NOTE(review): recent CMake releases report nvfortran as "NVHPC" rather than
# "PGI"; extend this condition together with the CUDA source selection if that
# compiler ID must be supported.
if("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "PGI")
  target_compile_options(x3d2 PRIVATE "-cuda")
  target_compile_options(x3d2 PRIVATE "-fast")
  # Anything linking against x3d2 must also be linked with -cuda.
  target_link_options(x3d2 INTERFACE "-cuda")
endif()

if("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU")
  target_compile_options(x3d2 PRIVATE "-ffast-math")
endif()

find_package(OpenMP REQUIRED)
target_link_libraries(x3d2 PRIVATE OpenMP::OpenMP_Fortran)

find_package(MPI REQUIRED)
target_link_libraries(x3d2 PRIVATE MPI::MPI_Fortran)
2 changes: 1 addition & 1 deletion src/common.f90
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
! Project-wide numeric kind and mathematical constants.
!
! The SIMD/thread block width SZ is not defined here: each backend provides
! its own value (see m_cuda_common and m_omp_common).
module m_common
   implicit none

   ! Kind parameter of double-precision reals (the kind of the literal 0.0d0).
   ! Declared exactly once; a second declaration of dp is a compile error.
   integer, parameter :: dp = kind(0.0d0)
   ! Pi evaluated in double precision as 4*atan(1).
   real(dp), parameter :: pi = 4*atan(1.0_dp)

end module m_common
6 changes: 6 additions & 0 deletions src/cuda/common.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
! Compile-time constants used by the CUDA backend.
module m_cuda_common
   implicit none

   ! Block width used by the CUDA backend.
   ! NOTE(review): presumably the number of threads per block along the
   ! first array dimension -- confirm against the CUDA kernels.
   integer, parameter, public :: SZ = 32

end module m_cuda_common
6 changes: 6 additions & 0 deletions src/omp/common.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
! Compile-time constants used by the OpenMP backend.
module m_omp_common
   implicit none

   ! Trip count of the inner "do i = 1, SZ" SIMD loops in the OpenMP kernels
   ! (m_omp_kernels_dist), i.e. how many entries of the first array
   ! dimension are processed together.
   integer, parameter, public :: SZ = 16

end module m_omp_common
194 changes: 194 additions & 0 deletions src/omp/kernels_dist.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
module m_omp_kernels_dist
use omp_lib

use m_common, only: dp
use m_omp_common, only: SZ

implicit none

contains

! Apply a 9-point stencil to one block of SZ lines of length n and run the
! forward and backward sweeps of the "hybrid algorithm" on the result.
!
! Arguments
!   du        (out) result, indexed (1:SZ, 1:n).
!   send_u_b  (out) column 1 receives du(:, 1) after the backward pass.
!   send_u_e  (out) column 1 receives du(:, n) as it stands after the
!                   forward pass (du(:, n) is not touched afterwards).
!   u         (in)  field values, indexed (1:SZ, 1:n).
!   u_b       (in)  columns 1..4 are used as the four points preceding
!                   u(:, 1) in the stencil.
!   u_e       (in)  columns 1..4 are used as the four points following
!                   u(:, n) in the stencil.
!   coeffs_b, coeffs_e (in) currently unused by this kernel.
!   coeffs    (in)  the nine stencil weights, coeffs(5) being the centre.
!   n         (in)  number of points along the line; the indices used below
!                   require n >= 8.
!   alfa      (in)  factor multiplying the previous row in the forward sweep.
!   ffr       (in)  per-row scaling factors used in the forward sweep
!                   (and for row 1 at the end of the backward pass).
!   fbc       (in)  per-row factors used in the backward pass.
!
! NOTE(review): the send buffers are presumably exchanged with neighbouring
! subdomains before der_univ_subs_omp is called -- confirm with the caller.
subroutine der_univ_dist_omp( &
   du, send_u_b, send_u_e, u, u_b, u_e, coeffs_b, coeffs_e, coeffs, n, &
   alfa, ffr, fbc &
   )
   implicit none

   ! Arguments
   real(dp), intent(out), dimension(:, :) :: du, send_u_b, send_u_e
   real(dp), intent(in), dimension(:, :) :: u, u_b, u_e
   real(dp), intent(in), dimension(:) :: ffr, fbc
   real(dp), intent(in), dimension(:, :) :: coeffs_b, coeffs_e
   real(dp), intent(in), dimension(:) :: coeffs
   real(dp), intent(in) :: alfa
   integer, intent(in) :: n

   ! Local variables
   integer :: i, j
   integer :: n_s ! half-width of the stencil

   real(dp) :: temp_du, c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4

   n_s = (size(coeffs) - 1)/2

   ! Copy the bulk coefficients into scalars for the main loop
   c_m4 = coeffs(1); c_m3 = coeffs(2); c_m2 = coeffs(3); c_m1 = coeffs(4)
   c_j = coeffs(5)
   c_p1 = coeffs(6); c_p2 = coeffs(7); c_p3 = coeffs(8); c_p4 = coeffs(9)

   ! First four rows: the stencil reaches into the u_b halo. Rows 1 and 2
   ! are left unscaled here; the forward recurrence starts at row 3.
   !$omp simd
   do i = 1, SZ
      du(i, 1) = coeffs(1)*u_b(i, 1) + coeffs(2)*u_b(i, 2) &
                 + coeffs(3)*u_b(i, 3) + coeffs(4)*u_b(i, 4) &
                 + coeffs(5)*u(i, 1) &
                 + coeffs(6)*u(i, 2) + coeffs(7)*u(i, 3) &
                 + coeffs(8)*u(i, 4) + coeffs(9)*u(i, 5)
      du(i, 2) = coeffs(1)*u_b(i, 2) + coeffs(2)*u_b(i, 3) &
                 + coeffs(3)*u_b(i, 4) + coeffs(4)*u(i, 1) &
                 + coeffs(5)*u(i, 2) &
                 + coeffs(6)*u(i, 3) + coeffs(7)*u(i, 4) &
                 + coeffs(8)*u(i, 5) + coeffs(9)*u(i, 6)
      du(i, 3) = coeffs(1)*u_b(i, 3) + coeffs(2)*u_b(i, 4) &
                 + coeffs(3)*u(i, 1) + coeffs(4)*u(i, 2) &
                 + coeffs(5)*u(i, 3) &
                 + coeffs(6)*u(i, 4) + coeffs(7)*u(i, 5) &
                 + coeffs(8)*u(i, 6) + coeffs(9)*u(i, 7)
      du(i, 3) = ffr(3)*(du(i, 3) - alfa*du(i, 2))
      du(i, 4) = coeffs(1)*u_b(i, 4) + coeffs(2)*u(i, 1) &
                 + coeffs(3)*u(i, 2) + coeffs(4)*u(i, 3) &
                 + coeffs(5)*u(i, 4) &
                 + coeffs(6)*u(i, 5) + coeffs(7)*u(i, 6) &
                 + coeffs(8)*u(i, 7) + coeffs(9)*u(i, 8)
      du(i, 4) = ffr(4)*(du(i, 4) - alfa*du(i, 3))
   end do
   !$omp end simd

   ! Bulk rows: the whole stencil lies inside u. The j loop is sequential
   ! because row j depends on row j-1; only the i loop is vectorised.
   do j = n_s + 1, n - n_s
      ! temp_du is a per-lane temporary, so it must be private to each
      ! SIMD lane rather than a scalar shared by all of them.
      !$omp simd private(temp_du)
      do i = 1, SZ
         temp_du = c_m4*u(i, j - 4) + c_m3*u(i, j - 3) &
                   + c_m2*u(i, j - 2) + c_m1*u(i, j - 1) &
                   + c_j*u(i, j) &
                   + c_p1*u(i, j + 1) + c_p2*u(i, j + 2) &
                   + c_p3*u(i, j + 3) + c_p4*u(i, j + 4)
         du(i, j) = ffr(j)*(temp_du - alfa*du(i, j - 1))
      end do
      !$omp end simd
   end do

   ! Last four rows: the stencil reaches into the u_e halo. j is reassigned
   ! inside the loop body, hence it is declared private to each lane.
   !$omp simd private(j)
   do i = 1, SZ
      j = n - 3
      du(i, j) = coeffs(1)*u(i, j - 4) + coeffs(2)*u(i, j - 3) &
                 + coeffs(3)*u(i, j - 2) + coeffs(4)*u(i, j - 1) &
                 + coeffs(5)*u(i, j) &
                 + coeffs(6)*u(i, j + 1) + coeffs(7)*u(i, j + 2) &
                 + coeffs(8)*u(i, j + 3) + coeffs(9)*u_e(i, 1)
      du(i, j) = ffr(j)*(du(i, j) - alfa*du(i, j - 1))
      j = n - 2
      du(i, j) = coeffs(1)*u(i, j - 4) + coeffs(2)*u(i, j - 3) &
                 + coeffs(3)*u(i, j - 2) + coeffs(4)*u(i, j - 1) &
                 + coeffs(5)*u(i, j) &
                 + coeffs(6)*u(i, j + 1) + coeffs(7)*u(i, j + 2) &
                 + coeffs(8)*u_e(i, 1) + coeffs(9)*u_e(i, 2)
      du(i, j) = ffr(j)*(du(i, j) - alfa*du(i, j - 1))
      j = n - 1
      du(i, j) = coeffs(1)*u(i, j - 4) + coeffs(2)*u(i, j - 3) &
                 + coeffs(3)*u(i, j - 2) + coeffs(4)*u(i, j - 1) &
                 + coeffs(5)*u(i, j) &
                 + coeffs(6)*u(i, j + 1) + coeffs(7)*u_e(i, 1) &
                 + coeffs(8)*u_e(i, 2) + coeffs(9)*u_e(i, 3)
      du(i, j) = ffr(j)*(du(i, j) - alfa*du(i, j - 1))
      j = n
      du(i, j) = coeffs(1)*u(i, j - 4) + coeffs(2)*u(i, j - 3) &
                 + coeffs(3)*u(i, j - 2) + coeffs(4)*u(i, j - 1) &
                 + coeffs(5)*u(i, j) &
                 + coeffs(6)*u_e(i, 1) + coeffs(7)*u_e(i, 2) &
                 + coeffs(8)*u_e(i, 3) + coeffs(9)*u_e(i, 4)
      du(i, j) = ffr(j)*(du(i, j) - alfa*du(i, j - 1))
   end do
   !$omp end simd

   ! Publish the end-of-line value produced by the forward sweep
   !$omp simd
   do i = 1, SZ
      send_u_e(i, 1) = du(i, n)
   end do
   !$omp end simd

   ! Backward pass of the hybrid algorithm (rows n-2 down to 2; rows n-1
   ! and n keep their forward-sweep values)
   do j = n - 2, 2, -1
      !$omp simd
      do i = 1, SZ
         du(i, j) = du(i, j) - fbc(j)*du(i, j + 1)
      end do
      !$omp end simd
   end do
   ! Row 1 is both back-substituted and scaled, then published
   !$omp simd
   do i = 1, SZ
      du(i, 1) = ffr(1)*(du(i, 1) - fbc(1)*du(i, 2))
      send_u_b(i, 1) = du(i, 1)
   end do
   !$omp end simd

end subroutine der_univ_dist_omp

! Substitution step: correct every row of du with the two boundary values
! obtained from du(:, 1), du(:, n) and the received buffers.
!
! Arguments
!   du        (inout) on entry the values to be corrected (they are read
!                     before being overwritten); on exit the corrected values.
!   recv_u_b  (in)    column 1 is combined with du(:, 1).
!   recv_u_e  (in)    column 1 is combined with du(:, n).
!   n         (in)    number of points along the line.
!   alfa      (in)    unused; kept so existing callers need not change.
!   dist_sa, dist_sc (in) per-row factors multiplying the first and last
!                     boundary values respectively.
!
! NOTE(review): recv_u_b/recv_u_e presumably hold the neighbours' send
! buffers filled by der_univ_dist_omp -- confirm with the caller.
subroutine der_univ_subs_omp(du, recv_u_b, recv_u_e, n, alfa, &
                             dist_sa, dist_sc)
   implicit none

   ! Arguments
   real(dp), intent(inout), dimension(:, :) :: du
   real(dp), intent(in), dimension(:, :) :: recv_u_b, recv_u_e
   real(dp), intent(in), dimension(:) :: dist_sa, dist_sc
   real(dp), intent(in) :: alfa
   integer, intent(in) :: n

   ! Local variables
   integer :: i, j
   real(dp) :: ur, bl, recp
   ! One boundary value per lane. These must be arrays: a scalar would keep
   ! only the value of the last lane and apply it to every lane below.
   real(dp), dimension(SZ) :: du_1, du_n

   bl = dist_sa(1)
   ur = dist_sc(n)
   recp = 1._dp/(1._dp - ur*bl)

   ! Solve for the first and last value of each line before any row of du
   ! is overwritten.
   !$omp simd
   do i = 1, SZ
      du_1(i) = recp*(du(i, 1) - bl*recv_u_b(i, 1))
      du_n(i) = recp*(du(i, n) - ur*recv_u_e(i, 1))
   end do
   !$omp end simd

   !$omp simd
   do i = 1, SZ
      du(i, 1) = du_1(i)
   end do
   !$omp end simd
   ! Interior rows: subtract the contribution of both boundary values
   do j = 2, n - 1
      !$omp simd
      do i = 1, SZ
         du(i, j) = (du(i, j) - dist_sa(j)*du_1(i) - dist_sc(j)*du_n(i))
      end do
      !$omp end simd
   end do
   !$omp simd
   do i = 1, SZ
      du(i, n) = du_n(i)
   end do
   !$omp end simd

end subroutine der_univ_subs_omp

end module m_omp_kernels_dist
8 changes: 0 additions & 8 deletions src/thomas.f90
Original file line number Diff line number Diff line change
Expand Up @@ -186,25 +186,19 @@ pure subroutine solve(self, f, df)
n = size(self%fwd)
if (size(f, 2) /= n) error stop
do j = 2, n
!$omp simd
do i = 1, size(f, 1)
df(i, j) = df(i, j) - df(i, j - 1) * self%fwd(j)
end do
!$omp end simd
end do
!$omp simd
do i = 1, size(f, 1)
df(i, n) = df(i, n) * self%bwd(n)
end do
!$omp end simd

do j = n - 1, 1, -1
!$omp simd
do i = 1, size(f, 1)
df(i, j) = (df(i, j) - df(i, j + 1) * self%updiag(j)) &
& * self%bwd(j)
end do
!$omp end simd
end do
end subroutine solve

Expand Down Expand Up @@ -232,12 +226,10 @@ pure subroutine solve_periodic(self, f, df)
alpha = self%updiag(1)
select type (self)
type is (periodic_tridiagsolv)
!$omp simd
do i = 1, size(f, 1)
df(i, 1:m) = y(i, :) - ((y(i, 1) - alpha * y(i, m)) &
& / (1.+self%q(1) - alpha * self%q(m))) * self%q
end do
!$omp end simd
class default
error stop
end select
Expand Down
2 changes: 1 addition & 1 deletion src/vector3d_simd.f90
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ subroutine transport_dir(self, dim, rslt)
rslt(i, j, k) = -0.5 * &
(u(i, j, k) * du(i, j) + dusq(i, j)) &
& + self%xnu * d2u(i, j)
!$omp end simd
end do
!$omp end simd
end do
end do layers
end associate
Expand Down
13 changes: 12 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ set(TESTSRC
test_tridiagonal.f90
test_stencil.f90
test_diffengine.f90
omp/test_omp_tridiag.f90
)
set(CUDATESTSRC
cuda/test_cuda_allocator.f90
Expand All @@ -28,7 +29,17 @@ foreach(testfile IN LISTS TESTSRC)
get_filename_component(test_name ${testfile} NAME_WE)

add_executable(${test_name} ${testfile})

target_compile_options(${test_name} PRIVATE "-O3")

if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "GNU")
target_compile_options(${test_name} PRIVATE "-ffast-math")
endif()

target_link_libraries(${test_name} PRIVATE x3d2)

add_test(NAME ${test_name} COMMAND ${test_name})
find_package(OpenMP REQUIRED)
target_link_libraries(${test_name} PRIVATE OpenMP::OpenMP_Fortran)

add_test(NAME ${test_name} COMMAND sh -c "mpirun -np 1 ${test_name}")
endforeach()
5 changes: 4 additions & 1 deletion tests/cuda/test_cuda_tridiag.f90
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ program test_cuda_tridiag
use cudafor
use mpi

use m_common, only: dp, SZ, pi
use m_common, only: dp, pi
use m_cuda_common, only: SZ
use m_cuda_kernels_dist, only: der_univ_dist, der_univ_subs
use m_derparams, only: der_1_vv, der_2_vv

Expand Down Expand Up @@ -193,5 +194,7 @@ program test_cuda_tridiag
error stop 'SOME TESTS FAILED.'
end if

call MPI_Finalize(ierr)

end program test_cuda_tridiag

Loading

0 comments on commit 0937998

Please sign in to comment.