diff --git a/gpu-openacc-diffusion/Makefile b/gpu-openacc-diffusion/Makefile index 452a19e..02f3da8 100644 --- a/gpu-openacc-diffusion/Makefile +++ b/gpu-openacc-diffusion/Makefile @@ -2,7 +2,7 @@ # OpenACC implementation CXX = pgcc -CXXFLAGS = -O3 -I../common-diffusion -acc -ta=tesla -Minfo=accel -mp +CXXFLAGS = -O3 -I../common-diffusion -acc -ta=tesla -ta=tesla:cc30 -ta=tesla:cc50 -ta=tesla:cc60 -Minfo=accel -mp LINKS = -lm -lpng OBJS = boundaries.o discretization.o mesh.o numerics.o output.o timer.o diff --git a/gpu-openacc-diffusion/openacc_boundaries.c b/gpu-openacc-diffusion/openacc_boundaries.c index 6627782..d36a309 100644 --- a/gpu-openacc-diffusion/openacc_boundaries.c +++ b/gpu-openacc-diffusion/openacc_boundaries.c @@ -59,47 +59,48 @@ void apply_initial_conditions(fp_t** conc, int nx, int ny, int nm, fp_t bc[2][2] } } -void boundary_kernel(fp_t** conc, int nx, int ny, int nm, fp_t bc[2][2]) +void boundary_kernel(fp_t** __restrict__ conc, int nx, int ny, int nm, fp_t bc[2][2]) { /* apply fixed boundary values: sequence does not matter */ - #pragma acc parallel + #pragma acc declare present(conc[0:ny][0:nx], bc[0:2][0:2]) { - #pragma acc loop - for (int j = 0; j < ny/2; j++) { - #pragma acc loop - for (int i = 0; i < 1+nm/2; i++) { - conc[j][i] = bc[1][0]; /* left value */ + #pragma acc parallel + { + #pragma acc loop independent collapse(2) + for (int j = 0; j < ny/2; j++) { + for (int i = 0; i < 1+nm/2; i++) { + conc[j][i] = bc[1][0]; /* left value */ + } } - } - #pragma acc loop - for (int j = ny/2; j < ny; j++) { - #pragma acc loop - for (int i = nx-1-nm/2; i < nx; i++) { - conc[j][i] = bc[1][1]; /* right value */ + #pragma acc loop independent collapse(2) + for (int j = ny/2; j < ny; j++) { + for (int i = nx-1-nm/2; i < nx; i++) { + conc[j][i] = bc[1][1]; /* right value */ + } } } /* apply no-flux boundary conditions: inside to out, sequence matters */ for (int offset = 0; offset < nm/2; offset++) { - int ilo = nm/2 - offset; - int ihi = nx - 1 - nm/2 + offset; - #pragma acc loop - for (int j = 0; j < ny; j++) { - conc[j][ilo-1] = conc[j][ilo]; /* left condition */ - conc[j][ihi+1] = conc[j][ihi]; /* right condition */ - } - } - - for (int offset = 0; offset < nm/2; offset++) { - int jlo = nm/2 - offset; - int jhi = ny - 1 - nm/2 + offset; - #pragma acc loop - for (int i = 0; i < nx; i++) { - conc[jlo-1][i] = conc[jlo][i]; /* bottom condition */ - conc[jhi+1][i] = conc[jhi][i]; /* top condition */ + #pragma acc parallel + { + int ilo = nm/2 - offset; + int ihi = nx - 1 - nm/2 + offset; + int jlo = nm/2 - offset; + int jhi = ny - 1 - nm/2 + offset; + #pragma acc loop independent + for (int j = 0; j < ny; j++) { + conc[j][ilo-1] = conc[j][ilo]; /* left condition */ + conc[j][ihi+1] = conc[j][ihi]; /* right condition */ + } + #pragma acc loop independent + for (int i = 0; i < nx; i++) { + conc[jlo-1][i] = conc[jlo][i]; /* bottom condition */ + conc[jhi+1][i] = conc[jhi][i]; /* top condition */ + } } } } diff --git a/gpu-openacc-diffusion/openacc_discretization.c b/gpu-openacc-diffusion/openacc_discretization.c index 677a20b..c984de0 100644 --- a/gpu-openacc-diffusion/openacc_discretization.c +++ b/gpu-openacc-diffusion/openacc_discretization.c @@ -31,13 +31,14 @@ void convolution_kernel(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, int nx, int ny, int nm) { + #pragma acc declare present(conc_old[0:ny][0:nx], conc_lap[0:ny][0:nx], mask_lap[0:nm][0:nm]) #pragma acc parallel { - #pragma acc loop + #pragma acc loop collapse(2) for (int j = nm/2; j < ny-nm/2; j++) { - #pragma acc loop for (int i = nm/2; i < nx-nm/2; i++) { fp_t value = 0.; + #pragma acc loop seq collapse(2) for (int mj = -nm/2; mj < 1+nm/2; mj++) { for (int mi = -nm/2; mi < 1+nm/2; mi++) { value += mask_lap[mj+nm/2][mi+nm/2] * conc_old[j+mj][i+mi]; @@ -52,11 +53,11 @@ void convolution_kernel(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, int n void diffusion_kernel(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap, int nx, int ny, int nm, fp_t D, fp_t dt) { + #pragma acc declare present(conc_old[0:ny][0:nx], conc_new[0:ny][0:nx], conc_lap[0:ny][0:nx]) #pragma acc parallel { - #pragma acc loop + #pragma acc loop collapse(2) for (int j = nm/2; j < ny-nm/2; j++) { - #pragma acc loop for (int i = nm/2; i < nx-nm/2; i++) { conc_new[j][i] = conc_old[j][i] + dt * D * conc_lap[j][i]; } @@ -64,23 +65,14 @@ void diffusion_kernel(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap, } } - -void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, - int nx, int ny, int nm) -{ - /* If you must compute the convolution separately, do so here. */ - #pragma acc data copyin(conc_old[0:ny][0:nx], mask_lap[0:nm][0:nm]) create(conc_lap[0:ny][0:nx]) copyout(conc_lap[0:ny][0:nx]) - { - convolution_kernel(conc_old, conc_lap, mask_lap, nx, ny, nm); - } -} - void solve_diffusion_equation(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap, fp_t** mask_lap, int nx, int ny, int nm, fp_t bc[2][2], fp_t D, fp_t dt, int checks, fp_t* elapsed, struct Stopwatch* sw) { - #pragma acc data copy(conc_old[0:ny][0:nx], mask_lap[0:nm][0:nm], bc[0:2][0:2]) create(conc_lap[0:ny][0:nx], conc_new[0:ny][0:nx]) + #pragma acc data present_or_copy(conc_old[0:ny][0:nx]) \ + present_or_copyin(mask_lap[0:nm][0:nm], bc[0:2][0:2]) \ + present_or_create(conc_lap[0:ny][0:nx], conc_new[0:ny][0:nx]) { double start_time=0.; int check=0;