Skip to content

Commit

Permalink
Fix usage of OpenAcc data flow controls; 4x slower than tiled GPU alg…
Browse files Browse the repository at this point in the history
…orithms (CUDA, OpenCL)

Note: apply `#pragma acc declare present` in routines.

Closes #49.
  • Loading branch information
tkphd committed Oct 18, 2017
1 parent b34632b commit 4c78f9f
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 46 deletions.
2 changes: 1 addition & 1 deletion gpu-openacc-diffusion/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# OpenACC implementation

CXX = pgcc
CXXFLAGS = -O3 -I../common-diffusion -acc -ta=tesla -Minfo=accel -mp
CXXFLAGS = -O3 -I../common-diffusion -acc -ta=tesla -ta=tesla:cc30 -ta=tesla:cc50 -ta=tesla:cc60 -Minfo=accel -mp
LINKS = -lm -lpng

OBJS = boundaries.o discretization.o mesh.o numerics.o output.o timer.o
Expand Down
59 changes: 30 additions & 29 deletions gpu-openacc-diffusion/openacc_boundaries.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,47 +59,48 @@ void apply_initial_conditions(fp_t** conc, int nx, int ny, int nm, fp_t bc[2][2]
}
}

void boundary_kernel(fp_t** conc, int nx, int ny, int nm, fp_t bc[2][2])
void boundary_kernel(fp_t** __restrict__ conc, int nx, int ny, int nm, fp_t bc[2][2])
{
/* apply fixed boundary values: sequence does not matter */

#pragma acc parallel
#pragma acc declare present(conc[0:ny][0:nx], bc[0:2][0:2])
{
#pragma acc loop
for (int j = 0; j < ny/2; j++) {
#pragma acc loop
for (int i = 0; i < 1+nm/2; i++) {
conc[j][i] = bc[1][0]; /* left value */
#pragma acc parallel
{
#pragma acc loop independent collapse(2)
for (int j = 0; j < ny/2; j++) {
for (int i = 0; i < 1+nm/2; i++) {
conc[j][i] = bc[1][0]; /* left value */
}
}
}

#pragma acc loop
for (int j = ny/2; j < ny; j++) {
#pragma acc loop
for (int i = nx-1-nm/2; i < nx; i++) {
conc[j][i] = bc[1][1]; /* right value */
#pragma acc loop independent collapse(2)
for (int j = ny/2; j < ny; j++) {
for (int i = nx-1-nm/2; i < nx; i++) {
conc[j][i] = bc[1][1]; /* right value */
}
}
}

/* apply no-flux boundary conditions: inside to out, sequence matters */

for (int offset = 0; offset < nm/2; offset++) {
int ilo = nm/2 - offset;
int ihi = nx - 1 - nm/2 + offset;
#pragma acc loop
for (int j = 0; j < ny; j++) {
conc[j][ilo-1] = conc[j][ilo]; /* left condition */
conc[j][ihi+1] = conc[j][ihi]; /* right condition */
}
}

for (int offset = 0; offset < nm/2; offset++) {
int jlo = nm/2 - offset;
int jhi = ny - 1 - nm/2 + offset;
#pragma acc loop
for (int i = 0; i < nx; i++) {
conc[jlo-1][i] = conc[jlo][i]; /* bottom condition */
conc[jhi+1][i] = conc[jhi][i]; /* top condition */
#pragma acc parallel
{
int ilo = nm/2 - offset;
int ihi = nx - 1 - nm/2 + offset;
int jlo = nm/2 - offset;
int jhi = ny - 1 - nm/2 + offset;
#pragma acc loop independent
for (int j = 0; j < ny; j++) {
conc[j][ilo-1] = conc[j][ilo]; /* left condition */
conc[j][ihi+1] = conc[j][ihi]; /* right condition */
}
#pragma acc loop independent
for (int i = 0; i < nx; i++) {
conc[jlo-1][i] = conc[jlo][i]; /* bottom condition */
conc[jhi+1][i] = conc[jhi][i]; /* top condition */
}
}
}
}
Expand Down
24 changes: 8 additions & 16 deletions gpu-openacc-diffusion/openacc_discretization.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,14 @@

void convolution_kernel(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, int nx, int ny, int nm)
{
#pragma acc declare present(conc_old[0:ny][0:nx], conc_lap[0:ny][0:nx], mask_lap[0:nm][0:nm])
#pragma acc parallel
{
#pragma acc loop
#pragma acc loop collapse(2)
for (int j = nm/2; j < ny-nm/2; j++) {
#pragma acc loop
for (int i = nm/2; i < nx-nm/2; i++) {
fp_t value = 0.;
#pragma acc loop seq collapse(2)
for (int mj = -nm/2; mj < 1+nm/2; mj++) {
for (int mi = -nm/2; mi < 1+nm/2; mi++) {
value += mask_lap[mj+nm/2][mi+nm/2] * conc_old[j+mj][i+mi];
Expand All @@ -52,35 +53,26 @@ void convolution_kernel(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, int n
void diffusion_kernel(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap,
int nx, int ny, int nm, fp_t D, fp_t dt)
{
#pragma acc declare present(conc_old[0:ny][0:nx], conc_new[0:ny][0:nx], conc_lap[0:ny][0:nx])
#pragma acc parallel
{
#pragma acc loop
#pragma acc loop collapse(2)
for (int j = nm/2; j < ny-nm/2; j++) {
#pragma acc loop
for (int i = nm/2; i < nx-nm/2; i++) {
conc_new[j][i] = conc_old[j][i] + dt * D * conc_lap[j][i];
}
}
}
}


void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap,
int nx, int ny, int nm)
{
/* If you must compute the convolution separately, do so here. */
#pragma acc data copyin(conc_old[0:ny][0:nx], mask_lap[0:nm][0:nm]) create(conc_lap[0:ny][0:nx]) copyout(conc_lap[0:ny][0:nx])
{
convolution_kernel(conc_old, conc_lap, mask_lap, nx, ny, nm);
}
}

void solve_diffusion_equation(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap,
fp_t** mask_lap, int nx, int ny, int nm,
fp_t bc[2][2], fp_t D, fp_t dt, int checks,
fp_t* elapsed, struct Stopwatch* sw)
{
#pragma acc data copy(conc_old[0:ny][0:nx], mask_lap[0:nm][0:nm], bc[0:2][0:2]) create(conc_lap[0:ny][0:nx], conc_new[0:ny][0:nx])
#pragma acc data present_or_copy(conc_old[0:ny][0:nx]) \
present_or_copyin(mask_lap[0:nm][0:nm], bc[0:2][0:2]) \
present_or_create(conc_lap[0:ny][0:nx], conc_new[0:ny][0:nx])
{
double start_time=0.;
int check=0;
Expand Down

0 comments on commit 4c78f9f

Please sign in to comment.