Skip to content

Commit

Permalink
Finished the changes towards complex datatypes from C99. Fixes issue 70.
Browse files Browse the repository at this point in the history
Added defines to simplify adding pragmas to ignore warnings in gcc and loop_count in icc. The ignore pragmas are used in several places to suppress warnings about conversion from doublecomplex* to complex*:
- calls to Temperton FFT routines.
- copying data in SSE3 optimizations (here explicit pointer casts were also added).
Added comments in several places concerning the portability of such conversions.

icc pragmas are now visible only when compiling with icc.

Inclusion of clAmdFft.h is now enclosed in pragmas to ignore existent warning.

Calling of IGT Fortran routines was slightly changed, conversion from double to complex has been moved inside the C file from Fortran routine.

tests/2exec/comp2exec was improved by adding a common reference path (to search all reference binaries). Added '-pol lak' to suite.
  • Loading branch information
myurkin committed Aug 1, 2013
1 parent 057298c commit 653fbf5
Show file tree
Hide file tree
Showing 13 changed files with 143 additions and 74 deletions.
3 changes: 1 addition & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -329,8 +329,7 @@ ifeq ($(COMPILER),gnu)
COPT1 := -O2
COPT2 := -O3 -ffast-math -funroll-loops
CWARN := -Wall -Wextra -Wcast-qual -Wpointer-arith -Wwrite-strings -Wstrict-prototypes \
-Wstrict-aliasing=1 -Wshadow -Wcast-align -Wnested-externs -Wcomment -Wno-unknown-pragmas \
-Wno-overlength-strings
-Wstrict-aliasing=1 -Wshadow -Wcast-align -Wnested-externs -Wcomment -Wno-overlength-strings
# gcc versions prior to 4.7.2 are affected by bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=7263 , which causes
# -pedantic flag to generates warnings on every occurence of I (complex i)
GCC_GTEQ_472 := $(shell expr `gcc -dumpversion | sed -e 's/\.\([0-9][0-9]\)/\1/g' -e 's/\.\([0-9]\)/0\1/g' \
Expand Down
5 changes: 2 additions & 3 deletions src/comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,8 @@ static MPI_Datatype MPIVarType(var_type type,bool reduce,int *mult)
* directly used. In this case we emulate more complex datatypes through multiplication of double, and additional
* variable 'mult' is returned to account for this factor.
*
* Reduction of complex numbers is emulated if not supported; the emulation is not perfectly portable - depends on a
* particular implementation of complex numbers. The good thing is that it is only used for old (less modern) MPI
* implementations.
* Reduction of complex numbers is emulated if not supported; C99 implies that this emulation is portable. Anyway, it
* is only used for old (less modern) MPI implementations.
*/
{
if (reduce) *mult=1; // default value when direct correspondence is possible
Expand Down
26 changes: 24 additions & 2 deletions src/fft.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@
#include <string.h>

#ifdef CLFFT_AMD
IGNORE_WARNING(-Wstrict-prototypes) // no way to change the library header
# include <clAmdFft.h> //external library from AMD
STOP_IGNORE
// Defines precision of clAmdFft transforms. !!! CLFFT_DOUBLE_FAST should be tested when becomes operational
# define PRECISION_CLFFT CLFFT_DOUBLE
#elif defined(CLFFT_APPLE)
Expand Down Expand Up @@ -318,8 +320,15 @@ void fftX(const int isign)
#elif defined(FFT_TEMPERTON)
int nn=gridX,inc=1,jump=nn,lot=boxY;
size_t z;

/* Calls to Temperton FFT cause warnings for translation from doublecomplex to double pointers. However, such a cast
* is perfectly valid in C99. So we set pragmas to remove these warnings.
*
* !!! TODO: Another (ultimate) solution is to remove this routine altogether, since FFTW is perfect in all
* respects. This is also reasonable considering future switch to tgmath.h
*/
IGNORE_WARNING(-Wstrict-aliasing);
for (z=0;z<3*local_Nz;z++) cfft99_((double *)(Xmatrix+z*gridX*smallY),work,trigsX,ifaxX,&inc,&jump,&nn,&lot,&isign);
STOP_IGNORE;
#endif
}

Expand All @@ -343,7 +352,9 @@ void fftY(const int isign)
#elif defined(FFT_TEMPERTON)
int nn=gridY,inc=1,jump=nn,lot=3*gridZ;

IGNORE_WARNING(-Wstrict-aliasing);
cfft99_((double *)(slices_tr),work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign);
STOP_IGNORE;
#endif
}

Expand All @@ -367,7 +378,9 @@ void fftZ(const int isign)
#elif defined(FFT_TEMPERTON)
int nn=gridZ,inc=1,jump=nn,lot=boxY,Xcomp;

IGNORE_WARNING(-Wstrict-aliasing);
for (Xcomp=0;Xcomp<3;Xcomp++) cfft99_((double *)(slices+gridYZ*Xcomp),work,trigsZ,ifaxZ,&inc,&jump,&nn,&lot,&isign);
STOP_IGNORE;
#endif
}

Expand All @@ -382,7 +395,9 @@ static void fftX_Dm(const size_t lengthZ ONLY_FOR_TEMPERTON)
int nn=gridX,inc=1,jump=nn,lot=D2sizeY,isign=FFT_FORWARD;
size_t z;

IGNORE_WARNING(-Wstrict-aliasing);
for (z=0;z<lengthZ;z++) cfft99_((double *)(D2matrix+z*gridX*D2sizeY),work,trigsX,ifaxX,&inc,&jump,&nn,&lot,&isign);
STOP_IGNORE;
#endif
}

Expand All @@ -396,7 +411,9 @@ static void fftY_Dm(void)
#elif defined(FFT_TEMPERTON)
int nn=gridY,inc=1,jump=nn,lot=gridZ,isign=FFT_FORWARD;

IGNORE_WARNING(-Wstrict-aliasing);
cfft99_((double *)slice_tr,work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign);
STOP_IGNORE;
#endif
}

Expand All @@ -410,7 +427,9 @@ static void fftZ_Dm(void)
#elif defined(FFT_TEMPERTON)
int nn=gridZ,inc=1,jump=nn,lot=gridY,isign=FFT_FORWARD;

IGNORE_WARNING(-Wstrict-aliasing);
cfft99_((double *)slice,work,trigsZ,ifaxZ,&inc,&jump,&nn,&lot,&isign);
STOP_IGNORE;
#endif
}

Expand Down Expand Up @@ -531,6 +550,9 @@ static void fftInitAfterD(void)
* completely separate code is used for OpenCL and FFTW3, because even precise-timing output is significantly different.
* In particular, FFTW3 uses separate plans for forward and backward, while clFFT (by Apple or AMD) uses one plan for
* both directions.
*
* clFft access the OpenCL buffers directly, so they are not anyhow affected by the definition of complex numbers in the
* main part of the code (although, it is consistent with it)
*/
{
#ifdef OPENCL
Expand Down Expand Up @@ -777,7 +799,7 @@ void InitDmatrix(void)
CREATE_CL_BUFFER(bufslices,CL_MEM_READ_WRITE,gridYZ*3*sizeof(doublecomplex),NULL);
CREATE_CL_BUFFER(bufslices_tr,CL_MEM_READ_WRITE,gridYZ*3*sizeof(doublecomplex),NULL);
/* The following are constant device buffers which are initialized with host data. But bufDmatrix is initialized in
* the end of this function (to be compatible with prognosis. And bufcc_sqrt is initialized in InitCC, since it may
* the end of this function (to be compatible with prognosis). And bufcc_sqrt is initialized in InitCC, since it may
* change for every run of the iterative solver.
*/
CREATE_CL_BUFFER(bufcc_sqrt,CL_MEM_READ_ONLY,sizeof(cc_sqrt),NULL);
Expand Down
13 changes: 5 additions & 8 deletions src/fort/propaesplibreintadda.f
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ subroutine propaespacelibreintadda(Rij,k0a,arretecube,relreq,
double precision k0a,arretecubem
double precision x,y,z,arretecube,k0,xx0,yy0,zz0
double precision Rij(3),result(12)
c The structure of the result is the following:
c Re(G11),Re(G12),Re(G13),Re(G22),Re(G23),Re(G33),Im(G11),...,Im(G33)

c Variables needs for the integration
integer KEY, N, NF, NDIM, MINCLS, MAXCLS, IFAIL, NEVAL, NW
parameter (nw=4000000,ndim=3,nf=12)
double precision A(NDIM), B(NDIM), WRKSTR(NW)
double precision ABSEST(NF), FINEST(NF), ABSREQ, RELREQ,err
double precision ABSEST(NF), ABSREQ, RELREQ,err

double precision Id(3,3),Rab,Rvect(3)

Expand Down Expand Up @@ -68,21 +70,16 @@ subroutine propaespacelibreintadda(Rij,k0a,arretecube,relreq,
endif

call DCUHRE(NDIM,NF,A,B, MINCLS, MAXCLS, fonctionigtadda,
$ ABSREQ,RELREQ,KEY,NW,0,finest,ABSEST,NEVAL,IFAIL, WRKSTR)
$ ABSREQ,RELREQ,KEY,NW,0,result,ABSEST,NEVAL,IFAIL, WRKSTR)

do N = 1,NF
FINEST(N)=FINEST(N)/arretecube/arretecube/arretecube
result(N)=result(N)/arretecube/arretecube/arretecube
enddo

if (ifail.ne.0) then
write(*,*) 'IFAIL in IGT routine',IFAIL
endif

do i = 1,6
result(2*i-1)=finest(i)
result(2*i)=finest(i+6)
enddo

end
c*************************************************************
subroutine fonctionigtadda(ndim,zz,nfun,f)
Expand Down
21 changes: 19 additions & 2 deletions src/function.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* File: function.h
* $Date:: $
* Descr: function attributes
* Descr: function attributes and compiler pragmas
*
* Copyright (C) 2006,2008,2010-2011,2013 ADDA contributors
* This file is part of ADDA.
Expand All @@ -17,7 +17,6 @@
#ifndef __function_h
#define __function_h


// attribute options for GCC compilers (Intel compiler may also recognize them)
#ifdef __GNUC__
// sets a macro for testing GCC version (copied from _mingw.h)
Expand All @@ -26,6 +25,16 @@
(__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
# else
# define GCC_PREREQ(major, minor) 0
# endif
// pragmas to ignore warnings
# if GCC_PREREQ(4,6)
# define DO_PRAGMA(x) _Pragma (#x)
# define IGNORE_WARNING(x) DO_PRAGMA(GCC diagnostic ignored #x)
# // assume that push is not used anywhere
# define STOP_IGNORE _Pragma ("GCC diagnostic pop")
# else
# define IGNORE_WARNING(x)
# define STOP_IGNORE
# endif
// The following chooses between __printf__ and __gnu_printf__ attributes
# if GCC_PREREQ(4,4)
Expand All @@ -49,11 +58,19 @@
# define ATT_NORETURN __attribute__ ((__noreturn__))
# define ATT_UNUSED __attribute__ ((__unused__))
#else
# define IGNORE_WARNING(x)
# define STOP_IGNORE
# define ATT_PRINTF(a,b)
# define ATT_PURE
# define ATT_MALLOC
# define ATT_NORETURN
# define ATT_UNUSED
#endif

#ifdef __ICC
# define LARGE_LOOP _Pragma ("loop_count (10000)")
#else
# define LARGE_LOOP
#endif

#endif // __function_h
34 changes: 27 additions & 7 deletions src/interaction.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,18 @@ static inline __m128d accImExp_pd(const double x)

static inline doublecomplex accImExp(const double x)
{
/* Here and further in the SSE3 part it is assumed that doublecomplex is equivalent to two doubles (that is
* specified by the C99 standard). Explicit pointer casts have been put in place, and pragmas to ignore remaining
* warnings from strict aliasing.
*
* !!! TODO: SSE3 code is a nice hack. But it should be considered carefully - is it worth it? In particular, it
* seems that only parts of it are really beneficial (like tabulated evaluation of imaginary exponents), and those
* can be incorporated into the main code (using standard C99 only).
*/
doublecomplex c;
_mm_store_pd(&c,accImExp_pd(x));
IGNORE_WARNING(-Wstrict-aliasing);
_mm_store_pd((double *)(&c),accImExp_pd(x));
STOP_IGNORE;
return c;
}

Expand All @@ -155,26 +165,30 @@ static void CalcInterTerm_core(const double kr,const double kr2,const double inv
const __m128d v1 = _mm_set_pd(kr,t3);
const __m128d v2 = _mm_set_pd(t2,t1);
__m128d qff,im_re;
_mm_store_pd(expval,sc);
IGNORE_WARNING(-Wstrict-aliasing);
_mm_store_pd((double *)expval,sc);
STOP_IGNORE;

#undef INTERACT_MUL
#define INTERACT_DIAG(ind) { \
qff = _mm_set1_pd(qmunu[ind]); \
im_re = _mm_add_pd(v1,_mm_mul_pd(v2,qff)); \
im_re = cmul(sc,im_re); \
_mm_store_pd(result+ind,im_re); }
_mm_store_pd((double *)(result+ind),im_re); }
#define INTERACT_NONDIAG(ind) { \
qff = _mm_set1_pd(qmunu[ind]); \
im_re = _mm_mul_pd(v2,qff); \
im_re = cmul(sc,im_re); \
_mm_store_pd(result+ind,im_re); }
_mm_store_pd((double *)(result+ind),im_re); }

IGNORE_WARNING(-Wstrict-aliasing);
INTERACT_DIAG(0); // xx
INTERACT_NONDIAG(1); // xy
INTERACT_NONDIAG(2); // xz
INTERACT_DIAG(3); // yy
INTERACT_NONDIAG(4); // yz
INTERACT_DIAG(5); // zz
STOP_IGNORE;

#undef INTERACT_DIAG
#undef INTERACT_NONDIAG
Expand Down Expand Up @@ -716,12 +730,18 @@ void CalcInterTerm_igt(const int i,const int j,const int k,doublecomplex result[
double qvec[3],qmunu[6]; // unit directional vector {qx,qy,qz} and its outer-product {qxx,qxy,qxz,qyy,qyz,qzz}
double rn,invrn,invr3,kr,kr2; // |R/d|, 1/|R/d|, |R|^-3, kR, (kR)^2
doublecomplex expval; // exp(ikR)/|R|^3
double rtemp[3];
double rtemp[3],tmp[12];
int comp;

CalcInterParams1(i,j,k,qvec,&rn);
if (igt_lim==UNDEF || rn<=igt_lim) {
vMultScal(gridspace,qvec,rtemp);
propaespacelibreintadda_(rtemp,&WaveNum,&gridspace,&igt_eps,(double *)result);
/* passing complex vectors from Fortran to c is not necessarily portable (at least requires extra effort in
* the Fortran code. So we do it through double. This is not bad for performance, since double is anyway used
* internally for integration in this Fortran routine.
*/
propaespacelibreintadda_(rtemp,&WaveNum,&gridspace,&igt_eps,tmp);
for (comp=0;comp<6;comp++) result[comp] = tmp[comp] + I*tmp[comp+6];
}
else {
// The following is equivalent to CalcInterTerm_poi, except for the 1st part of initialization performed above
Expand Down Expand Up @@ -830,7 +850,7 @@ void InitInteraction(void)
#ifndef NO_FORTRAN
case G_IGT: CalcInterTerm = &CalcInterTerm_igt; break;
#endif
default: LogError(ONE_POS, "Invalid interaction term calculation method: %d",IntRelation);
default: LogError(ONE_POS, "Invalid interaction term calculation method: %d",(int)IntRelation);
// no break
}
// read tables if needed
Expand Down
6 changes: 5 additions & 1 deletion src/iterative.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ struct iter_params_struct {
int vec_N; // number of additional vectors to describe the state
void (*func)(const enum phase); // pointer to implementation of the iterative solver
};
static doublecomplex dumb; // dumb variable, used in workaround for issue 146
static doublecomplex dumb ATT_UNUSED; // dumb variable, used in workaround for issue 146

#define ITER_FUNC(name) static void name(const enum phase ph)

Expand Down Expand Up @@ -340,6 +340,10 @@ ITER_FUNC(BCGS2)
* so we use l=2 here. In many cases one iteration of this method is similar to two iterations of BiCGStab, but overall
* convergence is slightly better.
* Breakdown tests were made to coincide with that for BiCGStab for l=1.
*
* !!! This iterative solver produces segmentation fault when compiled with icc 11.1. Probably that is related to
* issue 146. But we leave it be (assume that this is a compiler bug). Even if someone uses this compiler, he can
* live fine without this iterative solver.
*/
{
#define LL 2 // potentially the method will also work for l=1 (but memory allocation and freeing need to be adjusted)
Expand Down
Loading

0 comments on commit 653fbf5

Please sign in to comment.