From 653fbf547bd8760d3c8af58fe174f33e7feab03d Mon Sep 17 00:00:00 2001 From: yurkin Date: Thu, 1 Aug 2013 11:47:18 +0000 Subject: [PATCH] Finished the changes towards complex datatypes from C99. Fixes issue 70. Added defines to simplify adding pragmas to ignore warnings in gcc and loop_count in icc. The ignore pragmas are used in several places to suppress warnings about conversion from doublecomplex* to complex*: - calls to Temperton FFT routines. - copying data in SSE3 optimizations (here explicit pointer casts were also added). Added comments in several places concerning the portability of such conversions. icc pragmas are now visible only when compiling with icc. Inclusion of clAmdFft.h is now enclosed in pragmas to ignore existent warning. Calling of IGT Fortran routines was slightly changed, conversion from double to complex has been moved inside the C file from Fortran routine. tests/2exec/comp2exec was improved by adding a common reference path (to search all reference binaries). Added '-pol lak' to suite. --- src/Makefile | 3 +- src/comm.c | 5 +- src/fft.c | 26 +++++++++- src/fort/propaesplibreintadda.f | 13 ++--- src/function.h | 21 +++++++- src/interaction.c | 34 ++++++++++--- src/iterative.c | 6 ++- src/linalg.c | 86 ++++++++++++++++----------------- src/oclcore.c | 4 ++ src/param.c | 2 + src/sparse_ops.h | 2 + tests/2exec/comp2exec | 14 +++--- tests/2exec/suite | 1 + 13 files changed, 143 insertions(+), 74 deletions(-) diff --git a/src/Makefile b/src/Makefile index a67baca0..fbe9e8b7 100644 --- a/src/Makefile +++ b/src/Makefile @@ -329,8 +329,7 @@ ifeq ($(COMPILER),gnu) COPT1 := -O2 COPT2 := -O3 -ffast-math -funroll-loops CWARN := -Wall -Wextra -Wcast-qual -Wpointer-arith -Wwrite-strings -Wstrict-prototypes \ - -Wstrict-aliasing=1 -Wshadow -Wcast-align -Wnested-externs -Wcomment -Wno-unknown-pragmas \ - -Wno-overlength-strings + -Wstrict-aliasing=1 -Wshadow -Wcast-align -Wnested-externs -Wcomment -Wno-overlength-strings # gcc versions prior to 4.7.2 are affected by bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=7263 , which causes # -pedantic flag to generates warnings on every occurence of I (complex i) GCC_GTEQ_472 := $(shell expr `gcc -dumpversion | sed -e 's/\.\([0-9][0-9]\)/\1/g' -e 's/\.\([0-9]\)/0\1/g' \ diff --git a/src/comm.c b/src/comm.c index 49a1a0c3..9b63b16a 100644 --- a/src/comm.c +++ b/src/comm.c @@ -154,9 +154,8 @@ static MPI_Datatype MPIVarType(var_type type,bool reduce,int *mult) * directly used. In this case we emulate more complex datatypes through multiplication of double, and additional * variable 'mult' is returned to account for this factor. * - * Reduction of complex numbers is emulated if not supported; the emulation is not perfectly portable - depends on a - * particular implementation of complex numbers. The good thing is that it is only used for old (less modern) MPI - * implementations. + * Reduction of complex numbers is emulated if not supported; C99 implies that this emulation is portable. Anyway, it + * is only used for old (less modern) MPI implementations. */ { if (reduce) *mult=1; // default value when direct correspondence is possible diff --git a/src/fft.c b/src/fft.c index da39f8f4..022e203a 100644 --- a/src/fft.c +++ b/src/fft.c @@ -34,7 +34,9 @@ #include #ifdef CLFFT_AMD + IGNORE_WARNING(-Wstrict-prototypes) // no way to change the library header # include //external library from AMD + STOP_IGNORE // Defines precision of clAmdFft transforms. !!! CLFFT_DOUBLE_FAST should be tested when becomes operational # define PRECISION_CLFFT CLFFT_DOUBLE #elif defined(CLFFT_APPLE) @@ -318,8 +320,15 @@ void fftX(const int isign) #elif defined(FFT_TEMPERTON) int nn=gridX,inc=1,jump=nn,lot=boxY; size_t z; - + /* Calls to Temperton FFT cause warnings for translation from doublecomplex to double pointers. However, such a cast + * is perfectly valid in C99. So we set pragmas to remove these warnings. + * + * !!! TODO: Another (ultimate) solution is to remove this routine altogether, since FFTW is perfect in all + * respects. This is also reasonable considering future switch to tgmath.h + */ + IGNORE_WARNING(-Wstrict-aliasing); for (z=0;z<3*local_Nz;z++) cfft99_((double *)(Xmatrix+z*gridX*smallY),work,trigsX,ifaxX,&inc,&jump,&nn,&lot,&isign); + STOP_IGNORE; #endif } @@ -343,7 +352,9 @@ void fftY(const int isign) #elif defined(FFT_TEMPERTON) int nn=gridY,inc=1,jump=nn,lot=3*gridZ; + IGNORE_WARNING(-Wstrict-aliasing); cfft99_((double *)(slices_tr),work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign); + STOP_IGNORE; #endif } @@ -367,7 +378,9 @@ void fftZ(const int isign) #elif defined(FFT_TEMPERTON) int nn=gridZ,inc=1,jump=nn,lot=boxY,Xcomp; + IGNORE_WARNING(-Wstrict-aliasing); for (Xcomp=0;Xcomp<3;Xcomp++) cfft99_((double *)(slices+gridYZ*Xcomp),work,trigsZ,ifaxZ,&inc,&jump,&nn,&lot,&isign); + STOP_IGNORE; #endif } @@ -382,7 +395,9 @@ static void fftX_Dm(const size_t lengthZ ONLY_FOR_TEMPERTON) int nn=gridX,inc=1,jump=nn,lot=D2sizeY,isign=FFT_FORWARD; size_t z; + IGNORE_WARNING(-Wstrict-aliasing); for (z=0;z (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) # else # define GCC_PREREQ(major, minor) 0 +# endif + // pragmas to ignore warnings +# if GCC_PREREQ(4,6) +# define DO_PRAGMA(x) _Pragma (#x) +# define IGNORE_WARNING(x) DO_PRAGMA(GCC diagnostic ignored #x) +# // assume that push is not used anywhere +# define STOP_IGNORE _Pragma ("GCC diagnostic pop") +# else +# define IGNORE_WARNING(x) +# define STOP_IGNORE # endif // The following chooses between __printf__ and __gnu_printf__ attributes # if GCC_PREREQ(4,4) @@ -49,6 +58,8 @@ # define ATT_NORETURN __attribute__ ((__noreturn__)) # define ATT_UNUSED __attribute__ ((__unused__)) #else +# define IGNORE_WARNING(x) +# define STOP_IGNORE # define ATT_PRINTF(a,b) # define ATT_PURE # define ATT_MALLOC @@ -56,4 +67,10 @@ # define ATT_UNUSED #endif +#ifdef __ICC +# define LARGE_LOOP _Pragma ("loop_count (10000)") +#else +# define LARGE_LOOP +#endif + #endif // __function_h diff --git a/src/interaction.c b/src/interaction.c index 88222d92..0e420cf7 100644 --- a/src/interaction.c +++ b/src/interaction.c @@ -138,8 +138,18 @@ static inline __m128d accImExp_pd(const double x) static inline doublecomplex accImExp(const double x) { + /* Here and further in the SSE3 part it is assumed that doublecomplex is equivalent to two doubles (that is + * specified by the C99 standard). Explicit pointer casts have been put in place, and pragmas to ignore remaining + * warnings from strict aliasing. + * + * !!! TODO: SSE3 code is a nice hack. But it should be considered carefully - is it worth it? In particular, it + * seems that only parts of it are really beneficial (like tabulated evaluation of imaginary exponents), and those + * can be incorporated into the main code (using standard C99 only). + */ doublecomplex c; - _mm_store_pd(&c,accImExp_pd(x)); + IGNORE_WARNING(-Wstrict-aliasing); + _mm_store_pd((double *)(&c),accImExp_pd(x)); + STOP_IGNORE; return c; } @@ -155,26 +165,30 @@ static void CalcInterTerm_core(const double kr,const double kr2,const double inv const __m128d v1 = _mm_set_pd(kr,t3); const __m128d v2 = _mm_set_pd(t2,t1); __m128d qff,im_re; - _mm_store_pd(expval,sc); + IGNORE_WARNING(-Wstrict-aliasing); + _mm_store_pd((double *)expval,sc); + STOP_IGNORE; #undef INTERACT_MUL #define INTERACT_DIAG(ind) { \ qff = _mm_set1_pd(qmunu[ind]); \ im_re = _mm_add_pd(v1,_mm_mul_pd(v2,qff)); \ im_re = cmul(sc,im_re); \ - _mm_store_pd(result+ind,im_re); } + _mm_store_pd((double *)(result+ind),im_re); } #define INTERACT_NONDIAG(ind) { \ qff = _mm_set1_pd(qmunu[ind]); \ im_re = _mm_mul_pd(v2,qff); \ im_re = cmul(sc,im_re); \ - _mm_store_pd(result+ind,im_re); } + _mm_store_pd((double *)(result+ind),im_re); } + IGNORE_WARNING(-Wstrict-aliasing); INTERACT_DIAG(0); // xx INTERACT_NONDIAG(1); // xy INTERACT_NONDIAG(2); // xz INTERACT_DIAG(3); // yy INTERACT_NONDIAG(4); // yz INTERACT_DIAG(5); // zz + STOP_IGNORE; #undef INTERACT_DIAG #undef INTERACT_NONDIAG @@ -716,12 +730,18 @@ void CalcInterTerm_igt(const int i,const int j,const int k,doublecomplex result[ double qvec[3],qmunu[6]; // unit directional vector {qx,qy,qz} and its outer-product {qxx,qxy,qxz,qyy,qyz,qzz} double rn,invrn,invr3,kr,kr2; // |R/d|, 1/|R/d|, |R|^-3, kR, (kR)^2 doublecomplex expval; // exp(ikR)/|R|^3 - double rtemp[3]; + double rtemp[3],tmp[12]; + int comp; CalcInterParams1(i,j,k,qvec,&rn); if (igt_lim==UNDEF || rn<=igt_lim) { vMultScal(gridspace,qvec,rtemp); - propaespacelibreintadda_(rtemp,&WaveNum,&gridspace,&igt_eps,(double *)result); + /* passing complex vectors from Fortran to c is not necessarily portable (at least requires extra effort in + * the Fortran code. So we do it through double. This is not bad for performance, since double is anyway used + * internally for integration in this Fortran routine. + */ + propaespacelibreintadda_(rtemp,&WaveNum,&gridspace,&igt_eps,tmp); + for (comp=0;comp<6;comp++) result[comp] = tmp[comp] + I*tmp[comp+6]; } else { // The following is equivalent to CalcInterTerm_poi, except for the 1st part of initialization performed above @@ -830,7 +850,7 @@ void InitInteraction(void) #ifndef NO_FORTRAN case G_IGT: CalcInterTerm = &CalcInterTerm_igt; break; #endif - default: LogError(ONE_POS, "Invalid interaction term calculation method: %d",IntRelation); + default: LogError(ONE_POS, "Invalid interaction term calculation method: %d",(int)IntRelation); // no break } // read tables if needed diff --git a/src/iterative.c b/src/iterative.c index 63b053a1..9e301c4d 100644 --- a/src/iterative.c +++ b/src/iterative.c @@ -92,7 +92,7 @@ struct iter_params_struct { int vec_N; // number of additional vectors to describe the state void (*func)(const enum phase); // pointer to implementation of the iterative solver }; -static doublecomplex dumb; // dumb variable, used in workaround for issue 146 +static doublecomplex dumb ATT_UNUSED; // dumb variable, used in workaround for issue 146 #define ITER_FUNC(name) static void name(const enum phase ph) @@ -340,6 +340,10 @@ ITER_FUNC(BCGS2) * so we use l=2 here. In many cases one iteration of this method is similar to two iterations of BiCGStab, but overall * convergence is slightly better. * Breakdown tests were made to coincide with that for BiCGStab for l=1. + * + * !!! This iterative solver produces segmentation fault when compiled with icc 11.1. Probably that is related to + * issue 146. But we leave it be (assume that this is a compiler bug). Even if someone uses this compiler, he can + * live fine without this iterative solver. */ { #define LL 2 // potentially the method will also work for l=1 (but memory allocation and freeing need to be adjusted) diff --git a/src/linalg.c b/src/linalg.c index 4800294a..4ec15385 100644 --- a/src/linalg.c +++ b/src/linalg.c @@ -21,14 +21,14 @@ // project headers #include "cmplx.h" #include "comm.h" +#include "function.h" #include "types.h" #include "vars.h" // system headers #include /* There are several optimization ideas used in this file: - * - pragmas, indicating that the loop is supposed to be very large, are used. However, they are probably understood - * only by the Intel compiler. + * - pragmas for Intel compiler, indicating that the loop is supposed to be very large, are used. * - If usage of some function has coinciding arguments, than a special function for such case is created. In * particular, this allows consistent usage of 'restrict' keyword almost for all function arguments. * - Deeper optimizations, such as loop unrolling, are left to the compiler. @@ -43,7 +43,7 @@ void nInit(doublecomplex * restrict a) { register size_t i; register const size_t n=local_nRows; -#pragma loop count (100000) + LARGE_LOOP; for (i=0;i #ifdef CLFFT_AMD + IGNORE_WARNING(-Wstrict-prototypes) // no way to change the library header # include // for version information + STOP_IGNORE #endif #ifndef NO_SVNREV diff --git a/src/sparse_ops.h b/src/sparse_ops.h index 5f11cea4..993c1cb6 100644 --- a/src/sparse_ops.h +++ b/src/sparse_ops.h @@ -54,9 +54,11 @@ static inline void AijProd(doublecomplex * restrict argvec,doublecomplex * restr __m128d res, tmp; + IGNORE_WARNING(-Wstrict-aliasing); // cast from doublecomplex* to double* is perfectly valid in C99 const __m128d argX = _mm_load_pd((double *)(argvec+j3)); const __m128d argY = _mm_load_pd((double *)(argvec+j3+1)); const __m128d argZ = _mm_load_pd((double *)(argvec+j3+2)); + STOP_IGNORE; res = cmul(argX, *(__m128d *)&(iterm[0])); tmp = cmul(argY, *(__m128d *)&(iterm[1])); diff --git a/tests/2exec/comp2exec b/tests/2exec/comp2exec index 49c51f8a..7490bbc3 100755 --- a/tests/2exec/comp2exec +++ b/tests/2exec/comp2exec @@ -17,6 +17,8 @@ INPUTDIR="./../../input" ADDASEQ="./../../src/seq/adda" ADDAMPI="./../../src/mpi/adda_mpi" ADDAOCL="./../../src/ocl/adda_ocl" +#Path to reference binaries, examples: "." or "./../../win64" +REFPATH=. # MPI command prefix MPIRUN="mpiexec -n 4" @@ -35,18 +37,18 @@ fi if [ -n "$SPARSE" ]; then DEFSUITE=suite_sparse - SEQREF="./adda_sparse" # !!! This should be adjusted - MPIREF="./adda_mpi_sparse" # !!! This should be adjusted - OCLREF="./adda_ocl_sparse" # !!! This should be adjusted + SEQREF="$REFPATH/adda_spa" # !!! This should be adjusted + MPIREF="$REFPATH/adda_spa_mpi" # !!! This should be adjusted + OCLREF="$REFPATH/adda_spa_ocl" # !!! This should be adjusted else if [ -n "$SPARSE_STANDARD" ]; then DEFSUITE=suite_sparse else DEFSUITE=suite fi - SEQREF="./adda_1.1" # !!! This should be adjusted - MPIREF="./adda_mpi_1.1.exe" # !!! This should be adjusted - OCLREF="./adda_ocl_1.1" # !!! This should be adjusted + SEQREF="$REFPATH/adda" # !!! This should be adjusted + MPIREF="$REFPATH/adda_mpi" # !!! This should be adjusted + OCLREF="$REFPATH/adda_ocl" # !!! This should be adjusted fi MODE=${1:-seq} diff --git a/tests/2exec/suite b/tests/2exec/suite index d8a8ec93..eea1fe1b 100644 --- a/tests/2exec/suite +++ b/tests/2exec/suite @@ -173,6 +173,7 @@ all -pol cm ;mgn; all -pol dgf ;mgn; all -pol fcd ;mgn; all -pol igt_so ;mgn; +all -pol lak ;mgn; all -pol ldr ;p; ;mgn; all -pol ldr avgpol ;p; ;mgn; all -pol rrc ;mgn;