From 653fbf547bd8760d3c8af58fe174f33e7feab03d Mon Sep 17 00:00:00 2001
From: yurkin <yurkin@gmail.com>
Date: Thu, 1 Aug 2013 11:47:18 +0000
Subject: [PATCH] Finished the changes towards complex datatypes from C99.
 Fixes issue 70.

Added defines to simplify adding pragmas to ignore warnings in gcc and loop_count in icc. The ignore pragmas are used in several places to suppress warnings about conversion from doublecomplex* to complex*:
- calls to Temperton FFT routines.
- copying data in SSE3 optimizations (here explicit pointer casts were also added).
Added comments in several places concerning the portability of such conversions.

icc pragmas are now visible only when compiling with icc.

Inclusion of clAmdFft.h is now enclosed in pragmas to ignore existent warning.

Calling of IGT Fortran routines was slightly changed, conversion from double to complex has been moved inside the C file from Fortran routine.

tests/2exec/comp2exec was improved by adding a common reference path (to search all reference binaries). Added '-pol lak' to suite.
---
 src/Makefile                    |  3 +-
 src/comm.c                      |  5 +-
 src/fft.c                       | 26 +++++++++-
 src/fort/propaesplibreintadda.f | 13 ++---
 src/function.h                  | 21 +++++++-
 src/interaction.c               | 34 ++++++++++---
 src/iterative.c                 |  6 ++-
 src/linalg.c                    | 86 ++++++++++++++++-----------------
 src/oclcore.c                   |  4 ++
 src/param.c                     |  2 +
 src/sparse_ops.h                |  2 +
 tests/2exec/comp2exec           | 14 +++---
 tests/2exec/suite               |  1 +
 13 files changed, 143 insertions(+), 74 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index a67baca0..fbe9e8b7 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -329,8 +329,7 @@ ifeq ($(COMPILER),gnu)
   COPT1   := -O2
   COPT2   := -O3 -ffast-math -funroll-loops
   CWARN   := -Wall -Wextra -Wcast-qual -Wpointer-arith -Wwrite-strings -Wstrict-prototypes \
-             -Wstrict-aliasing=1 -Wshadow -Wcast-align -Wnested-externs -Wcomment -Wno-unknown-pragmas \
-             -Wno-overlength-strings
+             -Wstrict-aliasing=1 -Wshadow -Wcast-align -Wnested-externs -Wcomment -Wno-overlength-strings
   # gcc versions prior to 4.7.2 are affected by bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=7263 , which causes
   # -pedantic flag to generates warnings on every occurence of I (complex i)
   GCC_GTEQ_472 := $(shell expr `gcc -dumpversion | sed -e 's/\.\([0-9][0-9]\)/\1/g' -e 's/\.\([0-9]\)/0\1/g' \
diff --git a/src/comm.c b/src/comm.c
index 49a1a0c3..9b63b16a 100644
--- a/src/comm.c
+++ b/src/comm.c
@@ -154,9 +154,8 @@ static MPI_Datatype MPIVarType(var_type type,bool reduce,int *mult)
  * directly used. In this case we emulate more complex datatypes through multiplication of double, and additional
  * variable 'mult' is returned to account for this factor.
  *
- * Reduction of complex numbers is emulated if not supported; the emulation is not perfectly portable - depends on a
- * particular implementation of complex numbers. The good thing is that it is only used for old (less modern) MPI
- * implementations.
+ * Reduction of complex numbers is emulated if not supported; C99 implies that this emulation is portable. Anyway, it
+ * is only used for old (less modern) MPI implementations.
  */
 {
 	if (reduce) *mult=1; // default value when direct correspondence is possible
diff --git a/src/fft.c b/src/fft.c
index da39f8f4..022e203a 100644
--- a/src/fft.c
+++ b/src/fft.c
@@ -34,7 +34,9 @@
 #include <string.h>
 
 #ifdef CLFFT_AMD
+	IGNORE_WARNING(-Wstrict-prototypes) // no way to change the library header
 #	include <clAmdFft.h> //external library from AMD
+	STOP_IGNORE
 	// Defines precision of clAmdFft transforms. !!! CLFFT_DOUBLE_FAST should be tested when becomes operational
 #	define PRECISION_CLFFT CLFFT_DOUBLE
 #elif defined(CLFFT_APPLE)
@@ -318,8 +320,15 @@ void fftX(const int isign)
 #elif defined(FFT_TEMPERTON)
 	int nn=gridX,inc=1,jump=nn,lot=boxY;
 	size_t z;
-
+	/* Calls to Temperton FFT cause warnings for translation from doublecomplex to double pointers. However, such a cast
+	 * is perfectly valid in C99. So we set pragmas to remove these warnings.
+	 *
+	 * !!! TODO: Another (ultimate) solution is to remove this routine altogether, since FFTW is perfect in all
+	 * respects. This is also reasonable considering future switch to tgmath.h
+	 */
+	IGNORE_WARNING(-Wstrict-aliasing);
 	for (z=0;z<3*local_Nz;z++) cfft99_((double *)(Xmatrix+z*gridX*smallY),work,trigsX,ifaxX,&inc,&jump,&nn,&lot,&isign);
+	STOP_IGNORE;
 #endif
 }
 
@@ -343,7 +352,9 @@ void fftY(const int isign)
 #elif defined(FFT_TEMPERTON)
 	int nn=gridY,inc=1,jump=nn,lot=3*gridZ;
 
+	IGNORE_WARNING(-Wstrict-aliasing);
 	cfft99_((double *)(slices_tr),work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign);
+	STOP_IGNORE;
 #endif
 }
 
@@ -367,7 +378,9 @@ void fftZ(const int isign)
 #elif defined(FFT_TEMPERTON)
 	int nn=gridZ,inc=1,jump=nn,lot=boxY,Xcomp;
 
+	IGNORE_WARNING(-Wstrict-aliasing);
 	for (Xcomp=0;Xcomp<3;Xcomp++) cfft99_((double *)(slices+gridYZ*Xcomp),work,trigsZ,ifaxZ,&inc,&jump,&nn,&lot,&isign);
+	STOP_IGNORE;
 #endif
 }
 
@@ -382,7 +395,9 @@ static void fftX_Dm(const size_t lengthZ ONLY_FOR_TEMPERTON)
 	int nn=gridX,inc=1,jump=nn,lot=D2sizeY,isign=FFT_FORWARD;
 	size_t z;
 
+	IGNORE_WARNING(-Wstrict-aliasing);
 	for (z=0;z<lengthZ;z++) cfft99_((double *)(D2matrix+z*gridX*D2sizeY),work,trigsX,ifaxX,&inc,&jump,&nn,&lot,&isign);
+	STOP_IGNORE;
 #endif
 }
 
@@ -396,7 +411,9 @@ static void fftY_Dm(void)
 #elif defined(FFT_TEMPERTON)
 	int nn=gridY,inc=1,jump=nn,lot=gridZ,isign=FFT_FORWARD;
 
+	IGNORE_WARNING(-Wstrict-aliasing);
 	cfft99_((double *)slice_tr,work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign);
+	STOP_IGNORE;
 #endif
 }
 
@@ -410,7 +427,9 @@ static void fftZ_Dm(void)
 #elif defined(FFT_TEMPERTON)
 	int nn=gridZ,inc=1,jump=nn,lot=gridY,isign=FFT_FORWARD;
 
+	IGNORE_WARNING(-Wstrict-aliasing);
 	cfft99_((double *)slice,work,trigsZ,ifaxZ,&inc,&jump,&nn,&lot,&isign);
+	STOP_IGNORE;
 #endif
 }
 
@@ -531,6 +550,9 @@ static void fftInitAfterD(void)
  * completely separate code is used for OpenCL and FFTW3, because even precise-timing output is significantly different.
  * In particular, FFTW3 uses separate plans for forward and backward, while clFFT (by Apple or AMD) uses one plan for
  * both directions.
+ *
+ * clFft access the OpenCL buffers directly, so they are not anyhow affected by the definition of complex numbers in the
+ * main part of the code (although, it is consistent with it)
  */
 {
 #ifdef OPENCL
@@ -777,7 +799,7 @@ void InitDmatrix(void)
 	CREATE_CL_BUFFER(bufslices,CL_MEM_READ_WRITE,gridYZ*3*sizeof(doublecomplex),NULL);
 	CREATE_CL_BUFFER(bufslices_tr,CL_MEM_READ_WRITE,gridYZ*3*sizeof(doublecomplex),NULL);
 	/* The following are constant device buffers which are initialized with host data. But bufDmatrix is initialized in
-	 * the end of this function (to be compatible with prognosis. And bufcc_sqrt is initialized in InitCC, since it may
+	 * the end of this function (to be compatible with prognosis). And bufcc_sqrt is initialized in InitCC, since it may
 	 * change for every run of the iterative solver.
 	 */
 	CREATE_CL_BUFFER(bufcc_sqrt,CL_MEM_READ_ONLY,sizeof(cc_sqrt),NULL);
diff --git a/src/fort/propaesplibreintadda.f b/src/fort/propaesplibreintadda.f
index e2985fb0..ed63e2b5 100644
--- a/src/fort/propaesplibreintadda.f
+++ b/src/fort/propaesplibreintadda.f
@@ -21,12 +21,14 @@ subroutine propaespacelibreintadda(Rij,k0a,arretecube,relreq,
       double precision k0a,arretecubem
       double precision x,y,z,arretecube,k0,xx0,yy0,zz0
       double precision Rij(3),result(12)
+c     The structure of the result is the following:
+c     Re(G11),Re(G12),Re(G13),Re(G22),Re(G23),Re(G33),Im(G11),...,Im(G33)
 
 c     Variables needs for the integration
       integer  KEY, N, NF, NDIM, MINCLS, MAXCLS, IFAIL, NEVAL, NW
       parameter (nw=4000000,ndim=3,nf=12)
       double precision A(NDIM), B(NDIM), WRKSTR(NW)
-      double precision  ABSEST(NF), FINEST(NF), ABSREQ, RELREQ,err
+      double precision  ABSEST(NF), ABSREQ, RELREQ,err
       
       double precision Id(3,3),Rab,Rvect(3)
 
@@ -68,21 +70,16 @@ subroutine propaespacelibreintadda(Rij,k0a,arretecube,relreq,
       endif
 
       call  DCUHRE(NDIM,NF,A,B, MINCLS, MAXCLS, fonctionigtadda,
-     $      ABSREQ,RELREQ,KEY,NW,0,finest,ABSEST,NEVAL,IFAIL, WRKSTR) 
+     $      ABSREQ,RELREQ,KEY,NW,0,result,ABSEST,NEVAL,IFAIL, WRKSTR) 
       
       do N = 1,NF
-         FINEST(N)=FINEST(N)/arretecube/arretecube/arretecube
+         result(N)=result(N)/arretecube/arretecube/arretecube
       enddo
 
       if (ifail.ne.0) then
          write(*,*) 'IFAIL in IGT routine',IFAIL
       endif
 
-      do i = 1,6
-        result(2*i-1)=finest(i)
-        result(2*i)=finest(i+6)
-      enddo
-    
       end
 c*************************************************************
       subroutine fonctionigtadda(ndim,zz,nfun,f)
diff --git a/src/function.h b/src/function.h
index 668816ae..4c7d0d85 100644
--- a/src/function.h
+++ b/src/function.h
@@ -1,6 +1,6 @@
 /* File: function.h
  * $Date::                            $
- * Descr: function attributes
+ * Descr: function attributes and compiler pragmas
  *
  * Copyright (C) 2006,2008,2010-2011,2013 ADDA contributors
  * This file is part of ADDA.
@@ -17,7 +17,6 @@
 #ifndef __function_h
 #define __function_h
 
-
 // attribute options for GCC compilers (Intel compiler may also recognize them)
 #ifdef __GNUC__
 	// sets a macro for testing GCC version (copied from _mingw.h)
@@ -26,6 +25,16 @@
 			(__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
 #	else
 #		define GCC_PREREQ(major, minor)  0
+#	endif
+	// pragmas to ignore warnings
+#	if GCC_PREREQ(4,6)
+#		define DO_PRAGMA(x) _Pragma (#x)
+#		define IGNORE_WARNING(x) DO_PRAGMA(GCC diagnostic ignored #x)
+#		// assume that push is not used anywhere
+#		define STOP_IGNORE       _Pragma ("GCC diagnostic pop")
+#	else
+#		define IGNORE_WARNING(x)
+#		define STOP_IGNORE
 #	endif
 	// The following chooses between __printf__ and __gnu_printf__ attributes
 #	if GCC_PREREQ(4,4)
@@ -49,6 +58,8 @@
 #	define ATT_NORETURN __attribute__ ((__noreturn__))
 #	define ATT_UNUSED   __attribute__ ((__unused__))
 #else
+#	define IGNORE_WARNING(x)
+#	define STOP_IGNORE
 #	define ATT_PRINTF(a,b)
 #	define ATT_PURE
 #	define ATT_MALLOC
@@ -56,4 +67,10 @@
 #	define ATT_UNUSED
 #endif
 
+#ifdef __ICC
+#	define LARGE_LOOP _Pragma ("loop_count (10000)")
+#else
+#	define LARGE_LOOP
+#endif
+
 #endif // __function_h
diff --git a/src/interaction.c b/src/interaction.c
index 88222d92..0e420cf7 100644
--- a/src/interaction.c
+++ b/src/interaction.c
@@ -138,8 +138,18 @@ static inline __m128d accImExp_pd(const double x)
 
 static inline doublecomplex accImExp(const double x)
 {
+	/* Here and further in the SSE3 part it is assumed that doublecomplex is equivalent to two doubles (that is
+	 * specified by the C99 standard). Explicit pointer casts have been put in place, and pragmas to ignore remaining
+	 * warnings from strict aliasing.
+	 *
+	 * !!! TODO: SSE3 code is a nice hack. But it should be considered carefully - is it worth it? In particular, it
+	 * seems that only parts of it are really beneficial (like tabulated evaluation of imaginary exponents), and those
+	 * can be incorporated into the main code (using standard C99 only).
+	 */
 	doublecomplex c;
-	_mm_store_pd(&c,accImExp_pd(x));
+	IGNORE_WARNING(-Wstrict-aliasing);
+	_mm_store_pd((double *)(&c),accImExp_pd(x));
+	STOP_IGNORE;
 	return c;
 }
 
@@ -155,26 +165,30 @@ static void CalcInterTerm_core(const double kr,const double kr2,const double inv
 	const __m128d v1 = _mm_set_pd(kr,t3);
 	const __m128d v2 = _mm_set_pd(t2,t1);
 	__m128d qff,im_re;
-	_mm_store_pd(expval,sc);
+	IGNORE_WARNING(-Wstrict-aliasing);
+	_mm_store_pd((double *)expval,sc);
+	STOP_IGNORE;
 
 #undef INTERACT_MUL
 #define INTERACT_DIAG(ind) { \
 	qff = _mm_set1_pd(qmunu[ind]); \
 	im_re = _mm_add_pd(v1,_mm_mul_pd(v2,qff)); \
 	im_re = cmul(sc,im_re); \
-	_mm_store_pd(result+ind,im_re); }
+	_mm_store_pd((double *)(result+ind),im_re); }
 #define INTERACT_NONDIAG(ind) { \
 	qff = _mm_set1_pd(qmunu[ind]); \
 	im_re = _mm_mul_pd(v2,qff); \
 	im_re = cmul(sc,im_re); \
-	_mm_store_pd(result+ind,im_re); }
+	_mm_store_pd((double *)(result+ind),im_re); }
 
+	IGNORE_WARNING(-Wstrict-aliasing);
 	INTERACT_DIAG(0);    // xx
 	INTERACT_NONDIAG(1); // xy
 	INTERACT_NONDIAG(2); // xz
 	INTERACT_DIAG(3);    // yy
 	INTERACT_NONDIAG(4); // yz
 	INTERACT_DIAG(5);    // zz
+	STOP_IGNORE;
 
 #undef INTERACT_DIAG
 #undef INTERACT_NONDIAG
@@ -716,12 +730,18 @@ void CalcInterTerm_igt(const int i,const int j,const int k,doublecomplex result[
 	double qvec[3],qmunu[6]; // unit directional vector {qx,qy,qz} and its outer-product {qxx,qxy,qxz,qyy,qyz,qzz}
 	double rn,invrn,invr3,kr,kr2; // |R/d|, 1/|R/d|, |R|^-3, kR, (kR)^2
 	doublecomplex expval; // exp(ikR)/|R|^3
-	double rtemp[3];
+	double rtemp[3],tmp[12];
+	int comp;
 
 	CalcInterParams1(i,j,k,qvec,&rn);
 	if (igt_lim==UNDEF || rn<=igt_lim) {
 		vMultScal(gridspace,qvec,rtemp);
-		propaespacelibreintadda_(rtemp,&WaveNum,&gridspace,&igt_eps,(double *)result);
+		/* passing complex vectors from Fortran to c is not necessarily portable (at least requires extra effort in
+		 * the Fortran code. So we do it through double. This is not bad for performance, since double is anyway used
+		 * internally for integration in this Fortran routine.
+		 */
+		propaespacelibreintadda_(rtemp,&WaveNum,&gridspace,&igt_eps,tmp);
+		for (comp=0;comp<6;comp++) result[comp] = tmp[comp] + I*tmp[comp+6];
 	}
 	else {
 		// The following is equivalent to CalcInterTerm_poi, except for the 1st part of initialization performed above
@@ -830,7 +850,7 @@ void InitInteraction(void)
 #ifndef NO_FORTRAN
 		case G_IGT: CalcInterTerm = &CalcInterTerm_igt; break;
 #endif
-		default: LogError(ONE_POS, "Invalid interaction term calculation method: %d",IntRelation);
+		default: LogError(ONE_POS, "Invalid interaction term calculation method: %d",(int)IntRelation);
 			// no break
 	}
 	// read tables if needed
diff --git a/src/iterative.c b/src/iterative.c
index 63b053a1..9e301c4d 100644
--- a/src/iterative.c
+++ b/src/iterative.c
@@ -92,7 +92,7 @@ struct iter_params_struct {
 	int vec_N;        // number of additional vectors to describe the state
 	void (*func)(const enum phase); // pointer to implementation of the iterative solver
 };
-static doublecomplex dumb; // dumb variable, used in workaround for issue 146
+static doublecomplex dumb ATT_UNUSED; // dumb variable, used in workaround for issue 146
 
 #define ITER_FUNC(name) static void name(const enum phase ph)
 
@@ -340,6 +340,10 @@ ITER_FUNC(BCGS2)
  * so we use l=2 here. In many cases one iteration of this method is similar to two iterations of BiCGStab, but overall
  * convergence is slightly better.
  * Breakdown tests were made to coincide with that for BiCGStab for l=1.
+ *
+ * !!! This iterative solver produces segmentation fault when compiled with icc 11.1. Probably that is related to
+ * issue 146. But we leave it be (assume that this is a compiler bug). Even if someone uses this compiler, he can
+ * live fine without this iterative solver.
  */
 {
 #define LL 2 // potentially the method will also work for l=1 (but memory allocation and freeing need to be adjusted)
diff --git a/src/linalg.c b/src/linalg.c
index 4800294a..4ec15385 100644
--- a/src/linalg.c
+++ b/src/linalg.c
@@ -21,14 +21,14 @@
 // project headers
 #include "cmplx.h"
 #include "comm.h"
+#include "function.h"
 #include "types.h"
 #include "vars.h"
 // system headers
 #include <string.h>
 
 /* There are several optimization ideas used in this file:
- * - pragmas, indicating that the loop is supposed to be very large, are used. However, they are probably understood
- * only by the Intel compiler.
+ * - pragmas for Intel compiler, indicating that the loop is supposed to be very large, are used.
  * - If usage of some function has coinciding arguments, than a special function for such case is created. In
  * particular, this allows consistent usage of 'restrict' keyword almost for all function arguments.
  * - Deeper optimizations, such as loop unrolling, are left to the compiler.
@@ -43,7 +43,7 @@ void nInit(doublecomplex * restrict a)
 {
 	register size_t i;
 	register const size_t n=local_nRows;
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i]=0;
 }
 
@@ -64,7 +64,7 @@ double nNorm2(const doublecomplex * restrict a,TIME_TYPE *comm_timing)
 	register const size_t n=local_nRows;
 	double sum=0;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) sum+=cAbs2(a[i]);
 	// this function is not called inside the main iteration loop
 	MyInnerProduct(&sum,double_type,1,comm_timing);
@@ -82,7 +82,7 @@ doublecomplex nDotProd(const doublecomplex * restrict a,const doublecomplex * re
 	register const size_t n=local_nRows;
 	doublecomplex sum=0;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) sum+=a[i]*conj(b[i]);
 	MyInnerProduct(&sum,cmplx_type,1,comm_timing);
 	return sum;
@@ -99,7 +99,7 @@ doublecomplex nDotProd_conj(const doublecomplex * restrict a,const doublecomplex
 	register const size_t n=local_nRows;
 	doublecomplex sum=0;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) sum+=a[i]*b[i];
 	MyInnerProduct(&sum,cmplx_type,1,comm_timing);
 	return sum;
@@ -114,7 +114,7 @@ doublecomplex nDotProdSelf_conj(const doublecomplex * restrict a,TIME_TYPE *comm
 	register const size_t n=local_nRows;
 	doublecomplex sum=0;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	/* Explicit writing the following through real and imaginary types can lead to delaying the multiplication by two
 	 * until the sum is complete. But that is not believed to be significant
 	 */
@@ -132,7 +132,7 @@ doublecomplex nDotProdSelf_conj_Norm2(const doublecomplex * restrict a,double *
 	register const size_t n=local_nRows;
 	double buf[3]={0,0,0};
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	// Here the optimization for explicit treatment seems significant, so we keep the old code
 	for (i=0;i<n;i++) {
 		buf[0]+=creal(a[i])*creal(a[i]);
@@ -153,7 +153,7 @@ void nIncrem110_cmplx(doublecomplex * restrict a,const doublecomplex * restrict
 	register size_t i;
 	register const size_t n=local_nRows;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i] = c1*a[i] + c2*b[i] + c[i];
 }
 
@@ -166,7 +166,7 @@ void nIncrem011_cmplx(doublecomplex * restrict a,const doublecomplex * restrict
 	register size_t i;
 	register const size_t n=local_nRows;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i] += c1*b[i] + c2*c[i];
 }
 
@@ -183,11 +183,11 @@ void nIncrem110_d_c_conj(doublecomplex * restrict a,const doublecomplex * restri
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] = c1*conj(a[i]) + c2*conj(b[i]) + c[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] = c1*conj(a[i]) + c2*conj(b[i]) + c[i];
 			sum += cAbs2(a[i]);
@@ -206,7 +206,7 @@ void nIncrem111_cmplx(doublecomplex * restrict a,const doublecomplex * restrict
 	register size_t i;
 	register const size_t n=local_nRows;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i] = c1*a[i] + c2*b[i] + c3*c[i];
 }
 
@@ -221,11 +221,11 @@ void nIncrem(doublecomplex * restrict a,const doublecomplex * restrict b,double
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] += b[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] += b[i];
 			sum += cAbs2(a[i]);
@@ -246,11 +246,11 @@ void nDecrem(doublecomplex * restrict a,const doublecomplex * restrict b,double
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] -= b[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] -= b[i];
 			sum += cAbs2(a[i]);
@@ -271,11 +271,11 @@ void nIncrem01(doublecomplex * restrict a,const doublecomplex * restrict b,const
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] += c*b[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] += c*b[i];
 			sum += cAbs2(a[i]);
@@ -296,11 +296,11 @@ void nIncrem10(doublecomplex * restrict a,const doublecomplex * restrict b,const
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] = c*a[i] + b[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] = c*a[i] + b[i];
 			sum += cAbs2(a[i]);
@@ -321,12 +321,12 @@ void nIncrem11_d_c(doublecomplex * restrict a,const doublecomplex * restrict b,c
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] = c1*a[i] + c2*b[i];
 	}
 	else {
 		*inprod=0.0;
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] = c1*a[i] + c2*b[i];
 			sum += cAbs2(a[i]);
@@ -347,11 +347,11 @@ void nIncrem01_cmplx(doublecomplex * restrict a,const doublecomplex * restrict b
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] += c*b[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] += c*b[i];
 			sum += cAbs2(a[i]);
@@ -372,11 +372,11 @@ void nIncrem10_cmplx(doublecomplex * restrict a,const doublecomplex * restrict b
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] = c*a[i] + b[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] = c*a[i] + b[i];
 			sum += cAbs2(a[i]);
@@ -397,11 +397,11 @@ void nLinComb_cmplx(doublecomplex * restrict a,const doublecomplex * restrict b,
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] = c1*b[i] + c2*c[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] = c1*b[i] + c2*c[i];
 			sum += cAbs2(a[i]);
@@ -422,11 +422,11 @@ void nLinComb1_cmplx(doublecomplex * restrict a,const doublecomplex * restrict b
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] = c1*b[i] + c[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] = c1*b[i] + c[i];
 			sum += cAbs2(a[i]);
@@ -447,11 +447,11 @@ void nLinComb1_cmplx_conj(doublecomplex * restrict a,const doublecomplex * restr
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] = c1*conj(b[i]) + c[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] = c1*conj(b[i]) + c[i];
 			sum += cAbs2(a[i]);
@@ -473,11 +473,11 @@ void nSubtr(doublecomplex * restrict a,const doublecomplex * restrict b,const do
 	double sum=0;
 
 	if (inprod==NULL) {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) a[i] = b[i] - c[i];
 	}
 	else {
-#pragma loop count (100000)
+		LARGE_LOOP;
 		for (i=0;i<n;i++) {
 			a[i] = b[i] - c[i];
 			sum += cAbs2(a[i]);
@@ -495,7 +495,7 @@ void nMult(doublecomplex * restrict a,const doublecomplex * restrict b,const dou
 	register const size_t n=local_nRows;
 	register size_t i;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i] = c*b[i];
 }
 
@@ -507,7 +507,7 @@ void nMult_cmplx(doublecomplex * restrict a,const doublecomplex * restrict b,con
 	register const size_t n=local_nRows;
 	register size_t i;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i] = c*b[i];
 }
 //======================================================================================================================
@@ -518,7 +518,7 @@ void nMultSelf(doublecomplex * restrict a,const double c)
 	register const size_t n=local_nRows;
 	register size_t i;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i] *= c;
 }
 //======================================================================================================================
@@ -529,7 +529,7 @@ void nMultSelf_conj(doublecomplex * restrict a,const double c)
 	register const size_t n=local_nRows;
 	register size_t i;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i] = c*conj(a[i]);
 }
 
@@ -541,7 +541,7 @@ void nMultSelf_cmplx(doublecomplex * restrict a,const doublecomplex c)
 	register const size_t n=local_nRows;
 	register size_t i;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i] *= c;
 }
 
@@ -560,7 +560,7 @@ void nMult_mat(doublecomplex * restrict a,const doublecomplex * restrict b,/*con
 	 */
 	const doublecomplex * restrict val;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0,k=0;i<nd;i++,k+=3) {
 		val=c[material[i]];
 		a[k] = val[0]*b[k];
@@ -584,7 +584,7 @@ void nMultSelf_mat(doublecomplex * restrict a,/*const*/ doublecomplex (* restric
 	 */
 	const doublecomplex * restrict val;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0,k=0;i<nd;i++,k+=3) {
 		val=c[material[i]];
 		a[k] *= val[0];
@@ -601,6 +601,6 @@ void nConj(doublecomplex * restrict a)
 	register const size_t n=local_nRows;
 	register size_t i;
 
-#pragma loop count (100000)
+	LARGE_LOOP;
 	for (i=0;i<n;i++) a[i]=conj(a[i]);
 }
diff --git a/src/oclcore.c b/src/oclcore.c
index 232fdec0..94b32a23 100644
--- a/src/oclcore.c
+++ b/src/oclcore.c
@@ -311,6 +311,10 @@ static void GetDevice(struct string *copt_ptr)
 
 void oclinit(void)
 // initialize OpenCL environment
+/* The whole OpenCL part relies on correspondence between complex and two-part, which is mandated by C99. The data
+ * transfers are done through void pointers, so compiler doesn't generate warnings for casts. So that should be both
+ * convenient and portable.
+ */
 {
 	cl_int err; // error code
 	const char *cssPtr[1]; // pointer to (array of) cSourceString, required to avoid warnings
diff --git a/src/param.c b/src/param.c
index 628810ab..f71b08a8 100644
--- a/src/param.c
+++ b/src/param.c
@@ -39,7 +39,9 @@
 #include <time.h>
 
 #ifdef CLFFT_AMD
+	IGNORE_WARNING(-Wstrict-prototypes) // no way to change the library header
 #	include <clAmdFft.h> // for version information
+	STOP_IGNORE
 #endif
 
 #ifndef NO_SVNREV
diff --git a/src/sparse_ops.h b/src/sparse_ops.h
index 5f11cea4..993c1cb6 100644
--- a/src/sparse_ops.h
+++ b/src/sparse_ops.h
@@ -54,9 +54,11 @@ static inline void AijProd(doublecomplex * restrict argvec,doublecomplex * restr
 
 	__m128d res, tmp;
 
+	IGNORE_WARNING(-Wstrict-aliasing); // cast from doublecomplex* to double* is perfectly valid in C99
 	const __m128d argX = _mm_load_pd((double *)(argvec+j3));
 	const __m128d argY = _mm_load_pd((double *)(argvec+j3+1));
 	const __m128d argZ = _mm_load_pd((double *)(argvec+j3+2));
+	STOP_IGNORE;
 
 	res = cmul(argX, *(__m128d *)&(iterm[0]));
 	tmp = cmul(argY, *(__m128d *)&(iterm[1]));
diff --git a/tests/2exec/comp2exec b/tests/2exec/comp2exec
index 49c51f8a..7490bbc3 100755
--- a/tests/2exec/comp2exec
+++ b/tests/2exec/comp2exec
@@ -17,6 +17,8 @@ INPUTDIR="./../../input"
 ADDASEQ="./../../src/seq/adda"
 ADDAMPI="./../../src/mpi/adda_mpi"
 ADDAOCL="./../../src/ocl/adda_ocl"
+#Path to reference binaries, examples: "." or "./../../win64"
+REFPATH=.
 # MPI command prefix
 MPIRUN="mpiexec -n 4"
 
@@ -35,18 +37,18 @@ fi
 
 if [ -n "$SPARSE" ]; then
   DEFSUITE=suite_sparse
-  SEQREF="./adda_sparse" # !!! This should be adjusted
-  MPIREF="./adda_mpi_sparse" # !!! This should be adjusted
-  OCLREF="./adda_ocl_sparse" # !!! This should be adjusted
+  SEQREF="$REFPATH/adda_spa" # !!! This should be adjusted
+  MPIREF="$REFPATH/adda_spa_mpi" # !!! This should be adjusted
+  OCLREF="$REFPATH/adda_spa_ocl" # !!! This should be adjusted
 else
   if [ -n "$SPARSE_STANDARD" ]; then
     DEFSUITE=suite_sparse
   else
     DEFSUITE=suite
   fi
-  SEQREF="./adda_1.1" # !!! This should be adjusted
-  MPIREF="./adda_mpi_1.1.exe" # !!! This should be adjusted
-  OCLREF="./adda_ocl_1.1" # !!! This should be adjusted
+  SEQREF="$REFPATH/adda" # !!! This should be adjusted
+  MPIREF="$REFPATH/adda_mpi" # !!! This should be adjusted
+  OCLREF="$REFPATH/adda_ocl" # !!! This should be adjusted
 fi
 
 MODE=${1:-seq}
diff --git a/tests/2exec/suite b/tests/2exec/suite
index d8a8ec93..eea1fe1b 100644
--- a/tests/2exec/suite
+++ b/tests/2exec/suite
@@ -173,6 +173,7 @@ all -pol cm ;mgn;
 all -pol dgf ;mgn;
 all -pol fcd ;mgn;
 all -pol igt_so ;mgn;
+all -pol lak ;mgn;
 all -pol ldr ;p; ;mgn;
 all -pol ldr avgpol ;p; ;mgn;
 all -pol rrc ;mgn;