Major changes to the code due to switch to complex types of C99 (issu…

…e 70). All the code using complex numbers has been looked through, and in many cases it was significantly simplified. Many functions from cmplx.h (operating on complex scalars) has been removed, as they are no more used. - E2_alldir is now used to store only the final result, while (intermediary) complex fields are stored in a new vector E_ad. This slightly increases required memory (proportional to number of angles), but improves clarity. - removed restrict optimizations from cmplx.h, because the functions are inline anyway. - added boolVector to memory.c/h (and a couple of minor corrections) - arguments of CalcField in crosssec.c are now declared as 3-vectors. - added test of gcc version in Makefile, '-pedantic' is used only for versions >= gcc (otherwise spurious warnings about complex numbers appear). - MatVec in sparse mode was updated to use standard functions from linalg.c - removed copying of scalar function arguments into registers for functions in linalg.c. That had some sense for double[2] arguments, but can be left to compiler to arguments that are passed by value. - removed all '#pragma ivdep' in linalg.c, since now this information should be obvious to compiler anyway. - version incremented to 1.3a1 MPI part: - To have portable coupling with 'double complex', MPI_C_DOUBLE_COMPLEX is now used when available. However, it can be either unavailable, or not supported in reduction operations. Therefore, rather complicated logic has been added to parbas.h to detect different cases and take full advantage of the available features. Missing features are emulated by replacing 'double complex' with 'double[2]' as was before. But it is not perfectly portable. - Similar issues were addressed for MPI_C_BOOL. The corresponding part of granule generator was changed to take full advantage of it. - Accumulate() now operates in place and operates on any datatype. Corresponding buffers Egrid_buffer, Eplane_buffer, and E2alldir_buffer were removed (somewhat decreasing the memory footprint in MPI mode). Quick tests of sequential and MPI parts showed marginal improvement of computational speed. Both sequential and MPI parts (including sparse, without SSE3, counterparts) were thoroughly testes with tests/2exec. Remaining issues: test and fix coupling with Fortran, C++, OpenCL parts, and sparse-SSE3. They seem to be working, but not perfectly portable.
adda-team · Jul 31, 2013 · 057298c · 057298c
1 parent af41300
commit 057298c
Show file tree

Hide file tree

Showing 28 changed files with 898 additions and 1,751 deletions.
diff --git a/src/CalculateE.c b/src/CalculateE.c
@@ -40,7 +40,6 @@
 // defined and initialized in calculator.c
 extern double * restrict muel_phi,* restrict muel_phi_buf;
 extern doublecomplex * restrict EplaneX, * restrict EplaneY;
-extern double * restrict Eplane_buffer;
 extern const double dtheta_deg,dtheta_rad;
 extern doublecomplex * restrict ampl_alphaX,* restrict ampl_alphaY;
 extern double * restrict muel_alpha;
@@ -82,25 +81,25 @@ static void ComputeMuellerMatrix(double matrix[4][4], const doublecomplex s1,con
  * Huffman
  */
 {
-	matrix[0][0] = 0.5*(cMultConRe(s1,s1)+cMultConRe(s2,s2)+cMultConRe(s3,s3)+cMultConRe(s4,s4));
-	matrix[0][1] = 0.5*(cMultConRe(s2,s2)-cMultConRe(s1,s1)+cMultConRe(s4,s4)-cMultConRe(s3,s3));
-	matrix[0][2] = cMultConRe(s2,s3)+cMultConRe(s1,s4);
-	matrix[0][3] = cMultConIm(s2,s3)-cMultConIm(s1,s4);
-
-	matrix[1][0] = 0.5*(cMultConRe(s2,s2)-cMultConRe(s1,s1)+cMultConRe(s3,s3)-cMultConRe(s4,s4));
-	matrix[1][1] = 0.5*(cMultConRe(s2,s2)+cMultConRe(s1,s1)-cMultConRe(s3,s3)-cMultConRe(s4,s4));
-	matrix[1][2] = cMultConRe(s2,s3)-cMultConRe(s1,s4);
-	matrix[1][3] = cMultConIm(s2,s3)+cMultConIm(s1,s4);
-
-	matrix[2][0] = cMultConRe(s2,s4)+cMultConRe(s1,s3);
-	matrix[2][1] = cMultConRe(s2,s4)-cMultConRe(s1,s3);
-	matrix[2][2] = cMultConRe(s1,s2)+cMultConRe(s3,s4);
-	matrix[2][3] = cMultConIm(s2,s1)+cMultConIm(s4,s3);
-
-	matrix[3][0] = cMultConIm(s4,s2)+cMultConIm(s1,s3);
-	matrix[3][1] = cMultConIm(s4,s2)-cMultConIm(s1,s3);
-	matrix[3][2] = cMultConIm(s1,s2)-cMultConIm(s3,s4);
-	matrix[3][3] = cMultConRe(s1,s2)-cMultConRe(s3,s4);
+	matrix[0][0] = 0.5*(cAbs2(s1) + cAbs2(s2) + cAbs2(s3) + cAbs2(s4));
+	matrix[0][1] = 0.5*(cAbs2(s2) - cAbs2(s1) + cAbs2(s4) - cAbs2(s3));
+	matrix[0][2] = creal(s2*conj(s3) + s1*conj(s4));
+	matrix[0][3] = cimag(s2*conj(s3) - s1*conj(s4));
+
+	matrix[1][0] = 0.5*(cAbs2(s2) - cAbs2(s1) + cAbs2(s3) - cAbs2(s4));
+	matrix[1][1] = 0.5*(cAbs2(s2) + cAbs2(s1) - cAbs2(s3) - cAbs2(s4));
+	matrix[1][2] = creal(s2*conj(s3) - s1*conj(s4));
+	matrix[1][3] = cimag(s2*conj(s3) + s1*conj(s4));
+
+	matrix[2][0] = creal(s2*conj(s4) + s1*conj(s3));
+	matrix[2][1] = creal(s2*conj(s4) - s1*conj(s3));
+	matrix[2][2] = creal(s1*conj(s2) + s3*conj(s4));
+	matrix[2][3] = cimag(s2*conj(s1) + s4*conj(s3));
+
+	matrix[3][0] = cimag(s4*conj(s2) + s1*conj(s3));
+	matrix[3][1] = cimag(s4*conj(s2) - s1*conj(s3));
+	matrix[3][2] = cimag(s1*conj(s2) - s3*conj(s4));
+	matrix[3][3] = creal(s1*conj(s2) - s3*conj(s4));
 }
 
 //======================================================================================================================
@@ -166,7 +165,7 @@ void MuellerMatrix(void)
 	double * restrict cos2=NULL,* restrict sin2=NULL,* restrict cos4=NULL,* restrict sin4=NULL;
 	double matrix[4][4];
 	double theta,phi,ph,max_err,max_err_c2,max_err_s2,max_err_c4,max_err_s4;
-	doublecomplex s1,s2,s3,s4,s10,s20,s30,s40;
+	doublecomplex s1,s2,s3,s4;
 	char fname[MAX_FNAME];
 	int i;
 	size_t index,index1,k_or,j,n,ind;
@@ -191,16 +190,11 @@ void MuellerMatrix(void)
 				co=cos(alph);
 				si=sin(alph);
 				for (i=0;i<nTheta;i++) {
-					// read amplitude matrix from memory
-					cEqual(ampl_alphaX[index],s10);
-					cEqual(ampl_alphaX[index+1],s30);
-					cEqual(ampl_alphaY[index],s40);
-					cEqual(ampl_alphaY[index+1],s20);
-					// transform it, multiplying by rotation matrix (-alpha)
-					cLinComb(s20,s30,co,si,s2);  // s2 =  co*s20 + si*s30
-					cLinComb(s20,s30,-si,co,s3); // s3 = -si*s20 + co*s30
-					cLinComb(s40,s10,co,si,s4);  // s4 =  co*s40 + si*s10
-					cLinComb(s40,s10,-si,co,s1); // s1 = -si*s40 + co*s10
+					// transform amplitude matrix, multiplying by rotation matrix (-alpha)
+					s2 =  co*ampl_alphaY[index+1] + si*ampl_alphaX[index+1]; // s2 =  co*s20 + si*s30
+					s3 = -si*ampl_alphaY[index+1] + co*ampl_alphaX[index+1]; // s3 = -si*s20 + co*s30
+					s4 =  co*ampl_alphaY[index] + si*ampl_alphaX[index];     // s4 =  co*s40 + si*s10
+					s1 = -si*ampl_alphaY[index] + co*ampl_alphaX[index];     // s1 = -si*s40 + co*s10
 
 					ComputeMuellerMatrix((double (*)[4])(muel_alpha+index1),s1,s2,s3,s4);
 					index+=2;
@@ -218,9 +212,9 @@ void MuellerMatrix(void)
 				fprintf(ampl,THETA_HEADER" "AMPL_HEADER"\n");
 				for (i=0;i<nTheta;i++) {
 					theta=i*dtheta_deg;
-					fprintf(ampl,ANGLE_FORMAT" "AMPL_FORMAT"\n",theta,EplaneX[2*i][RE],EplaneX[2*i][IM],
-						EplaneY[2*i+1][RE],EplaneY[2*i+1][IM],EplaneX[2*i+1][RE],EplaneX[2*i+1][IM],EplaneY[2*i][RE],
-						EplaneY[2*i][IM]);
+					fprintf(ampl,ANGLE_FORMAT" "AMPL_FORMAT"\n",theta,creal(EplaneX[2*i]),cimag(EplaneX[2*i]),
+						creal(EplaneY[2*i+1]),cimag(EplaneY[2*i+1]),creal(EplaneX[2*i+1]),cimag(EplaneX[2*i+1]),
+						creal(EplaneY[2*i]),cimag(EplaneY[2*i]));
 				}
 				FCloseErr(ampl,F_AMPL,ONE_POS);
 			}
@@ -298,17 +292,12 @@ void MuellerMatrix(void)
 					ph=Deg2Rad(phi);
 					co=cos(ph);
 					si=sin(ph);
-					// read amplitude matrix from memory
-					cEqual(EgridY[index],s10);
-					cEqual(EgridY[index+1],s30);
-					cEqual(EgridX[index],s40);
-					cEqual(EgridX[index+1],s20);
+					// transform the amplitude matrix, multiplying by rotation matrix from per-par to X-Y
+					s2 = co*EgridX[index+1] + si*EgridY[index+1]; // s2 =  co*s20 + si*s30
+					s3 = si*EgridX[index+1] - co*EgridY[index+1]; // s3 =  si*s20 - co*s30
+					s4 = co*EgridX[index] + si*EgridY[index]; // s4 =  co*s40 + si*s10
+					s1 = si*EgridX[index] - co*EgridY[index]; // s1 =  si*s40 - co*s10
 					index+=2;
-					// transform it, multiplying by rotation matrix from per-par to X-Y
-					cLinComb(s20,s30,co,si,s2);  // s2 =  co*s20 + si*s30
-					cLinComb(s20,s30,si,-co,s3); // s3 =  si*s20 - co*s30
-					cLinComb(s40,s10,co,si,s4);  // s4 =  co*s40 + si*s10
-					cLinComb(s40,s10,si,-co,s1); // s1 =  si*s40 - co*s10
 
 					if (store_mueller) ComputeMuellerMatrix(matrix,s1,s2,s3,s4);
 
@@ -321,8 +310,8 @@ void MuellerMatrix(void)
 								matrix[0][0],matrix[0][1],matrix[0][2],matrix[0][3],matrix[1][0],matrix[1][1],
 								matrix[1][2],matrix[1][3],matrix[2][0],matrix[2][1],matrix[2][2],matrix[2][3],
 								matrix[3][0],matrix[3][1],matrix[3][2],matrix[3][3]);
-						if (store_ampl) fprintf(ampl,ANGLE_FORMAT" "ANGLE_FORMAT" "AMPL_FORMAT"\n",
-								theta,phi,s1[RE],s1[IM],s2[RE],s2[IM],s3[RE],s3[IM],s4[RE],s4[IM]);
+						if (store_ampl) fprintf(ampl,ANGLE_FORMAT" "ANGLE_FORMAT" "AMPL_FORMAT"\n",theta,phi,
+							creal(s1),cimag(s1),creal(s2),cimag(s2),creal(s3),cimag(s3),creal(s4),cimag(s4));
 					}
 				}
 				if (phi_integr) {
@@ -434,15 +423,15 @@ static void CalcEplane(const enum incpol which,const enum Eftype type)
 
 				CalcField(ebuff,robserver);
 				// convert to (l,r) frame
-				crDotProd(ebuff,incPolper,Eplane[2*i]); // Eper[i]=Esca.incPolper
+				Eplane[2*i]=crDotProd(ebuff,incPolper); // Eper[i]=Esca.incPolper
 				LinComb(prop,incPolpar,-si,co,epar);    // epar=-si*prop+co*incPolpar
-				crDotProd(ebuff,epar,Eplane[2*i+1]);    // Epar[i]=Esca.epar
+				Eplane[2*i+1]=crDotProd(ebuff,epar);    // Epar[i]=Esca.epar
 			} //  end for i
 
 			// Accumulate Eplane to root and sum
 			D("Accumulating Eplane started");
 			// accumulate only on processor 0 !, done in one operation
-			Accumulate((double *)Eplane,4*nTheta,Eplane_buffer,&Timing_EPlaneComm);
+			Accumulate(Eplane,cmplx_type,2*nTheta,&Timing_EPlaneComm);
 			D("Accumulating Eplane finished");
 
 			Timing_EPlane = GET_TIME() - tstart;
@@ -507,8 +496,8 @@ static void StoreFields(const enum incpol which,doublecomplex * restrict cmplxF,
 #endif
 	// saves fields to file
 	if (cmplx_mode) for (j=0;j<local_nRows;j+=3) fprintf(file,GFORM10L"\n",DipoleCoord[j],DipoleCoord[j+1],
-		DipoleCoord[j+2],cvNorm2(cmplxF+j),cmplxF[j][RE],cmplxF[j][IM],cmplxF[j+1][RE],cmplxF[j+1][IM],cmplxF[j+2][RE],
-		cmplxF[j+2][IM]);
+		DipoleCoord[j+2],cvNorm2(cmplxF+j),creal(cmplxF[j]),cimag(cmplxF[j]),creal(cmplxF[j+1]),cimag(cmplxF[j+1]),
+		creal(cmplxF[j+2]),cimag(cmplxF[j+2]));
 	else for (j=0;j<local_nRows;j+=3) fprintf(file,GFORM7L"\n",DipoleCoord[j],DipoleCoord[j+1],DipoleCoord[j+2],
 		DotProd(realF+j,realF+j),realF[j],realF[j+1],realF[j+2]);
 	FCloseErr(file,fname,ALL_POS);

diff --git a/src/GenerateB.c b/src/GenerateB.c
@@ -160,7 +160,7 @@ void GenerateB (const enum incpol which,   // x - or y polarized incident light
 	doublecomplex v1[3],v2[3],v3[3];
 	double ro2,ro4;
 	double x,y,z,x2_s,xy_s;
-	doublecomplex t1,t2,t3,t4,t5,t6,t7,t8,t0,ctemp;
+	doublecomplex t1,t2,t3,t4,t5,t6,t7,t8,ctemp;
 	const double *ex; // coordinate axis of the beam reference frame
 	double ey[3];
 	double r1[3];
@@ -184,8 +184,8 @@ void GenerateB (const enum incpol which,   // x - or y polarized incident light
 		case B_PLANE: // plane is separate to be fast
 			for (i=0;i<local_nvoid_Ndip;i++) {
 				j=3*i;
-				imExp(WaveNum*DotProd(DipoleCoord+j,prop),ctemp); // ctemp=exp(ik*r.a)
-				cScalMultRVec(ex,ctemp,b+j); // b[i]=ctemp*ex
+				ctemp=imExp(WaveNum*DotProd(DipoleCoord+j,prop)); // ctemp=exp(ik*r.a)
+				cvMultScal_RVec(ctemp,ex,b+j); // b[i]=ctemp*ex
 			}
 			return;
 		case B_LMINUS:
@@ -199,94 +199,49 @@ void GenerateB (const enum incpol which,   // x - or y polarized incident light
 				y=DotProd(r1,ey)*scale_x;
 				z=DotProd(r1,prop)*scale_z;
 				ro2=x*x+y*y;
-				// calculate Q=1/(2z-i)
-				Q[IM]=1/(1+4*z*z);
-				Q[RE]=2*z*Q[IM];
-				// calculate psi0=-iQexp(iQro^2)
-				cMult_i2(Q,t1);
-				cMultReal(ro2,t1,t2);
-				cExpSelf(t2);
-				cMult(t1,t2,psi0);
-				cInvSign(psi0);
+				Q=1/(2*z-I);
+				psi0=-I*Q*cexp(I*Q*ro2);
 				// ctemp=exp(ik*z0)*psi0, z0 - non-scaled coordinate (z/scale_z)
-				imExp(WaveNum*z/scale_z,ctemp);
-				cMultSelf(ctemp,psi0);
+				ctemp=imExp(WaveNum*z/scale_z)*psi0;
 				// the following logic (if-else-if...) is hard to replace by a simple switch
-				if (beamtype==B_LMINUS) {
-					cScalMultRVec(ex,ctemp,b+j); // b[i]=ctemp*ex
-				}
+				if (beamtype==B_LMINUS) cvMultScal_RVec(ctemp,ex,b+j); // b[i]=ctemp*ex
 				else {
 					x2_s=x*x/ro2;
-					cSquare(Q,Q2);
+					Q2=Q*Q;
 					ro4=ro2*ro2;
 					// some combinations that are used more than once
-					cMultReal(s2*ro2,Q2,t4); // t4=(s*ro*Q)^2
-					cMultReal(ro2,Q,t5);
-					cMult_i(t5);             // t5=i*Q*ro^2
-					cMultReal(ro4,Q2,t6);    // t6=ro^4*Q^2
-					cMultReal(x*s,Q,t7);     // t7=x*s*Q
+					t4=s2*ro2*Q2; // t4=(s*ro*Q)^2
+					t5=I*ro2*Q;   // t5=i*Q*ro^2
+					t6=ro4*Q2;    // t6=ro^4*Q^2
+					t7=x*s*Q;     // t7=x*s*Q
 					if (beamtype==B_DAVIS3) {
 						// t1=1+s^2(-4Q^2*x^2-iQ^3*ro^4)=1-t4(4x2_s+t5)
-						cEqual(t5,t1);
-						t1[RE]+=4*x2_s;
-						cMultSelf(t1,t4);
-						cMultReal(-1,t1,t1);
-						t1[RE]+=1;
+						t1 = 1 - t4*(4*x2_s+t5);
 						// t2=0
-						t2[RE]=t2[IM]=0;
+						t2=0;
 						// t3=-s(2Qx)+s^3(8Q^3*ro^2*x+2iQ^4*ro^4*x-4iQ^2x)=2t7[-1+iQ*s2*(-4t5+t6-2)]
-						cMultReal(-4,t5,t3);
-						cAdd(t3,t6,t3);
-						t3[RE]-=2;
-						cMultReal(s2,t3,t3);
-						cMultSelf(t3,Q);
-						cMult_i(t3);
-						t3[RE]-=1;
-						cMultSelf(t3,t7);
-						cMultReal(2,t3,t3);
+						t3 = 2*t7*(-1 + I*Q*s2*(-4*t5+t6-2));
 					}
 					else if (beamtype==B_BARTON5) {
 						xy_s=x*y/ro2;
-						cMultReal(2,t5,t8);
-						t8[RE]+=8; // t8=8+2i*Q*ro^2
+						t8=8+2*t5; // t8=8+2i*Q*ro^2
 						/* t1 = 1 + s^2(-ro^2*Q^2-i*ro^4*Q^3-2Q^2*x^2)
 						 *    + s^4[2ro^4*Q^4+3iro^6*Q^5-0.5ro^8*Q^6+x^2(8ro^2*Q^4+2iro^4*Q^5)]
-						 *    = 1 + t4*{-1-2xs2-t5+t4*[2+3t5-0.5t6+x2_s*t8]}
+						 *    = 1 + t4*{-1-2x2_s-t5+t4*[2+3t5-0.5t6+x2_s*t8]}
 						 */
-						cMultReal(x2_s,t8,t1);
-						cMultReal(-0.5,t6,t0);
-						cAdd(t1,t0,t1);
-						cMultReal(3,t5,t0);
-						cAdd(t1,t0,t1);
-						t1[RE]+=2;
-						cMultSelf(t1,t4);
-						cSubtr(t1,t5,t1);
-						t1[RE]-=1+2*x2_s;
-						cMultSelf(t1,t4);
-						t1[RE]+=1;
+						t1 = 1 + t4*(-1 - 2*x2_s - t5 + t4*(2+3*t5-0.5*t6+x2_s*t8));
 						// t2=s^2(-2Q^2*xy)+s^4[xy(8ro^2*Q^4+2iro^4*Q^5)]=xy_s*t4(-2+t4*t8)
-						cMult(t4,t8,t2);
-						t2[RE]-=2;
-						cMultSelf(t2,t4);
-						cMultReal(xy_s,t2,t2);
+						t2=xy_s*t4*(-2+t4*t8);
 						/* t3 = s(-2Qx) + s^3[(6ro^2*Q^3+2iro^4*Q^4)x] + s^5[(-20ro^4*Q^5-10iro^6*Q^6+ro^8*Q^7)x]
 						 *    = t7{-2+t4[6+2t5+t4(-20-10t5+t6)]}
 						 */
-						cMultReal(-10,t5,t3);
-						cAdd(t3,t6,t3);
-						t3[RE]-=20;
-						cMultSelf(t3,t4);
-						cMultReal(2,t5,t0);
-						cAdd(t3,t0,t3);
-						t3[RE]+=6;
-						cMultSelf(t3,t4);
-						t3[RE]-=2;
-						cMultSelf(t3,t7);
-					} else LogError(ONE_POS,"Inconsistency in beam definition"); // to remove warnings
+						t3 = t7*(-2 + t4*(6 + 2*t5 + t4*(-20-10*t5+t6)));
+					}
+					else LogError(ONE_POS,"Inconsistency in beam definition"); // to remove warnings
 					// b[i]=ctemp(ex*t1+ey*t2+ez*t3)
-					cScalMultRVec(ex,t1,v1);
-					cScalMultRVec(ey,t2,v2);
-					cScalMultRVec(prop,t3,v3);
+					cvMultScal_RVec(t1,ex,v1);
+					cvMultScal_RVec(t2,ey,v2);
+					cvMultScal_RVec(t3,prop,v3);
 					cvAdd2Self(v1,v2,v3);
 					cvMultScal_cmplx(ctemp,v1,b+j);
 				}

diff --git a/src/Makefile b/src/Makefile
@@ -328,10 +328,17 @@ ifeq ($(COMPILER),gnu)
   CSTD    := -std=c99
   COPT1   := -O2
   COPT2   := -O3 -ffast-math -funroll-loops
-  CWARN   := -Wall -Wextra -pedantic -Wcast-qual -Wpointer-arith -Wwrite-strings -Wstrict-prototypes \
+  CWARN   := -Wall -Wextra -Wcast-qual -Wpointer-arith -Wwrite-strings -Wstrict-prototypes \
              -Wstrict-aliasing=1 -Wshadow -Wcast-align -Wnested-externs -Wcomment -Wno-unknown-pragmas \
              -Wno-overlength-strings
-
+  # gcc versions prior to 4.7.2 are affected by bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=7263 , which causes
+  # -pedantic flag to generates warnings on every occurence of I (complex i)
+  GCC_GTEQ_472 := $(shell expr `gcc -dumpversion | sed -e 's/\.\([0-9][0-9]\)/\1/g' -e 's/\.\([0-9]\)/0\1/g' \
+    -e 's/^[0-9]\{3,4\}$$/&00/'` \>= 40702)
+  ifeq "$(GCC_GTEQ_472)" "1"
+    CWARN += -pedantic
+  endif
+  
   # Use gfortran if available (GCC 4 and later), otherwise try g77
   ifeq ($(shell which gfortran > /dev/null 2>&1 && echo 0),0)
     CF    := gfortran