Skip to content

Commit

Permalink
Major changes to the code due to switch to complex types of C99 (issu…
Browse files Browse the repository at this point in the history
…e 70). All the code using complex numbers has been looked through, and in many cases it was significantly simplified. Many functions from cmplx.h (operating on complex scalars) has been removed, as they are no more used.

- E2_alldir is now used to store only the final result, while (intermediary) complex fields are stored in a new vector E_ad. This slightly increases required memory (proportional to number of angles), but improves clarity.
- removed restrict optimizations from cmplx.h, because the functions are inline anyway.
- added boolVector to memory.c/h (and a couple of minor corrections)
- arguments of CalcField in crosssec.c are now declared as 3-vectors.
- added test of gcc version in Makefile, '-pedantic' is used only for versions >= gcc (otherwise spurious warnings about complex numbers appear).
- MatVec in sparse mode was updated to use standard functions from linalg.c
- removed copying of scalar function arguments into registers for functions in linalg.c. That had some sense for double[2] arguments, but can be left to compiler to arguments that are passed by value.
- removed all '#pragma ivdep' in linalg.c, since now this information should be obvious to compiler anyway.
- version incremented to 1.3a1

MPI part:
- To have portable coupling with 'double complex', MPI_C_DOUBLE_COMPLEX is now used when available. However, it can be either unavailable, or not supported in reduction operations. Therefore, rather complicated logic has been added to parbas.h to detect different cases and take full advantage of the available features. Missing features are emulated by replacing 'double complex' with 'double[2]' as was before. But it is not perfectly portable.
- Similar issues were addressed for MPI_C_BOOL. The corresponding part of granule generator was changed to take full advantage of it.
- Accumulate() now operates in place and operates on any datatype. Corresponding buffers Egrid_buffer, Eplane_buffer, and E2alldir_buffer were removed (somewhat decreasing the memory footprint in MPI mode).

Quick tests of sequential and MPI parts showed marginal improvement of computational speed.
Both sequential and MPI parts (including sparse, without SSE3, counterparts) were thoroughly testes with tests/2exec.

Remaining issues: test and fix coupling with Fortran, C++, OpenCL parts, and sparse-SSE3. They seem to be working, but not perfectly portable.
  • Loading branch information
myurkin committed Jul 31, 2013
1 parent af41300 commit 057298c
Show file tree
Hide file tree
Showing 28 changed files with 898 additions and 1,751 deletions.
91 changes: 40 additions & 51 deletions src/CalculateE.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
// defined and initialized in calculator.c
extern double * restrict muel_phi,* restrict muel_phi_buf;
extern doublecomplex * restrict EplaneX, * restrict EplaneY;
extern double * restrict Eplane_buffer;
extern const double dtheta_deg,dtheta_rad;
extern doublecomplex * restrict ampl_alphaX,* restrict ampl_alphaY;
extern double * restrict muel_alpha;
Expand Down Expand Up @@ -82,25 +81,25 @@ static void ComputeMuellerMatrix(double matrix[4][4], const doublecomplex s1,con
* Huffman
*/
{
matrix[0][0] = 0.5*(cMultConRe(s1,s1)+cMultConRe(s2,s2)+cMultConRe(s3,s3)+cMultConRe(s4,s4));
matrix[0][1] = 0.5*(cMultConRe(s2,s2)-cMultConRe(s1,s1)+cMultConRe(s4,s4)-cMultConRe(s3,s3));
matrix[0][2] = cMultConRe(s2,s3)+cMultConRe(s1,s4);
matrix[0][3] = cMultConIm(s2,s3)-cMultConIm(s1,s4);

matrix[1][0] = 0.5*(cMultConRe(s2,s2)-cMultConRe(s1,s1)+cMultConRe(s3,s3)-cMultConRe(s4,s4));
matrix[1][1] = 0.5*(cMultConRe(s2,s2)+cMultConRe(s1,s1)-cMultConRe(s3,s3)-cMultConRe(s4,s4));
matrix[1][2] = cMultConRe(s2,s3)-cMultConRe(s1,s4);
matrix[1][3] = cMultConIm(s2,s3)+cMultConIm(s1,s4);

matrix[2][0] = cMultConRe(s2,s4)+cMultConRe(s1,s3);
matrix[2][1] = cMultConRe(s2,s4)-cMultConRe(s1,s3);
matrix[2][2] = cMultConRe(s1,s2)+cMultConRe(s3,s4);
matrix[2][3] = cMultConIm(s2,s1)+cMultConIm(s4,s3);

matrix[3][0] = cMultConIm(s4,s2)+cMultConIm(s1,s3);
matrix[3][1] = cMultConIm(s4,s2)-cMultConIm(s1,s3);
matrix[3][2] = cMultConIm(s1,s2)-cMultConIm(s3,s4);
matrix[3][3] = cMultConRe(s1,s2)-cMultConRe(s3,s4);
matrix[0][0] = 0.5*(cAbs2(s1) + cAbs2(s2) + cAbs2(s3) + cAbs2(s4));
matrix[0][1] = 0.5*(cAbs2(s2) - cAbs2(s1) + cAbs2(s4) - cAbs2(s3));
matrix[0][2] = creal(s2*conj(s3) + s1*conj(s4));
matrix[0][3] = cimag(s2*conj(s3) - s1*conj(s4));

matrix[1][0] = 0.5*(cAbs2(s2) - cAbs2(s1) + cAbs2(s3) - cAbs2(s4));
matrix[1][1] = 0.5*(cAbs2(s2) + cAbs2(s1) - cAbs2(s3) - cAbs2(s4));
matrix[1][2] = creal(s2*conj(s3) - s1*conj(s4));
matrix[1][3] = cimag(s2*conj(s3) + s1*conj(s4));

matrix[2][0] = creal(s2*conj(s4) + s1*conj(s3));
matrix[2][1] = creal(s2*conj(s4) - s1*conj(s3));
matrix[2][2] = creal(s1*conj(s2) + s3*conj(s4));
matrix[2][3] = cimag(s2*conj(s1) + s4*conj(s3));

matrix[3][0] = cimag(s4*conj(s2) + s1*conj(s3));
matrix[3][1] = cimag(s4*conj(s2) - s1*conj(s3));
matrix[3][2] = cimag(s1*conj(s2) - s3*conj(s4));
matrix[3][3] = creal(s1*conj(s2) - s3*conj(s4));
}

//======================================================================================================================
Expand Down Expand Up @@ -166,7 +165,7 @@ void MuellerMatrix(void)
double * restrict cos2=NULL,* restrict sin2=NULL,* restrict cos4=NULL,* restrict sin4=NULL;
double matrix[4][4];
double theta,phi,ph,max_err,max_err_c2,max_err_s2,max_err_c4,max_err_s4;
doublecomplex s1,s2,s3,s4,s10,s20,s30,s40;
doublecomplex s1,s2,s3,s4;
char fname[MAX_FNAME];
int i;
size_t index,index1,k_or,j,n,ind;
Expand All @@ -191,16 +190,11 @@ void MuellerMatrix(void)
co=cos(alph);
si=sin(alph);
for (i=0;i<nTheta;i++) {
// read amplitude matrix from memory
cEqual(ampl_alphaX[index],s10);
cEqual(ampl_alphaX[index+1],s30);
cEqual(ampl_alphaY[index],s40);
cEqual(ampl_alphaY[index+1],s20);
// transform it, multiplying by rotation matrix (-alpha)
cLinComb(s20,s30,co,si,s2); // s2 = co*s20 + si*s30
cLinComb(s20,s30,-si,co,s3); // s3 = -si*s20 + co*s30
cLinComb(s40,s10,co,si,s4); // s4 = co*s40 + si*s10
cLinComb(s40,s10,-si,co,s1); // s1 = -si*s40 + co*s10
// transform amplitude matrix, multiplying by rotation matrix (-alpha)
s2 = co*ampl_alphaY[index+1] + si*ampl_alphaX[index+1]; // s2 = co*s20 + si*s30
s3 = -si*ampl_alphaY[index+1] + co*ampl_alphaX[index+1]; // s3 = -si*s20 + co*s30
s4 = co*ampl_alphaY[index] + si*ampl_alphaX[index]; // s4 = co*s40 + si*s10
s1 = -si*ampl_alphaY[index] + co*ampl_alphaX[index]; // s1 = -si*s40 + co*s10

ComputeMuellerMatrix((double (*)[4])(muel_alpha+index1),s1,s2,s3,s4);
index+=2;
Expand All @@ -218,9 +212,9 @@ void MuellerMatrix(void)
fprintf(ampl,THETA_HEADER" "AMPL_HEADER"\n");
for (i=0;i<nTheta;i++) {
theta=i*dtheta_deg;
fprintf(ampl,ANGLE_FORMAT" "AMPL_FORMAT"\n",theta,EplaneX[2*i][RE],EplaneX[2*i][IM],
EplaneY[2*i+1][RE],EplaneY[2*i+1][IM],EplaneX[2*i+1][RE],EplaneX[2*i+1][IM],EplaneY[2*i][RE],
EplaneY[2*i][IM]);
fprintf(ampl,ANGLE_FORMAT" "AMPL_FORMAT"\n",theta,creal(EplaneX[2*i]),cimag(EplaneX[2*i]),
creal(EplaneY[2*i+1]),cimag(EplaneY[2*i+1]),creal(EplaneX[2*i+1]),cimag(EplaneX[2*i+1]),
creal(EplaneY[2*i]),cimag(EplaneY[2*i]));
}
FCloseErr(ampl,F_AMPL,ONE_POS);
}
Expand Down Expand Up @@ -298,17 +292,12 @@ void MuellerMatrix(void)
ph=Deg2Rad(phi);
co=cos(ph);
si=sin(ph);
// read amplitude matrix from memory
cEqual(EgridY[index],s10);
cEqual(EgridY[index+1],s30);
cEqual(EgridX[index],s40);
cEqual(EgridX[index+1],s20);
// transform the amplitude matrix, multiplying by rotation matrix from per-par to X-Y
s2 = co*EgridX[index+1] + si*EgridY[index+1]; // s2 = co*s20 + si*s30
s3 = si*EgridX[index+1] - co*EgridY[index+1]; // s3 = si*s20 - co*s30
s4 = co*EgridX[index] + si*EgridY[index]; // s4 = co*s40 + si*s10
s1 = si*EgridX[index] - co*EgridY[index]; // s1 = si*s40 - co*s10
index+=2;
// transform it, multiplying by rotation matrix from per-par to X-Y
cLinComb(s20,s30,co,si,s2); // s2 = co*s20 + si*s30
cLinComb(s20,s30,si,-co,s3); // s3 = si*s20 - co*s30
cLinComb(s40,s10,co,si,s4); // s4 = co*s40 + si*s10
cLinComb(s40,s10,si,-co,s1); // s1 = si*s40 - co*s10

if (store_mueller) ComputeMuellerMatrix(matrix,s1,s2,s3,s4);

Expand All @@ -321,8 +310,8 @@ void MuellerMatrix(void)
matrix[0][0],matrix[0][1],matrix[0][2],matrix[0][3],matrix[1][0],matrix[1][1],
matrix[1][2],matrix[1][3],matrix[2][0],matrix[2][1],matrix[2][2],matrix[2][3],
matrix[3][0],matrix[3][1],matrix[3][2],matrix[3][3]);
if (store_ampl) fprintf(ampl,ANGLE_FORMAT" "ANGLE_FORMAT" "AMPL_FORMAT"\n",
theta,phi,s1[RE],s1[IM],s2[RE],s2[IM],s3[RE],s3[IM],s4[RE],s4[IM]);
if (store_ampl) fprintf(ampl,ANGLE_FORMAT" "ANGLE_FORMAT" "AMPL_FORMAT"\n",theta,phi,
creal(s1),cimag(s1),creal(s2),cimag(s2),creal(s3),cimag(s3),creal(s4),cimag(s4));
}
}
if (phi_integr) {
Expand Down Expand Up @@ -434,15 +423,15 @@ static void CalcEplane(const enum incpol which,const enum Eftype type)

CalcField(ebuff,robserver);
// convert to (l,r) frame
crDotProd(ebuff,incPolper,Eplane[2*i]); // Eper[i]=Esca.incPolper
Eplane[2*i]=crDotProd(ebuff,incPolper); // Eper[i]=Esca.incPolper
LinComb(prop,incPolpar,-si,co,epar); // epar=-si*prop+co*incPolpar
crDotProd(ebuff,epar,Eplane[2*i+1]); // Epar[i]=Esca.epar
Eplane[2*i+1]=crDotProd(ebuff,epar); // Epar[i]=Esca.epar
} // end for i

// Accumulate Eplane to root and sum
D("Accumulating Eplane started");
// accumulate only on processor 0 !, done in one operation
Accumulate((double *)Eplane,4*nTheta,Eplane_buffer,&Timing_EPlaneComm);
Accumulate(Eplane,cmplx_type,2*nTheta,&Timing_EPlaneComm);
D("Accumulating Eplane finished");

Timing_EPlane = GET_TIME() - tstart;
Expand Down Expand Up @@ -507,8 +496,8 @@ static void StoreFields(const enum incpol which,doublecomplex * restrict cmplxF,
#endif
// saves fields to file
if (cmplx_mode) for (j=0;j<local_nRows;j+=3) fprintf(file,GFORM10L"\n",DipoleCoord[j],DipoleCoord[j+1],
DipoleCoord[j+2],cvNorm2(cmplxF+j),cmplxF[j][RE],cmplxF[j][IM],cmplxF[j+1][RE],cmplxF[j+1][IM],cmplxF[j+2][RE],
cmplxF[j+2][IM]);
DipoleCoord[j+2],cvNorm2(cmplxF+j),creal(cmplxF[j]),cimag(cmplxF[j]),creal(cmplxF[j+1]),cimag(cmplxF[j+1]),
creal(cmplxF[j+2]),cimag(cmplxF[j+2]));
else for (j=0;j<local_nRows;j+=3) fprintf(file,GFORM7L"\n",DipoleCoord[j],DipoleCoord[j+1],DipoleCoord[j+2],
DotProd(realF+j,realF+j),realF[j],realF[j+1],realF[j+2]);
FCloseErr(file,fname,ALL_POS);
Expand Down
95 changes: 25 additions & 70 deletions src/GenerateB.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ void GenerateB (const enum incpol which, // x - or y polarized incident light
doublecomplex v1[3],v2[3],v3[3];
double ro2,ro4;
double x,y,z,x2_s,xy_s;
doublecomplex t1,t2,t3,t4,t5,t6,t7,t8,t0,ctemp;
doublecomplex t1,t2,t3,t4,t5,t6,t7,t8,ctemp;
const double *ex; // coordinate axis of the beam reference frame
double ey[3];
double r1[3];
Expand All @@ -184,8 +184,8 @@ void GenerateB (const enum incpol which, // x - or y polarized incident light
case B_PLANE: // plane is separate to be fast
for (i=0;i<local_nvoid_Ndip;i++) {
j=3*i;
imExp(WaveNum*DotProd(DipoleCoord+j,prop),ctemp); // ctemp=exp(ik*r.a)
cScalMultRVec(ex,ctemp,b+j); // b[i]=ctemp*ex
ctemp=imExp(WaveNum*DotProd(DipoleCoord+j,prop)); // ctemp=exp(ik*r.a)
cvMultScal_RVec(ctemp,ex,b+j); // b[i]=ctemp*ex
}
return;
case B_LMINUS:
Expand All @@ -199,94 +199,49 @@ void GenerateB (const enum incpol which, // x - or y polarized incident light
y=DotProd(r1,ey)*scale_x;
z=DotProd(r1,prop)*scale_z;
ro2=x*x+y*y;
// calculate Q=1/(2z-i)
Q[IM]=1/(1+4*z*z);
Q[RE]=2*z*Q[IM];
// calculate psi0=-iQexp(iQro^2)
cMult_i2(Q,t1);
cMultReal(ro2,t1,t2);
cExpSelf(t2);
cMult(t1,t2,psi0);
cInvSign(psi0);
Q=1/(2*z-I);
psi0=-I*Q*cexp(I*Q*ro2);
// ctemp=exp(ik*z0)*psi0, z0 - non-scaled coordinate (z/scale_z)
imExp(WaveNum*z/scale_z,ctemp);
cMultSelf(ctemp,psi0);
ctemp=imExp(WaveNum*z/scale_z)*psi0;
// the following logic (if-else-if...) is hard to replace by a simple switch
if (beamtype==B_LMINUS) {
cScalMultRVec(ex,ctemp,b+j); // b[i]=ctemp*ex
}
if (beamtype==B_LMINUS) cvMultScal_RVec(ctemp,ex,b+j); // b[i]=ctemp*ex
else {
x2_s=x*x/ro2;
cSquare(Q,Q2);
Q2=Q*Q;
ro4=ro2*ro2;
// some combinations that are used more than once
cMultReal(s2*ro2,Q2,t4); // t4=(s*ro*Q)^2
cMultReal(ro2,Q,t5);
cMult_i(t5); // t5=i*Q*ro^2
cMultReal(ro4,Q2,t6); // t6=ro^4*Q^2
cMultReal(x*s,Q,t7); // t7=x*s*Q
t4=s2*ro2*Q2; // t4=(s*ro*Q)^2
t5=I*ro2*Q; // t5=i*Q*ro^2
t6=ro4*Q2; // t6=ro^4*Q^2
t7=x*s*Q; // t7=x*s*Q
if (beamtype==B_DAVIS3) {
// t1=1+s^2(-4Q^2*x^2-iQ^3*ro^4)=1-t4(4x2_s+t5)
cEqual(t5,t1);
t1[RE]+=4*x2_s;
cMultSelf(t1,t4);
cMultReal(-1,t1,t1);
t1[RE]+=1;
t1 = 1 - t4*(4*x2_s+t5);
// t2=0
t2[RE]=t2[IM]=0;
t2=0;
// t3=-s(2Qx)+s^3(8Q^3*ro^2*x+2iQ^4*ro^4*x-4iQ^2x)=2t7[-1+iQ*s2*(-4t5+t6-2)]
cMultReal(-4,t5,t3);
cAdd(t3,t6,t3);
t3[RE]-=2;
cMultReal(s2,t3,t3);
cMultSelf(t3,Q);
cMult_i(t3);
t3[RE]-=1;
cMultSelf(t3,t7);
cMultReal(2,t3,t3);
t3 = 2*t7*(-1 + I*Q*s2*(-4*t5+t6-2));
}
else if (beamtype==B_BARTON5) {
xy_s=x*y/ro2;
cMultReal(2,t5,t8);
t8[RE]+=8; // t8=8+2i*Q*ro^2
t8=8+2*t5; // t8=8+2i*Q*ro^2
/* t1 = 1 + s^2(-ro^2*Q^2-i*ro^4*Q^3-2Q^2*x^2)
* + s^4[2ro^4*Q^4+3iro^6*Q^5-0.5ro^8*Q^6+x^2(8ro^2*Q^4+2iro^4*Q^5)]
* = 1 + t4*{-1-2xs2-t5+t4*[2+3t5-0.5t6+x2_s*t8]}
* = 1 + t4*{-1-2x2_s-t5+t4*[2+3t5-0.5t6+x2_s*t8]}
*/
cMultReal(x2_s,t8,t1);
cMultReal(-0.5,t6,t0);
cAdd(t1,t0,t1);
cMultReal(3,t5,t0);
cAdd(t1,t0,t1);
t1[RE]+=2;
cMultSelf(t1,t4);
cSubtr(t1,t5,t1);
t1[RE]-=1+2*x2_s;
cMultSelf(t1,t4);
t1[RE]+=1;
t1 = 1 + t4*(-1 - 2*x2_s - t5 + t4*(2+3*t5-0.5*t6+x2_s*t8));
// t2=s^2(-2Q^2*xy)+s^4[xy(8ro^2*Q^4+2iro^4*Q^5)]=xy_s*t4(-2+t4*t8)
cMult(t4,t8,t2);
t2[RE]-=2;
cMultSelf(t2,t4);
cMultReal(xy_s,t2,t2);
t2=xy_s*t4*(-2+t4*t8);
/* t3 = s(-2Qx) + s^3[(6ro^2*Q^3+2iro^4*Q^4)x] + s^5[(-20ro^4*Q^5-10iro^6*Q^6+ro^8*Q^7)x]
* = t7{-2+t4[6+2t5+t4(-20-10t5+t6)]}
*/
cMultReal(-10,t5,t3);
cAdd(t3,t6,t3);
t3[RE]-=20;
cMultSelf(t3,t4);
cMultReal(2,t5,t0);
cAdd(t3,t0,t3);
t3[RE]+=6;
cMultSelf(t3,t4);
t3[RE]-=2;
cMultSelf(t3,t7);
} else LogError(ONE_POS,"Inconsistency in beam definition"); // to remove warnings
t3 = t7*(-2 + t4*(6 + 2*t5 + t4*(-20-10*t5+t6)));
}
else LogError(ONE_POS,"Inconsistency in beam definition"); // to remove warnings
// b[i]=ctemp(ex*t1+ey*t2+ez*t3)
cScalMultRVec(ex,t1,v1);
cScalMultRVec(ey,t2,v2);
cScalMultRVec(prop,t3,v3);
cvMultScal_RVec(t1,ex,v1);
cvMultScal_RVec(t2,ey,v2);
cvMultScal_RVec(t3,prop,v3);
cvAdd2Self(v1,v2,v3);
cvMultScal_cmplx(ctemp,v1,b+j);
}
Expand Down
11 changes: 9 additions & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -328,10 +328,17 @@ ifeq ($(COMPILER),gnu)
CSTD := -std=c99
COPT1 := -O2
COPT2 := -O3 -ffast-math -funroll-loops
CWARN := -Wall -Wextra -pedantic -Wcast-qual -Wpointer-arith -Wwrite-strings -Wstrict-prototypes \
CWARN := -Wall -Wextra -Wcast-qual -Wpointer-arith -Wwrite-strings -Wstrict-prototypes \
-Wstrict-aliasing=1 -Wshadow -Wcast-align -Wnested-externs -Wcomment -Wno-unknown-pragmas \
-Wno-overlength-strings

# gcc versions prior to 4.7.2 are affected by bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=7263 , which causes
# -pedantic flag to generates warnings on every occurence of I (complex i)
GCC_GTEQ_472 := $(shell expr `gcc -dumpversion | sed -e 's/\.\([0-9][0-9]\)/\1/g' -e 's/\.\([0-9]\)/0\1/g' \
-e 's/^[0-9]\{3,4\}$$/&00/'` \>= 40702)
ifeq "$(GCC_GTEQ_472)" "1"
CWARN += -pedantic
endif
# Use gfortran if available (GCC 4 and later), otherwise try g77
ifeq ($(shell which gfortran > /dev/null 2>&1 && echo 0),0)
CF := gfortran
Expand Down
Loading

0 comments on commit 057298c

Please sign in to comment.