From 3fd68c9832b1ceb47a591988187082b37b416e86 Mon Sep 17 00:00:00 2001 From: yurkin Date: Wed, 25 Sep 2013 09:45:34 +0000 Subject: [PATCH] Finally, a working implementation of particle near surface in FFT mode. - additional required memory was described in comments in calculator.c - definition of ZsumShift is now different for sparse and FFT modes in parallel (relative to the local or global bottom dipoles respectively). Related difference is in definition of local_Nz_Rm (used for Sommerfeld table and, in FFT mode, for filling Rmatrix). - Sommerfeld table is now not calculated during prognosis, but corresponding memory is properly accounted for. - Current implementation requires additional forward fftY and fftZ at each iteration (but see issue 177). - BlockTranspose_Dm in comm.c was renamed into BlockTranspose_DRm, since it works both for D- and Rmatrix. - existing functions to index Dmatrix has been slightly changed to not use DsizeYZ and use y>=DsizeY instead of y>smallY. - functions and plans, not specific to Dmatrix were renamed (Y and Z forward transforms), now they have 'slice' in their names. - transposeYZ_Dm was changed to transpose (a general function), and transposeYZ is now just a wrapper around the latter. - version incremented to 1.3b1. Main computational bottleneck is computation of the table of Sommerfeld integrals. Currently it is approximately equivalent to 200 iterations (issue 176). So should not be a problem for applications with a larger number of iterations. New implementation was extensively tested against previous sparse version. Still, quantitative comparison with published literature data is required. Remaining limitation is that it doesn't work in OpenCL mode (see issue 101 for details, explicit exception was added to param.c). Changes to timing: - added time for 'init interaction' (significant when Sommerfeld table is calculated). - Time for 'init Dmatrix' now may include the time for initialization of Rmatrix (doesn't include time for Sommerfeld table). - Precise timing for Dmatrix now includes a separate line for initialization of Rmatrix (without details) Other: - scattering at exactly 90 degrees (along surface) for non-trivial surfaces is now handled by a special case to produce exact 0. This makes the output consistent, avoiding large integration errors for -phi_integr or alldir. - added explicit exception to forbid combination of -no_reduced_fft and -iter cgnr in MPI FFT mode (issue 174). Tests in tests/2exec/ have been significantly improved - added SURF_EXT flag, which allows running a large suite of tests, adding '-surf ...' option to each of them. - added a number of ignores, which are always active (decreases false-positives) - added a couple of macros in suite files (NOMPI and NOMPISEQ), which indicates that the line should be skipped for a specific comparison modes. - added a number of tests to the default suite: '-surf ...', '-int_surf', -scat_plane, '-shape read ... -grid ...', '-no_reduced_fft -iter cgnr' (see above) --- src/GenerateB.c | 2 +- src/calculator.c | 11 +- src/comm.c | 8 +- src/comm.h | 2 +- src/const.h | 2 +- src/crosssec.c | 11 +- src/fft.c | 507 +++++++++++++++++++++++++++------------ src/interaction.c | 47 ++-- src/interaction.h | 4 +- src/make_particle.c | 9 +- src/matvec.c | 94 ++++++-- src/param.c | 17 ++ src/somnec.c | 2 +- src/timing.c | 5 +- src/vars.c | 2 +- tests/2exec/comp2exec | 52 +++- tests/2exec/suite | 22 +- tests/2exec/suite_sparse | 23 +- tests/2exec/suite_surf | 35 ++- 19 files changed, 606 insertions(+), 249 deletions(-) diff --git a/src/GenerateB.c b/src/GenerateB.c index afccdd26..e1535379 100644 --- a/src/GenerateB.c +++ b/src/GenerateB.c @@ -43,7 +43,7 @@ extern const int beam_Npars; extern const double beam_pars[]; extern const char *beam_fnameY; extern const char *beam_fnameX; -extern opt_index opt_beam; +extern const opt_index opt_beam; // used in crosssec.c double beam_center_0[3]; // position of the beam center in laboratory reference frame diff --git a/src/calculator.c b/src/calculator.c index eb60c0bf..afca210c 100644 --- a/src/calculator.c +++ b/src/calculator.c @@ -43,7 +43,7 @@ extern const angle_set beta_int,gamma_int,theta_int,phi_int; extern const int avg_inc_pol; extern const char *alldir_parms,*scat_grid_parms; // defined and initialized in timing.c -extern TIME_TYPE Timing_Init; +extern TIME_TYPE Timing_Init,Timing_Init_Int; #ifdef OPENCL extern TIME_TYPE Timing_OCL_Init; #endif @@ -445,14 +445,17 @@ static void AllocateEverything(void) /* estimate of the memory (only the fastest scaling part): * MatVec - (288+384nprocs/boxX [+192/nprocs])*Ndip * more exactly: gridX*gridY*gridZ*(36+48nprocs/boxX [+24/nprocs]) value in [] is only for parallel mode. - * For OpenCL mode all MatVec part is allocated on GPU instead of main (CPU) memory + * For surf additionally: gridX*gridY*gridZ*(48+48nprocs/boxX) + * + for Sommerfeld table: 64*boxZ*(boxX*boxY-(MIN(boxX,boxY))^2) + * For OpenCL mode all MatVec part is allocated on GPU instead of main (CPU) memory * others - nvoid_Ndip*{271(CGNR,BiCG), 367(CSYM,QMR2), 415(BiCGStab,QMR), or 463(BCGS2)} * + additional 8*nvoid_Ndip for OpenCL mode and CGNR or Bi-CGSTAB * PARALLEL: above is total; division over processors of MatVec is uniform, others - according to local_nvoid_Ndip * * Sparse mode - each processor needs (265--457, depending on iterative solver)*local_nvoid_Ndip + 60*nvoid_Ndip * and division is uniform, i.e. local_nvoid_Ndip = nvoid_Ndip/nprocs - * Part of the memory is currently not distributed among processors - see issue 160. + * Sommerfeld table - same as above, but it is not divided among processors. + * Part of the memory is currently not distributed among processors - see issues 160,175. */ MAXIMIZE(memPeak,memory); double memSum=AccumulateMax(memPeak,&memmax); @@ -592,7 +595,9 @@ void Calculator (void) else dtheta_deg=dtheta_rad=block_theta=0; finish_avg=false; // Do preliminary setup for MatVec + TIME_TYPE startInitInt=GET_TIME(); InitInteraction(); + Timing_Init_Int=GET_TIME()-startInitInt; #ifndef SPARSE // initialize D matrix (for matrix-vector multiplication) D("InitDmatrix started"); diff --git a/src/comm.c b/src/comm.c index 9b63b16a..6da60ff1 100644 --- a/src/comm.c +++ b/src/comm.c @@ -676,10 +676,10 @@ void BlockTranspose(doublecomplex * restrict X UOIP,TIME_TYPE *timing UOIP) //====================================================================================================================== -void BlockTranspose_Dm(doublecomplex * restrict X UOIP,const size_t lengthY UOIP,const size_t lengthZ UOIP) -/* do the data-transposition, i.e. exchange, between fftX and fftY&fftZ; specialized for D matrix. It can be updated to - * accept timing argument for generality. But, since this is a specialized function, we keep the timing variable - * hard-wired in the code. +void BlockTranspose_DRm(doublecomplex * restrict X UOIP,const size_t lengthY UOIP,const size_t lengthZ UOIP) +/* do the data-transposition, i.e. exchange, between fftX and fftY&fftZ; specialized for D or R matrix. It can be + * updated to accept timing argument for generality. But, since this is a specialized function, we keep the timing + * variable hard-wired in the code. */ { #ifdef ADDA_MPI diff --git a/src/comm.h b/src/comm.h index 010f9c5a..cb425d92 100644 --- a/src/comm.h +++ b/src/comm.h @@ -45,7 +45,7 @@ void ReadField(const char * restrict fname,doublecomplex *restrict field); #ifndef SPARSE void BlockTranspose(doublecomplex * restrict X,TIME_TYPE *timing); -void BlockTranspose_Dm(doublecomplex * restrict X,size_t lengthY,size_t lengthZ); +void BlockTranspose_DRm(doublecomplex * restrict X,size_t lengthY,size_t lengthZ); // used by granule generator void SetGranulComm(double z0,double z1,double gdZ,int gZ,size_t gXY,size_t buf_size,int *lz0,int *lz1,int sm_gr); void CollectDomainGranul(unsigned char * restrict dom,size_t gXY,int lz0,int locgZ,TIME_TYPE *timing); diff --git a/src/const.h b/src/const.h index e965fda7..77c0578e 100644 --- a/src/const.h +++ b/src/const.h @@ -18,7 +18,7 @@ #define __const_h // version number (string) -#define ADDA_VERSION "1.3a2" +#define ADDA_VERSION "1.3b1" /* ADDA uses certain C99 extensions, which are widely supported by GNU and Intel compilers. However, they may be not * completely supported by e.g. Microsoft Visual Studio compiler. Therefore, we check the version of the standard here diff --git a/src/crosssec.c b/src/crosssec.c index 834499a3..5b8d2eec 100644 --- a/src/crosssec.c +++ b/src/crosssec.c @@ -631,6 +631,11 @@ static void CalcFieldSurf(doublecomplex ebuff[static restrict 3], // where to wr if (above) { // simple reflection vCopy(nF,nN); nN[2]*=-1; + // no scattering at exactly 90 degrees for non-trivial surface (to avoid randomness for this case) + if (fabs(nN[2])ROUND_ERR) { + cvInit(ebuff); + return; + } } else { // transmission if (msubInf) { // no transmission for perfectly reflecting substrate => zero result @@ -738,11 +743,13 @@ static void CalcFieldSurf(doublecomplex ebuff[static restrict 3], // where to wr kt=NAN; // redundant to remove warnings below } else { - if (cabs(msub-1)=DsizeY) y=gridY-y; if (z>=DsizeZ) z=gridZ-z; - - return(NDCOMP*(x*DsizeYZ+z*DsizeY+y)); + return(NDCOMP*((x*DsizeZ+z)*DsizeY+y)); } //====================================================================================================================== -static inline size_t IndexGarbledD(const size_t x,int y,int z,const size_t lengthN UOIP) -// index D2 matrix after BlockTranspose +static inline size_t IndexGarbledD(const size_t x,int y,int z) +// index D2 matrix after BlockTranspose (periodic over y and z) { - if (y<0) y+=D2sizeY; - if (z<0) z+=D2sizeZ; + if (y<0) y+=gridY; + if (z<0) z+=gridZ; #ifdef PARALLEL - return(((z%lengthN)*D2sizeY+y)*gridX+(z/lengthN)*local_Nx+x%local_Nx); + return(((z%lz_Dm)*D2sizeY+y)*gridX+(z/lz_Dm)*local_Nx+x%local_Nx); #else return((z*D2sizeY+y)*gridX+x); #endif @@ -146,128 +156,71 @@ static inline size_t IndexGarbledD(const size_t x,int y,int z,const size_t lengt //====================================================================================================================== -static inline size_t IndexD2matrix(int x,int y,int z,const int nnn) -// index D2 matrix to store calculated elements +static inline size_t IndexSliceD2matrix(int y,int z) +// index slice of D2 matrix (periodic over y and z) { - if (x<0) x+=gridX; - if (y<0) y+=D2sizeY; - // if (z<0) z+=D2sizeZ; - return(((z-nnn*local_z0)*D2sizeY+y)*gridX+x); + if (y<0) y+=gridY; + if (z<0) z+=gridZ; + return(y*gridZ+z); } //====================================================================================================================== -static inline size_t IndexSliceD2matrix(int y,int z) -// index slice of D2 matrix +static inline size_t Index2matrix(int x,int y,const int z,const int sizeY) +// index D2 or R2 matrix to store calculated elements (periodic over x and y), z should already be shifted { + if (x<0) x+=gridX; if (y<0) y+=gridY; - if (z<0) z+=gridZ; - - return(y*gridZ+z); + return((z*sizeY+y)*gridX+x); } //====================================================================================================================== -static inline size_t IndexSlice_zyD2matrix(const size_t y,const size_t z) -// index transposed slice of D2 matrix +static inline size_t IndexSlice_zy(const size_t y,const size_t z) +// index transposed slice of D2 (or R2) matrix { return (z*gridY+y); } //====================================================================================================================== -void TransposeYZ(const int direction) -/* optimized routine to transpose y and z; forward: slices->slices_tr; backward: slices_tr->slices; direction can be - * made boolean but this contradicts with existing definitions of FFT_FORWARD and FFT_BACKWARD, which themselves are - * determined by FFT routines invocation format - */ +static inline size_t IndexRmatrix(const size_t x,size_t y,const size_t z) +// index R matrix to store final result (symmetric with respect to center for y) { -#ifdef OPENCL - const size_t enqtglobalzy[3]={gridZ,gridY,3}; - const size_t enqtglobalyz[3]={gridY,gridZ,3}; - const size_t tblock[3]={16,16,1}; // this corresponds to BLOCK_DIM in oclkernels.cl - /* TODO: test in which cases is the uncached variant faster than the cached one, to make a conditional or to remove - * cltransposef/b if cltransposeof/b is allways faster than cltransposef/b - */ - /* When calling kernels the working group size can't be smaller than the data size; hence cached kernel can be used - * only for large enough problems. Alternative solution is to determine the block size during ADDA runtime and pass - * it to kernel during its compilation. But using small block size is not efficient anyway, so falling back to - * noncached kernel is logical. - */ - bool cached=(enqtglobalzy[0]>=tblock[0] && enqtglobalzy[1]>=tblock[1]); - cached&=(gridZ%16==0 && gridY%16==0); // this is required due to current limitation of cached kernel - - if (direction==FFT_FORWARD) { - if (cached) - CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeof,3,NULL,enqtglobalzy,tblock,0,NULL,NULL)); - else CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposef,2,NULL,enqtglobalzy,NULL,0,NULL,NULL)); - } - else { - if (cached) - CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeob,3,NULL,enqtglobalyz,tblock,0,NULL,NULL)); - else CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeb,2,NULL,enqtglobalyz,NULL,0,NULL,NULL)); - } - clFinish(command_queue); -#else - size_t y,z,Y,Z,y1,y2,z1,z2,i,j,y0,z0,Xcomp; - doublecomplex *t0,*t1,*t2,*t3,*t4,*w0,*w1,*w2,*w3; - - if (direction==FFT_FORWARD) { - Y=gridY; - Z=gridZ; - w0=slices; - t0=slices_tr-Y; - } - else { // direction==FFT_BACKWARD - Y=gridZ; - Z=gridY; - w0=slices_tr; - t0=slices-Y; - } + if (y>=RsizeY) y=gridY-y; + return(NDCOMP*((x*gridZ+z)*RsizeY+y)); +} - y1=Y/blockTr; - y2=Y%blockTr; - z1=Z/blockTr; - z2=Z%blockTr; +//====================================================================================================================== - for(Xcomp=0;Xcomp<3;Xcomp++) { - w1=w0+Xcomp*gridYZ; - t1=t0+Xcomp*gridYZ; - for(i=0;i<=y1;i++) { - if (i==y1) y0=y2; - else y0=blockTr; - w2=w1; - t2=t1; - for(j=0;j<=z1;j++) { - if (j==z1) z0=z2; - else z0=blockTr; - w3=w2; - t3=t2; - for (y=0;y trans +static inline size_t IndexSliceR2matrix(int y,const int z) +// index slice of R2 matrix (periodic over y) { - size_t y,z,Y,Z,y1,y2,z1,z2,i,j,y0,z0; - doublecomplex *t1,*t2,*t3,*t4,*w1,*w2,*w3; + if (y<0) y+=gridY; + return(y*gridZ+z); +} - Y=gridY; - Z=gridZ; +//====================================================================================================================== + +static void transpose(const doublecomplex * restrict data,doublecomplex * restrict trans,const size_t Y,const size_t Z) +// optimized routine to transpose complex matrix with dimensions YxZ: data -> trans +{ + size_t y,z,y1,y2,z1,z2,i,j,y0,z0; + doublecomplex *t1,*t2,*t3,*t4; + const doublecomplex *w1,*w2,*w3; y1=Y/blockTr; y2=Y%blockTr; @@ -302,6 +255,55 @@ static void transposeYZ_Dm(doublecomplex *data,doublecomplex *trans) //====================================================================================================================== +void TransposeYZ(const int direction) +/* optimized routine to transpose y and z; forward: slices->slices_tr; backward: slices_tr->slices; direction can be + * made boolean but this contradicts with existing definitions of FFT_FORWARD and FFT_BACKWARD, which themselves are + * determined by FFT routines invocation format + */ +{ +#ifdef OPENCL + const size_t enqtglobalzy[3]={gridZ,gridY,3}; + const size_t enqtglobalyz[3]={gridY,gridZ,3}; + const size_t tblock[3]={16,16,1}; // this corresponds to BLOCK_DIM in oclkernels.cl + /* TODO: test in which cases is the uncached variant faster than the cached one, to make a conditional or to remove + * cltransposef/b if cltransposeof/b is allways faster than cltransposef/b + */ + /* When calling kernels the working group size can't be smaller than the data size; hence cached kernel can be used + * only for large enough problems. Alternative solution is to determine the block size during ADDA runtime and pass + * it to kernel during its compilation. But using small block size is not efficient anyway, so falling back to + * noncached kernel is logical. + */ + bool cached=(enqtglobalzy[0]>=tblock[0] && enqtglobalzy[1]>=tblock[1]); + cached&=(gridZ%16==0 && gridY%16==0); // this is required due to current limitation of cached kernel + + if (direction==FFT_FORWARD) { + if (cached) + CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeof,3,NULL,enqtglobalzy,tblock,0,NULL,NULL)); + else CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposef,2,NULL,enqtglobalzy,NULL,0,NULL,NULL)); + } + else { + if (cached) + CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeob,3,NULL,enqtglobalyz,tblock,0,NULL,NULL)); + else CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeb,2,NULL,enqtglobalyz,NULL,0,NULL,NULL)); + } + clFinish(command_queue); +#else + size_t Xcomp,ind; + + if (direction==FFT_FORWARD) for (Xcomp=0;Xcomp<3;Xcomp++) { + ind=Xcomp*gridYZ; + transpose(slices+ind,slices_tr+ind,gridY,gridZ); + if (surface) transpose(slicesR+ind,slicesR_tr+ind,gridY,gridZ); + } + else for (Xcomp=0;Xcomp<3;Xcomp++) { // direction==FFT_BACKWARD + ind=Xcomp*gridYZ; + transpose(slices_tr+ind,slices+ind,gridZ,gridY); + } +#endif +} + +//====================================================================================================================== + void fftX(const int isign) // FFT three components of (buf)Xmatrix(x) for all y,z; called from matvec { @@ -347,13 +349,18 @@ void fftY(const int isign) # endif clFinish(command_queue); #elif defined(FFTW3) - if (isign==FFT_FORWARD) fftw_execute(planYf); + if (isign==FFT_FORWARD) { + fftw_execute(planYf); + if (surface) fftw_execute(planYRf); + } else fftw_execute(planYb); #elif defined(FFT_TEMPERTON) int nn=gridY,inc=1,jump=nn,lot=3*gridZ; IGNORE_WARNING(-Wstrict-aliasing); cfft99_((double *)(slices_tr),work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign); + // the same operation is applied to sliceR_tr, when required + if (surface && isign==FFT_FORWARD) cfft99_((double *)(slicesR_tr),work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign); STOP_IGNORE; #endif } @@ -373,20 +380,28 @@ void fftZ(const int isign) # endif clFinish(command_queue); #elif defined(FFTW3) - if (isign==FFT_FORWARD) fftw_execute(planZf); + if (isign==FFT_FORWARD) { + fftw_execute(planZf); + if (surface) fftw_execute(planZRf); + } else fftw_execute(planZb); #elif defined(FFT_TEMPERTON) int nn=gridZ,inc=1,jump=nn,lot=boxY,Xcomp; IGNORE_WARNING(-Wstrict-aliasing); for (Xcomp=0;Xcomp<3;Xcomp++) cfft99_((double *)(slices+gridYZ*Xcomp),work,trigsZ,ifaxZ,&inc,&jump,&nn,&lot,&isign); + if (surface && isign==FFT_FORWARD) { // the same operation is applied to slicesR, but with inverse transform + const int invSign=FFT_BACKWARD; + for (Xcomp=0;Xcomp<3;Xcomp++) + cfft99_((double *)(slicesR+gridYZ*Xcomp),work,trigsZ,ifaxZ,&inc,&jump,&nn,&lot,&invSign); + } STOP_IGNORE; #endif } //====================================================================================================================== -static void fftX_Dm(const size_t lengthZ ONLY_FOR_TEMPERTON) +static void fftX_Dm(void) // FFT(forward) D2matrix(x) for all y,z; used for Dmatrix calculation { #ifdef FFTW3 @@ -396,18 +411,36 @@ static void fftX_Dm(const size_t lengthZ ONLY_FOR_TEMPERTON) size_t z; IGNORE_WARNING(-Wstrict-aliasing); - for (z=0;z(int)smallZ) kcor=k-gridZ; else kcor=k; for (j=jstart;j0) x=gridX-x; if (y>0) y=gridY-y; if (z>0) z=gridZ-z; @@ -95,8 +98,27 @@ static inline size_t IndexDmatrix_mv(size_t x,size_t y,size_t z,const bool trans if (z>=DsizeZ) z=gridZ-z; } - return(NDCOMP*(x*DsizeYZ+z*DsizeY+y)); + return NDCOMP*((x*DsizeZ+z)*DsizeY+y); +} + +//====================================================================================================================== + +static inline size_t IndexRmatrix_mv(size_t x,size_t y,size_t z,const bool transposed) +{ + if (transposed) { // used only for G_SO !!! + /* reflection along the x-axis can't work in parallel mode, since the corresponding values are generally stored + * on a different processor. A rearrangement of memory distribution is required to remove this limitation. + */ + if (x>0) x=gridX-x; + if (y>0) y=gridY-y; + } + else { + if (y>=RsizeY) y=gridY-y; + } + + return NDCOMP*((x*gridZ+z)*RsizeY+y); } + #endif // OPENCL #endif // SPARSE @@ -119,7 +141,7 @@ void MatVec (doublecomplex * restrict argvec, // the argument vector size_t boxY_st=boxY,boxZ_st=boxZ; // copies with different type #ifndef OPENCL // these variables are not needed for OpenCL size_t i; - doublecomplex fmat[6],xv[3],yv[3]; + doublecomplex fmat[6],xv[3],yv[3],xvR[3],yvR[3]; size_t index,y,z,Xcomp; unsigned char mat; #endif @@ -146,6 +168,14 @@ void MatVec (doublecomplex * restrict argvec, // the argument vector * G_SO: F(D(T)) (k) = F(D) (-k) * k - vector index * + * For reflected matrix the situation is similar. + * R.x=F(-1)(F(R).H(X)), where R is a vector, similar with G, where R[i,j,k=0] is for interaction of two bottom dipoles. + * H(X) is FxFy(Fz^(-1)(X)), where Fx,... are Fourier transforms along corresponding coordinates. It can be computed + * along with F(X). + * Matrix R is symmetric (as a whole), but not in small parts, so R(i,j)=R(j,i)(T). Hence, in contrast to D, for + * 'transpose' actual transpose (changing sign of a few elements) of 3x3 submatrix is required along with addressing + * different elements of F(R). + * * For (her) three additional operations of nConj are used. Should not be a problem, but can be avoided by a more * complex code. */ @@ -232,6 +262,11 @@ void MatVec (doublecomplex * restrict argvec, // the argument vector #endif // following is done by slices for(x=local_x0;xsmallY) { // we assume that compiler will optimize x*=-1 into negation of sign + if (reduced_FFT) { // symmetry with respect to reflection (x_i -> x_2N-i) is the same as in r-space + if (y>=DsizeY) { // we assume that compiler will optimize x*=-1 into negation of sign fmat[1]*=-1; - if (z>smallZ) fmat[2]*=-1; + if (z>=DsizeZ) fmat[2]*=-1; else fmat[4]*=-1; } - else if (z>smallZ) { + else if (z>=DsizeZ) { + fmat[2]*=-1; + fmat[4]*=-1; + } + } + cSymMatrVec(fmat,xv,yv); // yv=fmat.xv + if (surface) { + for (Xcomp=0;Xcomp<3;Xcomp++) xvR[Xcomp]=slicesR_tr[i+Xcomp*gridYZ]; + j=IndexRmatrix_mv(x-local_x0,y,z,transposed); + memcpy(fmat,Rmatrix+j,6*sizeof(doublecomplex)); + if (reduced_FFT && y>=RsizeY) { + fmat[1]*=-1; + fmat[4]*=-1; + } + if (transposed) { // corresponds to transpose of 3x3 matrix fmat[2]*=-1; fmat[4]*=-1; } + // yv+=fmat.xvR + cReflMatrVec(fmat,xvR,yvR); + cvAdd(yvR,yv,yv); } - cSymMatrVec(fmat,xv,yv); // yv=fmat*xv for (Xcomp=0;Xcomp<3;Xcomp++) slices_tr[i+Xcomp*gridYZ]=yv[Xcomp]; } #endif diff --git a/src/param.c b/src/param.c index 0ac8df4d..a4cf9b28 100644 --- a/src/param.c +++ b/src/param.c @@ -1400,6 +1400,9 @@ PARSE_FUNC(store_scat_grid) PARSE_FUNC(surf) { double mre,mim; +#ifdef OPENCL + PrintError("Currently surface is not supported in OpenCL mode"); +#endif if (Narg!=2 && Narg!=3) NargError(Narg,"2 or 3"); ScanDoubleError(argv[1],&hsub); TestPositive(hsub,"height above surface"); @@ -1945,6 +1948,20 @@ void VariablesInterconnect(void) } #ifdef SPARSE if (shape==SH_SPHERE) PrintError("Sparse mode requires shape to be read from file (-shape read ...)"); +#endif +#if defined(PARALLEL) && !defined(SPARSE) + /* Transpose of the non-symmetric interaction matrix can't be done in MPI mode, due to existing memory distribution + * among different processors. This causes problems for iterative solvers, which require a product of Hermitian + * transpose (calculated through the standard transpose) of the matrix with vector (currently, only CGNR). To remove + * this limitation (issue 174) memory distribution should be changed to have both x and gridX-x on the same + * processor. + */ + if (!reduced_FFT && IterMethod==IT_CGNR) PrintError("Non-symmetric interaction matrix (e.g., -no_reduced_fft) " + "can not be used together with CGNR iterative solver in the MPI mode"); + /* TO ADD NEW ITERATIVE SOLVER + * add the new iterative solver to the above line, if it requires calculation of product of Hermitian transpose + * of the matrix with vector (i.e. calls MatVec function with 'true' as the fourth argument) + */ #endif // scale boxes by jagged; should be completely robust to overflows #define JAGGED_BOX(a) { \ diff --git a/src/somnec.c b/src/somnec.c index eaba321a..052305e9 100644 --- a/src/somnec.c +++ b/src/somnec.c @@ -11,7 +11,7 @@ /* TODO: Systematic accuracy study of this code is required. At least 7 digits of precision are desired (for test runs) * However, the urgent thing is that for some reason the accuracy of this code is very bad (few percent errors for - * rho=0, exactly). + * rho=0, exactly). (issue 176) */ // ADDA: the cycling to generate interpolation grid was removed, now it is a single-run routine diff --git a/src/timing.c b/src/timing.c index 236bd87d..c14ebac5 100644 --- a/src/timing.c +++ b/src/timing.c @@ -39,7 +39,8 @@ TIME_TYPE Timing_EPlane,Timing_EPlaneComm, // for Eplane calculation: total a Timing_ScatQuan; // for integral scattering quantities size_t TotalEFieldPlane; // total number of planes for scattered field calculations // used in calculator.c -TIME_TYPE Timing_Init; // for total initialization of the program (before CalculateE) +TIME_TYPE Timing_Init, // for total initialization of the program (before CalculateE) + Timing_Init_Int; // for initialization of interaction routines (including computing tables) size_t TotalEval; // total number of orientation evaluations #ifdef OPENCL TIME_TYPE Timing_OCL_Init; // for initialization of OpenCL (including building program) @@ -139,6 +140,8 @@ void FinalStatistics(void) " init OpenCL "FFORMT"\n",TO_SEC(Timing_OCL_Init)); #endif + fprintf(logfile, + " init interaction "FFORMT"\n",TO_SEC(Timing_Init_Int)); #ifndef SPARSE fprintf(logfile, " init Dmatrix "FFORMT"\n",TO_SEC(Timing_Dm_Init)); diff --git a/src/vars.c b/src/vars.c index 7c85c9db..61cedf7a 100644 --- a/src/vars.c +++ b/src/vars.c @@ -154,7 +154,7 @@ int local_z0,local_z1; // starting and ending z for current processor size_t local_Nz; // number of z layers (based on the division of smallZ) int local_Nz_unif; /* number of z layers (distance between max and min values), belonging to this processor, after all non_void dipoles are uniformly distributed between all processors */ -int local_z1_coer; // ending z, coerced to be not greater than boxZ +int local_z1_coer; // ending z, coerced to be not greater than boxZ (and not smaller than local_z0) // starting, ending x for current processor and number of x layers (based on the division of smallX) size_t local_x0,local_x1,local_Nx; diff --git a/tests/2exec/comp2exec b/tests/2exec/comp2exec index 5cc09b88..293fa898 100755 --- a/tests/2exec/comp2exec +++ b/tests/2exec/comp2exec @@ -31,7 +31,9 @@ MPIRUN="mpiexec -n 4" #SPARSE=1 #SPARSE_STANDARD=1 -#Special mode for comparing surface mode (with surface refractive index = 1) with standard +# Extensive testing of surface (in combination with all other options) and special mode for comparing surface mode +# (with surface refractive index = 1) with standard. Use at most one of them +#SURF_EXT=1 #SURF_STANDARD=1 if [ -n "$SPARSE_STANDARD" ]; then @@ -79,6 +81,11 @@ else exit 1 fi +if [ -n "$SURF_EXT" ]; then + DEFSUITE=suite_surf + EXECREF="$EXECREF -surf 10 2 0.1" + EXECTEST="$EXECTEST -surf 10 2 0.1" +fi if [ -n "$SURF_STANDARD" ]; then DEFSUITE=suite_surf EXECTEST="$EXECTEST -surf 10 1 0 -yz" @@ -155,7 +162,7 @@ function mycmp { # behavior is mainly determined by file name base=`basename $1` if [ "$base" == $SONAME ]; then - IGNORE="^all data is saved in '.*'" + IGNORE="^all data is saved in '.*'|No real dipoles are assigned" if [ $MODE == "mpi_seq" ]; then IGNORE="$IGNORE|^(M|Total m|Maximum m|Additional m)emory usage" elif [ $MODE == "ocl_seq" ]; then @@ -164,11 +171,14 @@ function mycmp { IGNORE="$IGNORE|^(M|Total m|OpenCL m)emory usage" fi if [ -n "$FFTCOMP" ]; then - IGNORE="$IGNORE|^(M|Total m|OpenCL m|Maximum m)emory usage" + IGNORE="$IGNORE|^(M|Total m|OpenCL m|Maximum m)emory usage|^Initializing (clFFT|FFTW3)" fi if [ -n "$SPARSE_STANDARD" ]; then - IGNORE="$IGNORE|^Calculating Green's function|^Fourier transform of Dmatrix|^Initializing (clFFT|FFTW3)" + IGNORE="$IGNORE|^Calculating( reflected|) Green's function|^Fourier transform of" fi + if [ -n "$SURF_STANDARD" ]; then + IGNORE="$IGNORE|^Calculating (table|reflected)|^Fourier transform of|^(M|Total m|OpenCL m|Maximum m)emory usage" + fi if [[ $MODE == "mpi" || $MODE == "mpi_seq" ]]; then CUT="^Error posting writev, " # due to typical random errors of MPICH under Windows else @@ -186,27 +196,28 @@ function mycmp { elif [ "$base" == "log" ]; then IGNORE="^Generated by ADDA v\.|^command: '.*'" if [ $MODE == "mpi_seq" ]; then - IGNORE="$IGNORE|^The program was run on: |^(M|Total m|Maximum m|Additional m)emory usage" + IGNORE="$IGNORE|^The program was run on:|^(M|Total m|Maximum m|Additional m)emory usage|^The FFT grid is:" elif [ $MODE == "ocl_seq" ]; then - IGNORE="$IGNORE|^Using OpenCL device|^Device memory|^OpenCL FFT algorithm: |^(M|Total m|OpenCL m)emory usage" + IGNORE="$IGNORE|^Using OpenCL device|^Device memory|^OpenCL FFT algorithm:|^(M|Total m|OpenCL m)emory usage" fi if [ -n "$FFTCOMP" ]; then - IGNORE="$IGNORE|^(|OpenCL )FFT algorithm: |^The FFT grid is: |^(M|Total m|OpenCL m|Maximum m)emory usage" + IGNORE="$IGNORE|^(|OpenCL )FFT algorithm:|^The FFT grid is:|^(M|Total m|OpenCL m|Maximum m)emory usage" fi if [ -n "$SURF_STANDARD" ]; then - IGNORE="$IGNORE|^Particle is placed near|^ height of the particle|^Reflected|^Transmitted|^Total planes of E" + IGNORE="$IGNORE|^Particle is placed|^ height of the|^Reflected|^Transmitted|^Total planes of E" + IGNORE="$IGNORE|^(M|Total m|OpenCL m|Maximum m)emory usage" fi CUT="^Total wall time: " asmin rtol 4 numigndiff $1 $2 "$IGNORE" "$CUT" - elif [[ "$base" = CrossSec-* ]]; then + elif [[ "$base" == CrossSec-* ]]; then numdiff $1 $2 elif [[ "$base" == mueller* || "$base" == ampl* ]]; then asmin atol 9 asmin rtol 5 numdiff $1 $2 elif [[ "$base" == log_int_* || "$base" == "log_orient_avg" ]]; then - asmin atol 12 + asmin atol 10 numdiff $1 $2 elif [[ "$base" == "granules" ]]; then #compare only some comments and total number of lines if [ `wc -l < $1` == `wc -l < $2` ]; then @@ -219,6 +230,10 @@ function mycmp { igndiff $1 $2 "generated by ADDA v\." elif [[ "$base" == IncBeam* ]]; then numdiff $1 $2 + elif [[ "$base" == DipPol* || "$base" == IntField* ]]; then + asmin atol 12 + asmin rtol 6 + numdiff $1 $2 else diff $1 $2 >&2 fi @@ -275,9 +290,24 @@ while read -r cmpfiles cmdline; do let imax=imax+1 finds[$imax]=$cmpfiles reps[$imax]="$cmdline" - #skip blank and commented lines, and all lines when skip=0 + # skip blank and commented lines, and all lines when skip=0 elif [[ -n "$cmpfiles" && "${cmpfiles:0:1}" != "#" && ( $skip -eq 0 || ( $skip -eq 1 && "$cmdline" == $3 ) ) ]]; then skip=0; + # test special cases, given in the file + if [ "$cmpfiles" == "NOMPI" ]; then + if [[ $MODE == "mpi" || $MODE == "mpi_seq" ]]; then + continue + else + cmpfiles="$ALLNAME" + fi + fi + if [ "$cmpfiles" == "NOMPISEQ" ]; then + if [ $MODE == "mpi_seq" ]; then + continue + else + cmpfiles="$ALLNAME" + fi + fi for i in `seq 0 $imax`; do # variable substitution cmdline="${cmdline/${finds[$i]}/${reps[$i]}}" done diff --git a/tests/2exec/suite b/tests/2exec/suite index 4d5b24e3..63bfdb13 100644 --- a/tests/2exec/suite +++ b/tests/2exec/suite @@ -25,6 +25,8 @@ # The format is the following: ' ' the first one is coma-separated list of files to # compare or 'all' (which compares all produced files. is everything after the first space and it is passed # directly to ADDA. +# Instead of 'all' a number of macros can be used: NOMPI, NOMPISEQ which is equivalent to 'all' for other modes, but +# causes the line to be skipped in the matching mode. NOMPI lines are skipped both in mpi and mpi_seq modes. all @@ -72,8 +74,7 @@ all -chp_dir chp_tmp -chp_type regular -chpoint 1s -eps 3 ;mgn; all -h chp_dir all -chp_dir chp_tmp -chp_type always -eps 3 ;mgn; all -h chp_load -# the following may cause errors in mpi_seq mode -all -chp_dir chp_tmp -chp_load ;mgn; +NOMPISEQ -chp_dir chp_tmp -chp_load ;mgn; all -h Cpr all -Cpr ;mgn; @@ -101,6 +102,7 @@ CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n; all -h grid all -grid 4 6 8 ;m; ;n; +all -grid 10 10 10 -shape read sphere.geom -sym enf ;m; ;n; all -h all -h h @@ -122,6 +124,10 @@ all -int igt_so ;mgn; all -int poi ;mgn; all -int so ;mgn; +all -h int_surf +all -int_surf img -surf 4 2 0 ;mgn; +all -int_surf som -surf 4 2 0 ;mgn; + all -h iter all -iter bcgs2 ;mgn; all -iter bicg ;mgn; @@ -145,6 +151,7 @@ all -maxiter 5 ;mgn; all -h no_reduced_fft all -no_reduced_fft ;mgn; +NOMPI -no_reduced_fft -iter cgnr ;mgn; all -h no_vol_cor all -no_vol_cor -size 3 ;mgn; @@ -210,6 +217,9 @@ all -scat_matr ampl ;mgn; all -scat_matr both ;mgn; all -scat_matr none ;m; ;g; +all -h scat_plane +all -scat_plane ;se; ;mgn; + all -h shape all -h shape axisymmetric all -shape axisymmetric 196.txt ;mgn; @@ -274,6 +284,14 @@ all -store_int_field ;se; ;mgn; all -h store_scat_grid all -store_scat_grid ;sep; ;mgn; +all -h surf +all -surf 4 2 0 ;mgn; +all -surf 4 3 4 -prop 1 2 3 ;mgn; +all -surf 4 3 4 -prop 1 2 -3 ;se; ;mgn; +all -surf 4 inf -prop 1 2 -3 ;se; ;mgn; +all -surf 4 2 0 -no_reduced_fft ;mgn; +NOMPI -surf 4 2 0 -iter cgnr -no_reduced_fft ;mgn; + all -h sym all -sym auto ;mgn; all -sym no ;mgn; diff --git a/tests/2exec/suite_sparse b/tests/2exec/suite_sparse index c3715164..a1211ac4 100644 --- a/tests/2exec/suite_sparse +++ b/tests/2exec/suite_sparse @@ -26,6 +26,8 @@ # The format is the following: ' ' the first one is coma-separated list of files to # compare or 'all' (which compares all produced files. is everything after the first space and it is passed # directly to ADDA. +# Instead of 'all' a number of macros can be used: NOMPI, NOMPISEQ which is equivalent to 'all' for other modes, but +# causes the line to be skipped in the matching mode. NOMPI lines are skipped both in mpi and mpi_seq modes. all ;g; @@ -58,8 +60,7 @@ all -chp_dir chp_tmp -chp_type regular -chpoint 1s -eps 3 ;mgn; all -h chp_dir all -chp_dir chp_tmp -chp_type always -eps 3 ;mgn; all -h chp_load -# the following may cause errors in mpi_seq mode -all -chp_dir chp_tmp -chp_load ;mgn; +NOMPISEQ -chp_dir chp_tmp -chp_load ;mgn; all -h Cpr all -Cpr ;sep; ;mn; @@ -85,7 +86,7 @@ all -eq_rad 1 ;mgn; #CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n; all -h grid -#all -grid 4 6 8 ;m; ;n; +all -grid 6 6 8 ;mg4n; all -h all -h h @@ -107,6 +108,10 @@ all -int igt_so ;mgn; all -int poi ;mgn; all -int so ;mgn; +all -h int_surf +all -int_surf img -surf 4 2 0 ;mgn; +all -int_surf som -surf 4 2 0 ;mgn; + all -h iter all -iter bcgs2 ;mgn; all -iter bicg ;mgn; @@ -130,6 +135,7 @@ all -maxiter 5 ;mgn; all -h no_reduced_fft all -no_reduced_fft ;mgn; +NOMPI -no_reduced_fft -iter cgnr ;mgn; all -h no_vol_cor all -no_vol_cor -size 3 ;mgn; @@ -194,6 +200,9 @@ all -scat_matr ampl ;mgn; all -scat_matr both ;mgn; all -scat_matr none ;m; ;g; +all -h scat_plane +all -scat_plane ;se; ;mn; + all -h shape #all -h shape axisymmetric #all -shape axisymmetric 196.txt ;mgn; @@ -258,6 +267,14 @@ all -store_int_field ;se; ;mn; all -h store_scat_grid all -store_scat_grid ;sep; ;mn; +all -h surf +all -surf 4 2 0 ;mgn; +all -surf 4 3 4 -prop 1 2 3 -shape read sphere.geom ;mn; +all -surf 4 3 4 -prop 1 2 -3 ;se; ;mn; +all -surf 4 inf -prop 1 2 -3 ;se; ;mn; +all -surf 4 2 0 -no_reduced_fft ;mgn; +NOMPI -surf 4 2 0 -iter cgnr -no_reduced_fft ;mgn; + all -h sym all -sym auto ;mn; -shape read sphere.geom all -sym no ;mn; -shape read sphere.geom diff --git a/tests/2exec/suite_surf b/tests/2exec/suite_surf index 39326cb7..a9a5b035 100644 --- a/tests/2exec/suite_surf +++ b/tests/2exec/suite_surf @@ -27,24 +27,11 @@ # The format is the following: ' ' the first one is coma-separated list of files to # compare or 'all' (which compares all produced files. is everything after the first space and it is passed # directly to ADDA. +# Instead of 'all' a number of macros can be used: NOMPI, NOMPISEQ which is equivalent to 'all' for other modes, but +# causes the line to be skipped in the matching mode. NOMPI lines are skipped both in mpi and mpi_seq modes. all -# testing of different grids - relevant for FFT methods -# to remove redundant warnings for ocl_seq, only (2,3,5) numbers are used -all -grid 2 ;smn; -all -grid 4 ;smn; -all -grid 6 ;smn; -all -grid 8 ;smn; -all -grid 10 ;smn; -all -grid 12 ;smn; -all -grid 16 ;smn; -all -grid 18 ;smn; -all -grid 20 ;smn; -all -grid 24 ;smn; -all -grid 30 ;smn; -all -grid 32 ;smn; - all -h alldir_inp all -alldir_inp adp.dat -Csca ;mgn; @@ -76,8 +63,7 @@ all -chp_dir chp_tmp -chp_type regular -chpoint 1s -eps 3 ;mgn; all -h chp_dir all -chp_dir chp_tmp -chp_type always -eps 3 ;mgn; all -h chp_load -# the following may cause errors in mpi_seq mode -all -chp_dir chp_tmp -chp_load ;mgn; +NOMPISEQ -chp_dir chp_tmp -chp_load ;mgn; all -h Cpr # radiative forces are not yet supported with surf @@ -102,10 +88,12 @@ all -eq_rad 1 ;mgn; # properties are compared using rather large tolerances all -h granul CrossSec-Y,CrossSec-X,mueller -granul 0.2 0.5 2 -size 8 -shape coated 0.5 ;3m; ;n; -CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n; +# Standard tolerances are not sufficient, when the following is run in the presence of non-trivial surface +#CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n; all -h grid all -grid 4 6 8 ;m; ;n; +all -grid 10 10 10 -shape read sphere.geom -sym enf ;m; ;n; all -h all -h h @@ -141,7 +129,8 @@ all -h jagged all -jagged 2 ;mg4n; all -h lambda -all -lambda 1 ;mgn; +# 1 was changed to 10, to keep old tolerances +all -lambda 10 ;mgn; all -h m all -m 1.2 0.2 ;g; ;n; @@ -151,6 +140,7 @@ all -maxiter 5 ;mgn; all -h no_reduced_fft all -no_reduced_fft ;mgn; +NOMPI -no_reduced_fft -iter cgnr ;mgn; all -h no_vol_cor all -no_vol_cor -size 3 ;mgn; @@ -221,6 +211,10 @@ all -scat_matr both ;mgn; # for equivalent runs with surface we additionally add -yz, which is incompatible with the following #all -scat_matr none ;m; ;g; +all -h scat_plane +# for equivalent runs with surface we additionally add -yz, which is incompatible with the following +#all -scat_plane ;se; ;mgn; + all -h shape all -h shape axisymmetric all -shape axisymmetric 196.txt ;mgn; @@ -294,7 +288,8 @@ all -sym no ;mgn; all -sym enf ;mgn; all -sym auto ;sep; ;mgn; all -sym no ;sep; ;mgn; -all -sym enf ;sep; ;mgn; +# enforcing symmetry for non-default propagation is forbidden for -surf +#all -sym enf ;sep; ;mgn; all -h test all -test ;mgn;