From 3fd68c9832b1ceb47a591988187082b37b416e86 Mon Sep 17 00:00:00 2001
From: yurkin <yurkin@gmail.com>
Date: Wed, 25 Sep 2013 09:45:34 +0000
Subject: [PATCH] Finally, a working implementation of particle near surface in
 FFT mode. - additional required memory was described in comments in
 calculator.c - definition of ZsumShift is now different for sparse and FFT
 modes in parallel (relative to the local or global bottom dipoles
 respectively). Related difference is in definition of local_Nz_Rm (used for
 Sommerfeld table and, in FFT mode, for filling Rmatrix). - Sommerfeld table
 is now not calculated during prognosis, but corresponding memory is properly
 accounted for. - Current implementation requires additional forward fftY and
 fftZ at each iteration (but see issue 177). - BlockTranspose_Dm in comm.c was
 renamed into BlockTranspose_DRm, since it works both for D- and Rmatrix. -
 existing functions to index Dmatrix has been slightly changed to not use
 DsizeYZ and use y>=DsizeY instead of y>smallY. - functions and plans, not
 specific to Dmatrix were renamed (Y and Z forward transforms), now they have
 'slice' in their names. - transposeYZ_Dm was changed to transpose (a general
 function), and transposeYZ is now just a wrapper around the latter. - version
 incremented to 1.3b1.

Main computational bottleneck is computation of the table of Sommerfeld integrals. Currently it is approximately equivalent to 200 iterations (issue 176). So should not be a problem for applications with a larger number of iterations.

New implementation was extensively tested against previous sparse version. Still, quantitative comparison with published literature data is required. Remaining limitation is that it doesn't work in OpenCL mode (see issue 101 for details, explicit exception was added to param.c).

Changes to timing:
- added time for 'init interaction' (significant when Sommerfeld table is calculated).
- Time for 'init Dmatrix' now may include the time for initialization of Rmatrix (doesn't include time for Sommerfeld table).
- Precise timing for Dmatrix now includes a separate line for initialization of Rmatrix (without details)

Other:
- scattering at exactly 90 degrees (along surface) for non-trivial surfaces is now handled by a special case to produce exact 0. This makes the output consistent, avoiding large integration errors for -phi_integr or alldir.
- added explicit exception to forbid combination of -no_reduced_fft and -iter cgnr in MPI FFT mode (issue 174).

Tests in tests/2exec/ have been significantly improved
- added SURF_EXT flag, which allows running a large suite of tests, adding '-surf ...' option to each of them.
- added a number of ignores, which are always active (decreases false-positives)
- added a couple of macros in suite files (NOMPI and NOMPISEQ), which indicates that the line should be skipped for a specific comparison modes.
- added a number of tests to the default suite: '-surf ...', '-int_surf', -scat_plane, '-shape read ... -grid ...', '-no_reduced_fft -iter cgnr' (see above)
---
 src/GenerateB.c          |   2 +-
 src/calculator.c         |  11 +-
 src/comm.c               |   8 +-
 src/comm.h               |   2 +-
 src/const.h              |   2 +-
 src/crosssec.c           |  11 +-
 src/fft.c                | 507 +++++++++++++++++++++++++++------------
 src/interaction.c        |  47 ++--
 src/interaction.h        |   4 +-
 src/make_particle.c      |   9 +-
 src/matvec.c             |  94 ++++++--
 src/param.c              |  17 ++
 src/somnec.c             |   2 +-
 src/timing.c             |   5 +-
 src/vars.c               |   2 +-
 tests/2exec/comp2exec    |  52 +++-
 tests/2exec/suite        |  22 +-
 tests/2exec/suite_sparse |  23 +-
 tests/2exec/suite_surf   |  35 ++-
 19 files changed, 606 insertions(+), 249 deletions(-)

diff --git a/src/GenerateB.c b/src/GenerateB.c
index afccdd26..e1535379 100644
--- a/src/GenerateB.c
+++ b/src/GenerateB.c
@@ -43,7 +43,7 @@ extern const int beam_Npars;
 extern const double beam_pars[];
 extern const char *beam_fnameY;
 extern const char *beam_fnameX;
-extern opt_index opt_beam;
+extern const opt_index opt_beam;
 
 // used in crosssec.c
 double beam_center_0[3]; // position of the beam center in laboratory reference frame
diff --git a/src/calculator.c b/src/calculator.c
index eb60c0bf..afca210c 100644
--- a/src/calculator.c
+++ b/src/calculator.c
@@ -43,7 +43,7 @@ extern const angle_set beta_int,gamma_int,theta_int,phi_int;
 extern const int avg_inc_pol;
 extern const char *alldir_parms,*scat_grid_parms;
 // defined and initialized in timing.c
-extern TIME_TYPE Timing_Init;
+extern TIME_TYPE Timing_Init,Timing_Init_Int;
 #ifdef OPENCL
 extern TIME_TYPE Timing_OCL_Init;
 #endif
@@ -445,14 +445,17 @@ static void AllocateEverything(void)
 	/* estimate of the memory (only the fastest scaling part):
 	 * MatVec - (288+384nprocs/boxX [+192/nprocs])*Ndip
 	 *          more exactly: gridX*gridY*gridZ*(36+48nprocs/boxX [+24/nprocs]) value in [] is only for parallel mode.
-	 *          For OpenCL mode all MatVec part is allocated on GPU instead of main (CPU) memory
+	 * For surf additionally: gridX*gridY*gridZ*(48+48nprocs/boxX)
+	 * 			+ for Sommerfeld table: 64*boxZ*(boxX*boxY-(MIN(boxX,boxY))^2)
+	 *    For OpenCL mode all MatVec part is allocated on GPU instead of main (CPU) memory
 	 * others - nvoid_Ndip*{271(CGNR,BiCG), 367(CSYM,QMR2), 415(BiCGStab,QMR), or 463(BCGS2)}
 	 *          + additional 8*nvoid_Ndip for OpenCL mode and CGNR or Bi-CGSTAB
 	 * PARALLEL: above is total; division over processors of MatVec is uniform, others - according to local_nvoid_Ndip
 	 *
 	 * Sparse mode - each processor needs (265--457, depending on iterative solver)*local_nvoid_Ndip + 60*nvoid_Ndip
 	 *               and division is uniform, i.e. local_nvoid_Ndip = nvoid_Ndip/nprocs
-	 *               Part of the memory is currently not distributed among processors - see issue 160.
+	 *               Sommerfeld table - same as above, but it is not divided among processors.
+	 *               Part of the memory is currently not distributed among processors - see issues 160,175.
 	 */
 	MAXIMIZE(memPeak,memory);
 	double memSum=AccumulateMax(memPeak,&memmax);
@@ -592,7 +595,9 @@ void Calculator (void)
 	else dtheta_deg=dtheta_rad=block_theta=0;
 	finish_avg=false;
 	// Do preliminary setup for MatVec
+	TIME_TYPE startInitInt=GET_TIME();
 	InitInteraction();
+	Timing_Init_Int=GET_TIME()-startInitInt;
 #ifndef SPARSE
 	// initialize D matrix (for matrix-vector multiplication)
 	D("InitDmatrix started");
diff --git a/src/comm.c b/src/comm.c
index 9b63b16a..6da60ff1 100644
--- a/src/comm.c
+++ b/src/comm.c
@@ -676,10 +676,10 @@ void BlockTranspose(doublecomplex * restrict X UOIP,TIME_TYPE *timing UOIP)
 
 //======================================================================================================================
 
-void BlockTranspose_Dm(doublecomplex * restrict X UOIP,const size_t lengthY UOIP,const size_t lengthZ UOIP)
-/* do the data-transposition, i.e. exchange, between fftX and fftY&fftZ; specialized for D matrix. It can be updated to
- * accept timing argument for generality. But, since this is a specialized function, we keep the timing variable
- * hard-wired in the code.
+void BlockTranspose_DRm(doublecomplex * restrict X UOIP,const size_t lengthY UOIP,const size_t lengthZ UOIP)
+/* do the data-transposition, i.e. exchange, between fftX and fftY&fftZ; specialized for D or R matrix. It can be
+ * updated to accept timing argument for generality. But, since this is a specialized function, we keep the timing
+ * variable hard-wired in the code.
  */
 {
 #ifdef ADDA_MPI
diff --git a/src/comm.h b/src/comm.h
index 010f9c5a..cb425d92 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -45,7 +45,7 @@ void ReadField(const char * restrict fname,doublecomplex *restrict field);
 
 #ifndef SPARSE
 void BlockTranspose(doublecomplex * restrict X,TIME_TYPE *timing);
-void BlockTranspose_Dm(doublecomplex * restrict X,size_t lengthY,size_t lengthZ);
+void BlockTranspose_DRm(doublecomplex * restrict X,size_t lengthY,size_t lengthZ);
 // used by granule generator
 void SetGranulComm(double z0,double z1,double gdZ,int gZ,size_t gXY,size_t buf_size,int *lz0,int *lz1,int sm_gr);
 void CollectDomainGranul(unsigned char * restrict dom,size_t gXY,int lz0,int locgZ,TIME_TYPE *timing);
diff --git a/src/const.h b/src/const.h
index e965fda7..77c0578e 100644
--- a/src/const.h
+++ b/src/const.h
@@ -18,7 +18,7 @@
 #define __const_h
 
 // version number (string)
-#define ADDA_VERSION "1.3a2"
+#define ADDA_VERSION "1.3b1"
 
 /* ADDA uses certain C99 extensions, which are widely supported by GNU and Intel compilers. However, they may be not
  * completely supported by e.g. Microsoft Visual Studio compiler. Therefore, we check the version of the standard here
diff --git a/src/crosssec.c b/src/crosssec.c
index 834499a3..5b8d2eec 100644
--- a/src/crosssec.c
+++ b/src/crosssec.c
@@ -631,6 +631,11 @@ static void CalcFieldSurf(doublecomplex ebuff[static restrict 3], // where to wr
 	if (above) { // simple reflection
 		vCopy(nF,nN);
 		nN[2]*=-1;
+		// no scattering at exactly 90 degrees for non-trivial surface (to avoid randomness for this case)
+		if (fabs(nN[2])<ROUND_ERR && cabs(msub-1)>ROUND_ERR) {
+			cvInit(ebuff);
+			return;
+		}
 	}
 	else { // transmission
 		if (msubInf) { // no transmission for perfectly reflecting substrate => zero result
@@ -738,11 +743,13 @@ static void CalcFieldSurf(doublecomplex ebuff[static restrict 3], // where to wr
 		kt=NAN; // redundant to remove warnings below
 	}
 	else {
-		if (cabs(msub-1)<ROUND_ERR && fabs(ki)<ROUND_ERR) kt=ki; // special case to avoid randomness due to round-off errors
+		// special case to avoid randomness due to round-off errors
+		if (cabs(msub-1)<ROUND_ERR && fabs(ki)<ROUND_ERR) kt=ki;
 		else kt=cSqrtCut(msub*msub - (nN[0]*nN[0]+nN[1]*nN[1]));
 		if (above) {
 			/* here we test only the exact zero, since for other cases (when msub=1, and very small values of ki,kt) the
-			 * above assignment kt=ki guarantees correct results through standard functions
+			 * above assignment kt=ki guarantees correct results through standard functions. Also the case of
+			 * 90deg-scattering (for msub!=1) is taken care of in the beginning of this function.
 			 */
 			if (ki==0 && kt==0) cs=cp=0;
 			else {
diff --git a/src/fft.c b/src/fft.c
index 022e203a..f35e5f5d 100644
--- a/src/fft.c
+++ b/src/fft.c
@@ -70,18 +70,23 @@
 
 // SEMI-GLOBAL VARIABLES
 
+// defined and initialized in interaction.c
+extern const int local_Nz_Rm;
 // defined and initialized in timing.c
 extern TIME_TYPE Timing_FFT_Init,Timing_Dm_Init;
 
 // used in matvec.c
 doublecomplex * restrict Dmatrix; // holds FFT of the interaction matrix
+doublecomplex * restrict Rmatrix; // holds FFT of the reflection matrix
 #ifndef OPENCL
 	// holds input vector (on expanded grid) to matvec, also used as storage space in iterative.c
 doublecomplex * restrict Xmatrix;
 doublecomplex * restrict slices; // used in inner cycle of matvec - holds 3 components (for fixed x)
 doublecomplex * restrict slices_tr; // additional storage space for slices to accelerate transpose
+doublecomplex * restrict slicesR,* restrict slicesR_tr; // same as above, but for reflected interaction
 #endif
 size_t DsizeY,DsizeZ,DsizeYZ; // size of the 'matrix' D
+size_t RsizeY; // size of the 'matrix' R
 // used in comm.c
 double * restrict BT_buffer, * restrict BT_rbuffer; // buffers for BlockTranspose
 
@@ -89,7 +94,13 @@ double * restrict BT_buffer, * restrict BT_rbuffer; // buffers for BlockTranspos
 
 // D2 matrix and its two slices; used only temporary for InitDmatrix
 static doublecomplex * restrict slice,* restrict slice_tr,* restrict D2matrix;
-static size_t D2sizeX,D2sizeY,D2sizeZ; // size of the 'matrix' D2
+static doublecomplex * restrict R2matrix; // same for surface (slice and slice_tr are reused from Dmatrix)
+static size_t D2sizeY,D2sizeZ; // size of the 'matrix' D2 (x-size is gridX)
+static size_t R2sizeY; // size of the 'matrix' R2 (x- and z-sizes are corresponding grids)
+static size_t lz_Dm,lz_Rm; // local sizes along z for D(2) and R(2) matrices
+// the following two lines are defined in InitDmatrix but used in InitRmatrix, they are analogous to Dm values
+static size_t Rsize,R2sizeTot; // sizes of R and R2 matrices
+static int jstartR; // starting index for y
 static size_t blockTr=TR_BLOCK;        // block size for TransposeYZ
 static bool weird_nprocs;              // whether weird number of processors is used
 #ifdef OPENCL
@@ -101,9 +112,9 @@ static clFFT_Plan clplanX,clplanY,clplanZ; // clFFT plans
 #endif
 #ifdef FFTW3
 // FFTW3 plans: f - FFT_FORWARD; b - FFT_BACKWARD
-static fftw_plan planXf_Dm,planYf_Dm,planZf_Dm;
+static fftw_plan planXf_Dm,planYf_slice,planZf_slice,planXf_Rm;
 #	ifndef OPENCL // these plans are used only if OpenCL is not used
-static fftw_plan planXf,planXb,planYf,planYb,planZf,planZb;
+static fftw_plan planXf,planXb,planYf,planYb,planZf,planZb,planYRf,planZRf; // last two for reflected interaction
 #	endif
 #elif defined(FFT_TEMPERTON)
 #	ifdef NO_FORTRAN
@@ -122,23 +133,22 @@ void cfft99_(double * restrict data,double * restrict _work,const double * restr
 //======================================================================================================================
 
 static inline size_t IndexDmatrix(const size_t x,size_t y,size_t z)
-// index D matrix to store final result
+// index D matrix to store final result (symmetric with respect to center for y and z)
 {
 	if (y>=DsizeY) y=gridY-y;
 	if (z>=DsizeZ) z=gridZ-z;
-
-	return(NDCOMP*(x*DsizeYZ+z*DsizeY+y));
+	return(NDCOMP*((x*DsizeZ+z)*DsizeY+y));
 }
 
 //======================================================================================================================
 
-static inline size_t IndexGarbledD(const size_t x,int y,int z,const size_t lengthN UOIP)
-// index D2 matrix after BlockTranspose
+static inline size_t IndexGarbledD(const size_t x,int y,int z)
+// index D2 matrix after BlockTranspose (periodic over y and z)
 {
-	if (y<0) y+=D2sizeY;
-	if (z<0) z+=D2sizeZ;
+	if (y<0) y+=gridY;
+	if (z<0) z+=gridZ;
 #ifdef PARALLEL
-	return(((z%lengthN)*D2sizeY+y)*gridX+(z/lengthN)*local_Nx+x%local_Nx);
+	return(((z%lz_Dm)*D2sizeY+y)*gridX+(z/lz_Dm)*local_Nx+x%local_Nx);
 #else
 	return((z*D2sizeY+y)*gridX+x);
 #endif
@@ -146,128 +156,71 @@ static inline size_t IndexGarbledD(const size_t x,int y,int z,const size_t lengt
 
 //======================================================================================================================
 
-static inline size_t IndexD2matrix(int x,int y,int z,const int nnn)
-// index D2 matrix to store calculated elements
+static inline size_t IndexSliceD2matrix(int y,int z)
+// index slice of D2 matrix (periodic over y and z)
 {
-	if (x<0) x+=gridX;
-	if (y<0) y+=D2sizeY;
-	//  if (z<0) z+=D2sizeZ;
-	return(((z-nnn*local_z0)*D2sizeY+y)*gridX+x);
+	if (y<0) y+=gridY;
+	if (z<0) z+=gridZ;
+	return(y*gridZ+z);
 }
 
 //======================================================================================================================
 
-static inline size_t IndexSliceD2matrix(int y,int z)
-// index slice of D2 matrix
+static inline size_t Index2matrix(int x,int y,const int z,const int sizeY)
+// index D2 or R2 matrix to store calculated elements (periodic over x and y), z should already be shifted
 {
+	if (x<0) x+=gridX;
 	if (y<0) y+=gridY;
-	if (z<0) z+=gridZ;
-
-	return(y*gridZ+z);
+	return((z*sizeY+y)*gridX+x);
 }
 
 //======================================================================================================================
 
-static inline size_t IndexSlice_zyD2matrix(const size_t y,const size_t z)
-// index transposed slice of D2 matrix
+static inline size_t IndexSlice_zy(const size_t y,const size_t z)
+// index transposed slice of D2 (or R2) matrix
 {
 	return (z*gridY+y);
 }
 
 //======================================================================================================================
 
-void TransposeYZ(const int direction)
-/* optimized routine to transpose y and z; forward: slices->slices_tr; backward: slices_tr->slices; direction can be
- * made boolean but this contradicts with existing definitions of FFT_FORWARD and FFT_BACKWARD, which themselves are
- * determined by FFT routines invocation format
- */
+static inline size_t IndexRmatrix(const size_t x,size_t y,const size_t z)
+// index R matrix to store final result (symmetric with respect to center for y)
 {
-#ifdef OPENCL
-	const size_t enqtglobalzy[3]={gridZ,gridY,3};
-	const size_t enqtglobalyz[3]={gridY,gridZ,3};
-	const size_t tblock[3]={16,16,1}; // this corresponds to BLOCK_DIM in oclkernels.cl
-	/* TODO: test in which cases is the uncached variant faster than the cached one, to make a conditional or to remove
-	 * cltransposef/b if cltransposeof/b is allways faster than cltransposef/b
-	 */
-	/* When calling kernels the working group size can't be smaller than the data size; hence cached kernel can be used
-	 * only for large enough problems. Alternative solution is to determine the block size during ADDA runtime and pass
-	 * it to kernel during its compilation. But using small block size is not efficient anyway, so falling back to
-	 * noncached kernel is logical.
-	 */
-	bool cached=(enqtglobalzy[0]>=tblock[0] && enqtglobalzy[1]>=tblock[1]);
-	cached&=(gridZ%16==0 && gridY%16==0); // this is required due to current limitation of cached kernel
-	
-	if (direction==FFT_FORWARD) {
-		if (cached)
-			CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeof,3,NULL,enqtglobalzy,tblock,0,NULL,NULL));
-		else CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposef,2,NULL,enqtglobalzy,NULL,0,NULL,NULL));
-	}
-	else {
-		if (cached)
-			CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeob,3,NULL,enqtglobalyz,tblock,0,NULL,NULL));
-		else CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeb,2,NULL,enqtglobalyz,NULL,0,NULL,NULL));
-	}
-	clFinish(command_queue);
-#else
-	size_t y,z,Y,Z,y1,y2,z1,z2,i,j,y0,z0,Xcomp;
-	doublecomplex *t0,*t1,*t2,*t3,*t4,*w0,*w1,*w2,*w3;
-
-	if (direction==FFT_FORWARD) {
-		Y=gridY;
-		Z=gridZ;
-		w0=slices;
-		t0=slices_tr-Y;
-	}
-	else { // direction==FFT_BACKWARD
-		Y=gridZ;
-		Z=gridY;
-		w0=slices_tr;
-		t0=slices-Y;
-	}
+	if (y>=RsizeY) y=gridY-y;
+	return(NDCOMP*((x*gridZ+z)*RsizeY+y));
+}
 
-	y1=Y/blockTr;
-	y2=Y%blockTr;
-	z1=Z/blockTr;
-	z2=Z%blockTr;
+//======================================================================================================================
 
-	for(Xcomp=0;Xcomp<3;Xcomp++) {
-		w1=w0+Xcomp*gridYZ;
-		t1=t0+Xcomp*gridYZ;
-		for(i=0;i<=y1;i++) {
-			if (i==y1) y0=y2;
-			else y0=blockTr;
-			w2=w1;
-			t2=t1;
-			for(j=0;j<=z1;j++) {
-				if (j==z1) z0=z2;
-				else z0=blockTr;
-				w3=w2;
-				t3=t2;
-				for (y=0;y<y0;y++) {
-					t4=t3+y;
-					for (z=0;z<z0;z++) *(t4+=Y)=w3[z];
-					w3+=Z;
-				}
-				w2+=blockTr;
-				t2+=blockTr*Y;
-			}
-			w1+=blockTr*Z;
-			t1+=blockTr;
-		}
-	}
+static inline size_t IndexGarbledR(const size_t x,int y,const int z)
+// index R2 matrix after BlockTranspose (periodic over y)
+{
+	if (y<0) y+=gridY;
+#ifdef PARALLEL
+	return(((z%lz_Rm)*R2sizeY+y)*gridX+(z/lz_Rm)*local_Nx+x%local_Nx);
+#else
+	return((z*R2sizeY+y)*gridX+x);
 #endif
 }
 
 //======================================================================================================================
 
-static void transposeYZ_Dm(doublecomplex *data,doublecomplex *trans)
-// optimized routine to transpose y and z for Dmatrix: data -> trans
+static inline size_t IndexSliceR2matrix(int y,const int z)
+// index slice of R2 matrix (periodic over y)
 {
-	size_t y,z,Y,Z,y1,y2,z1,z2,i,j,y0,z0;
-	doublecomplex *t1,*t2,*t3,*t4,*w1,*w2,*w3;
+	if (y<0) y+=gridY;
+	return(y*gridZ+z);
+}
 
-	Y=gridY;
-	Z=gridZ;
+//======================================================================================================================
+
+static void transpose(const doublecomplex * restrict data,doublecomplex * restrict trans,const size_t Y,const size_t Z)
+// optimized routine to transpose complex matrix with dimensions YxZ: data -> trans
+{
+	size_t y,z,y1,y2,z1,z2,i,j,y0,z0;
+	doublecomplex *t1,*t2,*t3,*t4;
+	const doublecomplex *w1,*w2,*w3;
 
 	y1=Y/blockTr;
 	y2=Y%blockTr;
@@ -302,6 +255,55 @@ static void transposeYZ_Dm(doublecomplex *data,doublecomplex *trans)
 
 //======================================================================================================================
 
+void TransposeYZ(const int direction)
+/* optimized routine to transpose y and z; forward: slices->slices_tr; backward: slices_tr->slices; direction can be
+ * made boolean but this contradicts with existing definitions of FFT_FORWARD and FFT_BACKWARD, which themselves are
+ * determined by FFT routines invocation format
+ */
+{
+#ifdef OPENCL
+	const size_t enqtglobalzy[3]={gridZ,gridY,3};
+	const size_t enqtglobalyz[3]={gridY,gridZ,3};
+	const size_t tblock[3]={16,16,1}; // this corresponds to BLOCK_DIM in oclkernels.cl
+	/* TODO: test in which cases is the uncached variant faster than the cached one, to make a conditional or to remove
+	 * cltransposef/b if cltransposeof/b is allways faster than cltransposef/b
+	 */
+	/* When calling kernels the working group size can't be smaller than the data size; hence cached kernel can be used
+	 * only for large enough problems. Alternative solution is to determine the block size during ADDA runtime and pass
+	 * it to kernel during its compilation. But using small block size is not efficient anyway, so falling back to
+	 * noncached kernel is logical.
+	 */
+	bool cached=(enqtglobalzy[0]>=tblock[0] && enqtglobalzy[1]>=tblock[1]);
+	cached&=(gridZ%16==0 && gridY%16==0); // this is required due to current limitation of cached kernel
+	
+	if (direction==FFT_FORWARD) {
+		if (cached)
+			CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeof,3,NULL,enqtglobalzy,tblock,0,NULL,NULL));
+		else CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposef,2,NULL,enqtglobalzy,NULL,0,NULL,NULL));
+	}
+	else {
+		if (cached)
+			CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeob,3,NULL,enqtglobalyz,tblock,0,NULL,NULL));
+		else CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,cltransposeb,2,NULL,enqtglobalyz,NULL,0,NULL,NULL));
+	}
+	clFinish(command_queue);
+#else
+	size_t Xcomp,ind;
+
+	if (direction==FFT_FORWARD) for (Xcomp=0;Xcomp<3;Xcomp++) {
+		ind=Xcomp*gridYZ;
+		transpose(slices+ind,slices_tr+ind,gridY,gridZ);
+		if (surface) transpose(slicesR+ind,slicesR_tr+ind,gridY,gridZ);
+	}
+	else for (Xcomp=0;Xcomp<3;Xcomp++) { // direction==FFT_BACKWARD
+		ind=Xcomp*gridYZ;
+		transpose(slices_tr+ind,slices+ind,gridZ,gridY);
+	}
+#endif
+}
+
+//======================================================================================================================
+
 void fftX(const int isign)
 // FFT three components of (buf)Xmatrix(x) for all y,z; called from matvec
 {
@@ -347,13 +349,18 @@ void fftY(const int isign)
 #	endif
 	clFinish(command_queue);
 #elif defined(FFTW3)
-	if (isign==FFT_FORWARD) fftw_execute(planYf);
+	if (isign==FFT_FORWARD) {
+		fftw_execute(planYf);
+		if (surface) fftw_execute(planYRf);
+	}
 	else fftw_execute(planYb);
 #elif defined(FFT_TEMPERTON)
 	int nn=gridY,inc=1,jump=nn,lot=3*gridZ;
 
 	IGNORE_WARNING(-Wstrict-aliasing);
 	cfft99_((double *)(slices_tr),work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign);
+	// the same operation is applied to sliceR_tr, when required
+	if (surface && isign==FFT_FORWARD) cfft99_((double *)(slicesR_tr),work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign);
 	STOP_IGNORE;
 #endif
 }
@@ -373,20 +380,28 @@ void fftZ(const int isign)
 #	endif
 	clFinish(command_queue);
 #elif defined(FFTW3)
-	if (isign==FFT_FORWARD) fftw_execute(planZf);
+	if (isign==FFT_FORWARD) {
+		fftw_execute(planZf);
+		if (surface) fftw_execute(planZRf);
+	}
 	else fftw_execute(planZb);
 #elif defined(FFT_TEMPERTON)
 	int nn=gridZ,inc=1,jump=nn,lot=boxY,Xcomp;
 
 	IGNORE_WARNING(-Wstrict-aliasing);
 	for (Xcomp=0;Xcomp<3;Xcomp++) cfft99_((double *)(slices+gridYZ*Xcomp),work,trigsZ,ifaxZ,&inc,&jump,&nn,&lot,&isign);
+	if (surface && isign==FFT_FORWARD) { // the same operation is applied to slicesR, but with inverse transform
+		const int invSign=FFT_BACKWARD;
+		for (Xcomp=0;Xcomp<3;Xcomp++)
+			cfft99_((double *)(slicesR+gridYZ*Xcomp),work,trigsZ,ifaxZ,&inc,&jump,&nn,&lot,&invSign);
+	}
 	STOP_IGNORE;
 #endif
 }
 
 //======================================================================================================================
 
-static void fftX_Dm(const size_t lengthZ ONLY_FOR_TEMPERTON)
+static void fftX_Dm(void)
 // FFT(forward) D2matrix(x) for all y,z; used for Dmatrix calculation
 {
 #ifdef FFTW3
@@ -396,18 +411,36 @@ static void fftX_Dm(const size_t lengthZ ONLY_FOR_TEMPERTON)
 	size_t z;
 
 	IGNORE_WARNING(-Wstrict-aliasing);
-	for (z=0;z<lengthZ;z++) cfft99_((double *)(D2matrix+z*gridX*D2sizeY),work,trigsX,ifaxX,&inc,&jump,&nn,&lot,&isign);
+	for (z=0;z<lz_Dm;z++) cfft99_((double *)(D2matrix+z*gridX*D2sizeY),work,trigsX,ifaxX,&inc,&jump,&nn,&lot,&isign);
+	STOP_IGNORE;
+#endif
+}
+
+//======================================================================================================================
+
+static void fftX_Rm(void)
+// FFT(forward) D2matrix(x) for all y,z; used for Rmatrix calculation
+{
+#ifdef FFTW3
+	fftw_execute(planXf_Rm);
+#elif defined(FFT_TEMPERTON)
+	int nn=gridX,inc=1,jump=nn,lot=R2sizeY,isign=FFT_FORWARD;
+	size_t z;
+	const size_t zlim=local_Nz_Rm; // can be smaller by 1 than lz_Rm
+
+	IGNORE_WARNING(-Wstrict-aliasing);
+	for (z=0;z<zlim;z++) cfft99_((double *)(R2matrix+z*gridX*R2sizeY),work,trigsX,ifaxX,&inc,&jump,&nn,&lot,&isign);
 	STOP_IGNORE;
 #endif
 }
 
 //======================================================================================================================
 
-static void fftY_Dm(void)
-// FFT(forward) slice_tr(y) for all z; used for Dmatrix calculation
+static void fftY_slice(void)
+// FFT(forward) slice_tr(y) for all z; used for Dmatrix and Rmatrix calculation
 {
 #ifdef FFTW3
-	fftw_execute(planYf_Dm);
+	fftw_execute(planYf_slice);
 #elif defined(FFT_TEMPERTON)
 	int nn=gridY,inc=1,jump=nn,lot=gridZ,isign=FFT_FORWARD;
 
@@ -419,11 +452,11 @@ static void fftY_Dm(void)
 
 //======================================================================================================================
 
-static void fftZ_Dm(void)
-// FFT(forward) slice(z) for all y; used for Dmatrix calculation
+static void fftZ_slice(void)
+// FFT(forward) slice(z) for all y; used for Dmatrix and Rmatrix calculation
 {
 #ifdef FFTW3
-	fftw_execute(planZf_Dm);
+	fftw_execute(planZf_slice);
 #elif defined(FFT_TEMPERTON)
 	int nn=gridZ,inc=1,jump=nn,lot=gridY,isign=FFT_FORWARD;
 
@@ -512,7 +545,7 @@ int fftFit(int x,int divis)
 
 //======================================================================================================================
 
-static void fftInitBeforeD(const int lengthZ ONLY_FOR_FFTW3)
+static void fftInitBeforeD(void)
 // initialize fft before initialization of Dmatrix
 {
 #ifdef FFTW3
@@ -520,18 +553,24 @@ static void fftInitBeforeD(const int lengthZ ONLY_FOR_FFTW3)
 
 	D("FFTW library version: %s\n     compiler: %s\n     codelet optimizations: %s",fftw_version,fftw_cc,
 		fftw_codelet_optim);
-	planYf_Dm=fftw_plan_many_dft(1,&grYint,gridZ,slice_tr,NULL,1,gridY,slice_tr,NULL,1,gridY,FFT_FORWARD,PLAN_FFTW_DM);
-	planZf_Dm=fftw_plan_many_dft(1,&grZint,gridY,slice,NULL,1,gridZ,slice,NULL,1,gridZ,FFT_FORWARD,PLAN_FFTW_DM);
-	planXf_Dm=fftw_plan_many_dft(1,&grXint,lengthZ*D2sizeY,D2matrix,NULL,1,gridX,D2matrix,NULL,1,gridX,FFT_FORWARD,
+	planYf_slice=fftw_plan_many_dft(1,&grYint,gridZ,slice_tr,NULL,1,gridY,slice_tr,NULL,1,gridY,FFT_FORWARD,
+		PLAN_FFTW_DM);
+	planZf_slice=fftw_plan_many_dft(1,&grZint,gridY,slice,NULL,1,gridZ,slice,NULL,1,gridZ,FFT_FORWARD,PLAN_FFTW_DM);
+	planXf_Dm=fftw_plan_many_dft(1,&grXint,lz_Dm*D2sizeY,D2matrix,NULL,1,gridX,D2matrix,NULL,1,gridX,FFT_FORWARD,
 		PLAN_FFTW_DM);
+	// very similar to Dm, but local_Nz_Rm can be smaller by 1 than lz_Rm
+	if (surface) planXf_Rm=fftw_plan_many_dft(1,&grXint,local_Nz_Rm*R2sizeY,R2matrix,NULL,1,gridX,R2matrix,NULL,1,gridX,
+		FFT_FORWARD,PLAN_FFTW_DM);
 #elif defined(FFT_TEMPERTON)
-	int size,nn;
+	int nn;
+	size_t size;
 
 	// allocate memory
 	MALLOC_VECTOR(trigsX,double,2*gridX,ALL);
 	MALLOC_VECTOR(trigsY,double,2*gridY,ALL);
 	MALLOC_VECTOR(trigsZ,double,2*gridZ,ALL);
 	size=MAX(gridX*D2sizeY,3*gridYZ);
+	if (surface) size=MAX(size,gridX*R2sizeY);
 	MALLOC_VECTOR(work,double,2*size,ALL);
 	// initialize ifax and trigs
 	nn=gridX;
@@ -672,6 +711,8 @@ static void fftInitAfterD(void)
 #	endif
 	lot=3*gridZ;
 	planYf=fftw_plan_many_dft(1,&grYint,lot,slices_tr,NULL,1,gridY,slices_tr,NULL,1,gridY,FFT_FORWARD,PLAN_FFTW);
+	if (surface) // same operation, but applied to slicesR_tr
+		planYRf=fftw_plan_many_dft(1,&grYint,lot,slicesR_tr,NULL,1,gridY,slicesR_tr,NULL,1,gridY,FFT_FORWARD,PLAN_FFTW);
 #	ifdef PRECISE_TIMING
 	GetTime(tvp+1);
 #	endif
@@ -686,6 +727,8 @@ static void fftInitAfterD(void)
 	howmany_dims[1].n=boxY;
 	howmany_dims[1].is=howmany_dims[1].os=gridZ;
 	planZf=fftw_plan_guru_dft(1,&dims,2,howmany_dims,slices,slices,FFT_FORWARD,PLAN_FFTW);
+	// same operation but for slicesR and inverse transform (since correlation is computed instead of convolution)
+	if (surface) planZRf=fftw_plan_guru_dft(1,&dims,2,howmany_dims,slicesR,slicesR,FFT_BACKWARD,PLAN_FFTW);
 #	ifdef PRECISE_TIMING
 	GetTime(tvp+3);
 #	endif
@@ -722,11 +765,94 @@ static void fftInitAfterD(void)
 #	endif
 	// destroy old plans
 	fftw_destroy_plan(planXf_Dm);
-	fftw_destroy_plan(planYf_Dm);
-	fftw_destroy_plan(planZf_Dm);
+	fftw_destroy_plan(planYf_slice);
+	fftw_destroy_plan(planZf_slice);
+	if (surface) fftw_destroy_plan(planXf_Rm);
+#endif
+}
+
+//======================================================================================================================
+
+static void InitRmatrix(const double invNgrid)
+/* Initializes the matrix R. R[i][j][k]=GR[i1-i2][j1-j2][k1+k2]. Actually R=-FFT(GR)/Ngrid. Then -GR.x=invFFT(R*FFT(x))
+ * for practical implementation of FFT such that invFFT(FFT(x))=Ngrid*x. GR is exactly reflected Green's tensor. The
+ * routine is very similar to the corresponding part of InitDmatrix. Moreover, some initialization is delegated to the
+ * latter function.
+ */
+{
+	int i,j,k,Rcomp;
+	size_t x,y,z,indexfrom,indexto,ind,index;
+
+	// allocate memory for Rmatrix (R2matrix is allocated earlier in InitDmatrix)
+	MALLOC_VECTOR(Rmatrix,complex,Rsize,ALL);
+#ifdef PARALLEL
+	// allocate buffer for BlockTranspose_DRm
+	size_t bufsize = 2*lz_Rm*R2sizeY*local_Nx;
+	MALLOC_VECTOR(BT_buffer,double,bufsize,ALL);
+	MALLOC_VECTOR(BT_rbuffer,double,bufsize,ALL);
+#endif
+	if (IFROOT) printf("Calculating reflected Green's function (Rmatrix)\n");
+	/* Interaction matrix values are calculated all at once for performance reasons. They are stored in Rmatrix with
+	 * indexing corresponding to R2matrix (to facilitate copying) but NDCOMP elements instead of one. Afterwards they
+	 * are replaced by Fourier transforms (with different indexing) component-wise (in cycle over NDCOMP)
+	 */
+	/* fill Rmatrix with 0, this if to fill the possible gap between e.g. boxY and gridY/2; (and for R=0) probably
+	 * faster than using a lot of conditionals
+	 */
+	for (ind=0;ind<Rsize;ind++) Rmatrix[ind]=0;
+	// fill Rmatrix with values of reflected Green's tensor
+	for(k=0;k<local_Nz_Rm;k++) for (j=jstartR;j<boxY;j++) for (i=1-boxX;i<boxX;i++) {
+			index=NDCOMP*Index2matrix(i,j,k,R2sizeY);
+			(*CalcReflTerm)(i,j,k,Rmatrix+index);
+	} // end of i,j,k loop
+	if (IFROOT) printf("Fourier transform of Rmatrix");
+	for(Rcomp=0;Rcomp<NDCOMP;Rcomp++) { // main cycle over components of Rmatrix
+		// fill R2matrix with precomputed values from Rmatrix
+		for (ind=0;ind<R2sizeTot;ind++) R2matrix[ind]=Rmatrix[NDCOMP*ind+Rcomp];
+		fftX_Rm(); // fftX R2matrix
+		BlockTranspose_DRm(R2matrix,R2sizeY,lz_Rm);
+		for(x=local_x0;x<local_x1;x++) {
+			for (ind=0;ind<gridYZ;ind++) slice[ind]=0.0; // fill slice with 0.0
+			for(j=jstartR;j<boxY;j++) for(k=0;k<2*boxZ-1;k++) {
+				indexfrom=IndexGarbledR(x,j,k);
+				indexto=IndexSliceR2matrix(j,k);
+				slice[indexto]=R2matrix[indexfrom];
+			}
+			/* here a specific symmetry is used, that elements of R depend on direction y/|rho| either as even order
+			 * (0 or 2) or as odd (1) - the latter are elements 1 and 4 (see GetSomIntegral in interaction.c)
+			 */
+			if (reduced_FFT) for(j=1;j<boxY;j++) for(k=0;k<2*boxZ-1;k++) {
+				// mirror along y
+				indexfrom=IndexSliceR2matrix(j,k);
+				indexto=IndexSliceR2matrix(-j,k);
+				if (Rcomp==1 || Rcomp==4) slice[indexto]=-slice[indexfrom];
+				else slice[indexto]=slice[indexfrom];
+			}
+			fftZ_slice(); // fftZ slice
+			transpose(slice,slice_tr,gridY,gridZ);
+			fftY_slice(); // fftY slice_tr
+			for(z=0;z<gridZ;z++) for(y=0;y<RsizeY;y++) {
+				indexto=IndexRmatrix(x-local_x0,y,z)+Rcomp;
+				indexfrom=IndexSlice_zy(y,z);
+				Rmatrix[indexto]=-invNgrid*slice_tr[indexfrom];
+			}
+		} // end slice X
+		if (IFROOT) printf(".");
+	} // end of Rcomp
+	if (IFROOT) printf("\n");
+#ifdef PARALLEL
+	// deallocate buffers for BlockTranspose_DRm
+	Free_general(BT_buffer);
+	Free_general(BT_rbuffer);
+#endif
+#ifdef OPENCL
+	// copy Rmatrix to OpenCL buffer, blocking to ensure completion before function end
+	//CL_CH_ERR(clEnqueueWriteBuffer(command_queue,bufRmatrix,CL_TRUE,0,Dsize*sizeof(*Rmatrix),Rmatrix,0,NULL,NULL));
+	Free_cVector(Rmatrix);
 #endif
 }
 
+
 //======================================================================================================================
 
 void InitDmatrix(void)
@@ -739,15 +865,14 @@ void InitDmatrix(void)
 	size_t x,y,z,indexfrom,indexto,ind,index,Dsize,D2sizeTot;
 	double invNgrid;
 	int nnn; // multiplier used for reduced_FFT or not reduced; 1 or 2
-	int jstart, kstart;
-	size_t lengthN;
+	int jstart,kstart;
 	TIME_TYPE start,time1;
 #ifdef PRECISE_TIMING
 	// precise timing of the Dmatrix computation
-	SYSTEM_TIME tvp[13];
+	SYSTEM_TIME tvp[15];
 	SYSTEM_TIME Timing_fftX,Timing_fftY,Timing_fftZ,Timing_Gcalc,Timing_ar1,Timing_ar2,Timing_ar3,Timing_BT,Timing_TYZ,
 		Timing_beg;
-	double t_fftX,t_fftY,t_fftZ,t_ar1,t_ar2,t_ar3,t_TYZ,t_beg,t_Gcalc,t_Arithm,t_FFT,t_BT,t_InitMV;
+	double t_fftX,t_fftY,t_fftZ,t_ar1,t_ar2,t_ar3,t_TYZ,t_beg,t_Gcalc,t_Arithm,t_FFT,t_BT,t_InitMV,t_Rm,t_Tot;
 
 	// This should be the first occurrence of PRECISE_TIMING in the program
 	SetTimerFreq();
@@ -765,7 +890,6 @@ void InitDmatrix(void)
 	start=GET_TIME();
 
 	// initialize sizes of D and D2 matrices
-	D2sizeX=gridX;
 	if (reduced_FFT) {
 		D2sizeY=gridY/2;
 		D2sizeZ=gridZ/2;
@@ -783,13 +907,13 @@ void InitDmatrix(void)
 		kstart=1-boxZ;
 	}
 	// auxiliary parameters
-	lengthN=nnn*local_Nz;
+	lz_Dm=nnn*local_Nz;
 	DsizeYZ=DsizeY*DsizeZ;
 	invNgrid=1.0/(gridX*((double)gridYZ));
 	local_Nsmall=(gridX/2)*(gridYZ/(2*nprocs)); // size of X vector (for 1 component)
 	// potentially this may cause unnecessary error during prognosis, but makes code cleaner
 	Dsize=MultOverflow(NDCOMP*local_Nx,DsizeYZ,ONE_POS_FUNC);
-	D2sizeTot=nnn*local_Nz*D2sizeY*D2sizeX; // this should be approximately equal to Dsize/NDCOMP
+	D2sizeTot=nnn*local_Nz*D2sizeY*gridX; // this should be approximately equal to Dsize/NDCOMP
 	if (IFROOT) fprintf(logfile,"The FFT grid is: %zux%zux%zu\n",gridX,gridY,gridZ);
 #ifdef OPENCL // perform setting up of buffers and kernels
 	// create all Buffers needed on Device in MatVec; When prognosis, the following code just counts required memory
@@ -886,14 +1010,44 @@ void InitDmatrix(void)
 		CL_CH_ERR(clSetKernelArg(cltransposeob,4,17*16*3*sizeof(doublecomplex),NULL));
 	}
 #endif
+	// part of the code for InitRmatrix is here to be compatible with prognosis and FFT init
+	if (surface) {
+		/* We keep option to turn off reduced_FFT for Rmatrix (at least for tests). However, its usefulness for some
+		 * weird Green's tensors (non-symmetric) is limited, because certain symmetry along the z-axis is still assumed
+		 * - that is R({i1,j1,k1},{i2,j2,k2})=R({i1,j1,k2},{i2,j2,k1}). Without this limitation the whole FFT part will
+		 * be broken (or need significant revision).
+		 *
+		 * Moreover the savings by using reduced_FFT is only a factor of two for Rmatrix (in contrast to 4 for Dmatrix)
+		 */
+		if (reduced_FFT) {
+			R2sizeY=gridY/2;
+			RsizeY=gridY/2+1;
+			jstartR=0;
+		}
+		else {
+			R2sizeY=RsizeY=gridY;
+			jstartR=1-boxY;
+		}
+		lz_Rm=2*local_Nz;
+		// potentially this may cause unnecessary error during prognosis, but makes code cleaner
+		Rsize=MultOverflow(NDCOMP*local_Nx,RsizeY*gridZ,ONE_POS_FUNC);
+		R2sizeTot=lz_Rm*R2sizeY*gridX; // this should be approximately equal to Rsize/NDCOMP
+	}
+	// memory estimation and exit for prognosis
 	MAXIMIZE(memPeak,memory);
-	// objects which are always allocated (at least temporarily): Dmatrix,D2matrix,slice,slice_tr
-	memPeak+=sizeof(doublecomplex)*((double)Dsize+D2sizeTot+2*gridYZ);
+	/* objects which are always allocated (at least temporarily): Dmatrix,D2matrix,slice,slice_tr
+	 * for surface, the peak is either by D2matrix & R2matrix, or by R2matrix & Rmatrix (the latter is mostly probable)
+	 */
+	memPeak+=sizeof(doublecomplex)*((double)Dsize+2*gridYZ+(surface ? (MAX(Rsize,D2sizeTot)+R2sizeTot) : D2sizeTot));
 #ifndef OPENCL
-	// allocated memory that is used further on, not relevant for OpenCL version
+	/* allocated memory that is used further on (Dmatrix,Xmatrix,slices,slices_tr), not relevant for OpenCL version;
+	 * we assume that it is always larger than memPeak above (so memPeak doesn't have to be adjusted). In particular,
+	 * we ignore the memory, which is temporarily allocated for BlockTranspose buffers of Dm and Rm.
+	 */
 	double mem=sizeof(doublecomplex)*((double)Dsize+3*local_Nsmall+6*gridYZ);
+	if (surface) mem+=sizeof(doublecomplex)*((double)Rsize+6*gridYZ); // for Rmatrix, slicesR, and slicesR_tr
 #ifdef PARALLEL
-	size_t BTsize = 6*smallY*local_Nz*local_Nx; // in doubles
+	const size_t BTsize = 6*smallY*local_Nz*local_Nx; // in doubles
 	mem+=2*BTsize*sizeof(double);
 #endif
 	// printout some information
@@ -913,15 +1067,20 @@ void InitDmatrix(void)
 	MALLOC_VECTOR(D2matrix,complex,D2sizeTot,ALL);
 	MALLOC_VECTOR(slice,complex,gridYZ,ALL);
 	MALLOC_VECTOR(slice_tr,complex,gridYZ,ALL);
+	/* allocate memory for R2matrix components. In principle, this can be done after D2 matrix is freed. However, this
+	 * way allows us to init all FFT routines (in particular, build FFTW plans) in one go. Moreover, this should not
+	 * increase the peak memory, since Rmatrix is allocated further on (see above).
+	 */
+	if (surface) MALLOC_VECTOR(R2matrix,complex,R2sizeTot,ALL);
 	// actually allocation of Xmatrix, slices, slices_tr is below after freeing of Dmatrix and its slice
 #ifdef PARALLEL
 	// allocate buffer for BlockTranspose_Dm
-	size_t bufsize = 2*lengthN*D2sizeY*local_Nx;
+	size_t bufsize = 2*lz_Dm*D2sizeY*local_Nx;
 	MALLOC_VECTOR(BT_buffer,double,bufsize,ALL);
 	MALLOC_VECTOR(BT_rbuffer,double,bufsize,ALL);
 #endif
 	D("Initialize FFT (1st part)");
-	fftInitBeforeD(lengthN);
+	fftInitBeforeD();
 #ifdef PRECISE_TIMING
 	GetTime(tvp+1);
 	Elapsed(tvp,tvp+1,&Timing_beg); // it includes a lot of OpenCL stuff
@@ -941,7 +1100,7 @@ void InitDmatrix(void)
 		if (k>(int)smallZ) kcor=k-gridZ;
 		else kcor=k;
 		for (j=jstart;j<boxY;j++) for (i=1-boxX;i<boxX;i++) {
-			index=NDCOMP*IndexD2matrix(i,j,k,nnn);
+			index=NDCOMP*Index2matrix(i,j,k-nnn*local_z0,D2sizeY);
 			/* The test for zero distance is somewhat non-optimal. However, other alternatives are not perfect either:
 			 * 1) complicate the loops to remove the zero element in the beginning (move tests to the upper level)
 			 * 2) call the function with zero - it will produce NaN. Then set this element to zero after the loop.
@@ -964,12 +1123,12 @@ void InitDmatrix(void)
 		GetTime(tvp+3);
 		ElapsedInc(tvp+2,tvp+3,&Timing_ar1);
 #endif
-		fftX_Dm(lengthN); // fftX D2matrix
+		fftX_Dm(); // fftX D2matrix
 #ifdef PRECISE_TIMING
 		GetTime(tvp+4);
 		ElapsedInc(tvp+3,tvp+4,&Timing_fftX);
 #endif
-		BlockTranspose_Dm(D2matrix,D2sizeY,lengthN);
+		BlockTranspose_DRm(D2matrix,D2sizeY,lz_Dm);
 #ifdef PRECISE_TIMING
 		GetTime(tvp+5);
 		ElapsedInc(tvp+4,tvp+5,&Timing_BT);
@@ -980,11 +1139,11 @@ void InitDmatrix(void)
 #endif
 			for (ind=0;ind<gridYZ;ind++) slice[ind]=0.0; // fill slice with 0.0
 			for(j=jstart;j<boxY;j++) for(k=kstart;k<boxZ;k++) {
-				indexfrom=IndexGarbledD(x,j,k,lengthN);
+				indexfrom=IndexGarbledD(x,j,k);
 				indexto=IndexSliceD2matrix(j,k);
 				slice[indexto]=D2matrix[indexfrom];
 			}
-			if (reduced_FFT) {
+			if (reduced_FFT) { // here a specific symmetry is used, that G is a combination of tensors I and RR/|R|^2
 				for(j=1;j<boxY;j++) for(k=0;k<boxZ;k++) {
 					// mirror along y
 					indexfrom=IndexSliceD2matrix(j,k);
@@ -1004,24 +1163,24 @@ void InitDmatrix(void)
 			GetTime(tvp+7);
 			ElapsedInc(tvp+6,tvp+7,&Timing_ar2);
 #endif
-			fftZ_Dm(); // fftZ slice
+			fftZ_slice(); // fftZ slice
 #ifdef PRECISE_TIMING
 			GetTime(tvp+8);
 			ElapsedInc(tvp+7,tvp+8,&Timing_fftZ);
 #endif
-			transposeYZ_Dm(slice,slice_tr);
+			transpose(slice,slice_tr,gridY,gridZ);
 #ifdef PRECISE_TIMING
 			GetTime(tvp+9);
 			ElapsedInc(tvp+8,tvp+9,&Timing_TYZ);
 #endif
-			fftY_Dm(); // fftY slice_tr
+			fftY_slice(); // fftY slice_tr
 #ifdef PRECISE_TIMING
 			GetTime(tvp+10);
 			ElapsedInc(tvp+9,tvp+10,&Timing_fftY);
 #endif
 			for(z=0;z<DsizeZ;z++) for(y=0;y<DsizeY;y++) {
 				indexto=IndexDmatrix(x-local_x0,y,z)+Dcomp;
-				indexfrom=IndexSlice_zyD2matrix(y,z);
+				indexfrom=IndexSlice_zy(y,z);
 				Dmatrix[indexto]=-invNgrid*slice_tr[indexfrom];
 			}
 #ifdef PRECISE_TIMING
@@ -1031,36 +1190,54 @@ void InitDmatrix(void)
 		} // end slice X
 		if (IFROOT) printf(".");
 	} // end of Dcomp
-	// free vectors used for computation of Dmatrix
+	if (IFROOT) printf("\n");
+	// free vectors used for computation of Dmatrix; slice and slice_tr are freed after InitRmatrix
 	Free_cVector(D2matrix);
-	Free_cVector(slice);
-	Free_cVector(slice_tr);
 #ifdef PARALLEL
-	// deallocate buffers for BlockTranspose_Dm
+	// deallocate buffers for BlockTranspose_DRm
 	Free_general(BT_buffer);
 	Free_general(BT_rbuffer);
-	// allocate buffers for BlockTranspose
-	MALLOC_VECTOR(BT_buffer,double,BTsize,ALL);
-	MALLOC_VECTOR(BT_rbuffer,double,BTsize,ALL);
 #endif
 #ifdef OPENCL
 	// copy Dmatrix to OpenCL buffer, blocking to ensure completion before function end
 	CL_CH_ERR(clEnqueueWriteBuffer(command_queue,bufDmatrix,CL_TRUE,0,Dsize*sizeof(*Dmatrix),Dmatrix,0,NULL,NULL));
 	Free_cVector(Dmatrix);
-#else
+#endif
+	if (surface) { // only the total execution time of InitRmatrix is timed
+#ifdef PRECISE_TIMING
+			GetTime(tvp+12);
+#endif
+			InitRmatrix(invNgrid);
+			Free_cVector(R2matrix); // free it here since it was allocated above
+#ifdef PRECISE_TIMING
+			GetTime(tvp+13);
+			t_Rm=DiffSec(tvp+12,tvp+13);
+#endif
+	}
+	Free_cVector(slice);
+	Free_cVector(slice_tr);
+#ifdef PARALLEL
+	// allocate buffers for BlockTranspose
+	MALLOC_VECTOR(BT_buffer,double,BTsize,ALL);
+	MALLOC_VECTOR(BT_rbuffer,double,BTsize,ALL);
+#endif
+#ifndef OPENCL
 	// allocate memory for Xmatrix, slices and slices_tr - used in matvec
 	MALLOC_VECTOR(Xmatrix,complex,3*local_Nsmall,ALL);
 	MALLOC_VECTOR(slices,complex,3*gridYZ,ALL);
 	MALLOC_VECTOR(slices_tr,complex,3*gridYZ,ALL);
+	if (surface) { // additional slices for reflection interaction
+		MALLOC_VECTOR(slicesR,complex,3*gridYZ,ALL);
+		MALLOC_VECTOR(slicesR_tr,complex,3*gridYZ,ALL);
+	}
 #endif
-	if (IFROOT) printf("\n");
 	time1=GET_TIME();
 	Timing_Dm_Init=time1-start;
 
 #ifdef PRECISE_TIMING
-	GetTime(tvp+12);
+	GetTime(tvp+14);
 	// time for extra initialization required for MatVec; it includes copying Dmatrix to GPU
-	t_InitMV=DiffSec(tvp+11,tvp+12);
+	t_InitMV=DiffSec(tvp+11,tvp+14);
 	// analyze and print precise timing information
 	t_beg=TimerToSec(&Timing_beg);
 	t_Gcalc=TimerToSec(&Timing_Gcalc);
@@ -1074,6 +1251,12 @@ void InitDmatrix(void)
 	t_BT=TimerToSec(&Timing_BT);
 	t_Arithm=t_beg+t_Gcalc+t_ar1+t_ar2+t_ar3+t_TYZ;
 	t_FFT=t_fftX+t_fftY+t_fftZ;
+	t_Tot=DiffSec(tvp,tvp+14);
+
+	if (surface) { // correct InitMV and Total time by that of InitRmatrix
+		t_InitMV-=t_Rm;
+		t_Tot-=t_Rm;
+	}
 
 	if (IFROOT) PrintBoth(logfile,
 		"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
@@ -1090,8 +1273,9 @@ void InitDmatrix(void)
 		"FFTY   = "FFORMPT"\n"
 		"Arith3 = "FFORMPT"\n"
 		"InitMV = "FFORMPT"\n\n",
-		t_beg,t_Arithm,t_Gcalc,t_FFT,t_ar1,t_BT,t_fftX,t_InitMV,t_BT,t_ar2,DiffSec(tvp,tvp+12),t_fftZ,t_TYZ,t_fftY,
+		t_beg,t_Arithm,t_Gcalc,t_FFT,t_ar1,t_BT,t_fftX,t_InitMV,t_BT,t_ar2,t_Tot,t_fftZ,t_TYZ,t_fftY,
 		t_ar3,t_InitMV);
+	if (surface && IFROOT) PrintBoth(logfile,"Additionally time for initialization of Rmatrix = "FFORMPT"\n\n",t_Rm);
 #endif
 
 	fftInitAfterD();
@@ -1131,6 +1315,11 @@ void Free_FFT_Dmat(void)
 	Free_cVector(Xmatrix);
 	Free_cVector(slices);
 	Free_cVector(slices_tr);
+	if (surface) {
+		Free_cVector(Rmatrix);
+		Free_cVector(slicesR);
+		Free_cVector(slicesR_tr);
+	}
 #	ifdef PARALLEL
 	Free_general(BT_buffer);
 	Free_general(BT_rbuffer);
@@ -1142,6 +1331,10 @@ void Free_FFT_Dmat(void)
 	fftw_destroy_plan(planYb);
 	fftw_destroy_plan(planZf);
 	fftw_destroy_plan(planZb);
+	if (surface) {
+		fftw_destroy_plan(planYRf);
+		fftw_destroy_plan(planZRf);
+	}
 #	endif
 #endif
 #ifdef FFT_TEMPERTON // these vectors are used even with OpenCL
diff --git a/src/interaction.c b/src/interaction.c
index 61af9258..df6b2ecd 100644
--- a/src/interaction.c
+++ b/src/interaction.c
@@ -25,9 +25,12 @@
 // SEMI-GLOBAL VARIABLES
 
 // defined and initialized in make_particle.c
-extern double gridspace,ZsumShift;
+extern const double gridspace,ZsumShift;
 // defined and initialized in param.c
-extern double igt_lim,igt_eps;
+extern const double igt_lim,igt_eps;
+
+// used in fft.c
+int local_Nz_Rm; // number of local layers in Rmatrix, not greater than 2*boxZ-1 (also used in SPARSE)
 
 // LOCAL VARIABLES
 
@@ -906,10 +909,17 @@ static void CalcSomTable(void)
  * However, in sparse mode this procedure is inefficient, since incurs (potentially) a lot of unnecessary evaluations
  * of Sommerfeld integrals. For really sparse aggregates the better way is to buildup a lookup table, using only
  * actually used pairs of (z,rho), as is done in DDA-SI code. A hash table can be used for that.
- * TODO: Implement such improvement for the sparse mode
+ * TODO: Implement such improvement for the sparse mode (issue 175)
+ *
+ * Using only actually used value of (z,rho) can be also relevant for FFT mode (consider, e.g. a sphere and z close to 0
+ * and to 2*boxZ-1). However, searching through such pairs seems to be O(N^2) operation, which is unacceptable in FFT
+ * mode.
+ *
+ * Another problem in SPARSE mode is that currently all values of z are computed on each processor (in MPI mode). This
+ * is related to the current parallelization mode (issue 160), but it can be improved by computing all values in chunks
+ * and then gathering them on each processor. Better to combine it with the hash table above.
  */
 {
-	// TOSO: those ranges are correct for SPARSE, but should be modified for the FFT version
 	int i,j,k;
 	const double scale=kd/TWO_PI;
 	const double isc=pow(WaveNum/TWO_PI,3); // this is subject to under/overflow
@@ -924,17 +934,19 @@ static void CalcSomTable(void)
 	for (j=0;j<boxY;j++) somIndex[j+1]=somIndex[j] + (XlessY ? MIN(j+1,boxX) : (boxX-j));
 	// allocate and fill the table
 	// TODO: add the required memory to calculator.c (and the manual)
-	const size_t tmp=4*(2*boxZ-1)*somIndex[boxY];
-	MALLOC_VECTOR(somTable,complex,tmp,ALL);
+	const size_t tmp=4*local_Nz_Rm*somIndex[boxY];
 	memory+=tmp*sizeof(doublecomplex);
-	if (IFROOT) printf("Calculating table of Sommerfeld integrals\n");
-	ind=0;
-	for (k=0;k<(2*boxZ-1);k++) {
-		z=(k+ZsumShift)*scale;
-		for (j=0;j<boxY;j++) {
-			y=j;
-			if (XlessY) for (i=0;i<=j && i<boxX;i++,ind++) evluaWrapper(i,y,z,scale,isc,somTable+4*ind);
-			else for (i=j;i<boxX;i++,ind++) evluaWrapper(i,y,z,scale,isc,somTable+4*ind);
+	if (!prognosis) {
+		MALLOC_VECTOR(somTable,complex,tmp,ALL);
+		if (IFROOT) printf("Calculating table of Sommerfeld integrals\n");
+		ind=0;
+		for (k=0;k<local_Nz_Rm;k++) {
+			z=(k+ZsumShift)*scale;
+			for (j=0;j<boxY;j++) {
+				y=j;
+				if (XlessY) for (i=0;i<=j && i<boxX;i++,ind++) evluaWrapper(i,y,z,scale,isc,somTable+4*ind);
+				else for (i=j;i<boxX;i++,ind++) evluaWrapper(i,y,z,scale,isc,somTable+4*ind);
+			}
 		}
 	}
 }
@@ -1022,11 +1034,16 @@ void InitInteraction(void)
 
 	// Interaction through reflection from surface
 	if (surface) {
+#ifdef SPARSE
+		local_Nz_Rm=2*boxZ-1;
+#else
+		local_Nz_Rm=MAX(MIN(2*local_z1,2*boxZ-1)-2*local_z0,0);
+#endif
 		switch (ReflRelation) {
 			case GR_IMG: CalcReflTerm = &CalcReflTerm_img; break;
 			case GR_SOM:
 				CalcReflTerm = &CalcReflTerm_som;
-				som_init(msub*msub);
+				if (!prognosis) som_init(msub*msub);
 				CalcSomTable();
 				break;
 		}
diff --git a/src/interaction.h b/src/interaction.h
index 5f578f96..380d3594 100644
--- a/src/interaction.h
+++ b/src/interaction.h
@@ -25,7 +25,9 @@
 void (*CalcInterTerm)(const int i,const int j,const int k,doublecomplex result[static restrict 6]);
 
 /* Calculates reflection term between two dipoles; given integer distance vector {i,j,k} (in units of d). k is the _sum_
- * of dipole indices along z with respect to the center of bottom dipoles of the particle (position_full in SPARSE mode)
+ * of dipole indices along z with respect to the center of bottom dipoles of the particle. Bottom is considered for the
+ * current processor (position) and the whole particle (position_full) in FFT and SPARSE modes respectively. The latter
+ * behavior is determined by ZsumShift.
  * The acting dipole is placed in the origin, while the field is calculated at position given as argument.
  * Six components of the matrix are computed at once: [GR11, GR12, GR13, GR22, GR23, GR33].
  * The matrix is not symmetric, but satisfies: GR21=GR12, GR31=-GR13, GR32=-GR23. However the large matrix GR, which
diff --git a/src/make_particle.c b/src/make_particle.c
index 9d32660a..9359cc02 100644
--- a/src/make_particle.c
+++ b/src/make_particle.c
@@ -48,7 +48,7 @@ extern const int jagged;
 extern const char *shape_fname;
 extern const char *shapename;
 extern const bool volcor,save_geom;
-extern opt_index opt_sh;
+extern const opt_index opt_sh;
 #ifndef SPARSE
 extern const int sh_Npars;
 extern const double sh_pars[];
@@ -57,7 +57,7 @@ extern const double gr_vf;
 extern double gr_d;
 extern const int gr_mat;
 extern enum shform sg_format;
-extern bool store_grans;
+extern const bool store_grans;
 #endif
 // defined and initialized in timing.c
 extern TIME_TYPE Timing_Particle;
@@ -2281,7 +2281,7 @@ void MakeParticle(void)
 	 * tensor do not fail. And accuracy of the DDA itself is anyway questionable when some of the dipoles are very close
 	 * to the substrate (whether they cross it or not).
 	 */
-	if (surface && hsub<=-minZco) LogError(ALL_POS,"The particle must be entirely above the substrate. There exist a"
+	if (surface && hsub<=-minZco) LogError(ALL_POS,"The particle must be entirely above the substrate. There exist a "
 		"dipole with z="GFORMDEF" (relative to the center), making specified height of the center ("GFORMDEF") too "
 		"small",minZco,hsub);
 	// save geometry
@@ -2308,13 +2308,14 @@ void MakeParticle(void)
 	box_origin_unif[1]=-gridspace*cY;
 #ifndef SPARSE
 	box_origin_unif[2]=gridspace*(local_z0_unif-cZ);
+	if (surface) ZsumShift=2*((hsub/gridspace)-cZ+local_z0);
 #else
 	box_origin_unif[2]=-gridspace*cZ;
+	if (surface) ZsumShift=2*((hsub/gridspace)-cZ);
 #	ifdef PARALLEL
 	AllGather(NULL,position_full,int3_type,NULL);
 #	endif
 #endif // SPARSE
 	
-	if (surface) ZsumShift=2*((hsub/gridspace)-cZ);
 	Timing_Particle += GET_TIME() - tstart;
 }
diff --git a/src/matvec.c b/src/matvec.c
index 8444e403..bf1dcb0c 100644
--- a/src/matvec.c
+++ b/src/matvec.c
@@ -39,9 +39,9 @@
 extern doublecomplex * restrict arg_full;
 #elif !defined(OPENCL)
 // defined and initialized in fft.c
-extern const doublecomplex * restrict Dmatrix;
-extern doublecomplex * restrict Xmatrix,* restrict slices,* restrict slices_tr;
-extern const size_t DsizeY,DsizeZ,DsizeYZ;
+extern const doublecomplex * restrict Dmatrix,* restrict Rmatrix;
+extern doublecomplex * restrict Xmatrix,* restrict slices,* restrict slices_tr,* restrict slicesR,* restrict slicesR_tr;
+extern const size_t DsizeY,DsizeZ,RsizeY;
 #endif // !SPARSE && !OPENCL
 // defined and initialized in timing.c
 extern size_t TotalMatVec;
@@ -53,14 +53,14 @@ extern size_t TotalMatVec;
 
 static inline size_t IndexSliceZY(const size_t y,const size_t z)
 {
-	return (z*gridY+y);
+	return z*gridY+y;
 }
 
 //======================================================================================================================
 
 static inline size_t IndexSliceYZ(const size_t y,const size_t z)
 {
-	return(y*gridZ+z);
+	return y*gridZ+z;
 }
 
 //======================================================================================================================
@@ -68,9 +68,9 @@ static inline size_t IndexSliceYZ(const size_t y,const size_t z)
 static inline size_t IndexGarbledX(const size_t x,const size_t y,const size_t z)
 {
 #ifdef PARALLEL
-	return(((z%local_Nz)*smallY+y)*gridX+(z/local_Nz)*local_Nx+x%local_Nx);
+	return ((z%local_Nz)*smallY+y)*gridX+(z/local_Nz)*local_Nx+x%local_Nx;
 #else
-	return((z*smallY+y)*gridX+x);
+	return (z*smallY+y)*gridX+x;
 #endif
 }
 
@@ -78,7 +78,7 @@ static inline size_t IndexGarbledX(const size_t x,const size_t y,const size_t z)
 
 static inline size_t IndexXmatrix(const size_t x,const size_t y,const size_t z)
 {
-	return((z*smallY+y)*gridX+x);
+	return (z*smallY+y)*gridX+x;
 }
 
 //======================================================================================================================
@@ -86,6 +86,9 @@ static inline size_t IndexXmatrix(const size_t x,const size_t y,const size_t z)
 static inline size_t IndexDmatrix_mv(size_t x,size_t y,size_t z,const bool transposed)
 {
 	if (transposed) { // used only for G_SO
+		/* reflection along the x-axis can't work in parallel mode, since the corresponding values are generally stored
+		 * on a different processor. A rearrangement of memory distribution is required to remove this limitation.
+		 */
 		if (x>0) x=gridX-x;
 		if (y>0) y=gridY-y;
 		if (z>0) z=gridZ-z;
@@ -95,8 +98,27 @@ static inline size_t IndexDmatrix_mv(size_t x,size_t y,size_t z,const bool trans
 		if (z>=DsizeZ) z=gridZ-z;
 	}
 
-	return(NDCOMP*(x*DsizeYZ+z*DsizeY+y));
+	return NDCOMP*((x*DsizeZ+z)*DsizeY+y);
+}
+
+//======================================================================================================================
+
+static inline size_t IndexRmatrix_mv(size_t x,size_t y,size_t z,const bool transposed)
+{
+	if (transposed) { // used only for G_SO !!!
+		/* reflection along the x-axis can't work in parallel mode, since the corresponding values are generally stored
+		 * on a different processor. A rearrangement of memory distribution is required to remove this limitation.
+		 */
+		if (x>0) x=gridX-x;
+		if (y>0) y=gridY-y;
+	}
+	else {
+		if (y>=RsizeY) y=gridY-y;
+	}
+
+	return NDCOMP*((x*gridZ+z)*RsizeY+y);
 }
+
 #endif // OPENCL
 #endif // SPARSE
 
@@ -119,7 +141,7 @@ void MatVec (doublecomplex * restrict argvec,    // the argument vector
 	size_t boxY_st=boxY,boxZ_st=boxZ; // copies with different type
 #ifndef OPENCL // these variables are not needed for OpenCL
 	size_t i;
-	doublecomplex fmat[6],xv[3],yv[3];
+	doublecomplex fmat[6],xv[3],yv[3],xvR[3],yvR[3];
 	size_t index,y,z,Xcomp;
 	unsigned char mat;
 #endif
@@ -146,6 +168,14 @@ void MatVec (doublecomplex * restrict argvec,    // the argument vector
  * G_SO: F(D(T)) (k) =  F(D) (-k)
  *       k - vector index
  *
+ * For reflected matrix the situation is similar.
+ * R.x=F(-1)(F(R).H(X)), where R is a vector, similar with G, where R[i,j,k=0] is for interaction of two bottom dipoles.
+ * H(X) is FxFy(Fz^(-1)(X)), where Fx,... are Fourier transforms along corresponding coordinates. It can be computed
+ * along with F(X).
+ * Matrix R is symmetric (as a whole), but not in small parts, so R(i,j)=R(j,i)(T). Hence, in contrast to D, for
+ * 'transpose' actual transpose (changing sign of a few elements) of 3x3 submatrix is required along with addressing
+ * different elements of F(R).
+ *
  * For (her) three additional operations of nConj are used. Should not be a problem, but can be avoided by a more
  * complex code.
  */
@@ -232,6 +262,11 @@ void MatVec (doublecomplex * restrict argvec,    // the argument vector
 #endif
 	// following is done by slices
 	for(x=local_x0;x<local_x1;x++) {
+		/* TODO: if z and y FFTs are interchanged, then computing reflected interaction can be optimized even further.
+		 * Moreover, the typical situation of particles near surfaces, like large particulate slabs, correspond to the
+		 * smallest dimension along z, which will also benefit from such interchange (then gridZ do not have to divide
+		 * nprocs) - issue 177
+		 */
 #ifdef PRECISE_TIMING
 		GetTime(tvp+4);
 #endif
@@ -241,33 +276,35 @@ void MatVec (doublecomplex * restrict argvec,    // the argument vector
 		CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,clzero,1,NULL,&slicesize,NULL,0,NULL,NULL));
 		CL_CH_ERR(clEnqueueNDRangeKernel(command_queue,clarith2,2,NULL,gwsarith24,NULL,0,NULL,NULL));
 		clFinish(command_queue);
+		// TODO: add corresponding surface code (issue 101)
 #else
 		// clear slice
 		for(i=0;i<3*gridYZ;i++) slices[i]=0.0;
-
 		// fill slices with values from Xmatrix
 		for(y=0;y<boxY_st;y++) for(z=0;z<boxZ_st;z++) {
 			i=IndexSliceYZ(y,z);
 			j=IndexGarbledX(x,y,z);
 			for (Xcomp=0;Xcomp<3;Xcomp++) slices[i+Xcomp*gridYZ]=Xmatrix[j+Xcomp*local_Nsmall];
 		}
+		// create a copy of slice, which is further transformed differently
+		if (surface) memcpy(slicesR,slices,3*gridYZ*sizeof(doublecomplex));
 #endif
 #ifdef PRECISE_TIMING
 		GetTime(tvp+5);
 		ElapsedInc(tvp+4,tvp+5,&Timing_Mult2);
 #endif
 		// FFT z&y
-		fftZ(FFT_FORWARD); // fftZ (buf)slices
+		fftZ(FFT_FORWARD); // fftZ (buf)slices (and reflected terms)
 #ifdef PRECISE_TIMING
 		GetTime(tvp+6);
 		ElapsedInc(tvp+5,tvp+6,&Timing_FFTZf);
 #endif
-		TransposeYZ(FFT_FORWARD);
+		TransposeYZ(FFT_FORWARD); // including reflecting terms
 #ifdef PRECISE_TIMING
 		GetTime(tvp+7);
 		ElapsedInc(tvp+6,tvp+7,&Timing_TYZf);
 #endif
-		fftY(FFT_FORWARD); // fftY (buf)slices_tr
+		fftY(FFT_FORWARD); // fftY (buf)slices_tr (and reflected terms)
 #ifdef PRECISE_TIMING//
 		GetTime(tvp+8);
 		ElapsedInc(tvp+7,tvp+8,&Timing_FFTYf);
@@ -280,25 +317,40 @@ void MatVec (doublecomplex * restrict argvec,    // the argument vector
 		clFinish(command_queue); //wait till kernel executions are finished
 #else
 		// arith3 on host
-		// do the product D~*X~ 
+		// do the product D~*X~  and R~*X'~
 		for(z=0;z<gridZ;z++) for(y=0;y<gridY;y++) {
 			i=IndexSliceZY(y,z);
 			for (Xcomp=0;Xcomp<3;Xcomp++) xv[Xcomp]=slices_tr[i+Xcomp*gridYZ];
-
 			j=IndexDmatrix_mv(x-local_x0,y,z,transposed);
 			memcpy(fmat,Dmatrix+j,6*sizeof(doublecomplex));
-			if (reduced_FFT) {
-				if (y>smallY) { // we assume that compiler will optimize x*=-1 into negation of sign
+			if (reduced_FFT) { // symmetry with respect to reflection (x_i -> x_2N-i) is the same as in r-space
+				if (y>=DsizeY) { // we assume that compiler will optimize x*=-1 into negation of sign
 					fmat[1]*=-1;
-					if (z>smallZ) fmat[2]*=-1;
+					if (z>=DsizeZ) fmat[2]*=-1;
 					else fmat[4]*=-1;
 				}
-				else if (z>smallZ) {
+				else if (z>=DsizeZ) {
+					fmat[2]*=-1;
+					fmat[4]*=-1;
+				}
+			}
+			cSymMatrVec(fmat,xv,yv); // yv=fmat.xv
+			if (surface) {
+				for (Xcomp=0;Xcomp<3;Xcomp++) xvR[Xcomp]=slicesR_tr[i+Xcomp*gridYZ];
+				j=IndexRmatrix_mv(x-local_x0,y,z,transposed);
+				memcpy(fmat,Rmatrix+j,6*sizeof(doublecomplex));
+				if (reduced_FFT && y>=RsizeY) {
+					fmat[1]*=-1;
+					fmat[4]*=-1;
+				}
+				if (transposed) { // corresponds to transpose of 3x3 matrix
 					fmat[2]*=-1;
 					fmat[4]*=-1;
 				}
+				// yv+=fmat.xvR
+				cReflMatrVec(fmat,xvR,yvR);
+				cvAdd(yvR,yv,yv);
 			}
-			cSymMatrVec(fmat,xv,yv); // yv=fmat*xv
 			for (Xcomp=0;Xcomp<3;Xcomp++) slices_tr[i+Xcomp*gridYZ]=yv[Xcomp];
 		}
 #endif
diff --git a/src/param.c b/src/param.c
index 0ac8df4d..a4cf9b28 100644
--- a/src/param.c
+++ b/src/param.c
@@ -1400,6 +1400,9 @@ PARSE_FUNC(store_scat_grid)
 PARSE_FUNC(surf)
 {
 	double mre,mim;
+#ifdef OPENCL
+	PrintError("Currently surface is not supported in OpenCL mode");
+#endif
 	if (Narg!=2 && Narg!=3) NargError(Narg,"2 or 3");
 	ScanDoubleError(argv[1],&hsub);
 	TestPositive(hsub,"height above surface");
@@ -1945,6 +1948,20 @@ void VariablesInterconnect(void)
 	}
 #ifdef SPARSE
 	if (shape==SH_SPHERE) PrintError("Sparse mode requires shape to be read from file (-shape read ...)");
+#endif
+#if defined(PARALLEL) && !defined(SPARSE)
+	/* Transpose of the non-symmetric interaction matrix can't be done in MPI mode, due to existing memory distribution
+	 * among different processors. This causes problems for iterative solvers, which require a product of Hermitian
+	 * transpose (calculated through the standard transpose) of the matrix with vector (currently, only CGNR). To remove
+	 * this limitation (issue 174) memory distribution should be changed to have both x and gridX-x on the same
+	 * processor.
+	 */
+	if (!reduced_FFT && IterMethod==IT_CGNR) PrintError("Non-symmetric interaction matrix (e.g., -no_reduced_fft) "
+		"can not be used together with CGNR iterative solver in the MPI mode");
+	/* TO ADD NEW ITERATIVE SOLVER
+	 * add the new iterative solver to the above line, if it requires calculation of product of Hermitian transpose
+	 * of the matrix with vector (i.e. calls MatVec function with 'true' as the fourth argument)
+	 */
 #endif
 	// scale boxes by jagged; should be completely robust to overflows
 #define JAGGED_BOX(a) { \
diff --git a/src/somnec.c b/src/somnec.c
index eaba321a..052305e9 100644
--- a/src/somnec.c
+++ b/src/somnec.c
@@ -11,7 +11,7 @@
 
 /* TODO: Systematic accuracy study of this code is required. At least 7 digits of precision are desired (for test runs)
  * However, the urgent thing is that for some reason the accuracy of this code is very bad (few percent errors for
- * rho=0, exactly).
+ * rho=0, exactly). (issue 176)
  */
 
 // ADDA: the cycling to generate interpolation grid was removed, now it is a single-run routine
diff --git a/src/timing.c b/src/timing.c
index 236bd87d..c14ebac5 100644
--- a/src/timing.c
+++ b/src/timing.c
@@ -39,7 +39,8 @@ TIME_TYPE Timing_EPlane,Timing_EPlaneComm,    // for Eplane calculation: total a
           Timing_ScatQuan;                    // for integral scattering quantities
 size_t TotalEFieldPlane; // total number of planes for scattered field calculations
 // used in calculator.c
-TIME_TYPE Timing_Init; // for total initialization of the program (before CalculateE)
+TIME_TYPE Timing_Init, // for total initialization of the program (before CalculateE)
+          Timing_Init_Int; // for initialization of interaction routines (including computing tables)
 size_t TotalEval;      // total number of orientation evaluations
 #ifdef OPENCL
 TIME_TYPE Timing_OCL_Init; // for initialization of OpenCL (including building program)
@@ -139,6 +140,8 @@ void FinalStatistics(void)
 				"    init OpenCL          "FFORMT"\n",TO_SEC(Timing_OCL_Init));
 
 #endif
+			fprintf(logfile,
+				"    init interaction     "FFORMT"\n",TO_SEC(Timing_Init_Int));
 #ifndef SPARSE
 			fprintf(logfile,
 				"    init Dmatrix         "FFORMT"\n",TO_SEC(Timing_Dm_Init));
diff --git a/src/vars.c b/src/vars.c
index 7c85c9db..61cedf7a 100644
--- a/src/vars.c
+++ b/src/vars.c
@@ -154,7 +154,7 @@ int local_z0,local_z1;    // starting and ending z for current processor
 size_t local_Nz;          // number of z layers (based on the division of smallZ)
 int local_Nz_unif;        /* number of z layers (distance between max and min values), belonging to this processor,
                              after all non_void dipoles are uniformly distributed between all processors */
-int local_z1_coer;        // ending z, coerced to be not greater than boxZ
+int local_z1_coer;        // ending z, coerced to be not greater than boxZ (and not smaller than local_z0)
 	// starting, ending x for current processor and number of x layers (based on the division of smallX)
 size_t local_x0,local_x1,local_Nx;
 
diff --git a/tests/2exec/comp2exec b/tests/2exec/comp2exec
index 5cc09b88..293fa898 100755
--- a/tests/2exec/comp2exec
+++ b/tests/2exec/comp2exec
@@ -31,7 +31,9 @@ MPIRUN="mpiexec -n 4"
 #SPARSE=1
 #SPARSE_STANDARD=1
 
-#Special mode for comparing surface mode (with surface refractive index = 1) with standard
+# Extensive testing of surface (in combination with all other options) and special mode for comparing surface mode
+# (with surface refractive index = 1) with standard. Use at most one of them
+#SURF_EXT=1
 #SURF_STANDARD=1
 
 if [ -n "$SPARSE_STANDARD" ]; then
@@ -79,6 +81,11 @@ else
   exit 1
 fi
 
+if [ -n "$SURF_EXT" ]; then
+  DEFSUITE=suite_surf
+  EXECREF="$EXECREF -surf 10 2 0.1"
+  EXECTEST="$EXECTEST -surf 10 2 0.1"
+fi
 if [ -n "$SURF_STANDARD" ]; then
   DEFSUITE=suite_surf
   EXECTEST="$EXECTEST -surf 10 1 0 -yz"
@@ -155,7 +162,7 @@ function mycmp {
   # behavior is mainly determined by file name
   base=`basename $1`
   if [ "$base" == $SONAME ]; then
-    IGNORE="^all data is saved in '.*'"
+    IGNORE="^all data is saved in '.*'|No real dipoles are assigned"
     if [ $MODE == "mpi_seq" ]; then
       IGNORE="$IGNORE|^(M|Total m|Maximum m|Additional m)emory usage"
     elif [ $MODE == "ocl_seq" ]; then
@@ -164,11 +171,14 @@ function mycmp {
       IGNORE="$IGNORE|^(M|Total m|OpenCL m)emory usage"
     fi
     if [ -n "$FFTCOMP" ]; then
-      IGNORE="$IGNORE|^(M|Total m|OpenCL m|Maximum m)emory usage"
+      IGNORE="$IGNORE|^(M|Total m|OpenCL m|Maximum m)emory usage|^Initializing (clFFT|FFTW3)"
     fi
     if [ -n "$SPARSE_STANDARD" ]; then
-      IGNORE="$IGNORE|^Calculating Green's function|^Fourier transform of Dmatrix|^Initializing (clFFT|FFTW3)"
+      IGNORE="$IGNORE|^Calculating( reflected|) Green's function|^Fourier transform of"
     fi
+	if [ -n "$SURF_STANDARD" ]; then
+      IGNORE="$IGNORE|^Calculating (table|reflected)|^Fourier transform of|^(M|Total m|OpenCL m|Maximum m)emory usage"
+	fi
     if [[ $MODE == "mpi" || $MODE == "mpi_seq" ]]; then
       CUT="^Error posting writev, " # due to typical random errors of MPICH under Windows
     else
@@ -186,27 +196,28 @@ function mycmp {
   elif [ "$base" == "log" ]; then
     IGNORE="^Generated by ADDA v\.|^command: '.*'"
     if [ $MODE == "mpi_seq" ]; then
-      IGNORE="$IGNORE|^The program was run on: |^(M|Total m|Maximum m|Additional m)emory usage"
+      IGNORE="$IGNORE|^The program was run on:|^(M|Total m|Maximum m|Additional m)emory usage|^The FFT grid is:"
     elif [ $MODE == "ocl_seq" ]; then
-      IGNORE="$IGNORE|^Using OpenCL device|^Device memory|^OpenCL FFT algorithm: |^(M|Total m|OpenCL m)emory usage"
+      IGNORE="$IGNORE|^Using OpenCL device|^Device memory|^OpenCL FFT algorithm:|^(M|Total m|OpenCL m)emory usage"
     fi
     if [ -n "$FFTCOMP" ]; then
-      IGNORE="$IGNORE|^(|OpenCL )FFT algorithm: |^The FFT grid is: |^(M|Total m|OpenCL m|Maximum m)emory usage"
+      IGNORE="$IGNORE|^(|OpenCL )FFT algorithm:|^The FFT grid is:|^(M|Total m|OpenCL m|Maximum m)emory usage"
     fi
 	if [ -n "$SURF_STANDARD" ]; then
-      IGNORE="$IGNORE|^Particle is placed near|^  height of the particle|^Reflected|^Transmitted|^Total planes of E"
+      IGNORE="$IGNORE|^Particle is placed|^  height of the|^Reflected|^Transmitted|^Total planes of E"
+	  IGNORE="$IGNORE|^(M|Total m|OpenCL m|Maximum m)emory usage"
 	fi
     CUT="^Total wall time: "
     asmin rtol 4
     numigndiff $1 $2 "$IGNORE" "$CUT"
-  elif [[ "$base" = CrossSec-* ]]; then
+  elif [[ "$base" == CrossSec-* ]]; then
     numdiff $1 $2
   elif [[ "$base" == mueller* || "$base" == ampl* ]]; then
     asmin atol 9
     asmin rtol 5
     numdiff $1 $2
   elif [[ "$base" == log_int_* || "$base" == "log_orient_avg" ]]; then
-    asmin atol 12
+	asmin atol 10
     numdiff $1 $2
   elif [[ "$base" == "granules" ]]; then #compare only some comments and total number of lines
     if [ `wc -l < $1` == `wc -l < $2` ]; then
@@ -219,6 +230,10 @@ function mycmp {
     igndiff $1 $2 "generated by ADDA v\."
   elif [[ "$base" == IncBeam* ]]; then
  	numdiff $1 $2
+  elif [[ "$base" == DipPol* || "$base" == IntField* ]]; then
+    asmin atol 12
+	asmin rtol 6
+    numdiff $1 $2
   else
     diff $1 $2 >&2
   fi
@@ -275,9 +290,24 @@ while read -r cmpfiles cmdline; do
     let imax=imax+1
     finds[$imax]=$cmpfiles
     reps[$imax]="$cmdline"
-  #skip blank and commented lines, and all lines when skip=0
+  # skip blank and commented lines, and all lines when skip=0
   elif [[ -n "$cmpfiles" && "${cmpfiles:0:1}" != "#" && ( $skip -eq 0 || ( $skip -eq 1 && "$cmdline" == $3 ) ) ]]; then
     skip=0;
+    # test special cases, given in the file
+	if [ "$cmpfiles" == "NOMPI" ]; then
+      if [[ $MODE == "mpi" || $MODE == "mpi_seq" ]]; then
+		continue
+      else
+        cmpfiles="$ALLNAME"
+	  fi
+    fi
+	if [ "$cmpfiles" == "NOMPISEQ" ]; then
+      if [ $MODE == "mpi_seq" ]; then
+		continue
+      else
+        cmpfiles="$ALLNAME"
+	  fi
+    fi
     for i in `seq 0 $imax`; do # variable substitution
       cmdline="${cmdline/${finds[$i]}/${reps[$i]}}"
     done
diff --git a/tests/2exec/suite b/tests/2exec/suite
index 4d5b24e3..63bfdb13 100644
--- a/tests/2exec/suite
+++ b/tests/2exec/suite
@@ -25,6 +25,8 @@
 # The format is the following: '<list,of,files,to,compare> <cmdline>' the first one is coma-separated list of files to
 # compare or 'all' (which compares all produced files. <cmdline> is everything after the first space and it is passed
 # directly to ADDA.
+# Instead of 'all' a number of macros can be used: NOMPI, NOMPISEQ which is equivalent to 'all' for other modes, but
+# causes the line to be skipped in the matching mode. NOMPI lines are skipped both in mpi and mpi_seq modes.
 
 all
 
@@ -72,8 +74,7 @@ all -chp_dir chp_tmp -chp_type regular -chpoint 1s -eps 3 ;mgn;
 all -h chp_dir
 all -chp_dir chp_tmp -chp_type always -eps 3 ;mgn;
 all -h chp_load
-# the following may cause errors in mpi_seq mode
-all -chp_dir chp_tmp -chp_load ;mgn;
+NOMPISEQ -chp_dir chp_tmp -chp_load ;mgn;
 
 all -h Cpr
 all -Cpr ;mgn;
@@ -101,6 +102,7 @@ CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n;
 
 all -h grid
 all -grid 4 6 8 ;m; ;n;
+all -grid 10 10 10 -shape read sphere.geom -sym enf ;m; ;n;
 
 all -h
 all -h h
@@ -122,6 +124,10 @@ all -int igt_so ;mgn;
 all -int poi ;mgn;
 all -int so ;mgn;
 
+all -h int_surf
+all -int_surf img -surf 4 2 0 ;mgn;
+all -int_surf som -surf 4 2 0 ;mgn;
+
 all -h iter
 all -iter bcgs2 ;mgn;
 all -iter bicg ;mgn;
@@ -145,6 +151,7 @@ all -maxiter 5 ;mgn;
 
 all -h no_reduced_fft
 all -no_reduced_fft ;mgn;
+NOMPI -no_reduced_fft -iter cgnr ;mgn;
 
 all -h no_vol_cor
 all -no_vol_cor -size 3 ;mgn;
@@ -210,6 +217,9 @@ all -scat_matr ampl ;mgn;
 all -scat_matr both ;mgn;
 all -scat_matr none ;m; ;g;
 
+all -h scat_plane
+all -scat_plane ;se; ;mgn;
+
 all -h shape
 all -h shape axisymmetric
 all -shape axisymmetric 196.txt ;mgn;
@@ -274,6 +284,14 @@ all -store_int_field ;se; ;mgn;
 all -h store_scat_grid
 all -store_scat_grid ;sep; ;mgn;
 
+all -h surf
+all -surf 4 2 0 ;mgn;
+all -surf 4 3 4 -prop 1 2 3 ;mgn;
+all -surf 4 3 4 -prop 1 2 -3 ;se; ;mgn;
+all -surf 4 inf -prop 1 2 -3 ;se; ;mgn;
+all -surf 4 2 0 -no_reduced_fft ;mgn;
+NOMPI -surf 4 2 0 -iter cgnr -no_reduced_fft ;mgn; 
+
 all -h sym
 all -sym auto ;mgn;
 all -sym no ;mgn;
diff --git a/tests/2exec/suite_sparse b/tests/2exec/suite_sparse
index c3715164..a1211ac4 100644
--- a/tests/2exec/suite_sparse
+++ b/tests/2exec/suite_sparse
@@ -26,6 +26,8 @@
 # The format is the following: '<list,of,files,to,compare> <cmdline>' the first one is coma-separated list of files to
 # compare or 'all' (which compares all produced files. <cmdline> is everything after the first space and it is passed
 # directly to ADDA.
+# Instead of 'all' a number of macros can be used: NOMPI, NOMPISEQ which is equivalent to 'all' for other modes, but
+# causes the line to be skipped in the matching mode. NOMPI lines are skipped both in mpi and mpi_seq modes.
 
 all ;g;
 
@@ -58,8 +60,7 @@ all -chp_dir chp_tmp -chp_type regular -chpoint 1s -eps 3 ;mgn;
 all -h chp_dir
 all -chp_dir chp_tmp -chp_type always -eps 3 ;mgn;
 all -h chp_load
-# the following may cause errors in mpi_seq mode
-all -chp_dir chp_tmp -chp_load ;mgn;
+NOMPISEQ -chp_dir chp_tmp -chp_load ;mgn;
 
 all -h Cpr
 all -Cpr ;sep; ;mn;
@@ -85,7 +86,7 @@ all -eq_rad 1 ;mgn;
 #CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n;
 
 all -h grid
-#all -grid 4 6 8 ;m; ;n;
+all -grid 6 6 8 ;mg4n;
 
 all -h
 all -h h
@@ -107,6 +108,10 @@ all -int igt_so ;mgn;
 all -int poi ;mgn;
 all -int so ;mgn;
 
+all -h int_surf
+all -int_surf img -surf 4 2 0 ;mgn;
+all -int_surf som -surf 4 2 0 ;mgn;
+
 all -h iter
 all -iter bcgs2 ;mgn;
 all -iter bicg ;mgn;
@@ -130,6 +135,7 @@ all -maxiter 5 ;mgn;
 
 all -h no_reduced_fft
 all -no_reduced_fft ;mgn;
+NOMPI -no_reduced_fft -iter cgnr ;mgn; 
 
 all -h no_vol_cor
 all -no_vol_cor -size 3 ;mgn;
@@ -194,6 +200,9 @@ all -scat_matr ampl ;mgn;
 all -scat_matr both ;mgn;
 all -scat_matr none ;m; ;g; 
 
+all -h scat_plane
+all -scat_plane ;se; ;mn;
+
 all -h shape
 #all -h shape axisymmetric
 #all -shape axisymmetric 196.txt ;mgn;
@@ -258,6 +267,14 @@ all -store_int_field ;se; ;mn;
 all -h store_scat_grid
 all -store_scat_grid ;sep; ;mn;
 
+all -h surf
+all -surf 4 2 0 ;mgn;
+all -surf 4 3 4 -prop 1 2 3 -shape read sphere.geom ;mn;
+all -surf 4 3 4 -prop 1 2 -3 ;se; ;mn;
+all -surf 4 inf -prop 1 2 -3 ;se; ;mn;
+all -surf 4 2 0 -no_reduced_fft ;mgn;
+NOMPI -surf 4 2 0 -iter cgnr -no_reduced_fft ;mgn; 
+
 all -h sym 
 all -sym auto ;mn; -shape read sphere.geom
 all -sym no ;mn; -shape read sphere.geom
diff --git a/tests/2exec/suite_surf b/tests/2exec/suite_surf
index 39326cb7..a9a5b035 100644
--- a/tests/2exec/suite_surf
+++ b/tests/2exec/suite_surf
@@ -27,24 +27,11 @@
 # The format is the following: '<list,of,files,to,compare> <cmdline>' the first one is coma-separated list of files to
 # compare or 'all' (which compares all produced files. <cmdline> is everything after the first space and it is passed
 # directly to ADDA.
+# Instead of 'all' a number of macros can be used: NOMPI, NOMPISEQ which is equivalent to 'all' for other modes, but
+# causes the line to be skipped in the matching mode. NOMPI lines are skipped both in mpi and mpi_seq modes.
 
 all
 
-# testing of different grids - relevant for FFT methods
-# to remove redundant warnings for ocl_seq, only (2,3,5) numbers are used
-all -grid 2 ;smn;
-all -grid 4 ;smn;
-all -grid 6 ;smn;
-all -grid 8 ;smn;
-all -grid 10 ;smn;
-all -grid 12 ;smn;
-all -grid 16 ;smn;
-all -grid 18 ;smn;
-all -grid 20 ;smn;
-all -grid 24 ;smn;
-all -grid 30 ;smn;
-all -grid 32 ;smn;
-
 all -h alldir_inp
 all -alldir_inp adp.dat -Csca ;mgn;
 
@@ -76,8 +63,7 @@ all -chp_dir chp_tmp -chp_type regular -chpoint 1s -eps 3 ;mgn;
 all -h chp_dir
 all -chp_dir chp_tmp -chp_type always -eps 3 ;mgn;
 all -h chp_load
-# the following may cause errors in mpi_seq mode
-all -chp_dir chp_tmp -chp_load ;mgn;
+NOMPISEQ -chp_dir chp_tmp -chp_load ;mgn;
 
 all -h Cpr
 # radiative forces are not yet supported with surf
@@ -102,10 +88,12 @@ all -eq_rad 1 ;mgn;
 # properties are compared using rather large tolerances
 all -h granul
 CrossSec-Y,CrossSec-X,mueller -granul 0.2 0.5 2 -size 8 -shape coated 0.5 ;3m; ;n;
-CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n;
+# Standard tolerances are not sufficient, when the following is run in the presence of non-trivial surface
+#CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n;
 
 all -h grid
 all -grid 4 6 8 ;m; ;n;
+all -grid 10 10 10 -shape read sphere.geom -sym enf ;m; ;n;
 
 all -h
 all -h h
@@ -141,7 +129,8 @@ all -h jagged
 all -jagged 2 ;mg4n;
 
 all -h lambda
-all -lambda 1 ;mgn;
+# 1 was changed to 10, to keep old tolerances
+all -lambda 10 ;mgn;
 
 all -h m
 all -m 1.2 0.2 ;g; ;n;
@@ -151,6 +140,7 @@ all -maxiter 5 ;mgn;
 
 all -h no_reduced_fft
 all -no_reduced_fft ;mgn;
+NOMPI -no_reduced_fft -iter cgnr ;mgn;
 
 all -h no_vol_cor
 all -no_vol_cor -size 3 ;mgn;
@@ -221,6 +211,10 @@ all -scat_matr both ;mgn;
 # for equivalent runs with surface we additionally add -yz, which is incompatible with the following
 #all -scat_matr none ;m; ;g;
 
+all -h scat_plane
+# for equivalent runs with surface we additionally add -yz, which is incompatible with the following
+#all -scat_plane ;se; ;mgn;
+
 all -h shape
 all -h shape axisymmetric
 all -shape axisymmetric 196.txt ;mgn;
@@ -294,7 +288,8 @@ all -sym no ;mgn;
 all -sym enf ;mgn;
 all -sym auto ;sep; ;mgn;
 all -sym no ;sep; ;mgn;
-all -sym enf ;sep; ;mgn;
+# enforcing symmetry for non-default propagation is forbidden for -surf
+#all -sym enf ;sep; ;mgn;
 
 all -h test
 all -test ;mgn;