From 7edae35d07805c014e3117ba589e93bdcd102cb9 Mon Sep 17 00:00:00 2001
From: James Hooker <jhooker@mrc-lmb.cam.ac.uk>
Date: Sun, 5 Mar 2023 20:51:03 +0000
Subject: [PATCH 1/2] Fixed stride issues in `helper.cu`

Now, `ceilfracf` does what its name suggests.
Incidentally, it is no longer needed.
`cuda_kernel_softMaskBackgroundValue`, `cuda_kernel_cosineFilter`, and `cuda_kernel_softMaskOutsideMap` now do proper grid-stride iteration.
---
 .../cuda/cuda_kernels/cuda_device_utils.cuh   |   2 +-
 src/acc/cuda/cuda_kernels/helper.cu           | 389 ++++++------------
 2 files changed, 136 insertions(+), 255 deletions(-)

diff --git a/src/acc/cuda/cuda_kernels/cuda_device_utils.cuh b/src/acc/cuda/cuda_kernels/cuda_device_utils.cuh
index d999af07e..cca876080 100644
--- a/src/acc/cuda/cuda_kernels/cuda_device_utils.cuh
+++ b/src/acc/cuda/cuda_kernels/cuda_device_utils.cuh
@@ -42,7 +42,7 @@ static inline
 __device__ int ceilfracf(T1 a, T2 b)
 {
 //	return __float2int_ru(__fdividef( (float)a, (float)b ) );
-	return (int)(a/b + 1);
+	return ceilf(float(a) / float(b));
 }
 
 static inline
diff --git a/src/acc/cuda/cuda_kernels/helper.cu b/src/acc/cuda/cuda_kernels/helper.cu
index a33187dc9..165f83757 100644
--- a/src/acc/cuda/cuda_kernels/helper.cu
+++ b/src/acc/cuda/cuda_kernels/helper.cu
@@ -230,264 +230,145 @@ __global__ void cuda_kernel_RNDnormalDitributionComplexWithPowerModulation3D( AC
 //	}
 //}
 
-__global__ void cuda_kernel_softMaskOutsideMap(	XFLOAT *vol,
-												long int vol_size,
-												long int xdim,
-												long int ydim,
-												long int zdim,
-												long int xinit,
-												long int yinit,
-												long int zinit,
-												bool do_Mnoise,
-												XFLOAT radius,
-												XFLOAT radius_p,
-												XFLOAT cosine_width	)
-{
-
-		int tid = threadIdx.x;
-
-//		vol.setXmippOrigin(); // sets xinit=xdim , also for y z
-		XFLOAT r, raisedcos;
-
-		__shared__ XFLOAT     img_pixels[SOFTMASK_BLOCK_SIZE];
-		__shared__ XFLOAT    partial_sum[SOFTMASK_BLOCK_SIZE];
-		__shared__ XFLOAT partial_sum_bg[SOFTMASK_BLOCK_SIZE];
-
-		XFLOAT sum_bg_total =  (XFLOAT)0.0;
-
-		long int texel_pass_num = ceilfracf(vol_size,SOFTMASK_BLOCK_SIZE);
-		int texel = tid;
-
-		partial_sum[tid]=(XFLOAT)0.0;
-		partial_sum_bg[tid]=(XFLOAT)0.0;
-		if (do_Mnoise)
-		{
-			for (int pass = 0; pass < texel_pass_num; pass++, texel+=SOFTMASK_BLOCK_SIZE) // loop the available warps enough to complete all translations for this orientation
-			{
-				XFLOAT x,y,z;
-				if(texel<vol_size)
-				{
-					img_pixels[tid]=__ldg(&vol[texel]);
-
-					z = floor( (float) texel                   / (float)((xdim)*(ydim)));
-					y = floor( (XFLOAT)(texel-z*(xdim)*(ydim)) / (XFLOAT) xdim );
-					x = texel - z*(xdim)*(ydim) - y*xdim;
-
-					z-=zinit;
-					y-=yinit;
-					x-=xinit;
-
-					r = sqrt(x*x + y*y + z*z);
-
-					if (r < radius)
-						continue;
-					else if (r > radius_p)
-					{
-						partial_sum[tid]    += (XFLOAT)1.0;
-						partial_sum_bg[tid] += img_pixels[tid];
-					}
-					else
-					{
-#if defined(ACC_DOUBLE_PRECISION)
-						raisedcos = 0.5 + 0.5  * cospi( (radius_p - r) / cosine_width );
-#else
-						raisedcos = 0.5f + 0.5f * cospif((radius_p - r) / cosine_width );
-#endif
-						partial_sum[tid] += raisedcos;
-						partial_sum_bg[tid] += raisedcos * img_pixels[tid];
-					}
-				}
-			}
-		}
-
-		__syncthreads();
-		for(int j=(SOFTMASK_BLOCK_SIZE/2); j>0; j/=2)
-		{
-			if(tid<j)
-			{
-				partial_sum[tid] += partial_sum[tid+j];
-				partial_sum_bg[tid] += partial_sum_bg[tid+j];
-			}
-			__syncthreads();
-		}
-
-		sum_bg_total  = partial_sum_bg[0] / partial_sum[0];
-
-
-		texel = tid;
-		for (int pass = 0; pass < texel_pass_num; pass++, texel+=SOFTMASK_BLOCK_SIZE) // loop the available warps enough to complete all translations for this orientation
-		{
-			XFLOAT x,y,z;
-			if(texel<vol_size)
-			{
-				img_pixels[tid]=__ldg(&vol[texel]);
-
-				z =  floor( (float) texel                  / (float)((xdim)*(ydim)));
-				y = floor( (XFLOAT)(texel-z*(xdim)*(ydim)) / (XFLOAT)  xdim         );
-				x = texel - z*(xdim)*(ydim) - y*xdim;
-
-				z-=zinit;
-				y-=yinit;
-				x-=xinit;
-
-				r = sqrt(x*x + y*y + z*z);
-
-				if (r < radius)
-					continue;
-				else if (r > radius_p)
-					img_pixels[tid]=sum_bg_total;
-				else
-				{
-#if defined(ACC_DOUBLE_PRECISION)
-					raisedcos = 0.5  + 0.5  * cospi( (radius_p - r) / cosine_width );
-#else
-					raisedcos = 0.5f + 0.5f * cospif((radius_p - r) / cosine_width );
-#endif
-					img_pixels[tid]= img_pixels[tid]*(1-raisedcos) + sum_bg_total*raisedcos;
-
-				}
-				vol[texel]=img_pixels[tid];
-			}
-
-		}
+__global__ void cuda_kernel_softMaskOutsideMap(
+    XFLOAT *vol, long int vol_size,
+	long int xdim,  long int ydim,  long int zdim,
+	long int xinit, long int yinit, long int zinit,
+	bool do_Mnoise,
+	XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width
+) {
+    __shared__ XFLOAT partial_sum   [SOFTMASK_BLOCK_SIZE];
+    __shared__ XFLOAT partial_sum_bg[SOFTMASK_BLOCK_SIZE];
+
+    const size_t tid = threadIdx.x, bid = blockIdx.x;
+    const size_t stride = blockDim.x * gridDim.x;
+    const size_t xydim = xdim * ydim;
+
+    partial_sum   [tid] = 0.0;
+    partial_sum_bg[tid] = 0.0;
+
+    const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT {
+        return r < radius   ? 0.0 :
+               r > radius_p ? 1.0 :
+        #if defined(ACC_DOUBLE_PRECISION)
+        0.5  + 0.5  * cospi ((radius_p - r) / cosine_width);
+        #else
+        0.5f + 0.5f * cospif((radius_p - r) / cosine_width);
+        #endif
+    };
+
+    XFLOAT bg = 0.0;
+    if (do_Mnoise) {
+        for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) {
+            const int z = i / xydim        - zinit;
+            const int y = i % xydim / xdim - yinit;
+            const int x = i % xydim % xdim - xinit;
+
+            const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z));
+            const XFLOAT w = weight(r);
+            partial_sum   [tid] += w;
+            partial_sum_bg[tid] += w * __ldg(&vol[i]);
+        }
+
+        __syncthreads();
+        for (int j = SOFTMASK_BLOCK_SIZE / 2; j > 0; j /= 2) {
+            if (tid < j) {
+                partial_sum   [tid] += partial_sum   [tid + j];
+                partial_sum_bg[tid] += partial_sum_bg[tid + j];
+            }
+            __syncthreads();
+        }
+
+        bg = partial_sum_bg[0] / partial_sum[0];
+    }
+
+    for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) {
+        const int z = i / xydim        - zinit;
+        const int y = i % xydim / xdim - yinit;
+        const int x = i % xydim % xdim - xinit;
+
+        const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z));
+        const XFLOAT w = weight(r);
+
+        vol[i] = __ldg(&vol[i]) * (1 - w) + bg * w;
+    }
 }
 
-__global__ void cuda_kernel_softMaskBackgroundValue(	XFLOAT *vol,
-														long int vol_size,
-														long int xdim,
-														long int ydim,
-														long int zdim,
-														long int xinit,
-														long int yinit,
-														long int zinit,
-														XFLOAT radius,
-														XFLOAT radius_p,
-														XFLOAT cosine_width,
-														XFLOAT *g_sum,
-														XFLOAT *g_sum_bg)
-{
-
-		int tid = threadIdx.x;
-		int bid = blockIdx.x;
-
-//		vol.setXmippOrigin(); // sets xinit=xdim , also for y z
-		XFLOAT r, raisedcos;
-		int x,y,z;
-		__shared__ XFLOAT     img_pixels[SOFTMASK_BLOCK_SIZE];
-		__shared__ XFLOAT    partial_sum[SOFTMASK_BLOCK_SIZE];
-		__shared__ XFLOAT partial_sum_bg[SOFTMASK_BLOCK_SIZE];
-
-		long int texel_pass_num = ceilfracf(vol_size,SOFTMASK_BLOCK_SIZE*gridDim.x);
-		int texel = bid*SOFTMASK_BLOCK_SIZE*texel_pass_num + tid;
-
-		partial_sum[tid]=(XFLOAT)0.0;
-		partial_sum_bg[tid]=(XFLOAT)0.0;
-
-		for (int pass = 0; pass < texel_pass_num; pass++, texel+=SOFTMASK_BLOCK_SIZE) // loop the available warps enough to complete all translations for this orientation
-		{
-			if(texel<vol_size)
-			{
-				img_pixels[tid]=__ldg(&vol[texel]);
-
-				z =   texel / (xdim*ydim) ;
-				y = ( texel % (xdim*ydim) ) / xdim ;
-				x = ( texel % (xdim*ydim) ) % xdim ;
-
-				z-=zinit;
-				y-=yinit;
-				x-=xinit;
-
-				r = sqrt(XFLOAT(x*x + y*y + z*z));
-
-				if (r < radius)
-					continue;
-				else if (r > radius_p)
-				{
-					partial_sum[tid]    += (XFLOAT)1.0;
-					partial_sum_bg[tid] += img_pixels[tid];
-				}
-				else
-				{
-#if defined(ACC_DOUBLE_PRECISION)
-					raisedcos = 0.5 + 0.5  * cospi( (radius_p - r) / cosine_width );
-#else
-					raisedcos = 0.5f + 0.5f * cospif((radius_p - r) / cosine_width );
-#endif
-					partial_sum[tid] += raisedcos;
-					partial_sum_bg[tid] += raisedcos * img_pixels[tid];
-				}
-			}
-		}
-
-		cuda_atomic_add(&g_sum[tid]   , partial_sum[tid]);
-		cuda_atomic_add(&g_sum_bg[tid], partial_sum_bg[tid]);
+__global__ void cuda_kernel_softMaskBackgroundValue(
+    XFLOAT *vol, long int vol_size,
+    long int xdim,  long int ydim,  long int zdim,
+    long int xinit, long int yinit, long int zinit,
+    XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width,
+    XFLOAT *g_sum, XFLOAT *g_sum_bg
+) {
+    __shared__ XFLOAT partial_sum   [SOFTMASK_BLOCK_SIZE];
+    __shared__ XFLOAT partial_sum_bg[SOFTMASK_BLOCK_SIZE];
+
+    const size_t tid = threadIdx.x, bid = blockIdx.x;
+    const size_t stride = blockDim.x * gridDim.x;
+    const size_t xydim = xdim * ydim;
+
+    partial_sum   [tid] = 0.0;
+    partial_sum_bg[tid] = 0.0;
+
+    const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT {
+        return r < radius   ? 0.0 :
+               r > radius_p ? 1.0 :
+        #if defined(ACC_DOUBLE_PRECISION)
+        0.5  + 0.5  * cospi ((radius_p - r) / cosine_width);
+        #else
+        0.5f + 0.5f * cospif((radius_p - r) / cosine_width);
+        #endif
+    };
+
+    for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) {
+        const int z = i / xydim        - zinit;
+        const int y = i % xydim / xdim - yinit;
+        const int x = i % xydim % xdim - xinit;
+
+        const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z));
+        const XFLOAT w = weight(r);
+
+        partial_sum   [tid] += w;
+        partial_sum_bg[tid] += w * __ldg(&vol[i]);
+    }
+
+    cuda_atomic_add(&g_sum   [tid], partial_sum   [tid]);
+    cuda_atomic_add(&g_sum_bg[tid], partial_sum_bg[tid]);
 }
 
-
-__global__ void cuda_kernel_cosineFilter(	XFLOAT *vol,
-											long int vol_size,
-											long int xdim,
-											long int ydim,
-											long int zdim,
-											long int xinit,
-											long int yinit,
-											long int zinit,
-											bool do_noise,
-											XFLOAT *noise,
-											XFLOAT radius,
-											XFLOAT radius_p,
-											XFLOAT cosine_width,
-											XFLOAT bg_value)
-{
-
-	int tid = threadIdx.x;
-	int bid = blockIdx.x;
-
-//		vol.setXmippOrigin(); // sets xinit=xdim , also for y z
-	XFLOAT r, raisedcos, defVal;
-	int x,y,z;
-	__shared__ XFLOAT     img_pixels[SOFTMASK_BLOCK_SIZE];
-
-	long int texel_pass_num = ceilfracf(vol_size,SOFTMASK_BLOCK_SIZE*gridDim.x);
-	int texel = bid*SOFTMASK_BLOCK_SIZE*texel_pass_num + tid;
-
-	defVal = bg_value;
-	for (int pass = 0; pass < texel_pass_num; pass++, texel+=SOFTMASK_BLOCK_SIZE) // loop the available warps enough to complete all translations for this orientation
-	{
-		if(texel<vol_size)
-		{
-			img_pixels[tid]=__ldg(&vol[texel]);
-
-			z =   texel / (xdim*ydim) ;
-			y = ( texel % (xdim*ydim) ) / xdim ;
-			x = ( texel % (xdim*ydim) ) % xdim ;
-
-			z-=zinit;
-			y-=yinit;
-			x-=xinit;
-
-			r = sqrt(XFLOAT(x*x + y*y + z*z));
-
-			if(do_noise)
-				defVal = noise[texel];
-
-			if (r < radius)
-				continue;
-			else if (r > radius_p)
-				img_pixels[tid]=defVal;
-			else
-			{
-#if defined(ACC_DOUBLE_PRECISION)
-				raisedcos = 0.5  + 0.5  * cospi( (radius_p - r) / cosine_width );
-#else
-				raisedcos = 0.5f + 0.5f * cospif((radius_p - r) / cosine_width );
-#endif
-				img_pixels[tid]= img_pixels[tid]*(1-raisedcos) + defVal*raisedcos;
-
-			}
-			vol[texel]=img_pixels[tid];
-		}
-
+__global__ void cuda_kernel_cosineFilter(
+    XFLOAT *vol, long int vol_size,
+    long int xdim,  long int ydim,  long int zdim,
+    long int xinit, long int yinit, long int zinit,
+    bool do_noise, XFLOAT *noise,
+    XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width,
+    XFLOAT bg_value
+) {
+	const size_t tid = threadIdx.x, bid = blockIdx.x;
+    const size_t stride = blockDim.x * gridDim.x;
+    const size_t xydim = xdim * ydim;
+
+    const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT {
+        return r < radius   ? 0.0 :
+               r > radius_p ? 1.0 :
+        #if defined(ACC_DOUBLE_PRECISION)
+        0.5  + 0.5  * cospi ((radius_p - r) / cosine_width);
+        #else
+        0.5f + 0.5f * cospif((radius_p - r) / cosine_width);
+        #endif
+    };
+
+	for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) {
+        const int z = i / xydim        - zinit;
+        const int y = i % xydim / xdim - yinit;
+        const int x = i % xydim % xdim - xinit;
+
+        const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z));
+        const XFLOAT w = weight(r);
+        const XFLOAT bg = do_noise ? noise[i] : bg_value;
+
+        vol[i] = __ldg(&vol[i]) * (1 - w) + bg * w;
 	}
 }
 

From f251f92900d103eeaff0007c527ee8445f849fc3 Mon Sep 17 00:00:00 2001
From: James Hooker <jhooker@mrc-lmb.cam.ac.uk>
Date: Sun, 5 Mar 2023 21:34:33 +0000
Subject: [PATCH 2/2] Fixed "grid-strided" loops in `helper.cpp`

---
 src/acc/cpu/cpu_kernels/helper.cpp | 205 ++++++++++-------------------
 1 file changed, 71 insertions(+), 134 deletions(-)

diff --git a/src/acc/cpu/cpu_kernels/helper.cpp b/src/acc/cpu/cpu_kernels/helper.cpp
index 02353cd54..ad4f2ccba 100644
--- a/src/acc/cpu/cpu_kernels/helper.cpp
+++ b/src/acc/cpu/cpu_kernels/helper.cpp
@@ -115,142 +115,79 @@ void RNDnormalDitributionComplexWithPowerModulation3D(ACCCOMPLEX* Image, size_t
 		}
     }
 }
-void softMaskBackgroundValue(	int      block_dim,
-                                int      block_size,
-                                XFLOAT  *vol,
-								long int vol_size,
-								long int xdim,
-								long int ydim,
-								long int zdim,
-								long int xinit,
-								long int yinit,
-								long int zinit,
-								XFLOAT   radius,
-								XFLOAT   radius_p,
-								XFLOAT   cosine_width,
-								XFLOAT  *g_sum,
-								XFLOAT  *g_sum_bg)
-{
-	for(int bid=0; bid<block_dim; bid++)
-	{
-		for(int tid=0; tid<block_size; tid++)
-		{
-	//		vol.setXmippOrigin(); // sets xinit=xdim , also for y z
-			XFLOAT r, raisedcos;
-			int x,y,z;
-			XFLOAT     img_pixels;
-
-			size_t texel_pass_num = ceilfracf((size_t)vol_size,(size_t)block_size*(size_t)block_dim);
-			size_t texel = (size_t)bid*(size_t)block_size*(size_t)texel_pass_num + tid;
 
-			for (size_t pass = 0; pass < texel_pass_num; pass++, texel+=block_size) // loop through all translations for this orientation
-			{
-				if(texel<vol_size)
-				{
-					img_pixels = vol[texel];
-
-					z =   texel / (xdim*ydim) ;
-					y = ( texel % (xdim*ydim) ) / xdim ;
-					x = ( texel % (xdim*ydim) ) % xdim ;
-
-					z-=zinit;
-					y-=yinit;
-					x-=xinit;
-
-					r = sqrt(XFLOAT(x*x + y*y + z*z));
-
-					if (r < radius)
-						continue;
-					else if (r > radius_p)
-					{
-						g_sum[tid]    += (XFLOAT)1.0;
-						g_sum_bg[tid] += img_pixels;
-					}
-					else
-					{
-	#if defined(ACC_DOUBLE_PRECISION)
-						raisedcos = 0.5 + 0.5  * cos ( (radius_p - r) / cosine_width * M_PI);
-	#else
-						raisedcos = 0.5 + 0.5  * cosf( (radius_p - r) / cosine_width * M_PI);
-	#endif
-						g_sum[tid] += raisedcos;
-						g_sum_bg[tid] += raisedcos * img_pixels;
-					}
-				}
-			}
-		} // tid
-	} // bid
+void softMaskBackgroundValue(
+	int grid_dim, int block_dim,
+	XFLOAT *vol, long int vol_size,
+	long int xdim,  long int ydim,  long int zdim,
+	long int xinit, long int yinit, long int zinit,
+	XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width,
+	XFLOAT *g_sum, XFLOAT *g_sum_bg
+) {
+    const size_t stride = block_dim * grid_dim;
+    const size_t xydim = xdim * ydim;
+
+    const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT {
+        return r < radius   ? 0.0 :
+               r > radius_p ? 1.0 :
+        #if defined(ACC_DOUBLE_PRECISION)
+        0.5  + 0.5  * cos (M_PI * (radius_p - r) / cosine_width);
+        #else
+        0.5f + 0.5f * cosf(M_PI * (radius_p - r) / cosine_width);
+        #endif
+    };
+
+	for (int bid = 0; bid < grid_dim;  bid++)
+	for (int tid = 0; tid < block_dim; tid++) {
+		for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) {
+			const int z = i / xydim        - zinit;
+			const int y = i % xydim / xdim - yinit;
+			const int x = i % xydim % xdim - xinit;
+
+			const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z));
+			const XFLOAT w = weight(r);
+
+			g_sum   [tid] += w;
+			g_sum_bg[tid] += w * vol[i];
+		}
+	}
 }
-
-
-void cosineFilter(	int      block_dim,
-					int      block_size,
-					XFLOAT  *vol,
-					long int vol_size,
-					long int xdim,
-					long int ydim,
-					long int zdim,
-					long int xinit,
-					long int yinit,
-					long int zinit,
-					bool     do_noise,
-					XFLOAT *noise,
-					XFLOAT   radius,
-					XFLOAT   radius_p,
-					XFLOAT   cosine_width,
-					XFLOAT   bg_value)
-{
-	for(int bid=0; bid<block_dim; bid++)
-	{
-		for(int tid=0; tid<block_size; tid++)
-		{
-//		vol.setXmippOrigin(); // sets xinit=xdim , also for y z
-			XFLOAT r, raisedcos, defVal;
-			int x,y,z;
-			XFLOAT     img_pixels;
-
-			size_t texel_pass_num = ceilfracf((size_t)vol_size,(size_t)block_size*(size_t)block_dim);
-			size_t texel = (size_t)bid*(size_t)block_size*(size_t)texel_pass_num + tid;
-
-			defVal = bg_value;
-			for (size_t pass = 0; pass < texel_pass_num; pass++, texel+=block_size) // loop the available warps enough to complete all translations for this orientation
-			{
-				if(texel<vol_size)
-				{
-					img_pixels= vol[texel];
-
-					z =   texel / (xdim*ydim) ;
-					y = ( texel % (xdim*ydim) ) / xdim ;
-					x = ( texel % (xdim*ydim) ) % xdim ;
-
-					z-=zinit;
-					y-=yinit;
-					x-=xinit;
-
-					r = sqrt(XFLOAT(x*x + y*y + z*z));
-
-					if(do_noise)
-                                		defVal = noise[texel];
-
-					if (r < radius)
-						continue;
-					else if (r > radius_p)
-						img_pixels=defVal;
-					else
-					{
-		#if defined(ACC_DOUBLE_PRECISION)
-						raisedcos = 0.5 + 0.5  * cos ( (radius_p - r) / cosine_width * M_PI);
-		#else
-						raisedcos = 0.5 + 0.5  * cosf( (radius_p - r) / cosine_width * M_PI);
-		#endif
-						img_pixels= img_pixels*(1-raisedcos) + defVal*raisedcos;
-
-					}
-					vol[texel]=img_pixels;
-				}
-			}
-		} // tid
-	} // bid
+void cosineFilter(
+	int block_dim, int block_size,
+	XFLOAT *vol, long int vol_size,
+	long int xdim,  long int ydim,  long int zdim,
+	long int xinit, long int yinit, long int zinit,
+	bool do_noise, XFLOAT *noise,
+	XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width,
+	XFLOAT bg_value
+) {
+	const size_t stride = block_dim * grid_dim;
+    const size_t xydim = xdim * ydim;
+
+    const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT {
+        return r < radius   ? 0.0 :
+               r > radius_p ? 1.0 :
+        #if defined(ACC_DOUBLE_PRECISION)
+        0.5  + 0.5  * cos (M_PI * (radius_p - r) / cosine_width);
+        #else
+        0.5f + 0.5f * cosf(M_PI * (radius_p - r) / cosine_width);
+        #endif
+    };
+
+	for (int bid = 0; bid < grid_dim;  bid++)
+	for (int tid = 0; tid < block_dim; tid++) {
+		for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) {
+			const int z = i / xydim        - zinit;
+			const int y = i % xydim / xdim - yinit;
+			const int x = i % xydim % xdim - xinit;
+
+			const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z));
+			const XFLOAT w = weight(r);
+			const XFLOAT bg = do_noise ? noise[i] : bg_value;
+
+			vol[i] = vol[i] * (1 - w) + bg * w;
+		}
+	}
 }
 
 template <typename T>