From 7edae35d07805c014e3117ba589e93bdcd102cb9 Mon Sep 17 00:00:00 2001 From: James Hooker Date: Sun, 5 Mar 2023 20:51:03 +0000 Subject: [PATCH 1/2] Fixed stride issues in `helper.cu` Now, `ceilfracf` does what its name suggests. Incidentally, it is no longer needed. `cuda_kernel_softMaskBackgroundValue`, `cuda_kernel_cosineFilter`, and `cuda_kernel_softMaskOutsideMap` now do proper grid-stride iteration. --- .../cuda/cuda_kernels/cuda_device_utils.cuh | 2 +- src/acc/cuda/cuda_kernels/helper.cu | 389 ++++++------------ 2 files changed, 136 insertions(+), 255 deletions(-) diff --git a/src/acc/cuda/cuda_kernels/cuda_device_utils.cuh b/src/acc/cuda/cuda_kernels/cuda_device_utils.cuh index d999af07e..cca876080 100644 --- a/src/acc/cuda/cuda_kernels/cuda_device_utils.cuh +++ b/src/acc/cuda/cuda_kernels/cuda_device_utils.cuh @@ -42,7 +42,7 @@ static inline __device__ int ceilfracf(T1 a, T2 b) { // return __float2int_ru(__fdividef( (float)a, (float)b ) ); - return (int)(a/b + 1); + return ceilf(float(a) / float(b)); } static inline diff --git a/src/acc/cuda/cuda_kernels/helper.cu b/src/acc/cuda/cuda_kernels/helper.cu index a33187dc9..165f83757 100644 --- a/src/acc/cuda/cuda_kernels/helper.cu +++ b/src/acc/cuda/cuda_kernels/helper.cu @@ -230,264 +230,145 @@ __global__ void cuda_kernel_RNDnormalDitributionComplexWithPowerModulation3D( AC // } //} -__global__ void cuda_kernel_softMaskOutsideMap( XFLOAT *vol, - long int vol_size, - long int xdim, - long int ydim, - long int zdim, - long int xinit, - long int yinit, - long int zinit, - bool do_Mnoise, - XFLOAT radius, - XFLOAT radius_p, - XFLOAT cosine_width ) -{ - - int tid = threadIdx.x; - -// vol.setXmippOrigin(); // sets xinit=xdim , also for y z - XFLOAT r, raisedcos; - - __shared__ XFLOAT img_pixels[SOFTMASK_BLOCK_SIZE]; - __shared__ XFLOAT partial_sum[SOFTMASK_BLOCK_SIZE]; - __shared__ XFLOAT partial_sum_bg[SOFTMASK_BLOCK_SIZE]; - - XFLOAT sum_bg_total = (XFLOAT)0.0; - - long int texel_pass_num = ceilfracf(vol_size,SOFTMASK_BLOCK_SIZE); - int texel = tid; - - partial_sum[tid]=(XFLOAT)0.0; - partial_sum_bg[tid]=(XFLOAT)0.0; - if (do_Mnoise) - { - for (int pass = 0; pass < texel_pass_num; pass++, texel+=SOFTMASK_BLOCK_SIZE) // loop the available warps enough to complete all translations for this orientation - { - XFLOAT x,y,z; - if(texel radius_p) - { - partial_sum[tid] += (XFLOAT)1.0; - partial_sum_bg[tid] += img_pixels[tid]; - } - else - { -#if defined(ACC_DOUBLE_PRECISION) - raisedcos = 0.5 + 0.5 * cospi( (radius_p - r) / cosine_width ); -#else - raisedcos = 0.5f + 0.5f * cospif((radius_p - r) / cosine_width ); -#endif - partial_sum[tid] += raisedcos; - partial_sum_bg[tid] += raisedcos * img_pixels[tid]; - } - } - } - } - - __syncthreads(); - for(int j=(SOFTMASK_BLOCK_SIZE/2); j>0; j/=2) - { - if(tid radius_p) - img_pixels[tid]=sum_bg_total; - else - { -#if defined(ACC_DOUBLE_PRECISION) - raisedcos = 0.5 + 0.5 * cospi( (radius_p - r) / cosine_width ); -#else - raisedcos = 0.5f + 0.5f * cospif((radius_p - r) / cosine_width ); -#endif - img_pixels[tid]= img_pixels[tid]*(1-raisedcos) + sum_bg_total*raisedcos; - - } - vol[texel]=img_pixels[tid]; - } - - } +__global__ void cuda_kernel_softMaskOutsideMap( + XFLOAT *vol, long int vol_size, + long int xdim, long int ydim, long int zdim, + long int xinit, long int yinit, long int zinit, + bool do_Mnoise, + XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width +) { + __shared__ XFLOAT partial_sum [SOFTMASK_BLOCK_SIZE]; + __shared__ XFLOAT partial_sum_bg[SOFTMASK_BLOCK_SIZE]; + + const size_t tid = threadIdx.x, bid = blockIdx.x; + const size_t stride = blockDim.x * gridDim.x; + const size_t xydim = xdim * ydim; + + partial_sum [tid] = 0.0; + partial_sum_bg[tid] = 0.0; + + const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT { + return r < radius ? 0.0 : + r > radius_p ? 1.0 : + #if defined(ACC_DOUBLE_PRECISION) + 0.5 + 0.5 * cospi ((radius_p - r) / cosine_width); + #else + 0.5f + 0.5f * cospif((radius_p - r) / cosine_width); + #endif + }; + + XFLOAT bg = 0.0; + if (do_Mnoise) { + for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) { + const int z = i / xydim - zinit; + const int y = i % xydim / xdim - yinit; + const int x = i % xydim % xdim - xinit; + + const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z)); + const XFLOAT w = weight(r); + partial_sum [tid] += w; + partial_sum_bg[tid] += w * __ldg(&vol[i]); + } + + __syncthreads(); + for (int j = SOFTMASK_BLOCK_SIZE / 2; j > 0; j /= 2) { + if (tid < j) { + partial_sum [tid] += partial_sum [tid + j]; + partial_sum_bg[tid] += partial_sum_bg[tid + j]; + } + __syncthreads(); + } + + bg = partial_sum_bg[0] / partial_sum[0]; + } + + for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) { + const int z = i / xydim - zinit; + const int y = i % xydim / xdim - yinit; + const int x = i % xydim % xdim - xinit; + + const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z)); + const XFLOAT w = weight(r); + + vol[i] = __ldg(&vol[i]) * (1 - w) + bg * w; + } } -__global__ void cuda_kernel_softMaskBackgroundValue( XFLOAT *vol, - long int vol_size, - long int xdim, - long int ydim, - long int zdim, - long int xinit, - long int yinit, - long int zinit, - XFLOAT radius, - XFLOAT radius_p, - XFLOAT cosine_width, - XFLOAT *g_sum, - XFLOAT *g_sum_bg) -{ - - int tid = threadIdx.x; - int bid = blockIdx.x; - -// vol.setXmippOrigin(); // sets xinit=xdim , also for y z - XFLOAT r, raisedcos; - int x,y,z; - __shared__ XFLOAT img_pixels[SOFTMASK_BLOCK_SIZE]; - __shared__ XFLOAT partial_sum[SOFTMASK_BLOCK_SIZE]; - __shared__ XFLOAT partial_sum_bg[SOFTMASK_BLOCK_SIZE]; - - long int texel_pass_num = ceilfracf(vol_size,SOFTMASK_BLOCK_SIZE*gridDim.x); - int texel = bid*SOFTMASK_BLOCK_SIZE*texel_pass_num + tid; - - partial_sum[tid]=(XFLOAT)0.0; - partial_sum_bg[tid]=(XFLOAT)0.0; - - for (int pass = 0; pass < texel_pass_num; pass++, texel+=SOFTMASK_BLOCK_SIZE) // loop the available warps enough to complete all translations for this orientation - { - if(texel radius_p) - { - partial_sum[tid] += (XFLOAT)1.0; - partial_sum_bg[tid] += img_pixels[tid]; - } - else - { -#if defined(ACC_DOUBLE_PRECISION) - raisedcos = 0.5 + 0.5 * cospi( (radius_p - r) / cosine_width ); -#else - raisedcos = 0.5f + 0.5f * cospif((radius_p - r) / cosine_width ); -#endif - partial_sum[tid] += raisedcos; - partial_sum_bg[tid] += raisedcos * img_pixels[tid]; - } - } - } - - cuda_atomic_add(&g_sum[tid] , partial_sum[tid]); - cuda_atomic_add(&g_sum_bg[tid], partial_sum_bg[tid]); +__global__ void cuda_kernel_softMaskBackgroundValue( + XFLOAT *vol, long int vol_size, + long int xdim, long int ydim, long int zdim, + long int xinit, long int yinit, long int zinit, + XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width, + XFLOAT *g_sum, XFLOAT *g_sum_bg +) { + __shared__ XFLOAT partial_sum [SOFTMASK_BLOCK_SIZE]; + __shared__ XFLOAT partial_sum_bg[SOFTMASK_BLOCK_SIZE]; + + const size_t tid = threadIdx.x, bid = blockIdx.x; + const size_t stride = blockDim.x * gridDim.x; + const size_t xydim = xdim * ydim; + + partial_sum [tid] = 0.0; + partial_sum_bg[tid] = 0.0; + + const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT { + return r < radius ? 0.0 : + r > radius_p ? 1.0 : + #if defined(ACC_DOUBLE_PRECISION) + 0.5 + 0.5 * cospi ((radius_p - r) / cosine_width); + #else + 0.5f + 0.5f * cospif((radius_p - r) / cosine_width); + #endif + }; + + for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) { + const int z = i / xydim - zinit; + const int y = i % xydim / xdim - yinit; + const int x = i % xydim % xdim - xinit; + + const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z)); + const XFLOAT w = weight(r); + + partial_sum [tid] += w; + partial_sum_bg[tid] += w * __ldg(&vol[i]); + } + + cuda_atomic_add(&g_sum [tid], partial_sum [tid]); + cuda_atomic_add(&g_sum_bg[tid], partial_sum_bg[tid]); } - -__global__ void cuda_kernel_cosineFilter( XFLOAT *vol, - long int vol_size, - long int xdim, - long int ydim, - long int zdim, - long int xinit, - long int yinit, - long int zinit, - bool do_noise, - XFLOAT *noise, - XFLOAT radius, - XFLOAT radius_p, - XFLOAT cosine_width, - XFLOAT bg_value) -{ - - int tid = threadIdx.x; - int bid = blockIdx.x; - -// vol.setXmippOrigin(); // sets xinit=xdim , also for y z - XFLOAT r, raisedcos, defVal; - int x,y,z; - __shared__ XFLOAT img_pixels[SOFTMASK_BLOCK_SIZE]; - - long int texel_pass_num = ceilfracf(vol_size,SOFTMASK_BLOCK_SIZE*gridDim.x); - int texel = bid*SOFTMASK_BLOCK_SIZE*texel_pass_num + tid; - - defVal = bg_value; - for (int pass = 0; pass < texel_pass_num; pass++, texel+=SOFTMASK_BLOCK_SIZE) // loop the available warps enough to complete all translations for this orientation - { - if(texel radius_p) - img_pixels[tid]=defVal; - else - { -#if defined(ACC_DOUBLE_PRECISION) - raisedcos = 0.5 + 0.5 * cospi( (radius_p - r) / cosine_width ); -#else - raisedcos = 0.5f + 0.5f * cospif((radius_p - r) / cosine_width ); -#endif - img_pixels[tid]= img_pixels[tid]*(1-raisedcos) + defVal*raisedcos; - - } - vol[texel]=img_pixels[tid]; - } - +__global__ void cuda_kernel_cosineFilter( + XFLOAT *vol, long int vol_size, + long int xdim, long int ydim, long int zdim, + long int xinit, long int yinit, long int zinit, + bool do_noise, XFLOAT *noise, + XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width, + XFLOAT bg_value +) { + const size_t tid = threadIdx.x, bid = blockIdx.x; + const size_t stride = blockDim.x * gridDim.x; + const size_t xydim = xdim * ydim; + + const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT { + return r < radius ? 0.0 : + r > radius_p ? 1.0 : + #if defined(ACC_DOUBLE_PRECISION) + 0.5 + 0.5 * cospi ((radius_p - r) / cosine_width); + #else + 0.5f + 0.5f * cospif((radius_p - r) / cosine_width); + #endif + }; + + for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) { + const int z = i / xydim - zinit; + const int y = i % xydim / xdim - yinit; + const int x = i % xydim % xdim - xinit; + + const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z)); + const XFLOAT w = weight(r); + const XFLOAT bg = do_noise ? noise[i] : bg_value; + + vol[i] = __ldg(&vol[i]) * (1 - w) + bg * w; } } From f251f92900d103eeaff0007c527ee8445f849fc3 Mon Sep 17 00:00:00 2001 From: James Hooker Date: Sun, 5 Mar 2023 21:34:33 +0000 Subject: [PATCH 2/2] Fixed "grid-strided" loops in `helper.cpp` --- src/acc/cpu/cpu_kernels/helper.cpp | 205 ++++++++++------------------- 1 file changed, 71 insertions(+), 134 deletions(-) diff --git a/src/acc/cpu/cpu_kernels/helper.cpp b/src/acc/cpu/cpu_kernels/helper.cpp index 02353cd54..ad4f2ccba 100644 --- a/src/acc/cpu/cpu_kernels/helper.cpp +++ b/src/acc/cpu/cpu_kernels/helper.cpp @@ -115,142 +115,79 @@ void RNDnormalDitributionComplexWithPowerModulation3D(ACCCOMPLEX* Image, size_t } } } -void softMaskBackgroundValue( int block_dim, - int block_size, - XFLOAT *vol, - long int vol_size, - long int xdim, - long int ydim, - long int zdim, - long int xinit, - long int yinit, - long int zinit, - XFLOAT radius, - XFLOAT radius_p, - XFLOAT cosine_width, - XFLOAT *g_sum, - XFLOAT *g_sum_bg) -{ - for(int bid=0; bid radius_p) - { - g_sum[tid] += (XFLOAT)1.0; - g_sum_bg[tid] += img_pixels; - } - else - { - #if defined(ACC_DOUBLE_PRECISION) - raisedcos = 0.5 + 0.5 * cos ( (radius_p - r) / cosine_width * M_PI); - #else - raisedcos = 0.5 + 0.5 * cosf( (radius_p - r) / cosine_width * M_PI); - #endif - g_sum[tid] += raisedcos; - g_sum_bg[tid] += raisedcos * img_pixels; - } - } - } - } // tid - } // bid +void softMaskBackgroundValue( + int grid_dim, int block_dim, + XFLOAT *vol, long int vol_size, + long int xdim, long int ydim, long int zdim, + long int xinit, long int yinit, long int zinit, + XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width, + XFLOAT *g_sum, XFLOAT *g_sum_bg +) { + const size_t stride = block_dim * grid_dim; + const size_t xydim = xdim * ydim; + + const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT { + return r < radius ? 0.0 : + r > radius_p ? 1.0 : + #if defined(ACC_DOUBLE_PRECISION) + 0.5 + 0.5 * cos (M_PI * (radius_p - r) / cosine_width); + #else + 0.5f + 0.5f * cosf(M_PI * (radius_p - r) / cosine_width); + #endif + }; + + for (int bid = 0; bid < grid_dim; bid++) + for (int tid = 0; tid < block_dim; tid++) { + for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) { + const int z = i / xydim - zinit; + const int y = i % xydim / xdim - yinit; + const int x = i % xydim % xdim - xinit; + + const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z)); + const XFLOAT w = weight(r); + + g_sum [tid] += w; + g_sum_bg[tid] += w * vol[i]; + } + } } - - -void cosineFilter( int block_dim, - int block_size, - XFLOAT *vol, - long int vol_size, - long int xdim, - long int ydim, - long int zdim, - long int xinit, - long int yinit, - long int zinit, - bool do_noise, - XFLOAT *noise, - XFLOAT radius, - XFLOAT radius_p, - XFLOAT cosine_width, - XFLOAT bg_value) -{ - for(int bid=0; bid radius_p) - img_pixels=defVal; - else - { - #if defined(ACC_DOUBLE_PRECISION) - raisedcos = 0.5 + 0.5 * cos ( (radius_p - r) / cosine_width * M_PI); - #else - raisedcos = 0.5 + 0.5 * cosf( (radius_p - r) / cosine_width * M_PI); - #endif - img_pixels= img_pixels*(1-raisedcos) + defVal*raisedcos; - - } - vol[texel]=img_pixels; - } - } - } // tid - } // bid +void cosineFilter( + int block_dim, int block_size, + XFLOAT *vol, long int vol_size, + long int xdim, long int ydim, long int zdim, + long int xinit, long int yinit, long int zinit, + bool do_noise, XFLOAT *noise, + XFLOAT radius, XFLOAT radius_p, XFLOAT cosine_width, + XFLOAT bg_value +) { + const size_t stride = block_dim * grid_dim; + const size_t xydim = xdim * ydim; + + const auto weight = [radius, radius_p, cosine_width] (XFLOAT r) -> XFLOAT { + return r < radius ? 0.0 : + r > radius_p ? 1.0 : + #if defined(ACC_DOUBLE_PRECISION) + 0.5 + 0.5 * cos (M_PI * (radius_p - r) / cosine_width); + #else + 0.5f + 0.5f * cosf(M_PI * (radius_p - r) / cosine_width); + #endif + }; + + for (int bid = 0; bid < grid_dim; bid++) + for (int tid = 0; tid < block_dim; tid++) { + for (size_t i = bid * blockDim.x + tid; i < vol_size; i += stride) { + const int z = i / xydim - zinit; + const int y = i % xydim / xdim - yinit; + const int x = i % xydim % xdim - xinit; + + const XFLOAT r = sqrt(XFLOAT(x * x + y * y + z * z)); + const XFLOAT w = weight(r); + const XFLOAT bg = do_noise ? noise[i] : bg_value; + + vol[i] = vol[i] * (1 - w) + bg * w; + } + } } template