diff --git a/include/math/bias_add.h b/include/math/bias_add.h
index 4755c9c..a056e3a 100644
--- a/include/math/bias_add.h
+++ b/include/math/bias_add.h
@@ -8,12 +8,15 @@
  */
 #pragma once
 
+#include "mdnn_device_types.h"
 #include "tensor/tensor.h"
+#include "tensor/tensor_utilities.h"
 #include "utilities_internal.h"
 
 namespace magmadnn {
 namespace math {
 
+/*
 template <typename T>
 void bias_add(const Tensor &x, const Tensor &bias, Tensor &out);
 
@@ -21,6 +24,10 @@ void bias_add(const Tensor &x, const Tensor &bias, Tensor &out);
 template <typename T>
 void bias_add_device(const Tensor &x, const Tensor &bias, Tensor &out);
 #endif
+*/
+
+template <DeviceType dev>
+void bias_add(const Tensor &x, const Tensor &bias, Tensor &out);
 
 }  // namespace math
 }  // namespace magmadnn
\ No newline at end of file
diff --git a/src/math/bias_add.cpp b/src/math/bias_add.cpp
index d0e13b7..487aa7c 100644
--- a/src/math/bias_add.cpp
+++ b/src/math/bias_add.cpp
@@ -11,20 +11,18 @@
 
 namespace magmadnn {
 namespace math {
 
-template <typename T>
-void bias_add(const Tensor &x, const Tensor &bias, Tensor &out) {
-    // assert(T_IS_SAME_MEMORY_TYPE(x, bias) && T_IS_SAME_MEMORY_TYPE(bias, out));
-    MAGMADNN_ASSERT(TYPES_MATCH(T, x.dtype()) && TYPES_MATCH(T, bias.dtype()) && TYPES_MATCH(T, out.dtype()),
-                    "invalid tensor types");
+template <>
+void bias_add<CPU>(const Tensor &x, const Tensor &bias, Tensor &out) {
+    MAGMADNN_ASSERT(::magmadnn::utilities::do_tensors_match(out.dtype(), GetMemoryType<CPU>::value, {x, bias, out}),
+                    "bias_add: tensors must have same dtype and memory type.");
 
-    if (out.get_memory_type() == HOST) {
+    FOR_ALL_DTYPES(out.dtype(), T, {
         const T *x_ptr = x.get_ptr<T>();
         const T *bias_ptr = bias.get_ptr<T>();
         T *out_ptr = out.get_ptr<T>();
         index_t x_rows = x.shape(0);
         index_t x_cols = x.shape(1);
-        // unsigned int x_size = x_rows*x_cols;
 
         /* TODO -- test openmp here */
         for (unsigned int r = 0; r < x_rows; r++) {
@@ -32,20 +30,12 @@ void bias_add(const Tensor &x, const Tensor &bias, Tensor &out) {
                 out_ptr[r * x_cols + c] = x_ptr[r * x_cols + c] + bias_ptr[r];
             }
         }
-    }
-#if defined(_HAS_CUDA_)
-    else {
-        bias_add_device<T>(x, bias, out);
-    }
-#endif
+    })
 }
 
-#define COMPILE_BIASADD(type) template void bias_add<type>(const Tensor &, const Tensor &, Tensor &out);
-CALL_FOR_ALL_TYPES(COMPILE_BIASADD)
-#undef COMPILE_BIASADD
 
 #if defined(_USE_CUDNN_BIAS_) /* temporarily undefined this until cudnn works */
-
+/* TODO -- investigate using CuDNN for this */
 #if defined(_HAS_CUDA_)
 template <typename T>
 void bias_add_device(const Tensor &x, const Tensor &bias, Tensor &out) {
diff --git a/src/math/bias_add_device.cu b/src/math/bias_add_device.cu
index bd1c49a..080f915 100644
--- a/src/math/bias_add_device.cu
+++ b/src/math/bias_add_device.cu
@@ -14,7 +14,7 @@ namespace magmadnn {
 namespace math {
 
 template <typename T>
-__global__ void kernel_bias_add_device(const T *x, const T *bias, T *out, unsigned int x_rows, unsigned int x_cols) {
+__global__ void kernel_bias_add(const T *x, const T *bias, T *out, unsigned int x_rows, unsigned int x_cols) {
     unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
     unsigned int stride = blockDim.x * gridDim.x;
 
@@ -23,17 +23,16 @@ __global__ void kernel_bias_add_device(const T *x, const T *bias, T *out, unsign
     }
 }
 
-template <typename T>
-void bias_add_device(const Tensor &x, const Tensor &bias, Tensor &out) {
-    unsigned int x_rows = x.shape(0);
-    unsigned int x_cols = x.shape(1);
+template <>
+void bias_add<GPU>(const Tensor &x, const Tensor &bias, Tensor &out) {
+    size_t x_rows = x.shape(0);
+    size_t x_cols = x.shape(1);
 
-    kernel_bias_add_device<<<(x_rows * x_cols + BLK_SIZE - 1) / BLK_SIZE, BLK_SIZE>>>(x.get_ptr<T>(), bias.get_ptr<T>(),
-                                                                                      out.get_ptr<T>(), x_rows, x_cols);
+    FOR_ALL_DTYPES(out.dtype(), T, {
+        kernel_bias_add<<<(x_rows * x_cols + BLK_SIZE - 1) / BLK_SIZE, BLK_SIZE>>>(x.get_ptr<T>(), bias.get_ptr<T>(),
+                                                                                   out.get_ptr<T>(), x_rows, x_cols);
+    })
 }
 
-#define COMPILE_BIASADD_DEVICE(type) template void bias_add_device<type>(const Tensor&, const Tensor&, Tensor&);
-CALL_FOR_ALL_TYPES(COMPILE_BIASADD_DEVICE)
-#undef COMPILE_BIASADD_DEVICE
 }  // namespace math
 }  // namespace magmadnn
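The diff above replaces compile-time instantiation over element types (the COMPILE_BIASADD / CALL_FOR_ALL_TYPES macros) with per-device specializations of a single bias_add template, each of which dispatches on the runtime dtype through FOR_ALL_DTYPES. The definition of that macro is not part of this diff; the sketch below shows one way such a dispatch macro is commonly written. It is a minimal illustration only: the DataType enum, its member names, and the main() driver are assumptions made here, not MagmaDNN code.

#include <cstdio>

// Illustrative stand-in for a runtime dtype enum (names assumed, not MagmaDNN's).
enum class DataType { FLOAT, DOUBLE };

// Hypothetical sketch of a FOR_ALL_DTYPES-style dispatch macro -- not the
// actual MagmaDNN definition.  The body is captured as __VA_ARGS__ so that
// commas inside it (for example a CUDA <<<grid, block>>> launch) do not
// split the macro argument list.
#define FOR_ALL_DTYPES(dtype_val, T, ...) \
    switch (dtype_val) {                  \
        case DataType::FLOAT: {           \
            using T = float;              \
            __VA_ARGS__                   \
        } break;                          \
        case DataType::DOUBLE: {          \
            using T = double;             \
            __VA_ARGS__                   \
        } break;                          \
    }

int main() {
    DataType dt = DataType::FLOAT;

    // The braced block is compiled once per supported dtype; the branch
    // matching the runtime value of dt is executed.
    FOR_ALL_DTYPES(dt, T, {
        T value = static_cast<T>(1.5);
        std::printf("sizeof(T) = %zu, value = %f\n", sizeof(T), static_cast<double>(value));
    })

    return 0;
}

With a pattern like this, the per-dtype body is stamped out for every supported element type inside one translation unit, so the explicit template instantiation lists removed by this diff are no longer needed, and supporting a new dtype only requires extending the dispatch macro.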