diff --git a/include/math/bias_add.h b/include/math/bias_add.h
index 4755c9c..a056e3a 100644
--- a/include/math/bias_add.h
+++ b/include/math/bias_add.h
@@ -8,12 +8,15 @@
  */
 #pragma once
 
+#include "mdnn_device_types.h"
 #include "tensor/tensor.h"
+#include "tensor/tensor_utilities.h"
 #include "utilities_internal.h"
 
 namespace magmadnn {
 namespace math {
 
+/*
 template <typename T>
 void bias_add(const Tensor &x, const Tensor &bias, Tensor &out);
 
@@ -21,6 +24,10 @@ void bias_add(const Tensor &x, const Tensor &bias, Tensor &out);
 template <typename T>
 void bias_add_device(const Tensor &x, const Tensor &bias, Tensor &out);
 #endif
+*/
+
+template <DeviceType dev>
+void bias_add(const Tensor &x, const Tensor &bias, Tensor &out);
 
 }  // namespace math
 }  // namespace magmadnn
\ No newline at end of file
diff --git a/src/math/bias_add.cpp b/src/math/bias_add.cpp
index d0e13b7..487aa7c 100644
--- a/src/math/bias_add.cpp
+++ b/src/math/bias_add.cpp
@@ -11,20 +11,18 @@
 
 namespace magmadnn {
 namespace math {
 
-template <typename T>
-void bias_add(const Tensor &x, const Tensor &bias, Tensor &out) {
-    // assert(T_IS_SAME_MEMORY_TYPE(x, bias) && T_IS_SAME_MEMORY_TYPE(bias, out));
-    MAGMADNN_ASSERT(TYPES_MATCH(T, x.dtype()) && TYPES_MATCH(T, bias.dtype()) && TYPES_MATCH(T, out.dtype()),
-                    "invalid tensor types");
+template <>
+void bias_add<CPU>(const Tensor &x, const Tensor &bias, Tensor &out) {
+    MAGMADNN_ASSERT(::magmadnn::utilities::do_tensors_match(out.dtype(), GetMemoryType<CPU>::value, {x, bias, out}),
+                    "bias_add: tensors must have same dtype and memory type.");
 
-    if (out.get_memory_type() == HOST) {
+    FOR_ALL_DTYPES(out.dtype(), T, {
         const T *x_ptr = x.get_ptr<T>();
         const T *bias_ptr = bias.get_ptr<T>();
         T *out_ptr = out.get_ptr<T>();
         index_t x_rows = x.shape(0);
         index_t x_cols = x.shape(1);
-        // unsigned int x_size = x_rows*x_cols;
 
         /* TODO -- test openmp here */
         for (unsigned int r = 0; r < x_rows; r++) {
@@ -32,20 +30,12 @@ void bias_add(const Tensor &x, const Tensor &bias, Tensor &out) {
                 out_ptr[r * x_cols + c] = x_ptr[r * x_cols + c] + bias_ptr[r];
             }
         }
-    }
-#if defined(_HAS_CUDA_)
-    else {
-        bias_add_device<T>(x, bias, out);
-    }
-#endif
+    })
 }
 
-#define COMPILE_BIASADD(type) template void bias_add<type>(const Tensor &, const Tensor &, Tensor &out);
-CALL_FOR_ALL_TYPES(COMPILE_BIASADD)
-#undef COMPILE_BIASADD
 
 #if defined(_USE_CUDNN_BIAS_) /* temporarily undefined this until cudnn works */
-
+/* TODO -- investigate using CuDNN for this */
 #if defined(_HAS_CUDA_)
 template <typename T>
 void bias_add_device(const Tensor &x, const Tensor &bias, Tensor &out) {
diff --git a/src/math/bias_add_device.cu b/src/math/bias_add_device.cu
index bd1c49a..080f915 100644
--- a/src/math/bias_add_device.cu
+++ b/src/math/bias_add_device.cu
@@ -14,7 +14,7 @@ namespace magmadnn {
 namespace math {
 
 template <typename T>
-__global__ void kernel_bias_add_device(const T *x, const T *bias, T *out, unsigned int x_rows, unsigned int x_cols) {
+__global__ void kernel_bias_add(const T *x, const T *bias, T *out, unsigned int x_rows, unsigned int x_cols) {
     unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
     unsigned int stride = blockDim.x * gridDim.x;
 
@@ -23,17 +23,16 @@ __global__ void kernel_bias_add_device(const T *x, const T *bias, T *out, unsign
     }
 }
 
-template <typename T>
-void bias_add_device(const Tensor &x, const Tensor &bias, Tensor &out) {
-    unsigned int x_rows = x.shape(0);
-    unsigned int x_cols = x.shape(1);
+template <>
+void bias_add<GPU>(const Tensor &x, const Tensor &bias, Tensor &out) {
+    size_t x_rows = x.shape(0);
+    size_t x_cols = x.shape(1);
 
-    kernel_bias_add_device<<<(x_rows * x_cols + BLK_SIZE - 1) / BLK_SIZE, BLK_SIZE>>>(x.get_ptr<T>(), bias.get_ptr<T>(),
-                                                                                      out.get_ptr<T>(), x_rows, x_cols);
+    FOR_ALL_DTYPES(out.dtype(), T, {
+        kernel_bias_add<<<(x_rows * x_cols + BLK_SIZE - 1) / BLK_SIZE, BLK_SIZE>>>(x.get_ptr<T>(), bias.get_ptr<T>(),
+                                                                                   out.get_ptr<T>(), x_rows, x_cols);
+    })
 }
 
-#define COMPILE_BIASADD_DEVICE(type) template void bias_add_device<type>(const Tensor&, const Tensor&, Tensor&);
-CALL_FOR_ALL_TYPES(COMPILE_BIASADD_DEVICE)
-#undef COMPILE_BIASADD_DEVICE
 }  // namespace math
 }  // namespace magmadnn
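The diff above replaces compile-time instantiation over element types (the COMPILE_BIASADD / CALL_FOR_ALL_TYPES macros) with per-device specializations of a single bias_add template, each of which dispatches on the runtime dtype through FOR_ALL_DTYPES. The definition of that macro is not part of this diff; the sketch below shows one way such a dispatch macro is commonly written. It is a minimal illustration only: the DataType enum, its member names, and the main() driver are assumptions made here, not MagmaDNN code.

#include <cstdio>

// Illustrative stand-in for a runtime dtype enum (names assumed, not MagmaDNN's).
enum class DataType { FLOAT, DOUBLE };

// Hypothetical sketch of a FOR_ALL_DTYPES-style dispatch macro -- not the
// actual MagmaDNN definition.  The body is captured as __VA_ARGS__ so that
// commas inside it (for example a CUDA <<<grid, block>>> launch) do not
// split the macro argument list.
#define FOR_ALL_DTYPES(dtype_val, T, ...) \
    switch (dtype_val) {                  \
        case DataType::FLOAT: {           \
            using T = float;              \
            __VA_ARGS__                   \
        } break;                          \
        case DataType::DOUBLE: {          \
            using T = double;             \
            __VA_ARGS__                   \
        } break;                          \
    }

int main() {
    DataType dt = DataType::FLOAT;

    // The braced block is compiled once per supported dtype; the branch
    // matching the runtime value of dt is executed.
    FOR_ALL_DTYPES(dt, T, {
        T value = static_cast<T>(1.5);
        std::printf("sizeof(T) = %zu, value = %f\n", sizeof(T), static_cast<double>(value));
    })

    return 0;
}

With a pattern like this, the per-dtype body is stamped out for every supported element type inside one translation unit, so the explicit template instantiation lists removed by this diff are no longer needed, and supporting a new dtype only requires extending the dispatch macro.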