diff --git a/dlib/cuda/cpu_dlib.cpp b/dlib/cuda/cpu_dlib.cpp index 0e5ca5cee6..25a461d949 100644 --- a/dlib/cuda/cpu_dlib.cpp +++ b/dlib/cuda/cpu_dlib.cpp @@ -1620,122 +1620,175 @@ namespace dlib namespace ttimpl { - void softmax ( - const long num_locations, - const long num_channels, - tensor& dest, - const tensor& src - ) - { - DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k()); - DLIB_CASSERT(have_same_dimensions(dest,src)); - const auto d = dest.host(); - const auto s = src.host(); + void softmax( + const long num_locations, + const long num_channels, + tensor& dest, + const tensor& src, + operation_mode mode = operation_mode::CHANNEL_WISE + ) + { + DLIB_ASSERT(num_channels * num_locations == src.nr() * src.nc() * src.k()); + DLIB_CASSERT(have_same_dimensions(dest, src)); + const auto d = dest.host(); + const auto s = src.host(); - // Note that we subtract out the max values in each channel before applying - // exp() to avoid numeric overflow in the subsequent computations. Doing this - // doesn't change the resulting output, it just makes it more numerically - // stable. - for (long n = 0; n < src.num_samples(); ++n) - { - auto ss = s + num_locations*num_channels*n; - auto dd = d + num_locations*num_channels*n; - for (long i = 0; i < num_locations; ++i) + for (long n = 0; n < src.num_samples(); ++n) { - float max_val = -std::numeric_limits::infinity(); - for (long k = 0; k < num_channels; ++k) - max_val = std::max(max_val, ss[k*num_locations]); + auto ss = s + num_locations * num_channels * n; + auto dd = d + num_locations * num_channels * n; - for (long k = 0; k < num_channels; ++k) - dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val); + if (mode == operation_mode::CHANNEL_WISE) + { + for (long i = 0; i < num_locations; ++i) + { + float max_val = -std::numeric_limits::infinity(); + for (long k = 0; k < num_channels; ++k) + max_val = std::max(max_val, ss[k * num_locations]); - ++ss; - ++dd; - } - } + float sum = 0.0f; + for (long k = 0; k < num_channels; ++k) + { + dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val); + sum += dd[k * num_locations]; + } + for (long k = 0; k < num_channels; ++k) + dd[k * num_locations] /= sum; - // Now normalize each channel so they sum to 1. 
- for (long n = 0; n < src.num_samples(); ++n) - { - const auto dd = d + num_locations*num_channels*n; - for (long i = 0; i < num_locations; ++i) - { - const auto ddd = dd+i; + ++ss; + ++dd; + } + } + else if (mode == operation_mode::PLANE_WISE) + { + for (long k = 0; k < num_channels; ++k) + { + auto s_channel = ss + k * num_locations; + auto d_channel = dd + k * num_locations; + for (long r = 0; r < src.nr(); ++r) + { + float max_val = -std::numeric_limits::infinity(); + for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx) + max_val = std::max(max_val, s_channel[idx]); - float temp = 0; - for (long k = 0; k < num_channels; ++k) - temp += ddd[k*num_locations]; - for (long k = 0; k < num_channels; ++k) - ddd[k*num_locations] /= temp; + if (max_val == -std::numeric_limits::infinity()) + { + for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx) + d_channel[idx] = 0.0f; + } + else + { + float sum = 0.0f; + for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx) + { + d_channel[idx] = std::exp(s_channel[idx] - max_val); + sum += d_channel[idx]; + } + for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx) + d_channel[idx] /= sum; + } + } + } + } } } - } - void softmax_gradient ( - const long num_locations, - const long num_channels, - tensor& grad, - const tensor& dest, - const tensor& gradient_input - ) - { - DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k()); - DLIB_CASSERT(have_same_dimensions(grad,dest)); - DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); - const auto d = dest.host(); - const auto g = grad.host(); - const auto in = gradient_input.host(); - - - for (long n = 0; n < grad.num_samples(); ++n) + void softmax_gradient( + const long num_locations, + const long num_channels, + tensor& grad, + const tensor& dest, + const tensor& gradient_input, + operation_mode mode = operation_mode::CHANNEL_WISE + ) { - const auto d2 = d + num_locations*num_channels*n; - const auto g2 = g + num_locations*num_channels*n; - const auto in2 = in + num_locations*num_channels*n; - for (long i = 0; i < num_locations; ++i) + DLIB_ASSERT(num_channels * num_locations == grad.nr() * grad.nc() * grad.k()); + DLIB_CASSERT(have_same_dimensions(grad, dest)); + DLIB_CASSERT(have_same_dimensions(grad, gradient_input)); + + const auto d = dest.host(); + const auto g = grad.host(); + const auto in = gradient_input.host(); + for (long n = 0; n < grad.num_samples(); ++n) { - const auto d3 = d2+i; - const auto g3 = g2+i; - const auto in3 = in2+i; + const auto d2 = d + num_locations * num_channels * n; + const auto g2 = g + num_locations * num_channels * n; + const auto in2 = in + num_locations * num_channels * n; - float temp = 0; - for (long k = 0; k < num_channels; ++k) - temp += -d3[k*num_locations]*in3[k*num_locations]; - if (is_same_object(gradient_input, grad)) + if (mode == operation_mode::CHANNEL_WISE) { - for (long k = 0; k < num_channels; ++k) - g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]); + for (long i = 0; i < num_locations; ++i) + { + const auto d3 = d2 + i; + const auto g3 = g2 + i; + const auto in3 = in2 + i; + float sum = 0.0f; + for (long k = 0; k < num_channels; ++k) + sum += -d3[k * num_locations] * in3[k * num_locations]; + if (is_same_object(gradient_input, grad)) + { + for (long k = 0; k < num_channels; ++k) + g3[k * num_locations] = d3[k * num_locations] * (sum + in3[k * num_locations]); + } + else + { + for (long k = 0; k < num_channels; ++k) + g3[k * num_locations] += d3[k * num_locations] * (sum + in3[k * 
num_locations]); + } + } } - else + else if (mode == operation_mode::PLANE_WISE) { for (long k = 0; k < num_channels; ++k) - g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]); + { + const auto d_channel = d2 + k * num_locations; + const auto g_channel = g2 + k * num_locations; + const auto in_channel = in2 + k * num_locations; + for (long r = 0; r < grad.nr(); ++r) + { + float sum = 0.0f; + for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx) + sum += -d_channel[idx] * in_channel[idx]; + if (is_same_object(gradient_input, grad)) + { + for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx) + g_channel[idx] = d_channel[idx] * (sum + in_channel[idx]); + } + else + { + for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx) + g_channel[idx] += d_channel[idx] * (sum + in_channel[idx]); + } + } + } } } } } - } // ---------------------------------------------------------------------------------------- - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode ) { - DLIB_CASSERT(have_same_dimensions(dest,src)); - ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src); + DLIB_CASSERT(have_same_dimensions(dest, src)); + DLIB_CASSERT(mode == operation_mode::CHANNEL_WISE || mode == operation_mode::PLANE_WISE, "Invalid softmax mode"); + ttimpl::softmax(src.nr() * src.nc(), src.k(), dest, src, mode); } - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode ) { - DLIB_CASSERT(have_same_dimensions(grad,dest)); - DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); - ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input); + DLIB_CASSERT(have_same_dimensions(grad, dest)); + DLIB_CASSERT(have_same_dimensions(grad, gradient_input)); + ttimpl::softmax_gradient(grad.nr() * grad.nc(), grad.k(), grad, dest, gradient_input, mode); } // ------------------------------------------------------------------------------------ diff --git a/dlib/cuda/cpu_dlib.h b/dlib/cuda/cpu_dlib.h index f26795445d..f35b3c9728 100644 --- a/dlib/cuda/cpu_dlib.h +++ b/dlib/cuda/cpu_dlib.h @@ -291,15 +291,17 @@ namespace dlib // ----------------------------------------------------------------------------------- - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode = operation_mode::CHANNEL_WISE ); - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode = operation_mode::CHANNEL_WISE ); // ------------------------------------------------------------------------------------ diff --git a/dlib/cuda/cublas_dlibapi.cpp b/dlib/cuda/cublas_dlibapi.cpp index 376cc9f006..064e92c3df 100644 --- a/dlib/cuda/cublas_dlibapi.cpp +++ b/dlib/cuda/cublas_dlibapi.cpp @@ -101,55 +101,118 @@ namespace dlib const tensor& lhs, bool trans_lhs, const tensor& rhs, - bool trans_rhs + bool trans_rhs, + operation_mode mode ) { - // Recall that BLAS uses column major order so to deal with that we flip the - // order of the lhs and rhs arguments. - const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N; - const auto transb = trans_rhs ? 
CUBLAS_OP_T : CUBLAS_OP_N; - - const int dest_nr = dest.num_samples(); - const int dest_nc = dest.size()/dest_nr; - const int lhs_nr = lhs.num_samples(); - const int lhs_nc = lhs.size()/lhs_nr; - const int rhs_nr = rhs.num_samples(); - const int rhs_nc = rhs.size()/rhs_nr; - if (trans_lhs && trans_rhs) + if (mode == operation_mode::CHANNEL_WISE) { - DLIB_ASSERT( dest_nr == lhs_nc && - dest_nc == rhs_nr && - lhs_nr == rhs_nc) - } - else if (!trans_lhs && trans_rhs) - { - DLIB_ASSERT( dest_nr == lhs_nr && - dest_nc == rhs_nr && - lhs_nc == rhs_nc) - } - else if (trans_lhs && !trans_rhs) - { - DLIB_ASSERT( dest_nr == lhs_nc && - dest_nc == rhs_nc && - lhs_nr == rhs_nr) + // Recall that BLAS uses column major order so to deal with that we flip the + // order of the lhs and rhs arguments. + const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N; + const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N; + + const int dest_nr = dest.num_samples(); + const int dest_nc = dest.size() / dest_nr; + const int lhs_nr = lhs.num_samples(); + const int lhs_nc = lhs.size() / lhs_nr; + const int rhs_nr = rhs.num_samples(); + const int rhs_nc = rhs.size() / rhs_nr; + if (trans_lhs && trans_rhs) + { + DLIB_ASSERT(dest_nr == lhs_nc && + dest_nc == rhs_nr && + lhs_nr == rhs_nc) + } + else if (!trans_lhs && trans_rhs) + { + DLIB_ASSERT(dest_nr == lhs_nr && + dest_nc == rhs_nr && + lhs_nc == rhs_nc) + } + else if (trans_lhs && !trans_rhs) + { + DLIB_ASSERT(dest_nr == lhs_nc && + dest_nc == rhs_nc && + lhs_nr == rhs_nr) + } + else + { + DLIB_ASSERT(dest_nr == lhs_nr && + dest_nc == rhs_nc && + lhs_nc == rhs_nr) + } + + const int k = trans_rhs ? rhs_nc : rhs_nr; + CHECK_CUBLAS(cublasSgemm(context(), + transb, + transa, + dest_nc, dest_nr, k, + &alpha, + rhs.device(), rhs_nc, + lhs.device(), lhs_nc, + &beta, + dest.device(), dest_nc)); } - else + else if (mode == operation_mode::PLANE_WISE) { - DLIB_ASSERT( dest_nr == lhs_nr && - dest_nc == rhs_nc && - lhs_nc == rhs_nr) - } + const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N; + const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N; + + long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() }); + long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() }); + + auto is_matrix = [](const auto& tensor) { + return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) || + (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1)); + }; + const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest); + + if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) num_samples = num_channels = 1; + + size_t lhs_rows = lhs.nr(); + size_t lhs_cols = lhs.nc(); + if (lhs_is_matrix && (lhs.num_samples() > 1 || lhs.k() > 1)) { + lhs_rows = lhs.num_samples(); + lhs_cols = lhs.k(); + } + size_t rhs_rows = rhs.nr(); + size_t rhs_cols = rhs.nc(); + if (rhs_is_matrix && (rhs.num_samples() > 1 || rhs.k() > 1)) { + rhs_rows = rhs.num_samples(); + rhs_cols = rhs.k(); + } + size_t dest_rows = dest.nr(); + size_t dest_cols = dest.nc(); + if (dest_is_matrix && (dest.num_samples() > 1 || dest.k() > 1)) { + dest_rows = dest.num_samples(); + dest_cols = dest.k(); + } + + const size_t lhs_plane_size = lhs_rows * lhs_cols; + const size_t rhs_plane_size = rhs_rows * rhs_cols; + const size_t dest_plane_size = dest_rows * dest_cols; - const int k = trans_rhs ? 
rhs_nc : rhs_nr; - CHECK_CUBLAS(cublasSgemm(context(), - transb, - transa, - dest_nc, dest_nr, k, - &alpha, - rhs.device(), rhs_nc, - lhs.device(), lhs_nc, - &beta, - dest.device(),dest_nc)); + for (long b = 0; b < num_samples; ++b) + { + for (long c = 0; c < num_channels; ++c) + { + auto lhs_slice = lhs_is_matrix ? lhs.device() : + lhs.device() + (b * num_channels + c) * lhs_plane_size; + auto rhs_slice = rhs_is_matrix ? rhs.device() : + rhs.device() + (b * num_channels + c) * rhs_plane_size; + auto dest_slice = dest_is_matrix ? dest.device() : + dest.device() + (b * num_channels + c) * dest_plane_size; + const int k = trans_rhs ? rhs_cols : rhs_rows; + + CHECK_CUBLAS(cublasSgemm( + context(), transb, transa, dest_cols, dest_rows, k, + &alpha, rhs_slice, rhs_cols, lhs_slice, lhs_cols, + &beta, dest_slice, dest_cols + )); + } + } + } } // ------------------------------------------------------------------------------------ diff --git a/dlib/cuda/cublas_dlibapi.h b/dlib/cuda/cublas_dlibapi.h index b46fd25caa..45d8f426f0 100644 --- a/dlib/cuda/cublas_dlibapi.h +++ b/dlib/cuda/cublas_dlibapi.h @@ -9,9 +9,9 @@ #include "cuda_errors.h" namespace dlib -{ +{ namespace cuda - { + { // ----------------------------------------------------------------------------------- @@ -22,21 +22,52 @@ namespace dlib const tensor& lhs, bool trans_lhs, const tensor& rhs, - bool trans_rhs + bool trans_rhs, + operation_mode mode = operation_mode::CHANNEL_WISE ); - /*! - requires - - The dimensions of lhs and rhs must be compatible for matrix - multiplication. In particular: + /*! + requires + - The dimensions of lhs and rhs must be compatible for matrix multiplication. + The specific requirements depend on the mode: + + For CHANNEL_WISE mode (default): - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs) - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) - Let D == mat(dest) - D.nr() == L.nr() && D.nc() == R.nc() - (i.e. dest must be preallocated and have the correct output dimensions) + (i.e. dest must be preallocated and have the correct output dimensions) - L.nc() == R.nr() - ensures - - performs: dest = alpha*L*R + beta*mat(dest) - !*/ + + For PLANE_WISE mode: + - lhs.num_samples() == rhs.num_samples() && lhs.k() == rhs.k() + - If !trans_lhs && !trans_rhs: + lhs.nc() == rhs.nr() + dest.nr() == lhs.nr() && dest.nc() == rhs.nc() + - If trans_lhs && !trans_rhs: + lhs.nr() == rhs.nr() + dest.nr() == lhs.nc() && dest.nc() == rhs.nc() + - If !trans_lhs && trans_rhs: + lhs.nc() == rhs.nc() + dest.nr() == lhs.nr() && dest.nc() == rhs.nr() + - If trans_lhs && trans_rhs: + lhs.nr() == rhs.nc() + dest.nr() == lhs.nc() && dest.nc() == rhs.nr() + + ensures + - Performs matrix multiplication based on the specified mode: + + For CHANNEL_WISE mode: + - performs: dest = alpha*L*R + beta*mat(dest) + where L, R, and D are as defined above. + + For PLANE_WISE mode: + - Performs matrix multiplication for each corresponding 2D plane (nr x nc) + in lhs and rhs across all samples and channels. + - The operation is equivalent to performing the following for each sample + and channel: + dest[s][k] = alpha * (lhs[s][k] * rhs[s][k]) + beta * dest[s][k] + where [s][k] represents the 2D plane for sample s and channel k. 
+ !*/ // ------------------------------------------------------------------------------------ diff --git a/dlib/cuda/cudnn_dlibapi.cpp b/dlib/cuda/cudnn_dlibapi.cpp index c09845cc03..40dddb8d89 100644 --- a/dlib/cuda/cudnn_dlibapi.cpp +++ b/dlib/cuda/cudnn_dlibapi.cpp @@ -1528,61 +1528,123 @@ namespace dlib grad.device())); } - // ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------ - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode ) { - DLIB_CASSERT(have_same_dimensions(dest,src)); - if (src.size() == 0) - return; + DLIB_CASSERT(have_same_dimensions(dest, src)); + if (src.size() == 0) return; const float alpha = 1; const float beta = 0; - CHECK_CUDNN(cudnnSoftmaxForward(context(), - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - descriptor(src), - src.device(), - &beta, - descriptor(dest), - dest.device())); - } + if (mode == operation_mode::CHANNEL_WISE) + { + CHECK_CUDNN(cudnnSoftmaxForward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + else if (mode == operation_mode::PLANE_WISE) + { + const long num_samples = src.num_samples(); + const long num_channels = src.k(); + const size_t plane_size = src.nr() * src.nc(); + for (long s = 0; s < num_samples; ++s) + { + for (long k = 0; k < num_channels; ++k) + { + auto src_slice = src.device() + (s * num_channels + k) * plane_size; + auto dest_slice = dest.device() + (s * num_channels + k) * plane_size; + auto a_src_slice = alias_tensor(src.nr(), src.nc())(src, (s * num_channels + k) * plane_size); + auto a_dest_slice = alias_tensor(dest.nr(), dest.nc())(dest, (s * num_channels + k) * plane_size); + + CHECK_CUDNN(cudnnSoftmaxForward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(a_src_slice), + src_slice, + &beta, + descriptor(a_dest_slice), + dest_slice)); + } + } + } + } - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode ) { DLIB_CASSERT( - have_same_dimensions(dest,gradient_input) == true && - have_same_dimensions(dest,grad) == true ); - if (dest.size() == 0) - return; + have_same_dimensions(dest, gradient_input) == true && + have_same_dimensions(dest, grad) == true); + if (dest.size() == 0) return; const float alpha = 1; - const float beta = is_same_object(grad,gradient_input) ? 0 : 1; - CHECK_CUDNN(cudnnSoftmaxBackward(context(), - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - descriptor(dest), - dest.device(), - descriptor(gradient_input), - gradient_input.device(), - &beta, - descriptor(grad), - grad.device())); + const float beta = is_same_object(grad, gradient_input) ? 
0 : 1; + + if (mode == operation_mode::CHANNEL_WISE) + { + CHECK_CUDNN(cudnnSoftmaxBackward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + &beta, + descriptor(grad), + grad.device())); + } + else if (mode == operation_mode::PLANE_WISE) + { + const long num_samples = dest.num_samples(); + const long num_channels = dest.k(); + const size_t plane_size = dest.nr() * dest.nc(); + + for (long s = 0; s < num_samples; ++s) + { + for (long k = 0; k < num_channels; ++k) + { + auto dest_slice = dest.device() + (s * num_channels + k) * plane_size; + auto gi_slice = gradient_input.device() + (s * num_channels + k) * plane_size; + auto grad_slice = grad.device() + (s * num_channels + k) * plane_size; + auto a_dest_slice = alias_tensor(dest.nr(), dest.nc())(dest, (s * num_channels + k) * plane_size); + auto a_gi_slice = alias_tensor(gradient_input.nr(), gradient_input.nc())(gradient_input, (s * num_channels + k) * plane_size); + auto a_grad_slice = alias_tensor(grad.nr(), grad.nc())(grad, (s * num_channels + k) * plane_size); + + CHECK_CUDNN(cudnnSoftmaxBackward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(a_dest_slice), + dest_slice, + descriptor(a_gi_slice), + gi_slice, + &beta, + descriptor(a_grad_slice), + grad_slice)); + } + } + } } - // ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------ void softmax_all ( diff --git a/dlib/cuda/cudnn_dlibapi.h b/dlib/cuda/cudnn_dlibapi.h index 7b040a00c2..283be0d395 100644 --- a/dlib/cuda/cudnn_dlibapi.h +++ b/dlib/cuda/cudnn_dlibapi.h @@ -3,17 +3,19 @@ #ifndef DLIB_DNN_CuDNN_H_ #define DLIB_DNN_CuDNN_H_ +#include +#include "operation_mode.h" #ifdef DLIB_USE_CUDA - #include "cuda_errors.h" -#include #include "cuda_data_ptr.h" +#endif // DLIB_USE_CUDA namespace dlib { class tensor; class resizable_tensor; +#ifdef DLIB_USE_CUDA namespace cuda { @@ -352,13 +354,15 @@ namespace dlib void softmax ( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode = operation_mode::CHANNEL_WISE ); void softmax_gradient ( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode = operation_mode::CHANNEL_WISE ); // ------------------------------------------------------------------------------------ @@ -416,9 +420,8 @@ namespace dlib // ------------------------------------------------------------------------------------ } -} - #endif // DLIB_USE_CUDA +} #endif // DLIB_DNN_CuDNN_H_ diff --git a/dlib/cuda/operation_mode.h b/dlib/cuda/operation_mode.h new file mode 100644 index 0000000000..7d10f89c8f --- /dev/null +++ b/dlib/cuda/operation_mode.h @@ -0,0 +1,23 @@ +// Copyright (C) 2024 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_CUDA_OPERATION_MODE_H +#define DLIB_CUDA_OPERATION_MODE_H + +namespace dlib +{ +// ---------------------------------------------------------------------------------------- + + /*! + This enum is used to determine the mode of operation for certain functions + (such as gemm and softmax) in Dlib. It specifies whether the calculation + should be performed based on the matrix field in nr()xnc() or if the matrix + should be considered in num_samples()xk(). 
This helps in organizing tensor + computations more efficiently according to the required dimensions. + */ + enum class operation_mode { CHANNEL_WISE = 0, PLANE_WISE = 1 }; + +// ---------------------------------------------------------------------------------------- + +} // namespace dlib + +#endif // DLIB_CUDA_OPERATION_MODE_H \ No newline at end of file diff --git a/dlib/cuda/tensor_tools.cpp b/dlib/cuda/tensor_tools.cpp index 90a09a2884..069b4d4659 100644 --- a/dlib/cuda/tensor_tools.cpp +++ b/dlib/cuda/tensor_tools.cpp @@ -208,33 +208,99 @@ namespace dlib { namespace tt const tensor& lhs, bool trans_lhs, const tensor& rhs, - bool trans_rhs + bool trans_rhs, + operation_mode mode ) { #ifdef DLIB_USE_CUDA - cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs); + cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs, mode); #else - if (beta != 0) + if (mode == operation_mode::CHANNEL_WISE) { - if (trans_lhs && trans_rhs) - dest = alpha*trans(mat(lhs))*trans(mat(rhs)) + beta*mat(dest); - else if (!trans_lhs && trans_rhs) - dest = alpha*mat(lhs)*trans(mat(rhs)) + beta*mat(dest); - else if (trans_lhs && !trans_rhs) - dest = alpha*trans(mat(lhs))*mat(rhs) + beta*mat(dest); + if (beta != 0) + { + if (trans_lhs && trans_rhs) + dest = alpha * trans(mat(lhs)) * trans(mat(rhs)) + beta * mat(dest); + else if (!trans_lhs && trans_rhs) + dest = alpha * mat(lhs) * trans(mat(rhs)) + beta * mat(dest); + else if (trans_lhs && !trans_rhs) + dest = alpha * trans(mat(lhs)) * mat(rhs) + beta * mat(dest); + else + dest = alpha * mat(lhs) * mat(rhs) + beta * mat(dest); + } else - dest = alpha*mat(lhs)*mat(rhs) + beta*mat(dest); + { + if (trans_lhs && trans_rhs) + dest = alpha * trans(mat(lhs)) * trans(mat(rhs)); + else if (!trans_lhs && trans_rhs) + dest = alpha * mat(lhs) * trans(mat(rhs)); + else if (trans_lhs && !trans_rhs) + dest = alpha * trans(mat(lhs)) * mat(rhs); + else + dest = alpha * mat(lhs) * mat(rhs); + } } - else + else if (mode == operation_mode::PLANE_WISE) { - if (trans_lhs && trans_rhs) - dest = alpha*trans(mat(lhs))*trans(mat(rhs)); - else if (!trans_lhs && trans_rhs) - dest = alpha*mat(lhs)*trans(mat(rhs)); - else if (trans_lhs && !trans_rhs) - dest = alpha*trans(mat(lhs))*mat(rhs); - else - dest = alpha*mat(lhs)*mat(rhs); + auto is_matrix = [](const auto& tensor) { + return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) || + (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1)); + }; + + long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() }); + long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() }); + const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest); + + if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) { + num_samples = num_channels = 1; + } + + long lhs_rows = (lhs_is_matrix && lhs.num_samples() > 1) ? lhs.num_samples() : lhs.nr(); + long lhs_cols = (lhs_is_matrix && lhs.k() > 1) ? lhs.k() : lhs.nc(); + long rhs_rows = (rhs_is_matrix && rhs.num_samples() > 1) ? rhs.num_samples() : rhs.nr(); + long rhs_cols = (rhs_is_matrix && rhs.k() > 1) ? rhs.k() : rhs.nc(); + long dest_rows = (dest_is_matrix && dest.num_samples() > 1) ? dest.num_samples() : dest.nr(); + long dest_cols = (dest_is_matrix && dest.k() > 1) ? 
dest.k() : dest.nc(); + + const size_t lhs_plane_size = lhs_rows * lhs_cols; + const size_t rhs_plane_size = rhs_rows * rhs_cols; + const size_t dest_plane_size = dest_rows * dest_cols; + + for (long b = 0; b < num_samples; ++b) + { + for (long c = 0; c < num_channels; ++c) + { + auto lhs_slice = lhs_is_matrix ? alias_tensor(lhs_rows, lhs_cols)(lhs, 0) : + alias_tensor(lhs_rows, lhs_cols)(lhs, (b * num_channels + c) * lhs_plane_size); + auto rhs_slice = rhs_is_matrix ? alias_tensor(rhs_rows, rhs_cols)(rhs, 0) : + alias_tensor(rhs_rows, rhs_cols)(rhs, (b * num_channels + c) * rhs_plane_size); + auto dest_slice = dest_is_matrix ? alias_tensor(dest_rows, dest_cols)(dest, 0) : + alias_tensor(dest_rows, dest_cols)(dest, (b * num_channels + c) * dest_plane_size); + + if (beta != 0) + { + if (trans_lhs && trans_rhs) + dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice)) + beta * mat(dest_slice); + else if (!trans_lhs && trans_rhs) + dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice)) + beta * mat(dest_slice); + else if (trans_lhs && !trans_rhs) + dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice) + beta * mat(dest_slice); + else + dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice) + beta * mat(dest_slice); + } + else + { + if (trans_lhs && trans_rhs) + dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice)); + else if (!trans_lhs && trans_rhs) + dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice)); + else if (trans_lhs && !trans_rhs) + dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice); + else + dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice); + } + } + } } #endif } @@ -817,31 +883,32 @@ namespace dlib { namespace tt #endif } -// ---------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------- - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode ) { #ifdef DLIB_USE_CUDA - cuda::softmax(dest,src); + cuda::softmax(dest, src, mode); #else - cpu::softmax(dest,src); + cpu::softmax(dest, src, mode); #endif } - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode ) { #ifdef DLIB_USE_CUDA - cuda::softmax_gradient(grad, dest, gradient_input); + cuda::softmax_gradient(grad, dest, gradient_input, mode); #else - cpu::softmax_gradient(grad, dest, gradient_input); + cpu::softmax_gradient(grad, dest, gradient_input, mode); #endif } diff --git a/dlib/cuda/tensor_tools.h b/dlib/cuda/tensor_tools.h index 8ea593a429..17649603d9 100644 --- a/dlib/cuda/tensor_tools.h +++ b/dlib/cuda/tensor_tools.h @@ -165,21 +165,56 @@ namespace dlib { namespace tt const tensor& lhs, bool trans_lhs, const tensor& rhs, - bool trans_rhs + bool trans_rhs, + operation_mode mode = operation_mode::CHANNEL_WISE ); /*! requires - dest does not alias the memory of lhs or rhs - The dimensions of lhs and rhs must be compatible for matrix multiplication. - In particular: - - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs) - - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) - - Let D == mat(dest) - - D.nr() == L.nr() && D.nc() == R.nc() - (i.e. dest must be preallocated and have the correct output dimensions) - - L.nc() == R.nr() - ensures - - performs: dest = alpha*L*R + beta*mat(dest) + The specific requirements depend on the mode: + + For CHANNEL_WISE mode (default): + - Let L == trans_lhs ? 
trans(mat(lhs)) : mat(lhs) + - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) + - Let D == mat(dest) + - D.nr() == L.nr() && D.nc() == R.nc() + (i.e. dest must be preallocated and have the correct output dimensions) + - L.nc() == R.nr() + + For PLANE_WISE mode: + - lhs.num_samples() == rhs.num_samples() && lhs.k() == rhs.k() + - If !trans_lhs && !trans_rhs: + lhs.nc() == rhs.nr() + dest.nr() == lhs.nr() && dest.nc() == rhs.nc() + - If trans_lhs && !trans_rhs: + lhs.nr() == rhs.nr() + dest.nr() == lhs.nc() && dest.nc() == rhs.nc() + - If !trans_lhs && trans_rhs: + lhs.nc() == rhs.nc() + dest.nr() == lhs.nr() && dest.nc() == rhs.nr() + - If trans_lhs && trans_rhs: + lhs.nr() == rhs.nc() + dest.nr() == lhs.nc() && dest.nc() == rhs.nr() + + ensures + - Performs matrix multiplication based on the specified mode: + + For CHANNEL_WISE mode: + - performs: dest = alpha*L*R + beta*mat(dest) + where L, R, and D are as defined above. + + For PLANE_WISE mode: + - Performs matrix multiplication for each corresponding 2D plane (nr x nc) + in lhs and rhs across all samples and channels. + - The operation is equivalent to performing the following for each sample + and channel: + dest[s][k] = alpha * (lhs[s][k] * rhs[s][k]) + beta * dest[s][k] + where [s][k] represents the 2D plane for sample s and channel k. + + Note that the PLANE_WISE mode is particularly useful for operations like attention + mechanisms in neural networks, where you want to perform matrix multiplications + on 2D planes of 4D tensors while preserving the sample and channel dimensions. !*/ // ---------------------------------------------------------------------------------------- @@ -1386,44 +1421,54 @@ namespace dlib { namespace tt // ---------------------------------------------------------------------------------------- - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode = operation_mode::CHANNEL_WISE ); /*! requires - have_same_dimensions(dest, src) == true - ensures - - Note that the softmax function is a vector valued function: - s(x) == exp(x)/sum(exp(x)) - - Computes the softmax function on src and writes the results to dest. The - softmax is computed per spatial location across the different channels at - each location. That is, softmax() outputs a new tensor, #dest, where each of + - mode == CHANNEL_WISE || mode == PLANE_WISE + ensures + - Note that the softmax function is a vector valued function: + s(x) == exp(x)/sum(exp(x)) + - Computes the softmax function on src and writes the results to dest. + - If mode == CHANNEL_WISE: + The softmax is computed per spatial location across the different channels at + each location. That is, softmax() outputs a new tensor, #dest, where each of the spatial locations in dest (i.e. image idx, row idx, and column idx) - contains the output of s() evaluated over the channel values at each - location. + contains the output of s() evaluated over the channel values at each location. + - If mode == PLANE_WISE: + The softmax is computed across entire planes (nr x nc) of the input tensor. + This is useful for operations in Large Language Models (LLMs) and other + applications requiring 2D tensor processing. - This function supports in-place operation, i.e. having is_same_object(dest, src)==true !*/ - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode = operation_mode::CHANNEL_WISE ); /*! 
requires - - have_same_dimensions(dest,gradient_input) == true - - have_same_dimensions(dest,grad) == true - ensures - - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor. - Then let f(SRC) == dot(gradient_input,dest). Then this function computes the - gradient of f() with respect to SRC and stores it to grad. Moreover, if - is_same_object(grad,gradient_input)==true then the output is assigned to - grad, replacing its previous contents. Otherwise the output is added to - grad. + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - mode == CHANNEL_WISE || mode == PLANE_WISE + ensures + - We interpret dest as the output of softmax(dest,SRC,mode) for some SRC tensor. + Then let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to grad. + - The gradient computation takes into account the specified mode: + - If mode == CHANNEL_WISE: The gradient is computed per spatial location across channels. + - If mode == PLANE_WISE: The gradient is computed across entire planes of the tensor. - This function supports in-place operation, i.e. having - is_same_object(grad, gradient_input)==true + is_same_object(grad, gradient_input)==true !*/ // ---------------------------------------------------------------------------------------- diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h index f34e7a8390..023ccbf810 100644 --- a/dlib/dnn/layers.h +++ b/dlib/dnn/layers.h @@ -13,6 +13,7 @@ #include "../cuda/tensor_tools.h" #include "../vectorstream.h" #include "utilities.h" +#include "../cuda/operation_mode.h" #include @@ -2789,6 +2790,100 @@ namespace dlib using mult_prev9_ = mult_prev_; using mult_prev10_ = mult_prev_; +// ---------------------------------------------------------------------------------------- + + template < + template class tag + > + class multm_prev_ + { + public: + const static unsigned long id = tag_id::id; + + multm_prev_() {} + template void setup(const SUBNET& /*sub*/) {} + + template + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto& t1 = sub.get_output(); + auto& t2 = layer(sub).get_output(); + output.set_size(t1.num_samples(), t1.k(), t1.nr(), t2.nc()); + + tt::gemm(0, output, 1, t1, false, t2, false, operation_mode::PLANE_WISE); + } + + template + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + auto& t1 = sub.get_output(); + auto& t2 = layer(sub).get_output(); + auto& prev = sub.get_gradient_input(); + auto& prev_tag = layer(sub).get_gradient_input(); + + tt::gemm(1, prev, 1, gradient_input, false, t2, true, operation_mode::PLANE_WISE); + tt::gemm(1, prev_tag, 1, t1, true, gradient_input, false, operation_mode::PLANE_WISE); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + inline dpoint map_input_to_output(const dpoint& p) const { return p; } + inline dpoint map_output_to_input(const dpoint& p) const { return p; } + + friend void serialize(const multm_prev_& /*item*/, std::ostream& out) + { + serialize("multm_prev_", out); + } + friend void deserialize(multm_prev_& /*item*/, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "multm_prev_") + throw serialization_error("Unexpected version '" + version + "' found while 
deserializing dlib::multm_prev_."); + } + + friend std::ostream& operator<<(std::ostream& out, const multm_prev_& /*item*/) + { + out << "multm_prev" << id; + return out; + } + friend void to_xml(const multm_prev_& /*item*/, std::ostream& out) + { + out << "\n"; + } + + private: + resizable_tensor params; // unused + }; + + template < + template class tag, + typename SUBNET + > + using multm_prev = add_layer, SUBNET>; + + template using multm_prev1 = multm_prev; + template using multm_prev2 = multm_prev; + template using multm_prev3 = multm_prev; + template using multm_prev4 = multm_prev; + template using multm_prev5 = multm_prev; + template using multm_prev6 = multm_prev; + template using multm_prev7 = multm_prev; + template using multm_prev8 = multm_prev; + template using multm_prev9 = multm_prev; + template using multm_prev10 = multm_prev; + using multm_prev1_ = multm_prev_; + using multm_prev2_ = multm_prev_; + using multm_prev3_ = multm_prev_; + using multm_prev4_ = multm_prev_; + using multm_prev5_ = multm_prev_; + using multm_prev6_ = multm_prev_; + using multm_prev7_ = multm_prev_; + using multm_prev8_ = multm_prev_; + using multm_prev9_ = multm_prev_; + using multm_prev10_ = multm_prev_; + // ---------------------------------------------------------------------------------------- template < @@ -3985,31 +4080,28 @@ namespace dlib // ---------------------------------------------------------------------------------------- + template class softmax_ { public: - softmax_() - { - } + softmax_() {} template - void setup (const SUBNET& /*sub*/) - { - } + void setup(const SUBNET& /*sub*/) {} void forward_inplace(const tensor& input, tensor& output) { - tt::softmax(output, input); - } + tt::softmax(output, input, s_mode_); + } void backward_inplace( const tensor& computed_output, - const tensor& gradient_input, - tensor& data_grad, - tensor& + const tensor& gradient_input, + tensor& data_grad, + tensor& /*params_grad*/ ) { - tt::softmax_gradient(data_grad, computed_output, gradient_input); + tt::softmax_gradient(data_grad, computed_output, gradient_input, s_mode_); } const tensor& get_layer_params() const { return params; } @@ -4025,26 +4117,31 @@ namespace dlib std::string version; deserialize(version, in); if (version != "softmax_") - throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_."); + throw serialization_error("Unexpected version '" + version + "' found while deserializing dlib::softmax_."); } friend std::ostream& operator<<(std::ostream& out, const softmax_& /*item*/) { - out << "softmax"; + out << "softmax (mode=" << (s_mode_ == operation_mode::CHANNEL_WISE + ? 
"channel_wise" : "plane_wise") << ")"; return out; } friend void to_xml(const softmax_& /*item*/, std::ostream& out) { - out << "\n"; + out << "\n"; } private: - resizable_tensor params; + resizable_tensor params; // unused }; template - using softmax = add_layer; + using softmax = add_layer, SUBNET>; + + template + using softmaxm = add_layer, SUBNET>; // ---------------------------------------------------------------------------------------- @@ -5088,4 +5185,4 @@ namespace dlib } -#endif // DLIB_DNn_LAYERS_H_ \ No newline at end of file +#endif // DLIB_DNn_LAYERS_H_ diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h index 0d951e7804..99fe91401c 100644 --- a/dlib/dnn/layers_abstract.h +++ b/dlib/dnn/layers_abstract.h @@ -5,6 +5,7 @@ #include "../cuda/tensor_abstract.h" #include "core_abstract.h" +#include "../cuda/operation_mode.h" namespace dlib @@ -2953,44 +2954,67 @@ namespace dlib // ---------------------------------------------------------------------------------------- + template class softmax_ { /*! WHAT THIS OBJECT REPRESENTS This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface - defined above. In particular, it defines a softmax layer. To be precise, - we define the softmax function s(x) as: - s(x) == exp(x)/sum(exp(x)) - where x is a vector. Then this layer treats its input tensor as a - collection of multi-channel images and applies s() to each spatial location - in each image. In each application, the tensor::k() channel elements at - each position are input to s() and then replaced by the outputs of s(). + defined above. It defines a softmax layer with two modes of operation: + channel-wise and plane-wise. + + The softmax function s(x) is defined as: + s(x) == exp(x)/sum(exp(x)) + where x is a vector. + + 1. Channel-wise mode (s_mode_ == CHANNEL_WISE): + This mode treats the input tensor as a collection of multi-channel images + and applies s() to each spatial location in each image. The tensor::k() + channel elements at each position are input to s() and then replaced by + the outputs of s(). + + 2. Plane-wise mode (s_mode_ == PLANE_WISE): + This mode applies the softmax function across entire planes (nr x nc) of + the input tensor, useful for operations in Large Language Models (LLMs) + and other applications requiring 2D tensor processing. + + In both modes, the sum of the outputs of s() will always be equal to 1 for + each application of the function. - This means that, for example, if you collapsed each output image to a 1 - channel image by adding the channels then you would end up with images - where each pixel value was 1. This is because the sum of the outputs of - s() will always be equal to 1. + TEMPLATE PARAMETERS + - s_mode_: Determines the mode of operation (CHANNEL_WISE or PLANE_WISE) !*/ public: + softmax_(); - softmax_( - ); - - template void setup (const SUBNET& sub); + template void setup(const SUBNET& sub); void forward_inplace(const tensor& input, tensor& output); - void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); - const tensor& get_layer_params() const; - tensor& get_layer_params(); + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& params_grad + ); + const tensor& get_layer_params() const; + tensor& get_layer_params(); /*! - These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ - interface. 
Note that this layer doesn't have any parameters, so the tensor + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor returned by get_layer_params() is always empty. !*/ + + friend void serialize(const softmax_& item, std::ostream& out); + friend void deserialize(softmax_& item, std::istream& in); + friend std::ostream& operator<<(std::ostream& out, const softmax_& item); + friend void to_xml(const softmax_& item, std::ostream& out); }; template - using softmax = add_layer; + using softmax = add_layer, SUBNET>; + + template + using softmaxm = add_layer, SUBNET>; // ---------------------------------------------------------------------------------------- @@ -3175,6 +3199,85 @@ namespace dlib using mult_prev9_ = mult_prev_; using mult_prev10_ = mult_prev_; +// ---------------------------------------------------------------------------------------- + + template < + template class tag + > + class multm_prev_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer performs matrix multiplication on the output + of two previous layers. It multiplies the tensor from its immediate + predecessor layer, sub.get_output(), with the tensor from a deeper layer, + layer(sub).get_output(). + + The tag template argument specifies which layer to multiply with the + output of the previous layer. The result of this multiplication is + output by multm_prev_. The multiplication is performed using a modified + version of gemm() to account for the 2D matrix dimension in the nr()xnc() + planes of Dlib's 4D tensors. + + This layer is similar to mult_prev_, but it considers the full matrix + in the nr()xnc() planes of the tensor, rather than just the upper + num_samples()xk() plane. This makes it suitable for implementing + mechanisms like attention, especially when the k() channel plane is + used to model multiple heads for parallel matrix processing. + + The output tensor dimensions are determined as follows: + - output.num_samples() == t1.num_samples() + - output.k() == t1.k() + - output.nr() == t1.nr() + - output.nc() == t2.nc() + where t1 is sub.get_output() and t2 is layer(sub).get_output(). + !*/ + + public: + multm_prev_( + ); + + template void setup (const SUBNET& sub); + template void forward(const SUBNET& sub, resizable_tensor& output); + template void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template < + template class tag, + typename SUBNET + > + using multm_prev = add_layer, SUBNET>; + + // Here we add some convenient aliases for using multm_prev_ with the tag layers. 
+    template <typename SUBNET> using multm_prev1 = multm_prev<tag1, SUBNET>;
+    template <typename SUBNET> using multm_prev2 = multm_prev<tag2, SUBNET>;
+    template <typename SUBNET> using multm_prev3 = multm_prev<tag3, SUBNET>;
+    template <typename SUBNET> using multm_prev4 = multm_prev<tag4, SUBNET>;
+    template <typename SUBNET> using multm_prev5 = multm_prev<tag5, SUBNET>;
+    template <typename SUBNET> using multm_prev6 = multm_prev<tag6, SUBNET>;
+    template <typename SUBNET> using multm_prev7 = multm_prev<tag7, SUBNET>;
+    template <typename SUBNET> using multm_prev8 = multm_prev<tag8, SUBNET>;
+    template <typename SUBNET> using multm_prev9 = multm_prev<tag9, SUBNET>;
+    template <typename SUBNET> using multm_prev10 = multm_prev<tag10, SUBNET>;
+    using multm_prev1_ = multm_prev_<tag1>;
+    using multm_prev2_ = multm_prev_<tag2>;
+    using multm_prev3_ = multm_prev_<tag3>;
+    using multm_prev4_ = multm_prev_<tag4>;
+    using multm_prev5_ = multm_prev_<tag5>;
+    using multm_prev6_ = multm_prev_<tag6>;
+    using multm_prev7_ = multm_prev_<tag7>;
+    using multm_prev8_ = multm_prev_<tag8>;
+    using multm_prev9_ = multm_prev_<tag9>;
+    using multm_prev10_ = multm_prev_<tag10>;
+
 // ----------------------------------------------------------------------------------------
 
     template <
diff --git a/dlib/dnn/visitors.h b/dlib/dnn/visitors.h
index 726f3b200e..589e3556ef 100644
--- a/dlib/dnn/visitors.h
+++ b/dlib/dnn/visitors.h
@@ -841,6 +841,16 @@ namespace dlib
             update(i);
         }
 
+        template
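
Usage sketch (not part of the patch): the snippet below shows how the new PLANE_WISE mode of tt::gemm and tt::softmax is meant to be driven at the tensor level. It assumes a dlib build with this patch applied; the tensor shapes and variable names are arbitrary, and note that the PLANE_WISE softmax normalizes each row within every nr() x nc() plane.

#include <dlib/dnn.h>
#include <iostream>

int main()
{
    using namespace dlib;

    // A batch of 2 samples with 3 channels ("heads"); every nr() x nc() plane
    // is treated as an independent matrix by the PLANE_WISE code paths.
    resizable_tensor a(2, 3, 4, 6);   // planes of size 4x6
    resizable_tensor b(2, 3, 6, 5);   // planes of size 6x5
    resizable_tensor c(2, 3, 4, 5);   // receives the per-plane products
    tt::tensor_rand rnd;
    rnd.fill_uniform(a);
    rnd.fill_uniform(b);

    // c[s][k] = a[s][k] * b[s][k] for every sample s and channel k.
    tt::gemm(0, c, 1, a, false, b, false, operation_mode::PLANE_WISE);

    // Row-wise softmax inside each 4x5 plane of c, as opposed to the default
    // CHANNEL_WISE softmax across the k() dimension at each spatial location.
    resizable_tensor s;
    s.copy_size(c);
    tt::softmax(s, c, operation_mode::PLANE_WISE);

    std::cout << "first plane of the result:\n" << image_plane(s, 0, 0) << std::endl;
    return 0;
}

In a network definition the same pattern is what the new layers express: multm_prev1 multiplies the nr() x nc() planes of the current layer with those of the layer under tag1, and softmaxm applies the plane-wise softmax shown above.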