Add multm_prev_ layer and enhance gemm() function for PLANE_WISE operations (davisking#3020)

* Fix Stride Indexing Bugs in `reorg` and `reorg_gradient` Functions (CPU & CUDA) and Add `add_to` Parameter

* Add the missing 'add_to' parameter to the CUDA call reorg_gradient.launch_kernel()

* Cleanup: remove using namespace std; (davisking#3016)

* remove using namespace std from headers

* more std::

* more std::

* more std:: on windows stuff

* remove uses of using namespace std::chrono

* do not use C++17 features

* Add Davis suggestion

* revert some more stuff

* revert removing include

* more std::chrono stuff

* fix build error

* Adjust comment formatting to be like other dlib comments

* Add positional encodings layer to Dlib

* Add multm_prev layer and enhance gemm() function for PLANE_WISE operations

* Updates

* Updates

* Resynchronization with tril_ class

* Delete .vscode/settings.json

Not required for the merge

* Remove duplicates

* Small improvements to PLANE_WISE in gemm() function

* Same improvements for the CPU version

* Introducing a new enum for operation modes in tensor computations

* Remove a duplicated test call in the dnn tests

* Remove duplicated declaration

* Comment fixed

* Fixing the CUDA compilation

* Merging with updated softmax_ layer

* Fixing header for CPU compilation

* Adding a missing cast

* Test fixed to use the new operation_mode enum

* softmaxm test fixed

* Enum test removed

* Enum test removed

* Fixing indentation

* Fixing indentation

* Test removed

* Move the operation_mode enumeration to its own header

* Use operation_mode instead of unsigned long

---------

Co-authored-by: Adrià <[email protected]>
Co-authored-by: Davis King <[email protected]>
3 people authored Dec 20, 2024
1 parent dfbee6d commit 230c0b0
Showing 13 changed files with 979 additions and 286 deletions.
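For orientation before the diffs: the softmax kernels below gain an operation_mode switch. CHANNEL_WISE keeps the historical behavior (normalize across the k channels at each spatial location), while the new PLANE_WISE mode applies a row-wise softmax to every channel plane. Below is a minimal usage sketch against the CPU entry point declared in cpu_dlib.h; the include, tensor shape, and fill values are illustrative only, and in a normal build the dispatching tt::softmax wrapper would presumably be the usual call site.

#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    // One sample, 2 channels, 3x4 planes; values are arbitrary.
    resizable_tensor src(1, 2, 3, 4), dest;
    dest.copy_size(src);
    float* p = src.host();
    for (size_t i = 0; i < src.size(); ++i)
        p[i] = static_cast<float>(i % 5);

    // Historical behavior: softmax over the 2 channels at each (r,c) location,
    // so the 2 values at every location sum to 1.
    cpu::softmax(dest, src, operation_mode::CHANNEL_WISE);

    // New in this commit: softmax along each row of every channel plane,
    // so each row of 4 values sums to 1.
    cpu::softmax(dest, src, operation_mode::PLANE_WISE);
}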
221 changes: 137 additions & 84 deletions dlib/cuda/cpu_dlib.cpp
@@ -1620,122 +1620,175 @@ namespace dlib

namespace ttimpl
{
void softmax(
    const long num_locations,
    const long num_channels,
    tensor& dest,
    const tensor& src,
    operation_mode mode = operation_mode::CHANNEL_WISE
)
{
    DLIB_ASSERT(num_channels * num_locations == src.nr() * src.nc() * src.k());
    DLIB_CASSERT(have_same_dimensions(dest, src));
    const auto d = dest.host();
    const auto s = src.host();

    // Note that we subtract out the max values in each channel before applying
    // exp() to avoid numeric overflow in the subsequent computations. Doing this
    // doesn't change the resulting output, it just makes it more numerically
    // stable.
    for (long n = 0; n < src.num_samples(); ++n)
    {
        auto ss = s + num_locations * num_channels * n;
        auto dd = d + num_locations * num_channels * n;

        if (mode == operation_mode::CHANNEL_WISE)
        {
            for (long i = 0; i < num_locations; ++i)
            {
                float max_val = -std::numeric_limits<float>::infinity();
                for (long k = 0; k < num_channels; ++k)
                    max_val = std::max(max_val, ss[k * num_locations]);

                float sum = 0.0f;
                for (long k = 0; k < num_channels; ++k)
                {
                    dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val);
                    sum += dd[k * num_locations];
                }
                for (long k = 0; k < num_channels; ++k)
                    dd[k * num_locations] /= sum;

                ++ss;
                ++dd;
            }
        }
        else if (mode == operation_mode::PLANE_WISE)
        {
            for (long k = 0; k < num_channels; ++k)
            {
                auto s_channel = ss + k * num_locations;
                auto d_channel = dd + k * num_locations;
                for (long r = 0; r < src.nr(); ++r)
                {
                    float max_val = -std::numeric_limits<float>::infinity();
                    for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
                        max_val = std::max(max_val, s_channel[idx]);

                    if (max_val == -std::numeric_limits<float>::infinity())
                    {
                        for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
                            d_channel[idx] = 0.0f;
                    }
                    else
                    {
                        float sum = 0.0f;
                        for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
                        {
                            d_channel[idx] = std::exp(s_channel[idx] - max_val);
                            sum += d_channel[idx];
                        }
                        for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
                            d_channel[idx] /= sum;
                    }
                }
            }
        }
    }
}

void softmax_gradient(
    const long num_locations,
    const long num_channels,
    tensor& grad,
    const tensor& dest,
    const tensor& gradient_input,
    operation_mode mode = operation_mode::CHANNEL_WISE
)
{
    DLIB_ASSERT(num_channels * num_locations == grad.nr() * grad.nc() * grad.k());
    DLIB_CASSERT(have_same_dimensions(grad, dest));
    DLIB_CASSERT(have_same_dimensions(grad, gradient_input));

    const auto d = dest.host();
    const auto g = grad.host();
    const auto in = gradient_input.host();
    for (long n = 0; n < grad.num_samples(); ++n)
    {
        const auto d2 = d + num_locations * num_channels * n;
        const auto g2 = g + num_locations * num_channels * n;
        const auto in2 = in + num_locations * num_channels * n;

        if (mode == operation_mode::CHANNEL_WISE)
        {
            for (long i = 0; i < num_locations; ++i)
            {
                const auto d3 = d2 + i;
                const auto g3 = g2 + i;
                const auto in3 = in2 + i;
                float sum = 0.0f;
                for (long k = 0; k < num_channels; ++k)
                    sum += -d3[k * num_locations] * in3[k * num_locations];
                if (is_same_object(gradient_input, grad))
                {
                    for (long k = 0; k < num_channels; ++k)
                        g3[k * num_locations] = d3[k * num_locations] * (sum + in3[k * num_locations]);
                }
                else
                {
                    for (long k = 0; k < num_channels; ++k)
                        g3[k * num_locations] += d3[k * num_locations] * (sum + in3[k * num_locations]);
                }
            }
        }
        else if (mode == operation_mode::PLANE_WISE)
        {
            for (long k = 0; k < num_channels; ++k)
            {
                const auto d_channel = d2 + k * num_locations;
                const auto g_channel = g2 + k * num_locations;
                const auto in_channel = in2 + k * num_locations;
                for (long r = 0; r < grad.nr(); ++r)
                {
                    float sum = 0.0f;
                    for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
                        sum += -d_channel[idx] * in_channel[idx];
                    if (is_same_object(gradient_input, grad))
                    {
                        for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
                            g_channel[idx] = d_channel[idx] * (sum + in_channel[idx]);
                    }
                    else
                    {
                        for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
                            g_channel[idx] += d_channel[idx] * (sum + in_channel[idx]);
                    }
                }
            }
        }
    }
}
} // namespace ttimpl

// ----------------------------------------------------------------------------------------

void softmax(
    tensor& dest,
    const tensor& src,
    operation_mode mode
)
{
    DLIB_CASSERT(have_same_dimensions(dest, src));
    DLIB_CASSERT(mode == operation_mode::CHANNEL_WISE || mode == operation_mode::PLANE_WISE, "Invalid softmax mode");
    ttimpl::softmax(src.nr() * src.nc(), src.k(), dest, src, mode);
}

void softmax_gradient(
    tensor& grad,
    const tensor& dest,
    const tensor& gradient_input,
    operation_mode mode
)
{
    DLIB_CASSERT(have_same_dimensions(grad, dest));
    DLIB_CASSERT(have_same_dimensions(grad, gradient_input));
    ttimpl::softmax_gradient(grad.nr() * grad.nc(), grad.k(), grad, dest, gradient_input, mode);
}

// ------------------------------------------------------------------------------------
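The softmax_gradient kernels above apply the same closed-form Jacobian-vector product to every normalization group, i.e. to each channel column in CHANNEL_WISE mode and to each plane row in PLANE_WISE mode: g_i = y_i * (dy_i - sum_j y_j * dy_j). Below is a framework-free restatement of that per-group update (an illustrative sketch, not dlib code).

#include <vector>
#include <cstddef>

// Computes g[i] = y[i] * (dy[i] - sum_j y[j]*dy[j]) for one normalization
// group, which is the update the loops above apply per channel column
// (CHANNEL_WISE) or per plane row (PLANE_WISE).
std::vector<float> softmax_backward_group(
    const std::vector<float>& y,   // softmax outputs for the group
    const std::vector<float>& dy   // upstream gradients for the group
)
{
    float weighted_sum = 0.0f;
    for (std::size_t j = 0; j < y.size(); ++j)
        weighted_sum += y[j] * dy[j];

    std::vector<float> g(y.size());
    for (std::size_t i = 0; i < y.size(); ++i)
        g[i] = y[i] * (dy[i] - weighted_sum);
    return g;
}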
10 changes: 6 additions & 4 deletions dlib/cuda/cpu_dlib.h
@@ -291,15 +291,17 @@ namespace dlib

// -----------------------------------------------------------------------------------

void softmax(
    tensor& dest,
    const tensor& src,
    operation_mode mode = operation_mode::CHANNEL_WISE
);

void softmax_gradient(
    tensor& grad,
    const tensor& dest,
    const tensor& gradient_input,
    operation_mode mode = operation_mode::CHANNEL_WISE
);

// ------------------------------------------------------------------------------------
