Add multm_prev_ layer and enhance gemm() function for PLANE_WISE operations (davisking#3020)

* Fix Stride Indexing Bugs in `reorg` and `reorg_gradient` Functions (CPU & CUDA) and Add `add_to` Parameter

* Add the missing 'add_to' parameter to the CUDA call reorg_gradient.launch_kernel()

* Cleanup: remove using namespace std; (davisking#3016)

* remove using namespace std from headers

* more std::

* more std::

* more std:: on windows stuff

* remove uses of using namespace std::chrono

* do not use C++17 features

* Add Davis suggestion

* revert some more stuff

* revert removing include

* more std::chrono stuff

* fix build error

* Adjust comment formatting to be like other dlib comments

* Add positional encodings layer to Dlib

* Add multm_prev layer and enhance gemm() function for PLANE_WISE operations

* Updates

* Updates

* Resynchronization with tril_ class

* Delete .vscode/settings.json

Not required for the merge

* Remove duplicates

* Small improvements to PLANE_WISE in gemm() function

* Same improvements for the CPU version

* Introducing a new enum for operation modes in tensor computations

* Remove a duplicated test call in the dnn tests

* Remove duplicated declaration

* Comment fixed

* Fixing the CUDA compilation

* Merging with updated softmax_ layer

* Fixing header for CPU compilation

* Adding a missing cast

* Test fixed to use the new operation_mode enum

* softmaxm test fixed

* Enum test removed

* Enum test removed

* Fixing indentation

* Fixing indentation

* Test removed

* Move the operation_mode enumeration to its own header

* Use operation_mode instead of unsigned long

---------

Co-authored-by: Adrià <[email protected]>
Co-authored-by: Davis King <[email protected]>
3 people authored Dec 20, 2024
1 parent dfbee6d commit 230c0b0
Showing 13 changed files with 979 additions and 286 deletions.
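For orientation before the diffs: the softmax kernels below gain an operation_mode switch. CHANNEL_WISE keeps the historical behavior (normalize across the k channels at each spatial location), while the new PLANE_WISE mode applies a row-wise softmax to every channel plane. Below is a minimal usage sketch against the CPU entry point declared in cpu_dlib.h; the include, tensor shape, and fill values are illustrative only, and in a normal build the dispatching tt::softmax wrapper would presumably be the usual call site.

#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    // One sample, 2 channels, 3x4 planes; values are arbitrary.
    resizable_tensor src(1, 2, 3, 4), dest;
    dest.copy_size(src);
    float* p = src.host();
    for (size_t i = 0; i < src.size(); ++i)
        p[i] = static_cast<float>(i % 5);

    // Historical behavior: softmax over the 2 channels at each (r,c) location,
    // so the 2 values at every location sum to 1.
    cpu::softmax(dest, src, operation_mode::CHANNEL_WISE);

    // New in this commit: softmax along each row of every channel plane,
    // so each row of 4 values sums to 1.
    cpu::softmax(dest, src, operation_mode::PLANE_WISE);
}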
221 changes: 137 additions & 84 deletions dlib/cuda/cpu_dlib.cpp
@@ -1620,122 +1620,175 @@ namespace dlib

namespace ttimpl
{
void softmax(
    const long num_locations,
    const long num_channels,
    tensor& dest,
    const tensor& src,
    operation_mode mode = operation_mode::CHANNEL_WISE
)
{
    DLIB_ASSERT(num_channels * num_locations == src.nr() * src.nc() * src.k());
    DLIB_CASSERT(have_same_dimensions(dest, src));
    const auto d = dest.host();
    const auto s = src.host();

    // Note that we subtract out the max values in each channel before applying
    // exp() to avoid numeric overflow in the subsequent computations. Doing this
    // doesn't change the resulting output, it just makes it more numerically
    // stable.
    for (long n = 0; n < src.num_samples(); ++n)
    {
        auto ss = s + num_locations * num_channels * n;
        auto dd = d + num_locations * num_channels * n;

        if (mode == operation_mode::CHANNEL_WISE)
        {
            for (long i = 0; i < num_locations; ++i)
            {
                float max_val = -std::numeric_limits<float>::infinity();
                for (long k = 0; k < num_channels; ++k)
                    max_val = std::max(max_val, ss[k * num_locations]);

                float sum = 0.0f;
                for (long k = 0; k < num_channels; ++k)
                {
                    dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val);
                    sum += dd[k * num_locations];
                }
                for (long k = 0; k < num_channels; ++k)
                    dd[k * num_locations] /= sum;

                ++ss;
                ++dd;
            }
        }
        else if (mode == operation_mode::PLANE_WISE)
        {
            for (long k = 0; k < num_channels; ++k)
            {
                auto s_channel = ss + k * num_locations;
                auto d_channel = dd + k * num_locations;
                for (long r = 0; r < src.nr(); ++r)
                {
                    float max_val = -std::numeric_limits<float>::infinity();
                    for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
                        max_val = std::max(max_val, s_channel[idx]);

                    if (max_val == -std::numeric_limits<float>::infinity())
                    {
                        for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
                            d_channel[idx] = 0.0f;
                    }
                    else
                    {
                        float sum = 0.0f;
                        for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
                        {
                            d_channel[idx] = std::exp(s_channel[idx] - max_val);
                            sum += d_channel[idx];
                        }
                        for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
                            d_channel[idx] /= sum;
                    }
                }
            }
        }
    }
}

void softmax_gradient(
    const long num_locations,
    const long num_channels,
    tensor& grad,
    const tensor& dest,
    const tensor& gradient_input,
    operation_mode mode = operation_mode::CHANNEL_WISE
)
{
    DLIB_ASSERT(num_channels * num_locations == grad.nr() * grad.nc() * grad.k());
    DLIB_CASSERT(have_same_dimensions(grad, dest));
    DLIB_CASSERT(have_same_dimensions(grad, gradient_input));

    const auto d = dest.host();
    const auto g = grad.host();
    const auto in = gradient_input.host();
    for (long n = 0; n < grad.num_samples(); ++n)
    {
        const auto d2 = d + num_locations * num_channels * n;
        const auto g2 = g + num_locations * num_channels * n;
        const auto in2 = in + num_locations * num_channels * n;

        if (mode == operation_mode::CHANNEL_WISE)
        {
            for (long i = 0; i < num_locations; ++i)
            {
                const auto d3 = d2 + i;
                const auto g3 = g2 + i;
                const auto in3 = in2 + i;
                float sum = 0.0f;
                for (long k = 0; k < num_channels; ++k)
                    sum += -d3[k * num_locations] * in3[k * num_locations];
                if (is_same_object(gradient_input, grad))
                {
                    for (long k = 0; k < num_channels; ++k)
                        g3[k * num_locations] = d3[k * num_locations] * (sum + in3[k * num_locations]);
                }
                else
                {
                    for (long k = 0; k < num_channels; ++k)
                        g3[k * num_locations] += d3[k * num_locations] * (sum + in3[k * num_locations]);
                }
            }
        }
        else if (mode == operation_mode::PLANE_WISE)
        {
            for (long k = 0; k < num_channels; ++k)
            {
                const auto d_channel = d2 + k * num_locations;
                const auto g_channel = g2 + k * num_locations;
                const auto in_channel = in2 + k * num_locations;
                for (long r = 0; r < grad.nr(); ++r)
                {
                    float sum = 0.0f;
                    for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
                        sum += -d_channel[idx] * in_channel[idx];
                    if (is_same_object(gradient_input, grad))
                    {
                        for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
                            g_channel[idx] = d_channel[idx] * (sum + in_channel[idx]);
                    }
                    else
                    {
                        for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
                            g_channel[idx] += d_channel[idx] * (sum + in_channel[idx]);
                    }
                }
            }
        }
    }
}
} // namespace ttimpl

// ----------------------------------------------------------------------------------------

void softmax(
    tensor& dest,
    const tensor& src,
    operation_mode mode
)
{
    DLIB_CASSERT(have_same_dimensions(dest, src));
    DLIB_CASSERT(mode == operation_mode::CHANNEL_WISE || mode == operation_mode::PLANE_WISE, "Invalid softmax mode");
    ttimpl::softmax(src.nr() * src.nc(), src.k(), dest, src, mode);
}

void softmax_gradient(
    tensor& grad,
    const tensor& dest,
    const tensor& gradient_input,
    operation_mode mode
)
{
    DLIB_CASSERT(have_same_dimensions(grad, dest));
    DLIB_CASSERT(have_same_dimensions(grad, gradient_input));
    ttimpl::softmax_gradient(grad.nr() * grad.nc(), grad.k(), grad, dest, gradient_input, mode);
}

// ------------------------------------------------------------------------------------
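The softmax_gradient kernels above apply the same closed-form Jacobian-vector product to every normalization group, i.e. to each channel column in CHANNEL_WISE mode and to each plane row in PLANE_WISE mode: g_i = y_i * (dy_i - sum_j y_j * dy_j). Below is a framework-free restatement of that per-group update (an illustrative sketch, not dlib code).

#include <vector>
#include <cstddef>

// Computes g[i] = y[i] * (dy[i] - sum_j y[j]*dy[j]) for one normalization
// group, which is the update the loops above apply per channel column
// (CHANNEL_WISE) or per plane row (PLANE_WISE).
std::vector<float> softmax_backward_group(
    const std::vector<float>& y,   // softmax outputs for the group
    const std::vector<float>& dy   // upstream gradients for the group
)
{
    float weighted_sum = 0.0f;
    for (std::size_t j = 0; j < y.size(); ++j)
        weighted_sum += y[j] * dy[j];

    std::vector<float> g(y.size());
    for (std::size_t i = 0; i < y.size(); ++i)
        g[i] = y[i] * (dy[i] - weighted_sum);
    return g;
}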
10 changes: 6 additions & 4 deletions dlib/cuda/cpu_dlib.h
@@ -291,15 +291,17 @@ namespace dlib

// -----------------------------------------------------------------------------------

void softmax(
    tensor& dest,
    const tensor& src,
    operation_mode mode = operation_mode::CHANNEL_WISE
);

void softmax_gradient(
    tensor& grad,
    const tensor& dest,
    const tensor& gradient_input,
    operation_mode mode = operation_mode::CHANNEL_WISE
);

// ------------------------------------------------------------------------------------
