diff --git a/dlib/cuda/cpu_dlib.cpp b/dlib/cuda/cpu_dlib.cpp index 0e5ca5cee6..25a461d949 100644 --- a/dlib/cuda/cpu_dlib.cpp +++ b/dlib/cuda/cpu_dlib.cpp @@ -1620,122 +1620,175 @@ namespace dlib namespace ttimpl { - void softmax ( - const long num_locations, - const long num_channels, - tensor& dest, - const tensor& src - ) - { - DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k()); - DLIB_CASSERT(have_same_dimensions(dest,src)); - const auto d = dest.host(); - const auto s = src.host(); + void softmax( + const long num_locations, + const long num_channels, + tensor& dest, + const tensor& src, + operation_mode mode = operation_mode::CHANNEL_WISE + ) + { + DLIB_ASSERT(num_channels * num_locations == src.nr() * src.nc() * src.k()); + DLIB_CASSERT(have_same_dimensions(dest, src)); + const auto d = dest.host(); + const auto s = src.host(); - // Note that we subtract out the max values in each channel before applying - // exp() to avoid numeric overflow in the subsequent computations. Doing this - // doesn't change the resulting output, it just makes it more numerically - // stable. - for (long n = 0; n < src.num_samples(); ++n) - { - auto ss = s + num_locations*num_channels*n; - auto dd = d + num_locations*num_channels*n; - for (long i = 0; i < num_locations; ++i) + for (long n = 0; n < src.num_samples(); ++n) { - float max_val = -std::numeric_limits::infinity(); - for (long k = 0; k < num_channels; ++k) - max_val = std::max(max_val, ss[k*num_locations]); + auto ss = s + num_locations * num_channels * n; + auto dd = d + num_locations * num_channels * n; - for (long k = 0; k < num_channels; ++k) - dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val); + if (mode == operation_mode::CHANNEL_WISE) + { + for (long i = 0; i < num_locations; ++i) + { + float max_val = -std::numeric_limits::infinity(); + for (long k = 0; k < num_channels; ++k) + max_val = std::max(max_val, ss[k * num_locations]); - ++ss; - ++dd; - } - } + float sum = 0.0f; + for (long k = 0; k < num_channels; ++k) + { + dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val); + sum += dd[k * num_locations]; + } + for (long k = 0; k < num_channels; ++k) + dd[k * num_locations] /= sum; - // Now normalize each channel so they sum to 1. 
- for (long n = 0; n < src.num_samples(); ++n) - { - const auto dd = d + num_locations*num_channels*n; - for (long i = 0; i < num_locations; ++i) - { - const auto ddd = dd+i; + ++ss; + ++dd; + } + } + else if (mode == operation_mode::PLANE_WISE) + { + for (long k = 0; k < num_channels; ++k) + { + auto s_channel = ss + k * num_locations; + auto d_channel = dd + k * num_locations; + for (long r = 0; r < src.nr(); ++r) + { + float max_val = -std::numeric_limits::infinity(); + for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx) + max_val = std::max(max_val, s_channel[idx]); - float temp = 0; - for (long k = 0; k < num_channels; ++k) - temp += ddd[k*num_locations]; - for (long k = 0; k < num_channels; ++k) - ddd[k*num_locations] /= temp; + if (max_val == -std::numeric_limits::infinity()) + { + for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx) + d_channel[idx] = 0.0f; + } + else + { + float sum = 0.0f; + for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx) + { + d_channel[idx] = std::exp(s_channel[idx] - max_val); + sum += d_channel[idx]; + } + for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx) + d_channel[idx] /= sum; + } + } + } + } } } - } - void softmax_gradient ( - const long num_locations, - const long num_channels, - tensor& grad, - const tensor& dest, - const tensor& gradient_input - ) - { - DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k()); - DLIB_CASSERT(have_same_dimensions(grad,dest)); - DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); - const auto d = dest.host(); - const auto g = grad.host(); - const auto in = gradient_input.host(); - - - for (long n = 0; n < grad.num_samples(); ++n) + void softmax_gradient( + const long num_locations, + const long num_channels, + tensor& grad, + const tensor& dest, + const tensor& gradient_input, + operation_mode mode = operation_mode::CHANNEL_WISE + ) { - const auto d2 = d + num_locations*num_channels*n; - const auto g2 = g + num_locations*num_channels*n; - const auto in2 = in + num_locations*num_channels*n; - for (long i = 0; i < num_locations; ++i) + DLIB_ASSERT(num_channels * num_locations == grad.nr() * grad.nc() * grad.k()); + DLIB_CASSERT(have_same_dimensions(grad, dest)); + DLIB_CASSERT(have_same_dimensions(grad, gradient_input)); + + const auto d = dest.host(); + const auto g = grad.host(); + const auto in = gradient_input.host(); + for (long n = 0; n < grad.num_samples(); ++n) { - const auto d3 = d2+i; - const auto g3 = g2+i; - const auto in3 = in2+i; + const auto d2 = d + num_locations * num_channels * n; + const auto g2 = g + num_locations * num_channels * n; + const auto in2 = in + num_locations * num_channels * n; - float temp = 0; - for (long k = 0; k < num_channels; ++k) - temp += -d3[k*num_locations]*in3[k*num_locations]; - if (is_same_object(gradient_input, grad)) + if (mode == operation_mode::CHANNEL_WISE) { - for (long k = 0; k < num_channels; ++k) - g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]); + for (long i = 0; i < num_locations; ++i) + { + const auto d3 = d2 + i; + const auto g3 = g2 + i; + const auto in3 = in2 + i; + float sum = 0.0f; + for (long k = 0; k < num_channels; ++k) + sum += -d3[k * num_locations] * in3[k * num_locations]; + if (is_same_object(gradient_input, grad)) + { + for (long k = 0; k < num_channels; ++k) + g3[k * num_locations] = d3[k * num_locations] * (sum + in3[k * num_locations]); + } + else + { + for (long k = 0; k < num_channels; ++k) + g3[k * num_locations] += d3[k * num_locations] * (sum + in3[k * 
num_locations]); + } + } } - else + else if (mode == operation_mode::PLANE_WISE) { for (long k = 0; k < num_channels; ++k) - g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]); + { + const auto d_channel = d2 + k * num_locations; + const auto g_channel = g2 + k * num_locations; + const auto in_channel = in2 + k * num_locations; + for (long r = 0; r < grad.nr(); ++r) + { + float sum = 0.0f; + for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx) + sum += -d_channel[idx] * in_channel[idx]; + if (is_same_object(gradient_input, grad)) + { + for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx) + g_channel[idx] = d_channel[idx] * (sum + in_channel[idx]); + } + else + { + for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx) + g_channel[idx] += d_channel[idx] * (sum + in_channel[idx]); + } + } + } } } } } - } // ---------------------------------------------------------------------------------------- - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode ) { - DLIB_CASSERT(have_same_dimensions(dest,src)); - ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src); + DLIB_CASSERT(have_same_dimensions(dest, src)); + DLIB_CASSERT(mode == operation_mode::CHANNEL_WISE || mode == operation_mode::PLANE_WISE, "Invalid softmax mode"); + ttimpl::softmax(src.nr() * src.nc(), src.k(), dest, src, mode); } - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode ) { - DLIB_CASSERT(have_same_dimensions(grad,dest)); - DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); - ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input); + DLIB_CASSERT(have_same_dimensions(grad, dest)); + DLIB_CASSERT(have_same_dimensions(grad, gradient_input)); + ttimpl::softmax_gradient(grad.nr() * grad.nc(), grad.k(), grad, dest, gradient_input, mode); } // ------------------------------------------------------------------------------------ diff --git a/dlib/cuda/cpu_dlib.h b/dlib/cuda/cpu_dlib.h index f26795445d..f35b3c9728 100644 --- a/dlib/cuda/cpu_dlib.h +++ b/dlib/cuda/cpu_dlib.h @@ -291,15 +291,17 @@ namespace dlib // ----------------------------------------------------------------------------------- - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode = operation_mode::CHANNEL_WISE ); - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode = operation_mode::CHANNEL_WISE ); // ------------------------------------------------------------------------------------ diff --git a/dlib/cuda/cublas_dlibapi.cpp b/dlib/cuda/cublas_dlibapi.cpp index 376cc9f006..064e92c3df 100644 --- a/dlib/cuda/cublas_dlibapi.cpp +++ b/dlib/cuda/cublas_dlibapi.cpp @@ -101,55 +101,118 @@ namespace dlib const tensor& lhs, bool trans_lhs, const tensor& rhs, - bool trans_rhs + bool trans_rhs, + operation_mode mode ) { - // Recall that BLAS uses column major order so to deal with that we flip the - // order of the lhs and rhs arguments. - const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N; - const auto transb = trans_rhs ? 
CUBLAS_OP_T : CUBLAS_OP_N; - - const int dest_nr = dest.num_samples(); - const int dest_nc = dest.size()/dest_nr; - const int lhs_nr = lhs.num_samples(); - const int lhs_nc = lhs.size()/lhs_nr; - const int rhs_nr = rhs.num_samples(); - const int rhs_nc = rhs.size()/rhs_nr; - if (trans_lhs && trans_rhs) + if (mode == operation_mode::CHANNEL_WISE) { - DLIB_ASSERT( dest_nr == lhs_nc && - dest_nc == rhs_nr && - lhs_nr == rhs_nc) - } - else if (!trans_lhs && trans_rhs) - { - DLIB_ASSERT( dest_nr == lhs_nr && - dest_nc == rhs_nr && - lhs_nc == rhs_nc) - } - else if (trans_lhs && !trans_rhs) - { - DLIB_ASSERT( dest_nr == lhs_nc && - dest_nc == rhs_nc && - lhs_nr == rhs_nr) + // Recall that BLAS uses column major order so to deal with that we flip the + // order of the lhs and rhs arguments. + const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N; + const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N; + + const int dest_nr = dest.num_samples(); + const int dest_nc = dest.size() / dest_nr; + const int lhs_nr = lhs.num_samples(); + const int lhs_nc = lhs.size() / lhs_nr; + const int rhs_nr = rhs.num_samples(); + const int rhs_nc = rhs.size() / rhs_nr; + if (trans_lhs && trans_rhs) + { + DLIB_ASSERT(dest_nr == lhs_nc && + dest_nc == rhs_nr && + lhs_nr == rhs_nc) + } + else if (!trans_lhs && trans_rhs) + { + DLIB_ASSERT(dest_nr == lhs_nr && + dest_nc == rhs_nr && + lhs_nc == rhs_nc) + } + else if (trans_lhs && !trans_rhs) + { + DLIB_ASSERT(dest_nr == lhs_nc && + dest_nc == rhs_nc && + lhs_nr == rhs_nr) + } + else + { + DLIB_ASSERT(dest_nr == lhs_nr && + dest_nc == rhs_nc && + lhs_nc == rhs_nr) + } + + const int k = trans_rhs ? rhs_nc : rhs_nr; + CHECK_CUBLAS(cublasSgemm(context(), + transb, + transa, + dest_nc, dest_nr, k, + &alpha, + rhs.device(), rhs_nc, + lhs.device(), lhs_nc, + &beta, + dest.device(), dest_nc)); } - else + else if (mode == operation_mode::PLANE_WISE) { - DLIB_ASSERT( dest_nr == lhs_nr && - dest_nc == rhs_nc && - lhs_nc == rhs_nr) - } + const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N; + const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N; + + long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() }); + long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() }); + + auto is_matrix = [](const auto& tensor) { + return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) || + (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1)); + }; + const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest); + + if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) num_samples = num_channels = 1; + + size_t lhs_rows = lhs.nr(); + size_t lhs_cols = lhs.nc(); + if (lhs_is_matrix && (lhs.num_samples() > 1 || lhs.k() > 1)) { + lhs_rows = lhs.num_samples(); + lhs_cols = lhs.k(); + } + size_t rhs_rows = rhs.nr(); + size_t rhs_cols = rhs.nc(); + if (rhs_is_matrix && (rhs.num_samples() > 1 || rhs.k() > 1)) { + rhs_rows = rhs.num_samples(); + rhs_cols = rhs.k(); + } + size_t dest_rows = dest.nr(); + size_t dest_cols = dest.nc(); + if (dest_is_matrix && (dest.num_samples() > 1 || dest.k() > 1)) { + dest_rows = dest.num_samples(); + dest_cols = dest.k(); + } + + const size_t lhs_plane_size = lhs_rows * lhs_cols; + const size_t rhs_plane_size = rhs_rows * rhs_cols; + const size_t dest_plane_size = dest_rows * dest_cols; - const int k = trans_rhs ? 
rhs_nc : rhs_nr; - CHECK_CUBLAS(cublasSgemm(context(), - transb, - transa, - dest_nc, dest_nr, k, - &alpha, - rhs.device(), rhs_nc, - lhs.device(), lhs_nc, - &beta, - dest.device(),dest_nc)); + for (long b = 0; b < num_samples; ++b) + { + for (long c = 0; c < num_channels; ++c) + { + auto lhs_slice = lhs_is_matrix ? lhs.device() : + lhs.device() + (b * num_channels + c) * lhs_plane_size; + auto rhs_slice = rhs_is_matrix ? rhs.device() : + rhs.device() + (b * num_channels + c) * rhs_plane_size; + auto dest_slice = dest_is_matrix ? dest.device() : + dest.device() + (b * num_channels + c) * dest_plane_size; + const int k = trans_rhs ? rhs_cols : rhs_rows; + + CHECK_CUBLAS(cublasSgemm( + context(), transb, transa, dest_cols, dest_rows, k, + &alpha, rhs_slice, rhs_cols, lhs_slice, lhs_cols, + &beta, dest_slice, dest_cols + )); + } + } + } } // ------------------------------------------------------------------------------------ diff --git a/dlib/cuda/cublas_dlibapi.h b/dlib/cuda/cublas_dlibapi.h index b46fd25caa..45d8f426f0 100644 --- a/dlib/cuda/cublas_dlibapi.h +++ b/dlib/cuda/cublas_dlibapi.h @@ -9,9 +9,9 @@ #include "cuda_errors.h" namespace dlib -{ +{ namespace cuda - { + { // ----------------------------------------------------------------------------------- @@ -22,21 +22,52 @@ namespace dlib const tensor& lhs, bool trans_lhs, const tensor& rhs, - bool trans_rhs + bool trans_rhs, + operation_mode mode = operation_mode::CHANNEL_WISE ); - /*! - requires - - The dimensions of lhs and rhs must be compatible for matrix - multiplication. In particular: + /*! + requires + - The dimensions of lhs and rhs must be compatible for matrix multiplication. + The specific requirements depend on the mode: + + For CHANNEL_WISE mode (default): - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs) - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) - Let D == mat(dest) - D.nr() == L.nr() && D.nc() == R.nc() - (i.e. dest must be preallocated and have the correct output dimensions) + (i.e. dest must be preallocated and have the correct output dimensions) - L.nc() == R.nr() - ensures - - performs: dest = alpha*L*R + beta*mat(dest) - !*/ + + For PLANE_WISE mode: + - lhs.num_samples() == rhs.num_samples() && lhs.k() == rhs.k() + - If !trans_lhs && !trans_rhs: + lhs.nc() == rhs.nr() + dest.nr() == lhs.nr() && dest.nc() == rhs.nc() + - If trans_lhs && !trans_rhs: + lhs.nr() == rhs.nr() + dest.nr() == lhs.nc() && dest.nc() == rhs.nc() + - If !trans_lhs && trans_rhs: + lhs.nc() == rhs.nc() + dest.nr() == lhs.nr() && dest.nc() == rhs.nr() + - If trans_lhs && trans_rhs: + lhs.nr() == rhs.nc() + dest.nr() == lhs.nc() && dest.nc() == rhs.nr() + + ensures + - Performs matrix multiplication based on the specified mode: + + For CHANNEL_WISE mode: + - performs: dest = alpha*L*R + beta*mat(dest) + where L, R, and D are as defined above. + + For PLANE_WISE mode: + - Performs matrix multiplication for each corresponding 2D plane (nr x nc) + in lhs and rhs across all samples and channels. + - The operation is equivalent to performing the following for each sample + and channel: + dest[s][k] = alpha * (lhs[s][k] * rhs[s][k]) + beta * dest[s][k] + where [s][k] represents the 2D plane for sample s and channel k. 
+ !*/ // ------------------------------------------------------------------------------------ diff --git a/dlib/cuda/cudnn_dlibapi.cpp b/dlib/cuda/cudnn_dlibapi.cpp index c09845cc03..40dddb8d89 100644 --- a/dlib/cuda/cudnn_dlibapi.cpp +++ b/dlib/cuda/cudnn_dlibapi.cpp @@ -1528,61 +1528,123 @@ namespace dlib grad.device())); } - // ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------ - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode ) { - DLIB_CASSERT(have_same_dimensions(dest,src)); - if (src.size() == 0) - return; + DLIB_CASSERT(have_same_dimensions(dest, src)); + if (src.size() == 0) return; const float alpha = 1; const float beta = 0; - CHECK_CUDNN(cudnnSoftmaxForward(context(), - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - descriptor(src), - src.device(), - &beta, - descriptor(dest), - dest.device())); - } + if (mode == operation_mode::CHANNEL_WISE) + { + CHECK_CUDNN(cudnnSoftmaxForward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + else if (mode == operation_mode::PLANE_WISE) + { + const long num_samples = src.num_samples(); + const long num_channels = src.k(); + const size_t plane_size = src.nr() * src.nc(); + for (long s = 0; s < num_samples; ++s) + { + for (long k = 0; k < num_channels; ++k) + { + auto src_slice = src.device() + (s * num_channels + k) * plane_size; + auto dest_slice = dest.device() + (s * num_channels + k) * plane_size; + auto a_src_slice = alias_tensor(src.nr(), src.nc())(src, (s * num_channels + k) * plane_size); + auto a_dest_slice = alias_tensor(dest.nr(), dest.nc())(dest, (s * num_channels + k) * plane_size); + + CHECK_CUDNN(cudnnSoftmaxForward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(a_src_slice), + src_slice, + &beta, + descriptor(a_dest_slice), + dest_slice)); + } + } + } + } - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode ) { DLIB_CASSERT( - have_same_dimensions(dest,gradient_input) == true && - have_same_dimensions(dest,grad) == true ); - if (dest.size() == 0) - return; + have_same_dimensions(dest, gradient_input) == true && + have_same_dimensions(dest, grad) == true); + if (dest.size() == 0) return; const float alpha = 1; - const float beta = is_same_object(grad,gradient_input) ? 0 : 1; - CHECK_CUDNN(cudnnSoftmaxBackward(context(), - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - descriptor(dest), - dest.device(), - descriptor(gradient_input), - gradient_input.device(), - &beta, - descriptor(grad), - grad.device())); + const float beta = is_same_object(grad, gradient_input) ? 
0 : 1; + + if (mode == operation_mode::CHANNEL_WISE) + { + CHECK_CUDNN(cudnnSoftmaxBackward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + &beta, + descriptor(grad), + grad.device())); + } + else if (mode == operation_mode::PLANE_WISE) + { + const long num_samples = dest.num_samples(); + const long num_channels = dest.k(); + const size_t plane_size = dest.nr() * dest.nc(); + + for (long s = 0; s < num_samples; ++s) + { + for (long k = 0; k < num_channels; ++k) + { + auto dest_slice = dest.device() + (s * num_channels + k) * plane_size; + auto gi_slice = gradient_input.device() + (s * num_channels + k) * plane_size; + auto grad_slice = grad.device() + (s * num_channels + k) * plane_size; + auto a_dest_slice = alias_tensor(dest.nr(), dest.nc())(dest, (s * num_channels + k) * plane_size); + auto a_gi_slice = alias_tensor(gradient_input.nr(), gradient_input.nc())(gradient_input, (s * num_channels + k) * plane_size); + auto a_grad_slice = alias_tensor(grad.nr(), grad.nc())(grad, (s * num_channels + k) * plane_size); + + CHECK_CUDNN(cudnnSoftmaxBackward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(a_dest_slice), + dest_slice, + descriptor(a_gi_slice), + gi_slice, + &beta, + descriptor(a_grad_slice), + grad_slice)); + } + } + } } - // ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------ void softmax_all ( diff --git a/dlib/cuda/cudnn_dlibapi.h b/dlib/cuda/cudnn_dlibapi.h index 7b040a00c2..283be0d395 100644 --- a/dlib/cuda/cudnn_dlibapi.h +++ b/dlib/cuda/cudnn_dlibapi.h @@ -3,17 +3,19 @@ #ifndef DLIB_DNN_CuDNN_H_ #define DLIB_DNN_CuDNN_H_ +#include +#include "operation_mode.h" #ifdef DLIB_USE_CUDA - #include "cuda_errors.h" -#include #include "cuda_data_ptr.h" +#endif // DLIB_USE_CUDA namespace dlib { class tensor; class resizable_tensor; +#ifdef DLIB_USE_CUDA namespace cuda { @@ -352,13 +354,15 @@ namespace dlib void softmax ( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode = operation_mode::CHANNEL_WISE ); void softmax_gradient ( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode = operation_mode::CHANNEL_WISE ); // ------------------------------------------------------------------------------------ @@ -416,9 +420,8 @@ namespace dlib // ------------------------------------------------------------------------------------ } -} - #endif // DLIB_USE_CUDA +} #endif // DLIB_DNN_CuDNN_H_ diff --git a/dlib/cuda/operation_mode.h b/dlib/cuda/operation_mode.h new file mode 100644 index 0000000000..7d10f89c8f --- /dev/null +++ b/dlib/cuda/operation_mode.h @@ -0,0 +1,23 @@ +// Copyright (C) 2024 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_CUDA_OPERATION_MODE_H +#define DLIB_CUDA_OPERATION_MODE_H + +namespace dlib +{ +// ---------------------------------------------------------------------------------------- + + /*! + This enum is used to determine the mode of operation for certain functions + (such as gemm and softmax) in Dlib. It specifies whether the calculation + should be performed based on the matrix field in nr()xnc() or if the matrix + should be considered in num_samples()xk(). 
This helps in organizing tensor + computations more efficiently according to the required dimensions. + */ + enum class operation_mode { CHANNEL_WISE = 0, PLANE_WISE = 1 }; + +// ---------------------------------------------------------------------------------------- + +} // namespace dlib + +#endif // DLIB_CUDA_OPERATION_MODE_H \ No newline at end of file diff --git a/dlib/cuda/tensor_tools.cpp b/dlib/cuda/tensor_tools.cpp index 90a09a2884..069b4d4659 100644 --- a/dlib/cuda/tensor_tools.cpp +++ b/dlib/cuda/tensor_tools.cpp @@ -208,33 +208,99 @@ namespace dlib { namespace tt const tensor& lhs, bool trans_lhs, const tensor& rhs, - bool trans_rhs + bool trans_rhs, + operation_mode mode ) { #ifdef DLIB_USE_CUDA - cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs); + cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs, mode); #else - if (beta != 0) + if (mode == operation_mode::CHANNEL_WISE) { - if (trans_lhs && trans_rhs) - dest = alpha*trans(mat(lhs))*trans(mat(rhs)) + beta*mat(dest); - else if (!trans_lhs && trans_rhs) - dest = alpha*mat(lhs)*trans(mat(rhs)) + beta*mat(dest); - else if (trans_lhs && !trans_rhs) - dest = alpha*trans(mat(lhs))*mat(rhs) + beta*mat(dest); + if (beta != 0) + { + if (trans_lhs && trans_rhs) + dest = alpha * trans(mat(lhs)) * trans(mat(rhs)) + beta * mat(dest); + else if (!trans_lhs && trans_rhs) + dest = alpha * mat(lhs) * trans(mat(rhs)) + beta * mat(dest); + else if (trans_lhs && !trans_rhs) + dest = alpha * trans(mat(lhs)) * mat(rhs) + beta * mat(dest); + else + dest = alpha * mat(lhs) * mat(rhs) + beta * mat(dest); + } else - dest = alpha*mat(lhs)*mat(rhs) + beta*mat(dest); + { + if (trans_lhs && trans_rhs) + dest = alpha * trans(mat(lhs)) * trans(mat(rhs)); + else if (!trans_lhs && trans_rhs) + dest = alpha * mat(lhs) * trans(mat(rhs)); + else if (trans_lhs && !trans_rhs) + dest = alpha * trans(mat(lhs)) * mat(rhs); + else + dest = alpha * mat(lhs) * mat(rhs); + } } - else + else if (mode == operation_mode::PLANE_WISE) { - if (trans_lhs && trans_rhs) - dest = alpha*trans(mat(lhs))*trans(mat(rhs)); - else if (!trans_lhs && trans_rhs) - dest = alpha*mat(lhs)*trans(mat(rhs)); - else if (trans_lhs && !trans_rhs) - dest = alpha*trans(mat(lhs))*mat(rhs); - else - dest = alpha*mat(lhs)*mat(rhs); + auto is_matrix = [](const auto& tensor) { + return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) || + (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1)); + }; + + long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() }); + long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() }); + const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest); + + if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) { + num_samples = num_channels = 1; + } + + long lhs_rows = (lhs_is_matrix && lhs.num_samples() > 1) ? lhs.num_samples() : lhs.nr(); + long lhs_cols = (lhs_is_matrix && lhs.k() > 1) ? lhs.k() : lhs.nc(); + long rhs_rows = (rhs_is_matrix && rhs.num_samples() > 1) ? rhs.num_samples() : rhs.nr(); + long rhs_cols = (rhs_is_matrix && rhs.k() > 1) ? rhs.k() : rhs.nc(); + long dest_rows = (dest_is_matrix && dest.num_samples() > 1) ? dest.num_samples() : dest.nr(); + long dest_cols = (dest_is_matrix && dest.k() > 1) ? 
dest.k() : dest.nc(); + + const size_t lhs_plane_size = lhs_rows * lhs_cols; + const size_t rhs_plane_size = rhs_rows * rhs_cols; + const size_t dest_plane_size = dest_rows * dest_cols; + + for (long b = 0; b < num_samples; ++b) + { + for (long c = 0; c < num_channels; ++c) + { + auto lhs_slice = lhs_is_matrix ? alias_tensor(lhs_rows, lhs_cols)(lhs, 0) : + alias_tensor(lhs_rows, lhs_cols)(lhs, (b * num_channels + c) * lhs_plane_size); + auto rhs_slice = rhs_is_matrix ? alias_tensor(rhs_rows, rhs_cols)(rhs, 0) : + alias_tensor(rhs_rows, rhs_cols)(rhs, (b * num_channels + c) * rhs_plane_size); + auto dest_slice = dest_is_matrix ? alias_tensor(dest_rows, dest_cols)(dest, 0) : + alias_tensor(dest_rows, dest_cols)(dest, (b * num_channels + c) * dest_plane_size); + + if (beta != 0) + { + if (trans_lhs && trans_rhs) + dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice)) + beta * mat(dest_slice); + else if (!trans_lhs && trans_rhs) + dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice)) + beta * mat(dest_slice); + else if (trans_lhs && !trans_rhs) + dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice) + beta * mat(dest_slice); + else + dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice) + beta * mat(dest_slice); + } + else + { + if (trans_lhs && trans_rhs) + dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice)); + else if (!trans_lhs && trans_rhs) + dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice)); + else if (trans_lhs && !trans_rhs) + dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice); + else + dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice); + } + } + } } #endif } @@ -817,31 +883,32 @@ namespace dlib { namespace tt #endif } -// ---------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------- - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode ) { #ifdef DLIB_USE_CUDA - cuda::softmax(dest,src); + cuda::softmax(dest, src, mode); #else - cpu::softmax(dest,src); + cpu::softmax(dest, src, mode); #endif } - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode ) { #ifdef DLIB_USE_CUDA - cuda::softmax_gradient(grad, dest, gradient_input); + cuda::softmax_gradient(grad, dest, gradient_input, mode); #else - cpu::softmax_gradient(grad, dest, gradient_input); + cpu::softmax_gradient(grad, dest, gradient_input, mode); #endif } diff --git a/dlib/cuda/tensor_tools.h b/dlib/cuda/tensor_tools.h index 8ea593a429..17649603d9 100644 --- a/dlib/cuda/tensor_tools.h +++ b/dlib/cuda/tensor_tools.h @@ -165,21 +165,56 @@ namespace dlib { namespace tt const tensor& lhs, bool trans_lhs, const tensor& rhs, - bool trans_rhs + bool trans_rhs, + operation_mode mode = operation_mode::CHANNEL_WISE ); /*! requires - dest does not alias the memory of lhs or rhs - The dimensions of lhs and rhs must be compatible for matrix multiplication. - In particular: - - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs) - - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) - - Let D == mat(dest) - - D.nr() == L.nr() && D.nc() == R.nc() - (i.e. dest must be preallocated and have the correct output dimensions) - - L.nc() == R.nr() - ensures - - performs: dest = alpha*L*R + beta*mat(dest) + The specific requirements depend on the mode: + + For CHANNEL_WISE mode (default): + - Let L == trans_lhs ? 
trans(mat(lhs)) : mat(lhs) + - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) + - Let D == mat(dest) + - D.nr() == L.nr() && D.nc() == R.nc() + (i.e. dest must be preallocated and have the correct output dimensions) + - L.nc() == R.nr() + + For PLANE_WISE mode: + - lhs.num_samples() == rhs.num_samples() && lhs.k() == rhs.k() + - If !trans_lhs && !trans_rhs: + lhs.nc() == rhs.nr() + dest.nr() == lhs.nr() && dest.nc() == rhs.nc() + - If trans_lhs && !trans_rhs: + lhs.nr() == rhs.nr() + dest.nr() == lhs.nc() && dest.nc() == rhs.nc() + - If !trans_lhs && trans_rhs: + lhs.nc() == rhs.nc() + dest.nr() == lhs.nr() && dest.nc() == rhs.nr() + - If trans_lhs && trans_rhs: + lhs.nr() == rhs.nc() + dest.nr() == lhs.nc() && dest.nc() == rhs.nr() + + ensures + - Performs matrix multiplication based on the specified mode: + + For CHANNEL_WISE mode: + - performs: dest = alpha*L*R + beta*mat(dest) + where L, R, and D are as defined above. + + For PLANE_WISE mode: + - Performs matrix multiplication for each corresponding 2D plane (nr x nc) + in lhs and rhs across all samples and channels. + - The operation is equivalent to performing the following for each sample + and channel: + dest[s][k] = alpha * (lhs[s][k] * rhs[s][k]) + beta * dest[s][k] + where [s][k] represents the 2D plane for sample s and channel k. + + Note that the PLANE_WISE mode is particularly useful for operations like attention + mechanisms in neural networks, where you want to perform matrix multiplications + on 2D planes of 4D tensors while preserving the sample and channel dimensions. !*/ // ---------------------------------------------------------------------------------------- @@ -1386,44 +1421,54 @@ namespace dlib { namespace tt // ---------------------------------------------------------------------------------------- - void softmax ( + void softmax( tensor& dest, - const tensor& src + const tensor& src, + operation_mode mode = operation_mode::CHANNEL_WISE ); /*! requires - have_same_dimensions(dest, src) == true - ensures - - Note that the softmax function is a vector valued function: - s(x) == exp(x)/sum(exp(x)) - - Computes the softmax function on src and writes the results to dest. The - softmax is computed per spatial location across the different channels at - each location. That is, softmax() outputs a new tensor, #dest, where each of + - mode == CHANNEL_WISE || mode == PLANE_WISE + ensures + - Note that the softmax function is a vector valued function: + s(x) == exp(x)/sum(exp(x)) + - Computes the softmax function on src and writes the results to dest. + - If mode == CHANNEL_WISE: + The softmax is computed per spatial location across the different channels at + each location. That is, softmax() outputs a new tensor, #dest, where each of the spatial locations in dest (i.e. image idx, row idx, and column idx) - contains the output of s() evaluated over the channel values at each - location. + contains the output of s() evaluated over the channel values at each location. + - If mode == PLANE_WISE: + The softmax is computed across entire planes (nr x nc) of the input tensor. + This is useful for operations in Large Language Models (LLMs) and other + applications requiring 2D tensor processing. - This function supports in-place operation, i.e. having is_same_object(dest, src)==true !*/ - void softmax_gradient ( + void softmax_gradient( tensor& grad, const tensor& dest, - const tensor& gradient_input + const tensor& gradient_input, + operation_mode mode = operation_mode::CHANNEL_WISE ); /*! 
requires - - have_same_dimensions(dest,gradient_input) == true - - have_same_dimensions(dest,grad) == true - ensures - - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor. - Then let f(SRC) == dot(gradient_input,dest). Then this function computes the - gradient of f() with respect to SRC and stores it to grad. Moreover, if - is_same_object(grad,gradient_input)==true then the output is assigned to - grad, replacing its previous contents. Otherwise the output is added to - grad. + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - mode == CHANNEL_WISE || mode == PLANE_WISE + ensures + - We interpret dest as the output of softmax(dest,SRC,mode) for some SRC tensor. + Then let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to grad. + - The gradient computation takes into account the specified mode: + - If mode == CHANNEL_WISE: The gradient is computed per spatial location across channels. + - If mode == PLANE_WISE: The gradient is computed across entire planes of the tensor. - This function supports in-place operation, i.e. having - is_same_object(grad, gradient_input)==true + is_same_object(grad, gradient_input)==true !*/ // ---------------------------------------------------------------------------------------- diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h index f34e7a8390..023ccbf810 100644 --- a/dlib/dnn/layers.h +++ b/dlib/dnn/layers.h @@ -13,6 +13,7 @@ #include "../cuda/tensor_tools.h" #include "../vectorstream.h" #include "utilities.h" +#include "../cuda/operation_mode.h" #include @@ -2789,6 +2790,100 @@ namespace dlib using mult_prev9_ = mult_prev_; using mult_prev10_ = mult_prev_; +// ---------------------------------------------------------------------------------------- + + template < + template class tag + > + class multm_prev_ + { + public: + const static unsigned long id = tag_id::id; + + multm_prev_() {} + template void setup(const SUBNET& /*sub*/) {} + + template + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto& t1 = sub.get_output(); + auto& t2 = layer(sub).get_output(); + output.set_size(t1.num_samples(), t1.k(), t1.nr(), t2.nc()); + + tt::gemm(0, output, 1, t1, false, t2, false, operation_mode::PLANE_WISE); + } + + template + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + auto& t1 = sub.get_output(); + auto& t2 = layer(sub).get_output(); + auto& prev = sub.get_gradient_input(); + auto& prev_tag = layer(sub).get_gradient_input(); + + tt::gemm(1, prev, 1, gradient_input, false, t2, true, operation_mode::PLANE_WISE); + tt::gemm(1, prev_tag, 1, t1, true, gradient_input, false, operation_mode::PLANE_WISE); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + inline dpoint map_input_to_output(const dpoint& p) const { return p; } + inline dpoint map_output_to_input(const dpoint& p) const { return p; } + + friend void serialize(const multm_prev_& /*item*/, std::ostream& out) + { + serialize("multm_prev_", out); + } + friend void deserialize(multm_prev_& /*item*/, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "multm_prev_") + throw serialization_error("Unexpected version '" + version + "' found while 
deserializing dlib::multm_prev_."); + } + + friend std::ostream& operator<<(std::ostream& out, const multm_prev_& /*item*/) + { + out << "multm_prev" << id; + return out; + } + friend void to_xml(const multm_prev_& /*item*/, std::ostream& out) + { + out << "\n"; + } + + private: + resizable_tensor params; // unused + }; + + template < + template class tag, + typename SUBNET + > + using multm_prev = add_layer, SUBNET>; + + template using multm_prev1 = multm_prev; + template using multm_prev2 = multm_prev; + template using multm_prev3 = multm_prev; + template using multm_prev4 = multm_prev; + template using multm_prev5 = multm_prev; + template using multm_prev6 = multm_prev; + template using multm_prev7 = multm_prev; + template using multm_prev8 = multm_prev; + template using multm_prev9 = multm_prev; + template using multm_prev10 = multm_prev; + using multm_prev1_ = multm_prev_; + using multm_prev2_ = multm_prev_; + using multm_prev3_ = multm_prev_; + using multm_prev4_ = multm_prev_; + using multm_prev5_ = multm_prev_; + using multm_prev6_ = multm_prev_; + using multm_prev7_ = multm_prev_; + using multm_prev8_ = multm_prev_; + using multm_prev9_ = multm_prev_; + using multm_prev10_ = multm_prev_; + // ---------------------------------------------------------------------------------------- template < @@ -3985,31 +4080,28 @@ namespace dlib // ---------------------------------------------------------------------------------------- + template class softmax_ { public: - softmax_() - { - } + softmax_() {} template - void setup (const SUBNET& /*sub*/) - { - } + void setup(const SUBNET& /*sub*/) {} void forward_inplace(const tensor& input, tensor& output) { - tt::softmax(output, input); - } + tt::softmax(output, input, s_mode_); + } void backward_inplace( const tensor& computed_output, - const tensor& gradient_input, - tensor& data_grad, - tensor& + const tensor& gradient_input, + tensor& data_grad, + tensor& /*params_grad*/ ) { - tt::softmax_gradient(data_grad, computed_output, gradient_input); + tt::softmax_gradient(data_grad, computed_output, gradient_input, s_mode_); } const tensor& get_layer_params() const { return params; } @@ -4025,26 +4117,31 @@ namespace dlib std::string version; deserialize(version, in); if (version != "softmax_") - throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_."); + throw serialization_error("Unexpected version '" + version + "' found while deserializing dlib::softmax_."); } friend std::ostream& operator<<(std::ostream& out, const softmax_& /*item*/) { - out << "softmax"; + out << "softmax (mode=" << (s_mode_ == operation_mode::CHANNEL_WISE + ? 
"channel_wise" : "plane_wise") << ")"; return out; } friend void to_xml(const softmax_& /*item*/, std::ostream& out) { - out << "\n"; + out << "\n"; } private: - resizable_tensor params; + resizable_tensor params; // unused }; template - using softmax = add_layer; + using softmax = add_layer, SUBNET>; + + template + using softmaxm = add_layer, SUBNET>; // ---------------------------------------------------------------------------------------- @@ -5088,4 +5185,4 @@ namespace dlib } -#endif // DLIB_DNn_LAYERS_H_ \ No newline at end of file +#endif // DLIB_DNn_LAYERS_H_ diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h index 0d951e7804..99fe91401c 100644 --- a/dlib/dnn/layers_abstract.h +++ b/dlib/dnn/layers_abstract.h @@ -5,6 +5,7 @@ #include "../cuda/tensor_abstract.h" #include "core_abstract.h" +#include "../cuda/operation_mode.h" namespace dlib @@ -2953,44 +2954,67 @@ namespace dlib // ---------------------------------------------------------------------------------------- + template class softmax_ { /*! WHAT THIS OBJECT REPRESENTS This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface - defined above. In particular, it defines a softmax layer. To be precise, - we define the softmax function s(x) as: - s(x) == exp(x)/sum(exp(x)) - where x is a vector. Then this layer treats its input tensor as a - collection of multi-channel images and applies s() to each spatial location - in each image. In each application, the tensor::k() channel elements at - each position are input to s() and then replaced by the outputs of s(). + defined above. It defines a softmax layer with two modes of operation: + channel-wise and plane-wise. + + The softmax function s(x) is defined as: + s(x) == exp(x)/sum(exp(x)) + where x is a vector. + + 1. Channel-wise mode (s_mode_ == CHANNEL_WISE): + This mode treats the input tensor as a collection of multi-channel images + and applies s() to each spatial location in each image. The tensor::k() + channel elements at each position are input to s() and then replaced by + the outputs of s(). + + 2. Plane-wise mode (s_mode_ == PLANE_WISE): + This mode applies the softmax function across entire planes (nr x nc) of + the input tensor, useful for operations in Large Language Models (LLMs) + and other applications requiring 2D tensor processing. + + In both modes, the sum of the outputs of s() will always be equal to 1 for + each application of the function. - This means that, for example, if you collapsed each output image to a 1 - channel image by adding the channels then you would end up with images - where each pixel value was 1. This is because the sum of the outputs of - s() will always be equal to 1. + TEMPLATE PARAMETERS + - s_mode_: Determines the mode of operation (CHANNEL_WISE or PLANE_WISE) !*/ public: + softmax_(); - softmax_( - ); - - template void setup (const SUBNET& sub); + template void setup(const SUBNET& sub); void forward_inplace(const tensor& input, tensor& output); - void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); - const tensor& get_layer_params() const; - tensor& get_layer_params(); + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& params_grad + ); + const tensor& get_layer_params() const; + tensor& get_layer_params(); /*! - These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ - interface. 
Note that this layer doesn't have any parameters, so the tensor + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor returned by get_layer_params() is always empty. !*/ + + friend void serialize(const softmax_& item, std::ostream& out); + friend void deserialize(softmax_& item, std::istream& in); + friend std::ostream& operator<<(std::ostream& out, const softmax_& item); + friend void to_xml(const softmax_& item, std::ostream& out); }; template - using softmax = add_layer; + using softmax = add_layer, SUBNET>; + + template + using softmaxm = add_layer, SUBNET>; // ---------------------------------------------------------------------------------------- @@ -3175,6 +3199,85 @@ namespace dlib using mult_prev9_ = mult_prev_; using mult_prev10_ = mult_prev_; +// ---------------------------------------------------------------------------------------- + + template < + template class tag + > + class multm_prev_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer performs matrix multiplication on the output + of two previous layers. It multiplies the tensor from its immediate + predecessor layer, sub.get_output(), with the tensor from a deeper layer, + layer(sub).get_output(). + + The tag template argument specifies which layer to multiply with the + output of the previous layer. The result of this multiplication is + output by multm_prev_. The multiplication is performed using a modified + version of gemm() to account for the 2D matrix dimension in the nr()xnc() + planes of Dlib's 4D tensors. + + This layer is similar to mult_prev_, but it considers the full matrix + in the nr()xnc() planes of the tensor, rather than just the upper + num_samples()xk() plane. This makes it suitable for implementing + mechanisms like attention, especially when the k() channel plane is + used to model multiple heads for parallel matrix processing. + + The output tensor dimensions are determined as follows: + - output.num_samples() == t1.num_samples() + - output.k() == t1.k() + - output.nr() == t1.nr() + - output.nc() == t2.nc() + where t1 is sub.get_output() and t2 is layer(sub).get_output(). + !*/ + + public: + multm_prev_( + ); + + template void setup (const SUBNET& sub); + template void forward(const SUBNET& sub, resizable_tensor& output); + template void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template < + template class tag, + typename SUBNET + > + using multm_prev = add_layer, SUBNET>; + + // Here we add some convenient aliases for using multm_prev_ with the tag layers. 
+    template <typename SUBNET> using multm_prev1 = multm_prev<tag1, SUBNET>;
+    template <typename SUBNET> using multm_prev2 = multm_prev<tag2, SUBNET>;
+    template <typename SUBNET> using multm_prev3 = multm_prev<tag3, SUBNET>;
+    template <typename SUBNET> using multm_prev4 = multm_prev<tag4, SUBNET>;
+    template <typename SUBNET> using multm_prev5 = multm_prev<tag5, SUBNET>;
+    template <typename SUBNET> using multm_prev6 = multm_prev<tag6, SUBNET>;
+    template <typename SUBNET> using multm_prev7 = multm_prev<tag7, SUBNET>;
+    template <typename SUBNET> using multm_prev8 = multm_prev<tag8, SUBNET>;
+    template <typename SUBNET> using multm_prev9 = multm_prev<tag9, SUBNET>;
+    template <typename SUBNET> using multm_prev10 = multm_prev<tag10, SUBNET>;
+    using multm_prev1_ = multm_prev_<tag1>;
+    using multm_prev2_ = multm_prev_<tag2>;
+    using multm_prev3_ = multm_prev_<tag3>;
+    using multm_prev4_ = multm_prev_<tag4>;
+    using multm_prev5_ = multm_prev_<tag5>;
+    using multm_prev6_ = multm_prev_<tag6>;
+    using multm_prev7_ = multm_prev_<tag7>;
+    using multm_prev8_ = multm_prev_<tag8>;
+    using multm_prev9_ = multm_prev_<tag9>;
+    using multm_prev10_ = multm_prev_<tag10>;
+
 // ----------------------------------------------------------------------------------------
 
     template <
diff --git a/dlib/dnn/visitors.h b/dlib/dnn/visitors.h
index 726f3b200e..589e3556ef 100644
--- a/dlib/dnn/visitors.h
+++ b/dlib/dnn/visitors.h
@@ -841,6 +841,16 @@ namespace dlib
             update(i);
         }
 
+        template
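
Usage sketch (not part of the patch): the snippet below shows how the new PLANE_WISE mode of tt::gemm and tt::softmax is meant to be driven at the tensor level. It assumes a dlib build with this patch applied; the tensor shapes and variable names are arbitrary, and note that the PLANE_WISE softmax normalizes each row within every nr() x nc() plane.

#include <dlib/dnn.h>
#include <iostream>

int main()
{
    using namespace dlib;

    // A batch of 2 samples with 3 channels ("heads"); every nr() x nc() plane
    // is treated as an independent matrix by the PLANE_WISE code paths.
    resizable_tensor a(2, 3, 4, 6);   // planes of size 4x6
    resizable_tensor b(2, 3, 6, 5);   // planes of size 6x5
    resizable_tensor c(2, 3, 4, 5);   // receives the per-plane products
    tt::tensor_rand rnd;
    rnd.fill_uniform(a);
    rnd.fill_uniform(b);

    // c[s][k] = a[s][k] * b[s][k] for every sample s and channel k.
    tt::gemm(0, c, 1, a, false, b, false, operation_mode::PLANE_WISE);

    // Row-wise softmax inside each 4x5 plane of c, as opposed to the default
    // CHANNEL_WISE softmax across the k() dimension at each spatial location.
    resizable_tensor s;
    s.copy_size(c);
    tt::softmax(s, c, operation_mode::PLANE_WISE);

    std::cout << "first plane of the result:\n" << image_plane(s, 0, 0) << std::endl;
    return 0;
}

In a network definition the same pattern is what the new layers express: multm_prev1 multiplies the nr() x nc() planes of the current layer with those of the layer under tag1, and softmaxm applies the plane-wise softmax shown above.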