Revert "Added small-size larfg and larf kernels" (#769)

Revert "Added small-size larfg and larf kernels (#759)" Reverting due to failures in single-precision SYGVDX on gfx1101 when GetParam() = ({ 50, 50, 60, 70, -15, -5, 28, 35, 1 }, { 2, V, I, U }). This reverts commit 5156c3a.
ROCm · Jul 24, 2024 · 057cfa7 · 057cfa7
1 parent 5156c3a
commit 057cfa7
Show file tree

Hide file tree

Showing 21 changed files with 79 additions and 956 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,8 +10,6 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](h
     - LARFG_64
 
 ### Optimized
-- Improved performanced of LARFG, LARF, and downstream functions such as GEQR2 and GEQRF
-
 ### Changed
 ### Deprecated
 ### Removed

diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
@@ -280,16 +280,6 @@ set(rocsolver_refact_source
 )
 
 set(rocsolver_specialized_source
-  # larf
-  specialized/rocauxiliary_larf_specialized_kernels_s.cpp
-  specialized/rocauxiliary_larf_specialized_kernels_d.cpp
-  specialized/rocauxiliary_larf_specialized_kernels_c.cpp
-  specialized/rocauxiliary_larf_specialized_kernels_z.cpp
-  # larfg
-  specialized/rocauxiliary_larfg_specialized_kernels_s.cpp
-  specialized/rocauxiliary_larfg_specialized_kernels_d.cpp
-  specialized/rocauxiliary_larfg_specialized_kernels_c.cpp
-  specialized/rocauxiliary_larfg_specialized_kernels_z.cpp
   # gemm
   specialized/roclapack_gemm_specialized_kernels_s.cpp
   specialized/roclapack_gemm_specialized_kernels_d.cpp

diff --git a/library/src/auxiliary/rocauxiliary_larf.hpp b/library/src/auxiliary/rocauxiliary_larf.hpp
@@ -34,7 +34,6 @@
 
 #include "rocblas.hpp"
 #include "rocsolver/rocsolver.h"
-#include "rocsolver_run_specialized_kernels.hpp"
 
 ROCSOLVER_BEGIN_NAMESPACE
 
@@ -59,23 +58,6 @@ void rocsolver_larf_getMemorySize(const rocblas_side side,
     // size of scalars (constants)
     *size_scalars = sizeof(T) * 3;
 
-    // size of array of pointers to workspace
-    if(BATCHED)
-        *size_workArr = sizeof(T*) * batch_count;
-    else
-        *size_workArr = 0;
-
-    // if small size no workspace needed
-    bool ssker_left
-        = (side == rocblas_side_left && m <= LARF_SSKER_MAX_DIM && n <= LARF_SSKER_MIN_DIM);
-    bool ssker_right
-        = (side == rocblas_side_right && m <= LARF_SSKER_MIN_DIM && n <= LARF_SSKER_MAX_DIM);
-    if(ssker_left || ssker_right)
-    {
-        *size_Abyx = 0;
-        return;
-    }
-
     // size of temporary result in Householder matrix generation
     if(side == rocblas_side_left)
         *size_Abyx = n;
@@ -84,6 +66,12 @@ void rocsolver_larf_getMemorySize(const rocblas_side side,
     else
         *size_Abyx = std::max(m, n);
     *size_Abyx *= sizeof(T) * batch_count;
+
+    // size of array of pointers to workspace
+    if(BATCHED)
+        *size_workArr = sizeof(T*) * batch_count;
+    else
+        *size_workArr = 0;
 }
 
 template <typename T, typename I, typename U>
@@ -149,17 +137,6 @@ rocblas_status rocsolver_larf_template(rocblas_handle handle,
     hipStream_t stream;
     rocblas_get_stream(handle, &stream);
 
-    // if n is small, use small-size kernel
-    bool ssker_left
-        = (side == rocblas_side_left && m <= LARF_SSKER_MAX_DIM && n <= LARF_SSKER_MIN_DIM);
-    bool ssker_right
-        = (side == rocblas_side_right && m <= LARF_SSKER_MIN_DIM && n <= LARF_SSKER_MAX_DIM);
-    if(ssker_left || ssker_right)
-    {
-        return larf_run_small(handle, side, m, n, x, shiftx, incx, stridex, alpha, stridep, A,
-                              shiftA, lda, stridea, batch_count);
-    }
-
     // everything must be executed with scalars on the device
     rocblas_pointer_mode old_mode;
     rocblas_get_pointer_mode(handle, &old_mode);

diff --git a/library/src/auxiliary/rocauxiliary_larfg.hpp b/library/src/auxiliary/rocauxiliary_larfg.hpp
@@ -34,90 +34,91 @@
 
 #include "rocblas.hpp"
 #include "rocsolver/rocsolver.h"
-#include "rocsolver_run_specialized_kernels.hpp"
 
 ROCSOLVER_BEGIN_NAMESPACE
 
-template <typename T, std::enable_if_t<!rocblas_is_complex<T>, int> = 0>
-__device__ void run_set_taubeta(T* tau, T* norms, T* alpha)
+template <typename T, typename I, typename U, std::enable_if_t<!rocblas_is_complex<T>, int> = 0>
+ROCSOLVER_KERNEL void set_taubeta(T* tau,
+                                  const rocblas_stride strideP,
+                                  T* norms,
+                                  U alpha,
+                                  const rocblas_stride shifta,
+                                  const rocblas_stride stride)
 {
-    if(norms[0] > 0)
+    I b = hipBlockIdx_x;
+
+    T* a = load_ptr_batch<T>(alpha, b, shifta, stride);
+    T* t = tau + b * strideP;
+
+    if(norms[b] > 0)
     {
-        T n = sqrt(norms[0] + alpha[0] * alpha[0]);
-        n = alpha[0] >= 0 ? -n : n;
+        T n = sqrt(norms[b] + a[0] * a[0]);
+        n = a[0] >= 0 ? -n : n;
 
         // scaling factor:
-        norms[0] = 1.0 / (alpha[0] - n);
+        norms[b] = 1.0 / (a[0] - n);
 
         // tau:
-        tau[0] = (n - alpha[0]) / n;
+        t[0] = (n - a[0]) / n;
 
         // beta:
-        alpha[0] = n;
+        a[0] = n;
     }
     else
     {
-        norms[0] = 1;
-        tau[0] = 0;
+        norms[b] = 1;
+        t[0] = 0;
     }
 }
 
-template <typename T, std::enable_if_t<rocblas_is_complex<T>, int> = 0>
-__device__ void run_set_taubeta(T* tau, T* norms, T* alpha)
+template <typename T, typename I, typename U, std::enable_if_t<rocblas_is_complex<T>, int> = 0>
+ROCSOLVER_KERNEL void set_taubeta(T* tau,
+                                  const rocblas_stride strideP,
+                                  T* norms,
+                                  U alpha,
+                                  const rocblas_stride shifta,
+                                  const rocblas_stride stride)
 {
     using S = decltype(std::real(T{}));
+    I b = hipBlockIdx_x;
     S r, rr, ri, ar, ai;
 
-    ar = alpha[0].real();
-    ai = alpha[0].imag();
+    T* a = load_ptr_batch<T>(alpha, b, shifta, stride);
+    T* t = tau + b * strideP;
+
+    ar = a[0].real();
+    ai = a[0].imag();
     S m = ai * ai;
 
-    if(norms[0].real() > 0 || m > 0)
+    if(norms[b].real() > 0 || m > 0)
     {
         m += ar * ar;
-        S n = sqrt(norms[0].real() + m);
+        S n = sqrt(norms[b].real() + m);
         n = ar >= 0 ? -n : n;
 
         // scaling factor:
-        //    norms[0] = 1.0 / (alpha[0] - n);
+        //    norms[b] = 1.0 / (a[0] - n);
         r = (ar - n) * (ar - n) + ai * ai;
         rr = (ar - n) / r;
         ri = -ai / r;
-        norms[0] = rocblas_complex_num<S>(rr, ri);
+        norms[b] = rocblas_complex_num<S>(rr, ri);
 
         // tau:
-        //    tau[0] = (n - alpha[0]) / n;
+        //t[0] = (n - a[0]) / n;
         rr = (n - ar) / n;
         ri = -ai / n;
-        tau[0] = rocblas_complex_num<S>(rr, ri);
+        t[0] = rocblas_complex_num<S>(rr, ri);
 
         // beta:
-        alpha[0] = n;
+        a[0] = n;
     }
     else
     {
-        norms[0] = 1;
-        tau[0] = 0;
+        norms[b] = 1;
+        t[0] = 0;
     }
 }
 
-template <typename T, typename I, typename U>
-ROCSOLVER_KERNEL void set_taubeta(T* tauA,
-                                  const rocblas_stride strideP,
-                                  T* norms,
-                                  U alphaA,
-                                  const rocblas_stride shiftA,
-                                  const rocblas_stride strideA)
-{
-    I bid = hipBlockIdx_x;
-
-    // select batch instance
-    T* alpha = load_ptr_batch<T>(alphaA, bid, shiftA, strideA);
-    T* tau = tauA + bid * strideP;
-
-    run_set_taubeta<T>(tau, norms + bid, alpha);
-}
-
 template <typename T, typename I>
 void rocsolver_larfg_getMemorySize(const I n, const I batch_count, size_t* size_work, size_t* size_norms)
 {
@@ -129,14 +130,6 @@ void rocsolver_larfg_getMemorySize(const I n, const I batch_count, size_t* size_
         return;
     }
 
-    // if small size no workspace needed
-    if(n <= LARFG_SSKER_MAX_N)
-    {
-        *size_norms = 0;
-        *size_work = 0;
-        return;
-    }
-
     // size of space to store norms
     *size_norms = sizeof(T) * batch_count;
 
@@ -207,13 +200,6 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle,
         return rocblas_status_success;
     }
 
-    // if n is small, use small-size kernel
-    if(n <= LARFG_SSKER_MAX_N)
-    {
-        return larfg_run_small(handle, n, alpha, shifta, stridex, x, shiftx, incx, stridex, tau,
-                               strideP, batch_count);
-    }
-
     // everything must be executed with scalars on the device
     rocblas_pointer_mode old_mode;
     rocblas_get_pointer_mode(handle, &old_mode);

diff --git a/library/src/auxiliary/rocauxiliary_latrd.hpp b/library/src/auxiliary/rocauxiliary_latrd.hpp
@@ -58,8 +58,7 @@ void rocsolver_latrd_getMemorySize(const rocblas_int n,
         return;
     }
 
-    size_t n1 = 0, n2 = 0;
-    size_t w1 = 0, w2 = 0, w3 = 0;
+    size_t w_temp;
 
     // size of scalars (constants) for rocblas calls
     *size_scalars = sizeof(T) * 3;
@@ -71,20 +70,11 @@ void rocsolver_latrd_getMemorySize(const rocblas_int n,
         *size_workArr = 0;
 
     // extra requirements for calling larfg
-    rocsolver_larfg_getMemorySize<T>(n, batch_count, &w1, &n1);
+    rocsolver_larfg_getMemorySize<T>(n, batch_count, size_work, size_norms);
 
     // extra requirements for calling symv/hemv
-    rocblasCall_symv_hemv_mem<BATCHED, T>(n, batch_count, &w2);
-
-    // size of re-usable workspace
-    // TODO: replace with rocBLAS call
-    constexpr int ROCBLAS_DOT_NB = 512;
-    w3 = n > 2 ? (n - 2) / ROCBLAS_DOT_NB + 2 : 1;
-    w3 *= sizeof(T) * batch_count;
-    n2 = sizeof(T) * batch_count;
-
-    *size_norms = std::max(n1, n2);
-    *size_work = std::max({w1, w2, w3});
+    rocblasCall_symv_hemv_mem<BATCHED, T>(n, batch_count, &w_temp);
+    *size_work = std::max(*size_work, w_temp);
 }
 
 template <typename T, typename S, typename U>

diff --git a/library/src/include/ideal_sizes.hpp b/library/src/include/ideal_sizes.hpp
@@ -31,37 +31,6 @@
     \brief ideal_sizes.hpp gathers all constants that can be tuned for performance.
  *********************************************************************************/
 
-#define BS1 256 // generic 1 dimensional thread-block size used to call common kernels
-#define BS2 32 // generic 2 dimensional thread-block size used to call common kernels
-
-/******************************* larf ****************************************
-*******************************************************************************/
-#ifndef LARF_SSKER_THREADS
-#define LARF_SSKER_THREADS 256 // must be 64, 128, 256, 512, or 1024
-#endif
-
-#ifndef LARF_SSKER_BLOCKS
-#define LARF_SSKER_BLOCKS 64
-#endif
-
-#ifndef LARF_SSKER_MAX_DIM
-#define LARF_SSKER_MAX_DIM 2048 // should be >= LARF_SSKER_THREADS
-#endif
-
-#ifndef LARF_SSKER_MIN_DIM
-#define LARF_SSKER_MIN_DIM 64 // should be >= LARF_SSKER_BLOCKS
-#endif
-
-/******************************* larfg ****************************************
-*******************************************************************************/
-#ifndef LARFG_SSKER_THREADS
-#define LARFG_SSKER_THREADS 256 // must be 64, 128, 256, 512, or 1024
-#endif
-
-#ifndef LARFG_SSKER_MAX_N
-#define LARFG_SSKER_MAX_N 2048
-#endif
-
 /***************** geqr2/geqrf and geql2/geqlf ********************************
 *******************************************************************************/
 /*! \brief Determines the size of the block column factorized at each step