Skip to content

Commit

Permalink
Revert "Added small-size larfg and larf kernels" (#769)
Browse files Browse the repository at this point in the history
Revert "Added small-size larfg and larf kernels (#759)"

Reverting due to failures in single-precision SYGVDX on gfx1101 when
GetParam() = ({ 50, 50, 60, 70, -15, -5, 28, 35, 1 }, { 2, V, I, U }).

This reverts commit 5156c3a.
  • Loading branch information
cgmb authored Jul 24, 2024
1 parent 5156c3a commit 057cfa7
Show file tree
Hide file tree
Showing 21 changed files with 79 additions and 956 deletions.
2 changes: 0 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](h
- LARFG_64

### Optimized
- Improved performanced of LARFG, LARF, and downstream functions such as GEQR2 and GEQRF

### Changed
### Deprecated
### Removed
Expand Down
10 changes: 0 additions & 10 deletions library/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -280,16 +280,6 @@ set(rocsolver_refact_source
)

set(rocsolver_specialized_source
# larf
specialized/rocauxiliary_larf_specialized_kernels_s.cpp
specialized/rocauxiliary_larf_specialized_kernels_d.cpp
specialized/rocauxiliary_larf_specialized_kernels_c.cpp
specialized/rocauxiliary_larf_specialized_kernels_z.cpp
# larfg
specialized/rocauxiliary_larfg_specialized_kernels_s.cpp
specialized/rocauxiliary_larfg_specialized_kernels_d.cpp
specialized/rocauxiliary_larfg_specialized_kernels_c.cpp
specialized/rocauxiliary_larfg_specialized_kernels_z.cpp
# gemm
specialized/roclapack_gemm_specialized_kernels_s.cpp
specialized/roclapack_gemm_specialized_kernels_d.cpp
Expand Down
35 changes: 6 additions & 29 deletions library/src/auxiliary/rocauxiliary_larf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@

#include "rocblas.hpp"
#include "rocsolver/rocsolver.h"
#include "rocsolver_run_specialized_kernels.hpp"

ROCSOLVER_BEGIN_NAMESPACE

Expand All @@ -59,23 +58,6 @@ void rocsolver_larf_getMemorySize(const rocblas_side side,
// size of scalars (constants)
*size_scalars = sizeof(T) * 3;

// size of array of pointers to workspace
if(BATCHED)
*size_workArr = sizeof(T*) * batch_count;
else
*size_workArr = 0;

// if small size no workspace needed
bool ssker_left
= (side == rocblas_side_left && m <= LARF_SSKER_MAX_DIM && n <= LARF_SSKER_MIN_DIM);
bool ssker_right
= (side == rocblas_side_right && m <= LARF_SSKER_MIN_DIM && n <= LARF_SSKER_MAX_DIM);
if(ssker_left || ssker_right)
{
*size_Abyx = 0;
return;
}

// size of temporary result in Householder matrix generation
if(side == rocblas_side_left)
*size_Abyx = n;
Expand All @@ -84,6 +66,12 @@ void rocsolver_larf_getMemorySize(const rocblas_side side,
else
*size_Abyx = std::max(m, n);
*size_Abyx *= sizeof(T) * batch_count;

// size of array of pointers to workspace
if(BATCHED)
*size_workArr = sizeof(T*) * batch_count;
else
*size_workArr = 0;
}

template <typename T, typename I, typename U>
Expand Down Expand Up @@ -149,17 +137,6 @@ rocblas_status rocsolver_larf_template(rocblas_handle handle,
hipStream_t stream;
rocblas_get_stream(handle, &stream);

// if n is small, use small-size kernel
bool ssker_left
= (side == rocblas_side_left && m <= LARF_SSKER_MAX_DIM && n <= LARF_SSKER_MIN_DIM);
bool ssker_right
= (side == rocblas_side_right && m <= LARF_SSKER_MIN_DIM && n <= LARF_SSKER_MAX_DIM);
if(ssker_left || ssker_right)
{
return larf_run_small(handle, side, m, n, x, shiftx, incx, stridex, alpha, stridep, A,
shiftA, lda, stridea, batch_count);
}

// everything must be executed with scalars on the device
rocblas_pointer_mode old_mode;
rocblas_get_pointer_mode(handle, &old_mode);
Expand Down
98 changes: 42 additions & 56 deletions library/src/auxiliary/rocauxiliary_larfg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,90 +34,91 @@

#include "rocblas.hpp"
#include "rocsolver/rocsolver.h"
#include "rocsolver_run_specialized_kernels.hpp"

ROCSOLVER_BEGIN_NAMESPACE

template <typename T, std::enable_if_t<!rocblas_is_complex<T>, int> = 0>
__device__ void run_set_taubeta(T* tau, T* norms, T* alpha)
template <typename T, typename I, typename U, std::enable_if_t<!rocblas_is_complex<T>, int> = 0>
ROCSOLVER_KERNEL void set_taubeta(T* tau,
const rocblas_stride strideP,
T* norms,
U alpha,
const rocblas_stride shifta,
const rocblas_stride stride)
{
if(norms[0] > 0)
I b = hipBlockIdx_x;

T* a = load_ptr_batch<T>(alpha, b, shifta, stride);
T* t = tau + b * strideP;

if(norms[b] > 0)
{
T n = sqrt(norms[0] + alpha[0] * alpha[0]);
n = alpha[0] >= 0 ? -n : n;
T n = sqrt(norms[b] + a[0] * a[0]);
n = a[0] >= 0 ? -n : n;

// scaling factor:
norms[0] = 1.0 / (alpha[0] - n);
norms[b] = 1.0 / (a[0] - n);

// tau:
tau[0] = (n - alpha[0]) / n;
t[0] = (n - a[0]) / n;

// beta:
alpha[0] = n;
a[0] = n;
}
else
{
norms[0] = 1;
tau[0] = 0;
norms[b] = 1;
t[0] = 0;
}
}

template <typename T, std::enable_if_t<rocblas_is_complex<T>, int> = 0>
__device__ void run_set_taubeta(T* tau, T* norms, T* alpha)
template <typename T, typename I, typename U, std::enable_if_t<rocblas_is_complex<T>, int> = 0>
ROCSOLVER_KERNEL void set_taubeta(T* tau,
const rocblas_stride strideP,
T* norms,
U alpha,
const rocblas_stride shifta,
const rocblas_stride stride)
{
using S = decltype(std::real(T{}));
I b = hipBlockIdx_x;
S r, rr, ri, ar, ai;

ar = alpha[0].real();
ai = alpha[0].imag();
T* a = load_ptr_batch<T>(alpha, b, shifta, stride);
T* t = tau + b * strideP;

ar = a[0].real();
ai = a[0].imag();
S m = ai * ai;

if(norms[0].real() > 0 || m > 0)
if(norms[b].real() > 0 || m > 0)
{
m += ar * ar;
S n = sqrt(norms[0].real() + m);
S n = sqrt(norms[b].real() + m);
n = ar >= 0 ? -n : n;

// scaling factor:
// norms[0] = 1.0 / (alpha[0] - n);
// norms[b] = 1.0 / (a[0] - n);
r = (ar - n) * (ar - n) + ai * ai;
rr = (ar - n) / r;
ri = -ai / r;
norms[0] = rocblas_complex_num<S>(rr, ri);
norms[b] = rocblas_complex_num<S>(rr, ri);

// tau:
// tau[0] = (n - alpha[0]) / n;
//t[0] = (n - a[0]) / n;
rr = (n - ar) / n;
ri = -ai / n;
tau[0] = rocblas_complex_num<S>(rr, ri);
t[0] = rocblas_complex_num<S>(rr, ri);

// beta:
alpha[0] = n;
a[0] = n;
}
else
{
norms[0] = 1;
tau[0] = 0;
norms[b] = 1;
t[0] = 0;
}
}

template <typename T, typename I, typename U>
ROCSOLVER_KERNEL void set_taubeta(T* tauA,
const rocblas_stride strideP,
T* norms,
U alphaA,
const rocblas_stride shiftA,
const rocblas_stride strideA)
{
I bid = hipBlockIdx_x;

// select batch instance
T* alpha = load_ptr_batch<T>(alphaA, bid, shiftA, strideA);
T* tau = tauA + bid * strideP;

run_set_taubeta<T>(tau, norms + bid, alpha);
}

template <typename T, typename I>
void rocsolver_larfg_getMemorySize(const I n, const I batch_count, size_t* size_work, size_t* size_norms)
{
Expand All @@ -129,14 +130,6 @@ void rocsolver_larfg_getMemorySize(const I n, const I batch_count, size_t* size_
return;
}

// if small size no workspace needed
if(n <= LARFG_SSKER_MAX_N)
{
*size_norms = 0;
*size_work = 0;
return;
}

// size of space to store norms
*size_norms = sizeof(T) * batch_count;

Expand Down Expand Up @@ -207,13 +200,6 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle,
return rocblas_status_success;
}

// if n is small, use small-size kernel
if(n <= LARFG_SSKER_MAX_N)
{
return larfg_run_small(handle, n, alpha, shifta, stridex, x, shiftx, incx, stridex, tau,
strideP, batch_count);
}

// everything must be executed with scalars on the device
rocblas_pointer_mode old_mode;
rocblas_get_pointer_mode(handle, &old_mode);
Expand Down
18 changes: 4 additions & 14 deletions library/src/auxiliary/rocauxiliary_latrd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ void rocsolver_latrd_getMemorySize(const rocblas_int n,
return;
}

size_t n1 = 0, n2 = 0;
size_t w1 = 0, w2 = 0, w3 = 0;
size_t w_temp;

// size of scalars (constants) for rocblas calls
*size_scalars = sizeof(T) * 3;
Expand All @@ -71,20 +70,11 @@ void rocsolver_latrd_getMemorySize(const rocblas_int n,
*size_workArr = 0;

// extra requirements for calling larfg
rocsolver_larfg_getMemorySize<T>(n, batch_count, &w1, &n1);
rocsolver_larfg_getMemorySize<T>(n, batch_count, size_work, size_norms);

// extra requirements for calling symv/hemv
rocblasCall_symv_hemv_mem<BATCHED, T>(n, batch_count, &w2);

// size of re-usable workspace
// TODO: replace with rocBLAS call
constexpr int ROCBLAS_DOT_NB = 512;
w3 = n > 2 ? (n - 2) / ROCBLAS_DOT_NB + 2 : 1;
w3 *= sizeof(T) * batch_count;
n2 = sizeof(T) * batch_count;

*size_norms = std::max(n1, n2);
*size_work = std::max({w1, w2, w3});
rocblasCall_symv_hemv_mem<BATCHED, T>(n, batch_count, &w_temp);
*size_work = std::max(*size_work, w_temp);
}

template <typename T, typename S, typename U>
Expand Down
31 changes: 0 additions & 31 deletions library/src/include/ideal_sizes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,37 +31,6 @@
\brief ideal_sizes.hpp gathers all constants that can be tuned for performance.
*********************************************************************************/

#define BS1 256 // generic 1 dimensional thread-block size used to call common kernels
#define BS2 32 // generic 2 dimensional thread-block size used to call common kernels

/******************************* larf ****************************************
*******************************************************************************/
#ifndef LARF_SSKER_THREADS
#define LARF_SSKER_THREADS 256 // must be 64, 128, 256, 512, or 1024
#endif

#ifndef LARF_SSKER_BLOCKS
#define LARF_SSKER_BLOCKS 64
#endif

#ifndef LARF_SSKER_MAX_DIM
#define LARF_SSKER_MAX_DIM 2048 // should be >= LARF_SSKER_THREADS
#endif

#ifndef LARF_SSKER_MIN_DIM
#define LARF_SSKER_MIN_DIM 64 // should be >= LARF_SSKER_BLOCKS
#endif

/******************************* larfg ****************************************
*******************************************************************************/
#ifndef LARFG_SSKER_THREADS
#define LARFG_SSKER_THREADS 256 // must be 64, 128, 256, 512, or 1024
#endif

#ifndef LARFG_SSKER_MAX_N
#define LARFG_SSKER_MAX_N 2048
#endif

/***************** geqr2/geqrf and geql2/geqlf ********************************
*******************************************************************************/
/*! \brief Determines the size of the block column factorized at each step
Expand Down
Loading

0 comments on commit 057cfa7

Please sign in to comment.