diff --git a/.github/workflows/codspeed-bench.yml b/.github/workflows/codspeed-bench.yml new file mode 100644 index 0000000000..04befefa9e --- /dev/null +++ b/.github/workflows/codspeed-bench.yml @@ -0,0 +1,150 @@ +name: Run codspeed benchmarks + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + benchmarks: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + fortran: [gfortran] + build: [make] + pyver: ["3.12"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.pyver }} + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get update + sudo apt-get install -y gfortran cmake ccache libtinfo5 + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + # GNU make and cmake call the compilers differently. It looks like + # that causes the cache to mismatch. Keep the ccache for both build + # tools separate to avoid polluting each other. + key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} + # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. 
+ restore-keys: | + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} + ccache-${{ runner.os }}-${{ matrix.build }} + + - name: Write out the .pc + run: | + cd benchmark/pybench + cat > openblas.pc << EOF + libdir=${{ github.workspace }} + includedir= ${{ github.workspace }} + openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64 + version=0.0.99 + extralib=-lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas + Name: openblas + Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version + Version: ${version} + URL: https://github.com/xianyi/OpenBLAS + Libs: ${{ github.workspace }}/libopenblas.so -Wl,-rpath,${{ github.workspace }} + Libs.private: -lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas + Cflags: -I${{ github.workspace}} + EOF + cat openblas.pc + + - name: Configure ccache + run: | + if [ "${{ matrix.build }}" = "make" ]; then + # Add ccache to path + if [ "$RUNNER_OS" = "Linux" ]; then + echo "/usr/lib/ccache" >> $GITHUB_PATH + elif [ "$RUNNER_OS" = "macOS" ]; then + echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + fi + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). 
+ test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Build OpenBLAS + run: | + case "${{ matrix.build }}" in + "make") + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" + ;; + "cmake") + mkdir build && cd build + cmake -DDYNAMIC_ARCH=1 \ + -DNOFORTRAN=0 \ + -DBUILD_WITHOUT_LAPACK=0 \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ + .. + cmake --build . + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Install benchmark dependencies + run: pip install meson ninja numpy pytest pytest-codspeed --user + + - name: Build the wrapper + run: | + cd benchmark/pybench + export PKG_CONFIG_PATH=$PWD + meson setup build --prefix=$PWD/build-install + meson install -C build + # + # sanity check + cd build/openblas_wrap + python -c'import _flapack; print(dir(_flapack))' + + - name: Run benchmarks + uses: CodSpeedHQ/action@v2 + with: + token: ${{ secrets.CODSPEED_TOKEN }} + run: | + cd benchmark/pybench + export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/ + OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py --codspeed + diff --git a/.gitignore b/.gitignore index dc6804f1ef..8294da4d44 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ benchmark/smallscaling CMakeCache.txt CMakeFiles/* .vscode +**/__pycache__ diff --git a/Makefile.system b/Makefile.system index f452011ad2..9b998b19b8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -269,6 +269,7 @@ else ifeq ($(ARCH), power) SMALL_MATRIX_OPT = 1 BUILD_BFLOAT16 = 1 endif +SMALL_MATRIX_OPT = 1 ifeq ($(SMALL_MATRIX_OPT), 1) CCOMMON_OPT += -DSMALL_MATRIX_OPT endif diff --git 
a/benchmark/pybench/README.md b/benchmark/pybench/README.md new file mode 100644 index 0000000000..7523ca75ab --- /dev/null +++ b/benchmark/pybench/README.md @@ -0,0 +1,49 @@ +# Continuous benchmarking of OpenBLAS performance + +We run a set of benchmarks of a subset of OpenBLAS functionality. + +## Benchmark runner + +[![CodSpeed Badge](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/OpenMathLib/OpenBLAS/) + +Click on [benchmarks](https://codspeed.io/OpenMathLib/OpenBLAS/benchmarks) to see the performance of a particular benchmark over time; +Click on [branches](https://codspeed.io/OpenMathLib/OpenBLAS/branches/) and then on the last PR link to see the flamegraphs. + +## What are the benchmarks + +We run raw BLAS/LAPACK subroutines, via f2py-generated python wrappers. The wrappers themselves are equivalent to [those from SciPy](https://docs.scipy.org/doc/scipy/reference/linalg.lapack.html). +In fact, the wrappers _are_ from SciPy; we take a small subset simply to avoid having to build the whole SciPy for each CI run. + + +## Adding a new benchmark + +`.github/workflows/codspeed-bench.yml` does all the orchestration on CI. + +Benchmarks live in the `benchmark/pybench` directory. It is organized as follows: + +- benchmarks themselves live in the `benchmarks` folder. Note that the LAPACK routines are imported from the `openblas_wrap` package. +- the `openblas_wrap` package is a simple trampoline: it contains an f2py extension, `_flapack`, which talks to OpenBLAS, and exports the python names in its `__init__.py`. +This way, the `openblas_wrap` package shields the benchmarks from the details of where a particular LAPACK function comes from. If desired, you may for instance swap the `_flapack` extension for +`scipy.linalg.blas` and `scipy.linalg.lapack`. + +To change parameters of an existing benchmark, edit python files in the `benchmark/pybench/benchmarks` directory. 
+ +To add a benchmark for a new BLAS or LAPACK function, you need to: + +- add an f2py wrapper for the bare LAPACK function. You can simply copy a wrapper from SciPy (look for `*.pyf.src` files in https://github.com/scipy/scipy/tree/main/scipy/linalg) +- add an import to `benchmark/pybench/openblas_wrap/__init__.py` + + +## Running benchmarks locally + +This benchmarking layer is orchestrated from python, therefore you'll need to +have everything it takes to build OpenBLAS from source, plus `python` and + +``` +$ python -mpip install numpy meson ninja pytest pytest-benchmark +``` + +The benchmark syntax is consistent with that of the `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/bench_blas.py`. + +An ASV compatible benchmark suite is planned but currently not implemented. + diff --git a/benchmark/pybench/benchmarks/bench_blas.py b/benchmark/pybench/benchmarks/bench_blas.py new file mode 100644 index 0000000000..064be1ead9 --- /dev/null +++ b/benchmark/pybench/benchmarks/bench_blas.py @@ -0,0 +1,185 @@ +import pytest +import numpy as np +from openblas_wrap import ( + # level 1 + dnrm2, ddot, daxpy, + # level 3 + dgemm, dsyrk, + # lapack + dgesv, # linalg.solve + dgesdd, dgesdd_lwork, # linalg.svd + dsyev, dsyev_lwork, # linalg.eigh +) + +# ### BLAS level 1 ### + +# dnrm2 + +dnrm2_sizes = [100, 1000] + +def run_dnrm2(n, x, incx): + res = dnrm2(x, n, incx=incx) + return res + + +@pytest.mark.parametrize('n', dnrm2_sizes) +def test_nrm2(benchmark, n): + rndm = np.random.RandomState(1234) + x = np.array(rndm.uniform(size=(n,)), dtype=float) + result = benchmark(run_dnrm2, n, x, 1) + + +# ddot + +ddot_sizes = [100, 1000] + +def run_ddot(x, y,): + res = ddot(x, y) + return res + + +@pytest.mark.parametrize('n', ddot_sizes) +def test_dot(benchmark, n): + rndm = np.random.RandomState(1234) + x = np.array(rndm.uniform(size=(n,)), dtype=float) + y = np.array(rndm.uniform(size=(n,)), dtype=float) + result = 
benchmark(run_ddot, x, y) + + +# daxpy + +daxpy_sizes = [100, 1000] + +def run_daxpy(x, y,): + res = daxpy(x, y, a=2.0) + return res + + +@pytest.mark.parametrize('n', daxpy_sizes) +def test_daxpy(benchmark, n): + rndm = np.random.RandomState(1234) + x = np.array(rndm.uniform(size=(n,)), dtype=float) + y = np.array(rndm.uniform(size=(n,)), dtype=float) + result = benchmark(run_daxpy, x, y) + + + + +# ### BLAS level 3 ### + +# dgemm + +gemm_sizes = [100, 1000] + +def run_gemm(a, b, c): + alpha = 1.0 + res = dgemm(alpha, a, b, c=c, overwrite_c=True) + return res + + +@pytest.mark.parametrize('n', gemm_sizes) +def test_gemm(benchmark, n): + rndm = np.random.RandomState(1234) + a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + b = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + c = np.empty((n, n), dtype=float, order='F') + result = benchmark(run_gemm, a, b, c) + assert result is c + + +# dsyrk + +syrk_sizes = [100, 1000] + + +def run_syrk(a, c): + res = dsyrk(1.0, a, c=c, overwrite_c=True) + return res + + +@pytest.mark.parametrize('n', syrk_sizes) +def test_syrk(benchmark, n): + rndm = np.random.RandomState(1234) + a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + c = np.empty((n, n), dtype=float, order='F') + result = benchmark(run_syrk, a, c) + assert result is c + + +# ### LAPACK ### + +# linalg.solve + +gesv_sizes = [100, 1000] + + +def run_gesv(a, b): + res = dgesv(a, b, overwrite_a=True, overwrite_b=True) + return res + + +@pytest.mark.parametrize('n', gesv_sizes) +def test_gesv(benchmark, n): + rndm = np.random.RandomState(1234) + a = (np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') + + np.eye(n, order='F')) + b = np.array(rndm.uniform(size=(n, 1)), order='F') + lu, piv, x, info = benchmark(run_gesv, a, b) + assert lu is a + assert x is b + assert info == 0 + + +# linalg.svd + +gesdd_sizes = [(100, 5), (1000, 222)] + + +def run_gesdd(a, lwork): + res = dgesdd(a, lwork=lwork, full_matrices=False, 
overwrite_a=False) + return res + + +@pytest.mark.parametrize('mn', gesdd_sizes) +def test_gesdd(benchmark, mn): + m, n = mn + rndm = np.random.RandomState(1234) + a = np.array(rndm.uniform(size=(m, n)), dtype=float, order='F') + + lwork, info = dgesdd_lwork(m, n) + lwork = int(lwork) + assert info == 0 + + u, s, vt, info = benchmark(run_gesdd, a, lwork) + + assert info == 0 + np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=1e-13) + + +# linalg.eigh + +syev_sizes = [50, 200] + + +def run_syev(a, lwork): + res = dsyev(a, lwork=lwork, overwrite_a=True) + return res + + +@pytest.mark.parametrize('n', syev_sizes) +def test_syev(benchmark, n): + rndm = np.random.RandomState(1234) + a = rndm.uniform(size=(n, n)) + a = np.asarray(a + a.T, dtype=float, order='F') + a_ = a.copy() + + lwork, info = dsyev_lwork(n) + lwork = int(lwork) + assert info == 0 + + w, v, info = benchmark(run_syev, a, lwork) + + assert info == 0 + assert a is v # overwrite_a=True + + diff --git a/benchmark/pybench/meson.build b/benchmark/pybench/meson.build new file mode 100644 index 0000000000..5d921c9ed5 --- /dev/null +++ b/benchmark/pybench/meson.build @@ -0,0 +1,48 @@ +# +# Taken from SciPy (of course) +# +project( + 'openblas-wrap', + 'c', 'fortran', + version: '0.1', + license: 'BSD-3', + meson_version: '>= 1.1.0', + default_options: [ + 'buildtype=debugoptimized', + 'b_ndebug=if-release', + 'c_std=c17', + 'fortran_std=legacy', + ], +) + +py3 = import('python').find_installation(pure: false) +py3_dep = py3.dependency() + +cc = meson.get_compiler('c') + +_global_c_args = cc.get_supported_arguments( + '-Wno-unused-but-set-variable', + '-Wno-unused-function', + '-Wno-conversion', + '-Wno-misleading-indentation', +) +add_project_arguments(_global_c_args, language : 'c') + +# We need -lm for all C code (assuming it uses math functions, which is safe to +# assume for SciPy). For C++ it isn't needed, because libstdc++/libc++ is +# guaranteed to depend on it. 
For Fortran code, Meson already adds `-lm`. +m_dep = cc.find_library('m', required : false) +if m_dep.found() + add_project_link_arguments('-lm', language : 'c') +endif + +generate_f2pymod = find_program('openblas_wrap/generate_f2pymod.py') + +openblas = dependency('openblas', method: 'pkg-config', required: true) +openblas_dep = declare_dependency( + dependencies: openblas, + compile_args: [] +) + + +subdir('openblas_wrap') diff --git a/benchmark/pybench/openblas_wrap/__init__.py b/benchmark/pybench/openblas_wrap/__init__.py new file mode 100644 index 0000000000..06e16a665a --- /dev/null +++ b/benchmark/pybench/openblas_wrap/__init__.py @@ -0,0 +1,28 @@ +""" +Trampoline to hide the LAPACK details (scipy.lapack.linalg or scipy_openblas32 or...) +from benchmarking. +""" + +__version__ = "0.1" + + +#from scipy.linalg.blas import ( +from ._flapack import ( + # level 1 + dnrm2 as dnrm2, + ddot as ddot, + daxpy as daxpy, + # level 3 + dgemm as dgemm, + dsyrk as dsyrk, +) + +#from scipy.linalg.lapack import ( +from openblas_wrap._flapack import ( + # linalg.solve + dgesv as dgesv, + # linalg.svd + dgesdd as dgesdd, dgesdd_lwork as dgesdd_lwork, + # linalg.eigh + dsyev as dsyev, dsyev_lwork as dsyev_lwork +) diff --git a/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src b/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src new file mode 100644 index 0000000000..d2d94baa0e --- /dev/null +++ b/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src @@ -0,0 +1,326 @@ +! +! Taken from scipy/linalg +! +! Shorthand notations +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! +! Level 1 BLAS +! + + +python module _flapack + usercode ''' +#define F_INT int +''' + +interface + + +subroutine axpy(n,a,x,offx,incx,y,offy,incy) + ! Calculate z = a*x+y, where a is scalar. 
+ + callstatement (*f2py_func)(&n,&a,x+offx,&incx,y+offy,&incy) + callprotoargument F_INT*,*,*,F_INT*,*,F_INT* + + dimension(*), intent(in) :: x + dimension(*), intent(in,out,out=z) :: y + optional, intent(in):: a=<1.0,\0,(1.0\,0.0),\2> + integer optional, intent(in),check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in),check(incy>0||incy<0) :: incy = 1 + integer optional, intent(in),depend(x) :: offx=0 + integer optional, intent(in),depend(y) :: offy=0 + check(offx>=0 && offx=0 && offy(n-1)*abs(incx)) :: n + check(len(y)-offy>(n-1)*abs(incy)) :: n + +end subroutine axpy + +function ddot(n,x,offx,incx,y,offy,incy) result (xy) + ! Computes a vector-vector dot product. + + callstatement ddot_return_value = (*f2py_func)(&n,x+offx,&incx,y+offy,&incy) + callprotoargument F_INT*,double*,F_INT*,double*,F_INT* + intent(c) ddot + fortranname F_FUNC(ddot,DDOT) + + double precision dimension(*), intent(in) :: x + double precision dimension(*), intent(in) :: y + double precision ddot,xy + integer optional, intent(in),check(incx>0||incx<0) :: incx = 1 + integer optional, intent(in),check(incy>0||incy<0) :: incy = 1 + integer optional, intent(in),depend(x) :: offx=0 + integer optional, intent(in),depend(y) :: offy=0 + check(offx>=0 && offx=0 && offy(n-1)*abs(incx)) :: n + check(len(y)-offy>(n-1)*abs(incy)) :: n + +end function ddot + + +function nrm2(n,x,offx,incx) result(n2) + + nrm2, n2 + + callstatement nrm2_return_value = (*f2py_func)(&n,x+offx,&incx) + callprotoargument F_INT*,*,F_INT* + intent(c) nrm2 + fortranname F_FUNC(nrm2,NRM2) + + dimension(*),intent(in) :: x + + integer optional, intent(in),check(incx>0) :: incx = 1 + + integer optional,intent(in),depend(x) :: offx=0 + check(offx>=0 && offx(n-1)*abs(incx)) :: n + +end function nrm2 + +! +! Level 3 BLAS +! + + +subroutine gemm(m,n,k,alpha,a,b,beta,c,trans_a,trans_b,lda,ka,ldb,kb) + ! Computes a scalar-matrix-matrix product and adds the result to a + ! scalar-matrix product. + ! + ! 
c = gemm(alpha,a,b,beta=0,c=0,trans_a=0,trans_b=0,overwrite_c=0) + ! Calculate C <- alpha * op(A) * op(B) + beta * C + + callstatement (*f2py_func)((trans_a?(trans_a==2?"C":"T"):"N"), & + (trans_b?(trans_b==2?"C":"T"):"N"),&m,&n,&k,&alpha,a,&lda,b,&ldb,&beta,c,&m) + callprotoargument char*,char*,F_INT*,F_INT*,F_INT*,*,*,F_INT*,*, & + F_INT*,*,*,F_INT* + + integer optional,intent(in),check(trans_a>=0 && trans_a <=2) :: trans_a = 0 + integer optional,intent(in),check(trans_b>=0 && trans_b <=2) :: trans_b = 0 + intent(in) :: alpha + intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2> + + dimension(lda,ka),intent(in) :: a + dimension(ldb,kb),intent(in) :: b + dimension(m,n),intent(in,out,copy),depend(m,n),optional :: c + check(shape(c,0)==m && shape(c,1)==n) :: c + + integer depend(a),intent(hide) :: lda = shape(a,0) + integer depend(a),intent(hide) :: ka = shape(a,1) + integer depend(b),intent(hide) :: ldb = shape(b,0) + integer depend(b),intent(hide) :: kb = shape(b,1) + + integer depend(a,trans_a,ka,lda),intent(hide):: m = (trans_a?ka:lda) + integer depend(a,trans_a,ka,lda),intent(hide):: k = (trans_a?lda:ka) + integer depend(b,trans_b,kb,ldb,k),intent(hide),check(trans_b?kb==k:ldb==k) :: & + n = (trans_b?ldb:kb) + +end subroutine gemm + + +subroutine rk(n,k,alpha,a,beta,c,trans,lower,lda,ka) + ! performs one of the symmetric rank k operations + ! C := alpha*A*A**T + beta*C, or C := alpha*A**T*A + beta*C, + ! + ! c = syrk(alpha,a,beta=0,c=0,trans=0,lower=0,overwrite_c=0) + ! 
+ callstatement (*f2py_func)((lower?"L":"U"), & + (trans?(trans==2?"C":"T"):"N"), &n,&k,&alpha,a,&lda,&beta,c,&n) + callprotoargument char*,char*,F_INT*,F_INT*,*,*,F_INT*,*, & + *,F_INT* + + integer optional, intent(in),check(lower==0||lower==1) :: lower = 0 + integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0 + + intent(in) :: alpha + intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2,\2,\2> + + dimension(lda,ka),intent(in) :: a + dimension(n,n),intent(in,out,copy),depend(n),optional :: c + check(shape(c,0)==n && shape(c,1)==n) :: c + + integer depend(a),intent(hide) :: lda = shape(a,0) + integer depend(a),intent(hide) :: ka = shape(a,1) + + integer depend(a, trans, ka, lda), intent(hide) :: n = (trans ? ka : lda) + integer depend(a, trans, ka, lda), intent(hide) :: k = (trans ? lda : ka) + +end subroutine rk + + +! +! LAPACK +! + +subroutine gesv(n,nrhs,a,piv,b,info) + ! lu,piv,x,info = gesv(a,b,overwrite_a=0,overwrite_b=0) + ! Solve A * X = B. + ! A = P * L * U + ! U is upper diagonal triangular, L is unit lower triangular, + ! piv pivots columns. + + callstatement {F_INT i;(*f2py_func)(&n,&nrhs,a,&n,piv,b,&n,&info);for(i=0;i\*,F_INT*,F_INT*,*,F_INT*,F_INT* + + integer depend(a),intent(hide):: n = shape(a,0) + integer depend(b),intent(hide):: nrhs = shape(b,1) + dimension(n,n),check(shape(a,0)==shape(a,1)) :: a + integer dimension(n),depend(n),intent(out) :: piv + dimension(n,nrhs),check(shape(a,0)==shape(b,0)),depend(n) :: b + integer intent(out)::info + intent(in,out,copy,out=x) b + intent(in,out,copy,out=lu) a +end subroutine gesv + + +subroutine gesdd(m,n,minmn,u0,u1,vt0,vt1,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info) + ! u,s,vt,info = gesdd(a,compute_uv=1,lwork=..,overwrite_a=0) + ! Compute the singular value decomposition (SVD) using divide and conquer: + ! A = U * SIGMA * transpose(V) + ! A - M x N matrix + ! U - M x M matrix or min(M,N) x N if full_matrices=False + ! 
SIGMA - M x N zero matrix with a main diagonal filled with min(M,N) + ! singular values + ! transpose(V) - N x N matrix or N x min(M,N) if full_matrices=False + + callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,a,&m,s,u,&u0,vt,&vt0,work,&lwork,iwork,&info) + callprotoargument char*,F_INT*,F_INT*,*,F_INT*,*,*,F_INT*,*,F_INT*,*,F_INT*,F_INT*,F_INT* + + integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1 + integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1 + integer intent(hide),depend(a):: m = shape(a,0) + integer intent(hide),depend(a):: n = shape(a,1) + integer intent(hide),depend(m,n):: minmn = MIN(m,n) + integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: u1 = (compute_uv?(full_matrices?m:minmn):1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1) + integer intent(hide),depend(compute_uv,minmn) :: vt1 = (compute_uv?n:1) + dimension(m,n),intent(in,copy,aligned8) :: a + dimension(minmn),intent(out),depend(minmn) :: s + dimension(u0,u1),intent(out),depend(u0, u1) :: u + dimension(vt0,vt1),intent(out),depend(vt0, vt1) :: vt + dimension(lwork),intent(hide,cache),depend(lwork) :: work + integer optional,intent(in),depend(minmn,compute_uv) & + :: lwork = max((compute_uv?4*minmn*minmn+MAX(m,n)+9*minmn:MAX(14*minmn+4,10*minmn+2+25*(25+8))+MAX(m,n)),1) + integer intent(hide,cache),dimension(8*minmn),depend(minmn) :: iwork + integer intent(out)::info + +end subroutine gesdd + +subroutine gesdd_lwork(m,n,minmn,u0,vt0,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info) + ! 
LWORK computation for (S/D)GESDD + + fortranname gesdd + callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,&a,&m,&s,&u,&u0,&vt,&vt0,&work,&lwork,&iwork,&info) + callprotoargument char*,F_INT*,F_INT*,*,F_INT*,*,*,F_INT*,*,F_INT*,*,F_INT*,F_INT*,F_INT* + + integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1 + integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1 + integer intent(in) :: m + integer intent(in) :: n + integer intent(hide),depend(m,n):: minmn = MIN(m,n) + integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1) + integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1) + intent(hide) :: a + intent(hide) :: s + intent(hide) :: u + intent(hide) :: vt + intent(out) :: work + integer intent(hide) :: lwork = -1 + integer intent(hide) :: iwork + integer intent(out) :: info + +end subroutine gesdd_lwork + + +subroutine syev(compute_v,lower,n,w,a,lda,work,lwork,info) + ! w,v,info = syev(a,compute_v=1,lower=0,lwork=3*n-1,overwrite_a=0) + ! Compute all eigenvalues and, optionally, eigenvectors of a + ! real symmetric matrix A. + ! + ! Performance tip: + ! If compute_v=0 then set also overwrite_a=1. 
+ + callstatement (*f2py_func)((compute_v?"V":"N"),(lower?"L":"U"),&n,a,&lda,w,work,&lwork,&info) + callprotoargument char*,char*,F_INT*,*,F_INT*,*,*,F_INT*,F_INT* + + integer optional,intent(in):: compute_v = 1 + check(compute_v==1||compute_v==0) compute_v + integer optional,intent(in),check(lower==0||lower==1) :: lower = 0 + + integer intent(hide),depend(a):: n = shape(a,0) + integer intent(hide),depend(a):: lda = MAX(1,shape(a,0)) + dimension(n,n),check(shape(a,0)==shape(a,1)) :: a + intent(in,copy,out,out=v) :: a + + dimension(n),intent(out),depend(n) :: w + + integer optional,intent(in),depend(n) :: lwork=max(3*n-1,1) + check(lwork>=3*n-1) :: lwork + dimension(lwork),intent(hide),depend(lwork) :: work + + integer intent(out) :: info + +end subroutine syev + + +subroutine syev_lwork(lower,n,w,a,lda,work,lwork,info) + ! LWORK routines for syev + + fortranname syev + + callstatement (*f2py_func)("N",(lower?"L":"U"),&n,&a,&lda,&w,&work,&lwork,&info) + callprotoargument char*,char*,F_INT*,*,F_INT*,*,*,F_INT*,F_INT* + + integer intent(in):: n + integer optional,intent(in),check(lower==0||lower==1) :: lower = 0 + + integer intent(hide),depend(n):: lda = MAX(1, n) + intent(hide):: a + intent(hide):: w + integer intent(hide):: lwork = -1 + + intent(out):: work + integer intent(out):: info + +end subroutine syev_lwork + +end interface + +end python module _flapack + + + diff --git a/benchmark/pybench/openblas_wrap/generate_f2pymod.py b/benchmark/pybench/openblas_wrap/generate_f2pymod.py new file mode 100644 index 0000000000..5a8ba13895 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/generate_f2pymod.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +""" +Process f2py template files (`filename.pyf.src` -> `filename.pyf`) + +Usage: python generate_pyf.py filename.pyf.src -o filename.pyf +""" + +import os +import sys +import re +import subprocess +import argparse + + +# START OF CODE VENDORED FROM `numpy.distutils.from_template` 
+############################################################# +""" +process_file(filename) + + takes templated file .xxx.src and produces .xxx file where .xxx + is .pyf .f90 or .f using the following template rules: + + '<..>' denotes a template. + + All function and subroutine blocks in a source file with names that + contain '<..>' will be replicated according to the rules in '<..>'. + + The number of comma-separated words in '<..>' will determine the number of + replicates. + + '<..>' may have two different forms, named and short. For example, + + named: + where anywhere inside a block '

' will be replaced with + 'd', 's', 'z', and 'c' for each replicate of the block. + + <_c> is already defined: <_c=s,d,c,z> + <_t> is already defined: <_t=real,double precision,complex,double complex> + + short: + , a short form of the named, useful when no

appears inside + a block. + + In general, '<..>' contains a comma separated list of arbitrary + expressions. If these expression must contain a comma|leftarrow|rightarrow, + then prepend the comma|leftarrow|rightarrow with a backslash. + + If an expression matches '\\' then it will be replaced + by -th expression. + + Note that all '<..>' forms in a block must have the same number of + comma-separated entries. + + Predefined named template rules: + + + + + +""" + +routine_start_re = re.compile( + r'(\n|\A)(( (\$|\*))|)\s*(subroutine|function)\b', + re.I +) +routine_end_re = re.compile(r'\n\s*end\s*(subroutine|function)\b.*(\n|\Z)', re.I) +function_start_re = re.compile(r'\n (\$|\*)\s*function\b', re.I) + +def parse_structure(astr): + """ Return a list of tuples for each function or subroutine each + tuple is the start and end of a subroutine or function to be + expanded. + """ + + spanlist = [] + ind = 0 + while True: + m = routine_start_re.search(astr, ind) + if m is None: + break + start = m.start() + if function_start_re.match(astr, start, m.end()): + while True: + i = astr.rfind('\n', ind, start) + if i==-1: + break + start = i + if astr[i:i+7]!='\n $': + break + start += 1 + m = routine_end_re.search(astr, m.end()) + ind = end = m and m.end()-1 or len(astr) + spanlist.append((start, end)) + return spanlist + +template_re = re.compile(r"<\s*(\w[\w\d]*)\s*>") +named_re = re.compile(r"<\s*(\w[\w\d]*)\s*=\s*(.*?)\s*>") +list_re = re.compile(r"<\s*((.*?))\s*>") + +def find_repl_patterns(astr): + reps = named_re.findall(astr) + names = {} + for rep in reps: + name = rep[0].strip() or unique_key(names) + repl = rep[1].replace(r'\,', '@comma@') + thelist = conv(repl) + names[name] = thelist + return names + +def find_and_remove_repl_patterns(astr): + names = find_repl_patterns(astr) + astr = re.subn(named_re, '', astr)[0] + return astr, names + +item_re = re.compile(r"\A\\(?P\d+)\Z") +def conv(astr): + b = astr.split(',') + l = [x.strip() for x in b] + for i in 
range(len(l)): + m = item_re.match(l[i]) + if m: + j = int(m.group('index')) + l[i] = l[j] + return ','.join(l) + +def unique_key(adict): + """ Obtain a unique key given a dictionary.""" + allkeys = list(adict.keys()) + done = False + n = 1 + while not done: + newkey = '__l%s' % (n) + if newkey in allkeys: + n += 1 + else: + done = True + return newkey + + +template_name_re = re.compile(r'\A\s*(\w[\w\d]*)\s*\Z') +def expand_sub(substr, names): + substr = substr.replace(r'\>', '@rightarrow@') + substr = substr.replace(r'\<', '@leftarrow@') + lnames = find_repl_patterns(substr) + substr = named_re.sub(r"<\1>", substr) # get rid of definition templates + + def listrepl(mobj): + thelist = conv(mobj.group(1).replace(r'\,', '@comma@')) + if template_name_re.match(thelist): + return "<%s>" % (thelist) + name = None + for key in lnames.keys(): # see if list is already in dictionary + if lnames[key] == thelist: + name = key + if name is None: # this list is not in the dictionary yet + name = unique_key(lnames) + lnames[name] = thelist + return "<%s>" % name + + substr = list_re.sub(listrepl, substr) # convert all lists to named templates + # newnames are constructed as needed + + numsubs = None + base_rule = None + rules = {} + for r in template_re.findall(substr): + if r not in rules: + thelist = lnames.get(r, names.get(r, None)) + if thelist is None: + raise ValueError('No replicates found for <%s>' % (r)) + if r not in names and not thelist.startswith('_'): + names[r] = thelist + rule = [i.replace('@comma@', ',') for i in thelist.split(',')] + num = len(rule) + + if numsubs is None: + numsubs = num + rules[r] = rule + base_rule = r + elif num == numsubs: + rules[r] = rule + else: + print("Mismatch in number of replacements (base <{}={}>) " + "for <{}={}>. Ignoring." 
+ .format(base_rule, ','.join(rules[base_rule]), r, thelist)) + if not rules: + return substr + + def namerepl(mobj): + name = mobj.group(1) + return rules.get(name, (k+1)*[name])[k] + + newstr = '' + for k in range(numsubs): + newstr += template_re.sub(namerepl, substr) + '\n\n' + + newstr = newstr.replace('@rightarrow@', '>') + newstr = newstr.replace('@leftarrow@', '<') + return newstr + +def process_str(allstr): + newstr = allstr + writestr = '' + + struct = parse_structure(newstr) + + oldend = 0 + names = {} + names.update(_special_names) + for sub in struct: + cleanedstr, defs = find_and_remove_repl_patterns(newstr[oldend:sub[0]]) + writestr += cleanedstr + names.update(defs) + writestr += expand_sub(newstr[sub[0]:sub[1]], names) + oldend = sub[1] + writestr += newstr[oldend:] + + return writestr + +include_src_re = re.compile( + r"(\n|\A)\s*include\s*['\"](?P[\w\d./\\]+\.src)['\"]", + re.I +) + +def resolve_includes(source): + d = os.path.dirname(source) + with open(source) as fid: + lines = [] + for line in fid: + m = include_src_re.match(line) + if m: + fn = m.group('name') + if not os.path.isabs(fn): + fn = os.path.join(d, fn) + if os.path.isfile(fn): + lines.extend(resolve_includes(fn)) + else: + lines.append(line) + else: + lines.append(line) + return lines + +def process_file(source): + lines = resolve_includes(source) + return process_str(''.join(lines)) + +_special_names = find_repl_patterns(''' +<_c=s,d,c,z> +<_t=real,double precision,complex,double complex> + + + + + +''') + +# END OF CODE VENDORED FROM `numpy.distutils.from_template` +########################################################### + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("infile", type=str, + help="Path to the input file") + parser.add_argument("-o", "--outdir", type=str, + help="Path to the output directory") + args = parser.parse_args() + + if not args.infile.endswith(('.pyf', '.pyf.src', '.f.src')): + raise ValueError(f"Input file has unknown 
extension: {args.infile}") + + outdir_abs = os.path.join(os.getcwd(), args.outdir) + + # Write out the .pyf/.f file + if args.infile.endswith(('.pyf.src', '.f.src')): + code = process_file(args.infile) + fname_pyf = os.path.join(args.outdir, + os.path.splitext(os.path.split(args.infile)[1])[0]) + + with open(fname_pyf, 'w') as f: + f.write(code) + else: + fname_pyf = args.infile + + # Now invoke f2py to generate the C API module file + if args.infile.endswith(('.pyf.src', '.pyf')): + p = subprocess.Popen([sys.executable, '-m', 'numpy.f2py', fname_pyf, + '--build-dir', outdir_abs], #'--quiet'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.getcwd()) + out, err = p.communicate() + if not (p.returncode == 0): + raise RuntimeError(f"Writing {args.outfile} with f2py failed!\n" + f"{out}\n" + r"{err}") + + +if __name__ == "__main__": + main() diff --git a/benchmark/pybench/openblas_wrap/meson.build b/benchmark/pybench/openblas_wrap/meson.build new file mode 100644 index 0000000000..9f1b717876 --- /dev/null +++ b/benchmark/pybench/openblas_wrap/meson.build @@ -0,0 +1,50 @@ +# find numpy & f2py includes +inc_numpy = run_command(py3, + ['-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())'], + check : true +).stdout().strip() + +inc_f2py = run_command(py3, + ['-c', 'import os; os.chdir(".."); import numpy.f2py; print(numpy.f2py.get_include())'], + check : true +).stdout().strip() + + +inc_np = include_directories(inc_numpy, inc_f2py) +fortranobject_c = inc_f2py / 'fortranobject.c' + + +fortranobject_lib = static_library('_fortranobject', + fortranobject_c, +# c_args: numpy_nodepr_api, + dependencies: py3_dep, + include_directories: [inc_np, inc_f2py], + gnu_symbol_visibility: 'hidden', +) +fortranobject_dep = declare_dependency( + link_with: fortranobject_lib, + include_directories: [inc_np, inc_f2py], +) + + +# f2py generated wrappers + +flapack_module = custom_target('flapack_module', + output: ['_flapackmodule.c'], + input: 
'blas_lapack.pyf.src', + command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'], +) + +py3.extension_module('_flapack', + flapack_module, + link_args: [], # version_link_args, + dependencies: [openblas_dep, fortranobject_dep], + install: true, + subdir: 'openblas_wrap' +) + + +py3.install_sources( + ['__init__.py'], + subdir: 'openblas_wrap' +) diff --git a/benchmark/pybench/scipy_openblas.pc b/benchmark/pybench/scipy_openblas.pc new file mode 100644 index 0000000000..2348fac626 --- /dev/null +++ b/benchmark/pybench/scipy_openblas.pc @@ -0,0 +1,12 @@ +libdir=/home/br/repos/OpenBLAS/ +includedir=/home/br/repos/OpenBLAS/ +openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64 +version=0.3.27 +extralib=-lm -lpthread -lgfortran -lquadmath -L${libdir} -lopenblas +Name: openblas +Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version +Version: ${version} +URL: https://github.com/xianyi/OpenBLAS +Libs: -L${libdir} -lopenblas +Libs.private: ${extralib} +Cflags: -I${includedir} diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 0a498510aa..a162986f85 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -5,10 +5,10 @@ include(CheckCCompilerFlag) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang") - set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") set(COMMON_PROF "${COMMON_PROF} -fno-inline") set(NO_UNINITIALIZED_WARN "-Wno-uninitialized") + set(GCC_VERSION ${CMAKE_C_COMPILER_VERSION}) if (QUIET_MAKE) set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused") @@ -140,7 +140,6 @@ endif () if (${CORE} STREQUAL COOPERLAKE) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake") else () @@ -153,7 +152,6 @@ endif () if (${CORE} STREQUAL 
SAPPHIRERAPIDS) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids") else () @@ -167,7 +165,6 @@ if (${CORE} STREQUAL ZEN) if (HAVE_AVX512VL) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 13.0 OR ${GCC_VERSION} VERSION_EQUAL 13.0) set (CCOMMON_OPT "${CCOMMON_OPT} -march=znver4") else () @@ -180,7 +177,6 @@ endif () if (${CORE} STREQUAL A64FX) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") else () @@ -194,7 +190,6 @@ if (${CORE} STREQUAL NEOVERSEN2) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () @@ -209,7 +204,6 @@ if (${CORE} STREQUAL NEOVERSEV1) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") else () - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") else () @@ -221,7 +215,6 @@ endif () if (${CORE} STREQUAL NEOVERSEN1) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND 
${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") else () @@ -266,23 +259,21 @@ endif () if (${CORE} STREQUAL POWER10) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") else () - message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." ) + message(FATAL_ERROR "Compiler GCC ${GCC_VERSION} does not support Power10." ) endif() endif () endif () if (${CORE} STREQUAL POWER9) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") else () set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") - message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.") + message(WARNING "Compiler GCC ${GCC_VERSION} does not fully support Power9.") endif () endif () endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 176d3d4388..69246385c4 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -67,6 +67,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F else() set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") endif () + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () else () CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) if(COMPILER_SUPPORT_ILP32D_ABI) diff --git a/cmake/os.cmake b/cmake/os.cmake index e24059dd5b..2effbe0e5b 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -38,7 +38,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") # Test for 
supporting MS_ABI # removed string parsing in favor of CMake's version comparison -hpa - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + set(GCC_VERSION ${CMAKE_C_COMPILER_VERSION}) if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) # GCC Version >=4.7 # It is compatible with MSVC ABI. diff --git a/cmake/system.cmake b/cmake/system.cmake index 6fec04bfcf..479f50538f 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -170,7 +170,6 @@ include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") if (DEFINED TARGET) if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() @@ -186,7 +185,6 @@ if (DEFINED TARGET) endif() if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") else() @@ -209,7 +207,6 @@ if (DEFINED TARGET) if (((${TARGET} STREQUAL ZEN) AND HAVE_AVX512VL) AND NOT NO_AVX512) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.99) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=znver4") else() @@ -227,8 +224,7 @@ if (DEFINED TARGET) if ((${TARGET} STREQUAL HASWELL OR (${TARGET} STREQUAL ZEN AND NOT HAVE_AVX512VL)) AND NOT NO_AVX2) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + if 
(CMAKE_C_COMPILER_VERSION VERSION_GREATER 4.7 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 4.7) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") @@ -267,20 +263,18 @@ if (DEFINED TARGET) endif() if (${TARGET} STREQUAL POWER10) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") else () - message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.") + message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.") endif() endif() if (${TARGET} STREQUAL POWER9) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 5.0 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 5.0) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") else () set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") - message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.") + message(WARNING "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support fully Power9.") endif() endif() if (${TARGET} STREQUAL POWER8) @@ -291,11 +285,10 @@ if (${TARGET} STREQUAL NEOVERSEV1) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") else () - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + if (CMAKE_C_COMPILER_VERSION 
VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") else () - message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.") + message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse V1.") endif() endif() endif() @@ -303,11 +296,10 @@ if (${TARGET} STREQUAL NEOVERSEV1) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.") + message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse N2.") endif() endif() endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 5e8ba866be..9befc9a3c4 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -187,8 +187,8 @@ macro(ParseMakefileVars MAKEFILE_IN) set (HasValidGroup 1) set (STR ${CMAKE_MATCH_4}) endif () - if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1) - if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR})) + if (DEFINED CMAKE_MATCH_1 AND ${HasValidGroup} EQUAL 1) + if (NOT (CMAKE_MATCH_1 STREQUAL ${STR})) #message (STATUS "condition is true") set (IfElse 1) continue () diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index 0ad32ae4e0..3b7a9c82ea 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
SUCH DAMAGE. #include #include +#include /* If LASX extension instructions supported, * using core LOONGSON3R5 diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index c7ccf84260..9fec7afca2 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -826,6 +826,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF if (nthreads_m * nthreads_n > args -> nthreads) { nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); } + /* The nthreads_m and nthreads_n are adjusted so that the submatrix */ + /* to be handled by each thread preferably becomes a square matrix */ + /* by minimizing an objective function 'n * nthreads_m + m * nthreads_n'. */ + /* Objective function come from sum of partitions in m and n. */ + /* (n / nthreads_n) + (m / nthreads_m) */ + /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ + while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) { + nthreads_m /= 2; + nthreads_n *= 2; + } } /* Execute serial or parallel computation */ diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index e3ebc7a7e8..80c28ebda9 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -1074,13 +1074,10 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l main_status[cpu] = MAIN_RUNNING1; #endif -//For Loongson servers, like the 3C5000 (featuring 16 cores), applying an -//offset to the buffer is essential for minimizing cache conflicts and optimizing performance. 
-#if defined(LOONGSON3R5) && !defined(NO_AFFINITY) - char model_name[128]; - get_cpu_model(model_name); - if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL)) - if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); +//For target LOONGSON3R5, applying an offset to the buffer is essential +//for minimizing cache conflicts and optimizing performance. +#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) + if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); #endif if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); @@ -1157,4 +1154,4 @@ if (queue -> mode & BLAS_PTHREAD) { } -#endif \ No newline at end of file +#endif diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 867d0e3614..ff52cfba8f 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -13,9 +13,9 @@ modification, are permitted provided that the following conditions are notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -63,6 +63,9 @@ static char* openblas_config_str="" #ifdef USE_TLS "USE_TLS " #endif +#ifdef USE_LOCKING + "USE_LOCKING " +#endif #ifndef DYNAMIC_ARCH CHAR_CORENAME #endif @@ -83,7 +86,7 @@ char tmpstr[20]; #endif if (openblas_get_parallel() == 0) sprintf(tmpstr, " SINGLE_THREADED"); - else + else snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER); strcat(tmp_config_str, tmpstr); return tmp_config_str; @@ -91,7 +94,7 @@ char tmpstr[20]; char* openblas_get_corename(void) { -#ifndef DYNAMIC_ARCH +#ifndef DYNAMIC_ARCH return CHAR_CORENAME; #else return gotoblas_corename(); diff --git a/exports/Makefile b/exports/Makefile index 457e59b2c8..668a4866e8 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -315,11 +315,6 @@ test : linktest.c linktest.c : $(GENSYM) ../Makefile.system ../getarch.c ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c -ifeq ($(F_COMPILER), IBM) - mv linktest.c linktest.c.FIRST - egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c - rm linktest.c.FIRST -endif clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/interface/gbmv.c b/interface/gbmv.c index 1d58ba807f..7a65813685 100644 --- a/interface/gbmv.c +++ b/interface/gbmv.c @@ -227,7 +227,10 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + if (m * n < 250000 || kl+ku < 15 ) + nthreads = 1; + else + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif diff --git a/interface/gemm.c b/interface/gemm.c index c402836ca3..4537b6a78f 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -521,15 +521,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS buffer = 
(XFLOAT *)blas_memory_alloc(0); -//For Loongson servers, like the 3C5000 (featuring 16 cores), applying an -//offset to the buffer is essential for minimizing cache conflicts and optimizing performance. -#if defined(LOONGSON3R5) && !defined(NO_AFFINITY) - char model_name[128]; - get_cpu_model(model_name); - if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL)) - sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); - else - sa = (XFLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); +//For target LOONGSON3R5, applying an offset to the buffer is essential +//for minimizing cache conflicts and optimizing performance. +#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) + sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); #else sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); #endif diff --git a/interface/zgbmv.c b/interface/zgbmv.c index 5e275a8edc..5128b22e15 100644 --- a/interface/zgbmv.c +++ b/interface/zgbmv.c @@ -251,7 +251,10 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + if (m * n < 125000 || ku + kl < 15) + nthreads = 1; + else + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif diff --git a/kernel/generic/laswp_ncopy_6.c b/kernel/generic/laswp_ncopy_6.c new file mode 100644 index 0000000000..85a17a092f --- /dev/null +++ b/kernel/generic/laswp_ncopy_6.c @@ -0,0 +1,276 @@ + +/*********************************************************************/ +/* Copyright 2009, 2010, 2024 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#define PREFETCHSIZE 4 + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip; + blasint *piv; + FLOAT *dx1, *dy1; + FLOAT *dx2, *dy2; + FLOAT *dx3, *dy3; + FLOAT *dx4, *dy4; + FLOAT *dx5, *dy5; + FLOAT *dx6, *dy6; + FLOAT atemp1, btemp1; + FLOAT atemp2, btemp2; + FLOAT atemp3, btemp3; + FLOAT atemp4, btemp4; + FLOAT atemp5, btemp5; + FLOAT atemp6, btemp6; + + a--; + ipiv += k1 - 1; + + if (n <= 0) return 0; + if (k1 > k2) return 0; + + j = (n / 6); + if (j > 0) { + do { + piv = ipiv; + i = k1; + + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + i + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + i + lda * 3; + dy4 = a + ip + lda * 3; + dx5 = a + i + lda * 4; + dy5 = a + ip + lda * 4; + dx6 = a + i + lda * 5; + dy6 = a + ip + lda * 5; + +#ifdef __GNUC__ + __builtin_prefetch(dx1 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx2 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx3 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx4 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx5 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx6 + PREFETCHSIZE, 0, 1); +#endif + + atemp1 = *dx1; + btemp1 = *dy1; + atemp2 = *dx2; + btemp2 = *dy2; + atemp3 = *dx3; + btemp3 = *dy3; + atemp4 = *dx4; + btemp4 = *dy4; + + atemp5 = *dx5; + btemp5 = *dy5; + atemp6 = *dx6; + btemp6 = *dy6; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *dy3 = atemp3; + *dy4 = atemp4; + *dy5 = atemp5; + *dy6 = atemp6; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + *(buffer + 2) = btemp3; + *(buffer + 3) = btemp4; + *(buffer + 4) = btemp5; + *(buffer + 5) = btemp6; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + *(buffer + 2) = atemp3; + *(buffer + 3) = atemp4; + *(buffer + 4) = atemp5; + *(buffer + 5) = atemp6; + } + + buffer += 6; + + 
i++; + } while (i <= k2); + + a += 6 * lda; + j --; + } while (j > 0); + } + + if ((n % 6) & 4) { + piv = ipiv; + + ip = *piv; + piv ++; + + dx1 = a + k1; + dy1 = a + ip; + dx2 = a + k1 + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + k1 + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + k1 + lda * 3; + dy4 = a + ip + lda * 3; + + i = k1; + + do { + atemp1 = *dx1; + atemp2 = *dx2; + atemp3 = *dx3; + atemp4 = *dx4; + + btemp1 = *dy1; + btemp2 = *dy2; + btemp3 = *dy3; + btemp4 = *dy4; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *dy3 = atemp3; + *dy4 = atemp4; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + *(buffer + 2) = btemp3; + *(buffer + 3) = btemp4; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + *(buffer + 2) = atemp3; + *(buffer + 3) = atemp4; + } + + ip = *piv; + piv ++; + + i++; + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + i + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + i + lda * 3; + dy4 = a + ip + lda * 3; + + buffer += 4; + + } while (i <= k2); + + a += 4 * lda; + } + + if ((n % 6) & 2) { + piv = ipiv; + + i = k1; + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda; + dy2 = a + ip + lda; + + atemp1 = *dx1; + btemp1 = *dy1; + atemp2 = *dx2; + btemp2 = *dy2; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + } + + buffer += 2; + + i++; + } while (i <= k2); + + a += 2 * lda; + } + + + if ((n % 6) & 1) { + piv = ipiv; + + i = k1; + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + atemp1 = *dx1; + btemp1 = *dy1; + + if (ip != i) { + *dy1 = atemp1; + *buffer = btemp1; + } else { + *buffer = atemp1; + } + + buffer ++; + + i++; + } while (i <= k2); + + // a += lda; + } + + return 0; +} \ No newline at end of file diff --git a/kernel/generic/symm_lcopy_6.c b/kernel/generic/symm_lcopy_6.c index ca730e1eef..3a3e2d5b2f 
100644 --- a/kernel/generic/symm_lcopy_6.c +++ b/kernel/generic/symm_lcopy_6.c @@ -41,98 +41,141 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js, offset; - - FLOAT data01, data02, data03, data04; - FLOAT *ao1, *ao2, *ao3, *ao4; + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; + if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + if (offset > -4) ao5 += lda; else ao5 ++; + if (offset > -5) ao6 += lda; else ao6 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + + b += 6; + + offset --; + i --; + } + + posX += 6; + js --; + } - js = (n >> 2); - while (js > 0){ + if ((n % 6) & 4) { + offset = posX - posY; - offset = posX - posY; + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else 
ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; - if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; - if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; - if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; - if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + i = m; - i = m; + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); - while (i > 0) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); - data03 = *(ao3 + 0); - data04 = *(ao4 + 0); + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; - if (offset > 0) ao1 += lda; else ao1 ++; - if (offset > -1) ao2 += lda; else ao2 ++; - if (offset > -2) ao3 += lda; else ao3 ++; - if (offset > -3) ao4 += lda; else ao4 ++; + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; + b += 4; - b += 4; + offset --; + i --; + } - offset --; - i --; + posX += 4; } - posX += 4; - js --; - } + if ((n % 6) & 2) { - if (n & 2) { + offset = posX - posY; - offset = posX - posY; + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; - if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; - if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + i = m; - i = m; + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); - while (i > 0) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += 
lda; else ao2 ++; - if (offset > 0) ao1 += lda; else ao1 ++; - if (offset > -1) ao2 += lda; else ao2 ++; + b[ 0] = data01; + b[ 1] = data02; - b[ 0] = data01; - b[ 1] = data02; + b += 2; - b += 2; + offset --; + i --; + } - offset --; - i --; + posX += 2; } - posX += 2; - } - - if (n & 1) { + if ((n % 6) & 1) { - offset = posX - posY; + offset = posX - posY; - if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; - i = m; + i = m; - while (i > 0) { - data01 = *(ao1 + 0); + while (i > 0) { + data01 = *(ao1 + 0); - if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > 0) ao1 += lda; else ao1 ++; - b[ 0] = data01; + b[ 0] = data01; - b ++; + b ++; - offset --; - i --; + offset --; + i --; + } } - } - return 0; + return 0; } diff --git a/kernel/generic/symm_ucopy_6.c b/kernel/generic/symm_ucopy_6.c index 6dbb861e98..a83d937d0b 100644 --- a/kernel/generic/symm_ucopy_6.c +++ b/kernel/generic/symm_ucopy_6.c @@ -41,96 +41,140 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js, offset; + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; + if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = 
*(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + if (offset > -4) ao5 ++; else ao5 += lda; + if (offset > -5) ao6 ++; else ao6 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + + b += 6; + + offset --; + i --; + } + + posX += 6; + js --; + } - FLOAT data01, data02, data03, data04; - FLOAT *ao1, *ao2, *ao3, *ao4; + if ((n % 6) & 4) { - js = (n >> 2); - while (js > 0){ + offset = posX - posY; - offset = posX - posY; + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; - if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; - if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; - if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; - if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + i = m; - i = m; + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); - while (i > 0) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); - data03 = *(ao3 + 0); - data04 = *(ao4 + 0); + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; - if (offset > 0) ao1 ++; else ao1 += lda; - if (offset > -1) ao2 ++; else ao2 += lda; - if (offset > -2) ao3 ++; else ao3 += 
lda; - if (offset > -3) ao4 ++; else ao4 += lda; + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; + b += 4; - b += 4; + offset --; + i --; + } - offset --; - i --; + posX += 4; } - posX += 4; - js --; - } + if ((n % 6) & 2) { + offset = posX - posY; - if (n & 2) { - offset = posX - posY; + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; - if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; - if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + i = m; - i = m; + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); - while (i > 0) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; - if (offset > 0) ao1 ++; else ao1 += lda; - if (offset > -1) ao2 ++; else ao2 += lda; + b[ 0] = data01; + b[ 1] = data02; - b[ 0] = data01; - b[ 1] = data02; + b += 2; - b += 2; + offset --; + i --; + } - offset --; - i --; + posX += 2; } - posX += 2; - } - - if (n & 1) { - offset = posX - posY; + if ((n % 6) & 1) { + offset = posX - posY; - if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; - i = m; + i = m; - while (i > 0) { - data01 = *(ao1 + 0); + while (i > 0) { + data01 = *(ao1 + 0); - if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > 0) ao1 ++; else ao1 += lda; - b[ 0] = data01; + b[ 0] = data01; - b ++; + b ++; - offset --; - i --; + offset --; + i --; + } } - } - return 0; + return 0; } diff --git a/kernel/generic/trmm_lncopy_6.c b/kernel/generic/trmm_lncopy_6.c index 0dcfb965ac..999f0d367d 100644 --- a/kernel/generic/trmm_lncopy_6.c +++ 
b/kernel/generic/trmm_lncopy_6.c @@ -41,444 +41,510 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js; - BLASLONG X; - - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *ao1, *ao2, *ao3, *ao4; - - js = (n >> 2); - - if (js > 0){ - do { - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - ao3 = a + posY + (posX + 2) * lda; - ao4 = a + posY + (posX + 3) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - ao3 = a + posX + (posY + 2) * lda; - ao4 = a + posX + (posY + 3) * lda; - } - - i = (m >> 2); - if (i > 0) { - do { - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b[ 4] = data02; - b[ 5] = data06; - b[ 6] = data10; - b[ 7] = data14; - - b[ 8] = data03; - b[ 9] = data07; - b[10] = data11; - b[11] = data15; - b[12] = data04; - b[13] = data08; - b[14] = data12; - b[15] = data16; - - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - - } else - if (X < posY) { - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - b += 16; - - } else { + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT data01, data02, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX 
+ 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } + + i = (m / 6); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 6; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + ao5 ++; + ao6 ++; + b += 6; + } + + } else if (X < posY) { + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + b += 36; + + } else { #ifdef UNIT - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data12 = *(ao3 + 3); - - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b[ 4] = data02; - b[ 5] = ONE; - b[ 6] = ZERO; - b[ 7] = ZERO; - - b[ 8] = data03; - b[ 9] = data07; - b[10] = ONE; - b[11] = ZERO; - b[12] = data04; - b[13] = data08; - b[14] = data12; - b[15] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b[ 4] = data02; - b[ 5] = data06; - b[ 6] = ZERO; - b[ 7] = ZERO; - - b[ 8] = data03; - b[ 9] = data07; - b[10] = data11; - b[11] = ZERO; - b[12] = data04; - b[13] = data08; - b[14] = data12; - b[15] = data16; + b[ 0] = *(ao1 + 0); #endif - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - } - - X += 4; - i --; - } while (i > 0); - } - - i = (m & 3); - if (i) { - - if (X > posY) { - - if (m & 2) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = 
*(ao2 + 0); - data04 = *(ao2 + 1); - data05 = *(ao3 + 0); - data06 = *(ao3 + 1); - data07 = *(ao4 + 0); - data08 = *(ao4 + 1); - - b[ 0] = data01; - b[ 1] = data03; - b[ 2] = data05; - b[ 3] = data07; - b[ 4] = data02; - b[ 5] = data04; - b[ 6] = data06; - b[ 7] = data08; - - ao1 += 2; - ao2 += 2; - ao3 += 2; - ao4 += 2; - b += 8; - } - - if (m & 1) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); - data03 = *(ao3 + 0); - data04 = *(ao4 + 0); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - - ao1 += 1; - ao2 += 1; - ao3 += 1; - ao4 += 1; - b += 4; - } - - } else - if (X < posY) { - if (m & 2) { - ao1 += 2 * lda; - ao2 += 2 * lda; - - b += 8; - } - - if (m & 1) { - ao1 += lda; - b += 4; - } - - } else { + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + + b[ 6] = *(ao1 + 1); +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(ao2 + 1); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + + b[12] = *(ao1 + 2); + b[13] = *(ao2 + 2); +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(ao3 + 2); +#endif + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + + b[18] = *(ao1 + 3); + b[19] = *(ao2 + 3); + b[20] = *(ao3 + 3); +#ifdef UNIT + b[21] = ONE; +#else + b[21] = *(ao4 + 3); +#endif + b[22] = ZERO; + b[23] = ZERO; + + b[24] = *(ao1 + 4); + b[25] = *(ao2 + 4); + b[26] = *(ao3 + 4); + b[27] = *(ao4 + 4); +#ifdef UNIT + b[28] = ONE; +#else + b[28] = *(ao5 + 4); +#endif + b[29] = ZERO; + + b[30] = *(ao1 + 5); + b[31] = *(ao2 + 5); + b[32] = *(ao3 + 5); + b[33] = *(ao4 + 5); + b[34] = *(ao5 + 5); +#ifdef UNIT + b[35] = ONE; +#else + b[35] = *(ao6 + 5); +#endif + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + b += 36; + } + + X += 6; + i --; + } while (i > 0); + } + + i = (m % 6); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + ao1 ++; + 
ao2 ++; + ao3 ++; + ao4 ++; + ao5 ++; + ao6 ++; + b += 6; + } + + } else if (X < posY) { + + b += 6 * i; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + + if (i >= 2) { + b[ 0] = *(ao1 + 1); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(ao2 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 3) { + b[ 0] = *(ao1 + 2); + b[ 1] = *(ao2 + 2); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(ao3 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 4) { + b[ 0] = *(ao1 + 3); + b[ 1] = *(ao2 + 3); + b[ 2] = *(ao3 + 3); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(ao4 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 5) { + b[ 0] = *(ao1 + 4); + b[ 1] = *(ao2 + 4); + b[ 2] = *(ao3 + 4); + b[ 3] = *(ao4 + 4); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(ao5 + 4); +#endif + b[ 5] = ZERO; + b += 6; + } + } + } + + posY += 6; + js --; + } while (js > 0); + } /* End of main loop */ + + if ((n % 6) & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 2; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + b += 4; + } + } else if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + ao3 += 2 * lda; + ao4 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(ao1 + 1); +#ifdef UNIT + b[ 5] 
= ONE; +#else + b[ 5] = *(ao2 + 1); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(ao1 + 2); + b[ 9] = *(ao2 + 2); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(ao3 + 2); +#endif + b[ 11] = ZERO; + + b[ 12] = *(ao1 + 3); + b[ 13] = *(ao2 + 3); + b[ 14] = *(ao3 + 3); +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(ao4 + 3); +#endif + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + X += 4; + i -= 2; + continue; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + b += 4; + } + } else if (X < posY) { + /* ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; */ + b += 4 * i; + } else { #ifdef UNIT - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data13 = *(ao4 + 0); - - if (i >= 2) { - data10 = *(ao3 + 1); - data14 = *(ao4 + 1); - } - - if (i >= 3) { - data15 = *(ao4 + 2); - } - - b[ 0] = ONE; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = ONE; - b[ 2] = data10; - b[ 3] = data14; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ONE; - b[ 3] = data15; - b += 4; - } + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data13 = *(ao4 + 0); - - if (i >= 2) { - data06 = *(ao2 + 1); - data10 = *(ao3 + 1); - data14 = *(ao4 + 1); - } - - if (i >= 3) { - data11 = *(ao3 + 2); - data15 = *(ao4 + 2); - } - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = data06; - b[ 2] = data10; - b[ 3] = data14; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = data11; - b[ 3] = data15; - b += 4; - } + b[ 0] = *(ao1 + 0); #endif - } - } - - posY += 4; - js --; - } while (js > 0); - } /* End of main loop */ - - - if (n & 2){ - X = 
posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - } - - i = (m >> 1); - if (i > 0) { - do { - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data02; - b[ 3] = data06; - - ao1 += 2; - ao2 += 2; - b += 4; - - } else - if (X < posY) { - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - } else { + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + + posY += 4; + } + + + if ((n % 6) & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { #ifdef UNIT - data02 = *(ao1 + 1); + data02 = *(ao1 + 1); - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = data02; - b[ 3] = ONE; + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = data02; - b[ 3] = data06; + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data06; #endif - ao1 += 2; - ao2 += 2; - - b += 4; - } - - X += 2; - i --; - } while (i > 0); - } - - i = (m & 1); - if (i) { - - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao2 + 0); - b[ 0] = data01; - b[ 1] = data02; - - ao1 += 1; - ao2 += 1; - b += 2; - } else - if (X < posY) { - ao1 += lda; - b += 2; - } else { + ao1 += 2; + 
ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data02; + + ao1 += 1; + ao2 += 1; + b += 2; + } else if (X < posY) { + ao1 += lda; + b += 2; + } else { #ifdef UNIT - data05 = *(ao2 + 0); + data05 = *(ao2 + 0); - b[ 0] = ONE; - b[ 1] = data05; + b[ 0] = ONE; + b[ 1] = data05; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); - b[ 0] = data01; - b[ 1] = data05; + b[ 0] = data01; + b[ 1] = data05; #endif - b += 2; - } - } - posY += 2; - } - - if (n & 1){ - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - } - - i = m; - if (i > 0) { - do { - if (X > posY) { - data01 = *(ao1 + 0); - b[ 0] = data01; - b += 1; - ao1 += 1; - } else - if (X < posY) { - b += 1; - ao1 += lda; - } else { + b += 2; + } + } + posY += 2; + } + + if ((n % 6) & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += 1; + } else if (X < posY) { + b += 1; + ao1 += lda; + } else { #ifdef UNIT - b[ 0] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - b[ 0] = data01; + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - b += 1; - ao1 += 1; - } + b += 1; + ao1 += 1; + } - X ++; - i --; - } while (i > 0); - } + X ++; + i --; + } while (i > 0); + } - posY += 1; - } + posY += 1; + } - return 0; + return 0; } diff --git a/kernel/generic/trmm_ltcopy_6.c b/kernel/generic/trmm_ltcopy_6.c index 66a7325bb1..7c22450369 100644 --- a/kernel/generic/trmm_ltcopy_6.c +++ b/kernel/generic/trmm_ltcopy_6.c @@ -41,448 +41,511 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js; - BLASLONG X; - - FLOAT data01, data02, 
data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *ao1, *ao2, *ao3, *ao4; - - js = (n >> 2); - - if (js > 0){ - do { - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - ao3 = a + posY + (posX + 2) * lda; - ao4 = a + posY + (posX + 3) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - ao3 = a + posX + (posY + 2) * lda; - ao4 = a + posX + (posY + 3) * lda; - } - - i = (m >> 2); - if (i > 0) { - do { - if (X > posY) { - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - - } else - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - b[ 8] = data09; - b[ 9] = data10; - b[10] = data11; - b[11] = data12; - b[12] = data13; - b[13] = data14; - b[14] = data15; - b[15] = data16; - - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - b += 16; - - } else { + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT data01, data02, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + 
ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } + + i = (m / 6); + if (i > 0) { + do { + if (X > posY) { + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + b += 36; + + } else if (X < posY) { + for (ii = 0; ii < 6; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + ao1 += lda; + b += 6; + } + + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + b[ 6] = ZERO; +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(ao2 + 1); +#endif + b[ 8] = *(ao2 + 2); + b[ 9] = *(ao2 + 3); + b[10] = *(ao2 + 4); + b[11] = *(ao2 + 5); + + b[12] = ZERO; + b[13] = ZERO; +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(ao3 + 2); +#endif + b[15] = *(ao3 + 3); + b[16] = *(ao3 + 4); + b[17] = *(ao3 + 5); + + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; +#ifdef UNIT + b[21] = ONE; +#else + b[21] = *(ao4 + 3); +#endif + b[22] = *(ao4 + 4); + b[23] = *(ao4 + 5); + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; +#ifdef UNIT + b[28] = ONE; +#else + b[28] = *(ao5 + 4); +#endif + b[29] = *(ao5 + 5); + + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; +#ifdef UNIT + b[35] = ONE; +#else + b[35] = *(ao6 + 5); +#endif + + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + b += 36; + } + + X += 6; + i --; + } while (i > 0); + } + + i = (m % 6); + if (i) { + + if (X > posY) { + + b += 6 * i; + + } else if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + ao1 += lda; + ao2 += lda; + ao3 += lda; + ao4 += lda; + ao5 
+= lda; + ao6 += lda; + b += 6; + } + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + b += 6; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(ao2 + 1); +#endif + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); + b[ 4] = *(ao2 + 4); + b[ 5] = *(ao2 + 5); + b += 6; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(ao3 + 2); +#endif + b[ 3] = *(ao3 + 3); + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); + b += 6; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(ao4 + 3); +#endif + b[ 4] = *(ao4 + 4); + b[ 5] = *(ao4 + 5); + b += 6; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(ao5 + 4); +#endif + b[ 5] = *(ao5 + 5); + b += 6; + } + } + } + + posY += 6; + js --; + } while (js > 0); + } /* End of main loop */ + + if ((n % 6) & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } else if (X < posY) { + + for (ii = 0; ii < 2; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + ao1 += lda; + b += 4; + } + + ao2 += 2 * lda; + ao3 += 2 * lda; + ao4 += 2 * lda; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] 
= *(ao2 + 1); +#endif + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); + b[ 8] = ZERO; + b[ 9] = ZERO; #ifdef UNIT - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data12 = *(ao3 + 3); - - b[ 0] = ONE; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - - b[ 4] = ZERO; - b[ 5] = ONE; - b[ 6] = data07; - b[ 7] = data08; - - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = ONE; - b[11] = data12; - - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = ONE; + b[ 10] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = ZERO; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = data11; - b[11] = data12; - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = data16; + b[ 10] = *(ao3 + 2); #endif - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - } - - X += 4; - i --; - } while (i > 0); - } - - i = (m & 3); - if (i) { - - if (X > posY) { - - if (m & 2) { - ao1 += 2; - ao2 += 2; - ao3 += 2; - ao4 += 2; - b += 8; - } - - if (m & 1) { - ao1 += 1; - ao2 += 1; - ao3 += 1; - ao4 += 1; - b += 4; - } - - } else - if (X < posY) { - if (m & 2) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - ao1 += 2 * lda; - ao2 += 2 * lda; - - b += 8; - } - - if (m & 1) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - 
b[ 3] = data04; - - ao1 += lda; - b += 4; - } - - } else { + b[ 11] = *(ao3 + 3); + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(ao4 + 3); +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + X += 4; + i -= 2; + continue; + } + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X > posY) { + /* ao1 += i; + ao2 += i; + ao3 += i; + ao4 += i; */ + b += 4 * i; + } else if (X < posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + + // ao1 += lda; + // ao2 += lda; + // ao3 += lda; + // ao4 += lda; + b += 4; + } + } else { #ifdef UNIT - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - if (i >= 2) { - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - } - - if (i >= 3) { - data12 = *(ao3 + 3); - } - - b[ 0] = ONE; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = ONE; - b[ 2] = data07; - b[ 3] = data08; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ONE; - b[ 3] = data12; - b += 4; - } + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - if (i >= 2) { - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - } - - if (i >= 3) { - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - } - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = data06; - b[ 2] = data07; - b[ 3] = data08; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = data11; - b[ 3] = data12; - b += 4; - } + b[ 0] = *(ao1 + 0); #endif - } - } - - posY += 4; - js --; - } while (js > 0); - } /* End of main loop */ - - - if (n & 2){ - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - } else { - ao1 = a + posX + (posY + 0) 
* lda; - ao2 = a + posX + (posY + 1) * lda; - } - - i = (m >> 1); - if (i > 0) { - do { - if (X > posY) { - ao1 += 2; - ao2 += 2; - b += 4; - - } else - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data05; - b[ 3] = data06; - - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - } else { + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b += 4; + } + } + posY += 4; + } + + + if ((n % 6) & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { #ifdef UNIT - data02 = *(ao1 + 1); + data02 = *(ao1 + 1); - b[ 0] = ONE; - b[ 1] = data02; - b[ 2] = ZERO; - b[ 3] = ONE; + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = ZERO; - b[ 3] = data06; + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data06; #endif - ao1 += 2; - ao2 += 2; - b += 4; - } - - X += 2; - i --; - } while (i > 0); - } - - i = (m & 1); - if (i) { - - if (X > posY) { - ao1 += 1; - ao2 += 1; - - b += 2; - } else - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - - b[ 0] = data01; - b[ 1] = data02; - ao1 += lda; - b += 2; - } else { + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + ao1 += 1; + 
ao2 += 1; + + b += 2; + } else if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { #ifdef UNIT - data02 = *(ao1 + 1); + data02 = *(ao1 + 1); - b[ 0] = ONE; - b[ 1] = data02; + b[ 0] = ONE; + b[ 1] = data02; #else - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); - b[ 0] = data01; - b[ 1] = data02; + b[ 0] = data01; + b[ 1] = data02; #endif - b += 2; - } - } - posY += 2; - } - - if (n & 1){ - X = posX; - - if (posX <= posY) { - ao1 = a + posY + (posX + 0) * lda; - } else { - ao1 = a + posX + (posY + 0) * lda; - } - - i = m; - if (i > 0) { - do { - if (X > posY) { - b += 1; - ao1 += 1; - } else - if (X < posY) { - data01 = *(ao1 + 0); - b[ 0] = data01; - ao1 += lda; - b += 1; - } else { + b += 2; + } + } + posY += 2; + } + + if ((n % 6) & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + b += 1; + ao1 += 1; + } else if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { #ifdef UNIT - b[ 0] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - b[ 0] = data01; + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += 1; - b += 1; - } + ao1 += 1; + b += 1; + } - X ++; - i --; - } while (i > 0); - } + X ++; + i --; + } while (i > 0); + } - posY += 1; - } + posY += 1; + } - return 0; + return 0; } diff --git a/kernel/generic/trmm_uncopy_6.c b/kernel/generic/trmm_uncopy_6.c index 4878f3f530..9521cc7243 100644 --- a/kernel/generic/trmm_uncopy_6.c +++ b/kernel/generic/trmm_uncopy_6.c @@ -41,745 +41,544 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js; - BLASLONG X, mm; - - FLOAT data01, data02, data03, data04, data05, data06; - FLOAT data07, data08, data09, data10, data11, data12; - FLOAT data13, data14, data15, data16, 
data17, data18; - FLOAT data19, data20, data21, data22, data23, data24; - FLOAT data25, data26, data27, data28, data29, data30; - FLOAT data31, data32, data33, data34, data35, data36; - - FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; - - //js = (n >> 2); - js = n/6; - if (js > 0){ - do { - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - ao3 = a + posX + (posY + 2) * lda; - ao4 = a + posX + (posY + 3) * lda; - ao5 = a + posX + (posY + 4) * lda; - ao6 = a + posX + (posY + 5) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - ao3 = a + posY + (posX + 2) * lda; - ao4 = a + posY + (posX + 3) * lda; - ao5 = a + posY + (posX + 4) * lda; - ao6 = a + posY + (posX + 5) * lda; - } - - i = m/6; - if (i > 0) { - do { - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - data05 = *(ao1 + 4); - data06 = *(ao1 + 5); - - data07 = *(ao2 + 0); - data08 = *(ao2 + 1); - data09 = *(ao2 + 2); - data10 = *(ao2 + 3); - data11 = *(ao2 + 4); - data12 = *(ao2 + 5); - - data13 = *(ao3 + 0); - data14 = *(ao3 + 1); - data15 = *(ao3 + 2); - data16 = *(ao3 + 3); - data17 = *(ao3 + 4); - data18 = *(ao3 + 5); - - data19 = *(ao4 + 0); - data20 = *(ao4 + 1); - data21 = *(ao4 + 2); - data22 = *(ao4 + 3); - data23 = *(ao4 + 4); - data24 = *(ao4 + 5); - - data25 = *(ao5 + 0); - data26 = *(ao5 + 1); - data27 = *(ao5 + 2); - data28 = *(ao5 + 3); - data29 = *(ao5 + 4); - data30 = *(ao5 + 5); - - data31 = *(ao6 + 0); - data32 = *(ao6 + 1); - data33 = *(ao6 + 2); - data34 = *(ao6 + 3); - data35 = *(ao6 + 4); - data36 = *(ao6 + 5); - - b[ 0] = data01; - b[ 1] = data07; - b[ 2] = data13; - b[ 3] = data19; - b[ 4] = data25; - b[ 5] = data31; - - b[ 6] = data02; - b[ 7] = data08; - b[ 8] = data14; - b[ 9] = data20; - b[10] = data26; - b[11] = data32; - - b[12] = data03; - b[13] = data09; - b[14] = data15; - b[15] = data21; - b[16] = data27; - b[17] = 
data33; - - b[18] = data04; - b[19] = data10; - b[20] = data16; - b[21] = data22; - b[22] = data28; - b[23] = data34; - - b[24] = data05; - b[25] = data11; - b[26] = data17; - b[27] = data23; - b[28] = data29; - b[29] = data35; - - b[30] = data06; - b[31] = data12; - b[32] = data18; - b[33] = data24; - b[34] = data30; - b[35] = data36; - - ao1 += 6; - ao2 += 6; - ao3 += 6; - ao4 += 6; - ao5 += 6; - ao6 += 6; - b += 36; - } else - if (X > posY) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b[ 4] = ZERO; - b[ 5] = ZERO; - b[ 6] = ZERO; - b[ 7] = ZERO; - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = ZERO; - b[11] = ZERO; - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = ZERO; - b[16] = ZERO; - b[17] = ZERO; - b[18] = ZERO; - b[19] = ZERO; - b[20] = ZERO; - b[21] = ZERO; - b[22] = ZERO; - b[23] = ZERO; - b[24] = ZERO; - b[25] = ZERO; - b[26] = ZERO; - b[27] = ZERO; - b[28] = ZERO; - b[29] = ZERO; - b[30] = ZERO; - b[31] = ZERO; - b[32] = ZERO; - b[33] = ZERO; - b[34] = ZERO; - b[35] = ZERO; - - ao1 += 6 * lda; - ao2 += 6 * lda; - ao3 += 6 * lda; - ao4 += 6 * lda; - ao5 += 6 * lda; - ao6 += 6 * lda; - - b += 36; - } else { - data01 = *(ao1 + 0); - data07 = *(ao2 + 0); - data13 = *(ao3 + 0); - data19 = *(ao4 + 0); - data25 = *(ao5 + 0); - data31 = *(ao6 + 0); - - data08 = *(ao2 + 1); - data14 = *(ao3 + 1); - data20 = *(ao4 + 1); - data26 = *(ao5 + 1); - data32 = *(ao6 + 1); - - data15 = *(ao3 + 2); - data21 = *(ao4 + 2); - data27 = *(ao5 + 2); - data33 = *(ao6 + 2); - - data22 = *(ao4 + 3); - data28 = *(ao5 + 3); - data34 = *(ao6 + 3); - - data29 = *(ao5 + 4); - data35 = *(ao6 + 4); - - data36 = *(ao6 + 5); + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT data01, data02, data05, data06; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = n/6; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + 
ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } + + i = m/6; + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 6; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + ao5 ++; + ao6 ++; + b += 6; + } + } else if (X > posY) { + // b[ 0] = ZERO; + // b[ 1] = ZERO; + // b[ 2] = ZERO; + // b[ 3] = ZERO; + // b[ 4] = ZERO; + // b[ 5] = ZERO; + // b[ 6] = ZERO; + // b[ 7] = ZERO; + // b[ 8] = ZERO; + // b[ 9] = ZERO; + // b[10] = ZERO; + // b[11] = ZERO; + // b[12] = ZERO; + // b[13] = ZERO; + // b[14] = ZERO; + // b[15] = ZERO; + // b[16] = ZERO; + // b[17] = ZERO; + // b[18] = ZERO; + // b[19] = ZERO; + // b[20] = ZERO; + // b[21] = ZERO; + // b[22] = ZERO; + // b[23] = ZERO; + // b[24] = ZERO; + // b[25] = ZERO; + // b[26] = ZERO; + // b[27] = ZERO; + // b[28] = ZERO; + // b[29] = ZERO; + // b[30] = ZERO; + // b[31] = ZERO; + // b[32] = ZERO; + // b[33] = ZERO; + // b[34] = ZERO; + // b[35] = ZERO; + + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + b += 36; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + b[ 6] = ZERO; +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(ao2 + 1); +#endif + b[ 8] = *(ao3 + 1); + b[ 9] = *(ao4 + 1); + b[10] = *(ao5 + 1); + b[11] = *(ao6 + 1); + + b[ 12] = ZERO; + b[ 13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; +#else + b[ 14] = *(ao3 + 2); +#endif + b[ 15] = *(ao4 + 2); + b[ 16] = *(ao5 + 2); + b[ 17] = *(ao6 + 2); + + b[ 18] = ZERO; + b[ 19] = ZERO; 
+ b[ 20] = ZERO; +#ifdef UNIT + b[ 21] = ONE; +#else + b[ 21] = *(ao4 + 3); +#endif + b[ 22] = *(ao5 + 3); + b[ 23] = *(ao6 + 3); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; +#else + b[ 28] = *(ao5 + 4); +#endif + b[ 29] = *(ao6 + 4); + + b[ 30] = ZERO; + b[ 31] = ZERO; + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; +#ifdef UNIT + b[ 35] = ONE; +#else + b[ 35] = *(ao6 + 5); +#endif + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + b += 36; + } + X += 6; + i --; + } while (i > 0); + } + i = m % 6; + if (i) { + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + ao5 ++; + ao6 ++; + b += 6; + } + } else if (X > posY) { + b += 6 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = *(ao5 + 0); + b[ 5] = *(ao6 + 0); + b += 6; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(ao2 + 1); +#endif + b[ 2] = *(ao3 + 1); + b[ 3] = *(ao4 + 1); + b[ 4] = *(ao5 + 1); + b[ 5] = *(ao6 + 1); + b += 6; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(ao3 + 2); +#endif + b[ 3] = *(ao4 + 2); + b[ 4] = *(ao5 + 2); + b[ 5] = *(ao6 + 2); + b += 6; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(ao4 + 3); +#endif + b[ 4] = *(ao5 + 3); + b[ 5] = *(ao6 + 3); + b += 6; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(ao5 + 4); +#endif + b[ 5] = *(ao6 + 4); + b += 6; + } + } + } + posY += 6; + js --; + } while (js > 0); + } /* End of main loop */ + + + if ((n % 6) & 4){ + X 
= posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 2; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + b += 4; + } + } else if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + ao3 += 2 * lda; + ao4 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b[ 4] = ZERO; #ifdef UNIT - b[ 0] = ONE; - b[ 1] = data07; - b[ 2] = data13; - b[ 3] = data19; - b[ 4] = data25; - b[ 5] = data31; - - b[ 6] = ZERO; - b[ 7] = ONE; - b[ 8] = data14; - b[ 9] = data20; - b[10] = data26; - b[11] = data32; - - b[12] = ZERO; - b[13] = ZERO; - b[14] = ONE; - b[15] = data21; - b[16] = data27; - b[17] = data33; - - b[18] = ZERO; - b[19] = ZERO; - b[20] = ZERO; - b[21] = ONE; - b[22] = data28; - b[23] = data34; - - b[24] = ZERO; - b[25] = ZERO; - b[26] = ZERO; - b[27] = ZERO; - b[28] = ONE; - b[29] = data35; - - b[30] = ZERO; - b[31] = ZERO; - b[32] = ZERO; - b[33] = ZERO; - b[34] = ZERO; - b[35] = ONE; + b[ 5] = ONE; #else - b[ 0] = data01; - b[ 1] = data07; - b[ 2] = data13; - b[ 3] = data19; - b[ 4] = data25; - b[ 5] = data31; - - b[ 6] = ZERO; - b[ 7] = data08; - b[ 8] = data14; - b[ 9] = data20; - b[10] = data26; - b[11] = data32; - - b[12] = ZERO; - b[13] = ZERO; - b[14] = data15; - b[15] = data21; - b[16] = data27; - b[17] = data33; - - b[18] = ZERO; - b[19] = ZERO; - b[20] = ZERO; - b[21] = data22; - b[22] = data28; - b[23] = data34; - - b[24] = ZERO; - b[25] = ZERO; - b[26] = ZERO; - b[27] = ZERO; - b[28] = data29; - 
b[29] = data35; - - b[30] = ZERO; - b[31] = ZERO; - b[32] = ZERO; - b[33] = ZERO; - b[34] = ZERO; - b[35] = data36; + b[ 5] = *(ao2 + 1); #endif + b[ 6] = *(ao3 + 1); + b[ 7] = *(ao4 + 1); - ao1 += 6; - ao2 += 6; - ao3 += 6; - ao4 += 6; - ao5 += 6; - ao6 += 7; - - b += 36; - } - X += 6; - i --; - } while (i > 0); - } - mm = m - m/6; - if (mm & 4) { - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b[ 4] = data02; - b[ 5] = data06; - b[ 6] = data10; - b[ 7] = data14; - - b[ 8] = data03; - b[ 9] = data07; - b[10] = data11; - b[11] = data15; - b[12] = data04; - b[13] = data08; - b[14] = data12; - b[15] = data16; - - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - } else - if (X > posY) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b[ 4] = ZERO; - b[ 5] = ZERO; - b[ 6] = ZERO; - b[ 7] = ZERO; - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = ZERO; - b[11] = ZERO; - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = ZERO; - b[16] = ZERO; - b[17] = ZERO; - b[18] = ZERO; - b[19] = ZERO; - b[20] = ZERO; - b[21] = ZERO; - b[22] = ZERO; - b[23] = ZERO; - - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - - b += 16; - } else { + b[ 8] = ZERO; + b[ 9] = ZERO; #ifdef UNIT - data05 = *(ao2 + 0); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - - b[ 0] = ONE; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - - b[ 4] = ZERO; - b[ 5] = ONE; - b[ 6] = data10; - b[ 7] = data14; - - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = ONE; - b[11] = data15; - 
- b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = ONE; + b[ 10] = ONE; #else - data01 = *(ao1 + 0); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - - b[ 4] = ZERO; - b[ 5] = data06; - b[ 6] = data10; - b[ 7] = data14; - - b[ 8] = ZERO; - b[ 9] = ZERO; - b[10] = data11; - b[11] = data15; - - b[12] = ZERO; - b[13] = ZERO; - b[14] = ZERO; - b[15] = data16; + b[ 10] = *(ao3 + 2); #endif - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - - b += 16; - } - X += 4; - } - - if (mm & 3) { - if (X < posY) { - if (mm & 2) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao2 + 0); - data04 = *(ao2 + 1); - data05 = *(ao3 + 0); - data06 = *(ao3 + 1); - data07 = *(ao4 + 0); - data08 = *(ao4 + 1); - - b[ 0] = data01; - b[ 1] = data03; - b[ 2] = data05; - b[ 3] = data07; - b[ 4] = data02; - b[ 5] = data04; - b[ 6] = data06; - b[ 7] = data08; - - ao1 += 2; - ao2 += 2; - ao3 += 2; - ao4 += 2; - b += 8; - } - - if (mm & 1) { - data01 = *(ao1 + 0); - data03 = *(ao2 + 0); - data05 = *(ao3 + 0); - data07 = *(ao4 + 0); - - b[ 0] = data01; - b[ 1] = data03; - b[ 2] = data05; - b[ 3] = data07; - - ao1 += 1; - ao2 += 1; - ao3 += 1; - ao4 += 1; - b += 4; - } - - } else - if (X > posY) { - if (m & 2) { - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 8; - } - - if (m & 1) { - ao1 += lda; - b += 4; - } - - } else { + b[ 11] = *(ao4 + 2); + + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(ao4 + 3); +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + X += 4; + i -= 2; + continue; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao2 + 0); + b[ 2] 
= *(ao3 + 0); + b[ 3] = *(ao4 + 0); + + ao1 ++; + ao2 ++; + ao3 ++; + ao4 ++; + b += 4; + } + } else if (X > posY) { + /* ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; */ + b += 4 * i; + } else { #ifdef UNIT - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data13 = *(ao4 + 0); - - if (i >= 2) { - data10 = *(ao3 + 1); - data14 = *(ao4 + 1); - } - - if (i >= 3) { - data15 = *(ao4 + 2); - } - - b[ 0] = ONE; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = ONE; - b[ 2] = data10; - b[ 3] = data14; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = ONE; - b[ 3] = data15; - b += 4; - } + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data13 = *(ao4 + 0); - - if (i >= 2) { - data06 = *(ao2 + 1); - data10 = *(ao3 + 1); - data14 = *(ao4 + 1); - } - - if (i >= 3) { - data11 = *(ao3 + 2); - data15 = *(ao4 + 2); - } - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data09; - b[ 3] = data13; - b += 4; - - if(i >= 2) { - b[ 0] = ZERO; - b[ 1] = data06; - b[ 2] = data10; - b[ 3] = data14; - b += 4; - } - - if (i >= 3) { - b[ 0] = ZERO; - b[ 1] = ZERO; - b[ 2] = data11; - b[ 3] = data15; - b += 4; - } + b[ 0] = *(ao1 + 0); #endif - } - } - - posY += 4; - js --; - } while (js > 0); - } /* End of main loop */ - - if (n & 2){ - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - } - - i = (m >> 1); - if (i > 0) { - do { - if (X < posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = data02; - b[ 3] = data06; - - ao1 += 2; - ao2 += 2; - b += 4; - - } else - if (X > posY) { - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - - } else { + b[ 1] = *(ao2 + 0); + b[ 2] = *(ao3 + 0); + b[ 3] = *(ao4 + 0); + b += 4; + } + 
} + + posY += 4; + } + + if ((n % 6) & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { #ifdef UNIT - data05 = *(ao2 + 0); + data05 = *(ao2 + 0); - b[ 0] = ONE; - b[ 1] = data05; - b[ 2] = ZERO; - b[ 3] = ONE; + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data05; - b[ 2] = ZERO; - b[ 3] = data06; + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = data06; #endif - ao1 += 2 * lda; - ao2 += 2 * lda; - - b += 4; - } - - X += 2; - i --; - } while (i > 0); - } - - i = (m & 1); - if (i) { - - if (X < posY) { - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - - b[ 0] = data01; - b[ 1] = data05; - ao1 += 1; - ao2 += 1; - b += 2; - } else - if (X > posY) { - ao1 += lda; - ao2 += lda; - b += 2; - } else { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; + ao1 += 1; + ao2 += 1; + b += 2; + } else if (X > posY) { + ao1 += lda; + ao2 += lda; + b += 2; + } else { #ifdef UNIT - data05 = *(ao2 + 0); - b[ 0] = ONE; - b[ 1] = data05; + data05 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data05; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); - b[ 0] = data01; - b[ 1] = data05; 
+ b[ 0] = data01; + b[ 1] = data05; #endif - ao1 += lda; - ao2 += lda; - b += 2; - } - } - - posY += 2; - } - - if (n & 1){ - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - } - - i = m; - if (m > 0) { - do { - if (X < posY) { - data01 = *(ao1 + 0); - b[ 0] = data01; - ao1 += 1; - b += 1; - } else - if (X > posY) { - ao1 += lda; - b += 1; - } else { + ao1 += lda; + ao2 += lda; + b += 2; + } + } + + posY += 2; + } + + if ((n % 6) & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else if (X > posY) { + ao1 += lda; + b += 1; + } else { #ifdef UNIT - b[ 0] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - b[ 0] = data01; + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += lda; - b += 1; - } + ao1 += lda; + b += 1; + } - X += 1; - i --; - } while (i > 0); - } - } + X += 1; + i --; + } while (i > 0); + } + } - return 0; + return 0; } diff --git a/kernel/generic/trmm_utcopy_6.c b/kernel/generic/trmm_utcopy_6.c index 441f7338b5..e7ec4999f3 100644 --- a/kernel/generic/trmm_utcopy_6.c +++ b/kernel/generic/trmm_utcopy_6.c @@ -41,432 +41,510 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js; - BLASLONG X; - - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *ao1, *ao2, *ao3, *ao4; - - js = (n >> 2); - - if (js > 0){ - do { - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - ao3 = a + posX + (posY + 2) * lda; - ao4 = a + posX + (posY + 3) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - ao3 = a + posY + (posX + 2) * lda; - ao4 = a + 
posY + (posX + 3) * lda; - } - - i = (m >> 2); - if (i > 0) { - do { - if (X < posY) { - ao1 += 4; - ao2 += 4; - ao3 += 4; - ao4 += 4; - b += 16; - } else - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data12 = *(ao3 + 3); - - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - b[ 8] = data09; - b[ 9] = data10; - b[10] = data11; - b[11] = data12; - b[12] = data13; - b[13] = data14; - b[14] = data15; - b[15] = data16; - - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - b += 16; - - } else { + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT data01, data02, data05, data06; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + js = (n / 6); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } + + i = (m / 6); + if (i > 0) { + do { + if (X < posY) { + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + + b += 36; + } else if (X > posY) { + for (ii = 0; ii < 6; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + ao1 += lda; + b += 6; + } + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 
+= 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + } else { #ifdef UNIT - data05 = *(ao2 + 0); - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - - b[ 4] = data05; - b[ 5] = ONE; - b[ 6] = ZERO; - b[ 7] = ZERO; - - b[ 8] = data09; - b[ 9] = data10; - b[10] = ONE; - b[11] = ZERO; - - b[12] = data13; - b[13] = data14; - b[14] = data15; - b[15] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - data13 = *(ao4 + 0); - data14 = *(ao4 + 1); - data15 = *(ao4 + 2); - data16 = *(ao4 + 3); - - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = ZERO; - b[ 7] = ZERO; - - b[ 8] = data09; - b[ 9] = data10; - b[10] = data11; - b[11] = ZERO; - - b[12] = data13; - b[13] = data14; - b[14] = data15; - b[15] = data16; + b[ 0] = *(ao1 + 0); #endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; - ao1 += 4 * lda; - ao2 += 4 * lda; - ao3 += 4 * lda; - ao4 += 4 * lda; - - b += 16; - } - - X += 4; - i --; - } while (i > 0); - } - - i = (m & 3); - if (i) { - - if (X < posY) { - - if (m & 2) { - ao1 += 2; - ao2 += 2; - ao3 += 2; - ao4 += 2; - b += 8; - } - - if (m & 1) { - ao1 += 1; - ao2 += 1; - ao3 += 1; - ao4 += 1; - b += 4; - } - - } else - if (X > posY) { - if (m & 2) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - data07 = *(ao2 + 2); - data08 = *(ao2 + 3); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - b[ 4] = data05; - b[ 5] = data06; - b[ 6] = data07; - b[ 7] = data08; - - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 8; - } - - if (m & 1) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data03 = *(ao1 + 2); - data04 = *(ao1 + 3); 
- - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data03; - b[ 3] = data04; - - ao1 += lda; - b += 4; - } - - } else { + b[ 6] = *(ao2 + 0); +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(ao2 + 1); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + + b[12] = *(ao3 + 0); + b[13] = *(ao3 + 1); +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(ao3 + 2); +#endif + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + + b[18] = *(ao4 + 0); + b[19] = *(ao4 + 1); + b[20] = *(ao4 + 2); +#ifdef UNIT + b[21] = ONE; +#else + b[21] = *(ao4 + 3); +#endif + b[22] = ZERO; + b[23] = ZERO; + + b[24] = *(ao5 + 0); + b[25] = *(ao5 + 1); + b[26] = *(ao5 + 2); + b[27] = *(ao5 + 3); +#ifdef UNIT + b[28] = ONE; +#else + b[28] = *(ao5 + 4); +#endif + b[29] = ZERO; + + b[30] = *(ao6 + 0); + b[31] = *(ao6 + 1); + b[32] = *(ao6 + 2); + b[33] = *(ao6 + 3); + b[34] = *(ao6 + 4); +#ifdef UNIT + b[35] = ONE; +#else + b[35] = *(ao6 + 5); +#endif + + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + b += 36; + } + + X += 6; + i --; + } while (i > 0); + } + + i = m % 6; + if (i > 0) { + if (X < posY) { + + ao1 += i; + ao2 += i; + ao3 += i; + ao4 += i; + ao5 += i; + ao6 += i; + b += 6 * i; + + } else if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + b[ 4] = *(ao1 + 4); + b[ 5] = *(ao1 + 5); + + ao1 += lda; + ao2 += lda; + ao3 += lda; + ao4 += lda; + ao5 += lda; + ao6 += lda; + b += 6; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; /* advance past packed row 0 so the rows written below do not overwrite it */ + if (i >= 2) { + b[ 0] = *(ao2 + 0); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(ao2 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 3) { + b[ 0] = *(ao3 + 0); + b[ 1] = *(ao3 + 1); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(ao3 + 2);
+#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 4) { + b[ 0] = *(ao4 + 0); + b[ 1] = *(ao4 + 1); + b[ 2] = *(ao4 + 2); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(ao4 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b += 6; + } + + if (i >= 5) { + b[ 0] = *(ao5 + 0); + b[ 1] = *(ao5 + 1); + b[ 2] = *(ao5 + 2); + b[ 3] = *(ao5 + 3); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(ao5 + 4); +#endif + b[ 5] = ZERO; + b += 6; + } + } + } + + posY += 6; + js --; + } while (js > 0); + } /* End of main loop */ + + if ((n % 6) & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } else if (X > posY) { + for (ii = 0; ii < 2; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + ao1 += lda; + b += 4; + } + + ao2 += 2 * lda; + ao3 += 2 * lda; + ao4 += 2 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(ao1 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(ao2 + 0); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(ao2 + 1); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(ao3 + 0); + b[ 9] = *(ao3 + 1); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(ao3 + 2); +#endif + b[ 11] = ZERO; + + b[ 12] = *(ao4 + 0); + b[ 13] = *(ao4 + 1); + b[ 14] = *(ao4 + 2); +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(ao4 + 3); +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + X += 4; + i -= 2; + continue; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X < posY) { + ao1 += i; 
+ ao2 += i; + ao3 += i; + ao4 += i; + b += 4 * i; + } else if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao1 + 2); + b[ 3] = *(ao1 + 3); + ao1 += lda; + b += 4; + } + ao2 += lda; + ao3 += lda; + ao4 += lda; + } else { #ifdef UNIT - if (i >= 2) { - data05 = *(ao2 + 0); - } - - if (i >= 3) { - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - } - - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b += 4; - - if(i >= 2) { - b[ 0] = data05; - b[ 1] = ONE; - b[ 2] = ZERO; - b[ 3] = ZERO; - b += 4; - } - - if (i >= 3) { - b[ 0] = data09; - b[ 1] = data10; - b[ 2] = ONE; - b[ 3] = ZERO; - b += 4; - } + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - - if (i >= 2) { - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - } - - if (i >= 3) { - data09 = *(ao3 + 0); - data10 = *(ao3 + 1); - data11 = *(ao3 + 2); - } - - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = ZERO; - b[ 3] = ZERO; - b += 4; - - if(i >= 2) { - b[ 0] = data05; - b[ 1] = data06; - b[ 2] = ZERO; - b[ 3] = ZERO; - b += 4; - } - - if (i >= 3) { - b[ 0] = data09; - b[ 1] = data10; - b[ 2] = data11; - b[ 3] = ZERO; - b += 4; - } + b[ 0] = *(ao1 + 0); #endif - } - } - - posY += 4; - js --; - } while (js > 0); - } /* End of main loop */ - - if (n & 2){ - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - ao2 = a + posX + (posY + 1) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - ao2 = a + posY + (posX + 1) * lda; - } - - i = (m >> 1); - if (i > 0) { - do { - if (X < posY) { - ao1 += 2; - ao2 += 2; - b += 4; - - } else - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); - - b[ 0] = data01; - b[ 1] = data02; - b[ 2] = data05; - b[ 3] = data06; - - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - } else { + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + posY += 4; + } + + if ((n % 6) & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + 
ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { #ifdef UNIT - data05 = *(ao2 + 0); + data05 = *(ao2 + 0); - b[ 0] = ONE; - b[ 1] = ZERO; - b[ 2] = data05; - b[ 3] = ONE; + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = ONE; #else - data01 = *(ao1 + 0); - data05 = *(ao2 + 0); - data06 = *(ao2 + 1); + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); - b[ 0] = data01; - b[ 1] = ZERO; - b[ 2] = data05; - b[ 3] = data06; + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; #endif - ao1 += 2 * lda; - ao2 += 2 * lda; - b += 4; - } - - X += 2; - i --; - } while (i > 0); - } - - i = (m & 1); - if (i) { - - if (X < posY) { - ao1 += 2; - b += 2; - } else - if (X > posY) { - data01 = *(ao1 + 0); - data02 = *(ao1 + 1); - - b[ 0] = data01; - b[ 1] = data02; - - ao1 += lda; - b += 2; - } else { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + ao1 += 2; + b += 2; + } else if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { #ifdef UNIT - b[ 0] = ONE; - b[ 1] = ZERO; + b[ 0] = ONE; + b[ 1] = ZERO; #else - data01 = *(ao1 + 0); + data01 = *(ao1 + 0); - b[ 0] = data01; - b[ 1] = ZERO; + b[ 0] = data01; + b[ 1] = ZERO; #endif - b += 2; - } - } - posY += 2; - } - - if (n & 1){ - X = posX; - - if (posX <= posY) { - ao1 = a + posX + (posY + 0) * lda; - } else { - ao1 = a + posY + (posX + 0) * lda; - } - - i = m; - if (m > 0) { - do { - - if (X < posY) { - b 
+= 1; - ao1 += 1; - } else - if (X > posY) { - data01 = *(ao1 + 0); - b[ 0] = data01; - ao1 += lda; - b += 1; - } else { + b += 2; + } + } + posY += 2; + } + + if ((n % 6) & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b += 1; + ao1 += 1; + } else if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { #ifdef UNIT - b[ 0] = ONE; + b[ 0] = ONE; #else - data01 = *(ao1 + 0); - b[ 0] = data01; + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += lda; - b += 1; - } + ao1 += lda; + b += 1; + } - X += 1; - i --; - } while (i > 0); - } - } + X += 1; + i --; + } while (i > 0); + } + } - return 0; + return 0; } diff --git a/kernel/generic/trsm_lncopy_6.c b/kernel/generic/trsm_lncopy_6.c index a37c50d1f8..b0cc7ba40c 100644 --- a/kernel/generic/trsm_lncopy_6.c +++ b/kernel/generic/trsm_lncopy_6.c @@ -49,22 +49,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *a1, *a2, *a3, *a4; + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data09, data10, data11, data12, data13, data14; + FLOAT data17, data18, data19, data20, data21, data22; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data33, data34, data35, data36, data37, data38; + FLOAT data41, data42, data43, data44, data45, data46; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; - j = (n >> 2); + BLASLONG mmod6, nmod6; + mmod6 = m - (m/6)*6 ; + nmod6 = n - (n/6)*6 ; + + // j = (n >> 3); + j = (n / 6); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; - i = (m >> 2); ii = 0; + // i = (m >> 3); + i = (m / 6); while (i > 0) { if 
(ii == jj) { @@ -74,233 +87,562 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); #ifndef UNIT - data06 = *(a2 + 1); + data19 = *(a3 + 2); #endif - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); #ifndef UNIT - data11 = *(a3 + 2); + data28 = *(a4 + 3); #endif - data12 = *(a3 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); #ifndef UNIT - data16 = *(a4 + 3); + data37 = *(a5 + 4); +#endif + data38 = *(a5 + 5); + +#ifndef UNIT + data46 = *(a6 + 5); #endif *(b + 0) = INV(data01); - *(b + 4) = data02; - *(b + 5) = INV(data06); + *(b + 6) = data02; + *(b + 7) = INV(data10); - *(b + 8) = data03; - *(b + 9) = data07; - *(b + 10) = INV(data11); + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = INV(data19); + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = INV(data28); + + *(b + 24) = data05; + *(b + 25) = data13; + *(b + 26) = data21; + *(b + 27) = data29; + *(b + 28) = INV(data37); + + *(b + 30) = data06; + *(b + 31) = data14; + *(b + 32) = data22; + *(b + 33) = data30; + *(b + 34) = data38; + *(b + 35) = INV(data46); - *(b + 12) = data04; - *(b + 13) = data08; - *(b + 14) = data12; - *(b + 15) = INV(data16); } if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); 
+ data29 = *(a4 + 4); + data30 = *(a4 + 5); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = data19; + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = data28; + *(b + 22) = data36; + *(b + 23) = data44; + + *(b + 24) = data05; + *(b + 25) = data13; + *(b + 26) = data21; + *(b + 27) = data29; + *(b + 28) = data37; + *(b + 29) = data45; + + *(b + 30) = data06; + *(b + 31) = data14; + *(b + 32) = data22; + *(b + 33) = data30; + *(b + 34) = data38; + *(b + 35) = data46; + } + + a1 += 6; + a2 += 6; + a3 += 6; + a4 += 6; + a5 += 6; + a6 += 6; + a7 += 6; + a8 += 6; + b += 36; + + i --; + ii += 6; + } + + if (mmod6 & 4) { + if (ii == jj) { +#ifndef UNIT data01 = *(a1 + 0); +#endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); - data09 = *(a3 + 0); - data10 = *(a3 + 1); - data11 = *(a3 + 2); - data12 = *(a3 + 3); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); - data16 = *(a4 + 3); + *(b + 6) = data02; + *(b + 7) = INV(data10); + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = INV(data19); + + *(b 
+ 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); *(b + 0) = data01; - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; - *(b + 4) = data02; - *(b + 5) = data06; - *(b + 6) = data10; - *(b + 7) = data14; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = data19; + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = data28; + *(b + 22) = data36; + *(b + 23) = data44; - *(b + 8) = data03; - *(b + 9) = data07; - *(b + 10) = data11; - *(b + 11) = data15; - *(b + 12) = data04; - *(b + 13) = data08; - *(b + 14) = data12; - *(b + 15) = data16; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; - b += 16; - - i --; + a5 += 4; + a6 += 4; + a7 += 4; + a8 += 4; + b += 24; ii += 4; } - if ((m & 2) != 0) { - - if (ii== jj) { + if (mmod6 & 2) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); - *(b + 4) = data02; - *(b + 5) = INV(data06); + *(b + 6) = data02; + *(b + 7) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = 
*(a2 + 0); - data04 = *(a2 + 1); - data05 = *(a3 + 0); - data06 = *(a3 + 1); - data07 = *(a4 + 0); - data08 = *(a4 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); *(b + 0) = data01; - *(b + 1) = data03; - *(b + 2) = data05; - *(b + 3) = data07; - *(b + 4) = data02; - *(b + 5) = data04; - *(b + 6) = data06; - *(b + 7) = data08; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; - b += 8; - + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 12; ii += 2; } - if ((m & 1) != 0) { + if (mmod6 & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + data33 = *(a5 + 0); + data41 = *(a6 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + } + b += 6; + } + + a += 6 * lda; + jj += 6; + j --; + } + + if (nmod6 & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data10); + + *(b + 8) = data03; + *(b + 9) = data11; + *(b + 10) = INV(data19); + + *(b + 12) = data04; + *(b + 13) = 
data12; + *(b + 14) = data20; + *(b + 15) = INV(data28); + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i -= 2; + ii += 4; + } + + else if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + i -- ; + ii += 2; + } + + else { + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + i -- ; + ii += 2; + } + } - if (ii== jj) { + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } - if (ii > jj) { + if (ii > jj) { data01 = *(a1 + 0); - data02 = *(a2 + 0); - data03 = *(a3 + 0); - data04 = *(a4 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); *(b + 0) = data01; - *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; } - b += 4; + b += 4; } a += 4 * lda; jj += 4; - j --; } - if (n & 2) { + if (nmod6 & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; - i = (m >> 1); ii = 0; + i = (m >> 1); while (i > 0) { if (ii == jj) { - #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT - data04 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data02; - *(b + 3) = INV(data04); + *(b + 3) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); *(b + 0) = data01; - *(b + 1) = data03; + *(b + 1) = data09; *(b + 2) = data02; - *(b + 3) = data04; + *(b + 3) = data10; } a1 += 2; a2 += 2; - b += 4; + b += 4; i --; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = 
INV(data01); } - if (ii > jj) { + if (ii > jj) { data01 = *(a1 + 0); - data02 = *(a2 + 0); + data09 = *(a2 + 0); + *(b + 0) = data01; - *(b + 1) = data02; + *(b + 1) = data09; } - b += 2; + b += 2; } + a += 2 * lda; jj += 2; } - if (n & 1) { + if (nmod6 & 1) { a1 = a + 0 * lda; - i = m; ii = 0; + i = m; while (i > 0) { if (ii == jj) { @@ -315,8 +657,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; } - a1+= 1; - b += 1; + a1 += 1; + b += 1; + i --; ii += 1; } diff --git a/kernel/generic/trsm_ltcopy_6.c b/kernel/generic/trsm_ltcopy_6.c index 12043eb335..9cda3d72ff 100644 --- a/kernel/generic/trsm_ltcopy_6.c +++ b/kernel/generic/trsm_ltcopy_6.c @@ -49,22 +49,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *a1, *a2, *a3, *a4; + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data09, data10, data11, data12, data13, data14; + FLOAT data17, data18, data19, data20, data21, data22; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data33, data34, data35, data36, data37, data38; + FLOAT data41, data42, data43, data44, data45, data46; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; - j = (n >> 2); + BLASLONG mmod6, nmod6, k; + mmod6 = m - (m/6)*6 ; + nmod6 = n - (n/6)*6 ; + + // j = (n >> 3); + j = (n / 6); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; - i = (m >> 2); ii = 0; + // i = (m >> 3); + i = (m / 6); while (i > 0) { if (ii == jj) { @@ -75,35 +88,65 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); #ifndef UNIT - data06 = *(a2 + 1); + data10 = 
*(a2 + 1); #endif - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); #ifndef UNIT - data11 = *(a3 + 2); + data19 = *(a3 + 2); #endif - data12 = *(a3 + 3); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); #ifndef UNIT - data16 = *(a4 + 3); + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + +#ifndef UNIT + data37 = *(a5 + 4); +#endif + data38 = *(a5 + 5); + +#ifndef UNIT + data46 = *(a6 + 5); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; - *(b + 5) = INV(data06); - *(b + 6) = data07; - *(b + 7) = data08; + *(b + 7) = INV(data10); + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; - *(b + 10) = INV(data11); - *(b + 11) = data12; + *(b + 14) = INV(data19); + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; - *(b + 15) = INV(data16); + *(b + 21) = INV(data28); + *(b + 22) = data29; + *(b + 23) = data30; + + *(b + 28) = INV(data37); + *(b + 29) = data38; + + *(b + 35) = INV(data46); } if (ii < jj) { @@ -111,21 +154,182 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + 
data45 = *(a6 + 4); + data46 = *(a6 + 5); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = data19; + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = data28; + *(b + 22) = data29; + *(b + 23) = data30; + + *(b + 24) = data33; + *(b + 25) = data34; + *(b + 26) = data35; + *(b + 27) = data36; + *(b + 28) = data37; + *(b + 29) = data38; + *(b + 30) = data41; + *(b + 31) = data42; + *(b + 32) = data43; + *(b + 33) = data44; + *(b + 34) = data45; + *(b + 35) = data46; + } - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + a1 += 6 * lda; + a2 += 6 * lda; + a3 += 6 * lda; + a4 += 6 * lda; + a5 += 6 * lda; + a6 += 6 * lda; + a7 += 6 * lda; + a8 += 6 * lda; + b += 36; - data09 = *(a3 + 0); - data10 = *(a3 + 1); - data11 = *(a3 + 2); - data12 = *(a3 + 3); + i --; + ii += 6; + } - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); - data16 = *(a4 + 3); + if (mmod6 & 4) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + + *(b + 7) = INV(data10); + *(b + 8) = data11; + *(b + 9) = data12; 
+ *(b + 10) = data13; + *(b + 11) = data14; + + *(b + 14) = INV(data19); + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + + *(b + 21) = INV(data28); + *(b + 22) = data29; + *(b + 23) = data30; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); *(b + 0) = data01; *(b + 1) = data02; @@ -133,32 +337,38 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; - - *(b + 8) = data09; - *(b + 9) = data10; - *(b + 10) = data11; - *(b + 11) = data12; - *(b + 12) = data13; - *(b + 13) = data14; - *(b + 14) = data15; - *(b + 15) = data16; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = data19; + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = data28; + *(b + 22) = data29; + *(b + 23) = data30; } a1 += 4 * lda; a2 += 4 * lda; - a3 += 4 * lda; - a4 += 4 * lda; - b += 16; + /* a3 += 4 * lda; + a4 += 4 * lda; */ + b += 24; - i --; ii += 4; } - if ((m & 2) != 0) { - - if (ii== jj) { + if (mmod6 & 2) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); @@ -166,22 +376,29 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); 
+ data05 = *(a1 + 4); + data06 = *(a1 + 5); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; - *(b + 5) = INV(data06); - *(b + 6) = data07; - *(b + 7) = data08; - + *(b + 7) = INV(data10); + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; } if (ii < jj) { @@ -189,11 +406,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); *(b + 0) = data01; *(b + 1) = data02; @@ -201,20 +422,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; } a1 += 2 * lda; - a2 += 2 * lda; - b += 8; + // a2 += 2 * lda; + b += 12; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (mmod6 & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); @@ -222,38 +446,78 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; } - if (ii < jj) { + if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); 
+ data05 = *(a1 + 4); + data06 = *(a1 + 5); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; } - b += 4; + b += 6; } + a += 6; + jj += 6; + j --; + } + if (nmod6 & 4) { + + a1 = a; a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + jj += 4; - j --; } - if (n & 2) { + if (nmod6 & 2) { + a1 = a + 0 * lda; a2 = a + 1 * lda; - i = (m >> 1); ii = 0; + i = (m >> 1); while (i > 0) { if (ii == jj) { @@ -264,25 +528,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); #ifndef UNIT - data04 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data02; - - *(b + 3) = INV(data04); + *(b + 3) = INV(data10); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; + *(b + 2) = data09; + *(b + 3) = data10; } a1 += 2 * lda; @@ -293,19 +556,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif + // data02 = *(a1 + 1); + *(b + 0) = INV(data01); + // *(b + 1) = data02; } - if (ii < jj) { + if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); + *(b + 0) = data01; *(b + 1) = data02; } @@ -315,11 +581,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj += 2; } - if (n & 1) { + if (nmod6 & 1) { + a1 = a + 0 * lda; - i = m; ii = 0; + i = m; while (i > 0) { if (ii == jj) { @@ 
-334,12 +601,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; } - a1 += 1 * lda; + a1 += lda; b += 1; i --; ii += 1; } + } return 0; diff --git a/kernel/generic/trsm_uncopy_6.c b/kernel/generic/trsm_uncopy_6.c index a1bb1e2034..e20773da47 100644 --- a/kernel/generic/trsm_uncopy_6.c +++ b/kernel/generic/trsm_uncopy_6.c @@ -36,7 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include #include "common.h" #ifndef UNIT @@ -49,22 +48,38 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *a1, *a2, *a3, *a4; + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data09, data10, data11, data12, data13, data14; + FLOAT data17, data18, data19, data20, data21, data22; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data33, data34, data35, data36, data37, data38; + FLOAT data41, data42, data43, data44, data45, data46; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; - j = (n >> 2); + BLASLONG mmod6, nmod6; + mmod6 = m - (m/6)*6 ; + nmod6 = n - (n/6)*6 ; + + // j = (n >> 3); + j = (n / 6); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + // a7 = a + 6 * lda; + // a8 = a + 7 * lda; - i = (m >> 2); ii = 0; + + // i = (m >> 3); + i = (m / 6); while (i > 0) { if (ii == jj) { @@ -73,188 +88,729 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); #endif - data05 = *(a2 + 0); + data09 = *(a2 + 0); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif - data09 = *(a3 + 0); - data10 = *(a3 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); #ifndef UNIT - data11 
= *(a3 + 2); + data19 = *(a3 + 2); #endif - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); #ifndef UNIT - data16 = *(a4 + 3); + data28 = *(a4 + 3); #endif - *(b + 0) = INV(data01); - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); +#ifndef UNIT + data37 = *(a5 + 4); +#endif - *(b + 5) = INV(data06); - *(b + 6) = data10; - *(b + 7) = data14; + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); +#ifndef UNIT + data46 = *(a6 + 5); +#endif - *(b + 10) = INV(data11); - *(b + 11) = data15; +// data49 = *(a7 + 0); +// data50 = *(a7 + 1); +// data51 = *(a7 + 2); +// data52 = *(a7 + 3); +// data53 = *(a7 + 4); +// data54 = *(a7 + 5); +// #ifndef UNIT +// data55 = *(a7 + 6); +// #endif +// +// data57 = *(a8 + 0); +// data58 = *(a8 + 1); +// data59 = *(a8 + 2); +// data60 = *(a8 + 3); +// data61 = *(a8 + 4); +// data62 = *(a8 + 5); +// data63 = *(a8 + 6); +// #ifndef UNIT +// data64 = *(a8 + 7); +// #endif - *(b + 15) = INV(data16); + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 7) = INV(data10); + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + + *(b + 14) = INV(data19); + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + // *(b + 22) = data51; + // *(b + 23) = data59; + + *(b + 21) = INV(data28); + *(b + 22) = data36; + *(b + 23) = data44; + // *(b + 30) = data52; + // *(b + 31) = data60; + + *(b + 28) = INV(data37); + *(b + 29) = data45; + // *(b + 38) = data53; + // *(b + 39) = data61; + + *(b + 35) = INV(data46); + // *(b + 46) = data54; + // *(b + 47) = data62; + + // *(b + 54) = INV(data55); 
+ // *(b + 55) = data63; + + // *(b + 63) = INV(data64); } if (ii < jj) { - data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + // data07 = *(a1 + 6); + // data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + // data15 = *(a2 + 6); + // data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + // data23 = *(a3 + 6); + // data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + // data31 = *(a4 + 6); + // data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + // data39 = *(a5 + 6); + // data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + // data47 = *(a6 + 6); + // data48 = *(a6 + 7); + + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data51 = *(a7 + 2); + // data52 = *(a7 + 3); + // data53 = *(a7 + 4); + // data54 = *(a7 + 5); + // data55 = *(a7 + 6); + // data56 = *(a7 + 7); + + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + // data59 = *(a8 + 2); + // data60 = *(a8 + 3); + // data61 = *(a8 + 4); + // data62 = *(a8 + 5); + // data63 = *(a8 + 6); + // data64 = *(a8 + 7); - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b 
+ 15) = data58; + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = data19; + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + // *(b + 22) = data51; + // *(b + 23) = data59; + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = data28; + *(b + 22) = data36; + *(b + 23) = data44; + // *(b + 30) = data52; + // *(b + 31) = data60; + + *(b + 24) = data05; + *(b + 25) = data13; + *(b + 26) = data21; + *(b + 27) = data29; + *(b + 28) = data37; + *(b + 29) = data45; + // *(b + 38) = data53; + // *(b + 39) = data61; + + *(b + 30) = data06; + *(b + 31) = data14; + *(b + 32) = data22; + *(b + 33) = data30; + *(b + 34) = data38; + *(b + 35) = data46; + // *(b + 46) = data54; + // *(b + 47) = data62; + + // *(b + 48) = data07; + // *(b + 49) = data15; + // *(b + 50) = data23; + // *(b + 51) = data31; + // *(b + 52) = data39; + // *(b + 53) = data47; + // *(b + 54) = data55; + // *(b + 55) = data63; + + // *(b + 56) = data08; + // *(b + 57) = data16; + // *(b + 58) = data24; + // *(b + 59) = data32; + // *(b + 60) = data40; + // *(b + 61) = data48; + // *(b + 62) = data56; + // *(b + 63) = data64; + } + + a1 += 6; + a2 += 6; + a3 += 6; + a4 += 6; + a5 += 6; + a6 += 6; + // a7 += 6; + // a8 += 6; + b += 36; - data09 = *(a3 + 0); - data10 = *(a3 + 1); - data11 = *(a3 + 2); - data12 = *(a3 + 3); + i --; + ii += 6; + } - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); - data16 = *(a4 + 3); + if (mmod6 & 4) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); 
+ + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data51 = *(a7 + 2); + // data52 = *(a7 + 3); + + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + // data59 = *(a8 + 2); + // data60 = *(a8 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 7) = INV(data10); + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + + *(b + 14) = INV(data19); + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + // *(b + 22) = data51; + // *(b + 23) = data59; + + *(b + 21) = INV(data28); + *(b + 22) = data36; + *(b + 23) = data44; + // *(b + 30) = data52; + // *(b + 31) = data60; + + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data51 = *(a7 + 2); + // data52 = *(a7 + 3); + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + // data59 = *(a8 + 2); + // data60 = *(a8 + 3); *(b + 0) = data01; - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; - *(b + 4) = data02; - *(b + 5) = data06; - *(b + 6) = data10; - *(b + 7) = data14; - - *(b + 8) = data03; - *(b + 9) = data07; - *(b + 10) = data11; - *(b + 11) = data15; - *(b + 12) = data04; - *(b + 13) = data08; - *(b + 14) = data12; - *(b + 15) = data16; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = 
data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + + *(b + 12) = data03; + *(b + 13) = data11; + *(b + 14) = data19; + *(b + 15) = data27; + *(b + 16) = data35; + *(b + 17) = data43; + // *(b + 22) = data51; + // *(b + 23) = data59; + + *(b + 18) = data04; + *(b + 19) = data12; + *(b + 20) = data20; + *(b + 21) = data28; + *(b + 22) = data36; + *(b + 23) = data44; + // *(b + 30) = data52; + // *(b + 31) = data60; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; - b += 16; - - i --; + a5 += 4; + a6 += 4; + // a7 += 4; + // a8 += 4; + b += 24; ii += 4; } - if ((m & 2) != 0) { + if (mmod6 & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); - if (ii== jj) { + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 7) = INV(data10); + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + + *(b + 0) = data01; + *(b + 1) = 
data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + + *(b + 6) = data02; + *(b + 7) = data10; + *(b + 8) = data18; + *(b + 9) = data26; + *(b + 10) = data34; + *(b + 11) = data42; + // *(b + 14) = data50; + // *(b + 15) = data58; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 12; + ii += 2; + } + + if (mmod6 & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + data33 = *(a5 + 0); + data41 = *(a6 + 0); + // data49 = *(a7 + 0); + // data57 = *(a8 + 0); - data05 = *(a2 + 0); + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + } + + if (ii < jj) { + data01 = *(a1 + 0); + // data02 = *(a1 + 1); + data09 = *(a2 + 0); + // data10 = *(a2 + 1); + data17 = *(a3 + 0); + // data18 = *(a3 + 1); + data25 = *(a4 + 0); + // data26 = *(a4 + 1); + + // // data33 = *(a5 + 0); + // data34 = *(a5 + 1); + // // data41 = *(a6 + 0); + // data42 = *(a6 + 1); + // data49 = *(a7 + 0); + // data50 = *(a7 + 1); + // data57 = *(a8 + 0); + // data58 = *(a8 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + // *(b + 6) = data49; + // *(b + 7) = data57; + } + b += 6; + // ii += 1; + } + + a += 6 * lda; + jj += 6; + j --; + } + + + if (nmod6 & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif - data09 = *(a3 + 0); - data10 = *(a3 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif - data13 = 
*(a4 + 0); - data14 = *(a4 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif *(b + 0) = INV(data01); - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + + *(b + 5) = INV(data10); + *(b + 6) = data18; + *(b + 7) = data26; + + *(b + 10) = INV(data19); + *(b + 11) = data27; - *(b + 5) = INV(data06); - *(b + 6) = data10; - *(b + 7) = data14; + *(b + 15) = INV(data28); + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i -= 2; + ii += 4; } - if (ii < jj) { + else if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); - data05 = *(a3 + 0); - data06 = *(a3 + 1); - data07 = *(a4 + 0); - data08 = *(a4 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); *(b + 0) = data01; - *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; - *(b + 4) = data05; - *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + + i -- ; + ii += 2; } + else{ + a1 += 2; a2 += 2; - b += 8; + a3 += 2; + a4 += 2; + b += 8; + i -- ; ii += 2; } + } - if ((m & 1) != 0) { + if (m & 1) { + if (ii == jj) { - if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif - - data05 = *(a2 + 0); - data09 = *(a3 + 0); - data13 = *(a4 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); *(b + 0) = INV(data01); - *(b + 1) = data05; - *(b + 2) = data09; - *(b + 3) = data13; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; } - if (ii < jj) { + if (ii < jj) { data01 = *(a1 + 0); - data02 = *(a2 + 0); - data03 = *(a3 + 0); - data04 = *(a4 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 
+ 0); *(b + 0) = data01; - *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; } b += 4; + // ii += 1; } - a += 4 * lda; + a += 4 * lda; jj += 4; - j --; } - if (n & 2) { + if (nmod6 & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; - i = (m >> 1); ii = 0; + + i = (m >> 1); while (i > 0) { if (ii == jj) { @@ -263,68 +819,70 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); #endif - data03 = *(a2 + 0); + data09 = *(a2 + 0); #ifndef UNIT - data04 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); - *(b + 1) = data03; - *(b + 3) = INV(data04); + *(b + 1) = data09; + + *(b + 3) = INV(data10); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); *(b + 0) = data01; - *(b + 1) = data03; + *(b + 1) = data09; *(b + 2) = data02; - *(b + 3) = data04; + *(b + 3) = data10; } a1 += 2; a2 += 2; - b += 4; + b += 4; i --; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { - + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif - - data03 = *(a2 + 0); + data09 = *(a2 + 0); *(b + 0) = INV(data01); - *(b + 1) = data03; + *(b + 1) = data09; } - if (ii < jj) { + if (ii < jj) { data01 = *(a1 + 0); - data02 = *(a2 + 0); + data09 = *(a2 + 0); + *(b + 0) = data01; - *(b + 1) = data02; + *(b + 1) = data09; } b += 2; + // ii += 1; } - a += 2 * lda; + + a += 2 * lda; jj += 2; } - if (n & 1) { + if (nmod6 & 1) { a1 = a + 0 * lda; - i = m; ii = 0; + + i = m; while (i > 0) { if (ii == jj) { @@ -339,10 +897,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; } - a1+= 1; - b += 1; + a1 += 1; + b += 1; i --; - ii += 1; + ii ++; } } diff --git a/kernel/generic/trsm_utcopy_6.c b/kernel/generic/trsm_utcopy_6.c index f83617224f..6afc005470 100644 --- a/kernel/generic/trsm_utcopy_6.c +++ 
b/kernel/generic/trsm_utcopy_6.c @@ -49,21 +49,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02, data03, data04, data05, data06, data07, data08; - FLOAT data09, data10, data11, data12, data13, data14, data15, data16; - FLOAT *a1, *a2, *a3, *a4; + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data09, data10, data11, data12, data13, data14; + FLOAT data17, data18, data19, data20, data21, data22; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data33, data34, data35, data36, data37, data38; + FLOAT data41, data42, data43, data44, data45, data46; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; - j = (n >> 2); + BLASLONG mmod6, nmod6, k; + mmod6 = m - (m/6)*6 ; + nmod6 = n - (n/6)*6 ; + + // j = (n >> 3); + j = (n / 6); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; - i = (m >> 2); + // i = (m >> 3); + i = (m / 6); ii = 0; while (i > 0) { @@ -72,37 +85,67 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); #endif - data05 = *(a2 + 0); + data09 = *(a2 + 0); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif - data09 = *(a3 + 0); - data10 = *(a3 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); #ifndef UNIT - data11 = *(a3 + 2); + data19 = *(a3 + 2); #endif - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); #ifndef UNIT - data16 = *(a4 + 3); + data28 = *(a4 + 3); #endif - *(b + 0) = INV(data01); + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); +#ifndef UNIT + data37 = *(a5 + 4); +#endif - *(b + 4) = data05; - *(b + 5) = INV(data06); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); +#ifndef UNIT + data46 = *(a6 + 5); +#endif 
- *(b + 8) = data09; - *(b + 9) = data10; - *(b + 10) = INV(data11); + *(b + 0) = INV(data01); - *(b + 12) = data13; - *(b + 13) = data14; - *(b + 14) = data15; - *(b + 15) = INV(data16); + *(b + 6) = data09; + *(b + 7) = INV(data10); + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = INV(data19); + + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = INV(data28); + + *(b + 24) = data33; + *(b + 25) = data34; + *(b + 26) = data35; + *(b + 27) = data36; + *(b + 28) = INV(data37); + + *(b + 30) = data41; + *(b + 31) = data42; + *(b + 32) = data43; + *(b + 33) = data44; + *(b + 34) = data45; + *(b + 35) = INV(data46); } if (ii > jj) { @@ -110,21 +153,166 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = data19; + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) 
= data27; + *(b + 21) = data28; + *(b + 22) = data29; + *(b + 23) = data30; + + *(b + 24) = data33; + *(b + 25) = data34; + *(b + 26) = data35; + *(b + 27) = data36; + *(b + 28) = data37; + *(b + 29) = data38; + *(b + 30) = data41; + *(b + 31) = data42; + *(b + 32) = data43; + *(b + 33) = data44; + *(b + 34) = data45; + *(b + 35) = data46; - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + } - data09 = *(a3 + 0); - data10 = *(a3 + 1); - data11 = *(a3 + 2); - data12 = *(a3 + 3); + a1 += 6 * lda; + a2 += 6 * lda; + a3 += 6 * lda; + a4 += 6 * lda; + a5 += 6 * lda; + a6 += 6 * lda; + a7 += 6 * lda; + a8 += 6 * lda; + b += 36; - data13 = *(a4 + 0); - data14 = *(a4 + 1); - data15 = *(a4 + 2); - data16 = *(a4 + 3); + i --; + ii += 6; + } + + if (mmod6 & 4) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 6) = data09; + *(b + 7) = INV(data10); + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = INV(data19); + + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); *(b + 0) = data01; *(b + 1) = data02; @@ -132,44 +320,49 @@ int 
CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; - - *(b + 8) = data09; - *(b + 9) = data10; - *(b + 10) = data11; - *(b + 11) = data12; - *(b + 12) = data13; - *(b + 13) = data14; - *(b + 14) = data15; - *(b + 15) = data16; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; + + *(b + 12) = data17; + *(b + 13) = data18; + *(b + 14) = data19; + *(b + 15) = data20; + *(b + 16) = data21; + *(b + 17) = data22; + *(b + 18) = data25; + *(b + 19) = data26; + *(b + 20) = data27; + *(b + 21) = data28; + *(b + 22) = data29; + *(b + 23) = data30; } a1 += 4 * lda; a2 += 4 * lda; - a3 += 4 * lda; - a4 += 4 * lda; - b += 16; - - i --; + /* a3 += 4 * lda; + a4 += 4 * lda; */ + b += 24; ii += 4; } - if ((m & 2) != 0) { - - if (ii== jj) { + if (mmod6 & 2) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif - data05 = *(a2 + 0); + + data09 = *(a2 + 0); #ifndef UNIT - data06 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); - - *(b + 4) = data05; - *(b + 5) = INV(data06); + *(b + 6) = data09; + *(b + 7) = INV(data10); } if (ii > jj) { @@ -177,11 +370,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); - data05 = *(a2 + 0); - data06 = *(a2 + 1); - data07 = *(a2 + 2); - data08 = *(a2 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); *(b + 0) = data01; *(b + 1) = data02; @@ -189,46 +386,84 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; - *(b + 6) = data07; - *(b + 7) = data08; + *(b + 6) = data09; + *(b + 7) = data10; + *(b + 8) = data11; + *(b + 
9) = data12; + *(b + 10) = data13; + *(b + 11) = data14; } a1 += 2 * lda; - a2 += 2 * lda; - b += 8; - + // a2 += 2 * lda; + b += 12; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (mmod6 & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } - if (ii > jj) { + if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; } - b += 4; + b += 6; } + a += 6; + jj += 6; + j --; + } + + if (nmod6 & 4) { + + a1 = a; a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + jj += 4; - j --; } - if (n & 2) { + + if (nmod6 & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; @@ -240,58 +475,58 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifndef UNIT data01 = *(a1 + 0); #endif - data03 = *(a2 + 0); + + data09 = *(a2 + 0); #ifndef UNIT - data04 = *(a2 + 1); + data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); - *(b + 2) = data03; - *(b + 3) = INV(data04); + *(b + 2) = data09; + *(b + 3) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); - data03 = *(a2 + 0); - data04 = *(a2 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; - *(b + 2) = data03; - *(b + 3) = data04; + *(b + 2) = data09; + *(b + 3) = data10; } a1 += 2 * lda; a2 += 2 * lda; b += 4; - i --; ii += 2; } - if ((m & 1) != 0) { - - if (ii== jj) { + if (m & 1) { + if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } - if (ii > jj) { + if (ii > jj) { data01 = *(a1 + 0); 
data02 = *(a1 + 1); + *(b + 0) = data01; *(b + 1) = data02; } - b += 2; + b += 2; } a += 2; jj += 2; } - if (n & 1) { + if (nmod6 & 1) { a1 = a + 0 * lda; i = m; @@ -310,9 +545,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; } - a1 += 1 * lda; + a1 += lda; b += 1; - i --; ii += 1; } diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 20d0769f48..0280f90913 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -85,11 +85,11 @@ ZSWAPKERNEL = cswap_lasx.S CSUMKERNEL = csum_lasx.S ZSUMKERNEL = csum_lasx.S -DGEMMKERNEL = dgemm_kernel_16x4.S +DGEMMKERNEL = dgemm_kernel_16x6.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S -DGEMMONCOPY = dgemm_ncopy_4.S -DGEMMOTCOPY = dgemm_tcopy_4.S +DGEMMONCOPY = gemm_ncopy_6.prefx.c +DGEMMOTCOPY = dgemm_tcopy_6.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) @@ -153,13 +153,17 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S -DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S -DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S +DTRSMKERNEL_LN = trsm_kernel_LN_UNROLLN6.c +DTRSMKERNEL_LT = trsm_kernel_LT_UNROLLN6.c +DTRSMKERNEL_RN = trsm_kernel_RN_UNROLLN6.c +DTRSMKERNEL_RT = trsm_kernel_RT_UNROLLN6.c STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DGEMM_SMALL_M_PERMIT = dgemm_small_matrix_permit.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_lasx.S +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_lasx.S endif diff --git a/kernel/loongarch64/dgemm_kernel_16x6.S 
b/kernel/loongarch64/dgemm_kernel_16x6.S new file mode 100644 index 0000000000..90da107377 --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_16x6.S @@ -0,0 +1,6256 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define C4 $r25 +#define C5 $r26 +#define T0 $r27 /* !! DO NOT USE $r21 and $r22 !! */ +#define T1 $r28 +#define T2 $r29 +#define I48 $r30 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define D0 $xr7 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr14 +#define D8 $xr15 +#define D9 $xr16 +#define D10 $xr17 +#define D11 $xr18 +#define D12 $xr19 +#define D13 $xr20 +#define D14 $xr21 +#define D15 $xr22 +#define D16 $xr23 +#define D17 $xr24 +#define D18 $xr25 +#define D19 $xr26 +#define D20 $xr27 +#define D21 $xr28 +#define D22 $xr29 +#define D23 $xr30 +#define VALPHA $xr31 + +/* Prefetch interval */ +#define A_PRE 0x200 /* 0x200 / 0x80 = 4 */ +#define B_PRE 0x100 /* 0x100 / 0x30 = 4 */ + +.macro KERNEL_16x6 + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D23 */ + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + xvldrepl.d U6, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvfmadd.d D4, U0, U5, D4 + 
xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x18 + xvldrepl.d U5, B0, 0x20 + xvldrepl.d U6, B0, 0x28 + + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + xvfmadd.d D16, U0, U5, D16 + xvfmadd.d D17, U1, U5, D17 + xvfmadd.d D18, U2, U5, D18 + xvfmadd.d D19, U3, U5, D19 + preld 0, A0, A_PRE + 0x40 + + xvfmadd.d D20, U0, U6, D20 + xvfmadd.d D21, U1, U6, D21 + xvfmadd.d D22, U2, U6, D22 + xvfmadd.d D23, U3, U6, D23 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x30 +.endm + + PROLOGUE + + addi.d $sp, $sp, -160 + /* Store $r23~$31 */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 56 + SDARG $r31, $sp, 64 + fst.d $f23, $sp, 72 + fst.d $f24, $sp, 80 + fst.d $f25, $sp, 96 + fst.d $f26, $sp, 104 + fst.d $f27, $sp, 112 + fst.d $f28, $sp, 120 + fst.d $f29, $sp, 128 + fst.d $f30, $sp, 136 + fst.d $f31, $sp, 144 + fst.d ALPHA, $sp, 152 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + addi.d I48, ZERO, 48 + /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ + xvld VALPHA, $sp, 152 + xvreplve0.d VALPHA, VALPHA + xor T0, T0, T0 + addi.d T0, T0, 6 + /* if (!(N / 6)) goto L_N5 */ + div.d J, N, T0 /* J = bn / 6 */ + mul.d T0, J, T0 + sub.d N, N, T0 + beq ZERO, J, .L_N5 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N / 6) && (M >> 4)) START !!! 
************************* +* dgemm_core_16x6 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + add.d C4, C3, T0 + add.d C5, C4, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D23, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + xvldrepl.d U6, B0, 0x10 + + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 + + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 + + xvldrepl.d U4, B0, 0x18 + xvldrepl.d U5, B0, 0x20 + xvldrepl.d U6, B0, 0x28 + + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U4 + xvfmul.d D15, U3, U4 + + preld 0, C4, 0x00 + /* line 5 */ 
+ xvfmul.d D16, U0, U5 + xvfmul.d D17, U1, U5 + preld 0, C4, 0x40 + xvfmul.d D18, U2, U5 + xvfmul.d D19, U3, U5 + + preld 0, C5, 0x00 + /* line 6 */ + xvfmul.d D20, U0, U6 + xvfmul.d D21, U1, U6 + preld 0, C5, 0x40 + xvfmul.d D22, U2, U6 + xvfmul.d D23, U3, U6 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x30 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + /* Calculate 8 sets of D0~D23 */ +.L_TL1: /* TL-- */ + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + KERNEL_16x6 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + + /* Maybe we need calculate the last + * 7 sets of D0~D23? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D23 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + xvfmadd.d D18, U2, U4, D18 + xvfmadd.d D19, U3, U4, D19 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + xvfmadd.d D22, U2, U4, D22 + xvfmadd.d D23, U3, U4, D23 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, 
VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D17, D17, VALPHA + xvfmul.d D18, D18, VALPHA + xvfmul.d D19, D19, VALPHA + xvfmul.d D20, D20, VALPHA + xvfmul.d D21, D21, VALPHA + xvfmul.d D22, D22, VALPHA + xvfmul.d D23, D23, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvld U2, C2, 0x40 + xvld U3, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + xvfmadd.d D10, D10, VALPHA, U2 + xvfmadd.d D11, D11, VALPHA, U3 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvld U1, C4, 0x20 + xvld U2, C4, 0x40 + xvld U3, C4, 0x60 + xvfmadd.d D16, D16, VALPHA, U0 + xvfmadd.d D17, D17, VALPHA, U1 + xvfmadd.d D18, D18, VALPHA, U2 + xvfmadd.d D19, D19, VALPHA, U3 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvld U1, C5, 0x20 + xvld U2, C5, 0x40 + xvld U3, C5, 0x60 + xvfmadd.d D20, D20, VALPHA, U0 + xvfmadd.d D21, D21, VALPHA, U1 + xvfmadd.d D22, D22, VALPHA, U2 + 
xvfmadd.d D23, D23, VALPHA, U3 + #endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 + /* Store C4 */ + xvst D16, C4, 0x00 + xvst D17, C4, 0x20 + xvst D18, C4, 0x40 + xvst D19, C4, 0x60 + /* Store C5 */ + xvst D20, C5, 0x00 + xvst D21, C5, 0x20 + xvst D22, C5, 0x40 + xvst D23, C5, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 + addi.d C4, C4, 0x80 + addi.d C5, C5, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -16 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + + /* Load 8 * 64 from A0 */ + xvld U0, 
A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + + xvldrepl.d U4, B0, 0x20 + /* line 5 */ + xvfmul.d D16, U0, U4 + xvfmul.d D17, U1, U4 + + xvldrepl.d U4, B0, 0x28 + /* line 6 */ + xvfmul.d D20, U0, U4 + xvfmul.d D21, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 + +.L_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d 
A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + 
xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 + +.L_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d 
D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + xvfmadd.d D17, U1, U4, D17 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + xvfmadd.d D21, U1, U4, D21 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M8_L71 + +.L_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D17, D17, VALPHA + xvfmul.d D20, D20, VALPHA + xvfmul.d D21, D21, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvld U1, C4, 0x20 + xvfmadd.d D16, D16, VALPHA, U0 + xvfmadd.d D17, D17, VALPHA, U1 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvld U1, C5, 0x20 + xvfmadd.d D20, D20, VALPHA, U0 + xvfmadd.d D21, D21, VALPHA, U1 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + /* Store C4 */ + xvst D16, C4, 0x00 + xvst D17, C4, 0x20 + /* Store C5 */ + xvst D20, C5, 0x00 + xvst D21, C5, 0x20 + + /* 
Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + addi.d C4, C4, 0x40 + addi.d C5, C5, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N / 6 ) && (M & 8)) End************/ + +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + xvldrepl.d U4, B0, 0x20 + /* line 5 */ + xvfmul.d D16, U0, U4 + + xvldrepl.d U4, B0, 0x28 + /* line 6 */ + xvfmul.d D20, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + +.L_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, 
D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 
0x30 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M4_TL1 + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d 
D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D20, D20, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvfmadd.d D16, D16, VALPHA, U0 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvfmadd.d D20, D20, VALPHA, U0 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 + /* Store C4 */ + xvst D16, C4, 0x00 + /* Store C5 */ + xvst D20, C5, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + addi.d C4, C4, 0x20 + addi.d C5, C5, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N / 6 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + 
move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0; only elements 0 and 1 of each accumulator are stored at .L_M2_L0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + xvldrepl.d U4, B0, 0x20 + /* line 5 */ + xvfmul.d D16, U0, U4 + + xvldrepl.d U4, B0, 0x28 + /* line 6 */ + xvfmul.d D20, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + /* Reduce L: the first k-step was consumed by the xvfmul block above */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + +.L_M2_TL1: /* TL--; each pass runs the eight unrolled k-steps 8-1 .. 8-8 */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + +
/***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d 
U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D20, D20, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvfmadd.d D16, D16, VALPHA, U0 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvfmadd.d D20, D20, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D16, C4, 0x00, 0x00 + xvstelm.d D20, C5, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + xvstelm.d D8, C2, 0x08, 0x01 + xvstelm.d D12, C3, 0x08, 0x01 + xvstelm.d D16, C4, 0x08, 0x01 + xvstelm.d D20, C5, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + addi.d C4, C4, 0x10 + addi.d C5, C5, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N / 6 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + mul.d T0, OFF, I48 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 6 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0; only element 0 of each accumulator is stored at .L_M1_L0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + xvldrepl.d U4, B0, 0x20 + /* line 5 */ + xvfmul.d D16, U0, U4 + + xvldrepl.d U4, B0, 0x28 + /* line 6 */ + xvfmul.d D20, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + /* Reduce L: the first k-step was consumed by the xvfmul block above */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + +.L_M1_TL1: /* TL--; each pass runs the eight unrolled k-steps 8-1 .. 8-8 */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0,
0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, 
U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + xvldrepl.d U4, B0, 0x20 + xvfmadd.d D16, U0, U4, D16 + + xvldrepl.d U4, B0, 0x28 + xvfmadd.d D20, U0, U4, D20 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x30 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: +#ifdef TRMMKERNEL + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D16, D16, VALPHA + xvfmul.d D20, D20, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) 
*/ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 + + /* Load C4 */ + xvld U0, C4, 0x00 + xvfmadd.d D16, D16, VALPHA, U0 + + /* Load C5 */ + xvld U0, C5, 0x00 + xvfmadd.d D20, D20, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D16, C4, 0x00, 0x00 + xvstelm.d D20, C5, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + addi.d C4, C4, 0x08 + addi.d C5, C5, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -6 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + mul.d T0, L, I48 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N / 6 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 6) + * C += (LDC * 6) + */ + /* elements are 8-byte doubles, + * so the byte stride is 6 * 8 = 48 + */ + addi.d T2, ZERO,48 + mul.d T0, K, T2 + mul.d T1, LDC, T2 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x06 +#endif + + blt ZERO, J, .L_J1 /* if (J > 0) goto L_J1 */ + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 (N / 6 column loop) END !!! ************************/ + +.L_N5: + andi J, N, 4 + beq ZERO, J, .L_N3 + +/************************* Condition 2 if((N & 4) && (M >> 4)) START !!!
************************* +* dgemm_core_16x4 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 /* T0 = LDC * 8 */ + add.d C1, C0, T0 + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N5_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N5_M8 + +.L_N5_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + xvfmul.d D10, U2, U4 + xvfmul.d D11, U3, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + xvfmul.d D14, U2, U4 + xvfmul.d D15, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L: the first k-step was consumed by the xvfmul block above */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_L7 */ + beq ZERO,TL, .L_N5_L7 + +.L_N5_TL1: /* TL--; each pass runs the eight unrolled k-steps 8-1 .. 8-8 */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0,
0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d 
A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + 
xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_TL1 + +.L_N5_L7: + /* if (!(L & 7)) goto L_N5_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_L0 + +.L_N5_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + 
xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + /* Add stride for A0, B0: A0 += 0x80 (16 doubles), B0 += 0x20 (4 doubles) */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_L71 + +.L_N5_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA +#else + /* Load C0 (non-TRMM path: the existing C values are added to D * VALPHA) */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvld U2, C2, 0x40 + xvld U3, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + xvfmadd.d D10, D10, VALPHA, U2 + xvfmadd.d D11, D11, VALPHA, U3 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + 
xvfmadd.d D15, D15, VALPHA, U3 + #endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 + + /* Add stride for C: each column pointer advances 0x80 bytes (16 doubles) */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -16 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N5_I1 + +.L_N5_M8: + /* All full 16-row tiles (M >> 4) are done; handle the M & 15 remainder as 8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N5_M0 + + andi I, M, 8 + beq ZERO,I, .L_N5_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + + /* Load 8 * 64 from A0 (two 32-byte xvld loads) */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + xvldrepl.d U4, B0, 0x10 
+ /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + + /* Add stride for A0 and B0: A0 += 0x40 (8 doubles), B0 += 0x20 (4 doubles) */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_M8_L7 */ + beq ZERO,TL, .L_N5_M8_L7 + +.L_N5_M8_TL1: /* main loop: TL passes of 8 unrolled K steps */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + 
xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + /* Accumulate into D0/D1, D4/D5, D8/D9, D12/D13 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_M8_TL1 + +.L_N5_M8_L7: + 
/* Remainder: if (!(L & 7)) goto L_N5_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_M8_L0 + +.L_N5_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + /* Add stride for A0, B0: A0 += 0x40 (8 doubles), B0 += 0x20 (4 doubles) */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N5_M8_L71 + +.L_N5_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + + /* Add stride for C: each column pointer advances 0x40 bytes (8 doubles) */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d 
A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 4 ) && (M & 8) ) End************/ + +.L_N5_M4: + andi I, M, 4 + beq ZERO,I, .L_N5_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 /* A and B both use 4 doubles per k here, so one shift serves both */ + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_M4_L7 */ + beq ZERO,TL, .L_N5_M4_L7 + +.L_N5_M4_TL1: /* TL-- */ + /***8-1***/ + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-3***/ + 
xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + /* Accumulate into D0, D4, D8, D12 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_M4_TL1 + +.L_N5_M4_L7: + /* if (!(L & 7)) goto L_N5_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_M4_L0 + 
+.L_N5_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0: 0x20 bytes (4 doubles) each */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N5_M4_L71 + +.L_N5_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 + + /* Add stride for C: each column pointer advances 0x20 bytes (4 doubles) */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 4 ) && (M & 4) ) End************/ + +.L_N5_M2: + andi I, M, 2 + beq ZERO,I, .L_N5_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* 
number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0; NOTE(review): xvld reads a full 32-byte vector while A0 only advances by 0x10, so the upper two lanes look unused -- confirm */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0: A0 += 0x10 (2 doubles), B0 += 0x20 (4 doubles) */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_M2_L7 */ + beq ZERO,TL, .L_N5_M2_L7 + +.L_N5_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + 
/***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_M2_TL1 + +.L_N5_M2_L7: + /* Remainder: if (!(L & 7)) goto L_N5_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_M2_L0 + +.L_N5_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0: A0 += 0x10 (2 doubles), B0 += 0x20 (4 doubles) */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N5_M2_L71 + +.L_N5_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + #else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * 
VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 + #endif + + xvstelm.d D0, C0, 0x00, 0x00 /* NOTE(review): last operand looks like the lane index, so lanes 0 and 1 of each accumulator are stored -- confirm against the LASX manual */ + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + xvstelm.d D8, C2, 0x08, 0x01 + xvstelm.d D12, C3, 0x08, 0x01 + + /* Add stride for C: each column pointer advances 0x10 bytes (2 doubles) */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 4 ) && (M & 2) ) End************/ + +.L_N5_M1: + andi I, M, 1 + beq ZERO,I, .L_N5_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0; NOTE(review): xvld reads a full 32-byte vector while A0 only advances by 0x08 -- confirm the extra lanes are ignored */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 
+ + /* Add stride for A0 and B0: A0 += 0x08 (1 double), B0 += 0x20 (4 doubles) */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N5_M1_L7 */ + beq ZERO,TL, .L_N5_M1_L7 + +.L_N5_M1_TL1: /* main loop: TL passes of 8 unrolled K steps */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + 
xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N5_M1_TL1 + +.L_N5_M1_L7: + /* if (!(L & 7)) goto L_N5_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N5_M1_L0 + +.L_N5_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0: A0 += 0x08 (1 double), B0 += 0x20 (4 doubles) */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N5_M1_L71 + +.L_N5_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA + #else + /* Load C0; NOTE(review): xvld reads 32 bytes from each C pointer although only one double per column is written back below -- confirm this over-read is safe */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + + /* Add stride for C: each column pointer advances 0x08 bytes (1 double) */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 4 ) && (M & 1) ) End************/ + +.L_N5_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double (8 bytes) and 4 columns were handled, + * so we must mul 32 + */ + addi.d T2, ZERO,32 + mul.d T0, K, T2 + mul.d T1, LDC, T2 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + /* We must reinit I: the andi tests on M above overwrote it */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 4) && (M >> 4)) End !!! ************************* +* dgemm_core_16x4 */ + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 3 if((N & 2) && (M >> 4)) START !!! 
************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 /* C1 = C0 + LDC * 8 bytes */ + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N3_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + /* Add stride for A0 and B0: A0 += 0x80 (16 doubles), B0 += 0x10 (2 doubles) */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + +.L_N3_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + 
/***8-2***/ + /* Load 16 * 64 from A0 (same instruction sequence as step 8-1, repeated for the unrolled loop) */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, 
D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 + +.L_N3_L7: + /* Remainder: if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + /* Add stride for A0, B0: A0 += 0x80 (16 doubles), B0 += 0x10 (2 doubles) */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, 
D7, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N3_M0 + + andi I, M, 8 + beq ZERO,I, .L_N3_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d 
D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M8_L7 */ + beq ZERO,TL, .L_N3_M8_L7 + +.L_N3_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + /* Cumulative D0, D1, D4, D5 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + 
/***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M8_TL1 + +.L_N3_M8_L7: + /* if (!(L & 7)) goto L_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M8_L0 + +.L_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M8_L71 + +.L_N3_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if 
defined(TRMMKERNEL) + +/********LOOP (if(N & 2) && (M & 8) ) End************/ + +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + +.L_N3_M4_TL1: /* TL-- */ + /***8-1***/ + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + /* Cumulative D0, D4 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + 
+ xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + 
+#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + +.L_N3_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, 
U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + 
add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + +.L_N3_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-7***/ 
+ xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double (8 bytes) and this pass covers 2 columns, + * so we must mul 16 + */ + addi.d T2, ZERO,16 + mul.d T0, K, T2 + mul.d T1, LDC, T2 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm 
>> 4 */ + +/************************* Condition 3 if((N & 2) && (M >> 4)) End !!! ************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 4 if((N & 1) && (M >> 4)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N1_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + +.L_N1_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 
0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + 
xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) 
|| (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 + +.L_N1_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + 
/***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 + +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 + +.L_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 + +.L_N1_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 +#endif + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 8) ) End************/ + +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + 
xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + +.L_N1_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + #endif + + /* Store C0 */ + 
xvst D0, C0, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + +.L_N1_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 
0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif 
defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + +.L_N1_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + 
addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif + + xvstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 4 if((N & 1) && (M >> 4)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore $r23~$31 */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + LDARG $r30, $sp, 56 + LDARG $r31, $sp, 64 + fld.d $f23, $sp, 72 + fld.d $f24, $sp, 80 + fld.d $f25, $sp, 96 + fld.d $f26, $sp, 104 + fld.d $f27, $sp, 112 + fld.d $f28, $sp, 120 + fld.d $f29, $sp, 128 + fld.d $f30, $sp, 136 + fld.d $f31, $sp, 144 + addi.d $sp, $sp, 160 + + /* Back home */ + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_16.S b/kernel/loongarch64/dgemm_ncopy_16.S index 95c879031a..4c32e0ec79 100644 --- a/kernel/loongarch64/dgemm_ncopy_16.S +++ b/kernel/loongarch64/dgemm_ncopy_16.S @@ -655,6 +655,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi.d TD, TD, 0x10 .L_N1: + andi J, N, 0x01 + beq ZERO, J, .L_N0 move S1, TS beq ZERO, M, .L_N0 diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S index 203c3eb27d..4ca485508a 100644 --- a/kernel/loongarch64/dgemm_ncopy_8_lsx.S +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -268,6 +268,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d S2, S2, 0x08 addi.d TD, TD, 0x10 .L_N1: + andi J, N, 0x01 + beq ZERO, J, .L_N0 move S1, TS beq ZERO, M, .L_N0 .L_M1: diff --git a/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S b/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S new file mode 100644 index 0000000000..230bf96535 --- /dev/null +++ b/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S @@ -0,0 +1,549 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER /* DGEMM small-matrix NN kernel (LASX): accumulates A * B, scales by alpha and, unless B0 is defined, adds beta * C; rows are handled in strips of 16, 8, 4, 2 and 1 */ + +#include "common.h" +#include "loongarch64_asm.S" + +#define M $a0 +#define N $a1 +#define K $a2 +#define A $a3 +#define LDA $a4 +#define ALPHA $f0 +#define B $a5 +#define LDB $a6 +#define C $a7 +#define LDC $t0 /* loaded from 0($sp) in the prologue */ +#ifndef B0 /* same guard as VBETA below */ +#define BETA $f1 +#endif +#undef ZERO +#define ZERO $r0 + +#define M16 $t1 /* M16, M8, M4, M2 and M1 all name $t1 */ +#define M8 $t1 +#define M4 $t1 +#define M2 $t1 +#define M1 $t1 +#define N4 $t2 /* N4, N2 and N1 all name $t2 */ +#define N2 $t2 +#define N1 $t2 +#define K8 $t3 /* K * 8, computed in the prologue */ +#define A0 $t4 +#define X0 $t5 +#define B1 $t6 +#define B2 $t7 +#define B3 $t8 +#define C0 $s0 +#define C1 $s1 +#define C2 $s2 +#define C3 $s3 +#define K1 $s4 + +#define VALPHA $xr0 +#ifndef B0 +#define VBETA $xr1 +#endif +#define D0 $xr2 +#define D1 $xr3 +#define D2 $xr4 +#define D3 $xr5 +#define D4 $xr6 +#define D5 $xr7 +#define D6 $xr8 +#define D7 $xr9 +#define D8 $xr10 +#define D9 $xr11 +#define D10 $xr12 +#define D11 $xr13 +#define D12 $xr14 +#define D13 $xr15 +#define D14 $xr16 +#define D15 $xr17 +#define S0 $xr18 +#define S1 $xr19 +#define S2 $xr20 +#define S3 $xr21 +#define Z0 $xr22 +#define Z1 $xr23 +#define Z2 $xr24 +#define Z3 $xr25 +#define V0 $vr2 /* V0-V3 and F0-F3 carry the same register numbers as D0-D3 */ +#define V1 $vr3 +#define V2 $vr4 +#define V3 $vr5 +#define F0 $f2 +#define F1 $f3 +#define F2 $f4 +#define F3 $f5 + +.macro DGEMM_SMALL_KERNEL_NN_TAIL M + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, 
LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M\M\()_N3 +.L_M\M\()_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M\M\()_N4_END +.L_M\M\()_N4_K1: /* NOTE(review): the GLD below fills the whole 256-bit S0 from A0 for every tail width, while the M == 2 and M == 1 stores only write V0-V3 or F0-F3 - confirm the wider read of A (and of C0-C3 when B0 is undefined) is intended */ + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1, D2, S0, Z2, D2, D3, S0, Z3, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N4_K1 +.L_M\M\()_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 + GLD xv, , S0, C2, 0x00 + GMADD xvf, d, D2, S0, VBETA, D2 + GLD xv, , S0, C3, 0x00 + GMADD xvf, d, D3, S0, VBETA, D3 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00, D2, C2, 0x00, D3, C3, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00, V2, C2, 0x00, V3, C3, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00, F1, C1, 0x00, F2, C2, 0x00, F3, C3, 0x00 +.endif + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3: take back the K8 bytes stepped through by the K loop, then move on by LDB << 2 (assuming PTR_ALSL is a shift-and-add) + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0 + move A0, A + bnez N4, .L_M\M\()_N4 +.L_M\M\()_N3: + andi N2, N, 0x02 + beqz N2, .L_M\M\()_N1 +.L_M\M\()_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N2_END +.L_M\M\()_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1 + 
PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N2_K1 +.L_M\M\()_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 + GLD xv, , S0, C1, 0x00 + GMADD xvf, d, D1, S0, VBETA, D1 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00, D1, C1, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00, V1, C1, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00, F1, C1, 0x00 +.endif + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M\M\()_N1: + andi N1, N, 0x01 + beqz N1, .L_M\M\()_END + GXOR xv, v, D0, D0, D0 + move K1, K // Restore K1 + bge ZERO, K, .L_M\M\()_N1_END +.L_M\M\()_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0 + PTR_ADDI X0, X0, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M\M\()_N1_K1 +.L_M\M\()_N1_END: + GMUL xvf, d, D0, D0, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00 + GMADD xvf, d, D0, S0, VBETA, D0 +#endif +.if \M == 4 + GST xv, , D0, C0, 0x00 +.elseif \M == 2 + GST v, , V0, C0, 0x00 +.elseif \M == 1 + GST f, d, F0, C0, 0x00 +.endif +.L_M\M\()_END: +.if \M == 4 + PTR_ADDI A, A, 0x20 + PTR_ADDI C, C, 0x20 +.elseif \M == 2 + PTR_ADDI A, A, 0x10 + PTR_ADDI C, C, 0x10 +.elseif \M == 1 +.endif +.endm + + PROLOGUE + PTR_LD LDC, $sp, 0 /* read LDC before push_if_used runs */ + push_if_used 5, 2 + xvreplve0.d VALPHA, VALPHA +#ifndef B0 + xvreplve0.d VBETA, VBETA +#endif + PTR_SLLI LDA, LDA, 3 /* LDA, LDB, LDC and K8 are all shifted left by 3, i.e. scaled by 8 */ + PTR_SLLI LDB, LDB, 3 + PTR_SLLI LDC, LDC, 3 + PTR_SLLI K8, K, 3 + PTR_SRAI M16, M, 4 // M >> 4 + beqz M16, .L_M15 +.L_M16: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M16_N3 
+.L_M16_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7, \ + D8, D8, D8, D9, D9, D9, D10, D10, D10, D11, D11, D11, \ + D12, D12, D12, D13, D13, D13, D14, D14, D14, D15, D15, D15 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M16_N4_END +.L_M16_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7, \ + D8, S0, Z2, D8, D9, S1, Z2, D9, D10, S2, Z2, D10, D11, S3, Z2, D11, \ + D12, S0, Z3, D12, D13, S1, Z3, D13, D14, S2, Z3, D14, D15, S3, Z3, D15 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, B3, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N4_K1 +.L_M16_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20, S2, C2, 0x40, S3, C2, 0x60 + GMADD xvf, d, D8, S0, VBETA, D8, D9, S1, VBETA, D9, D10, S2, VBETA, D10, D11, S3, VBETA, D11 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20, S2, C3, 0x40, S3, C3, 0x60 + GMADD xvf, d, D12, S0, VBETA, D12, D13, S1, VBETA, D13, D14, S2, VBETA, D14, D15, S3, VBETA, D15 +#endif + GST xv, , D12, C3, 0x00, D13, C3, 0x20, D14, C3, 0x40, D15, C3, 0x60, \ + D8, C2, 
0x00, D9, C2, 0x20, D10, C2, 0x40, D11, C2, 0x60, \ + D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0 + move A0, A + bnez N4, .L_M16_N4 +.L_M16_N3: + andi N2, N, 0x02 + beqz N2, .L_M16_N1 +.L_M16_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N2_END +.L_M16_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \ + D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N2_K1 +.L_M16_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7 +#endif + GST xv, , D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0, C1 + PTR_ALSL C0, LDC, C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL 
B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M16_N1: + andi N1, N, 0x01 + beqz N1, .L_M16_END + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M16_N1_END +.L_M16_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M16_N1_K1 +.L_M16_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60 + // Update C0 - NOTE(review): C0, X0 and A0 are reloaded at .L_M16, at .L_M8 and by the tail macro before they are read again, so the three updates below look redundant + PTR_ALSL C0, LDC, C0, 2 + // Update X0 + PTR_SUB X0, X0, K8 + PTR_ALSL X0, LDB, X0, 2 + // Restore A0 + move A0, A +.L_M16_END: + PTR_ADDI M16, M16, -1 + PTR_ADDI A, A, 0x80 + PTR_ADDI C, C, 0x80 + bnez M16, .L_M16 +.L_M15: + andi M8, M, 0x08 + beqz M8, .L_M7 +.L_M8: + PTR_SRAI N4, N, 2 // N >> 2 + move A0, A // Restore A0 + move X0, B // Restore X0 + PTR_ADD B1, X0, LDB + PTR_ADD B2, B1, LDB + PTR_ADD B3, B2, LDB + move C0, C // Restore C0 + PTR_ADD C1, C0, LDC + PTR_ADD C2, C1, LDC + PTR_ADD C3, C2, LDC + beqz N4, .L_M8_N3 +.L_M8_N4: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \ + D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7 + move K1, K // Restore K1 + PTR_ADDI N4, N4, -1 + bge ZERO, K, .L_M8_N4_END +.L_M8_N4_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3, \ + D4, S0, Z2, D4, D5, S1, Z2, D5, \ + D6, S0, Z3, D6, D7, S1, Z3, D7 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADDI B2, B2, 0x08 + PTR_ADDI B3, 
B3, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N4_K1 +.L_M8_N4_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 + GLD xv, , S0, C2, 0x00, S1, C2, 0x20 + GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5 + GLD xv, , S0, C3, 0x00, S1, C3, 0x20 + GMADD xvf, d, D6, S0, VBETA, D6, D7, S1, VBETA, D7 +#endif + GST xv, , D4, C2, 0x00, D5, C2, 0x20, D6, C3, 0x00, D7, C3, 0x20, \ + D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1, C2, C3 + PTR_ALSL C0, LDC, C0, 2 + PTR_ALSL C1, LDC, C1, 2 + PTR_ALSL C2, LDC, C2, 2 + PTR_ALSL C3, LDC, C3, 2 + // Update X0, B1, B2, B3 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_SUB B2, B2, K8 + PTR_SUB B3, B3, K8 + PTR_ALSL X0, LDB, X0, 2 + PTR_ALSL B1, LDB, B1, 2 + PTR_ALSL B2, LDB, B2, 2 + PTR_ALSL B3, LDB, B3, 2 + // Restore A0 + move A0, A + bnez N4, .L_M8_N4 +.L_M8_N3: + andi N2, N, 0x02 + beqz N2, .L_M8_N1 +.L_M8_N2: + GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N2_END +.L_M8_N2_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \ + D2, S0, Z1, D2, D3, S1, Z1, D3 + PTR_ADDI X0, X0, 0x08 + PTR_ADDI B1, B1, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N2_K1 +.L_M8_N2_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 + GLD xv, , S0, C1, 0x00, S1, C1, 0x20 + GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20 + // Update C0, C1 + PTR_ALSL C0, LDC, 
C0, 1 + PTR_ALSL C1, LDC, C1, 1 + // Update X0, B1 + PTR_SUB X0, X0, K8 + PTR_SUB B1, B1, K8 + PTR_ALSL X0, LDB, X0, 1 + PTR_ALSL B1, LDB, B1, 1 + // Restore A0 + move A0, A +.L_M8_N1: + andi N1, N, 0x01 + beqz N1, .L_M8_END + GXOR xv, v, D0, D0, D0, D1, D1, D1 + move K1, K // Restore K1 + bge ZERO, K, .L_M8_N1_END +.L_M8_N1_K1: + PTR_ADDI K1, K1, -1 + GLD xv, , S0, A0, 0x00, S1, A0, 0x20 + GLDREPL xv, d, Z0, X0, 0x00 + GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1 + PTR_ADDI X0, X0, 0x08 + PTR_ADD A0, A0, LDA + bnez K1, .L_M8_N1_K1 +.L_M8_N1_END: + GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA +#ifndef B0 + GLD xv, , S0, C0, 0x00, S1, C0, 0x20 + GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1 +#endif + GST xv, , D0, C0, 0x00, D1, C0, 0x20 +.L_M8_END: + PTR_ADDI A, A, 0x40 + PTR_ADDI C, C, 0x40 +.L_M7: + andi M4, M, 0x04 + beqz M4, .L_M3 +.L_M4: + DGEMM_SMALL_KERNEL_NN_TAIL 4 +.L_M3: + andi M2, M, 0x02 + beqz M2, .L_M1 +.L_M2: + DGEMM_SMALL_KERNEL_NN_TAIL 2 +.L_M1: + andi M1, M, 0x01 + beqz M1, .L_M0 + DGEMM_SMALL_KERNEL_NN_TAIL 1 +.L_M0: + pop_if_used 5, 2 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_small_matrix_permit.c b/kernel/loongarch64/dgemm_small_matrix_permit.c new file mode 100644 index 0000000000..8afeb817b5 --- /dev/null +++ b/kernel/loongarch64/dgemm_small_matrix_permit.c @@ -0,0 +1,39 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + /* Returns 1 when M * N * K <= 64^3 and 0 otherwise (going by the file name, presumably the small-matrix permit check). */ + const double limit = 64.0 * 64.0 * 64.0; + const double work = (double) M * (double) N * (double) K; + + /* transa, transb, alpha and beta are not consulted. */ + return (work <= limit) ? 1 : 0; +} + diff --git a/kernel/loongarch64/dgemm_tcopy_6.S b/kernel/loongarch64/dgemm_tcopy_6.S new file mode 100644 index 0000000000..d3bb4a2a64 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_6.S @@ -0,0 +1,555 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER // copy routine: reads M source lines spaced LDA * 8 bytes apart and writes DST as 6-wide panels plus 4-, 2- and 1-wide remainders + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define T0 $r27 +#define T1 $r28 +#define T2 $r29 +#define TL $r7 // same register as LDA +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define V0 $vr0 +#define V1 $vr1 +#define V2 $vr2 +#define V3 $vr3 +/* LASX vectors */ +#define U0 $xr4 +#define U1 $xr5 +#define U2 $xr6 +#define U3 $xr7 +#define U4 $xr8 +#define U5 $xr9 +#define U6 $xr10 +#define U7 $xr11 + + PROLOGUE + + addi.d $sp, $sp, -56 // room for $r23-$r29 (7 * 8 bytes) + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + addi.d I, ZERO, 0x06 // I = 6 + div.d T0, N, I // T0 = N / 6 + mul.d T1, I, T0 // T1 = 6 * T0 + sub.d N, N, T1 // N = N - T1, i.e. N % 6 + + srai.d T2, N, 0x02 + slli.d T2, T2, 0x02 // T2 = (N % 6) with its low two bits cleared + add.d T2, T1, T2 + + mul.d P2, M, T1 + mul.d P3, M, T2 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + + srai.d T2, N, 0x01 + srai.d J, M, 0x03 // J = M >> 3 + slli.d T2, T2, 0x01 // T2 = (N % 6) with its low bit cleared + add.d T2, T1, T2 + + add.d P2, DST, P2 // P2 = DST + M * T1 * 8, written by the (N & 4) blocks + mul.d P4, M, T2 + add.d P3, DST, P3 // P3 is written by the (N & 2) blocks + slli.d P4, P4, 0x03 + slli.d TL, LDA, 0x03 // TL = LDA * 8 + add.d P4, DST, P4 // P4 is written by the (N & 1) blocks + + slli.d T2, TL, 0x01 // T2 = 2 * TL + slli.d T1, M, 0x03 + mul.d T1, T1, I // T1 = M * 8 * 6, added to P1 after every 6-wide copy + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T2 + add.d S4, S2, T2 + add.d S5, S3, T2 + add.d S6, S4, T2 + add.d S7, S5, T2 + add.d S8, S6, T2 + add.d S0, S7, T2 + + move P1, P0 + 
addi.d P0, P0, 0x180 // 0x180 = 8 * 0x30 bytes + + move I, T0 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I--; copies 0x30 bytes from each of S1-S8 */ + xvld U0, S1, 0x00 + vld V0, S1, 0x20 + xvld U1, S2, 0x00 + vld V1, S2, 0x20 + xvld U2, S3, 0x00 + vld V2, S3, 0x20 + xvld U3, S4, 0x00 + vld V3, S4, 0x20 + + xvst U0, P1, 0x00 + vst V0, P1, 0x20 + + xvst U1, P1, 0x30 + vst V1, P1, 0x50 + + xvst U2, P1, 0x60 + vst V2, P1, 0x80 + + xvst U3, P1, 0x90 + vst V3, P1, 0xB0 + + xvld U0, S5, 0x00 + vld V0, S5, 0x20 + xvld U1, S6, 0x00 + vld V1, S6, 0x20 + xvld U2, S7, 0x00 + vld V2, S7, 0x20 + xvld U3, S8, 0x00 + vld V3, S8, 0x20 + + xvst U0, P1, 0xC0 + vst V0, P1, 0xE0 + + xvst U1, P1, 0xF0 + vst V1, P1, 0x110 + + xvst U2, P1, 0x120 + vst V2, P1, 0x140 + + xvst U3, P1, 0x150 + vst V3, P1, 0x170 + + addi.d S1, S1, 0x30 + addi.d S2, S2, 0x30 + addi.d S3, S3, 0x30 + addi.d S4, S4, 0x30 + addi.d S5, S5, 0x30 + addi.d S6, S6, 0x30 + addi.d S7, S7, 0x30 + addi.d S8, S8, 0x30 + addi.d I, I, -1 + + add.d P1, P1, T1 + blt ZERO, I, .L_I1 + +.L_N7: /* (N & 4): copies 0x20 bytes from each of S1-S8 to P2 */ + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: /* (N & 2) - NOTE(review): each xvld below reads 32 bytes while S1-S8 then advance by only 0x10 and 0x80 bytes are stored for the 8 pointers - confirm that reading past the two remaining elements is safe */ + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 // presumably packs the low 128 bits of U0 and U1 into U0 - confirm against the LASX manual + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + xvpermi.q U6, U7, 0x02 + + xvst U0, P3, 0x00 + xvst U2, P3, 0x20 + xvst U4, P3, 0x40 + xvst U6, P3, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + 
addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: /* (N & 1): one double from each of S1-S8 to P4 */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: /* (M & 4): same steps with S1-S4 only */ + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T2 + add.d S4, S2, T2 + add.d S0, S3, T2 + + move P1, P0 + addi.d P0, P0, 0xC0 + + move I, T0 + beq ZERO, I, .L_4N7 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + vld V0, S1, 0x20 + xvld U1, S2, 0x00 + vld V1, S2, 0x20 + xvld U2, S3, 0x00 + vld V2, S3, 0x20 + xvld U3, S4, 0x00 + vld V3, S4, 0x20 + + xvst U0, P1, 0x00 + vst V0, P1, 0x20 + + xvst U1, P1, 0x30 + vst V1, P1, 0x50 + + xvst U2, P1, 0x60 + vst V2, P1, 0x80 + + xvst U3, P1, 0x90 + vst V3, P1, 0xB0 + + addi.d S1, S1, 0x30 + addi.d S2, S2, 0x30 + addi.d S3, S3, 0x30 + addi.d S4, S4, 0x30 + + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + 
xvst U0, P3, 0x00 + xvst U2, P3, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 + +.L_M3: /* (M & 2): same steps with S1 and S2 only */ + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T2 + + move P1, P0 + addi.d P0, P0, 0x60 + + move I, T0 + beq ZERO, I, .L_2N7 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + vld V0, S1, 0x20 + xvld U1, S2, 0x00 + vld V1, S2, 0x20 + + xvst U0, P1, 0x00 + vst V0, P1, 0x20 + + xvst U1, P1, 0x30 + vst V1, P1, 0x50 + + addi.d S1, S1, 0x30 + addi.d S2, S2, 0x30 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 + +.L_M1: /* (M & 1): same steps with S1 only */ + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL // NOTE(review): S2 is not read by any of the .L_1* blocks below + + move P1, P0 + addi.d P0, P0, 0x30 + + move I, T0 + beq ZERO, I, .L_1N7 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + vld V0, S1, 0x20 + + xvst U0, P1, 0x00 + vst V0, P1, 0x20 + + addi.d S1, S1, 0x30 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P2, 
0x00 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 + +.L_M0: /* reload $r23-$r29 and release the 56-byte frame */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dot_lsx.S b/kernel/loongarch64/dot_lsx.S index f7f6135531..8a74d82e7d 100644 --- a/kernel/loongarch64/dot_lsx.S +++ b/kernel/loongarch64/dot_lsx.S @@ -165,7 +165,7 @@ PROLOGUE /* store dot in s1 $f8 */ #ifdef DSDOT vfadd.d $vr8, $vr8, $vr9 - fsub.s s2, s2, s2, /* set s2 to 0.0 */ + fsub.s s2, s2, s2 /* set s2 to 0.0 */ vpackod.d $vr0, $vr8, $vr8 vfadd.d $vr8, $vr8, $vr0 #else diff --git a/kernel/loongarch64/gemm_ncopy_6.prefx.c b/kernel/loongarch64/gemm_ncopy_6.prefx.c new file mode 100644 index 0000000000..65680d4e32 --- /dev/null +++ b/kernel/loongarch64/gemm_ncopy_6.prefx.c @@ -0,0 +1,299 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + BLASLONG nmod6; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6 ; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + + nmod6 = n - (n / 6)* 6 ; + aoffset = a; + boffset = b; + + // prefetch A: 1 block, block size: 4*8 bytes, offset: 16*8 bytes, base: aoffset1..6; + BLASULONG index = 0x100080; //( (1<<20)|((16<<3)&0xffff) ) ; + // prefetch B: 1 block, block size: 24*8 bytes, offset: 96*8 bytes, base: boffset; + BLASULONG index_b = 0xb00300; //(11<<20) | ((96*8)&0xffff) ; + + j = (n / 6); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset += 6 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + ctemp17 = *(aoffset5 + 0); + ctemp18 = *(aoffset5 + 1); + ctemp19 = *(aoffset5 + 2); + ctemp20 = *(aoffset5 + 3); + + ctemp21 = *(aoffset6 + 0); + ctemp22 = *(aoffset6 + 1); + ctemp23 = *(aoffset6 + 2); + ctemp24 = *(aoffset6 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp05; + *(boffset 
+ 2) = ctemp09; + *(boffset + 3) = ctemp13; + *(boffset + 4) = ctemp17; + *(boffset + 5) = ctemp21; + + *(boffset + 6) = ctemp02; + *(boffset + 7) = ctemp06; + *(boffset + 8) = ctemp10; + *(boffset + 9) = ctemp14; + *(boffset + 10) = ctemp18; + *(boffset + 11) = ctemp22; + + *(boffset + 12) = ctemp03; + *(boffset + 13) = ctemp07; + *(boffset + 14) = ctemp11; + *(boffset + 15) = ctemp15; + *(boffset + 16) = ctemp19; + *(boffset + 17) = ctemp23; + + *(boffset + 18) = ctemp04; + *(boffset + 19) = ctemp08; + *(boffset + 20) = ctemp12; + *(boffset + 21) = ctemp16; + *(boffset + 22) = ctemp20; + *(boffset + 23) = ctemp24; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + + boffset += 24; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + boffset += 6; + i --; + }while(i > 0); + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (nmod6 & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp02; + *(boffset + 5) = ctemp04; + *(boffset + 6) = ctemp06; + *(boffset + 7) = ctemp08; + + 
aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + boffset += 4; + } + } + + if (nmod6 & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (nmod6 & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + } + } + + return 0; +} diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S index a2fc9dbbd8..c22ade4b38 100644 --- a/kernel/loongarch64/icamax_lsx.S +++ b/kernel/loongarch64/icamax_lsx.S @@ -308,8 +308,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vinsgr2vr.w x1, t3, 3 vinsgr2vr.w x2, t4, 3 addi.d I, I, -1 - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 vfmul.s x3, VI4, x1 vfmul.s x4, VI4, x2 vfcmp.clt.s VT0, x1, VI3 diff --git a/kernel/loongarch64/trsm_kernel_LN_UNROLLN6.c b/kernel/loongarch64/trsm_kernel_LN_UNROLLN6.c new file mode 100644 index 0000000000..5e25a5e3e4 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LN_UNROLLN6.c @@ -0,0 +1,342 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
{ + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + // j = (n >> GEMM_UNROLL_N_SHIFT); + j = (n / 6); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i 
< GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + BLASLONG nmodN = n - n/6*6 ; + + // if (n & (GEMM_UNROLL_N - 1)) { + if (nmodN) { + + // j = (GEMM_UNROLL_N >> 1); + j = 4; + while (j > 0) { + if (nmodN & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; 
+ cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/loongarch64/trsm_kernel_LT_UNROLLN6.c b/kernel/loongarch64/trsm_kernel_LT_UNROLLN6.c new file mode 100644 index 0000000000..2106c88cae --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LT_UNROLLN6.c @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
{ + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + // j = (n >> GEMM_UNROLL_N_SHIFT); + j = (n / 6); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + 
ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + BLASLONG nmodN = n - n/6*6 ; + + // if (n & (GEMM_UNROLL_N - 1)) { + if (nmodN) { + + // j = (GEMM_UNROLL_N >> 1); + j = 4; + + while (j > 0) { + if (nmodN & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/loongarch64/trsm_kernel_RN_UNROLLN6.c b/kernel/loongarch64/trsm_kernel_RN_UNROLLN6.c new file mode 100644 index 0000000000..42d5155c33 --- /dev/null +++ 
b/kernel/loongarch64/trsm_kernel_RN_UNROLLN6.c @@ -0,0 +1,325 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = 
cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + // j = (n >> GEMM_UNROLL_N_SHIFT); + j = (n / 6); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + BLASLONG nmodN = n - n/6*6 ; + + // if (n & (GEMM_UNROLL_N - 
1)) { + if (nmodN) { + + // j = (GEMM_UNROLL_N >> 1); + j = 4; + + while (j > 0) { + if (nmodN & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/loongarch64/trsm_kernel_RT_UNROLLN6.c b/kernel/loongarch64/trsm_kernel_RT_UNROLLN6.c new file mode 100644 index 0000000000..7424ad5791 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_RT_UNROLLN6.c @@ -0,0 +1,351 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + 
cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + + BLASLONG nmodN = n - n/6*6 ; + + // if (n & (GEMM_UNROLL_N - 1)) { + if (nmodN) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (nmodN & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + 
+ solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + // j = (n >> GEMM_UNROLL_N_SHIFT); + j = (n / 6); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/power/cswap_microk_power10.c b/kernel/power/cswap_microk_power10.c index 2a44a9e30e..f71b6f98c4 100644 --- a/kernel/power/cswap_microk_power10.c +++ b/kernel/power/cswap_microk_power10.c @@ -58,6 +58,16 @@ static void cswap_kernel_32 (long n, float *x, float *y) "lxvp 62, 224(%3) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 
96(%3) \n\t" + "stxv 39, 112(%3) \n\t" +#else "stxv 33, 0(%3) \n\t" "stxv 32, 16(%3) \n\t" "stxv 35, 32(%3) \n\t" @@ -66,9 +76,20 @@ static void cswap_kernel_32 (long n, float *x, float *y) "stxv 36, 80(%3) \n\t" "stxv 39, 96(%3) \n\t" "stxv 38, 112(%3) \n\t" +#endif "addi %3, %3, 128 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 0(%3) \n\t" + "stxv 41, 16(%3) \n\t" + "stxv 42, 32(%3) \n\t" + "stxv 43, 48(%3) \n\t" + "stxv 44, 64(%3) \n\t" + "stxv 45, 80(%3) \n\t" + "stxv 46, 96(%3) \n\t" + "stxv 47, 112(%3) \n\t" +#else "stxv 41, 0(%3) \n\t" "stxv 40, 16(%3) \n\t" "stxv 43, 32(%3) \n\t" @@ -77,9 +98,20 @@ static void cswap_kernel_32 (long n, float *x, float *y) "stxv 44, 80(%3) \n\t" "stxv 47, 96(%3) \n\t" "stxv 46, 112(%3) \n\t" +#endif "addi %3, %3, 128 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 52, 64(%4) \n\t" + "stxv 53, 80(%4) \n\t" + "stxv 54, 96(%4) \n\t" + "stxv 55, 112(%4) \n\t" +#else "stxv 49, 0(%4) \n\t" "stxv 48, 16(%4) \n\t" "stxv 51, 32(%4) \n\t" @@ -88,9 +120,20 @@ static void cswap_kernel_32 (long n, float *x, float *y) "stxv 52, 80(%4) \n\t" "stxv 55, 96(%4) \n\t" "stxv 54, 112(%4) \n\t" +#endif "addi %4, %4, 128 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 56, 0(%4) \n\t" + "stxv 57, 16(%4) \n\t" + "stxv 58, 32(%4) \n\t" + "stxv 59, 48(%4) \n\t" + "stxv 60, 64(%4) \n\t" + "stxv 61, 80(%4) \n\t" + "stxv 62, 96(%4) \n\t" + "stxv 63, 112(%4) \n\t" +#else "stxv 57, 0(%4) \n\t" "stxv 56, 16(%4) \n\t" "stxv 59, 32(%4) \n\t" @@ -99,6 +142,7 @@ static void cswap_kernel_32 (long n, float *x, float *y) "stxv 60, 80(%4) \n\t" "stxv 63, 96(%4) \n\t" "stxv 62, 112(%4) \n\t" +#endif "addi %4, %4, 128 \n\t" diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 302b2418e3..a01e1b53da 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -76,6 +76,9 @@ int 
CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS saxpy_kernel_64(n1, &x[i], &y[i], da); i += n1; +#if defined(__clang__) +#pragma clang loop interleave_count(2) +#endif while(i < n) { diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 205a32d31b..de22427016 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -101,10 +101,8 @@ SCLAUX = la_constants.o \ slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ ssteqr.o ssterf.o slaisnan.o sisnan.o \ - slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o -ifneq ($(F_COMPILER), IBM) -SCLAUX += ../INSTALL/second_$(TIMER).o -endif + slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o \ + ../INSTALL/second_$(TIMER).o endif ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" @@ -126,10 +124,7 @@ DZLAUX = la_constants.o\ dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ dsteqr.o dsterf.o dlaisnan.o disnan.o \ dlartgp.o dlartgs.o ../INSTALL/droundup_lwork.o \ - ../INSTALL/dlamch.o -ifneq ($(F_COMPILER), IBM) -DZLAUX += ../INSTALL/dsecnd_$(TIMER).o -endif + ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o endif #ifeq ($(BUILD_SINGLE),1) diff --git a/param.h b/param.h index 84e241fcd2..2ebe824db5 100644 --- a/param.h +++ b/param.h @@ -2856,7 +2856,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 1 #else -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 6 #define DGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_M 16 @@ -2864,6 +2864,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_UNROLL_M 16 #define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_MN 96 #endif #define QGEMM_DEFAULT_UNROLL_N 2 diff --git a/utest/Makefile b/utest/Makefile index d4154e0c2c..36acf96cdb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -61,6 +61,10 @@ OBJS = utest_main2.o OBJS_EXT = $(DIR_EXT)/utest_main2.o endif +ifeq ($(NO_CBLAS), 1) +override CFLAGS += -DNO_CBLAS +endif + all : run_test ifeq ($(OSNAME), AIX) diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c index 8a6a47795a..808aa54557 100644 --- a/utest/test_extensions/common.c +++ b/utest/test_extensions/common.c @@ -69,7 +69,7 @@ float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint for (j = 0; j < cols; j++) { a_ptr[j] -= b_ptr[j]; } - norm += cblas_snrm2(cols, a_ptr, inc); + norm += BLASFUNC(snrm2)(&cols, a_ptr, &inc); a_ptr += ld; b_ptr += ld; @@ -92,7 +92,7 @@ double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blas for (j = 0; j < cols; j++) { a_ptr[j] -= b_ptr[j]; } - norm += cblas_dnrm2(cols, a_ptr, inc); + norm += BLASFUNC(dnrm2)(&cols, a_ptr, &inc); a_ptr += ld; b_ptr += ld; @@ -256,4 +256,4 @@ void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src a_dst[i*lda_dst+j+1] = (-1.0) * conj *alpha[0] * a_src[i*lda_src+j+1] + alpha[1] * a_src[i*lda_src+j]; } } -} \ No newline at end of file +} diff --git a/utest/test_extensions/test_caxpby.c b/utest/test_extensions/test_caxpby.c index 221a48ac71..8adf5b3e6c 100644 --- a/utest/test_extensions/test_caxpby.c +++ b/utest/test_extensions/test_caxpby.c @@ -96,7 +96,7 @@ static float check_caxpby(blasint n, float *alpha, blasint incx, float *beta, bl // Find the norm of differences return BLASFUNC(scnrm2)(&n, data_caxpby.y_test, &incy_abs); } - +#ifndef NO_CBLAS /** * C API specific function * Test caxpby by comparing it with cscal and caxpy. 
@@ -146,7 +146,7 @@ static float c_api_check_caxpby(blasint n, float *alpha, blasint incx, float *be // Find the norm of differences return cblas_scnrm2(n, data_caxpby.y_test, incy_abs); } - +#endif /** * Fortran API specific test * Test caxpby by comparing it with cscal and caxpy. @@ -388,6 +388,7 @@ CTEST(caxpby, check_n_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test caxpby by comparing it with cscal and caxpy. @@ -629,3 +630,4 @@ CTEST(caxpby, c_api_check_n_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } #endif +#endif diff --git a/utest/test_extensions/test_cgeadd.c b/utest/test_extensions/test_cgeadd.c index 9b87ad9f37..55c52137a2 100644 --- a/utest/test_extensions/test_cgeadd.c +++ b/utest/test_extensions/test_cgeadd.c @@ -62,13 +62,14 @@ static void cgeadd_trusted(blasint m, blasint n, float *alpha, float *aptr, blasint lda, float *beta, float *cptr, blasint ldc) { blasint i; + blasint one=1; lda *= 2; ldc *= 2; for (i = 0; i < n; i++) { - cblas_caxpby(m, alpha, aptr, 1, beta, cptr, 1); + BLASFUNC(caxpby)(&m, alpha, aptr, &one, beta, cptr, &one); aptr += lda; cptr += ldc; } @@ -116,9 +117,11 @@ static float check_cgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda, beta, data_cgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda, beta, data_cgeadd.c_test, ldc); +#endif // Find the differences between output matrix caculated by cgeadd and sgemm return smatrix_difference(data_cgeadd.c_test, data_cgeadd.c_verify, cols, rows, ldc*2); @@ -150,9 +153,11 @@ static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda, beta, data_cgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda, beta, data_cgeadd.c_test, ldc); +#endif return check_error(); } @@ 
-419,7 +424,7 @@ CTEST(cgeadd, m_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } - +#ifndef NO_CBLAS /** * C API specific test * Test cgeadd by comparing it against sgemm @@ -877,4 +882,5 @@ CTEST(cgeadd, c_api_m_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_cgemm.c b/utest/test_extensions/test_cgemm.c index cd38d710bf..15d64e372e 100644 --- a/utest/test_extensions/test_cgemm.c +++ b/utest/test_extensions/test_cgemm.c @@ -73,9 +73,10 @@ static float check_cgemm(char transa, char transb, blasint m, blasint n, blasint float alpha_conj[] = {1.0f, 0.0f}; char transa_verify = transa; char transb_verify = transb; + char cc[2]="C", cr[2]="R"; - int arows = k, acols = m; - int brows = n, bcols = k; + blasint arows = k, acols = m; + blasint brows = n, bcols = k; if (transa == 'T' || transa == 'C'){ arows = m; acols = k; @@ -99,12 +100,12 @@ static float check_cgemm(char transa, char transb, blasint m, blasint n, blasint data_cgemm.c_verify[i] = data_cgemm.c_test[i]; if (transa == 'R'){ - cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_cgemm.a_verify, lda, lda); + BLASFUNC(cimatcopy)(cc, cr, &arows, &acols, alpha_conj, data_cgemm.a_verify, &lda, &lda); transa_verify = 'N'; } if (transb == 'R'){ - cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_cgemm.b_verify, ldb, ldb); + BLASFUNC(cimatcopy)(cc, cr, &brows, &bcols, alpha_conj, data_cgemm.b_verify, &ldb, &ldb); transb_verify = 'N'; } @@ -270,4 +271,4 @@ CTEST(cgemm, transa_conjnotransb) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_cgemmt.c b/utest/test_extensions/test_cgemmt.c index ed9279933d..dfeb06ff6e 100644 --- a/utest/test_extensions/test_cgemmt.c +++ b/utest/test_extensions/test_cgemmt.c @@ -73,9 +73,11 @@ static void cgemmt_trusted(char api, enum CBLAS_ORDER order, 
char uplo, char tra if(api == 'F') BLASFUNC(cgemm)(&transa, &transb, &m, &m, &k, alpha, data_cgemmt.a_test, &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_gemm, &ldc); +#ifndef NO_CBLAS else cblas_cgemm(order, transa, transb, m, m, k, alpha, data_cgemmt.a_test, lda, data_cgemmt.b_test, ldb, beta, data_cgemmt.c_gemm, ldc); +#endif ldc *= 2; @@ -160,9 +162,11 @@ static float check_cgemmt(char api, enum CBLAS_ORDER order, char uplo, char tran if (api == 'F') BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test, &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda, data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc); +#endif for (i = 0; i < m * ldc * 2; i++) data_cgemmt.c_verify[i] -= data_cgemmt.c_test[i]; @@ -197,9 +201,11 @@ static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char trans if (api == 'F') BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test, &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda, data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc); +#endif return check_error(); } @@ -680,6 +686,7 @@ CTEST(cgemmt, lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test cgemmt by comparing it against sgemm @@ -1591,6 +1598,7 @@ CTEST(cgemmt, c_api_rowmajor_lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Fortran API specific test @@ -1736,6 +1744,7 @@ CTEST(cgemmt, xerbla_ldc_invalid) ASSERT_EQUAL(TRUE, passed); } +#ifndef NO_CBLAS /** * C API specific test. * Test error function for an invalid param order. 
@@ -2007,4 +2016,5 @@ CTEST(cgemmt, xerbla_c_api_rowmajor_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c index cb4e5ad9e4..dd95d32bad 100644 --- a/utest/test_extensions/test_cgemv_t.c +++ b/utest/test_extensions/test_cgemv_t.c @@ -65,6 +65,7 @@ static struct DATA_CGEMV_T data_cgemv_t; static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x) { blasint i; + blasint one=1; float *a_ptr = data_cgemv_t.a_verify; float *x_ptr = data_cgemv_t.x_test; float *x_res = data_cgemv_t.x_verify; @@ -73,7 +74,11 @@ static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc for (i = 0; i < n * inc_x; i+= inc_x) { - result = cblas_cdotu(lda, a_ptr, 1, x_ptr, inc_x); +#ifdef RETURN_BY_STACK + BLASFUNC(cdotu)(&result, &lda, a_ptr, &one, x_ptr, &inc_x); +#else + result = BLASFUNC(cdotu)(&lda, a_ptr, &one, x_ptr, &inc_x); +#endif x_res[0] = CREAL(result); x_res[1] = CIMAG(result); a_ptr += lda * 2; @@ -153,6 +158,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n, BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_t.a_test, &lda, data_cgemv_t.x_test, &inc_x, beta, data_cgemv_t.y_test, &inc_y); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -173,13 +179,14 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n, cblas_cgemv(corder, ctrans, m, n, alpha, data_cgemv_t.a_test, lda, data_cgemv_t.x_test, inc_x, beta, data_cgemv_t.y_test, inc_y); } +#endif // Find the differences between output vector caculated by cgemv and reference funcs for (i = 0; i < m * inc_y * 2; i++) data_cgemv_t.y_test[i] -= data_cgemv_t.y_verify[i]; // Find the norm of differences - return cblas_scnrm2(m, data_cgemv_t.y_test, inc_y); + return BLASFUNC(scnrm2)(&m, 
data_cgemv_t.y_test, &inc_y); } /** @@ -213,6 +220,7 @@ static int check_badargs(char order, char trans, blasint m, blasint n, return check_error(); } +#ifndef NO_CBLAS /** * C API specific function * Check if error function was called with expected function name @@ -1130,3 +1138,4 @@ CTEST(cgemv, c_api_xerbla_invalid_order_col_major) ASSERT_EQUAL(TRUE, passed); } #endif +#endif diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c index a4b1e30ac0..0c96a3b17c 100644 --- a/utest/test_extensions/test_cimatcopy.c +++ b/utest/test_extensions/test_cimatcopy.c @@ -98,6 +98,7 @@ static float check_cimatcopy(char api, char order, char trans, blasint rows, bla BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, alpha, data_cimatcopy.a_test, &lda_src, &lda_dst); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -108,6 +109,7 @@ static float check_cimatcopy(char api, char order, char trans, blasint rows, bla cblas_cimatcopy(corder, ctrans, rows, cols, alpha, data_cimatcopy.a_test, lda_src, lda_dst); } +#endif // Find the differences between output matrix computed by cimatcopy and reference func return smatrix_difference(data_cimatcopy.a_test, data_cimatcopy.a_verify, cols_out, rows_out, 2*lda_dst); @@ -502,6 +504,7 @@ CTEST(cimatcopy, rowmajor_conjtrans_col_50_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test cimatcopy by comparing it against reference @@ -681,6 +684,7 @@ CTEST(cimatcopy, c_api_rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Test error function for an invalid param order. 
@@ -815,4 +819,4 @@ CTEST(cimatcopy, xerbla_colmajor_trans_invalid_ldb) int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c index 71663406af..b493c93a6f 100644 --- a/utest/test_extensions/test_comatcopy.c +++ b/utest/test_extensions/test_comatcopy.c @@ -99,6 +99,7 @@ static float check_comatcopy(char api, char order, char trans, blasint rows, bla BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, alpha, data_comatcopy.a_test, &lda, data_comatcopy.b_test, &ldb); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -109,6 +110,7 @@ static float check_comatcopy(char api, char order, char trans, blasint rows, bla cblas_comatcopy(corder, ctrans, rows, cols, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_test, ldb); } +#endif return smatrix_difference(data_comatcopy.b_test, data_comatcopy.b_verify, b_cols, b_rows, ldb*2); } @@ -316,6 +318,7 @@ CTEST(comatcopy, rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test comatcopy by comparing it against refernce @@ -491,6 +494,7 @@ CTEST(comatcopy, c_api_rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Test error function for an invalid param order. 
diff --git a/utest/test_extensions/test_crot.c b/utest/test_extensions/test_crot.c index 1c55216d9c..1ff4568132 100644 --- a/utest/test_extensions/test_crot.c +++ b/utest/test_extensions/test_crot.c @@ -107,6 +107,7 @@ static float check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, floa return (norm / 2); } +#ifndef NO_CBLAS /** * C API specific function * Comapare results computed by csrot and caxpby @@ -789,4 +790,5 @@ CTEST(crot, c_api_check_n_zero) float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_crotg.c b/utest/test_extensions/test_crotg.c index 84875ccf73..bb23a5a08e 100644 --- a/utest/test_extensions/test_crotg.c +++ b/utest/test_extensions/test_crotg.c @@ -161,7 +161,7 @@ CTEST(crotg, negative_real_negative_img) ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); } - +#ifndef NO_CBLAS /** * C API specific test * Test crotg by comparing it against pre-calculated values @@ -287,4 +287,5 @@ CTEST(crotg, c_api_negative_real_negative_img) ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_cscal.c b/utest/test_extensions/test_cscal.c index 009c600ad4..cf8b3559e8 100644 --- a/utest/test_extensions/test_cscal.c +++ b/utest/test_extensions/test_cscal.c @@ -91,8 +91,10 @@ static float check_cscal(char api, blasint n, float *alpha, blasint inc) if(api == 'F') BLASFUNC(cscal)(&n, alpha, data_cscal.x_test, &inc); +#ifndef NO_CBLAS else cblas_cscal(n, alpha, data_cscal.x_test, inc); +#endif // Find the differences between output vector computed by cscal and cscal_trusted for (i = 0; i < n * 2 * inc; i++) @@ -132,6 +134,7 @@ CTEST(cscal, alpha_r_zero_alpha_i_zero_inc_2) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef 
NO_CBLAS /** * C API specific test * Test cscal by comparing it against reference @@ -161,4 +164,5 @@ CTEST(cscal, c_api_alpha_r_zero_alpha_i_zero_inc_2) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_ctrmv.c b/utest/test_extensions/test_ctrmv.c index 2a3f274168..4c61c31c8a 100644 --- a/utest/test_extensions/test_ctrmv.c +++ b/utest/test_extensions/test_ctrmv.c @@ -65,6 +65,7 @@ static float check_ctrmv(char uplo, char trans, char diag, blasint n, blasint ld blasint i; float alpha_conj[] = {1.0f, 0.0f}; char trans_verify = trans; + char cc[2]="C", cr[2]="R"; srand_generate(data_ctrmv.a_test, n * lda * 2); srand_generate(data_ctrmv.x_test, n * incx * 2); @@ -76,7 +77,7 @@ static float check_ctrmv(char uplo, char trans, char diag, blasint n, blasint ld data_ctrmv.x_verify[i] = data_ctrmv.x_test[i]; if (trans == 'R'){ - cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ctrmv.a_verify, lda, lda); + BLASFUNC(cimatcopy)(cc, cr, &n, &n, alpha_conj, data_ctrmv.a_verify, &lda, &lda); trans_verify = 'N'; } @@ -263,4 +264,4 @@ CTEST(ctrmv, conj_notrans_lower_unit_triangular_incx_2) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_ctrsv.c b/utest/test_extensions/test_ctrsv.c index 0e639bb2ab..7298ba42e0 100644 --- a/utest/test_extensions/test_ctrsv.c +++ b/utest/test_extensions/test_ctrsv.c @@ -65,6 +65,7 @@ static float check_ctrsv(char uplo, char trans, char diag, blasint n, blasint ld blasint i; float alpha_conj[] = {1.0f, 0.0f}; char trans_verify = trans; + char cc[2]="C", cr[2]="R"; srand_generate(data_ctrsv.a_test, n * lda * 2); srand_generate(data_ctrsv.x_test, n * incx * 2); @@ -76,8 +77,8 @@ static float check_ctrsv(char uplo, char trans, char diag, blasint n, blasint ld data_ctrsv.x_verify[i] = data_ctrsv.x_test[i]; if (trans == 'R'){ - cblas_cimatcopy(CblasColMajor, 
CblasConjNoTrans, n, n, - alpha_conj, data_ctrsv.a_verify, lda, lda); + BLASFUNC(cimatcopy)(cc, cr, &n, &n, + alpha_conj, data_ctrsv.a_verify, &lda, &lda); trans_verify = 'N'; } @@ -264,4 +265,4 @@ CTEST(ctrsv, conj_notrans_lower_unit_triangular_incx_2) ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_daxpby.c b/utest/test_extensions/test_daxpby.c index 6e77c7c7cf..93b26810d8 100644 --- a/utest/test_extensions/test_daxpby.c +++ b/utest/test_extensions/test_daxpby.c @@ -97,6 +97,7 @@ static double check_daxpby(blasint n, double alpha, blasint incx, double beta, b return BLASFUNC(dnrm2)(&n, data_daxpby.y_test, &incy_abs); } +#ifndef NO_CBLAS /** * C API specific function * Test daxpby by comparing it with dscal and daxpy. @@ -142,7 +143,7 @@ static double c_api_check_daxpby(blasint n, double alpha, blasint incx, double b // Find the norm of differences return cblas_dnrm2(n, data_daxpby.y_test, incy_abs); } - +#endif /** * Fortran API specific test * Test daxpby by comparing it with dscal and daxpy. @@ -468,6 +469,7 @@ CTEST(daxpby, check_n_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test daxpby by comparing it with dscal and daxpy. 
@@ -796,4 +798,5 @@ CTEST(daxpby, c_api_check_n_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_dgeadd.c b/utest/test_extensions/test_dgeadd.c index 8f93a842e3..20e8d966bf 100644 --- a/utest/test_extensions/test_dgeadd.c +++ b/utest/test_extensions/test_dgeadd.c @@ -62,10 +62,11 @@ static void dgeadd_trusted(blasint m, blasint n, double alpha, double *aptr, blasint lda, double beta, double *cptr, blasint ldc) { blasint i; + blasint one=1; for (i = 0; i < n; i++) { - cblas_daxpby(m, alpha, aptr, 1, beta, cptr, 1); + BLASFUNC(daxpby)(&m, &alpha, aptr, &one, &beta, cptr, &one); aptr += lda; cptr += ldc; } @@ -113,9 +114,11 @@ static double check_dgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda, &beta, data_dgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda, beta, data_dgeadd.c_test, ldc); +#endif // Find the differences between output matrix caculated by dgeadd and sgemm return dmatrix_difference(data_dgeadd.c_test, data_dgeadd.c_verify, cols, rows, ldc); @@ -147,9 +150,11 @@ static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda, &beta, data_dgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda, beta, data_dgeadd.c_test, ldc); +#endif return check_error(); } @@ -417,6 +422,7 @@ CTEST(dgeadd, m_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test dgeadd by comparing it against reference @@ -875,4 +881,5 @@ CTEST(dgeadd, c_api_m_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_dgemmt.c b/utest/test_extensions/test_dgemmt.c index 22dcaf2aa1..fd8f5f6661 100644 --- 
a/utest/test_extensions/test_dgemmt.c +++ b/utest/test_extensions/test_dgemmt.c @@ -73,9 +73,11 @@ static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra if(api == 'F') BLASFUNC(dgemm)(&transa, &transb, &m, &m, &k, &alpha, data_dgemmt.a_test, &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_gemm, &ldc); +#ifndef NO_CBLAS else cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda, data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc); +#endif if (uplo == 'L' || uplo == CblasLower) { @@ -152,9 +154,11 @@ static double check_dgemmt(char api, enum CBLAS_ORDER order, char uplo, char tra if (api == 'F') BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test, &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda, data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc); +#endif for (i = 0; i < m * ldc; i++) data_dgemmt.c_verify[i] -= data_dgemmt.c_test[i]; @@ -189,9 +193,11 @@ static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char trans if (api == 'F') BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test, &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda, data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc); +#endif return check_error(); } @@ -480,6 +486,7 @@ CTEST(dgemmt, lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test dgemmt by comparing it against dgemm @@ -1023,6 +1030,7 @@ CTEST(dgemmt, c_api_rowmajor_lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Fortran API specific test @@ -1168,6 +1176,7 @@ CTEST(dgemmt, xerbla_ldc_invalid) ASSERT_EQUAL(TRUE, passed); } +#ifndef NO_CBLAS /** * C API specific test. 
* Test error function for an invalid param order. @@ -1439,4 +1448,5 @@ CTEST(dgemmt, xerbla_c_api_rowmajor_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c index 811c356b3f..eebb7669eb 100644 --- a/utest/test_extensions/test_dimatcopy.c +++ b/utest/test_extensions/test_dimatcopy.c @@ -93,6 +93,7 @@ static double check_dimatcopy(char api, char order, char trans, blasint rows, bl BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &alpha, data_dimatcopy.a_test, &lda_src, &lda_dst); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -103,6 +104,7 @@ static double check_dimatcopy(char api, char order, char trans, blasint rows, bl cblas_dimatcopy(corder, ctrans, rows, cols, alpha, data_dimatcopy.a_test, lda_src, lda_dst); } +#endif // Find the differences between output matrix computed by dimatcopy and reference func return dmatrix_difference(data_dimatcopy.a_test, data_dimatcopy.a_verify, cols_out, rows_out, lda_dst); @@ -687,6 +689,7 @@ CTEST(dimatcopy, rowmajor_notrans_col_100_row_50) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test dimatcopy by comparing it against reference @@ -778,6 +781,7 @@ CTEST(dimatcopy, c_api_rowmajor_notrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Test error function for an invalid param order. 
@@ -912,4 +916,4 @@ CTEST(dimatcopy, xerbla_colmajor_trans_invalid_ldb) int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c index e60b9c83d8..e892271d2d 100644 --- a/utest/test_extensions/test_domatcopy.c +++ b/utest/test_extensions/test_domatcopy.c @@ -94,6 +94,7 @@ static double check_domatcopy(char api, char order, char trans, blasint rows, bl BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &alpha, data_domatcopy.a_test, &lda, data_domatcopy.b_test, &ldb); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -104,6 +105,7 @@ static double check_domatcopy(char api, char order, char trans, blasint rows, bl cblas_domatcopy(corder, ctrans, rows, cols, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_test, ldb); } +#endif return dmatrix_difference(data_domatcopy.b_test, data_domatcopy.b_verify, b_cols, b_rows, ldb); } @@ -412,6 +414,7 @@ CTEST(domatcopy, rowmajor_notrans_col_100_row_50) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test domatcopy by comparing it against refernce @@ -503,6 +506,7 @@ CTEST(domatcopy, c_api_rowmajor_notrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Test error function for an invalid param order. 
diff --git a/utest/test_extensions/test_drotmg.c b/utest/test_extensions/test_drotmg.c index 3073c8e3e9..3755776c9e 100644 --- a/utest/test_extensions/test_drotmg.c +++ b/utest/test_extensions/test_drotmg.c @@ -224,6 +224,7 @@ CTEST(drotmg, scaled_y_greater_than_scaled_x) } } +#ifndef NO_CBLAS /** * C API specific test * Test drotmg by comparing it against pre-calculated values @@ -411,4 +412,5 @@ CTEST(drotmg, c_api_scaled_y_greater_than_scaled_x) ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_dsum.c b/utest/test_extensions/test_dsum.c index 304619d0a9..c7e6d95614 100644 --- a/utest/test_extensions/test_dsum.c +++ b/utest/test_extensions/test_dsum.c @@ -221,6 +221,7 @@ CTEST(dsum, step_2_N_50){ ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test dsum by comparing it against pre-calculated values @@ -403,3 +404,4 @@ CTEST(dsum, c_api_step_2_N_50){ ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS); } #endif +#endif diff --git a/utest/test_extensions/test_dzsum.c b/utest/test_extensions/test_dzsum.c index a184ad9475..318d7fbe3e 100644 --- a/utest/test_extensions/test_dzsum.c +++ b/utest/test_extensions/test_dzsum.c @@ -221,6 +221,7 @@ CTEST(dzsum, step_2_N_50){ ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test dzsum by comparing it against pre-calculated values @@ -403,3 +404,4 @@ CTEST(dzsum, c_api_step_2_N_50){ ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); } #endif +#endif diff --git a/utest/test_extensions/test_icamin.c b/utest/test_extensions/test_icamin.c index cca464eac6..8ac0844a95 100644 --- a/utest/test_extensions/test_icamin.c +++ b/utest/test_extensions/test_icamin.c @@ -331,6 +331,7 @@ CTEST(icamin, min_idx_in_vec_tail){ ASSERT_EQUAL(N, index); } +#ifndef NO_CBLAS /** * C API specific test * Test icamin by comparing it against pre-calculated values @@ -622,4 +623,5 @@ 
CTEST(icamin, c_api_min_idx_in_vec_tail){ blasint index = cblas_icamin(N, x, inc); ASSERT_EQUAL(N - 1, index); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c index bebe76dbae..4bee258a0a 100644 --- a/utest/test_extensions/test_idamin.c +++ b/utest/test_extensions/test_idamin.c @@ -413,6 +413,7 @@ CTEST(idamin, min_idx_in_vec_tail_inc_1){ ASSERT_EQUAL(N, index); } +#ifndef NO_CBLAS /** * C API specific test * Test idamin by comparing it against pre-calculated values @@ -787,3 +788,4 @@ CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){ ASSERT_EQUAL(N - 1, index); } #endif +#endif diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c index d93813e6fb..a4a418473d 100644 --- a/utest/test_extensions/test_isamin.c +++ b/utest/test_extensions/test_isamin.c @@ -412,7 +412,7 @@ CTEST(isamin, min_idx_in_vec_tail_inc_1){ free(x); ASSERT_EQUAL(N, index); } - +#ifndef NO_CBLAS /** * C API specific test * Test isamin by comparing it against pre-calculated values @@ -787,3 +787,4 @@ CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){ ASSERT_EQUAL(N - 1, index); } #endif +#endif diff --git a/utest/test_extensions/test_izamin.c b/utest/test_extensions/test_izamin.c index a0bdae8e23..8c923c6090 100644 --- a/utest/test_extensions/test_izamin.c +++ b/utest/test_extensions/test_izamin.c @@ -331,6 +331,7 @@ CTEST(izamin, min_idx_in_vec_tail){ ASSERT_EQUAL(N, index); } +#ifndef NO_CBLAS /** * C API specific test * Test izamin by comparing it against pre-calculated values @@ -623,3 +624,4 @@ CTEST(izamin, c_api_min_idx_in_vec_tail){ ASSERT_EQUAL(N - 1, index); } #endif +#endif diff --git a/utest/test_extensions/test_saxpby.c b/utest/test_extensions/test_saxpby.c index b4bd5cf0bd..44f89240fb 100644 --- a/utest/test_extensions/test_saxpby.c +++ b/utest/test_extensions/test_saxpby.c @@ -96,6 +96,7 @@ static float check_saxpby(blasint n, float alpha, blasint incx, float 
beta, blas return BLASFUNC(snrm2)(&n, data_saxpby.y_test, &incy_abs); } +#ifndef NO_CBLAS /** * C API specific function * Test saxpby by comparing it with sscal and saxpy. @@ -141,6 +142,7 @@ static float c_api_check_saxpby(blasint n, float alpha, blasint incx, float beta // Find the norm of differences return cblas_snrm2(n, data_saxpby.y_test, incy_abs); } +#endif /** * Fortran API specific test @@ -467,6 +469,7 @@ CTEST(saxpby, check_n_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test saxpby by comparing it with sscal and saxpy. @@ -791,4 +794,5 @@ CTEST(saxpby, c_api_check_n_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_scsum.c b/utest/test_extensions/test_scsum.c index 8e943de229..7162808453 100644 --- a/utest/test_extensions/test_scsum.c +++ b/utest/test_extensions/test_scsum.c @@ -221,6 +221,7 @@ CTEST(scsum, step_2_N_50){ ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test scsum by comparing it against pre-calculated values @@ -403,3 +404,4 @@ CTEST(scsum, c_api_step_2_N_50){ ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); } #endif +#endif diff --git a/utest/test_extensions/test_sgeadd.c b/utest/test_extensions/test_sgeadd.c index 171132b9da..009066a9f0 100644 --- a/utest/test_extensions/test_sgeadd.c +++ b/utest/test_extensions/test_sgeadd.c @@ -63,10 +63,10 @@ static void sgeadd_trusted(blasint m, blasint n, float alpha, float *aptr, blasint lda, float beta, float *cptr, blasint ldc) { blasint i; - + blasint one=1; for (i = 0; i < n; i++) { - cblas_saxpby(m, alpha, aptr, 1, beta, cptr, 1); + BLASFUNC(saxpby)(&m, &alpha, aptr, &one, &beta, cptr, &one); aptr += lda; cptr += ldc; } @@ -115,9 +115,11 @@ static float check_sgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, BLASFUNC(sgeadd) (&m, &n, &alpha, data_sgeadd.a_test, &lda, &beta, data_sgeadd.c_test, &ldc); 
+#ifndef NO_CBLAS else cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda, beta, data_sgeadd.c_test, ldc); +#endif // Find the differences between output matrix caculated by sgeadd and sgemm return smatrix_difference(data_sgeadd.c_test, data_sgeadd.c_verify, cols, rows, ldc); @@ -150,9 +152,11 @@ static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, BLASFUNC(sgeadd) (&m, &n, &alpha, data_sgeadd.a_test, &lda, &beta, data_sgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda, beta, data_sgeadd.c_test, ldc); +#endif return check_error(); } @@ -420,6 +424,7 @@ CTEST(sgeadd, m_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test sgeadd by comparing it against reference @@ -877,4 +882,5 @@ CTEST(sgeadd, c_api_m_zero) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_sgemmt.c b/utest/test_extensions/test_sgemmt.c index 5b51e3579b..177ce0d73b 100644 --- a/utest/test_extensions/test_sgemmt.c +++ b/utest/test_extensions/test_sgemmt.c @@ -73,9 +73,11 @@ static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra if(api == 'F') BLASFUNC(sgemm)(&transa, &transb, &m, &m, &k, &alpha, data_sgemmt.a_test, &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_gemm, &ldc); +#ifndef NO_CBLAS else cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda, data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc); +#endif if (uplo == 'L' || uplo == CblasLower) { @@ -152,9 +154,11 @@ static float check_sgemmt(char api, enum CBLAS_ORDER order, char uplo, char tran if (api == 'F') BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test, &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda, data_sgemmt.b_test, 
ldb, beta, data_sgemmt.c_test, ldc); +#endif for (i = 0; i < m * ldc; i++) data_sgemmt.c_verify[i] -= data_sgemmt.c_test[i]; @@ -189,9 +193,11 @@ static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char trans if (api == 'F') BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test, &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda, data_sgemmt.b_test, ldb, beta, data_sgemmt.c_test, ldc); +#endif return check_error(); } @@ -480,6 +486,7 @@ CTEST(sgemmt, lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test sgemmt by comparing it against sgemm @@ -1023,6 +1030,7 @@ CTEST(sgemmt, c_api_rowmajor_lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Fortran API specific test @@ -1168,6 +1176,7 @@ CTEST(sgemmt, xerbla_ldc_invalid) ASSERT_EQUAL(TRUE, passed); } +#ifndef NO_CBLAS /** * C API specific test. * Test error function for an invalid param order. 
@@ -1439,4 +1448,5 @@ CTEST(sgemmt, xerbla_c_api_rowmajor_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c index ba388596db..c00ea0c8f0 100644 --- a/utest/test_extensions/test_simatcopy.c +++ b/utest/test_extensions/test_simatcopy.c @@ -93,6 +93,7 @@ static float check_simatcopy(char api, char order, char trans, blasint rows, bla BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, &lda_src, &lda_dst); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -103,6 +104,7 @@ static float check_simatcopy(char api, char order, char trans, blasint rows, bla cblas_simatcopy(corder, ctrans, rows, cols, alpha, data_simatcopy.a_test, lda_src, lda_dst); } +#endif // Find the differences between output matrix computed by simatcopy and reference func return smatrix_difference(data_simatcopy.a_test, data_simatcopy.a_verify, cols_out, rows_out, lda_dst); @@ -687,6 +689,7 @@ CTEST(simatcopy, rowmajor_notrans_col_100_row_50) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test simatcopy by comparing it against reference @@ -778,6 +781,7 @@ CTEST(simatcopy, c_api_rowmajor_notrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Test error function for an invalid param order. 
@@ -912,4 +916,4 @@ CTEST(simatcopy, xerbla_colmajor_trans_invalid_ldb) int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c index b53c7cae50..62a6056d92 100644 --- a/utest/test_extensions/test_somatcopy.c +++ b/utest/test_extensions/test_somatcopy.c @@ -94,6 +94,7 @@ static float check_somatcopy(char api, char order, char trans, blasint rows, bla BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &alpha, data_somatcopy.a_test, &lda, data_somatcopy.b_test, &ldb); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -104,7 +105,8 @@ static float check_somatcopy(char api, char order, char trans, blasint rows, bla cblas_somatcopy(corder, ctrans, rows, cols, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_test, ldb); } - +#endif + return smatrix_difference(data_somatcopy.b_test, data_somatcopy.b_verify, b_cols, b_rows, ldb); } @@ -412,6 +414,7 @@ CTEST(somatcopy, rowmajor_notrans_col_100_row_50) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test somatcopy by comparing it against refernce @@ -503,6 +506,7 @@ CTEST(somatcopy, c_api_rowmajor_notrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } +#endif /** * Test error function for an invalid param order. 
diff --git a/utest/test_extensions/test_srotmg.c b/utest/test_extensions/test_srotmg.c index 3c97e3b4de..f0422d2b2b 100644 --- a/utest/test_extensions/test_srotmg.c +++ b/utest/test_extensions/test_srotmg.c @@ -224,6 +224,7 @@ CTEST(srotmg, scaled_y_greater_than_scaled_x) } } +#ifndef NO_CBLAS /** * C API specific test * Test srotmg by comparing it against pre-calculated values @@ -411,4 +412,5 @@ CTEST(srotmg, c_api_scaled_y_greater_than_scaled_x) ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); } } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_ssum.c b/utest/test_extensions/test_ssum.c index 9c5e297286..5b20c0b031 100644 --- a/utest/test_extensions/test_ssum.c +++ b/utest/test_extensions/test_ssum.c @@ -221,6 +221,7 @@ CTEST(ssum, step_2_N_50){ ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test ssum by comparing it against pre-calculated values @@ -403,3 +404,4 @@ CTEST(ssum, c_api_step_2_N_50){ ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS); } #endif +#endif diff --git a/utest/test_extensions/test_zaxpby.c b/utest/test_extensions/test_zaxpby.c index 6148f44c5f..d6ca999432 100644 --- a/utest/test_extensions/test_zaxpby.c +++ b/utest/test_extensions/test_zaxpby.c @@ -96,6 +96,7 @@ static double check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, return BLASFUNC(dznrm2)(&n, data_zaxpby.y_test, &incy_abs); } +#ifndef NO_CBLAS /** * C API specific function * Test zaxpby by comparing it with zscal and zaxpy. @@ -145,6 +146,7 @@ static double c_api_check_zaxpby(blasint n, double *alpha, blasint incx, double // Find the norm of differences return cblas_dznrm2(n, data_zaxpby.y_test, incy_abs); } +#endif /** * Fortran API specific test @@ -387,6 +389,7 @@ CTEST(zaxpby, check_n_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zaxpby by comparing it with zscal and zaxpy. 
@@ -628,3 +631,4 @@ CTEST(zaxpby, c_api_check_n_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } #endif +#endif diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c index 7496ccf88c..466b94a52b 100644 --- a/utest/test_extensions/test_zgeadd.c +++ b/utest/test_extensions/test_zgeadd.c @@ -62,13 +62,14 @@ static void zgeadd_trusted(blasint m, blasint n, double *alpha, double *aptr, blasint lda, double *beta, double *cptr, blasint ldc) { blasint i; + blasint one=1; lda *= 2; ldc *= 2; for (i = 0; i < n; i++) { - cblas_zaxpby(m, alpha, aptr, 1, beta, cptr, 1); + BLASFUNC(zaxpby)(&m, alpha, aptr, &one, beta, cptr, &one); aptr += lda; cptr += ldc; } @@ -116,9 +117,11 @@ static double check_zgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda, beta, data_zgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda, beta, data_zgeadd.c_test, ldc); +#endif // Find the differences between output matrix caculated by zgeadd and sgemm return dmatrix_difference(data_zgeadd.c_test, data_zgeadd.c_verify, cols, rows, ldc * 2); @@ -150,9 +153,11 @@ static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, if (api == 'F') BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda, beta, data_zgeadd.c_test, &ldc); +#ifndef NO_CBLAS else cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda, beta, data_zgeadd.c_test, ldc); +#endif return check_error(); } @@ -420,6 +425,7 @@ CTEST(zgeadd, m_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zgeadd by comparing it against reference @@ -877,4 +883,5 @@ CTEST(zgeadd, c_api_m_zero) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_zgemm.c b/utest/test_extensions/test_zgemm.c index 4160a50866..bd23ebca42 100644 --- 
a/utest/test_extensions/test_zgemm.c +++ b/utest/test_extensions/test_zgemm.c @@ -73,9 +73,10 @@ static double check_zgemm(char transa, char transb, blasint m, blasint n, blasin double alpha_conj[] = {1.0, 0.0}; char transa_verify = transa; char transb_verify = transb; + char cc[2]="C", cr[2]="R"; - int arows = k, acols = m; - int brows = n, bcols = k; + blasint arows = k, acols = m; + blasint brows = n, bcols = k; if (transa == 'T' || transa == 'C'){ arows = m; acols = k; @@ -99,12 +100,12 @@ static double check_zgemm(char transa, char transb, blasint m, blasint n, blasin data_zgemm.c_verify[i] = data_zgemm.c_test[i]; if (transa == 'R'){ - cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_zgemm.a_verify, lda, lda); + BLASFUNC(zimatcopy)(cc, cr, &arows, &acols, alpha_conj, data_zgemm.a_verify, &lda, &lda); transa_verify = 'N'; } if (transb == 'R'){ - cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_zgemm.b_verify, ldb, ldb); + BLASFUNC(zimatcopy)(cc, cr, &brows, &bcols, alpha_conj, data_zgemm.b_verify, &ldb, &ldb); transb_verify = 'N'; } @@ -270,4 +271,4 @@ CTEST(zgemm, transa_conjnotransb) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_zgemmt.c b/utest/test_extensions/test_zgemmt.c index c553810084..34b8b61867 100644 --- a/utest/test_extensions/test_zgemmt.c +++ b/utest/test_extensions/test_zgemmt.c @@ -73,9 +73,11 @@ static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra if(api == 'F') BLASFUNC(zgemm)(&transa, &transb, &m, &m, &k, alpha, data_zgemmt.a_test, &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_gemm, &ldc); +#ifndef NO_CBLAS else cblas_zgemm(order, transa, transb, m, m, k, alpha, data_zgemmt.a_test, lda, data_zgemmt.b_test, ldb, beta, data_zgemmt.c_gemm, ldc); +#endif ldc *= 2; @@ -160,9 +162,11 @@ static double check_zgemmt(char api, enum CBLAS_ORDER order, char uplo, char tra if 
(api == 'F') BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test, &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda, data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc); +#endif for (i = 0; i < m * ldc * 2; i++) data_zgemmt.c_verify[i] -= data_zgemmt.c_test[i]; @@ -197,9 +201,11 @@ static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char trans if (api == 'F') BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test, &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc); +#ifndef NO_CBLAS else cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda, data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc); +#endif return check_error(); } @@ -680,6 +686,7 @@ CTEST(zgemmt, lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zgemmt by comparing it against sgemm @@ -1591,6 +1598,7 @@ CTEST(zgemmt, c_api_rowmajor_lower_beta_one) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Fortran API specific test @@ -1735,7 +1743,7 @@ CTEST(zgemmt, xerbla_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } - +#ifndef NO_CBLAS /** * C API specific test. * Test error function for an invalid param order. 
@@ -2007,4 +2015,5 @@ CTEST(zgemmt, xerbla_c_api_rowmajor_ldc_invalid) M, K, lda, ldb, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c index b2d0b27139..4e419ad1ba 100644 --- a/utest/test_extensions/test_zgemv_t.c +++ b/utest/test_extensions/test_zgemv_t.c @@ -65,6 +65,7 @@ static struct DATA_ZGEMV_T data_zgemv_t; static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x) { blasint i; + blasint one=1; double *a_ptr = data_zgemv_t.a_verify; double *x_ptr = data_zgemv_t.x_test; double *x_res = data_zgemv_t.x_verify; @@ -73,8 +74,12 @@ static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc for (i = 0; i < n * inc_x; i += inc_x) { - result = cblas_zdotu(lda, a_ptr, 1, x_ptr, inc_x); - x_res[0] = CREAL(result); +#ifdef RETURN_BY_STACK + BLASFUNC(zdotu)(&result, &lda, a_ptr, &one, x_ptr, &inc_x); +#else + result = BLASFUNC(zdotu)(&lda, a_ptr, &one, x_ptr, &inc_x); +#endif + x_res[0] = CREAL(result); x_res[1] = CIMAG(result); a_ptr += lda * 2; x_res += 2 * inc_x; @@ -157,6 +162,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_t.a_test, &lda, data_zgemv_t.x_test, &inc_x, beta, data_zgemv_t.y_test, &inc_y); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -177,13 +183,14 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n cblas_zgemv(corder, ctrans, m, n, alpha, data_zgemv_t.a_test, lda, data_zgemv_t.x_test, inc_x, beta, data_zgemv_t.y_test, inc_y); } +#endif // Find the differences between output vector caculated by zgemv and reference funcs for (i = 0; i < m * inc_y * 2; i++) data_zgemv_t.y_test[i] -= data_zgemv_t.y_verify[i]; // Find the norm of differences - return cblas_dznrm2(m, 
data_zgemv_t.y_test, inc_y); + return BLASFUNC(dznrm2)(&m, data_zgemv_t.y_test, &inc_y); } /** @@ -217,7 +224,7 @@ static int check_badargs(char order, char trans, blasint m, blasint n, return check_error(); } - +#ifndef NO_CBLAS /** * C API specific function * Check if error function was called with expected function name @@ -1134,3 +1141,4 @@ CTEST(zgemv, c_api_xerbla_invalid_order_col_major) ASSERT_EQUAL(TRUE, passed); } #endif +#endif diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c index 8376bc493a..86bc4670f2 100644 --- a/utest/test_extensions/test_zimatcopy.c +++ b/utest/test_extensions/test_zimatcopy.c @@ -98,6 +98,7 @@ static double check_zimatcopy(char api, char order, char trans, blasint rows, bl BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, alpha, data_zimatcopy.a_test, &lda_src, &lda_dst); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -108,6 +109,7 @@ static double check_zimatcopy(char api, char order, char trans, blasint rows, bl cblas_zimatcopy(corder, ctrans, rows, cols, alpha, data_zimatcopy.a_test, lda_src, lda_dst); } +#endif // Find the differences between output matrix computed by zimatcopy and reference func return dmatrix_difference(data_zimatcopy.a_test, data_zimatcopy.a_verify, cols_out, rows_out, lda_dst*2); @@ -502,6 +504,7 @@ CTEST(zimatcopy, rowmajor_conjtrans_col_50_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zimatcopy by comparing it against reference @@ -681,6 +684,7 @@ CTEST(zimatcopy, c_api_rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Test error function for an invalid param order. 
@@ -815,4 +819,4 @@ CTEST(zimatcopy, xerbla_colmajor_trans_invalid_ldb) int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c index 495831c56c..208cfd981c 100644 --- a/utest/test_extensions/test_zomatcopy.c +++ b/utest/test_extensions/test_zomatcopy.c @@ -99,6 +99,7 @@ static double check_zomatcopy(char api, char order, char trans, blasint rows, bl BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, alpha, data_zomatcopy.a_test, &lda, data_zomatcopy.b_test, &ldb); } +#ifndef NO_CBLAS else { if (order == 'C') corder = CblasColMajor; if (order == 'R') corder = CblasRowMajor; @@ -109,7 +110,8 @@ static double check_zomatcopy(char api, char order, char trans, blasint rows, bl cblas_zomatcopy(corder, ctrans, rows, cols, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_test, ldb); } - +#endif + return dmatrix_difference(data_zomatcopy.b_test, data_zomatcopy.b_verify, b_cols, b_rows, ldb*2); } @@ -325,6 +327,7 @@ CTEST(zomatcopy, rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zomatcopy by comparing it against refernce @@ -508,6 +511,7 @@ CTEST(zomatcopy, c_api_rowmajor_conjtrans_col_100_row_100) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#endif /** * Test error function for an invalid param order. 
diff --git a/utest/test_extensions/test_zrot.c b/utest/test_extensions/test_zrot.c index 5471e051a6..c5ae22fc57 100644 --- a/utest/test_extensions/test_zrot.c +++ b/utest/test_extensions/test_zrot.c @@ -105,6 +105,7 @@ static double check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, do return (norm / 2); } +#ifndef NO_CBLAS /** * C API specific function * Comapare results computed by zdrot and zaxpby @@ -787,4 +788,5 @@ CTEST(zrot, c_api_check_n_zero) double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_zrotg.c b/utest/test_extensions/test_zrotg.c index 1de95447d8..c834bed6e6 100644 --- a/utest/test_extensions/test_zrotg.c +++ b/utest/test_extensions/test_zrotg.c @@ -162,6 +162,7 @@ CTEST(zrotg, negative_real_negative_img) ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific test * Test zrotg by comparing it against pre-calculated values @@ -287,4 +288,5 @@ CTEST(zrotg, c_api_negative_real_negative_img) ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_zscal.c b/utest/test_extensions/test_zscal.c index 132f4ee5b6..63cf355ae4 100644 --- a/utest/test_extensions/test_zscal.c +++ b/utest/test_extensions/test_zscal.c @@ -92,8 +92,10 @@ static double check_zscal(char api, blasint n, double *alpha, blasint inc) if(api == 'F') BLASFUNC(zscal)(&n, alpha, data_zscal.x_test, &inc); +#ifndef NO_CBLAS else cblas_zscal(n, alpha, data_zscal.x_test, inc); +#endif // Find the differences between output vector computed by zscal and zscal_trusted for (i = 0; i < n * 2 * inc; i++) @@ -133,6 +135,7 @@ CTEST(zscal, alpha_r_zero_alpha_i_zero_inc_2) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } +#ifndef NO_CBLAS /** * C API specific 
test * Test zscal by comparing it against reference @@ -162,4 +165,5 @@ CTEST(zscal, c_api_alpha_r_zero_alpha_i_zero_inc_2) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif +#endif diff --git a/utest/test_extensions/test_ztrmv.c b/utest/test_extensions/test_ztrmv.c index 5668ec2966..5819877bf2 100644 --- a/utest/test_extensions/test_ztrmv.c +++ b/utest/test_extensions/test_ztrmv.c @@ -65,7 +65,7 @@ static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint l blasint i; double alpha_conj[] = {1.0, 0.0}; char trans_verify = trans; - + char cc[2]="C", cr[2]="R"; drand_generate(data_ztrmv.a_test, n * lda * 2); drand_generate(data_ztrmv.x_test, n * incx * 2); @@ -76,7 +76,7 @@ static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint l data_ztrmv.x_verify[i] = data_ztrmv.x_test[i]; if (trans == 'R'){ - cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ztrmv.a_verify, lda, lda); + BLASFUNC(zimatcopy)(cc, cr, &n, &n, alpha_conj, data_ztrmv.a_verify, &lda, &lda); trans_verify = 'N'; } @@ -263,4 +263,4 @@ CTEST(ztrmv, conj_notrans_lower_unit_triangular_incx_2) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_ztrsv.c b/utest/test_extensions/test_ztrsv.c index 4b7ec6aaf2..5db7963eea 100644 --- a/utest/test_extensions/test_ztrsv.c +++ b/utest/test_extensions/test_ztrsv.c @@ -65,6 +65,7 @@ static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint l blasint i; double alpha_conj[] = {1.0, 0.0}; char trans_verify = trans; + char cc[2]="C", cr[2]="R"; drand_generate(data_ztrsv.a_test, n * lda * 2); drand_generate(data_ztrsv.x_test, n * incx * 2); @@ -76,8 +77,8 @@ static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint l data_ztrsv.x_verify[i] = data_ztrsv.x_test[i]; if (trans == 'R'){ - cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, - alpha_conj, 
data_ztrsv.a_verify, lda, lda); + BLASFUNC(zimatcopy)(cc, cr, &n, &n, + alpha_conj, data_ztrsv.a_verify, &lda, &lda); trans_verify = 'N'; } @@ -264,4 +265,4 @@ CTEST(ztrsv, conj_notrans_lower_unit_triangular_incx_2) ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } -#endif \ No newline at end of file +#endif