From ff1e7eb0851514f6818676505cc47a935e947e12 Mon Sep 17 00:00:00 2001
From: gxw
Date: Sat, 16 Sep 2023 11:19:12 +0800
Subject: [PATCH] LoongArch: Small matrix opt

---
 Makefile.system                              |   1 +
 kernel/loongarch64/KERNEL.LOONGSON3R5        |   4 +
 .../loongarch64/dgemm_small_kernel_nn_lasx.S | 549 ++++++++++++++++++
 .../loongarch64/dgemm_small_matrix_permit.c  |  39 ++
 4 files changed, 593 insertions(+)
 create mode 100644 kernel/loongarch64/dgemm_small_kernel_nn_lasx.S
 create mode 100644 kernel/loongarch64/dgemm_small_matrix_permit.c

diff --git a/Makefile.system b/Makefile.system
index f452011ad2..9b998b19b8 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -269,6 +269,7 @@ else ifeq ($(ARCH), power)
 SMALL_MATRIX_OPT = 1
 BUILD_BFLOAT16 = 1
 endif
+SMALL_MATRIX_OPT = 1
 ifeq ($(SMALL_MATRIX_OPT), 1)
 CCOMMON_OPT += -DSMALL_MATRIX_OPT
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 2c1ab87e59..0280f90913 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -162,4 +162,8 @@ STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DGEMM_SMALL_M_PERMIT = dgemm_small_matrix_permit.c
+DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_lasx.S
+DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_lasx.S
 endif
diff --git a/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S b/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S
new file mode 100644
index 0000000000..230bf96535
--- /dev/null
+++ b/kernel/loongarch64/dgemm_small_kernel_nn_lasx.S
@@ -0,0 +1,549 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+#define M      $a0
+#define N      $a1
+#define K      $a2
+#define A      $a3
+#define LDA    $a4
+#define ALPHA  $f0
+#define B      $a5
+#define LDB    $a6
+#define C      $a7
+#define LDC    $t0
+#ifdef B0
+#define BETA   $f1
+#endif
+#undef ZERO
+#define ZERO   $r0
+
+#define M16    $t1
+#define M8     $t1
+#define M4     $t1
+#define M2     $t1
+#define M1     $t1
+#define N4     $t2
+#define N2     $t2
+#define N1     $t2
+#define K8     $t3
+#define A0     $t4
+#define X0     $t5
+#define B1     $t6
+#define B2     $t7
+#define B3     $t8
+#define C0     $s0
+#define C1     $s1
+#define C2     $s2
+#define C3     $s3
+#define K1     $s4
+
+#define VALPHA $xr0
+#ifndef B0
+#define VBETA  $xr1
+#endif
+#define D0     $xr2
+#define D1     $xr3
+#define D2     $xr4
+#define D3     $xr5
+#define D4     $xr6
+#define D5     $xr7
+#define D6     $xr8
+#define D7     $xr9
+#define D8     $xr10
+#define D9     $xr11
+#define D10    $xr12
+#define D11    $xr13
+#define D12    $xr14
+#define D13    $xr15
+#define D14    $xr16
+#define D15    $xr17
+#define S0     $xr18
+#define S1     $xr19
+#define S2     $xr20
+#define S3     $xr21
+#define Z0     $xr22
+#define Z1     $xr23
+#define Z2     $xr24
+#define Z3     $xr25
+#define V0     $vr2
+#define V1     $vr3
+#define V2     $vr4
+#define V3     $vr5
+#define F0     $f2
+#define F1     $f3
+#define F2     $f4
+#define F3     $f5
+
+.macro DGEMM_SMALL_KERNEL_NN_TAIL M
+    PTR_SRAI N4, N, 2 // N >> 2
+    move A0, A // Restore A0
+    move X0, B // Restore X0
+    PTR_ADD B1, X0, LDB
+    PTR_ADD B2, B1, LDB
+    PTR_ADD B3, B2, LDB
+    move C0, C // Restore C0
+    PTR_ADD C1, C0, LDC
+    PTR_ADD C2, C1, LDC
+    PTR_ADD C3, C2, LDC
+    beqz N4, .L_M\M\()_N3
+.L_M\M\()_N4:
+    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
+    move K1, K // Restore K1
+    PTR_ADDI N4, N4, -1
+    bge ZERO, K, .L_M\M\()_N4_END
+.L_M\M\()_N4_K1:
+    PTR_ADDI K1, K1, -1
+    GLD xv, , S0, A0, 0x00
+    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00
+    GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1, D2, S0, Z2, D2, D3, S0, Z3, D3
+    PTR_ADDI X0, X0, 0x08
+    PTR_ADDI B1, B1, 0x08
+    PTR_ADDI B2, B2, 0x08
+    PTR_ADDI B3, B3, 0x08
+    PTR_ADD A0, A0, LDA
+    bnez K1, .L_M\M\()_N4_K1
+.L_M\M\()_N4_END:
+    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
+#ifndef B0
+    GLD xv, , S0, C0, 0x00
+    GMADD xvf, d, D0, S0, VBETA, D0
+    GLD xv, , S0, C1, 0x00
+    GMADD xvf, d, D1, S0, VBETA, D1
+    GLD xv, , S0, C2, 0x00
+    GMADD xvf, d, D2, S0, VBETA, D2
+    GLD xv, , S0, C3, 0x00
+    GMADD xvf, d, D3, S0, VBETA, D3
+#endif
+.if \M == 4
+    GST xv, , D0, C0, 0x00, D1, C1, 0x00, D2, C2, 0x00, D3, C3, 0x00
+.elseif \M == 2
+    GST v, , V0, C0, 0x00, V1, C1, 0x00, V2, C2, 0x00, V3, C3, 0x00
+.elseif \M == 1
+    GST f, d, F0, C0, 0x00, F1, C1, 0x00, F2, C2, 0x00, F3, C3, 0x00
+.endif
+    // Update C0, C1, C2, C3
+    PTR_ALSL C0, LDC, C0, 2
+    PTR_ALSL C1, LDC, C1, 2
+    PTR_ALSL C2, LDC, C2, 2
+    PTR_ALSL C3, LDC, C3, 2
+    // Update X0, B1, B2, B3
+    PTR_SUB X0, X0, K8
+    PTR_SUB B1, B1, K8
+    PTR_SUB B2, B2, K8
+    PTR_SUB B3, B3, K8
+    PTR_ALSL X0, LDB, X0, 2
+    PTR_ALSL B1, LDB, B1, 2
+    PTR_ALSL B2, LDB, B2, 2
+    PTR_ALSL B3, LDB, B3, 2
+    // Restore A0
+    move A0, A
+    bnez N4, .L_M\M\()_N4
+.L_M\M\()_N3:
+    andi N2, N, 0x02
+    beqz N2, .L_M\M\()_N1
+.L_M\M\()_N2:
+    GXOR xv, v, D0, D0, D0, D1, D1, D1
+    move K1, K // Restore K1
+    bge ZERO, K, .L_M\M\()_N2_END
+.L_M\M\()_N2_K1:
+    PTR_ADDI K1, K1, -1
+    GLD xv, , S0, A0, 0x00
+    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00
+    GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1
+    PTR_ADDI X0, X0, 0x08
+    PTR_ADDI B1, B1, 0x08
+    PTR_ADD A0, A0, LDA
+    bnez K1, .L_M\M\()_N2_K1
+.L_M\M\()_N2_END:
+    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA
+#ifndef B0
+    GLD xv, , S0, C0, 0x00
+    GMADD xvf, d, D0, S0, VBETA, D0
+    GLD xv, , S0, C1, 0x00
+    GMADD xvf, d, D1, S0, VBETA, D1
+#endif
+.if \M == 4
+    GST xv, , D0, C0, 0x00, D1, C1, 0x00
+.elseif \M == 2
+    GST v, , V0, C0, 0x00, V1, C1, 0x00
+.elseif \M == 1
+    GST f, d, F0, C0, 0x00, F1, C1, 0x00
+.endif
+    // Update C0, C1
+    PTR_ALSL C0, LDC, C0, 1
+    PTR_ALSL C1, LDC, C1, 1
+    // Update X0, B1
+    PTR_SUB X0, X0, K8
+    PTR_SUB B1, B1, K8
+    PTR_ALSL X0, LDB, X0, 1
+    PTR_ALSL B1, LDB, B1, 1
+    // Restore A0
+    move A0, A
+.L_M\M\()_N1:
+    andi N1, N, 0x01
+    beqz N1, .L_M\M\()_END
+    GXOR xv, v, D0, D0, D0
+    move K1, K // Restore K1
+    bge ZERO, K, .L_M\M\()_N1_END
+.L_M\M\()_N1_K1:
+    PTR_ADDI K1, K1, -1
+    GLD xv, , S0, A0, 0x00
+    GLDREPL xv, d, Z0, X0, 0x00
+    GMADD xvf, d, D0, S0, Z0, D0
+    PTR_ADDI X0, X0, 0x08
+    PTR_ADD A0, A0, LDA
+    bnez K1, .L_M\M\()_N1_K1
+.L_M\M\()_N1_END:
+    GMUL xvf, d, D0, D0, VALPHA
+#ifndef B0
+    GLD xv, , S0, C0, 0x00
+    GMADD xvf, d, D0, S0, VBETA, D0
+#endif
+.if \M == 4
+    GST xv, , D0, C0, 0x00
+.elseif \M == 2
+    GST v, , V0, C0, 0x00
+.elseif \M == 1
+    GST f, d, F0, C0, 0x00
+.endif
+.L_M\M\()_END:
+.if \M == 4
+    PTR_ADDI A, A, 0x20
+    PTR_ADDI C, C, 0x20
+.elseif \M == 2
+    PTR_ADDI A, A, 0x10
+    PTR_ADDI C, C, 0x10
+.elseif \M == 1
+.endif
+.endm
+
+    PROLOGUE
+    PTR_LD LDC, $sp, 0
+    push_if_used 5, 2
+    xvreplve0.d VALPHA, VALPHA
+#ifndef B0
+    xvreplve0.d VBETA, VBETA
+#endif
+    PTR_SLLI LDA, LDA, 3
+    PTR_SLLI LDB, LDB, 3
+    PTR_SLLI LDC, LDC, 3
+    PTR_SLLI K8, K, 3
+    PTR_SRAI M16, M, 4 // M >> 4
+    beqz M16, .L_M15
+.L_M16:
+    PTR_SRAI N4, N, 2 // N >> 2
+    move A0, A // Restore A0
+    move X0, B // Restore X0
+    PTR_ADD B1, X0, LDB
+    PTR_ADD B2, B1, LDB
+    PTR_ADD B3, B2, LDB
+    move C0, C // Restore C0
+    PTR_ADD C1, C0, LDC
+    PTR_ADD C2, C1, LDC
+    PTR_ADD C3, C2, LDC
+    beqz N4, .L_M16_N3
+.L_M16_N4:
+    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \
+                D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7, \
+                D8, D8, D8, D9, D9, D9, D10, D10, D10, D11, D11, D11, \
+                D12, D12, D12, D13, D13, D13, D14, D14, D14, D15, D15, D15
+    move K1, K // Restore K1
+    PTR_ADDI N4, N4, -1
+    bge ZERO, K, .L_M16_N4_END
+.L_M16_N4_K1:
+    PTR_ADDI K1, K1, -1
+    GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60
+    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00
+    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \
+                 D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7, \
+                 D8, S0, Z2, D8, D9, S1, Z2, D9, D10, S2, Z2, D10, D11, S3, Z2, D11, \
+                 D12, S0, Z3, D12, D13, S1, Z3, D13, D14, S2, Z3, D14, D15, S3, Z3, D15
+    PTR_ADDI X0, X0, 0x08
+    PTR_ADDI B1, B1, 0x08
+    PTR_ADDI B2, B2, 0x08
+    PTR_ADDI B3, B3, 0x08
+    PTR_ADD A0, A0, LDA
+    bnez K1, .L_M16_N4_K1
+.L_M16_N4_END:
+    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
+                D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \
+                D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \
+                D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA
+#ifndef B0
+    GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60
+    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
+    GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60
+    GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7
+    GLD xv, , S0, C2, 0x00, S1, C2, 0x20, S2, C2, 0x40, S3, C2, 0x60
+    GMADD xvf, d, D8, S0, VBETA, D8, D9, S1, VBETA, D9, D10, S2, VBETA, D10, D11, S3, VBETA, D11
+    GLD xv, , S0, C3, 0x00, S1, C3, 0x20, S2, C3, 0x40, S3, C3, 0x60
+    GMADD xvf, d, D12, S0, VBETA, D12, D13, S1, VBETA, D13, D14, S2, VBETA, D14, D15, S3, VBETA, D15
+#endif
+    GST xv, , D12, C3, 0x00, D13, C3, 0x20, D14, C3, 0x40, D15, C3, 0x60, \
+              D8, C2, 0x00, D9, C2, 0x20, D10, C2, 0x40, D11, C2, 0x60, \
+              D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \
+              D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60
+    // Update C0, C1, C2, C3
+    PTR_ALSL C0, LDC, C0, 2
+    PTR_ALSL C1, LDC, C1, 2
+    PTR_ALSL C2, LDC, C2, 2
+    PTR_ALSL C3, LDC, C3, 2
+    // Update X0, B1, B2, B3
+    PTR_SUB X0, X0, K8
+    PTR_SUB B1, B1, K8
+    PTR_SUB B2, B2, K8
+    PTR_SUB B3, B3, K8
+
+    PTR_ALSL X0, LDB, X0, 2
+    PTR_ALSL B1, LDB, B1, 2
+    PTR_ALSL B2, LDB, B2, 2
+    PTR_ALSL B3, LDB, B3, 2
+    // Restore A0
+    move A0, A
+    bnez N4, .L_M16_N4
+.L_M16_N3:
+    andi N2, N, 0x02
+    beqz N2, .L_M16_N1
+.L_M16_N2:
+    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \
+                D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7
+    move K1, K // Restore K1
+    bge ZERO, K, .L_M16_N2_END
+.L_M16_N2_K1:
+    PTR_ADDI K1, K1, -1
+    GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60
+    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00
+    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \
+                 D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7
+    PTR_ADDI X0, X0, 0x08
+    PTR_ADDI B1, B1, 0x08
+    PTR_ADD A0, A0, LDA
+    bnez K1, .L_M16_N2_K1
+.L_M16_N2_END:
+    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
+                D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA
+#ifndef B0
+    GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60
+    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
+    GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60
+    GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7
+#endif
+    GST xv, , D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \
+              D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60
+    // Update C0, C1
+    PTR_ALSL C0, LDC, C0, 1
+    PTR_ALSL C1, LDC, C1, 1
+    // Update X0, B1
+    PTR_SUB X0, X0, K8
+    PTR_SUB B1, B1, K8
+    PTR_ALSL X0, LDB, X0, 1
+    PTR_ALSL B1, LDB, B1, 1
+    // Restore A0
+    move A0, A
+.L_M16_N1:
+    andi N1, N, 0x01
+    beqz N1, .L_M16_END
+    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
+    move K1, K // Restore K1
+    bge ZERO, K, .L_M16_N1_END
+.L_M16_N1_K1:
+    PTR_ADDI K1, K1, -1
+    GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60
+    GLDREPL xv, d, Z0, X0, 0x00
+    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3
+    PTR_ADDI X0, X0, 0x08
+    PTR_ADD A0, A0, LDA
+    bnez K1, .L_M16_N1_K1
+.L_M16_N1_END:
+    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
+#ifndef B0
+    GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60
+    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
+#endif
+    GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60
+    // Update C0
+    PTR_ALSL C0, LDC, C0, 2
+    // Update X0
+    PTR_SUB X0, X0, K8
+    PTR_ALSL X0, LDB, X0, 2
+    // Restore A0
+    move A0, A
+.L_M16_END:
+    PTR_ADDI M16, M16, -1
+    PTR_ADDI A, A, 0x80
+    PTR_ADDI C, C, 0x80
+    bnez M16, .L_M16
+.L_M15:
+    andi M8, M, 0x08
+    beqz M8, .L_M7
+.L_M8:
+    PTR_SRAI N4, N, 2 // N >> 2
+    move A0, A // Restore A0
+    move X0, B // Restore X0
+    PTR_ADD B1, X0, LDB
+    PTR_ADD B2, B1, LDB
+    PTR_ADD B3, B2, LDB
+    move C0, C // Restore C0
+    PTR_ADD C1, C0, LDC
+    PTR_ADD C2, C1, LDC
+    PTR_ADD C3, C2, LDC
+    beqz N4, .L_M8_N3
+.L_M8_N4:
+    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \
+                D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7
+    move K1, K // Restore K1
+    PTR_ADDI N4, N4, -1
+    bge ZERO, K, .L_M8_N4_END
+.L_M8_N4_K1:
+    PTR_ADDI K1, K1, -1
+    GLD xv, , S0, A0, 0x00, S1, A0, 0x20
+    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00
+    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \
+                 D2, S0, Z1, D2, D3, S1, Z1, D3, \
+                 D4, S0, Z2, D4, D5, S1, Z2, D5, \
+                 D6, S0, Z3, D6, D7, S1, Z3, D7
+    PTR_ADDI X0, X0, 0x08
+    PTR_ADDI B1, B1, 0x08
+    PTR_ADDI B2, B2, 0x08
+    PTR_ADDI B3, B3, 0x08
+    PTR_ADD A0, A0, LDA
+    bnez K1, .L_M8_N4_K1
+.L_M8_N4_END:
+    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
+                D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA
+#ifndef B0
+    GLD xv, , S0, C0, 0x00, S1, C0, 0x20
+    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
+    GLD xv, , S0, C1, 0x00, S1, C1, 0x20
+    GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3
+    GLD xv, , S0, C2, 0x00, S1, C2, 0x20
+    GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5
+    GLD xv, , S0, C3, 0x00, S1, C3, 0x20
+    GMADD xvf, d, D6, S0, VBETA, D6, D7, S1, VBETA, D7
+#endif
+    GST xv, , D4, C2, 0x00, D5, C2, 0x20, D6, C3, 0x00, D7, C3, 0x20, \
+              D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20
+    // Update C0, C1, C2, C3
+    PTR_ALSL C0, LDC, C0, 2
+    PTR_ALSL C1, LDC, C1, 2
+    PTR_ALSL C2, LDC, C2, 2
+    PTR_ALSL C3, LDC, C3, 2
+    // Update X0, B1, B2, B3
+    PTR_SUB X0, X0, K8
+    PTR_SUB B1, B1, K8
+    PTR_SUB B2, B2, K8
+    PTR_SUB B3, B3, K8
+    PTR_ALSL X0, LDB, X0, 2
+    PTR_ALSL B1, LDB, B1, 2
+    PTR_ALSL B2, LDB, B2, 2
+    PTR_ALSL B3, LDB, B3, 2
+    // Restore A0
+    move A0, A
+    bnez N4, .L_M8_N4
+.L_M8_N3:
+    andi N2, N, 0x02
+    beqz N2, .L_M8_N1
+.L_M8_N2:
+    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
+    move K1, K // Restore K1
+    bge ZERO, K, .L_M8_N2_END
+.L_M8_N2_K1:
+    PTR_ADDI K1, K1, -1
+    GLD xv, , S0, A0, 0x00, S1, A0, 0x20
+    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00
+    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \
+                 D2, S0, Z1, D2, D3, S1, Z1, D3
+    PTR_ADDI X0, X0, 0x08
+    PTR_ADDI B1, B1, 0x08
+    PTR_ADD A0, A0, LDA
+    bnez K1, .L_M8_N2_K1
+.L_M8_N2_END:
+    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
+#ifndef B0
+    GLD xv, , S0, C0, 0x00, S1, C0, 0x20
+    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
+    GLD xv, , S0, C1, 0x00, S1, C1, 0x20
+    GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3
+#endif
+    GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20
+    // Update C0, C1
+    PTR_ALSL C0, LDC, C0, 1
+    PTR_ALSL C1, LDC, C1, 1
+    // Update X0, B1
+    PTR_SUB X0, X0, K8
+    PTR_SUB B1, B1, K8
+    PTR_ALSL X0, LDB, X0, 1
+    PTR_ALSL B1, LDB, B1, 1
+    // Restore A0
+    move A0, A
+.L_M8_N1:
+    andi N1, N, 0x01
+    beqz N1, .L_M8_END
+    GXOR xv, v, D0, D0, D0, D1, D1, D1
+    move K1, K // Restore K1
+    bge ZERO, K, .L_M8_N1_END
+.L_M8_N1_K1:
+    PTR_ADDI K1, K1, -1
+    GLD xv, , S0, A0, 0x00, S1, A0, 0x20
+    GLDREPL xv, d, Z0, X0, 0x00
+    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1
+    PTR_ADDI X0, X0, 0x08
+    PTR_ADD A0, A0, LDA
+    bnez K1, .L_M8_N1_K1
+.L_M8_N1_END:
+    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA
+#ifndef B0
+    GLD xv, , S0, C0, 0x00, S1, C0, 0x20
+    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
+#endif
+    GST xv, , D0, C0, 0x00, D1, C0, 0x20
+.L_M8_END:
+    PTR_ADDI A, A, 0x40
+    PTR_ADDI C, C, 0x40
+.L_M7:
+    andi M4, M, 0x04
+    beqz M4, .L_M3
+.L_M4:
+    DGEMM_SMALL_KERNEL_NN_TAIL 4
+.L_M3:
+    andi M2, M, 0x02
+    beqz M2, .L_M1
+.L_M2:
+    DGEMM_SMALL_KERNEL_NN_TAIL 2
+.L_M1:
+    andi M1, M, 0x01
+    beqz M1, .L_M0
+    DGEMM_SMALL_KERNEL_NN_TAIL 1
+.L_M0:
+    pop_if_used 5, 2
+    jirl $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemm_small_matrix_permit.c b/kernel/loongarch64/dgemm_small_matrix_permit.c
new file mode 100644
index 0000000000..8afeb817b5
--- /dev/null
+++ b/kernel/loongarch64/dgemm_small_matrix_permit.c
@@ -0,0 +1,39 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
+{
+    double MNK = (double) M * (double) N * (double) K;
+
+    if (MNK <= 64.0 * 64.0 * 64.0)
+        return 1;
+
+    return 0;
+}
+
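
Note on the gating logic (not part of the patch): the permit kernel added above decides whether the LASX small-matrix path is used at all, and it looks only at the product M*N*K. The C sketch below mirrors that threshold so it can be tried in isolation; small_matrix_permit, blas_long and the main driver are illustrative stand-ins, not OpenBLAS interfaces. Inside OpenBLAS the real function is reached through the -DSMALL_MATRIX_OPT flag set in Makefile.system together with the DGEMM_SMALL_M_PERMIT entry registered above.

/* Illustrative sketch only: same threshold as dgemm_small_matrix_permit.c,
 * i.e. allow the small-matrix kernel only when M*N*K <= 64*64*64.
 * All names here are hypothetical. */
#include <stdio.h>

typedef long blas_long;   /* stand-in for BLASLONG */

static int small_matrix_permit(blas_long m, blas_long n, blas_long k)
{
    /* Compute in double to avoid integer overflow for large dimensions. */
    double mnk = (double)m * (double)n * (double)k;
    return mnk <= 64.0 * 64.0 * 64.0;
}

int main(void)
{
    /* 32x32x32 falls below the threshold, 128x128x128 does not. */
    printf("32^3  -> %d\n", small_matrix_permit(32, 32, 32));      /* prints 1 */
    printf("128^3 -> %d\n", small_matrix_permit(128, 128, 128));   /* prints 0 */
    return 0;
}

Built with a plain "cc demo.c && ./a.out", the first call reports 1 (small-matrix kernel allowed) and the second reports 0 (fall back to the regular GEMM path), which is the same decision the permit kernel makes when SMALL_MATRIX_OPT is enabled.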