From b036235f37cd80dc70c6d1ad43035d61bc3844ca Mon Sep 17 00:00:00 2001 From: Aymen Qader Date: Mon, 9 Dec 2024 08:18:17 +0000 Subject: [PATCH] =?UTF-8?q?Add=20Arm=C2=AEv9-A=20architecture=20SME=20targ?= =?UTF-8?q?et?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new target, ARMV9SME, for Arm®v9-A architecture systems that support the Scalable Matrix Extension (SME) [1]. Initially inherits ARMV8SVE settings with updated compiler flags. This target can only be built with an SME-capable toolchain such as GCC 14 or LLVM 19. Includes some initial FEAT_SME2 feature detection on Linux targets via hwcaps. Target is disabled in DYNAMIC_ARCH builds by default. This is intended as a base target for SME2 kernels. [1] https://developer.arm.com/documentation/109246/0100/SME-Overview/SME-and-SME2 --- Makefile.arm64 | 5 +++++ Makefile.system | 10 ++++++++++ README.md | 1 + TargetList.txt | 1 + cmake/arch.cmake | 18 +++++++++++++++--- cmake/cc.cmake | 6 ++++++ cmake/prebuild.cmake | 2 +- cmake/system.cmake | 3 +++ common_arm64.h | 2 +- driver/others/dynamic_arm64.c | 19 +++++++++++++++++++ getarch.c | 14 ++++++++++++++ kernel/arm64/KERNEL.ARMV9SME | 1 + param.h | 2 +- 13 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/KERNEL.ARMV9SME diff --git a/Makefile.arm64 b/Makefile.arm64 index fccc0d0d0f..f0a6ef2cbd 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve endif endif +ifeq ($(CORE), ARMV9SME) +CCOMMON_OPT += -march=armv9-a+sme2 -O3 +FCOMMON_OPT += -march=armv9-a+sve2 -O3 +endif + ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 ifneq ($(F_COMPILER), NAG) diff --git a/Makefile.system b/Makefile.system index 29ea819f13..b9b2453f5b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64) export MACOSX_DEPLOYMENT_TARGET=11.0 ifeq ($(C_COMPILER), GCC) export NO_SVE = 1 +export NO_SME = 1 endif 
else export MACOSX_DEPLOYMENT_TARGET=10.8 @@ -709,6 +710,11 @@ DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE DYNAMIC_CORE += A64FX endif +# Disabled by default while ARMV9SME is WIP +NO_SME ?= 1 +ifneq ($(NO_SME), 1) +DYNAMIC_CORE += ARMV9SME +endif DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 @@ -1474,6 +1480,10 @@ ifeq ($(NO_SVE), 1) CCOMMON_OPT += -DNO_SVE endif +ifeq ($(NO_SME), 1) +CCOMMON_OPT += -DNO_SME +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER diff --git a/README.md b/README.md index d8e73b2022..0857194245 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Apple Vortex**: preliminary support based on ThunderX2/3 - **A64FX**: preliminary support, optimized Level-3 BLAS - **ARMV8SVE**: any ARMV8 cpu with SVE extensions +- **ARMV9SME**: WIP target, any Arm®v9-A core with SME2 support. Only functional for GEMM. #### PPC/PPC64 diff --git a/TargetList.txt b/TargetList.txt index 25eeddfb00..232e12ffa6 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -111,6 +111,7 @@ THUNDERX3T110 VORTEX A64FX ARMV8SVE +ARMV9SME FT2000 9.System Z: diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 27ba6f8727..ec91a2d598 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,9 +44,21 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) - if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 
14) # SME ACLE supported in GCC >= 14 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() + elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 775239e1cd..2a48ba5ab5 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE) endif () endif () +if (${CORE} STREQUAL ARMV9SME) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme2") + endif () +endif () + if (${CORE} STREQUAL CORTEXA510) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 53a78d782f..f6ca73b7b6 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1014,7 +1014,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "NEOVERSEN2") + elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index 4ac244e3ea..d49de6f7c2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -310,6 +310,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") endif() endif() + if (${TARGET} STREQUAL ARMV9SME) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme2 -O3") + endif() if (${TARGET} STREQUAL A64FX) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics 
-march=armv8.2-a+sve -mtune=a64fx") diff --git a/common_arm64.h b/common_arm64.h index 595a01995a..5856898a2b 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -175,7 +175,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) +#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME) #define BUFFER_SIZE (32 << 22) #else #define BUFFER_SIZE (32 << 20) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index dc88d816fb..828eccd138 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif +#ifdef DYN_ARMV9SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX; #define gotoblas_ARMV8SVE gotoblas_ARMV8 #define gotoblas_A64FX gotoblas_ARMV8 #endif + +#ifndef NO_SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#endif + extern gotoblas_t gotoblas_THUNDERX3T110; #endif #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 @@ -393,6 +405,13 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } + +#if !defined(NO_SME) && defined(HWCAP2_SME2) + if ((getauxval(AT_HWCAP2) & HWCAP2_SME2)) { + return &gotoblas_ARMV9SME; + } +#endif + #ifndef NO_SVE if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { return &gotoblas_ARMV8SVE; diff --git a/getarch.c b/getarch.c index 826dd1ce0a..2097f230f0 100644 --- a/getarch.c +++ b/getarch.c @@ -1289,6 +1289,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF 
THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "ARMV8SVE" #endif +#ifdef FORCE_ARMV9SME +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV9SME" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV9SME " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9" +#define LIBNAME "armv9sme" +#define CORENAME "ARMV9SME" +#endif + #ifdef FORCE_ARMV8 #define FORCE diff --git a/kernel/arm64/KERNEL.ARMV9SME b/kernel/arm64/KERNEL.ARMV9SME new file mode 100644 index 0000000000..bc59990979 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV9SME @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/param.h b/param.h index fee9195d02..4063fe71c1 100644 --- a/param.h +++ b/param.h @@ -3667,7 +3667,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE +#elif defined(ARMV8SVE) || defined(ARMV9SME) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8