From 8315302626fc5291bd7680ce48e17c5c6e39f8e2 Mon Sep 17 00:00:00 2001
From: Jim Huang <jserv@ccns.ncku.edu.tw>
Date: Wed, 25 Dec 2024 21:03:49 +0800
Subject: [PATCH] CI: Update Arm GNU Toolchain and use Ubuntu 24.04

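Update the pre-built Arm GNU Toolchain (the GNU compiler toolchain for
various Arm targets) used for cross-compilation in CI to version
14.2.Rel1, and adjust the download and PATH names to the new
"arm-gnu-toolchain-" archive prefix. Move the GitHub Actions runners to
Ubuntu 24.04 and switch the coding-style check to clang-format-18,
reformatting sse2neon.h and tests/impl.cpp to match its output.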

Close #647
---
 .ci/check-format.sh        |  4 +--
 .ci/common.sh              |  2 +-
 .ci/cross-check.sh         |  4 +--
 .ci/cross-tool.sh          |  4 +--
 .github/workflows/main.yml | 21 ++++++++-------
 CONTRIBUTING.md            |  2 +-
 Makefile                   |  4 +--
 sse2neon.h                 | 54 +++++++++++++++++++-------------------
 tests/impl.cpp             |  2 +-
 9 files changed, 49 insertions(+), 48 deletions(-)
diff --git a/.ci/check-format.sh b/.ci/check-format.sh
index a6fdc1c9..de8aa178 100755
--- a/.ci/check-format.sh
+++ b/.ci/check-format.sh
@@ -6,7 +6,7 @@ set -x
 
 for file in ${SOURCES};
 do
-    clang-format-12 ${file} > expected-format
+    clang-format-18 ${file} > expected-format
     diff -u -p --label="${file}" --label="expected coding style" ${file} expected-format
 done
-exit $(clang-format-12 --output-replacements-xml ${SOURCES} | egrep -c "</replacement>")
+exit $(clang-format-18 --output-replacements-xml ${SOURCES} | egrep -c "</replacement>")
diff --git a/.ci/common.sh b/.ci/common.sh
index c0396062..547c4293 100644
--- a/.ci/common.sh
+++ b/.ci/common.sh
@@ -1,4 +1,4 @@
-GCC_REL=11.2-2022.02
+GCC_REL=14.2.rel1
 ARM_MIRROR=https://github.com/DLTcollab/toolchain-arm/raw/main
 
 SOURCES=$(find $(git rev-parse --show-toplevel) | egrep "\.(cpp|h)\$" | egrep -v "gcc-arm-${GCC_REL}-x86_64-aarch64-none-linux-gnu|gcc-arm-${GCC_REL}-x86_64-arm-none-linux-gnueabihf")
diff --git a/.ci/cross-check.sh b/.ci/cross-check.sh
index 5efb9fef..c0f9fb84 100755
--- a/.ci/cross-check.sh
+++ b/.ci/cross-check.sh
@@ -14,9 +14,9 @@ fi
 set -x
 
 make clean
-export PATH=gcc-arm-${GCC_REL}-x86_64-aarch64-none-linux-gnu/bin:$PATH
+export PATH=arm-gnu-toolchain-${GCC_REL}-x86_64-aarch64-none-linux-gnu/bin:$PATH
 make CROSS_COMPILE=aarch64-none-linux-gnu- check || exit 1 # ARMv8-A
 
 make clean
-export PATH=gcc-arm-${GCC_REL}-x86_64-arm-none-linux-gnueabihf/bin:$PATH
+export PATH=arm-gnu-toolchain-${GCC_REL}-x86_64-arm-none-linux-gnueabihf/bin:$PATH
 make CROSS_COMPILE=arm-none-linux-gnueabihf- check || exit 1 # ARMv7-A
diff --git a/.ci/cross-tool.sh b/.ci/cross-tool.sh
index a18e448b..c2013cb4 100755
--- a/.ci/cross-tool.sh
+++ b/.ci/cross-tool.sh
@@ -18,9 +18,9 @@ set -x
 sudo apt-get install -y curl xz-utils
 
 curl -L \
-    ${ARM_MIRROR}/gcc-arm-${GCC_REL}-x86_64-arm-none-linux-gnueabihf.tar.xz \
+    ${ARM_MIRROR}/arm-gnu-toolchain-${GCC_REL}-x86_64-arm-none-linux-gnueabihf.tar.xz \
     | tar -Jx || exit 1
 
 curl -L \
-    ${ARM_MIRROR}/gcc-arm-${GCC_REL}-x86_64-aarch64-none-linux-gnu.tar.xz \
+    ${ARM_MIRROR}/arm-gnu-toolchain-${GCC_REL}-x86_64-aarch64-none-linux-gnu.tar.xz \
     | tar -Jx || exit 1
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 5d43f3e1..7d2eeeca 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -4,11 +4,11 @@ on: [push, pull_request]
 
 jobs:
   host-x86:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
         arch: [x86_64]
-        cxx_compiler: [g++-10, clang++-11]
+        cxx_compiler: [g++, clang++]
     steps:
       - name: checkout code
         uses: actions/checkout@v4
@@ -52,7 +52,7 @@ jobs:
         run: mingw32-make check
 
   host-arm:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
         arch_with_features: [
@@ -61,7 +61,7 @@ jobs:
           {arch: aarch64, feature: crypto+crc, arch_cflags: none},
           {arch: armv7, feature: none, arch_cflags: '-mcpu=cortex-a32 -mfpu=neon-fp-armv8'}
         ]
-        cxx_compiler: [g++-10, clang++-11]
+        cxx_compiler: [g++, clang++-15]
     steps:
       - name: checkout code
         uses: actions/checkout@v4
@@ -71,14 +71,15 @@ jobs:
         uses: uraimo/run-on-arch-action@v2
         with:
           arch: ${{ matrix.arch_with_features.arch }}
-          distro: ubuntu20.04
+          distro: ubuntu22.04
+          # Speed up builds by storing container images in a GitHub package registry.
+          githubToken: ${{ github.token }}
           env: |
             CXX: ${{ matrix.cxx_compiler }}
             ARCH_CFLAGS: ${{ matrix.arch_with_features.arch_cflags }}
           install: |
             apt-get update -q -y
-            apt-get install -q -y "${{ matrix.cxx_compiler }}" make
-            apt-get install -q -y gcc
+            apt-get install -q -y gcc "${{ matrix.cxx_compiler }}" make
           run: |
             make FEATURE=${{ matrix.arch_with_features.feature }} check
 
@@ -101,7 +102,7 @@ jobs:
           path: ARM64
 
   test-win-msvc:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     container: linaro/wine-arm64
     needs: host-win-msvc
     steps:
@@ -115,12 +116,12 @@ jobs:
 
 
   coding-style:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     steps:
       - name: checkout code
         uses: actions/checkout@v4
       - name: style check
         run: |
-            sudo apt-get install -q -y clang-format-12
+            sudo apt-get install -q -y clang-format-18
             sh .ci/check-format.sh
         shell: bash
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4767ed7a..60afe860 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -26,7 +26,7 @@ We welcome all contributions from corporate, acaddemic and individual developers
 * All code must adhere to the existing C coding style (see below). While we are somewhat flexible in basic style, you will adhere to what is currently in place. Uncommented, complicated algorithmic constructs will be rejected.
 * All external pull requests must contain sufficient documentation in the pull request comments in order to be accepted.
 
-Software requirement: [clang-format](https://clang.llvm.org/docs/ClangFormat.html) version 12 or later.
+Software requirement: [clang-format](https://clang.llvm.org/docs/ClangFormat.html) version 18 or later.
 
 Use the command `$ clang-format -i *.[ch]` to enforce a consistent coding style.
 
diff --git a/Makefile b/Makefile
index 999a3a7b..9cbd3b99 100644
--- a/Makefile
+++ b/Makefile
@@ -83,8 +83,8 @@ endif
 
 indent:
 	@echo "Formatting files with clang-format.."
-	@if ! hash clang-format-12; then echo "clang-format-12 is required to indent"; fi
-	clang-format-12 -i sse2neon.h tests/*.cpp tests/*.h
+	@if ! hash clang-format-18; then echo "clang-format-18 is required to indent"; fi
+	clang-format-18 -i sse2neon.h tests/*.cpp tests/*.h
 
 .PHONY: clean check format
 clean:
diff --git a/sse2neon.h b/sse2neon.h
index 80d2fea5..a83c12ce 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -1816,7 +1816,7 @@ FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
 #if defined(_MSC_VER) && !defined(__clang__)
     _WriteStatusReg(ARM64_FPCR, value);
 #else
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(value));  /* write */
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
 #endif
 }
 
@@ -2431,7 +2431,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
 #if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
 
@@ -4977,11 +4977,11 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                   signed char b1,
                                   signed char b0)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }
 
@@ -5112,11 +5112,11 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
                                    signed char b14,
                                    signed char b15)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }
 
@@ -6269,7 +6269,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
             uint8x8_t tmp_low;                                              \
             uint8x8_t tmp_high;                                             \
             if ((imm) >= 8) {                                               \
-                const int idx = (imm) -8;                                   \
+                const int idx = (imm) - 8;                                  \
                 tmp_low = vreinterpret_u8_m64(_a);                          \
                 tmp_high = vdup_n_u8(0);                                    \
                 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
@@ -6790,14 +6790,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     _sse2neon_define2(                                                  \
         __m128i, a, b,                                                  \
         const uint16_t _mask[8] =                                       \
-            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0);   \
+            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 1)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 2)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 3)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 4)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 5)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 6)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 7)) ? (uint16_t) - 1 : 0x0);  \
         uint16x8_t _mask_vec = vld1q_u16(_mask);                        \
         uint16x8_t __a = vreinterpretq_u16_m128i(_a);                   \
         uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
@@ -6822,11 +6822,11 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
 {
-    const uint32_t ALIGN_STRUCT(16)
-        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    const uint32_t
+        ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
     uint32x4_t mask = vld1q_u32(data);
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
@@ -9351,7 +9351,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 #if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
 
diff --git a/tests/impl.cpp b/tests/impl.cpp
index a3887606..93d53b3c 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -5878,7 +5878,7 @@ result_t test_mm_shuffle_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
     int32_t _d[4];
 
 #define TEST_IMPL(IDX)              \
-    _d[0] = _a[((IDX) &0x3)];       \
+    _d[0] = _a[((IDX) & 0x3)];      \
     _d[1] = _a[((IDX >> 2) & 0x3)]; \
     _d[2] = _a[((IDX >> 4) & 0x3)]; \
     _d[3] = _a[((IDX >> 6) & 0x3)]; \