From 0b71133d0e616c1263e4d24bc28ca8339452da45 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 23 Dec 2024 10:47:02 +0800 Subject: [PATCH 1/4] pnnx fix macos and windows build, add quick test ci (#5838) --- .github/workflows/pnnx.yml | 56 +++++++++++++++++++ tools/pnnx/CMakeLists.txt | 5 ++ tools/pnnx/src/CMakeLists.txt | 4 ++ tools/pnnx/src/pass_ncnn/torch_istft.cpp | 4 +- tools/pnnx/src/pass_ncnn/torch_stft.cpp | 4 +- .../torchaudio_F_inverse_spectrogram.cpp | 4 +- .../pass_ncnn/torchaudio_F_spectrogram.cpp | 4 +- 7 files changed, 73 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/pnnx.yml diff --git a/.github/workflows/pnnx.yml b/.github/workflows/pnnx.yml new file mode 100644 index 000000000000..3e4b354c3c21 --- /dev/null +++ b/.github/workflows/pnnx.yml @@ -0,0 +1,56 @@ +name: pnnx +on: + push: + branches: [master] + paths: + - '.github/workflows/pnnx.yml' + - 'tools/pnnx/**' + - '!tools/pnnx/README.md' + pull_request: + branches: [master] + paths: + - '.github/workflows/pnnx.yml' + - 'tools/pnnx/**' + - '!tools/pnnx/README.md' +concurrency: + group: pnnx-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + + env: + PYTHONUSERBASE: ${{ github.workspace }}/torch + UseMultiToolTask: true + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: 3.12 + + - name: setup-pytorch + run: | + python3 -m pip config set global.break-system-packages true + pip3 install --user torch --index-url https://download.pytorch.org/whl/cpu + pip3 install --user numpy packaging + + - name: build-pnnx + run: | + cd tools/pnnx + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + cmake --build . --config Release -j 4 + + - name: quick-test + if: matrix.os != 'windows-latest' + run: | + cd tools/pnnx + cd build && ctest -C Release --output-on-failure -R test_nn_Conv diff --git a/tools/pnnx/CMakeLists.txt b/tools/pnnx/CMakeLists.txt index 65b15a3cf52f..b09f4758ead3 100644 --- a/tools/pnnx/CMakeLists.txt +++ b/tools/pnnx/CMakeLists.txt @@ -19,6 +19,11 @@ if(MSVC AND NOT CMAKE_VERSION VERSION_LESS "3.15") endif() endif() +if(POLICY CMP0094) + cmake_policy(SET CMP0094 NEW) # FindPython should return the first matching Python +endif() +set(Python_FIND_REGISTRY "LAST") +set(Python_FIND_FRAMEWORK "LAST") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") include(PNNXPyTorch) diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index d56784646b5b..b1ac6f5c0245 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -625,6 +625,10 @@ if(PROTOBUF_FOUND) set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CURRENT_BINARY_DIR}) set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_LINK_LIBRARIES protobuf::libprotobuf) endif() + + if(APPLE) + set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-framework CoreFoundation") + endif() endif() endif() diff --git a/tools/pnnx/src/pass_ncnn/torch_istft.cpp b/tools/pnnx/src/pass_ncnn/torch_istft.cpp index 3acbe6540095..9d894aba7ab7 100644 --- a/tools/pnnx/src/pass_ncnn/torch_istft.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_istft.cpp @@ -106,10 +106,10 @@ static int detect_window_type(const std::vector& window_data) if (!NearlyEqual(window_data[i], 1.f, 0.001)) is_one = false; - if (!NearlyEqual(window_data[i], 0.5f * (1 - cos(2 * M_PI * i / winlen)), 0.001)) + if (!NearlyEqual(window_data[i], 0.5f * (1 - cos(2 * 3.14159265358979323846 * i / winlen)), 0.001)) is_hann = false; - if (!NearlyEqual(window_data[i], 0.54f - 0.46f * cos(2 * M_PI * i / winlen), 0.001)) + if (!NearlyEqual(window_data[i], 0.54f - 0.46f * cos(2 * 3.14159265358979323846 * i / winlen), 0.001)) is_hamming = false; } diff --git a/tools/pnnx/src/pass_ncnn/torch_stft.cpp b/tools/pnnx/src/pass_ncnn/torch_stft.cpp index 2b2296ccbc2c..8993afeb12fe 100644 --- a/tools/pnnx/src/pass_ncnn/torch_stft.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_stft.cpp @@ -93,10 +93,10 @@ static int detect_window_type(const std::vector& window_data) if (!NearlyEqual(window_data[i], 1.f, 0.001)) is_one = false; - if (!NearlyEqual(window_data[i], 0.5f * (1 - cos(2 * M_PI * i / winlen)), 0.001)) + if (!NearlyEqual(window_data[i], 0.5f * (1 - cos(2 * 3.14159265358979323846 * i / winlen)), 0.001)) is_hann = false; - if (!NearlyEqual(window_data[i], 0.54f - 0.46f * cos(2 * M_PI * i / winlen), 0.001)) + if (!NearlyEqual(window_data[i], 0.54f - 0.46f * cos(2 * 3.14159265358979323846 * i / winlen), 0.001)) is_hamming = false; } diff --git a/tools/pnnx/src/pass_ncnn/torchaudio_F_inverse_spectrogram.cpp b/tools/pnnx/src/pass_ncnn/torchaudio_F_inverse_spectrogram.cpp index d712fcc2990f..0c964790fb4f 100644 --- a/tools/pnnx/src/pass_ncnn/torchaudio_F_inverse_spectrogram.cpp +++ b/tools/pnnx/src/pass_ncnn/torchaudio_F_inverse_spectrogram.cpp @@ -43,10 +43,10 @@ static int detect_window_type(const std::vector& window_data) if (!NearlyEqual(window_data[i], 1.f, 0.001)) is_one = false; - if (!NearlyEqual(window_data[i], 0.5f * (1 - cos(2 * M_PI * i / winlen)), 0.001)) + if (!NearlyEqual(window_data[i], 0.5f * (1 - cos(2 * 3.14159265358979323846 * i / winlen)), 0.001)) is_hann = false; - if (!NearlyEqual(window_data[i], 0.54f - 0.46f * cos(2 * M_PI * i / winlen), 0.001)) + if (!NearlyEqual(window_data[i], 0.54f - 0.46f * cos(2 * 3.14159265358979323846 * i / winlen), 0.001)) is_hamming = false; } diff --git a/tools/pnnx/src/pass_ncnn/torchaudio_F_spectrogram.cpp b/tools/pnnx/src/pass_ncnn/torchaudio_F_spectrogram.cpp index 04084ad0ba67..225fab7060ec 100644 --- a/tools/pnnx/src/pass_ncnn/torchaudio_F_spectrogram.cpp +++ b/tools/pnnx/src/pass_ncnn/torchaudio_F_spectrogram.cpp @@ -43,10 +43,10 @@ static int detect_window_type(const std::vector& window_data) if (!NearlyEqual(window_data[i], 1.f, 0.001)) is_one = false; - if (!NearlyEqual(window_data[i], 0.5f * (1 - cos(2 * M_PI * i / winlen)), 0.001)) + if (!NearlyEqual(window_data[i], 0.5f * (1 - cos(2 * 3.14159265358979323846 * i / winlen)), 0.001)) is_hann = false; - if (!NearlyEqual(window_data[i], 0.54f - 0.46f * cos(2 * M_PI * i / winlen), 0.001)) + if (!NearlyEqual(window_data[i], 0.54f - 0.46f * cos(2 * 3.14159265358979323846 * i / winlen), 0.001)) is_hamming = false; } From d3875f0fbdb12d751cec53ebac5b5311731ab0c7 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 23 Dec 2024 12:33:22 +0800 Subject: [PATCH 2/4] ci webassembly drop removed args for node>20 (#5843) --- .github/workflows/web-assembly.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/web-assembly.yml b/.github/workflows/web-assembly.yml index 1b5e8915a866..a650d6f62c43 100644 --- a/.github/workflows/web-assembly.yml +++ b/.github/workflows/web-assembly.yml @@ -62,7 +62,7 @@ jobs: - name: test-simd run: | cd build-simd - TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd" ctest --output-on-failure -j $(nproc) + TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc) - name: build-simd-omp run: | source emsdk/emsdk_env.sh @@ -73,4 +73,4 @@ jobs: - name: test-simd-omp run: | cd build-simd-omp - TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd;--experimental-wasm-threads" ctest --output-on-failure -j $(nproc) + TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc) From 6c438c4f028789e048cb290a18aedfbfbb9c054c Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 23 Dec 2024 16:22:16 +0800 Subject: [PATCH 3/4] fix android ndk-r16b ci (#5845) --- .github/workflows/android.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 424a04c15a5c..8d3031d83770 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -117,8 +117,10 @@ jobs: env: DEBIAN_FRONTEND: noninteractive run: | - sudo apt-get update - sudo apt-get install -y libncurses5 + pushd /usr/lib/x86_64-linux-gnu/ + sudo ln -s libncurses.so.6 libncurses.so.5 + sudo ln -s libtinfo.so.6 libtinfo.so.5 + popd wget -q https://dl.google.com/android/repository/android-ndk-r16b-linux-x86_64.zip -O $GITHUB_WORKSPACE/android-ndk-r16b-linux-x86_64.zip cd $GITHUB_WORKSPACE && unzip -q android-ndk-r16b-linux-x86_64.zip From 66cd40e9349a5357f0e06a952ab8fcb4eec1eb51 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 23 Dec 2024 16:43:24 +0800 Subject: [PATCH 4/4] fix clang avx512bf16 build (#5842) * check compiler supports isa with optimization enabled --- CMakeLists.txt | 7 ++++--- src/layer/x86/gemm_int8.h | 4 ++-- src/layer/x86/x86_usability.h | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5851552b2a5f..2b532d7c2455 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,6 +139,7 @@ endif() ############################################## include(CheckCXXCompilerFlag) +set(CMAKE_TRY_COMPILE_CONFIGURATION release) # gnu inline assembly in clang msvc does not work actually if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))) @@ -523,7 +524,7 @@ else() check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") - check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) + check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) @@ -560,7 +561,7 @@ else() check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16") - check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) + check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") check_cxx_source_compiles("#include \nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) @@ -595,7 +596,7 @@ else() check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16") - check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) + check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") check_cxx_source_compiles("#include \nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) diff --git a/src/layer/x86/gemm_int8.h b/src/layer/x86/gemm_int8.h index f9e0050fd553..132cf9d8cb9b 100644 --- a/src/layer/x86/gemm_int8.h +++ b/src/layer/x86/gemm_int8.h @@ -2014,7 +2014,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i __m256i _pp = combine4x2_epi32(_pp0, _pp1); #if !__AVXVNNIINT8__ - _w_shift = _mm256_dpbusd_epi32(_w_shift, _v127, _pp); + _w_shift = _mm256_comp_dpbusd_epi32(_w_shift, _v127, _pp); #endif // !__AVXVNNIINT8__ _mm256_storeu_si256((__m256i*)pp, _pp); @@ -2108,7 +2108,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i __m256i _pp = combine4x2_epi32(_pp0, _pp1); #if !__AVXVNNIINT8__ - _w_shift = _mm256_dpbusd_epi32(_w_shift, _v127, _pp); + _w_shift = _mm256_comp_dpbusd_epi32(_w_shift, _v127, _pp); #endif // !__AVXVNNIINT8__ _mm256_storeu_si256((__m256i*)pp, _pp); diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index f25b06745e84..8628249e76ab 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -1490,9 +1490,9 @@ static NCNN_FORCEINLINE __m256i float2bfloat_avx512(const __m512& v0) static NCNN_FORCEINLINE __m512i float2bfloat_avx512(const __m512& v0, const __m512& v1) { #if __AVX512BF16__ - __m256bh _v0 = _mm512_cvtneps_pbh(v0); - __m256bh _v1 = _mm512_cvtneps_pbh(v1); - __m512i _v = _mm512_inserti32x8(_mm512_castsi256_si512((__m256i)_v0), (__m256i)_v1, 1); + __m256i _v0 = (__m256i)_mm512_cvtneps_pbh(v0); + __m256i _v1 = (__m256i)_mm512_cvtneps_pbh(v1); + __m512i _v = _mm512_inserti32x8(_mm512_castsi256_si512(_v0), _v1, 1); #else __m512i _a = _mm512_castps_si512(v0); __m512i _b = _mm512_castps_si512(v1);