Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port SIMD to Arm Neon #71

Draft
wants to merge 10 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: CI

on: [push, pull_request]
on: [push, pull_request, workflow_dispatch]

jobs:
build:
Expand Down Expand Up @@ -88,3 +88,9 @@ jobs:
cmake --build build_avs -j 2
cmake -S . -B build_vs -G Ninja -DENABLE_DAV1D=OFF -DENABLE_MFX=OFF -DENABLE_XML2=OFF -DBUILD_AVS_PLUGIN=OFF -DENABLE_VPX=OFF
cmake --build build_vs -j 2
# Publish the VapourSynth plugin built in build_vs as a workflow artifact.
# The condition is written as a plain expression: `if:` is already evaluated
# as an expression, and embedding `${{ }}` inside a bare comparison turns the
# whole condition into an interpolated (always non-empty) string.
- name: Upload dylib
  if: matrix.os == 'macos-latest'
  uses: actions/upload-artifact@v3
  with:
    name: libvslsmashsource
    path: build_vs/liblsmashsource.1.dylib
16 changes: 14 additions & 2 deletions AviUtl/colorspace_simd.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,17 @@
#include "../common/utils.h"
#include "../common/lwsimd.h"



#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
#include <tmmintrin.h>
#ifdef __GNUC__
#pragma GCC target ("ssse3")
#endif
#include <tmmintrin.h>
#elif defined(__arm__) || defined(__aarch64__)
#include "../common/sse2neon.h"
#endif

/* SSSE3 version of func convert_yv12i_to_yuy2 */
void LW_FUNC_ALIGN convert_yv12i_to_yuy2_ssse3
(
Expand Down Expand Up @@ -204,10 +211,15 @@ void LW_FUNC_ALIGN convert_yv12i_to_yuy2_ssse3
}
}


#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
#include <smmintrin.h>
#ifdef __GNUC__
#pragma GCC target ("sse4.1")
#endif
#include <smmintrin.h>
#elif defined(__arm__) || defined(__aarch64__)
#include "../common/sse2neon.h"
#endif

/* the inner loop branch should be deleted by forced inline expansion and "bit_depth" constant propagation. */
static void LW_FUNC_ALIGN LW_FORCEINLINE convert_yuv420ple_i_to_yuv444p16le_sse41
Expand Down
7 changes: 6 additions & 1 deletion AviUtl/lwcolor_simd.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,15 @@ typedef unsigned char BYTE;
typedef unsigned short USHORT;
#endif


/* Select the header that provides the SSE4.1 intrinsics used below.
 * x86 / x86-64: include <smmintrin.h> and, for GNU-compatible compilers,
 *               switch on the "sse4.1" target for the code that follows.
 * ARM / AArch64: include sse2neon.h instead. It is referenced through
 *               ../common/ to match the other AviUtl SIMD source, which
 *               includes the same header by that relative path. */
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
#include <smmintrin.h>
#ifdef __GNUC__
#pragma GCC target ("sse4.1")
#endif
#include <smmintrin.h>
#elif defined(__arm__) || defined(__aarch64__)
#include "../common/sse2neon.h"
#endif

static LW_FORCEINLINE void fill_rgb_buffer_sse41( BYTE *rgb_buffer, BYTE *lw48_ptr )
{
Expand Down
9 changes: 8 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ option(ENABLE_VPX "Enable libvpx support" ON)
message(STATUS "Enable libvpx support: ${ENABLE_VPX}.")

# Default for ENABLE_SSE2: ask CMake whether the host CPU reports SSE2.
cmake_host_system_information(RESULT sse2 QUERY HAS_SSE2)

# ARM targets go through sse2neon, so force the default ON there.
# The pattern is anchored on purpose: MATCHES takes a regular expression, and
# an unanchored "arm*" means "ar" followed by any number of "m", which also
# matches unrelated processors such as "sparc" or "parisc".
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)")
# sse2neon
set(sse2 ON)
endif()

# User-facing switch; its default is the detection result computed above.
option(ENABLE_SSE2 "Enable SSE2 support" ${sse2})
message(STATUS "Enable SSE2 support: ${ENABLE_SSE2}.")

Expand Down Expand Up @@ -220,11 +226,12 @@ if (NOT CMAKE_GENERATOR MATCHES "Visual Studio")
target_compile_definitions(LSMASHSource PRIVATE DEBUG_BUILD)
else (build_type STREQUAL release)
target_compile_definitions(LSMASHSource PRIVATE RELEASE_BUILD)
target_compile_options(LSMASHSource PRIVATE -Ofast -funroll-loops)
endif()

message(STATUS "Build type - ${CMAKE_BUILD_TYPE}")
endif()


# Define SSE2_ENABLED=1 for LSMASHSource when SSE2 support is switched on.
# The ENABLE_SSE2 option is tested (not the raw detection variable `sse2`) so
# that an explicit -DENABLE_SSE2=ON/OFF from the user is honoured here too.
if(ENABLE_SSE2)
target_compile_definitions(LSMASHSource PRIVATE SSE2_ENABLED=1)
endif()
Expand Down
2 changes: 2 additions & 0 deletions VapourSynth/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ deps = [
version_h
]

# Pass -Ofast and -funroll-loops to every C compilation in this project.
# NOTE(review): these are added unconditionally, regardless of the selected
# build type or compiler — confirm that is intended.
add_project_arguments('-Ofast', '-funroll-loops', language: 'c')

# When the host machine's CPU family name starts with 'x86', additionally pass
# -mfpmath=sse and -msse2 to C compilations.
if host_machine.cpu_family().startswith('x86')
add_project_arguments('-mfpmath=sse', '-msse2', language: 'c')
endif
Expand Down
4 changes: 4 additions & 0 deletions common/planar_yuv_sse2.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
#include <emmintrin.h>
#elif defined(__arm__) || defined(__aarch64__)
#include "sse2neon.h"
#endif
#include <stdint.h>

static inline __m128i _MM_PACKUS_EPI32(const __m128i* low, const __m128i* high)
Expand Down
Loading