diff --git a/CMakeLists.txt b/CMakeLists.txt
index d4f56a6..be507d0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required (VERSION 3.1.0 FATAL_ERROR)
 cmake_policy(VERSION 3.1.0)
 
-project (photospline VERSION 2.1.0 LANGUAGES C CXX)
+project (photospline VERSION 2.1.1 LANGUAGES C CXX)
 
 SET(CMAKE_CXX_STANDARD 11)
 SET(CMAKE_C_STANDARD 99)
@@ -55,15 +55,56 @@ target_compile_features (photospline PUBLIC
 	cxx_constexpr
 )
 
-target_compile_options (photospline PUBLIC -O3)
+target_compile_options (photospline PUBLIC -O3 -fPIC)
 target_compile_options (photospline PRIVATE -Wall -Wextra)
-IF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86_64)$")
-	target_compile_options (photospline PUBLIC -msse2 -msse3 -msse4 -msse4.1 -msse4.2 -mno-avx)
-ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
-	target_compile_options (photospline PUBLIC -maltivec)
-ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^sparc")
-	target_compile_options (photospline PUBLIC -mvis)
+
+# For newer compilers we would like to use 'target cloning'/'function multiversioning'
+# to cover different instruction sets portably (using preprocessor logic so that
+# downstream code picks it up 'for free'), but for old compilers which can't do that,
+# we still want to make sure that a reasonable baseline instruction set is used.
+# The logic here may be overridden by manually setting USE_TARGET_CLONING.
+IF (NOT DEFINED USE_TARGET_CLONING)
+	# This logic should mirror what is in include/photospline/detail/simd.h
+	IF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86_64)$")
+		IF (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+			set(USE_TARGET_CLONING FALSE)
+		ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+			IF (APPLE OR WIN32 OR CYGWIN)
+				# gcc implements target_clones via ifunc, which non-ELF
+				# platforms lack; approximates the __ELF__ test in simd.h
+				set(USE_TARGET_CLONING FALSE)
+			ELSEIF (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.4)
+				# 'NOT VERSION_LESS 8.4' rather than 'VERSION_GREATER 8.3':
+				# 8.3.1 compares greater than 8.3, but simd.h still disables it
+				set(USE_TARGET_CLONING TRUE)
+			ELSE ()
+				set(USE_TARGET_CLONING FALSE)
+			ENDIF ()
+		ELSE ()
+			# Assume other compilers do not support this
+			set(USE_TARGET_CLONING FALSE)
+		ENDIF ()
+	ELSE ()
+		# No detailed treatment of non-x86 architectures at this time
+		set(USE_TARGET_CLONING FALSE)
+	ENDIF()
+ENDIF (NOT DEFINED USE_TARGET_CLONING)
+
+IF (USE_TARGET_CLONING)
+	MESSAGE(STATUS "Will assume use of target cloning for vector instruction sets")
+ELSE ()
+	MESSAGE(STATUS "Will NOT assume use of target cloning for vector instruction sets")
+
+	IF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86_64)$")
+		# explicitly disable AVX to avoid crashing on non-AVX-enabled machines
+		target_compile_options (photospline PUBLIC -msse2 -msse3 -msse4 -msse4.1 -msse4.2 -mno-avx)
+	ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
+		target_compile_options (photospline PUBLIC -maltivec)
+	ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^sparc")
+		target_compile_options (photospline PUBLIC -mvis)
+	ENDIF ()
 ENDIF ()
+
 
 target_link_libraries (photospline PUBLIC
 	${CFITSIO_LIBRARIES}
diff --git a/include/photospline/detail/bspline_multi.h b/include/photospline/detail/bspline_multi.h
index 518d4bf..a645077 100644
--- a/include/photospline/detail/bspline_multi.h
+++ b/include/photospline/detail/bspline_multi.h
@@ -7,6 +7,7 @@ namespace photospline{
 template template
+PHOTOSPLINE_TARGET_CLONE
 void splinetable::ndsplineeval_multibasis_core(const int *centers, const typename detail::simd_vector::type*** localbasis, typename detail::simd_vector::type* result) const{ #if (defined(__i386__) || defined (__x86_64__)) && defined(__ELF__) /*
@@ -80,6 +81,7 @@ namespace{
 template template
+PHOTOSPLINE_TARGET_CLONE
 void splinetable::ndsplineeval_multibasis_coreD(const int *centers, const typename detail::simd_vector::type*** localbasis, typename detail::simd_vector::type* result) const{ #if (defined(__i386__) || defined (__x86_64__)) && defined(__ELF__) /*
@@ -150,6 +152,7 @@ void splinetable::ndsplineeval_multibasis_coreD(const int *centers, const
 template template
+PHOTOSPLINE_TARGET_CLONE
 void splinetable::ndsplineeval_multibasis_coreD_FixedOrder(const int *centers, const typename detail::simd_vector::type*** localbasis, typename detail::simd_vector::type* result) const{ #if (defined(__i386__) || defined (__x86_64__)) && defined(__ELF__) /*
@@ -223,6 +226,7 @@ void splinetable::ndsplineeval_multibasis_coreD_FixedOrder(const int *cen
 template template
+PHOTOSPLINE_TARGET_CLONE
 void splinetable::ndsplineeval_multibasis_core_KnownOrder(const int *centers, const typename detail::simd_vector::type*** localbasis, typename detail::simd_vector::type* result) const{ #if (defined(__i386__) || defined (__x86_64__)) && defined(__ELF__) /*
@@ -294,6 +298,7 @@ void splinetable::ndsplineeval_multibasis_core_KnownOrder(const int *cent
 template template
+PHOTOSPLINE_TARGET_CLONE
 void splinetable::ndsplineeval_gradient(const double* x, const int* centers, double* evaluates) const {
@@ -351,6 +356,7 @@ splinetable::ndsplineeval_gradient(const double* x, const int* centers, d
 template template
+PHOTOSPLINE_TARGET_CLONE
 void splinetable::evaluator_type::ndsplineeval_gradient(const double* x, const int* centers, double* evaluates) const { uint32_t maxdegree = *std::max_element(table.order,table.order+table.ndim) + 1;
diff --git a/include/photospline/detail/simd.h b/include/photospline/detail/simd.h
index fd2f726..20904e3 100644
--- a/include/photospline/detail/simd.h
+++ b/include/photospline/detail/simd.h
@@ -34,4 +34,40 @@ struct simd_vector {
 
 }}
 
+// PHOTOSPLINE_USE_TARGET_CLONING: 1 if functions marked PHOTOSPLINE_TARGET_CLONE are
+// compiled once per entry of PHOTOSPLINE_VECTOR_ISN_VARIANTS, 0 if the marker is inert.
+// CMakeLists.txt repeats this decision to choose baseline -m flags; keep both in sync.
+#if defined(__i386__) || defined(__amd64__)
+	#define PHOTOSPLINE_VECTOR_ISN_VARIANTS "avx512f","avx2","avx","sse4.2","default"
+	#ifdef __clang__ //clang, obviously
+		// this feature exists and works nicely in clang 14, except that
+		// "multiversioned functions do not yet support function templates"
+		// which is most of the places we want to use this
+		#define PHOTOSPLINE_USE_TARGET_CLONING 0
+	#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) //gcc
+		//Activating this feature causes gcc 8.3 to crash, but 8.5 works.
+		//gcc dispatches between clones through ifunc, which it supports only on
+		//ELF targets, so additionally require __ELF__ (excludes Mach-O and PE/COFF).
+		#if defined(__ELF__) && (__GNUC__ >= 9 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
+			#define PHOTOSPLINE_USE_TARGET_CLONING 1
+		#else
+			#define PHOTOSPLINE_USE_TARGET_CLONING 0
+		#endif
+	#else
+		//for other compilers, assume we don't have this
+		#define PHOTOSPLINE_USE_TARGET_CLONING 0
+	#endif
+#else
+	//For other architectures, leave this alone for now
+	#define PHOTOSPLINE_USE_TARGET_CLONING 0
+#endif
+
+#if PHOTOSPLINE_USE_TARGET_CLONING
+	// Placed before a function definition; expands to gcc's target_clones attribute
+	#define PHOTOSPLINE_TARGET_CLONE __attribute__ ((target_clones( PHOTOSPLINE_VECTOR_ISN_VARIANTS )))
+#else
+	//make this a no-op
+	#define PHOTOSPLINE_TARGET_CLONE
+#endif
+
 #endif //PHOTOSPLINE_SIMD_H