From f0470e9d1b15b1943df582a5ef41106e15e171fa Mon Sep 17 00:00:00 2001
From: PHILO-HE
Date: Mon, 17 Jun 2024 10:43:28 +0800
Subject: [PATCH] Initial commit

---
 .github/workflows/benchmark.yml               |   4 +-
 .github/workflows/linux-build.yml             |   1 +
 .github/workflows/scheduled.yml               |   6 +
 .../hyperscan.cmake                           |  51 +++++++
 CMakeLists.txt                                |   3 +
 scripts/setup-centos9.sh                      |   2 +-
 scripts/setup-macos.sh                        |   2 +-
 scripts/setup-ubuntu.sh                       |   3 +-
 velox/functions/lib/CMakeLists.txt            |   4 +-
 velox/functions/lib/HyperscanFunctions.cpp    | 168 ++++++++++++++++++
 velox/functions/lib/HyperscanFunctions.h      |  32 ++++
 .../lib/benchmarks/Re2FunctionsBenchmarks.cpp |  11 ++
 12 files changed, 281 insertions(+), 6 deletions(-)
 create mode 100644 CMake/resolve_dependency_modules/hyperscan.cmake
 create mode 100644 velox/functions/lib/HyperscanFunctions.cpp
 create mode 100644 velox/functions/lib/HyperscanFunctions.h

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 6b9c96d0426f..2b8467305d04 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -90,7 +90,7 @@ jobs:
 
       - name: "Install dependencies"
         if: ${{ github.event_name == 'pull_request' }}
-        run: source velox/scripts/setup-ubuntu.sh && install_apt_deps && install_duckdb
+        run: source velox/scripts/setup-ubuntu.sh && install_apt_deps && install_duckdb && install_boost
 
       - name: Build Baseline Benchmarks
         if: ${{ github.event_name == 'pull_request' }}
@@ -117,7 +117,7 @@ jobs:
           submodules: 'recursive'
 
       - name: "Install dependencies"
-        run: source velox/scripts/setup-ubuntu.sh && install_apt_deps
+        run: source velox/scripts/setup-ubuntu.sh && install_apt_deps && install_boost
 
       - name: Build Contender Benchmarks
         working-directory: velox
diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml
index 1ca5cce9c7ec..43222de8b615 100644
--- a/.github/workflows/linux-build.yml
+++ b/.github/workflows/linux-build.yml
@@ -61,6 +61,7 @@ jobs:
       VELOX_DEPENDENCY_SOURCE: SYSTEM
       simdjson_SOURCE: BUNDLED
       xsimd_SOURCE: BUNDLED
+      hyperscan_SOURCE: BUNDLED
       CUDA_VERSION: "12.4"
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml
index a487dd4bbb4c..f4e72d5dbd8a 100644
--- a/.github/workflows/scheduled.yml
+++ b/.github/workflows/scheduled.yml
@@ -196,6 +196,12 @@ jobs:
         run: |
           ccache -sz
 
+      - name: Setup dependencies
+        run: |
+          # Install dependencies required by PR. They have not been installed in docker.
+          yum update -y
+          yum install -y ragel
+
       - name: Build
         env:
           EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_BUILD_PYTHON_PACKAGE=ON ${{ inputs.extraCMakeFlags }}"
diff --git a/CMake/resolve_dependency_modules/hyperscan.cmake b/CMake/resolve_dependency_modules/hyperscan.cmake
new file mode 100644
index 000000000000..9bae1f5d41dd
--- /dev/null
+++ b/CMake/resolve_dependency_modules/hyperscan.cmake
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+# The pinned checksum only describes the default release tarball. When the URL
+# is overridden through the environment, use VELOX_HYPERSCAN_SHA256 if it is
+# provided and otherwise skip verification instead of passing an empty hash.
+if(DEFINED ENV{VELOX_HYPERSCAN_URL})
+  set(VELOX_HYPERSCAN_SOURCE_URL "$ENV{VELOX_HYPERSCAN_URL}")
+  if(DEFINED ENV{VELOX_HYPERSCAN_SHA256})
+    set(VELOX_HYPERSCAN_URL_HASH URL_HASH "SHA256=$ENV{VELOX_HYPERSCAN_SHA256}")
+  else()
+    set(VELOX_HYPERSCAN_URL_HASH "")
+  endif()
+else()
+  set(VELOX_HYPERSCAN_VERSION v5.4.2)
+  set(VELOX_HYPERSCAN_SOURCE_URL
+      "https://github.com/intel/hyperscan/archive/refs/tags/${VELOX_HYPERSCAN_VERSION}.tar.gz"
+  )
+  set(VELOX_HYPERSCAN_BUILD_SHA256_CHECKSUM
+      32b0f24b3113bbc46b6bfaa05cf7cf45840b6b59333d078cc1f624e4c40b2b99)
+  set(VELOX_HYPERSCAN_URL_HASH
+      URL_HASH "SHA256=${VELOX_HYPERSCAN_BUILD_SHA256_CHECKSUM}")
+endif()
+
+message(STATUS "Building hyperscan from source")
+FetchContent_Declare(
+  hyperscan
+  URL ${VELOX_HYPERSCAN_SOURCE_URL} ${VELOX_HYPERSCAN_URL_HASH})
+
+set(CMAKE_CXX_STANDARD_BACKUP ${CMAKE_CXX_STANDARD})
+# C++ 17 is not supported.
+set(CMAKE_CXX_STANDARD 11)
+set(BUILD_EXAMPLES FALSE)
+set(BUILD_AVX512 ON)
+FetchContent_MakeAvailable(hyperscan)
+set_target_properties(
+  hs
+  PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${hyperscan_SOURCE_DIR}/src)
+set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_BACKUP})
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aca08223ad54..ea9eb2d77ad3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -447,6 +447,9 @@ endif()
 set_source(re2)
 resolve_dependency(re2)
 
+set_source(hyperscan)
+resolve_dependency(hyperscan 5.4.2)
+
 if(${VELOX_BUILD_PYTHON_PACKAGE})
   set_source(pybind11)
   resolve_dependency(pybind11 2.10.0)
diff --git a/scripts/setup-centos9.sh b/scripts/setup-centos9.sh
index d8de1b50cc33..e5731be338e9 100755
--- a/scripts/setup-centos9.sh
+++ b/scripts/setup-centos9.sh
@@ -61,7 +61,7 @@ function install_build_prerequisites {
   dnf config-manager --set-enabled crb
   dnf update -y
   dnf_install ninja-build cmake ccache gcc-toolset-12 git wget which
-  dnf_install autoconf automake python3-devel pip libtool
+  dnf_install autoconf automake python3-devel pip libtool ragel
 
   pip install cmake==3.28.3
 
diff --git a/scripts/setup-macos.sh b/scripts/setup-macos.sh
index afccef5ecaf9..2ae3100d8274 100755
--- a/scripts/setup-macos.sh
+++ b/scripts/setup-macos.sh
@@ -40,7 +40,7 @@ NPROC=$(getconf _NPROCESSORS_ONLN)
 
 BUILD_DUCKDB="${BUILD_DUCKDB:-true}"
 DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)}
-MACOS_VELOX_DEPS="bison flex gflags glog googletest icu4c libevent libsodium lz4 lzo openssl protobuf@21 snappy xz zstd"
+MACOS_VELOX_DEPS="bison flex gflags glog googletest icu4c libevent libsodium lz4 lzo openssl protobuf@21 snappy xz zstd libtool automake autoconf ragel"
 MACOS_BUILD_DEPS="ninja cmake"
 FB_OS_VERSION="v2024.07.01.00"
 FMT_VERSION="10.1.1"
diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh
index 2676bd59ae1d..6b97ee46825b 100755
--- a/scripts/setup-ubuntu.sh
+++ b/scripts/setup-ubuntu.sh
@@ -110,7 +110,8 @@ function install_velox_deps_from_apt {
     bison \
     flex \
     libfl-dev \
-    tzdata
+    tzdata \
+    ragel
 }
 
 function install_fmt {
diff --git a/velox/functions/lib/CMakeLists.txt b/velox/functions/lib/CMakeLists.txt
index bdff97bba2d9..f02f6d44d9b1 100644
--- a/velox/functions/lib/CMakeLists.txt
+++ b/velox/functions/lib/CMakeLists.txt
@@ -31,6 +31,7 @@ velox_add_library(
   ArrayShuffle.cpp
   CheckDuplicateKeys.cpp
   CheckNestedNulls.cpp
+  HyperscanFunctions.cpp
   KllSketch.cpp
   MapConcat.cpp
   Re2Functions.cpp
@@ -47,7 +48,8 @@ velox_link_libraries(
   velox_vector
   velox_type_tz
   re2::re2
-  Folly::folly)
+  Folly::folly
+  hs)
 
 add_subdirectory(aggregates)
 add_subdirectory(string)
diff --git a/velox/functions/lib/HyperscanFunctions.cpp b/velox/functions/lib/HyperscanFunctions.cpp
new file mode 100644
index 000000000000..2618e437321b
--- /dev/null
+++ b/velox/functions/lib/HyperscanFunctions.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/functions/lib/HyperscanFunctions.h"
+
+#include <folly/Conv.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "hs.h"
+#include "velox/functions/lib/string/StringImpl.h"
+#include "velox/vector/FunctionVector.h"
+
+namespace facebook::velox::functions {
+namespace {
+
+/// Releases a compiled Hyperscan database.
+struct DatabaseDeleter {
+  void operator()(hs_database_t* database) const {
+    hs_free_database(database);
+  }
+};
+
+/// Releases Hyperscan scratch space.
+struct ScratchDeleter {
+  void operator()(hs_scratch_t* scratch) const {
+    hs_free_scratch(scratch);
+  }
+};
+
+/// Formats the argument types as a comma-separated list for error messages.
+std::string printTypesCsv(
+    const std::vector<exec::VectorFunctionArg>& inputArgs) {
+  std::string result;
+  result.reserve(inputArgs.size() * 10);
+  for (const auto& input : inputArgs) {
+    folly::toAppend(
+        result.empty() ? "" : ", ", input.type->toString(), &result);
+  }
+  return result;
+}
+
+/// Reports, for each input string, whether a constant regular expression
+/// matches somewhere in it. The pattern is compiled once at construction.
+///
+/// NOTE(review): Hyperscan reports a match wherever the pattern occurs, so an
+/// unanchored pattern behaves like a search, not a full match. Confirm this
+/// is the intended semantics for "hs_match".
+///
+/// NOTE(review): Hyperscan does not allow concurrent scans on one scratch
+/// space. This assumes apply() is never invoked concurrently on the same
+/// instance - confirm.
+class MatchConstantPattern final : public exec::VectorFunction {
+ public:
+  explicit MatchConstantPattern(StringView pattern) {
+    // hs_compile expects a NUL-terminated expression, which StringView does
+    // not guarantee, so compile from an owned copy.
+    const std::string expression(pattern.data(), pattern.size());
+
+    hs_database_t* database = nullptr;
+    hs_compile_error_t* compileError = nullptr;
+    if (hs_compile(
+            expression.c_str(),
+            // Using single match flag, which can greatly improve performance.
+            HS_FLAG_SINGLEMATCH | HS_FLAG_DOTALL,
+            HS_MODE_BLOCK,
+            nullptr,
+            &database,
+            &compileError) != HS_SUCCESS) {
+      const std::string message =
+          compileError != nullptr ? compileError->message : "unknown error";
+      hs_free_compile_error(compileError);
+      VELOX_USER_FAIL(
+          "Unable to compile pattern \"{}\": {}", expression, message);
+    }
+    database_.reset(database);
+
+    hs_scratch_t* scratch = nullptr;
+    VELOX_CHECK(
+        hs_alloc_scratch(database_.get(), &scratch) == HS_SUCCESS,
+        "Unable to allocate Hyperscan scratch space");
+    scratch_.reset(scratch);
+  }
+
+  /// Hyperscan match callback. Records the match in the bool that 'ctx'
+  /// points to and returns non-zero so scanning stops at the first match.
+  static int eventHandler(
+      unsigned int /*id*/,
+      unsigned long long /*from*/,
+      unsigned long long /*to*/,
+      unsigned int /*flags*/,
+      void* ctx) {
+    *static_cast<bool*>(ctx) = true;
+    return 1;
+  }
+
+  void apply(
+      const SelectivityVector& rows,
+      std::vector<VectorPtr>& args,
+      const TypePtr& /* outputType */,
+      exec::EvalCtx& context,
+      VectorPtr& resultRef) const final {
+    VELOX_CHECK_EQ(args.size(), 2);
+    context.ensureWritable(rows, BOOLEAN(), resultRef);
+    FlatVector<bool>& result = *resultRef->as<FlatVector<bool>>();
+    exec::LocalDecodedVector toSearch(context, *args[0], rows);
+
+    context.applyToSelectedNoThrow(rows, [&](vector_size_t row) {
+      const StringView input = toSearch->valueAt<StringView>(row);
+      bool isMatched = false;
+      const hs_error_t status = hs_scan(
+          database_.get(),
+          input.data(),
+          input.size(),
+          0,
+          scratch_.get(),
+          eventHandler,
+          &isMatched);
+      // HS_SCAN_TERMINATED only means the callback stopped the scan early.
+      VELOX_CHECK(
+          status == HS_SUCCESS || status == HS_SCAN_TERMINATED,
+          "Hyperscan failed to scan input, error code {}",
+          status);
+      result.set(row, isMatched);
+    });
+  }
+
+ private:
+  std::unique_ptr<hs_database_t, DatabaseDeleter> database_;
+  std::unique_ptr<hs_scratch_t, ScratchDeleter> scratch_;
+};
+} // namespace
+
+std::shared_ptr<exec::VectorFunction> makeHyperscanMatch(
+    const std::string& name,
+    const std::vector<exec::VectorFunctionArg>& inputArgs,
+    const core::QueryConfig& /*config*/) {
+  if (inputArgs.size() != 2 || !inputArgs[0].type->isVarchar() ||
+      !inputArgs[1].type->isVarchar()) {
+    VELOX_UNSUPPORTED(
+        "{} expected (VARCHAR, VARCHAR) but got ({})",
+        name,
+        printTypesCsv(inputArgs));
+  }
+
+  BaseVector* constantPattern = inputArgs[1].constantValue.get();
+  if (constantPattern != nullptr && !constantPattern->isNullAt(0)) {
+    return std::make_shared<MatchConstantPattern>(
+        constantPattern->as<ConstantVector<StringView>>()->valueAt(0));
+  }
+  // TODO: support non-constant pattern.
+  VELOX_NYI("{} only supports a constant, non-null pattern", name);
+}
+
+} // namespace facebook::velox::functions
diff --git a/velox/functions/lib/HyperscanFunctions.h b/velox/functions/lib/HyperscanFunctions.h
new file mode 100644
index 000000000000..ba7b68b4d3d6
--- /dev/null
+++ b/velox/functions/lib/HyperscanFunctions.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/expression/VectorFunction.h"
+#include "velox/functions/Udf.h"
+#include "velox/vector/BaseVector.h"
+
+namespace facebook::velox::functions {
+
+/// Creates a vector function that evaluates a regular expression with
+/// Hyperscan. Expects (VARCHAR, VARCHAR) arguments where the second argument,
+/// the pattern, is a non-null constant.
+std::shared_ptr<exec::VectorFunction> makeHyperscanMatch(
+    const std::string& name,
+    const std::vector<exec::VectorFunctionArg>& inputArgs,
+    const core::QueryConfig& config);
+
+}
diff --git a/velox/functions/lib/benchmarks/Re2FunctionsBenchmarks.cpp b/velox/functions/lib/benchmarks/Re2FunctionsBenchmarks.cpp
index 50f7c6f43ef7..d6d7b57b854e 100644
--- a/velox/functions/lib/benchmarks/Re2FunctionsBenchmarks.cpp
+++ b/velox/functions/lib/benchmarks/Re2FunctionsBenchmarks.cpp
@@ -19,6 +19,7 @@
 #include <folly/Benchmark.h>
 #include <folly/init/Init.h>
 
+#include "velox/functions/lib/HyperscanFunctions.h"
 #include "velox/functions/lib/Re2Functions.h"
 #include "velox/functions/lib/benchmarks/FunctionBenchmarkBase.h"
 #include "velox/vector/fuzzer/VectorFuzzer.h"
@@ -48,6 +49,14 @@ BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs1k, 1 << 10, "re2_match");
 BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs10k, 10 << 10, "re2_match");
 BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs100k, 100 << 10, "re2_match");
 
+BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs1k_hyperscan, 1 << 10, "hs_match");
+BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs10k_hyperscan, 10 << 10, "hs_match");
+BENCHMARK_NAMED_PARAM_MULTI(
+    regexMatch,
+    bs100k_hyperscan,
+    100 << 10,
+    "hs_match");
+
 int regexSearch(int n, int blockSize, const char* functionName) {
   return regexMatch(n, blockSize, functionName);
 }
@@ -94,6 +103,8 @@ std::shared_ptr<exec::VectorFunction> makeRegexExtract(
 void registerRe2Functions() {
   exec::registerStatefulVectorFunction(
       "re2_match", re2MatchSignatures(), makeRe2Match);
+  exec::registerStatefulVectorFunction(
+      "hs_match", re2MatchSignatures(), makeHyperscanMatch);
   exec::registerStatefulVectorFunction(
       "re2_search", re2SearchSignatures(), makeRe2Search);
   exec::registerStatefulVectorFunction(