Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
PHILO-HE committed Oct 17, 2024
1 parent 9d4cfb2 commit f0470e9
Show file tree
Hide file tree
Showing 12 changed files with 237 additions and 6 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ jobs:
- name: "Install dependencies"
if: ${{ github.event_name == 'pull_request' }}
run: source velox/scripts/setup-ubuntu.sh && install_apt_deps && install_duckdb
run: source velox/scripts/setup-ubuntu.sh && install_apt_deps && install_duckdb && install_boost

- name: Build Baseline Benchmarks
if: ${{ github.event_name == 'pull_request' }}
Expand All @@ -117,7 +117,7 @@ jobs:
submodules: 'recursive'

- name: "Install dependencies"
run: source velox/scripts/setup-ubuntu.sh && install_apt_deps
run: source velox/scripts/setup-ubuntu.sh && install_apt_deps && install_boost

- name: Build Contender Benchmarks
working-directory: velox
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/linux-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ jobs:
VELOX_DEPENDENCY_SOURCE: SYSTEM
simdjson_SOURCE: BUNDLED
xsimd_SOURCE: BUNDLED
hyperscan_SOURCE: BUNDLED
CUDA_VERSION: "12.4"
steps:
- uses: actions/checkout@v4
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/scheduled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,12 @@ jobs:
run: |
ccache -sz
- name: Setup dependencies
run: |
# Install dependencies required by PR. They have not been installed in docker.
yum update -y
yum install -y ragel
- name: Build
env:
EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_BUILD_PYTHON_PACKAGE=ON ${{ inputs.extraCMakeFlags }}"
Expand Down
42 changes: 42 additions & 0 deletions CMake/resolve_dependency_modules/hyperscan.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include_guard(GLOBAL)

# Resolve where the hyperscan sources come from. Setting the VELOX_HYPERSCAN_URL
# environment variable at configure time overrides the default release tarball.
if(DEFINED ENV{VELOX_HYPERSCAN_URL})
  set(VELOX_HYPERSCAN_SOURCE_URL "$ENV{VELOX_HYPERSCAN_URL}")
else()
  set(VELOX_HYPERSCAN_VERSION v5.4.2)
  set(VELOX_HYPERSCAN_SOURCE_URL
      "https://github.com/intel/hyperscan/archive/refs/tags/${VELOX_HYPERSCAN_VERSION}.tar.gz"
  )
  set(VELOX_HYPERSCAN_BUILD_SHA256_CHECKSUM
      32b0f24b3113bbc46b6bfaa05cf7cf45840b6b59333d078cc1f624e4c40b2b99)
endif()

# Only pass URL_HASH when a checksum is actually known. With a user-supplied
# URL no checksum is set above, and "URL_HASH SHA256=" with an empty digest is
# rejected at configure time.
set(velox_hyperscan_url_hash_args "")
if(NOT "${VELOX_HYPERSCAN_BUILD_SHA256_CHECKSUM}" STREQUAL "")
  set(velox_hyperscan_url_hash_args
      URL_HASH "SHA256=${VELOX_HYPERSCAN_BUILD_SHA256_CHECKSUM}")
endif()

message(STATUS "Building hyperscan from source")
FetchContent_Declare(
  hyperscan
  URL "${VELOX_HYPERSCAN_SOURCE_URL}" ${velox_hyperscan_url_hash_args})

# Remember the caller's C++ standard so it can be put back exactly as it was,
# including the "not set at all" case.
if(DEFINED CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD_BACKUP "${CMAKE_CXX_STANDARD}")
else()
  unset(CMAKE_CXX_STANDARD_BACKUP)
endif()

# hyperscan is built as C++11 because (per the original note here) it does not
# support C++17.
set(CMAKE_CXX_STANDARD 11)
# Settings picked up by the hyperscan build added below.
set(BUILD_EXAMPLES FALSE)
# NOTE(review): AVX512 is enabled unconditionally; this presumably requires a
# toolchain/target that supports it -- confirm for all supported platforms.
set(BUILD_AVX512 ON)
FetchContent_MakeAvailable(hyperscan)

# Expose hyperscan's src directory to consumers of the `hs` target so that
# `#include "hs.h"` resolves.
set_target_properties(
  hs PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${hyperscan_SOURCE_DIR}/src")

# Restore the C++ standard for the rest of the build.
if(DEFINED CMAKE_CXX_STANDARD_BACKUP)
  set(CMAKE_CXX_STANDARD "${CMAKE_CXX_STANDARD_BACKUP}")
else()
  unset(CMAKE_CXX_STANDARD)
endif()
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,9 @@ endif()
set_source(re2)
resolve_dependency(re2)

set_source(hyperscan)
resolve_dependency(hyperscan 5.4.2)

if(${VELOX_BUILD_PYTHON_PACKAGE})
set_source(pybind11)
resolve_dependency(pybind11 2.10.0)
Expand Down
2 changes: 1 addition & 1 deletion scripts/setup-centos9.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ function install_build_prerequisites {
dnf config-manager --set-enabled crb
dnf update -y
dnf_install ninja-build cmake ccache gcc-toolset-12 git wget which
dnf_install autoconf automake python3-devel pip libtool
dnf_install autoconf automake python3-devel pip libtool ragel

pip install cmake==3.28.3

Expand Down
2 changes: 1 addition & 1 deletion scripts/setup-macos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ NPROC=$(getconf _NPROCESSORS_ONLN)

BUILD_DUCKDB="${BUILD_DUCKDB:-true}"
DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)}
MACOS_VELOX_DEPS="bison flex gflags glog googletest icu4c libevent libsodium lz4 lzo openssl protobuf@21 snappy xz zstd"
MACOS_VELOX_DEPS="bison flex gflags glog googletest icu4c libevent libsodium lz4 lzo openssl protobuf@21 snappy xz zstd libtool automake autoconf ragel"
MACOS_BUILD_DEPS="ninja cmake"
FB_OS_VERSION="v2024.07.01.00"
FMT_VERSION="10.1.1"
Expand Down
3 changes: 2 additions & 1 deletion scripts/setup-ubuntu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ function install_velox_deps_from_apt {
bison \
flex \
libfl-dev \
tzdata
tzdata \
ragel
}

function install_fmt {
Expand Down
4 changes: 3 additions & 1 deletion velox/functions/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ velox_add_library(
ArrayShuffle.cpp
CheckDuplicateKeys.cpp
CheckNestedNulls.cpp
HyperscanFunctions.cpp
KllSketch.cpp
MapConcat.cpp
Re2Functions.cpp
Expand All @@ -47,7 +48,8 @@ velox_link_libraries(
velox_vector
velox_type_tz
re2::re2
Folly::folly)
Folly::folly
hs)

add_subdirectory(aggregates)
add_subdirectory(string)
Expand Down
136 changes: 136 additions & 0 deletions velox/functions/lib/HyperscanFunctions.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "velox/functions/lib/HyperscanFunctions.h"
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include "hs.h"
#include "velox/functions/lib/string/StringImpl.h"
#include "velox/vector/FunctionVector.h"

namespace facebook::velox::functions {
namespace {

class MatchConstantPattern final : public exec::VectorFunction {
public:
explicit MatchConstantPattern(StringView pattern) {
hs_compile_error_t* compile_err;
if (hs_compile(
pattern.data(),
// Using single match flag, which can greatly improve performance.
HS_FLAG_SINGLEMATCH | HS_FLAG_DOTALL,
HS_MODE_BLOCK,
NULL,
&database_,
&compile_err) != HS_SUCCESS) {
fprintf(
stderr,
"ERROR: Unable to compile pattern \"%s\": %s\n",
pattern.data(),
compile_err->message);
hs_free_compile_error(compile_err);
}

if (hs_alloc_scratch(database_, &scratch_) != HS_SUCCESS) {
fprintf(stderr, "ERROR: Unable to allocate scratch space. Exiting.\n");
hs_free_database(database_);
}
}

/**
* Callback function for matching case.
*/
static int eventHandler(
unsigned int id,
unsigned long long from,
unsigned long long to,
unsigned int flags,
void* ctx) {
*(bool*)ctx = true;
return 0;
}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
const TypePtr& /* outputType */,
exec::EvalCtx& context,
VectorPtr& resultRef) const final {
VELOX_CHECK_EQ(args.size(), 2);
context.ensureWritable(rows, BOOLEAN(), resultRef);
FlatVector<bool>& result = *resultRef->as<FlatVector<bool>>();
exec::LocalDecodedVector toSearch(context, *args[0], rows);

context.applyToSelectedNoThrow(rows, [&](vector_size_t i) {
StringView toSearchString = toSearch->valueAt<StringView>(i);
bool isMatched = false;
if (hs_scan(
database_,
toSearchString.data(),
toSearchString.size(),
0,
scratch_,
eventHandler,
&isMatched) != HS_SUCCESS) {
fprintf(stderr, "ERROR: Unable to scan input buffer. Exiting.\n");
hs_free_scratch(scratch_);
hs_free_database(database_);
}
result.set(i, isMatched);
});
}

private:
hs_database_t* database_;
hs_scratch_t* scratch_ = NULL;
};
} // namespace

/// Renders the types of 'inputArgs' as a single string, with consecutive
/// entries separated by ", ". Used to build argument-type error messages.
std::string printTypesCsv(
    const std::vector<exec::VectorFunctionArg>& inputArgs) {
  std::string csv;
  // Rough pre-sizing guess of ten characters per argument.
  csv.reserve(inputArgs.size() * 10);
  for (const auto& arg : inputArgs) {
    // A separator is emitted only once something has already been written.
    if (!csv.empty()) {
      csv.append(", ");
    }
    csv.append(arg.type->toString());
  }
  return csv;
}

/// Factory for the Hyperscan-backed match function.
///
/// @param name Function name, used only in error messages.
/// @param inputArgs Must be exactly (VARCHAR, VARCHAR); the second argument is
/// the pattern and must be a non-null constant.
/// @param config Unused.
/// @return A function instance with the pattern already compiled.
/// @throws An "unsupported" user error if the argument types are wrong, or if
/// the pattern is not a non-null constant.
std::shared_ptr<exec::VectorFunction> makeHyperscanMatch(
    const std::string& name,
    const std::vector<exec::VectorFunctionArg>& inputArgs,
    const core::QueryConfig& /*config*/) {
  if (inputArgs.size() != 2 || !inputArgs[0].type->isVarchar() ||
      !inputArgs[1].type->isVarchar()) {
    VELOX_UNSUPPORTED(
        "{} expected (VARCHAR, VARCHAR) but got ({})",
        name,
        printTypesCsv(inputArgs));
  }

  BaseVector* constantPattern = inputArgs[1].constantValue.get();

  if (constantPattern != nullptr && !constantPattern->isNullAt(0)) {
    return std::make_shared<MatchConstantPattern>(
        constantPattern->as<ConstantVector<StringView>>()->valueAt(0));
  }

  // TODO: support non-constant pattern.
  // A non-constant or NULL pattern is ordinary user input, not an impossible
  // state, so report it as unsupported rather than unreachable.
  VELOX_UNSUPPORTED("{} only supports a constant, non-null pattern", name);
}

} // namespace facebook::velox::functions
29 changes: 29 additions & 0 deletions velox/functions/lib/HyperscanFunctions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "velox/expression/VectorFunction.h"
#include "velox/functions/Udf.h"
#include "velox/vector/BaseVector.h"

namespace facebook::velox::functions {

/// Creates a vector function that checks whether its first (VARCHAR) argument
/// matches the regular expression given as its second (VARCHAR) argument,
/// using Hyperscan.
///
/// @param name Function name, used in error messages.
/// @param inputArgs Argument descriptions; expected to be (VARCHAR, VARCHAR)
/// with a constant, non-null pattern as the second argument. Other argument
/// lists, and non-constant or null patterns, are not supported and cause an
/// exception.
/// @param config Query configuration; not used by the current implementation.
std::shared_ptr<exec::VectorFunction> makeHyperscanMatch(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& config);

}
11 changes: 11 additions & 0 deletions velox/functions/lib/benchmarks/Re2FunctionsBenchmarks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <folly/init/Init.h>
#include <string>

#include "velox/functions/lib/HyperscanFunctions.h"
#include "velox/functions/lib/Re2Functions.h"
#include "velox/functions/lib/benchmarks/FunctionBenchmarkBase.h"
#include "velox/vector/fuzzer/VectorFuzzer.h"
Expand Down Expand Up @@ -48,6 +49,14 @@ BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs1k, 1 << 10, "re2_match");
BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs10k, 10 << 10, "re2_match");
BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs100k, 100 << 10, "re2_match");

BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs1k_hyperscan, 1 << 10, "hs_match");
BENCHMARK_NAMED_PARAM_MULTI(regexMatch, bs10k_hyperscan, 10 << 10, "hs_match");
BENCHMARK_NAMED_PARAM_MULTI(
regexMatch,
bs100k_hyperscan,
100 << 10,
"hs_match");

int regexSearch(int n, int blockSize, const char* functionName) {
return regexMatch(n, blockSize, functionName);
}
Expand Down Expand Up @@ -94,6 +103,8 @@ std::shared_ptr<exec::VectorFunction> makeRegexExtract(
void registerRe2Functions() {
exec::registerStatefulVectorFunction(
"re2_match", re2MatchSignatures(), makeRe2Match);
exec::registerStatefulVectorFunction(
"hs_match", re2MatchSignatures(), makeHyperscanMatch);
exec::registerStatefulVectorFunction(
"re2_search", re2SearchSignatures(), makeRe2Search);
exec::registerStatefulVectorFunction(
Expand Down

0 comments on commit f0470e9

Please sign in to comment.