From 066a72fd78082adaca62452bb8bf68a58e5602ff Mon Sep 17 00:00:00 2001
From: "Ma, Rong"
Date: Wed, 22 May 2024 21:22:19 -0700
Subject: [PATCH] Support complex types in sparksql hash and xxhash64 function
 (#9414)

Summary:
Currently, the sparksql hash functions only support primitive types. This
patch adds an implementation for complex types, including array, map and row.
The expected results in the unit tests are obtained from Spark's output.

Spark's implementation:
https://github.com/apache/spark/blob/a2b7050e0fc5db6ac98db57309e4737acd26bf3a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L536-L609

To support hashing for complex types and align with Spark's implementation,
this patch uses a per-row virtual function call, and the function is
implemented as a vector function rather than a simple function. Below are
some notes from the benchmark results:

Virtual function call per row vs. type switch per row: the virtual function
call performs 15% better because it executes 20% fewer instructions. The
switch statement involves more branch instructions, but both methods have a
similar branch misprediction rate of 2.8%. The switch statement doesn't show
a higher misprediction rate because its fixed pattern allows the BPU to
handle it effectively. However, if the schema becomes very complex and
exceeds the BPU's history track buffer (currently at 1000 levels), the
misprediction rate may increase.

VectorFunction vs. simple function: since the function doesn't apply default
null behavior, with a simple function the null check for each field occurs
within the per-row call. In contrast, a vector function first filters out
the null values per column, avoiding null checks in the top-level loop. By
evaluating the implementation across all null ratios for the simple/vector
functions, we observed that the simple function can take up to 3.5 times
longer than the vector function. Checking for null values row by row within
the loop can lead to a high branch misprediction ratio due to the randomness
of null values, while the vector function maintains a consistent branch
misprediction ratio across all null ratios.

Pull Request resolved: https://github.com/facebookincubator/velox/pull/9414

Reviewed By: mbasmanova

Differential Revision: D56783038

Pulled By: pedroerp

fbshipit-source-id: 0238f0e88f7f395c41e976003a138cddba3bd093
---
 velox/docs/functions/spark/binary.rst         |   9 -
 velox/functions/sparksql/Hash.cpp             | 401 ++++++++++++++----
 .../sparksql/benchmarks/CMakeLists.txt        |   4 +
 .../sparksql/benchmarks/CompareBenchmark.cpp  |   1 +
 .../sparksql/benchmarks/HashBenchmark.cpp     |  53 +++
 velox/functions/sparksql/tests/HashTest.cpp   |  99 +++++
 .../functions/sparksql/tests/XxHash64Test.cpp | 106 +++++
 7 files changed, 586 insertions(+), 87 deletions(-)
 create mode 100644 velox/functions/sparksql/benchmarks/HashBenchmark.cpp

diff --git a/velox/docs/functions/spark/binary.rst b/velox/docs/functions/spark/binary.rst
index 4f76b29c1148..249c7f4ee24f 100644
--- a/velox/docs/functions/spark/binary.rst
+++ b/velox/docs/functions/spark/binary.rst
@@ -10,30 +10,21 @@ Binary Functions
     Computes the hash of one or more input values using seed value of 42.
     For multiple arguments, their types can be different.
 
-    Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP.
-
 .. spark:function:: hash_with_seed(seed, x, ...) -> integer
 
     Computes the hash of one or more input values using specified seed.
     For multiple arguments, their types can be different.
-    Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP.
-
 .. spark:function:: xxhash64(x, ...) -> bigint
 
     Computes the xxhash64 of one or more input values using seed value of 42.
     For multiple arguments, their types can be different.
 
-    Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP.
-
 .. spark:function:: xxhash64_with_seed(seed, x, ...) -> bigint
 
     Computes the xxhash64 of one or more input values using specified seed.
     For multiple arguments, their types can be different.
 
-    Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP.
-
 .. spark:function:: md5(x) -> varbinary
 
diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp
index 8ad8b4e7f083..d20cf5d4c8b5 100644
--- a/velox/functions/sparksql/Hash.cpp
+++ b/velox/functions/sparksql/Hash.cpp
@@ -26,21 +26,262 @@ namespace {
 
 const int32_t kDefaultSeed = 42;
 
+// Computes the hash value of input using the hash function in HashClass.
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    int32_t input,
+    typename HashClass::SeedType seed) {
+  return HashClass::hashInt32(input, seed);
+}
+
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    int64_t input,
+    typename HashClass::SeedType seed) {
+  return HashClass::hashInt64(input, seed);
+}
+
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    float input,
+    typename HashClass::SeedType seed) {
+  return HashClass::hashFloat(input, seed);
+}
+
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    double input,
+    typename HashClass::SeedType seed) {
+  return HashClass::hashDouble(input, seed);
+}
+
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    int128_t input,
+    typename HashClass::SeedType seed) {
+  return HashClass::hashLongDecimal(input, seed);
+}
+
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    Timestamp input,
+    typename HashClass::SeedType seed) {
+  return HashClass::hashTimestamp(input, seed);
+}
+
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    StringView input,
+    typename HashClass::SeedType seed) {
+  return HashClass::hashBytes(input, seed);
+}
+
+template <typename HashClass, TypeKind kind>
+class PrimitiveVectorHasher;
+
+template <typename HashClass>
+class ArrayVectorHasher;
+
+template <typename HashClass>
+class MapVectorHasher;
+
+template <typename HashClass>
+class RowVectorHasher;
+
+// Class to compute hashes identical to one produced by Spark.
+// Hashes are computed using the algorithm implemented in HashClass.
+template <typename HashClass>
+class SparkVectorHasher {
+ public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
+  SparkVectorHasher(DecodedVector& decoded) : decoded_(decoded) {}
+
+  virtual ~SparkVectorHasher() = default;
+
+  // Compute the hash value of input vector at index.
+  ReturnType hashAt(vector_size_t index, SeedType seed) {
+    if (decoded_.isNullAt(index)) {
+      return seed;
+    }
+    return hashNotNullAt(index, seed);
+  }
+
+  // Compute the hash value of input vector at index for non-null values.
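+  // The seed carries the running hash accumulated so far: complex-type
+  // hashers fold each element's hash into the seed of the next element,
+  // and hashAt() returns the seed unchanged for nulls, matching Spark,
+  // which skips null fields rather than hashing them.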
+  virtual ReturnType hashNotNullAt(vector_size_t index, SeedType seed) = 0;
+
+ protected:
+  const DecodedVector& decoded_;
+};
+
+template <typename HashClass, TypeKind kind>
+std::shared_ptr<SparkVectorHasher<HashClass>> createPrimitiveVectorHasher(
+    DecodedVector& decoded) {
+  return std::make_shared<PrimitiveVectorHasher<HashClass, kind>>(decoded);
+}
+
+template <typename HashClass>
+std::shared_ptr<SparkVectorHasher<HashClass>> createVectorHasher(
+    DecodedVector& decoded) {
+  switch (decoded.base()->typeKind()) {
+    case TypeKind::ARRAY:
+      return std::make_shared<ArrayVectorHasher<HashClass>>(decoded);
+    case TypeKind::MAP:
+      return std::make_shared<MapVectorHasher<HashClass>>(decoded);
+    case TypeKind::ROW:
+      return std::make_shared<RowVectorHasher<HashClass>>(decoded);
+    default:
+      return VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH(
+          createPrimitiveVectorHasher,
+          HashClass,
+          decoded.base()->typeKind(),
+          decoded);
+  }
+}
+
+template <typename HashClass, TypeKind kind>
+class PrimitiveVectorHasher : public SparkVectorHasher<HashClass> {
+ public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
+  PrimitiveVectorHasher(DecodedVector& decoded)
+      : SparkVectorHasher<HashClass>(decoded) {}
+
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+    return hashOne<HashClass>(
+        this->decoded_.template valueAt<typename TypeTraits<kind>::NativeType>(
+            index),
+        seed);
+  }
+};
+
+template <typename HashClass>
+class ArrayVectorHasher : public SparkVectorHasher<HashClass> {
+ public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
+  ArrayVectorHasher(DecodedVector& decoded)
+      : SparkVectorHasher<HashClass>(decoded) {
+    base_ = decoded.base()->as<ArrayVector>();
+    indices_ = decoded.indices();
+
+    SelectivityVector rows(base_->elements()->size());
+    decodedElements_.decode(*base_->elements(), rows);
+    elementHasher_ = createVectorHasher<HashClass>(decodedElements_);
+  }
+
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+    auto size = base_->sizeAt(indices_[index]);
+    auto offset = base_->offsetAt(indices_[index]);
+
+    ReturnType result = seed;
+    for (auto i = 0; i < size; ++i) {
+      result = elementHasher_->hashAt(i + offset, result);
+    }
+    return result;
+  }
+
+ private:
+  const ArrayVector* base_;
+  const int32_t* indices_;
+  DecodedVector decodedElements_;
+  std::shared_ptr<SparkVectorHasher<HashClass>> elementHasher_;
+};
+
+template <typename HashClass>
+class MapVectorHasher : public SparkVectorHasher<HashClass> {
+ public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
+  MapVectorHasher(DecodedVector& decoded)
+      : SparkVectorHasher<HashClass>(decoded) {
+    base_ = decoded.base()->as<MapVector>();
+    indices_ = decoded.indices();
+
+    SelectivityVector rows(base_->mapKeys()->size());
+    decodedKeys_.decode(*base_->mapKeys(), rows);
+    decodedValues_.decode(*base_->mapValues(), rows);
+    keyHasher_ = createVectorHasher<HashClass>(decodedKeys_);
+    valueHasher_ = createVectorHasher<HashClass>(decodedValues_);
+  }
+
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+    auto size = base_->sizeAt(indices_[index]);
+    auto offset = base_->offsetAt(indices_[index]);
+
+    ReturnType result = seed;
+    for (auto i = 0; i < size; ++i) {
+      result = keyHasher_->hashAt(i + offset, result);
+      result = valueHasher_->hashAt(i + offset, result);
+    }
+    return result;
+  }
+
+ private:
+  const MapVector* base_;
+  const int32_t* indices_;
+  DecodedVector decodedKeys_;
+  DecodedVector decodedValues_;
+  std::shared_ptr<SparkVectorHasher<HashClass>> keyHasher_;
+  std::shared_ptr<SparkVectorHasher<HashClass>> valueHasher_;
+};
+
+template <typename HashClass>
+class RowVectorHasher : public SparkVectorHasher<HashClass> {
+ public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
+  RowVectorHasher(DecodedVector& decoded)
+      : SparkVectorHasher<HashClass>(decoded) {
+    base_ = decoded.base()->as<RowVector>();
+    indices_ = decoded.indices();
+
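+    // Decode every child vector once in the constructor so that per-row
+    // hashing below does only indexed reads plus one virtual call per child.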
+    SelectivityVector rows(base_->size());
+    decodedChildren_.resize(base_->childrenSize());
+    hashers_.resize(base_->childrenSize());
+    for (auto i = 0; i < base_->childrenSize(); ++i) {
+      decodedChildren_[i].decode(*base_->childAt(i), rows);
+      hashers_[i] = createVectorHasher<HashClass>(decodedChildren_[i]);
+    }
+  }
+
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+    ReturnType result = seed;
+    for (auto i = 0; i < base_->childrenSize(); ++i) {
+      result = hashers_[i]->hashAt(indices_[index], result);
+    }
+    return result;
+  }
+
+ private:
+  const RowVector* base_;
+  const int32_t* indices_;
+  std::vector<DecodedVector> decodedChildren_;
+  std::vector<std::shared_ptr<SparkVectorHasher<HashClass>>> hashers_;
+};
+
 // ReturnType can be either int32_t or int64_t
 // HashClass contains the function like hashInt32
-template <typename HashClass, typename SeedType, typename ReturnType>
+template <
+    typename HashClass,
+    typename SeedType = typename HashClass::SeedType,
+    typename ReturnType = typename HashClass::ReturnType>
 void applyWithType(
     const SelectivityVector& rows,
     std::vector<VectorPtr>& args, // Not using const ref so we can reuse args
     std::optional<SeedType> seed,
     exec::EvalCtx& context,
     VectorPtr& resultRef) {
-  HashClass hash;
   size_t hashIdx = seed ? 1 : 0;
   SeedType hashSeed = seed ? *seed : kDefaultSeed;
 
   auto& result = *resultRef->as<FlatVector<ReturnType>>();
-  rows.applyToSelected([&](int row) { result.set(row, hashSeed); });
+  rows.applyToSelected([&](auto row) { result.set(row, hashSeed); });
 
   exec::LocalSelectivityVector selectedMinusNulls(context);
 
@@ -54,36 +295,16 @@ void applyWithType(
           decoded->nulls(&rows), rows.begin(), rows.end());
       selected = selectedMinusNulls.get();
     }
-    switch (args[i]->type()->kind()) {
-// Derived from InterpretedHashFunction.hash:
-// https://github.com/apache/spark/blob/382b66e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L532
-#define CASE(typeEnum, hashFn, inputType)                                     \
-  case TypeKind::typeEnum:                                                    \
-    selected->applyToSelected([&](int row) {                                  \
-      result.set(                                                             \
-          row, hashFn(decoded->valueAt<inputType>(row), result.valueAt(row))); \
-    });                                                                       \
-    break;
-      CASE(BOOLEAN, hash.hashInt32, bool);
-      CASE(TINYINT, hash.hashInt32, int8_t);
-      CASE(SMALLINT, hash.hashInt32, int16_t);
-      CASE(INTEGER, hash.hashInt32, int32_t);
-      CASE(BIGINT, hash.hashInt64, int64_t);
-      CASE(VARCHAR, hash.hashBytes, StringView);
-      CASE(VARBINARY, hash.hashBytes, StringView);
-      CASE(REAL, hash.hashFloat, float);
-      CASE(DOUBLE, hash.hashDouble, double);
-      CASE(HUGEINT, hash.hashLongDecimal, int128_t);
-      CASE(TIMESTAMP, hash.hashTimestamp, Timestamp);
-#undef CASE
-      default:
-        VELOX_NYI(
-            "Unsupported type for HASH(): {}", args[i]->type()->toString());
-    }
+
+    auto hasher = createVectorHasher<HashClass>(*decoded);
+    selected->applyToSelected([&](auto row) {
+      result.set(row, hasher->hashNotNullAt(row, result.valueAt(row)));
+    });
   }
 }
 
-// Derived from src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java.
+// Derived from
+// src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java.
 //
 // Spark's Murmur3 seems slightly different from the original from Austin
 // Appleby: in particular the fmix function's first line is different. The
@@ -95,13 +316,16 @@ void applyWithType(
 
 class Murmur3Hash final {
  public:
-  uint32_t hashInt32(int32_t input, uint32_t seed) {
+  using SeedType = int32_t;
+  using ReturnType = int32_t;
+
+  static uint32_t hashInt32(int32_t input, uint32_t seed) {
     uint32_t k1 = mixK1(input);
     uint32_t h1 = mixH1(seed, k1);
     return fmix(h1, 4);
   }
 
-  uint32_t hashInt64(uint64_t input, uint32_t seed) {
+  static uint32_t hashInt64(uint64_t input, uint32_t seed) {
     uint32_t low = input;
     uint32_t high = input >> 32;
 
@@ -116,19 +340,19 @@ class Murmur3Hash final {
 
   // Floating point numbers are hashed as if they are integers, with
   // -0f defined to have the same output as +0f.
-  uint32_t hashFloat(float input, uint32_t seed) {
+  static uint32_t hashFloat(float input, uint32_t seed) {
     return hashInt32(
         input == -0.f ? 0 : *reinterpret_cast<int32_t*>(&input), seed);
   }
 
-  uint32_t hashDouble(double input, uint32_t seed) {
+  static uint32_t hashDouble(double input, uint32_t seed) {
     return hashInt64(
         input == -0. ? 0 : *reinterpret_cast<int64_t*>(&input), seed);
   }
 
   // Spark also has an hashUnsafeBytes2 function, but it was not used at the
   // time of implementation.
-  uint32_t hashBytes(const StringView& input, uint32_t seed) {
+  static uint32_t hashBytes(const StringView& input, uint32_t seed) {
     const char* i = input.data();
     const char* const end = input.data() + input.size();
     uint32_t h1 = seed;
@@ -141,25 +365,25 @@ class Murmur3Hash final {
     return fmix(h1, input.size());
   }
 
-  uint32_t hashLongDecimal(int128_t input, uint32_t seed) {
+  static uint32_t hashLongDecimal(int128_t input, uint32_t seed) {
     char out[sizeof(int128_t)];
     int32_t length = DecimalUtil::toByteArray(input, out);
     return hashBytes(StringView(out, length), seed);
   }
 
-  uint32_t hashTimestamp(Timestamp input, uint32_t seed) {
+  static uint32_t hashTimestamp(Timestamp input, uint32_t seed) {
     return hashInt64(input.toMicros(), seed);
   }
 
 private:
-  uint32_t mixK1(uint32_t k1) {
+  static uint32_t mixK1(uint32_t k1) {
    k1 *= 0xcc9e2d51;
    k1 = bits::rotateLeft(k1, 15);
    k1 *= 0x1b873593;
    return k1;
   }
 
-  uint32_t mixH1(uint32_t h1, uint32_t k1) {
+  static uint32_t mixH1(uint32_t h1, uint32_t k1) {
    h1 ^= k1;
    h1 = bits::rotateLeft(h1, 13);
    h1 = h1 * 5 + 0xe6546b64;
@@ -167,7 +391,7 @@ class Murmur3Hash final {
   }
 
   // Finalization mix - force all bits of a hash block to avalanche
-  uint32_t fmix(uint32_t h1, uint32_t length) {
+  static uint32_t fmix(uint32_t h1, uint32_t length) {
    h1 ^= length;
    h1 ^= h1 >> 16;
    h1 *= 0x85ebca6b;
@@ -190,7 +414,7 @@ class Murmur3HashFunction final : public exec::VectorFunction {
       exec::EvalCtx& context,
       VectorPtr& resultRef) const final {
     context.ensureWritable(rows, INTEGER(), resultRef);
-    applyWithType<Murmur3Hash, int32_t, int32_t>(rows, args, seed_, context, resultRef);
+    applyWithType<Murmur3Hash>(rows, args, seed_, context, resultRef);
   }
 
  private:
@@ -198,21 +422,18 @@ class Murmur3HashFunction final : public exec::VectorFunction {
 };
 
 class XxHash64 final {
-  const uint64_t PRIME64_1 = 0x9E3779B185EBCA87L;
-  const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FL;
-  const uint64_t PRIME64_3 = 0x165667B19E3779F9L;
-  const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63L;
-  const uint64_t PRIME64_5 = 0x27D4EB2F165667C5L;
-
  public:
-  int64_t hashInt32(const int32_t input, uint64_t seed) {
+  using SeedType = int64_t;
+  using ReturnType = int64_t;
+
+  static uint64_t hashInt32(const int32_t input, uint64_t seed) {
    int64_t hash = seed + PRIME64_5 + 4L;
    hash ^= static_cast<int64_t>((input & 0xFFFFFFFFL) * PRIME64_1);
    hash = bits::rotateLeft64(hash, 23) * PRIME64_2 + PRIME64_3;
    return fmix(hash);
   }
 
-  int64_t hashInt64(int64_t input, uint64_t seed) {
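+  // Hashes the input as a single 8-byte block: mix with the 64-bit primes,
+  // rotate, then avalanche the bits via fmix.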
+  static uint64_t hashInt64(int64_t input, uint64_t seed) {
     int64_t hash = seed + PRIME64_5 + 8L;
     hash ^= bits::rotateLeft64(input * PRIME64_2, 31) * PRIME64_1;
     hash = bits::rotateLeft64(hash, 27) * PRIME64_1 + PRIME64_4;
@@ -221,17 +442,17 @@ class XxHash64 final {
 
   // Floating point numbers are hashed as if they are integers, with
   // -0f defined to have the same output as +0f.
-  int64_t hashFloat(float input, uint64_t seed) {
+  static uint64_t hashFloat(float input, uint64_t seed) {
     return hashInt32(
         input == -0.f ? 0 : *reinterpret_cast<int32_t*>(&input), seed);
   }
 
-  int64_t hashDouble(double input, uint64_t seed) {
+  static uint64_t hashDouble(double input, uint64_t seed) {
     return hashInt64(
         input == -0. ? 0 : *reinterpret_cast<int64_t*>(&input), seed);
   }
 
-  uint64_t hashBytes(const StringView& input, uint64_t seed) {
+  static uint64_t hashBytes(const StringView& input, uint64_t seed) {
     const char* i = input.data();
     const char* const end = input.data() + input.size();
 
@@ -253,18 +474,24 @@ class XxHash64 final {
     return fmix(hash);
   }
 
-  int64_t hashLongDecimal(int128_t input, uint32_t seed) {
+  static uint64_t hashLongDecimal(int128_t input, uint64_t seed) {
     char out[sizeof(int128_t)];
     int32_t length = DecimalUtil::toByteArray(input, out);
     return hashBytes(StringView(out, length), seed);
   }
 
-  int64_t hashTimestamp(Timestamp input, uint32_t seed) {
+  static uint64_t hashTimestamp(Timestamp input, uint64_t seed) {
     return hashInt64(input.toMicros(), seed);
   }
 
 private:
-  uint64_t fmix(uint64_t hash) {
+  static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87L;
+  static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FL;
+  static const uint64_t PRIME64_3 = 0x165667B19E3779F9L;
+  static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63L;
+  static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5L;
+
+  static uint64_t fmix(uint64_t hash) {
    hash ^= hash >> 33;
    hash *= PRIME64_2;
    hash ^= hash >> 29;
@@ -273,7 +500,7 @@ class XxHash64 final {
    return hash;
   }
 
-  uint64_t hashBytesByWords(const StringView& input, uint64_t seed) {
+  static uint64_t hashBytesByWords(const StringView& input, uint64_t seed) {
     const char* i = input.data();
     const char* const end = input.data() + input.size();
     uint32_t length = input.size();
@@ -353,13 +580,52 @@ class XxHash64Function final : public exec::VectorFunction {
       exec::EvalCtx& context,
       VectorPtr& resultRef) const final {
     context.ensureWritable(rows, BIGINT(), resultRef);
-    applyWithType<XxHash64, int64_t, int64_t>(rows, args, seed_, context, resultRef);
+    applyWithType<XxHash64>(rows, args, seed_, context, resultRef);
   }
 
  private:
   const std::optional<int64_t> seed_;
 };
 
+bool checkHashElementType(const TypePtr& type) {
+  switch (type->kind()) {
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+    case TypeKind::SMALLINT:
+    case TypeKind::INTEGER:
+    case TypeKind::BIGINT:
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY:
+    case TypeKind::REAL:
+    case TypeKind::DOUBLE:
+    case TypeKind::HUGEINT:
+    case TypeKind::TIMESTAMP:
+      return true;
+    case TypeKind::ARRAY:
+      return checkHashElementType(type->asArray().elementType());
+    case TypeKind::MAP:
+      return checkHashElementType(type->asMap().keyType()) &&
+          checkHashElementType(type->asMap().valueType());
+    case TypeKind::ROW: {
+      const auto& children = type->asRow().children();
+      return std::all_of(
+          children.begin(), children.end(), [](const auto& child) {
+            return checkHashElementType(child);
+          });
+    }
+    default:
+      return false;
+  }
+}
+
+void checkArgTypes(const std::vector<exec::VectorFunctionArg>& args) {
+  for (const auto& arg : args) {
+    if (!checkHashElementType(arg.type)) {
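+      // checkHashElementType() recurses into array/map/row types, so this
+      // rejects any argument whose nested element type is not hashable.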
+      VELOX_USER_FAIL("Unsupported type for hash: {}", arg.type->toString())
+    }
+  }
+}
+
 } // namespace
 
 // Not all types are supported by now. Check types when making hash function.
@@ -372,27 +638,6 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> hashSignatures() {
       .build()};
 }
 
-void checkArgTypes(const std::vector<exec::VectorFunctionArg>& args) {
-  for (const auto& arg : args) {
-    switch (arg.type->kind()) {
-      case TypeKind::BOOLEAN:
-      case TypeKind::TINYINT:
-      case TypeKind::SMALLINT:
-      case TypeKind::INTEGER:
-      case TypeKind::BIGINT:
-      case TypeKind::VARCHAR:
-      case TypeKind::VARBINARY:
-      case TypeKind::REAL:
-      case TypeKind::DOUBLE:
-      case TypeKind::HUGEINT:
-      case TypeKind::TIMESTAMP:
-        break;
-      default:
-        VELOX_USER_FAIL("Unsupported type for hash: {}", arg.type->toString())
-    }
-  }
-}
-
 std::shared_ptr<exec::VectorFunction> makeHash(
     const std::string& name,
     const std::vector<exec::VectorFunctionArg>& inputArgs,
diff --git a/velox/functions/sparksql/benchmarks/CMakeLists.txt b/velox/functions/sparksql/benchmarks/CMakeLists.txt
index 5ccf7e88745a..837c983b7df8 100644
--- a/velox/functions/sparksql/benchmarks/CMakeLists.txt
+++ b/velox/functions/sparksql/benchmarks/CMakeLists.txt
@@ -27,3 +27,7 @@ target_link_libraries(
 add_executable(velox_sparksql_benchmarks_compare CompareBenchmark.cpp)
 target_link_libraries(velox_sparksql_benchmarks_compare velox_functions_spark
                       velox_benchmark_builder velox_vector_test_lib)
+
+add_executable(velox_sparksql_benchmarks_hash HashBenchmark.cpp)
+target_link_libraries(velox_sparksql_benchmarks_hash velox_functions_spark
+                      velox_benchmark_builder velox_vector_test_lib)
diff --git a/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp b/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp
index 19b6f1262512..e4d3c9f3f608 100644
--- a/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp
+++ b/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp
@@ -26,6 +26,7 @@ using namespace facebook::velox;
 
 int main(int argc, char** argv) {
   folly::Init init(&argc, &argv);
+  memory::MemoryManager::initialize({});
   functions::sparksql::registerFunctions("");
 
   ExpressionBenchmarkBuilder benchmarkBuilder;
diff --git a/velox/functions/sparksql/benchmarks/HashBenchmark.cpp b/velox/functions/sparksql/benchmarks/HashBenchmark.cpp
new file mode 100644
index 000000000000..97cf592e3b81
--- /dev/null
+++ b/velox/functions/sparksql/benchmarks/HashBenchmark.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "velox/benchmarks/ExpressionBenchmarkBuilder.h"
+#include "velox/functions/sparksql/Register.h"
+
+using namespace facebook;
+
+using namespace facebook::velox;
+
+int main(int argc, char** argv) {
+  folly::Init init(&argc, &argv);
+  memory::MemoryManager::initialize({});
+  functions::sparksql::registerFunctions("");
+
+  ExpressionBenchmarkBuilder benchmarkBuilder;
+
+  std::vector<TypePtr> inputTypes = {
+      ARRAY(MAP(INTEGER(), VARCHAR())),
+      ROW({"f_map", "f_array"}, {MAP(INTEGER(), VARCHAR()), ARRAY(INTEGER())}),
+  };
+
+  for (auto& inputType : inputTypes) {
+    benchmarkBuilder
+        .addBenchmarkSet(
+            fmt::format("hash_{}", inputType->toString()),
+            ROW({"c0"}, {inputType}))
+        .withFuzzerOptions({.vectorSize = 1000, .nullRatio = 0.1})
+        .addExpression("hash", "hash(c0)")
+        .addExpression("xxhash64", "xxhash64(c0)")
+        .withIterations(100);
+  }
+
+  benchmarkBuilder.registerBenchmarks();
+  folly::runBenchmarks();
+  return 0;
+}
diff --git a/velox/functions/sparksql/tests/HashTest.cpp b/velox/functions/sparksql/tests/HashTest.cpp
index 422d63643e56..58a9d4565c0d 100644
--- a/velox/functions/sparksql/tests/HashTest.cpp
+++ b/velox/functions/sparksql/tests/HashTest.cpp
@@ -18,6 +18,8 @@
 
 #include <optional>
 
+using facebook::velox::test::assertEqualVectors;
+
 namespace facebook::velox::functions::sparksql::test {
 namespace {
 
@@ -27,6 +29,10 @@ class HashTest : public SparkFunctionBaseTest {
   std::optional<int32_t> hash(std::optional<T> arg) {
     return evaluateOnce<int32_t>("hash(c0)", arg);
   }
+
+  VectorPtr hash(VectorPtr vector) {
+    return evaluate("hash(c0)", makeRowVector({vector}));
+  }
 };
 
 TEST_F(HashTest, String) {
@@ -128,5 +134,98 @@ TEST_F(HashTest, Float) {
   EXPECT_EQ(hash(-limits::infinity()), 427440766);
 }
 
+TEST_F(HashTest, array) {
+  assertEqualVectors(
+      makeFlatVector<int32_t>({2101165938, 42, 1045631400}),
+      hash(makeArrayVector<int64_t>({{1, 2, 3, 4, 5}, {}, {1, 2, 3}})));
+
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-559580957, 1765031574, 42}),
+      hash(makeNullableArrayVector<int32_t>(
+          {{1, std::nullopt}, {std::nullopt, 2}, {std::nullopt}})));
+
+  // Nested array.
+  {
+    auto arrayVector = makeNestedArrayVectorFromJson<int64_t>(
+        {"[[1, null, 2, 3], [4, 5]]",
+         "[[1, null, 2, 3], [6, 7, 8]]",
+         "[[]]",
+         "[[null]]",
+         "[null]"});
+    assertEqualVectors(
+        makeFlatVector<int32_t>({2101165938, -992561130, 42, 42, 42}),
+        hash(arrayVector));
+  }
+
+  // Array of map.
+  {
+    using S = StringView;
+    using P = std::pair<int32_t, std::optional<S>>;
+    std::vector<P> a{P{1, S{"a"}}, P{2, std::nullopt}};
+    std::vector<P> b{P{3, S{"c"}}};
+    std::vector<std::vector<std::vector<P>>> data = {{a, b}};
+    auto arrayVector = makeArrayOfMapVector(data);
+    assertEqualVectors(
+        makeFlatVector<int32_t>(std::vector<int32_t>{-718462205}),
+        hash(arrayVector));
+  }
+
+  // Array of row.
+  {
+    std::vector<std::vector<std::optional<std::tuple<int32_t, std::string>>>>
+        data = {
+            {{{1, "red"}}, {{2, "blue"}}, {{3, "green"}}},
+            {{{1, "red"}}, std::nullopt, {{3, "green"}}},
+            {std::nullopt},
+        };
+    auto arrayVector = makeArrayOfRowVector(data, ROW({INTEGER(), VARCHAR()}));
+    assertEqualVectors(
+        makeFlatVector<int32_t>({-1458343314, 551500425, 42}),
+        hash(arrayVector));
+  }
+}
+
+TEST_F(HashTest, map) {
+  auto mapVector = makeMapVector<int32_t, double>(
+      {{{1, 17.0}, {2, 36.0}, {3, 8.0}, {4, 28.0}, {5, 24.0}, {6, 32.0}}});
+  assertEqualVectors(
+      makeFlatVector<int32_t>(std::vector<int32_t>{1263683448}),
+      hash(mapVector));
+
+  auto mapOfArrays = createMapOfArraysVector(
+      {{{1, {{1, 2, 3}}}}, {{2, {{4, 5, 6}}}}, {{3, {{7, 8, 9}}}}});
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-1818148947, 529298908, 825098912}),
+      hash(mapOfArrays));
+
+  auto mapWithNullArrays = createMapOfArraysVector(
+      {{{1, std::nullopt}}, {{2, {{4, 5, std::nullopt}}}}, {{3, {{}}}}});
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-1712319331, 2060637564, 519220707}),
+      hash(mapWithNullArrays));
+}
+
+TEST_F(HashTest, row) {
+  auto row = makeRowVector({
+      makeFlatVector<int64_t>({1, 3}),
+      makeFlatVector<int64_t>({2, 4}),
+  });
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-1181176833, 1717636039}), hash(row));
+
+  row = makeRowVector({
+      makeNullableFlatVector<int64_t>({1, std::nullopt}),
+      makeNullableFlatVector<int64_t>({std::nullopt, 4}),
+  });
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-1712319331, 1344313940}), hash(row));
+
+  row->setNull(0, true);
+  assertEqualVectors(makeFlatVector<int32_t>({42, 1344313940}), hash(row));
+
+  row->setNull(1, true);
+  assertEqualVectors(makeFlatVector<int32_t>({42, 42}), hash(row));
+}
+
 } // namespace
 } // namespace facebook::velox::functions::sparksql::test
diff --git a/velox/functions/sparksql/tests/XxHash64Test.cpp b/velox/functions/sparksql/tests/XxHash64Test.cpp
index 09162f4a0279..6e086ffd918f 100644
--- a/velox/functions/sparksql/tests/XxHash64Test.cpp
+++ b/velox/functions/sparksql/tests/XxHash64Test.cpp
@@ -18,6 +18,8 @@
 
 #include <optional>
 
+using facebook::velox::test::assertEqualVectors;
+
 namespace facebook::velox::functions::sparksql::test {
 namespace {
 class XxHash64Test : public SparkFunctionBaseTest {
@@ -26,6 +28,10 @@ class XxHash64Test : public SparkFunctionBaseTest {
   std::optional<int64_t> xxhash64(std::optional<T> arg) {
     return evaluateOnce<int64_t>("xxhash64(c0)", arg);
   }
+
+  VectorPtr xxhash64(VectorPtr vector) {
+    return evaluate("xxhash64(c0)", makeRowVector({vector}));
+  }
 };
 
 // The expected result was obtained by running SELECT xxhash64("Spark") query
@@ -138,6 +144,106 @@ TEST_F(XxHash64Test, float) {
   EXPECT_EQ(xxhash64(-limits::infinity()), -7580553461823983095);
 }
 
+TEST_F(XxHash64Test, array) {
+  assertEqualVectors(
+      makeFlatVector<int64_t>({-6041664978295882827, 42, 4904562767517797033}),
+      xxhash64(makeArrayVector<int64_t>({{1, 2, 3, 4, 5}, {}, {1, 2, 3}})));
+
+  assertEqualVectors(
+      makeFlatVector<int64_t>({-6698625589789238999, 8420071140774656230, 42}),
+      xxhash64(makeNullableArrayVector<int32_t>(
+          {{1, std::nullopt}, {std::nullopt, 2}, {std::nullopt}})));
+
+  // Nested array.
+  {
+    auto arrayVector = makeNestedArrayVectorFromJson<int64_t>(
+        {"[[1, null, 2, 3], [4, 5]]",
+         "[[1, null, 2, 3], [6, 7, 8]]",
+         "[[]]",
+         "[[null]]",
+         "[null]"});
+    assertEqualVectors(
+        makeFlatVector<int64_t>(
+            {-6041664978295882827, -1052942565807509112, 42, 42, 42}),
+        xxhash64(arrayVector));
+  }
+
+  // Array of map.
+  {
+    using S = StringView;
+    using P = std::pair<int32_t, std::optional<S>>;
+    std::vector<P> a{P{1, S{"a"}}, P{2, std::nullopt}};
+    std::vector<P> b{P{3, S{"c"}}};
+    std::vector<std::vector<std::vector<P>>> data = {{a, b}};
+    auto arrayVector = makeArrayOfMapVector(data);
+    assertEqualVectors(
+        makeFlatVector<int64_t>(std::vector<int64_t>{2880747995994395223}),
+        xxhash64(arrayVector));
+  }
+
+  // Array of row.
+  {
+    std::vector<std::vector<std::optional<std::tuple<int32_t, std::string>>>>
+        data = {
+            {{{1, "red"}}, {{2, "blue"}}, {{3, "green"}}},
+            {{{1, "red"}}, std::nullopt, {{3, "green"}}},
+            {std::nullopt},
+        };
+    auto arrayVector = makeArrayOfRowVector(data, ROW({INTEGER(), VARCHAR()}));
+    assertEqualVectors(
+        makeFlatVector<int64_t>(
+            {-4096178443626566478, -8973283971856715104, 42}),
+        xxhash64(arrayVector));
+  }
+}
+
+TEST_F(XxHash64Test, map) {
+  auto mapVector = makeMapVector<int32_t, double>(
+      {{{1, 17.0}, {2, 36.0}, {3, 8.0}, {4, 28.0}, {5, 24.0}, {6, 32.0}}});
+  assertEqualVectors(
+      makeFlatVector<int64_t>(std::vector<int64_t>{-6303587702533348160}),
+      xxhash64(mapVector));
+
+  auto mapOfArrays = createMapOfArraysVector(
+      {{{1, {{1, 2, 3}}}}, {{2, {{4, 5, 6}}}}, {{3, {{7, 8, 9}}}}});
+  assertEqualVectors(
+      makeFlatVector<int64_t>(
+          {-2103781794412908874, 1112887818746642853, 5787852566364222439}),
+      xxhash64(mapOfArrays));
+
+  auto mapWithNullArrays = createMapOfArraysVector(
+      {{{1, std::nullopt}}, {{2, {{4, 5, std::nullopt}}}}, {{3, {{}}}}});
+  assertEqualVectors(
+      makeFlatVector<int64_t>(
+          {-7001672635703045582, 7217681953522744649, 3188756510806108107}),
+      xxhash64(mapWithNullArrays));
+}
+
+TEST_F(XxHash64Test, row) {
+  auto row = makeRowVector({
+      makeFlatVector<int64_t>({1, 3}),
+      makeFlatVector<int64_t>({2, 4}),
+  });
+  assertEqualVectors(
+      makeFlatVector<int64_t>({-8198029865082835910, 351067884137457704}),
+      xxhash64(row));
+
+  row = makeRowVector({
+      makeNullableFlatVector<int64_t>({1, std::nullopt}),
+      makeNullableFlatVector<int64_t>({std::nullopt, 4}),
+  });
+  assertEqualVectors(
+      makeFlatVector<int64_t>({-7001672635703045582, 404280023041566627}),
+      xxhash64(row));
+
+  row->setNull(0, true);
+  assertEqualVectors(
+      makeFlatVector<int64_t>({42, 404280023041566627}), xxhash64(row));
+
+  row->setNull(1, true);
+  assertEqualVectors(makeFlatVector<int64_t>({42, 42}), xxhash64(row));
+}
+
 TEST_F(XxHash64Test, hashSeed) {
   auto xxhash64WithSeed = [&](int64_t seed, const std::optional<int64_t>& arg) {
     return evaluateOnce(