diff --git a/src/main/cpp/src/xxhash64.cu b/src/main/cpp/src/xxhash64.cu index daed7590c..471648277 100644 --- a/src/main/cpp/src/xxhash64.cu +++ b/src/main/cpp/src/xxhash64.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -33,6 +34,8 @@ namespace { using hash_value_type = int64_t; using half_size_type = int32_t; +constexpr int MAX_NESTED_DEPTH = 8; + constexpr __device__ inline int64_t rotate_bits_left_signed(hash_value_type h, int8_t r) { return (h << r) | (h >> (64 - r)) & ~(-1 << r); @@ -271,6 +274,28 @@ hash_value_type __device__ inline XXHash_64::operator()( /** * @brief Computes the hash value of a row in the given table. * + * This functor uses Spark conventions for xxhash64 hashing, which differs from + * the xxhash64 implementation used in the rest of libcudf. These differences + * include: + * - Serially using the output hash as an input seed for the next item + * - Ignorance of null values + * + * The serial use of hashes as seeds means that data of different nested types + * can exhibit hash collisions. For example, a row of an integer column + * containing a 1 will have the same hash as a lists column of integers + * containing a list of [1] and a struct column of a single integer column + * containing a struct of {1}. + * + * As a consequence of ignoring null values, inputs like [1], [1, null], and + * [null, 1] have the same hash (an expected hash collision). This kind of + * collision can also occur across a table of nullable columns and with nulls + * in structs ({1, null} and {null, 1} have the same hash). The seed value (the + * previous element's hash value) is returned as the hash if an element is + * null. + * + * For additional differences such as special tail processing and decimal type + * handling, refer to the SparkXXHash64 functor. + * * @tparam Nullate A cudf::nullate type describing whether to check for nulls. */ template @@ -296,27 +321,186 @@ class device_row_hasher { /** * @brief Computes the hash value of an element in the given column. + * + * When the column is non-nested, this is a simple wrapper around the element_hasher. + * When the column is nested, this uses a seed value to serially compute each + * nested element, with the output hash becoming the seed for the next value. + * This requires constructing a new hash functor for each nested element, + * using the new seed from the previous element's hash. The hash of a null + * element is the input seed (the previous element's hash). */ class element_hasher_adapter { public: - template ())> + class element_hasher { + private: + Nullate _check_nulls; + hash_value_type _seed; + + public: + __device__ element_hasher(Nullate check_nulls, hash_value_type seed) + : _check_nulls(check_nulls), _seed(seed) + { + } + + template ())> + __device__ hash_value_type operator()(cudf::column_device_view const& col, + cudf::size_type row_index) const noexcept + { + if (_check_nulls && col.is_null(row_index)) { return _seed; } + return XXHash_64{_seed}(col.element(row_index)); + } + + template ())> + __device__ hash_value_type operator()(cudf::column_device_view const&, + cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Unsupported type for xxhash64"); + } + }; + + template ())> __device__ hash_value_type operator()(cudf::column_device_view const& col, cudf::size_type row_index, Nullate const _check_nulls, hash_value_type const _seed) const noexcept { - if (_check_nulls && col.is_null(row_index)) { return _seed; } - auto const hasher = XXHash_64{_seed}; - return hasher(col.element(row_index)); + auto const hasher = element_hasher{_check_nulls, _seed}; + return hasher.template operator()(col, row_index); } - template ())> - __device__ hash_value_type operator()(cudf::column_device_view const&, - cudf::size_type, - Nullate const, - hash_value_type const) const noexcept + struct col_stack_frame { + private: + cudf::column_device_view _column; // the column to process + int _idx_to_process; // the index of child or element to process next + + public: + __device__ col_stack_frame() = + delete; // Because the default constructor of `cudf::column_device_view` is deleted + + __device__ col_stack_frame(cudf::column_device_view col) + : _column(std::move(col)), _idx_to_process(0) + { + } + + __device__ int get_and_inc_idx_to_process() { return _idx_to_process++; } + + __device__ int get_idx_to_process() { return _idx_to_process; } + + __device__ cudf::column_device_view get_column() { return _column; } + }; + + /** + * @brief Functor to compute hash value for nested columns. + * + * This functor uses a stack to process nested columns. It iterates through the nested columns + * in a depth-first manner. The stack is used to keep track of the nested columns that need to + * be processed. + * + * - If the current column is a list column, it replaces the list column with its most inner + * non-list child since null values can be ignored in the xxhash64 computation. + * - If the current column is a struct column, there are two cases: + * a. If the struct column has only one row, it would be treated as a struct element. The + * children of the struct element would be pushed into the stack. + * b. If the struct column has multiple rows, it would be treated as a struct column. The + * next struct element would be pushed into the stack. + * - If the current column is a primitive column, it computes the hash value. + * + * For example, consider that the input column is of type `List>`. + * Assume that the element at `row_index` is: [(1, 2.0), (3, 4.0)]. + * The sliced column is noted as L1 here. + * + * L1 List> + * | + * S1 Struct ----> `struct_column` with multiple rows + * / \ + * S1[0] S1[1] Struct ----> `struct_element` with single row + * / \ / \ + * i1 f1 i2 f2 Primitive columns + * + * List level L1: + * |Index|List> | + * |-----|-------------------------| + * |0 | [(1, 2.0), (3, 4.0)] | + * length: 1 + * Offsets: 0, 2 + * + * Struct level S1: + * |Index|Struct| + * |-----|------------------| + * |0 | (1, 2.0) | + * |1 | (3, 4.0) | + * length: 2 + * + * @tparam T Type of the column. + * @param col The column to hash. + * @param row_index The index of the row to hash. + * @param _check_nulls A flag to indicate whether to check for null values. + * @param _seed The initial seed value for the hash computation. + * @return The computed hash value. + * + * @note This function is only enabled for nested columns. + */ + template ())> + __device__ hash_value_type operator()(cudf::column_device_view const& col, + cudf::size_type row_index, + Nullate const _check_nulls, + hash_value_type const _seed) const noexcept { - CUDF_UNREACHABLE("Unsupported type for xxhash64"); + hash_value_type ret = _seed; + cudf::column_device_view curr_col = col.slice(row_index, 1); + // The default constructor of `col_stack_frame` is deleted, so it can not allocate an array + // of `col_stack_frame` directly. + // Instead leverage the byte array to create the col_stack_frame array. + alignas(col_stack_frame) char stack_wrapper[sizeof(col_stack_frame) * MAX_NESTED_DEPTH]; + auto col_stack = reinterpret_cast(stack_wrapper); + int stack_size = 0; + + col_stack[stack_size++] = col_stack_frame(curr_col); + + while (stack_size > 0) { + col_stack_frame& top = col_stack[stack_size - 1]; + curr_col = top.get_column(); + // Replace list column with its most inner non-list child + if (curr_col.type().id() == cudf::type_id::LIST) { + do { + curr_col = cudf::detail::lists_column_device_view(curr_col).get_sliced_child(); + } while (curr_col.type().id() == cudf::type_id::LIST); + col_stack[stack_size - 1] = col_stack_frame(curr_col); + continue; + } + + if (curr_col.type().id() == cudf::type_id::STRUCT) { + if (curr_col.size() <= 1) { // struct element + // All child columns processed, pop the element + if (top.get_idx_to_process() == curr_col.num_child_columns()) { + --stack_size; + } else { + // Push the next child column into the stack + col_stack[stack_size++] = + col_stack_frame(cudf::detail::structs_column_device_view(curr_col).get_sliced_child( + top.get_and_inc_idx_to_process())); + } + } else { // struct column + if (top.get_idx_to_process() == curr_col.size()) { + --stack_size; + } else { + col_stack[stack_size++] = + col_stack_frame(curr_col.slice(top.get_and_inc_idx_to_process(), 1)); + } + } + } else { // Primitive column + ret = cudf::detail::accumulate( + thrust::counting_iterator(0), + thrust::counting_iterator(curr_col.size()), + ret, + [curr_col, _check_nulls] __device__(auto hash, auto element_index) { + return cudf::type_dispatcher( + curr_col.type(), element_hasher{_check_nulls, hash}, curr_col, element_index); + }); + --stack_size; + } + } + return ret; } }; @@ -325,6 +509,40 @@ class device_row_hasher { hash_value_type const _seed; }; +void check_nested_depth(cudf::table_view const& input) +{ + using column_checker_fn_t = std::function; + + column_checker_fn_t get_nested_depth = [&](cudf::column_view const& col) { + if (col.type().id() == cudf::type_id::LIST) { + auto const child_col = cudf::lists_column_view(col).child(); + // When encountering a List of Struct column, we need to account for an extra depth, + // as both the struct column and its elements will be pushed into the stack. + if (child_col.type().id() == cudf::type_id::STRUCT) { + return 1 + get_nested_depth(child_col); + } + return get_nested_depth(child_col); + } else if (col.type().id() == cudf::type_id::STRUCT) { + int max_child_depth = 0; + for (auto child = col.child_begin(); child != col.child_end(); ++child) { + max_child_depth = std::max(max_child_depth, get_nested_depth(*child)); + } + return 1 + max_child_depth; + } else { // Primitive type + return 1; + } + }; + + for (auto i = 0; i < input.num_columns(); i++) { + cudf::column_view const& col = input.column(i); + CUDF_EXPECTS(get_nested_depth(col) <= MAX_NESTED_DEPTH, + "The " + std::to_string(i) + + "-th column exceeds the maximum allowed nested depth. " + + "Current depth: " + std::to_string(get_nested_depth(col)) + ", " + + "Maximum allowed depth: " + std::to_string(MAX_NESTED_DEPTH)); + } +} + } // namespace std::unique_ptr xxhash64(cudf::table_view const& input, @@ -343,7 +561,9 @@ std::unique_ptr xxhash64(cudf::table_view const& input, // Return early if there's nothing to hash if (input.num_columns() == 0 || input.num_rows() == 0) { return output; } - bool const nullable = has_nulls(input); + check_nested_depth(input); + + bool const nullable = has_nested_nulls(input); auto const input_view = cudf::table_device_view::create(input, stream); auto output_view = output->mutable_view(); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/Hash.java b/src/main/java/com/nvidia/spark/rapids/jni/Hash.java index 16971c5bd..2b8288286 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/Hash.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/Hash.java @@ -75,7 +75,6 @@ public static ColumnVector xxhash64(long seed, ColumnView columns[]) { assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isNestedType() : "Unsupported column type Nested"; columnViews[i] = columns[i].getNativeView(); } return new ColumnVector(xxhash64(seed, columnViews)); diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java index 19172a8d3..874cb84b5 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java @@ -387,6 +387,182 @@ void testXXHash64Mixed() { } } + @Test + void testXXHash64Struct() { + try (ColumnVector strings = ColumnVector.fromStrings( + "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.", + null, null); + ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null); + ColumnView structs = ColumnView.makeStructView(strings, integers, doubles, floats, bools); + ColumnVector result = Hash.xxhash64(new ColumnView[]{structs}); + ColumnVector expected = ColumnVector.fromBoxedLongs(7451748878409563026L, 6024043102550151964L, 3380664624738534402L, 8444697026100086329L, -5888679192448042852L, Hash.DEFAULT_XXHASH64_SEED)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64NestedStruct() { + try (ColumnVector strings = ColumnVector.fromStrings( + "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.", + null, null); + ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null); + ColumnView structs1 = ColumnView.makeStructView(strings, integers); + ColumnView structs2 = ColumnView.makeStructView(structs1, doubles); + ColumnView structs3 = ColumnView.makeStructView(bools); + ColumnView structs = ColumnView.makeStructView(structs2, floats, structs3); + ColumnVector result = Hash.xxhash64(new ColumnView[]{structs}); + ColumnVector expected = ColumnVector.fromBoxedLongs(7451748878409563026L, 6024043102550151964L, 3380664624738534402L, 8444697026100086329L, -5888679192448042852L, Hash.DEFAULT_XXHASH64_SEED)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Lists() { + try (ColumnVector stringListCV = ColumnVector.fromLists( + new ListType(true, new BasicType(true, DType.STRING)), + Arrays.asList(null, "a"), + Arrays.asList("B\n", ""), + Arrays.asList("dE\"\u0100\t\u0101", " \ud720\ud721"), + Collections.singletonList("A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer."), + Collections.singletonList(""), + null); + ColumnVector stringExpected = ColumnVector.fromBoxedLongs(-8582455328737087284L, 7160715839242204087L, -862482741676457612L, -3700309651391443614L, -7444071767201028348L, Hash.DEFAULT_XXHASH64_SEED); + ColumnVector stringResult = Hash.xxhash64(new ColumnView[]{stringListCV}); + ColumnVector intListCV = ColumnVector.fromLists( + new ListType(true, new BasicType(true, DType.INT32)), + Collections.emptyList(), + Arrays.asList(0, -2, 3), + Collections.singletonList(Integer.MAX_VALUE), + Arrays.asList(5, -6, null), + Collections.singletonList(Integer.MIN_VALUE), + null); + ColumnVector intExpected = ColumnVector.fromBoxedLongs(Hash.DEFAULT_XXHASH64_SEED, -4022702357093761688L, 1508894993788531228L, 7329154841501342665L, 2073849959933241805L, Hash.DEFAULT_XXHASH64_SEED); + ColumnVector intResult = Hash.xxhash64(new ColumnVector[]{intListCV})) { + assertColumnsAreEqual(stringExpected, stringResult); + assertColumnsAreEqual(intExpected, intResult); + } + } + + @Test + void testXXHash64NestedLists() { + try (ColumnVector nestedStringListCV = ColumnVector.fromLists( + new ListType(true, new ListType(true, new BasicType(true, DType.STRING))), + Arrays.asList(null, Collections.singletonList("a")), + Collections.singletonList(Arrays.asList("B\n", "")), + Arrays.asList(Collections.singletonList("dE\"\u0100\t\u0101"), Collections.singletonList(" \ud720\ud721")), + Collections.singletonList(Collections.singletonList("A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.")), + Collections.singletonList(Collections.singletonList("")), + null); + ColumnVector stringExpected = ColumnVector.fromBoxedLongs(-8582455328737087284L, 7160715839242204087L, -862482741676457612L, -3700309651391443614L, -7444071767201028348L, Hash.DEFAULT_XXHASH64_SEED); + ColumnVector stringResult = Hash.xxhash64(new ColumnView[]{nestedStringListCV}); + ColumnVector nestedIntListCV = ColumnVector.fromLists( + new ListType(true, new ListType(true, new BasicType(true, DType.INT32))), + Collections.emptyList(), + Arrays.asList(Collections.singletonList(0), Collections.singletonList(-2), Collections.singletonList(3)), + Collections.singletonList(Collections.singletonList(Integer.MAX_VALUE)), + Arrays.asList(Collections.singletonList(5), Arrays.asList(-6, null)), + Collections.singletonList(Collections.singletonList(Integer.MIN_VALUE)), + null); + ColumnVector intExpected = ColumnVector.fromBoxedLongs(Hash.DEFAULT_XXHASH64_SEED, -4022702357093761688L, 1508894993788531228L, 7329154841501342665L, 2073849959933241805L, Hash.DEFAULT_XXHASH64_SEED); + ColumnVector intResult = Hash.xxhash64(new ColumnVector[]{nestedIntListCV});) { + assertColumnsAreEqual(stringExpected, stringResult); + assertColumnsAreEqual(intExpected, intResult); + } + } + + @Test + void testXXHash64StructOfList() { + try (ColumnVector stringListCV = ColumnVector.fromLists( + new ListType(true, new BasicType(true, DType.STRING)), + Arrays.asList(null, "a"), + Arrays.asList("B\n", ""), + Arrays.asList("dE\"\u0100\t\u0101", " \ud720\ud721"), + Collections.singletonList("A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer."), + Collections.singletonList(""), + null); + ColumnVector intListCV = ColumnVector.fromLists( + new ListType(true, new BasicType(true, DType.INT32)), + Collections.emptyList(), + Arrays.asList(0, -2, 3), + Collections.singletonList(Integer.MAX_VALUE), + Arrays.asList(5, -6, null), + Collections.singletonList(Integer.MIN_VALUE), + null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnView structCV = ColumnView.makeStructView(intListCV, stringListCV, doubles, floats); + ColumnVector nestedExpected = ColumnVector.fromBoxedLongs(-8492741646850220468L, -6547737320918905493L, -8718220625378038731L, 5441580647216064522L, 3645801243834961127L, Hash.DEFAULT_XXHASH64_SEED); + ColumnVector nestedResult = Hash.xxhash64(new ColumnView[]{structCV})) { + assertColumnsAreEqual(nestedExpected, nestedResult); + } + } + + @Test + void testXXHash64ListOfStruct() { + try (ColumnVector structListCV = ColumnVector.fromLists(new ListType(true, new StructType(true, + new BasicType(true, DType.STRING), new BasicType(true, DType.INT32), new BasicType(true, DType.FLOAT64), new BasicType(true, DType.FLOAT32), new BasicType(true, DType.BOOL8))), + Collections.emptyList(), + Collections.singletonList(new StructData("a", 0, 0.0, 0f, true)), + Arrays.asList(new StructData("B\n", 100, 100.0, 100f, false), new StructData("dE\"\u0100\t\u0101 \ud720\ud721", -100, -100.0, -100f, null)), + Collections.singletonList(new StructData("A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.", Integer.MIN_VALUE, POSITIVE_DOUBLE_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_LOWER_RANGE, false)), + Arrays.asList(new StructData(null, Integer.MAX_VALUE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, true), new StructData(null, null, null, null, null)), + null); + ColumnVector result = Hash.xxhash64(new ColumnView[]{structListCV}); + ColumnVector expected = ColumnVector.fromBoxedLongs(Hash.DEFAULT_XXHASH64_SEED, 7451748878409563026L, 948372773124634350L, 8444697026100086329L, -5888679192448042852L, Hash.DEFAULT_XXHASH64_SEED)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64NestedDepthExceedsLimit() { + try (ColumnVector nestedIntListCV = ColumnVector.fromLists( + new ListType(true, new ListType(true, new BasicType(true, DType.INT32))), + Arrays.asList(Arrays.asList(null, null), null), + Arrays.asList(Collections.singletonList(0), Collections.singletonList(-2), Collections.singletonList(3)), + Arrays.asList(null, Collections.singletonList(Integer.MAX_VALUE)), + Arrays.asList(Collections.singletonList(5), Arrays.asList(-6, null)), + Arrays.asList(Collections.singletonList(Integer.MIN_VALUE), null), + null); + ColumnVector integers = ColumnVector.fromBoxedInts( + 0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles(0.0, 100.0, -100.0, + POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats(0f, 100f, -100f, + NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans( + true, false, null, false, true, null); + ColumnView structs1 = ColumnView.makeStructView(nestedIntListCV, integers); + ColumnView structs2 = ColumnView.makeStructView(structs1, doubles); + ColumnView structs3 = ColumnView.makeStructView(structs2, bools); + ColumnView structs4 = ColumnView.makeStructView(structs3); + ColumnView structs5 = ColumnView.makeStructView(structs4, floats); + ColumnView structs6 = ColumnView.makeStructView(structs5); + ColumnView structs7 = ColumnView.makeStructView(structs6); + ColumnView nestedResult = ColumnView.makeStructView(structs7);) { + assertThrows(CudfException.class, () -> Hash.xxhash64(new ColumnView[]{nestedResult})); + } + } + @Test void testHiveHashBools() { try (ColumnVector v0 = ColumnVector.fromBoxedBooleans(true, false, null);