From e0dea797049252cca5e5360678a89c76bf72dec5 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 17 Dec 2024 10:44:30 +0800 Subject: [PATCH] revert prefetch for StringHashTable Signed-off-by: guo-shaoge --- .../AggregateFunctionGroupUniqArray.h | 2 +- .../src/AggregateFunctions/KeyHolderHelpers.h | 2 +- dbms/src/Common/ColumnsHashing.h | 43 +-- dbms/src/Common/ColumnsHashingImpl.h | 38 -- .../src/Common/HashTable/HashTableKeyHolder.h | 8 +- dbms/src/Common/HashTable/StringHashTable.h | 143 -------- .../HashTable/TwoLevelStringHashTable.h | 63 ---- dbms/src/Interpreters/Aggregator.cpp | 342 +----------------- dbms/src/Interpreters/Aggregator.h | 50 +-- dbms/src/TiDB/Collation/Collator.cpp | 112 ------ dbms/src/TiDB/Collation/Collator.h | 10 - 11 files changed, 19 insertions(+), 794 deletions(-) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h index d3cbea74195..f556a009551 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h @@ -182,7 +182,7 @@ class AggregateFunctionGroupUniqArrayGeneric { // We have to copy the keys to our arena. assert(arena != nullptr); - cur_set.emplace(ArenaKeyHolder{rhs_elem.getValue(), arena}, it, inserted); + cur_set.emplace(ArenaKeyHolder{rhs_elem.getValue(), *arena}, it, inserted); } } diff --git a/dbms/src/AggregateFunctions/KeyHolderHelpers.h b/dbms/src/AggregateFunctions/KeyHolderHelpers.h index b8a4ee0def3..6677866f0d3 100644 --- a/dbms/src/AggregateFunctions/KeyHolderHelpers.h +++ b/dbms/src/AggregateFunctions/KeyHolderHelpers.h @@ -24,7 +24,7 @@ inline auto getKeyHolder(const IColumn & column, size_t row_num, Arena & arena) { if constexpr (is_plain_column) { - return ArenaKeyHolder{column.getDataAt(row_num), &arena}; + return ArenaKeyHolder{column.getDataAt(row_num), arena}; } else { diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 23dd30ecc44..ac665874969 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -122,34 +122,19 @@ struct HashMethodString ALWAYS_INLINE inline ArenaKeyHolder getKeyHolder( ssize_t row, [[maybe_unused]] Arena * pool, - [[maybe_unused]] std::vector & sort_key_containers) const + std::vector & sort_key_containers) const { - auto key = getKey(row); + auto last_offset = row == 0 ? 0 : offsets[row - 1]; + // Remove last zero byte. + StringRef key(chars + last_offset, offsets[row] - last_offset - 1); if (likely(collator)) key = collator->sortKey(key.data, key.size, sort_key_containers[0]); - return ArenaKeyHolder{key, pool}; - } - - ALWAYS_INLINE inline ArenaKeyHolder getKeyHolder(ssize_t row, Arena * pool, Arena * sort_key_pool) const - { - auto key = getKey(row); - if (likely(collator)) - key = collator->sortKey(key.data, key.size, *sort_key_pool); - - return ArenaKeyHolder{key, pool}; + return ArenaKeyHolder{key, *pool}; } protected: friend class columns_hashing_impl::HashMethodBase; - -private: - ALWAYS_INLINE inline StringRef getKey(size_t row) const - { - auto last_offset = row == 0 ? 0 : offsets[row - 1]; - // Remove last zero byte. - return StringRef(chars + last_offset, offsets[row] - last_offset - 1); - } }; template @@ -175,16 +160,11 @@ struct HashMethodStringBin } ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, std::vector &) const - { - return getKeyHolder(row, pool, nullptr); - } - - ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, Arena *) const { auto last_offset = row == 0 ? 0 : offsets[row - 1]; StringRef key(chars + last_offset, offsets[row] - last_offset - 1); key = BinCollatorSortKey(key.data, key.size); - return ArenaKeyHolder{key, pool}; + return ArenaKeyHolder{key, *pool}; } protected: @@ -433,16 +413,7 @@ struct HashMethodFixedString if (collator) key = collator->sortKeyFastPath(key.data, key.size, sort_key_containers[0]); - return ArenaKeyHolder{key, pool}; - } - - ALWAYS_INLINE inline ArenaKeyHolder getKeyHolder(size_t row, Arena * pool, Arena * sort_key_pool) const - { - StringRef key(&(*chars)[row * n], n); - if (collator) - key = collator->sortKeyFastPath(key.data, key.size, *sort_key_pool); - - return ArenaKeyHolder{key, pool}; + return ArenaKeyHolder{key, *pool}; } protected: diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index fcbfc4bc358..09065286dbf 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -204,44 +204,6 @@ class HashMethodBase } } - template - ALWAYS_INLINE inline EmplaceResult emplaceStringKey( - Data & data, - size_t idx, - std::vector & datas, - const std::vector & hashvals) - { - // For spill, hashvals.size() will be le to total_rows. - // Because only remaining rows that didn't insert into HashMap will be handled here. - assert(hashvals.size() <= static_cast(*this).total_rows); - - auto & submap = StringHashTableSubMapSelector>::getSubMap( - hashvals[idx], - data); - if constexpr (enable_prefetch) - prefetch(submap, idx, hashvals); - - return emplaceImpl(datas[idx], submap, hashvals[idx]); - } - - template - ALWAYS_INLINE inline FindResult findStringKey( - Data & data, - size_t idx, - std::vector & datas, - const std::vector & hashvals) - { - assert(hashvals.size() <= static_cast(*this).total_rows); - - auto & submap = StringHashTableSubMapSelector>::getSubMap( - hashvals[idx], - data); - if constexpr (enable_prefetch) - prefetch(submap, idx, hashvals); - - return findKeyImpl(keyHolderGetKey(datas[idx]), submap, hashvals[idx]); - } - template ALWAYS_INLINE inline size_t getHash( const Data & data, diff --git a/dbms/src/Common/HashTable/HashTableKeyHolder.h b/dbms/src/Common/HashTable/HashTableKeyHolder.h index dd8a4b53376..01b06dce87d 100644 --- a/dbms/src/Common/HashTable/HashTableKeyHolder.h +++ b/dbms/src/Common/HashTable/HashTableKeyHolder.h @@ -91,8 +91,8 @@ namespace DB */ struct ArenaKeyHolder { - StringRef key{}; - Arena * pool = nullptr; + StringRef key; + Arena & pool; }; } // namespace DB @@ -111,14 +111,14 @@ inline void ALWAYS_INLINE keyHolderPersistKey(DB::ArenaKeyHolder & holder) { // Hash table shouldn't ask us to persist a zero key assert(holder.key.size > 0); - holder.key.data = holder.pool->insert(holder.key.data, holder.key.size); + holder.key.data = holder.pool.insert(holder.key.data, holder.key.size); } inline void ALWAYS_INLINE keyHolderPersistKey(DB::ArenaKeyHolder && holder) { // Hash table shouldn't ask us to persist a zero key assert(holder.key.size > 0); - holder.key.data = holder.pool->insert(holder.key.data, holder.key.size); + holder.key.data = holder.pool.insert(holder.key.data, holder.key.size); } inline void ALWAYS_INLINE keyHolderDiscardKey(DB::ArenaKeyHolder &) {} diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index 9bbdabb91fa..e1236caf381 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -16,9 +16,7 @@ #include #include -#include #include -#include #include #include @@ -194,99 +192,6 @@ struct StringHashTableLookupResult friend bool operator!=(const std::nullptr_t &, const StringHashTableLookupResult & b) { return b.mapped_ptr; } }; -template -static auto -#if defined(ADDRESS_SANITIZER) || defined(THREAD_SANITIZER) - NO_INLINE NO_SANITIZE_ADDRESS NO_SANITIZE_THREAD -#else - ALWAYS_INLINE -#endif - dispatchStringHashTable( - size_t row, - KeyHolder && key_holder, - Func0 && func0, - Func8 && func8, - Func16 && func16, - Func24 && func24, - FuncStr && func_str) -{ - const StringRef & x = keyHolderGetKey(key_holder); - const size_t sz = x.size; - if (sz == 0) - { - return func0(x, row); - } - - if (x.data[sz - 1] == 0) - { - // Strings with trailing zeros are not representable as fixed-size - // string keys. Put them to the generic table. - return func_str(key_holder, row); - } - - const char * p = x.data; - // pending bits that needs to be shifted out - const char s = (-sz & 7) * 8; - union - { - StringKey8 k8; - StringKey16 k16; - StringKey24 k24; - UInt64 n[3]; - }; - switch ((sz - 1) >> 3) - { - case 0: // 1..8 bytes - { - // first half page - if ((reinterpret_cast(p) & 2048) == 0) - { - memcpy(&n[0], p, 8); - if constexpr (DB::isLittleEndian()) - n[0] &= (-1ULL >> s); - else - n[0] &= (-1ULL << s); - } - else - { - const char * lp = x.data + x.size - 8; - memcpy(&n[0], lp, 8); - if constexpr (DB::isLittleEndian()) - n[0] >>= s; - else - n[0] <<= s; - } - return func8(k8, row); - } - case 1: // 9..16 bytes - { - memcpy(&n[0], p, 8); - const char * lp = x.data + x.size - 8; - memcpy(&n[1], lp, 8); - if constexpr (DB::isLittleEndian()) - n[1] >>= s; - else - n[1] <<= s; - return func16(k16, row); - } - case 2: // 17..24 bytes - { - memcpy(&n[0], p, 16); - const char * lp = x.data + x.size - 8; - memcpy(&n[2], lp, 8); - if constexpr (DB::isLittleEndian()) - n[2] >>= s; - else - n[2] <<= s; - return func24(k24, row); - } - default: // >= 25 bytes - { - return func_str(key_holder, row); - } - } -} - template class StringHashTable : private boost::noncopyable { @@ -307,8 +212,6 @@ class StringHashTable : private boost::noncopyable template friend class TwoLevelStringHashTable; - template - friend struct StringHashTableSubMapSelector; T0 m0; T1 m1; @@ -565,49 +468,3 @@ class StringHashTable : private boost::noncopyable ms.clearAndShrink(); } }; - -template -struct StringHashTableSubMapSelector; - -template -struct StringHashTableSubMapSelector<0, false, Data> -{ - struct Hash - { - static ALWAYS_INLINE size_t operator()(const StringRef &) { return 0; } - }; - - static typename Data::T0 & getSubMap(size_t, Data & data) { return data.m0; } -}; - -template -struct StringHashTableSubMapSelector<1, false, Data> -{ - using Hash = StringHashTableHash; - - static typename Data::T1 & getSubMap(size_t, Data & data) { return data.m1; } -}; - -template -struct StringHashTableSubMapSelector<2, false, Data> -{ - using Hash = StringHashTableHash; - - static typename Data::T2 & getSubMap(size_t, Data & data) { return data.m2; } -}; - -template -struct StringHashTableSubMapSelector<3, false, Data> -{ - using Hash = StringHashTableHash; - - static typename Data::T3 & getSubMap(size_t, Data & data) { return data.m3; } -}; - -template -struct StringHashTableSubMapSelector<4, false, Data> -{ - using Hash = StringHashTableHash; - - static typename Data::Ts & getSubMap(size_t, Data & data) { return data.ms; } -}; diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index 403b8d3941c..7659b5a73fb 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -277,66 +277,3 @@ class TwoLevelStringHashTable : private boost::noncopyable return res; } }; - -template -struct StringHashTableSubMapSelector<0, true, Data> -{ - struct Hash - { - static ALWAYS_INLINE size_t operator()(const StringRef &) { return 0; } - }; - - static typename Data::Impl::T0 & getSubMap(size_t hashval, Data & data) - { - const auto bucket = Data::getBucketFromHash(hashval); - return data.impls[bucket].m0; - } -}; - -template -struct StringHashTableSubMapSelector<1, true, Data> -{ - using Hash = StringHashTableHash; - - static typename Data::Impl::T1 & getSubMap(size_t hashval, Data & data) - { - const auto bucket = Data::getBucketFromHash(hashval); - return data.impls[bucket].m1; - } -}; - -template -struct StringHashTableSubMapSelector<2, true, Data> -{ - using Hash = StringHashTableHash; - - static typename Data::Impl::T2 & getSubMap(size_t hashval, Data & data) - { - const auto bucket = Data::getBucketFromHash(hashval); - return data.impls[bucket].m2; - } -}; - -template -struct StringHashTableSubMapSelector<3, true, Data> -{ - using Hash = StringHashTableHash; - - static typename Data::Impl::T3 & getSubMap(size_t hashval, Data & data) - { - const auto bucket = Data::getBucketFromHash(hashval); - return data.impls[bucket].m3; - } -}; - -template -struct StringHashTableSubMapSelector<4, true, Data> -{ - using Hash = StringHashTableHash; - - static typename Data::Impl::Ts & getSubMap(size_t hashval, Data & data) - { - const auto bucket = Data::getBucketFromHash(hashval); - return data.impls[bucket].ms; - } -}; diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 3fcb1dcde58..5e00c2e72e6 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -666,9 +666,6 @@ void NO_INLINE Aggregator::executeImpl( { typename Method::State state(agg_process_info.key_columns, key_sizes, collators); - // start_row!=0 and stringHashTableRecoveryInfo not empty and cannot be true at the same time. - RUNTIME_CHECK(!(agg_process_info.start_row != 0 && !agg_process_info.stringHashTableRecoveryInfoEmpty())); - #ifndef NDEBUG bool disable_prefetch = (method.data.getBufferSizeInCells() < 8192); fiu_do_on(FailPoints::force_agg_prefetch, { disable_prefetch = false; }); @@ -676,12 +673,11 @@ void NO_INLINE Aggregator::executeImpl( const bool disable_prefetch = (method.data.getBufferSizeInCells() < 8192); #endif - // key_serialized and key_string(StringHashMap) needs column-wise handling for prefetch. + // key_serialized needs column-wise handling for prefetch. // Because: - // 1. StringHashMap(key_string) is composed by 5 submaps, so prefetch needs to be done for each specific submap. - // 2. getKeyHolder of key_serialized have to copy real data into Arena. + // 1. getKeyHolder of key_serialized have to copy real data into Arena. // It means we better getKeyHolder for all Columns once and then use it both for getHash() and emplaceKey(). - // 3. For other group by key(key_int8/16/32/...), it's ok to use row-wise handling even prefetch is enabled. + // 2. For other group by key(key_int8/16/32/...), it's ok to use row-wise handling even prefetch is enabled. // But getHashVals() still needs to be column-wise. if constexpr (Method::State::is_serialized_key) { @@ -694,17 +690,8 @@ void NO_INLINE Aggregator::executeImpl( } else if constexpr (Method::Data::is_string_hash_map) { - // If agg_process_info.start_row != 0, it means the computation process of the current block was interrupted by resize exception in executeImplByRow. - // For clarity and simplicity of implementation, the processing functions for column-wise and row-wise methods handle the entire block independently. - // A block will not be processed first by the row-wise method and then by the column-wise method, or vice-versa. - if (!disable_prefetch && likely(agg_process_info.start_row == 0)) - executeImplStringHashMapByCol( - method, - state, - aggregates_pool, - agg_process_info); - else - executeImplByRow(method, state, aggregates_pool, agg_process_info); + // StringHashMap doesn't support prefetch. + executeImplByRow(method, state, aggregates_pool, agg_process_info); } else { @@ -785,95 +772,6 @@ std::optional::Res } } -// This is only used by executeImplStringHashMapByCol. -// It will choose specifix submap of StringHashMap then do emplace/find. -// StringKeyType can be StringRef/StringKey8/StringKey16/StringKey24/ArenaKeyHolder. -template < - size_t SubMapIndex, - bool collect_hit_rate, - bool only_lookup, - bool enable_prefetch, - bool zero_agg_func_size, - typename Data, - typename State, - typename StringKeyType> -size_t Aggregator::emplaceOrFindStringKey( - Data & data, - State & state, - const std::vector & key_infos, - std::vector & key_datas, - Arena & aggregates_pool, - std::vector & places, - AggProcessInfo & agg_process_info) const -{ - static_assert(!(collect_hit_rate && only_lookup)); - assert(key_infos.size() == key_datas.size()); - - using Hash = typename StringHashTableSubMapSelector>::Hash; - std::vector hashvals(key_infos.size(), 0); - for (size_t i = 0; i < key_infos.size(); ++i) - hashvals[i] = Hash::operator()(keyHolderGetKey(key_datas[i])); - - // alloc 0 bytes is useful when agg func size is zero. - AggregateDataPtr agg_state = aggregates_pool.alloc(0); - for (size_t i = 0; i < key_infos.size(); ++i) - { - try - { - if constexpr (only_lookup) - { - auto find_result - = state.template findStringKey(data, i, key_datas, hashvals); - if (find_result.isFound()) - { - agg_state = find_result.getMapped(); - } - else - { - agg_process_info.not_found_rows.push_back(key_infos[i]); - } - } - else - { - auto emplace_result - = state.template emplaceStringKey(data, i, key_datas, hashvals); - if (emplace_result.isInserted()) - { - if constexpr (zero_agg_func_size) - { - emplace_result.setMapped(agg_state); - } - else - { - emplace_result.setMapped(nullptr); - - agg_state - = aggregates_pool.alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(agg_state); - - emplace_result.setMapped(agg_state); - } - } - else - { - if constexpr (!zero_agg_func_size) - agg_state = emplace_result.getMapped(); - - if constexpr (collect_hit_rate) - ++agg_process_info.hit_row_cnt; - } - if constexpr (!zero_agg_func_size) - places[i] = agg_state; - } - } - catch (ResizeException &) - { - return i; - } - } - return key_infos.size(); -} - template ALWAYS_INLINE void Aggregator::executeImplByRow( Method & method, @@ -881,14 +779,8 @@ ALWAYS_INLINE void Aggregator::executeImplByRow( Arena * aggregates_pool, AggProcessInfo & agg_process_info) const { - LOG_TRACE(log, "executeImplByRow"); // collect_hit_rate and only_lookup cannot be true at the same time. static_assert(!(collect_hit_rate && only_lookup)); - // If agg_process_info.stringHashTableRecoveryInfoEmpty() is false, it means the current block was - // handled by executeImplStringHashMapByCol(column-wise) before, and resize execption happened. - // This situation is unexpected because for the sake of clarity, we assume that a block will be **fully** processed - // either column-wise or row-wise and cannot be split for processing. - RUNTIME_CHECK(agg_process_info.stringHashTableRecoveryInfoEmpty()); std::vector sort_key_containers; sort_key_containers.resize(params.keys_size, ""); @@ -1097,230 +989,6 @@ ALWAYS_INLINE void Aggregator::executeImplByRow( } } -#define M(SUBMAPINDEX) \ - template \ - ALWAYS_INLINE inline void setupExceptionRecoveryInfoForStringHashTable( \ - Aggregator::AggProcessInfo & agg_process_info, \ - size_t row, \ - const std::vector & key_infos, \ - const std::vector & key_datas, \ - std::integral_constant) \ - { \ - agg_process_info.submap_m##SUBMAPINDEX##_infos \ - = std::vector(key_infos.begin() + row, key_infos.end()); \ - agg_process_info.submap_m##SUBMAPINDEX##_datas \ - = std::vector(key_datas.begin() + row, key_datas.end()); \ - } - -M(0) -M(1) -M(2) -M(3) -M(4) - -#undef M - -// prefetch/empalce each specifix submap directly instead of accessing StringHashMap interface, -// which is better for performance. -// NOTE: this function is column-wise, which means sort key buffer cannot be reused. -// This buffer will not be release until this block is processed done. -template -ALWAYS_INLINE void Aggregator::executeImplStringHashMapByCol( - Method & method, - typename Method::State & state, - Arena * aggregates_pool, - AggProcessInfo & agg_process_info) const -{ - LOG_TRACE(log, "executeImplStringHashMapByCol"); - // collect_hit_rate and only_lookup cannot be true at the same time. - static_assert(!(collect_hit_rate && only_lookup)); - static_assert(Method::Data::is_string_hash_map); - -#define M(SUBMAPINDEX) \ - RUNTIME_CHECK( \ - agg_process_info.submap_m##SUBMAPINDEX##_infos.size() \ - == agg_process_info.submap_m##SUBMAPINDEX##_datas.size()); - - M(0) - M(1) - M(2) - M(3) - M(4) -#undef M - - const size_t rows = agg_process_info.end_row - agg_process_info.start_row; - auto sort_key_pool = std::make_unique(); - std::vector sort_key_containers; - -#define M(INFO, DATA, KEYTYPE) \ - std::vector(INFO); \ - std::vector(DATA); - - M(key0_infos, key0_datas, StringRef) - M(key8_infos, key8_datas, StringKey8) - M(key16_infos, key16_datas, StringKey16) - M(key24_infos, key24_datas, StringKey24) - M(key_str_infos, key_str_datas, ArenaKeyHolder) -#undef M - - // If no resize exception happens, so this is a new Block. - // If resize exception happens, start_row has already been set to zero at the end of this function. - RUNTIME_CHECK_MSG( - agg_process_info.start_row == 0, - "unexpected agg_process_info.start_row: {}, end_row: {}", - agg_process_info.start_row, - agg_process_info.end_row); - - if likely (agg_process_info.stringHashTableRecoveryInfoEmpty()) - { - // sort_key_pool should already been reset by AggProcessInfo::restBlock() - RUNTIME_CHECK(!agg_process_info.sort_key_pool); - - const size_t reserve_size = rows / 4; - -#define M(INFO, DATA, SUBMAPINDEX, KEYTYPE) \ - (INFO).reserve(reserve_size); \ - (DATA).reserve(reserve_size); \ - auto dispatch_callback_key##SUBMAPINDEX \ - = [&INFO, &DATA](const KEYTYPE & key, size_t row) { /* NOLINT(bugprone-macro-parentheses) */ \ - (INFO).push_back(row); \ - (DATA).push_back(key); \ - }; - - M(key0_infos, key0_datas, 0, StringRef) - M(key8_infos, key8_datas, 8, StringKey8) - M(key16_infos, key16_datas, 16, StringKey16) - M(key24_infos, key24_datas, 24, StringKey24) - // Argument type is ArenaKeyHolder instead of StringRef, - // because it will only be persisted when insert into HashTable. - M(key_str_infos, key_str_datas, str, ArenaKeyHolder) -#undef M - - for (size_t i = 0; i < rows; ++i) - { - // Use Arena for collation sort key, because we are doing agg in column-wise way. - // So a big arena is needed to store decoded key, and we can avoid resize std::string by using Arena. - auto key_holder = state.getKeyHolder(i, aggregates_pool, sort_key_pool.get()); - dispatchStringHashTable( - i, - key_holder, - dispatch_callback_key0, - dispatch_callback_key8, - dispatch_callback_key16, - dispatch_callback_key24, - dispatch_callback_keystr); - } - } - else - { -#define M(INFO, DATA, SUBMAPINDEX) \ - (INFO) = agg_process_info.submap_m##SUBMAPINDEX##_infos; \ - (DATA) = agg_process_info.submap_m##SUBMAPINDEX##_datas; - - M(key0_infos, key0_datas, 0) - M(key8_infos, key8_datas, 1) - M(key16_infos, key16_datas, 2) - M(key24_infos, key24_datas, 3) - M(key_str_infos, key_str_datas, 4) -#undef M - } - - std::vector key0_places(key0_infos.size(), nullptr); - std::vector key8_places(key8_infos.size(), nullptr); - std::vector key16_places(key16_infos.size(), nullptr); - std::vector key24_places(key24_infos.size(), nullptr); - std::vector key_str_places(key_str_infos.size(), nullptr); - - bool got_resize_exception = false; - size_t emplaced_index = 0; - bool zero_agg_func_size = (params.aggregates_size == 0); - -#define M(INDEX, INFO, DATA, PLACES) \ - if (!got_resize_exception && !(INFO).empty()) \ - { \ - if (zero_agg_func_size) \ - emplaced_index = emplaceOrFindStringKey( \ - method.data, \ - state, \ - (INFO), \ - (DATA), \ - *aggregates_pool, \ - (PLACES), \ - agg_process_info); \ - else \ - emplaced_index = emplaceOrFindStringKey( \ - method.data, \ - state, \ - (INFO), \ - (DATA), \ - *aggregates_pool, \ - (PLACES), \ - agg_process_info); \ - if unlikely (emplaced_index != (INFO).size()) \ - got_resize_exception = true; \ - } \ - else \ - { \ - emplaced_index = 0; \ - } \ - setupExceptionRecoveryInfoForStringHashTable( \ - agg_process_info, \ - emplaced_index, \ - (INFO), \ - (DATA), \ - std::integral_constant{}); - - M(0, key0_infos, key0_datas, key0_places) - M(1, key8_infos, key8_datas, key8_places) - M(2, key16_infos, key16_datas, key16_places) - M(3, key24_infos, key24_datas, key24_places) - M(4, key_str_infos, key_str_datas, key_str_places) -#undef M - - if (!zero_agg_func_size) - { - std::vector places(rows, nullptr); -#define M(INFO, PLACES) \ - for (size_t i = 0; i < (INFO).size(); ++i) \ - { \ - const auto row = (INFO)[i]; \ - places[row] = (PLACES)[i]; \ - } - - M(key0_infos, key0_places) - M(key8_infos, key8_places) - M(key16_infos, key16_places) - M(key24_infos, key24_places) - M(key_str_infos, key_str_places) -#undef M - - for (AggregateFunctionInstruction * inst = agg_process_info.aggregate_functions_instructions.data(); inst->that; - ++inst) - { - inst->batch_that->addBatch( - agg_process_info.start_row, - rows, - &places[0], - inst->state_offset, - inst->batch_arguments, - aggregates_pool); - } - } - - if unlikely (got_resize_exception) - { - RUNTIME_CHECK(!agg_process_info.stringHashTableRecoveryInfoEmpty()); - agg_process_info.sort_key_pool = std::move(sort_key_pool); - // For StringHashTable, start_row is meanless, instead submap_mx_infos/submap_mx_datas are used. - // So set it to zero when got_resize_exception. - agg_process_info.start_row = 0; - } - else - { - agg_process_info.start_row = agg_process_info.end_row; - } -} - void NO_INLINE Aggregator::executeWithoutKeyImpl(AggregatedDataWithoutKey & res, AggProcessInfo & agg_process_info, Arena * arena) { diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index d88a97278f9..19e6fc7a919 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1318,32 +1318,11 @@ class Aggregator size_t hit_row_cnt = 0; std::vector not_found_rows; - // For StringHashTable, when resize exception happens, the process will be interrupted. - // So we need these infos to continue. - std::vector submap_m0_infos{}; - std::vector submap_m1_infos{}; - std::vector submap_m2_infos{}; - std::vector submap_m3_infos{}; - std::vector submap_m4_infos{}; - std::vector submap_m0_datas{}; - std::vector submap_m1_datas{}; - std::vector submap_m2_datas{}; - std::vector submap_m3_datas{}; - std::vector submap_m4_datas{}; - std::unique_ptr sort_key_pool; - void prepareForAgg(); bool allBlockDataHandled() const { assert(start_row <= end_row); - // submap_mx_infos.size() and submap_mx_datas.size() are always equal. - // So only need to check submap_mx_infos is enough. - return (start_row == end_row && stringHashTableRecoveryInfoEmpty()) || aggregator->isCancelled(); - } - bool stringHashTableRecoveryInfoEmpty() const - { - return submap_m0_infos.empty() && submap_m1_infos.empty() && submap_m3_infos.empty() - && submap_m4_infos.empty(); + return start_row == end_row || aggregator->isCancelled(); } void resetBlock(const Block & block_) { @@ -1357,8 +1336,6 @@ class Aggregator hit_row_cnt = 0; not_found_rows.clear(); not_found_rows.reserve(block_.rows() / 2); - - sort_key_pool.reset(); } }; @@ -1483,13 +1460,6 @@ class Aggregator Arena * aggregates_pool, AggProcessInfo & agg_process_info) const; - template - void executeImplStringHashMapByCol( - Method & method, - typename Method::State & state, - Arena * aggregates_pool, - AggProcessInfo & agg_process_info) const; - template std::optional::ResultType> emplaceOrFindKey( Method & method, @@ -1507,24 +1477,6 @@ class Aggregator Arena & aggregates_pool, std::vector & sort_key_containers) const; - template < - size_t SubMapIndex, - bool collect_hit_rate, - bool only_lookup, - bool enable_prefetch, - bool zero_agg_func_size, - typename Data, - typename State, - typename StringKeyType> - size_t emplaceOrFindStringKey( - Data & data, - State & state, - const std::vector & key_infos, - std::vector & key_datas, - Arena & aggregates_pool, - std::vector & places, - AggProcessInfo & agg_process_info) const; - /// For case when there are no keys (all aggregate into one row). static void executeWithoutKeyImpl(AggregatedDataWithoutKey & res, AggProcessInfo & agg_process_info, Arena * arena); diff --git a/dbms/src/TiDB/Collation/Collator.cpp b/dbms/src/TiDB/Collation/Collator.cpp index 4365f1f0988..bf27400f8c4 100644 --- a/dbms/src/TiDB/Collation/Collator.cpp +++ b/dbms/src/TiDB/Collation/Collator.cpp @@ -192,11 +192,6 @@ class BinCollator final : public ITiDBCollator return DB::BinCollatorSortKey(s, length); } - StringRef sortKey(const char * s, size_t length, DB::Arena &) const override - { - return DB::BinCollatorSortKey(s, length); - } - StringRef sortKeyNoTrim(const char * s, size_t length, std::string &) const override { return convertForBinCollator(s, length, nullptr); @@ -278,54 +273,11 @@ class GeneralCICollator final : public ITiDBCollator return convertImpl(s, length, container, nullptr); } - StringRef sortKey(const char * s, size_t length, DB::Arena & pool) const override - { - return convertImpl(s, length, pool, nullptr); - } - StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const override { return convertImpl(s, length, container, nullptr); } - template - StringRef convertImpl(const char * s, size_t length, DB::Arena & pool, std::vector * lens) const - { - std::string_view v; - - if constexpr (need_trim) - v = rtrim(s, length); - else - v = std::string_view(s, length); - - const size_t size = length * sizeof(WeightType); - auto * buffer = pool.alignedAlloc(size, 16); - - size_t offset = 0; - size_t total_size = 0; - size_t v_length = v.length(); - - if constexpr (need_len) - { - if (lens->capacity() < v_length) - lens->reserve(v_length); - lens->resize(0); - } - - while (offset < v_length) - { - auto c = decodeChar(s, offset); - auto sk = weight(c); - buffer[total_size++] = static_cast(sk >> 8); - buffer[total_size++] = static_cast(sk); - - if constexpr (need_len) - lens->push_back(2); - } - - return StringRef(buffer, total_size); - } - template StringRef convertImpl(const char * s, size_t length, std::string & container, std::vector * lens) const { @@ -527,65 +479,11 @@ class UCACICollator final : public ITiDBCollator return convertImpl(s, length, container, nullptr); } - StringRef sortKey(const char * s, size_t length, DB::Arena & pool) const override - { - return convertImpl(s, length, pool, nullptr); - } - StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const override { return convertImpl(s, length, container, nullptr); } - // Use Arena to store decoded string. Normally it's used by column-wise Agg/Join, - // because column-wise process cannot reuse string container. - template - StringRef convertImpl(const char * s, size_t length, DB::Arena & pool, std::vector * lens) const - { - std::string_view v; - - if constexpr (need_trim) - v = preprocess(s, length); - else - v = std::string_view(s, length); - - // every char have 8 uint16 at most. - const auto size = 8 * length * sizeof(uint16_t); - auto * buffer = pool.alignedAlloc(size, 16); - - size_t offset = 0; - size_t total_size = 0; - size_t v_length = v.length(); - - uint64_t first = 0, second = 0; - - if constexpr (need_len) - { - if (lens->capacity() < v_length) - lens->reserve(v_length); - lens->resize(0); - } - - while (offset < v_length) - { - weight(first, second, offset, v_length, s); - - if constexpr (need_len) - lens->push_back(total_size); - - writeResult(first, buffer, total_size); - writeResult(second, buffer, total_size); - - if constexpr (need_len) - { - size_t end_idx = lens->size() - 1; - (*lens)[end_idx] = total_size - (*lens)[end_idx]; - } - } - - return StringRef(buffer, total_size); - } - template StringRef convertImpl(const char * s, size_t length, std::string & container, std::vector * lens) const { @@ -652,16 +550,6 @@ class UCACICollator final : public ITiDBCollator } } - static inline void writeResult(uint64_t & w, char * buffer, size_t & total_size) - { - while (w != 0) - { - buffer[total_size++] = static_cast(w >> 8); - buffer[total_size++] = static_cast(w); - w >>= 16; - } - } - static inline bool regexEq(CharType a, CharType b) { return T::regexEq(a, b); } static inline void weight(uint64_t & first, uint64_t & second, size_t & offset, size_t length, const char * s) diff --git a/dbms/src/TiDB/Collation/Collator.h b/dbms/src/TiDB/Collation/Collator.h index 08c017ba57d..6bb87883ef1 100644 --- a/dbms/src/TiDB/Collation/Collator.h +++ b/dbms/src/TiDB/Collation/Collator.h @@ -14,7 +14,6 @@ #pragma once -#include #include #include #include @@ -102,7 +101,6 @@ class ITiDBCollator = 0; virtual StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const = 0; virtual StringRef sortKey(const char * s, size_t length, std::string & container) const = 0; - virtual StringRef sortKey(const char * s, size_t length, DB::Arena &) const = 0; virtual std::unique_ptr pattern() const = 0; int32_t getCollatorId() const { return collator_id; } CollatorType getCollatorType() const { return collator_type; } @@ -137,14 +135,6 @@ class ITiDBCollator } return sortKey(s, length, container); } - ALWAYS_INLINE inline StringRef sortKeyFastPath(const char * s, size_t length, DB::Arena & pool) const - { - if (likely(isPaddingBinary())) - { - return DB::BinCollatorSortKey(s, length); - } - return sortKey(s, length, pool); - } protected: explicit ITiDBCollator(int32_t collator_id_);