From 481b6beb69e8897e7aa852e51c69f4b5c1da8b5a Mon Sep 17 00:00:00 2001 From: Zou Xinyi Date: Mon, 25 Nov 2024 12:33:08 +0800 Subject: [PATCH] 1 --- be/src/util/hash_util.hpp | 16 +++---- be/src/vec/columns/column_const.cpp | 5 ++- be/src/vec/columns/column_const.h | 5 ++- be/src/vec/columns/column_decimal.h | 4 +- be/src/vec/columns/column_dictionary.h | 22 ++++++---- be/src/vec/columns/column_nullable.cpp | 5 ++- be/src/vec/columns/column_nullable.h | 7 ++- be/src/vec/columns/column_object.cpp | 19 +++++--- be/src/vec/columns/column_object.h | 6 +-- be/src/vec/columns/column_string.cpp | 61 +++++++++++++++----------- be/src/vec/columns/column_string.h | 29 ++++++++---- be/src/vec/columns/column_vector.cpp | 3 +- be/src/vec/columns/column_vector.h | 9 ++-- be/src/vec/core/field.h | 11 +++-- 14 files changed, 125 insertions(+), 77 deletions(-) diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index e9ac72c5ccdcb4..d444daa8c68d11 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -46,7 +46,7 @@ class HashUtil { return std::hash()(value); } - static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t hash) { + static uint32_t zlib_crc_hash(const void* data, uint32_t bytes, uint32_t hash) { return crc32(hash, (const unsigned char*)data, bytes); } @@ -66,7 +66,7 @@ class HashUtil { // NOTE: Any changes made to this function need to be reflected in Codegen::GetHashFn. // TODO: crc32 hashes with different seeds do not result in different hash functions. // The resulting hashes are correlated. - static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) { + static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) { if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) { return zlib_crc_hash(data, bytes, hash); } @@ -93,7 +93,7 @@ class HashUtil { return hash; } - static uint64_t crc_hash64(const void* data, int32_t bytes, uint64_t hash) { + static uint64_t crc_hash64(const void* data, uint32_t bytes, uint64_t hash) { uint32_t words = bytes / sizeof(uint32_t); bytes = bytes % sizeof(uint32_t); @@ -125,7 +125,7 @@ class HashUtil { return converter.u64; } #else - static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) { + static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) { return zlib_crc_hash(data, bytes, hash); } #endif @@ -202,7 +202,7 @@ class HashUtil { // For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000 // is taken on the hash, all values will collide to the same bucket. // For string values, Fnv is slightly faster than boost. - static uint32_t fnv_hash(const void* data, int32_t bytes, uint32_t hash) { + static uint32_t fnv_hash(const void* data, uint32_t bytes, uint32_t hash) { const uint8_t* ptr = reinterpret_cast(data); while (bytes--) { @@ -213,7 +213,7 @@ class HashUtil { return hash; } - static uint64_t fnv_hash64(const void* data, int32_t bytes, uint64_t hash) { + static uint64_t fnv_hash64(const void* data, uint32_t bytes, uint64_t hash) { const uint8_t* ptr = reinterpret_cast(data); while (bytes--) { @@ -291,7 +291,7 @@ class HashUtil { // depending on hardware capabilities. // Seed values for different steps of the query execution should use different seeds // to prevent accidental key collisions. (See IMPALA-219 for more details). - static uint32_t hash(const void* data, int32_t bytes, uint32_t seed) { + static uint32_t hash(const void* data, uint32_t bytes, uint32_t seed) { #ifdef __SSE4_2__ if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) { @@ -305,7 +305,7 @@ class HashUtil { #endif } - static uint64_t hash64(const void* data, int32_t bytes, uint64_t seed) { + static uint64_t hash64(const void* data, uint32_t bytes, uint64_t seed) { #ifdef _SSE4_2_ if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) { return crc_hash64(data, bytes, seed); diff --git a/be/src/vec/columns/column_const.cpp b/be/src/vec/columns/column_const.cpp index a4b3127ad6cac9..3ee57a187d66e7 100644 --- a/be/src/vec/columns/column_const.cpp +++ b/be/src/vec/columns/column_const.cpp @@ -35,6 +35,7 @@ #include "vec/core/column_with_type_and_name.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" ColumnConst::ColumnConst(const ColumnPtr& data_, size_t s_) : data(data_), s(s_) { /// Squash Const of Const. @@ -66,7 +67,9 @@ ColumnConst::ColumnConst(const ColumnPtr& data_, size_t s_, bool create_with_emp } ColumnPtr ColumnConst::convert_to_full_column() const { - return data->replicate(Offsets(1, s)); + // Assuming the number of replicate rows will not exceed Offset(UInt32), + // currently Column::replicate only supports Uint32 Offsets + return data->replicate(Offsets(1, cast_set(s))); } ColumnPtr ColumnConst::remove_low_cardinality() const { diff --git a/be/src/vec/columns/column_const.h b/be/src/vec/columns/column_const.h index 980d9d64148ae4..efd9967b60cb40 100644 --- a/be/src/vec/columns/column_const.h +++ b/be/src/vec/columns/column_const.h @@ -48,6 +48,7 @@ class SipHash; namespace doris::vectorized { +#include "common/compile_check_begin.h" class Arena; class Block; @@ -277,7 +278,8 @@ class ColumnConst final : public COWHelper { template T get_value() const { - return get_field().safe_get>(); + // Here the cast is correct, relevant code is rather tricky. + return static_cast(get_field().safe_get>()); } void replace_column_data(const IColumn& rhs, size_t row, size_t self_row = 0) override { @@ -286,3 +288,4 @@ class ColumnConst final : public COWHelper { } }; } // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/columns/column_decimal.h b/be/src/vec/columns/column_decimal.h index d754831cc56a86..c49926814cbee6 100644 --- a/be/src/vec/columns/column_decimal.h +++ b/be/src/vec/columns/column_decimal.h @@ -53,6 +53,7 @@ class ColumnSorter; } // namespace doris namespace doris::vectorized { +#include "common/compile_check_begin.h" /// PaddedPODArray extended by Decimal scale template @@ -263,7 +264,7 @@ class ColumnDecimal final : public COWHelper> { for (U i = 0; i < s; ++i) res[i] = i; auto sort_end = res.end(); - if (limit && limit < s / 8.0) { + if (limit && limit < static_cast(s) / 8.0L) { sort_end = res.begin() + limit; if (reverse) std::partial_sort(res.begin(), sort_end, res.end(), @@ -307,3 +308,4 @@ template using ColumnVectorOrDecimal = typename ColumnVectorOrDecimalT>::Col; } // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index 69e04973af77a1..9c8d70d961da8c 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -29,6 +29,7 @@ #include "vec/core/types.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" /** * For low cardinality string columns, using ColumnDictionary can reduce memory @@ -269,9 +270,9 @@ class ColumnDictionary final : public COWHelper> { } } - int32_t find_code(const StringRef& value) const { return _dict.find_code(value); } + T find_code(const StringRef& value) const { return _dict.find_code(value); } - int32_t find_code_by_bound(const StringRef& value, bool greater, bool eq) const { + T find_code_by_bound(const StringRef& value, bool greater, bool eq) const { return _dict.find_code_by_bound(value, greater, eq); } @@ -350,8 +351,9 @@ class ColumnDictionary final : public COWHelper> { _total_str_len += value.size; } - int32_t find_code(const StringRef& value) const { - for (size_t i = 0; i < _dict_data->size(); i++) { + T find_code(const StringRef& value) const { + // _dict_data->size will not exceed the range of T. + for (T i = 0; i < _dict_data->size(); i++) { if ((*_dict_data)[i] == value) { return i; } @@ -388,11 +390,11 @@ class ColumnDictionary final : public COWHelper> { // For dictionary data of char type, sv.size is the schema length, // so use strnlen to remove the 0 at the end to get the actual length. - int32_t len = sv.size; + size_t len = sv.size; if (type == FieldType::OLAP_FIELD_TYPE_CHAR) { len = strnlen(sv.data, sv.size); } - uint32_t hash_val = HashUtil::crc_hash(sv.data, len, 0); + uint32_t hash_val = HashUtil::crc_hash(sv.data, static_cast(len), 0); _hash_values[code] = hash_val; _compute_hash_value_flags[code] = 1; return _hash_values[code]; @@ -416,13 +418,14 @@ class ColumnDictionary final : public COWHelper> { // so upper_bound is the code 0 of b, then evaluate code < 0 and returns empty // If the predicate is col <= 'a' and upper_bound-1 is -1, // then evaluate code <= -1 and returns empty - int32_t find_code_by_bound(const StringRef& value, bool greater, bool eq) const { + T find_code_by_bound(const StringRef& value, bool greater, bool eq) const { auto code = find_code(value); if (code >= 0) { return code; } - auto bound = std::upper_bound(_dict_data->begin(), _dict_data->end(), value) - - _dict_data->begin(); + auto bound = + static_cast(std::upper_bound(_dict_data->begin(), _dict_data->end(), value) - + _dict_data->begin()); return greater ? bound - greater + eq : bound - eq; } @@ -540,3 +543,4 @@ template class ColumnDictionary; using ColumnDictI32 = vectorized::ColumnDictionary; } // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp index 5e34ad4d8d4d1d..c58c78f5611d02 100644 --- a/be/src/vec/columns/column_nullable.cpp +++ b/be/src/vec/columns/column_nullable.cpp @@ -29,6 +29,7 @@ #include "vec/utils/util.hpp" namespace doris::vectorized { +#include "common/compile_check_begin.h" ColumnNullable::ColumnNullable(MutableColumnPtr&& nested_column_, MutableColumnPtr&& null_map_) : NullMapProvider(std::move(null_map_)), nested_column(std::move(nested_column_)) { @@ -62,7 +63,7 @@ void ColumnNullable::update_xxHash_with_value(size_t start, size_t end, uint64_t } else { const auto* __restrict real_null_data = assert_cast(get_null_map_column()).get_data().data(); - for (int i = start; i < end; ++i) { + for (size_t i = start; i < end; ++i) { if (real_null_data[i] != 0) { hash = HashUtil::xxHash64NullWithSeed(hash); } @@ -78,7 +79,7 @@ void ColumnNullable::update_crc_with_value(size_t start, size_t end, uint32_t& h } else { const auto* __restrict real_null_data = assert_cast(get_null_map_column()).get_data().data(); - for (int i = start; i < end; ++i) { + for (size_t i = start; i < end; ++i) { if (real_null_data[i] != 0) { hash = HashUtil::zlib_crc_hash_null(hash); } diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 2b87aa982cae50..87cf8e380ea032 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -43,6 +43,7 @@ class SipHash; namespace doris::vectorized { +#include "common/compile_check_begin.h" class Arena; class ColumnSorter; @@ -410,7 +411,8 @@ class ColumnNullable final : public COWHelper, public N } static constexpr auto MAX_NUMBER_OF_ROWS_FOR_FULL_SEARCH = 1000; size_t num_rows = size(); - size_t num_sampled_rows = std::min(static_cast(num_rows * sample_ratio), num_rows); + size_t num_sampled_rows = std::min( + static_cast(static_cast(num_rows) * sample_ratio), num_rows); size_t num_checked_rows = 0; size_t res = 0; if (num_sampled_rows == num_rows || num_rows <= MAX_NUMBER_OF_ROWS_FOR_FULL_SEARCH) { @@ -429,7 +431,7 @@ class ColumnNullable final : public COWHelper, public N if (num_checked_rows == 0) { return 0.0; } - return static_cast(res) / num_checked_rows; + return static_cast(res) / static_cast(num_checked_rows); } void convert_dict_codes_if_necessary() override { @@ -466,3 +468,4 @@ class ColumnNullable final : public COWHelper, public N ColumnPtr make_nullable(const ColumnPtr& column, bool is_nullable = false); ColumnPtr remove_nullable(const ColumnPtr& column); } // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index d5e52d07bcf788..3e8d3722305e8d 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -82,6 +82,7 @@ #endif namespace doris::vectorized { +#include "common/compile_check_begin.h" namespace { DataTypePtr create_array_of_type(TypeIndex type, size_t num_dimensions, bool is_nullable) { @@ -653,7 +654,7 @@ bool ColumnObject::Subcolumn::check_if_sparse_column(size_t num_rows) { defaults_ratio.push_back(data[i]->get_ratio_of_default_rows()); } double default_ratio = std::accumulate(defaults_ratio.begin(), defaults_ratio.end(), 0.0) / - defaults_ratio.size(); + static_cast(defaults_ratio.size()); return default_ratio >= config::variant_ratio_of_defaults_as_sparse_column; } @@ -1294,7 +1295,11 @@ rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInDat if (!json.IsObject()) { return nullptr; } - rapidjson::Value name(current_key.data(), current_key.size()); + /*! RapidJSON uses 32-bit array/string indices even on 64-bit platforms, + instead of using \c size_t. Users may override the SizeType by defining + \ref RAPIDJSON_NO_SIZETYPEDEFINE. + */ + rapidjson::Value name(current_key.data(), cast_set(current_key.size())); auto it = json.FindMember(name); if (it == json.MemberEnd()) { return nullptr; @@ -1312,7 +1317,7 @@ rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInDat // 3. empty root jsonb value(not null) // 4. type is nothing bool skip_empty_json(const ColumnNullable* nullable, const DataTypePtr& type, - TypeIndex base_type_id, int row, const PathInData& path) { + TypeIndex base_type_id, size_t row, const PathInData& path) { // skip nulls if (nullable && nullable->is_null_at(row)) { return true; @@ -1348,7 +1353,7 @@ Status find_and_set_leave_value(const IColumn* column, const PathInData& path, const DataTypeSerDeSPtr& type_serde, const DataTypePtr& type, TypeIndex base_type_index, rapidjson::Value& root, rapidjson::Document::AllocatorType& allocator, Arena& mem_pool, - int row) { + size_t row) { #ifndef NDEBUG // sanitize type and column if (column->get_name() != type->create_column()->get_name()) { @@ -1416,7 +1421,7 @@ void get_json_by_column_tree(rapidjson::Value& root, rapidjson::Document::Alloca } } -Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* output) const { +Status ColumnObject::serialize_one_row_to_string(size_t row, std::string* output) const { if (!is_finalized()) { const_cast(this)->finalize(FinalizeMode::READ_MODE); } @@ -1432,7 +1437,7 @@ Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* outpu return Status::OK(); } -Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& output) const { +Status ColumnObject::serialize_one_row_to_string(size_t row, BufferWritable& output) const { if (!is_finalized()) { const_cast(this)->finalize(FinalizeMode::READ_MODE); } @@ -1447,7 +1452,7 @@ Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& ou return Status::OK(); } -Status ColumnObject::serialize_one_row_to_json_format(int64_t row, rapidjson::StringBuffer* output, +Status ColumnObject::serialize_one_row_to_json_format(size_t row, rapidjson::StringBuffer* output, bool* is_null) const { CHECK(is_finalized()); if (subcolumns.empty()) { diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index 1c8f38056c9d54..3ca9aff389aac4 100644 --- a/be/src/vec/columns/column_object.h +++ b/be/src/vec/columns/column_object.h @@ -272,12 +272,12 @@ class ColumnObject final : public COWHelper { return subcolumns.get_mutable_root()->data.get_finalized_column_ptr()->assume_mutable(); } - Status serialize_one_row_to_string(int64_t row, std::string* output) const; + Status serialize_one_row_to_string(size_t row, std::string* output) const; - Status serialize_one_row_to_string(int64_t row, BufferWritable& output) const; + Status serialize_one_row_to_string(size_t row, BufferWritable& output) const; // serialize one row to json format - Status serialize_one_row_to_json_format(int64_t row, rapidjson::StringBuffer* output, + Status serialize_one_row_to_json_format(size_t row, rapidjson::StringBuffer* output, bool* is_null) const; // merge multiple sub sparse columns into root diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 3caa194551bf79..cb83a29bbada2c 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -34,6 +34,7 @@ #include "vec/core/sort_block.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" template void ColumnStr::sanity_check() const { @@ -74,8 +75,8 @@ MutableColumnPtr ColumnStr::clone_resized(size_t to_size) const { res->offsets.assign(offsets.begin(), offsets.end()); res->chars.assign(chars.begin(), chars.end()); } - - res->offsets.resize_fill(to_size, chars.size()); + // If offset is uint32, size will not exceed, check the size when inserting data into ColumnStr. + res->offsets.resize_fill(to_size, static_cast(chars.size())); } return res; @@ -92,14 +93,14 @@ void ColumnStr::shrink_padding_chars() { // deal the 0-th element. no need to move. auto next_start = offset[0]; - offset[0] = strnlen(data, size_at(0)); + offset[0] = static_cast(strnlen(data, size_at(0))); for (size_t i = 1; i < size; i++) { // get the i-th length and whole move it to cover the last's trailing void auto length = strnlen(data + next_start, offset[i] - next_start); memmove(data + offset[i - 1], data + next_start, length); // offset i will be changed. so save the old value for (i+1)-th to get its length. next_start = offset[i]; - offset[i] = offset[i - 1] + length; + offset[i] = offset[i - 1] + static_cast(length); } chars.resize_fill(offsets.back()); // just call it to shrink memory here. no possible to expand. } @@ -125,8 +126,8 @@ void ColumnStr::insert_range_from_ignore_overflow(const doris::vectorized::IC "Parameter out of bound in IColumnStr::insert_range_from method."); } - size_t nested_offset = src_concrete.offset_at(start); - size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset; + auto nested_offset = src_concrete.offset_at(start); + auto nested_length = src_concrete.offsets[start + length - 1] - nested_offset; size_t old_chars_size = chars.size(); chars.resize(old_chars_size + nested_length); @@ -136,7 +137,7 @@ void ColumnStr::insert_range_from_ignore_overflow(const doris::vectorized::IC offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length); } else { size_t old_size = offsets.size(); - size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray + auto prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray offsets.resize(old_size + length); for (size_t i = 0; i < length; ++i) { @@ -161,8 +162,8 @@ void ColumnStr::insert_range_from(const IColumn& src, size_t start, size_t le doris::ErrorCode::INTERNAL_ERROR, "Parameter out of bound in IColumnStr::insert_range_from method."); } - size_t nested_offset = src_offsets[static_cast(start) - 1]; - size_t nested_length = src_offsets[start + length - 1] - nested_offset; + auto nested_offset = src_offsets[static_cast(start) - 1]; + auto nested_length = src_offsets[start + length - 1] - nested_offset; size_t old_chars_size = chars.size(); check_chars_length(old_chars_size + nested_length, offsets.size() + length); @@ -174,11 +175,13 @@ void ColumnStr::insert_range_from(const IColumn& src, size_t start, size_t le offsets.assign(src_offsets.begin(), src_offsets.begin() + length); } else { size_t old_size = offsets.size(); - size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray + auto prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray offsets.resize(old_size + length); for (size_t i = 0; i < length; ++i) { - offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset; + // if Offsets is uint32, size will not exceed range of uint32, cast is OK. + offsets[old_size + i] = + static_cast(src_offsets[start + i] - nested_offset) + prev_max_offset; } } }; @@ -208,7 +211,7 @@ void ColumnStr::insert_many_from(const IColumn& src, size_t position, size_t auto prev_pos = old_chars_size; for (; start_pos < end_pos; ++start_pos) { memcpy(&chars[prev_pos], data_val, data_length); - offsets[start_pos] = prev_pos + data_length; + offsets[start_pos] = static_cast(prev_pos + data_length); prev_pos = prev_pos + data_length; } } @@ -229,7 +232,8 @@ void ColumnStr::insert_indices_from(const IColumn& src, const uint32_t* indic for (const auto* x = indices_begin; x != indices_end; ++x) { int64_t src_offset = *x; total_chars_size += src_offset_data[src_offset] - src_offset_data[src_offset - 1]; - dst_offsets_data[dst_offsets_pos++] = total_chars_size; + // if Offsets is uint32, size will not exceed range of uint32, cast is OK. + dst_offsets_data[dst_offsets_pos++] = static_cast(total_chars_size); } check_chars_length(total_chars_size, offsets.size()); @@ -267,13 +271,16 @@ void ColumnStr::update_crcs_with_value(uint32_t* __restrict hashes, doris::Pr if (null_data == nullptr) { for (size_t i = 0; i < s; i++) { auto data_ref = get_data_at(i); - hashes[i] = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hashes[i]); + // If offset is uint32, size will not exceed, check the size when inserting data into ColumnStr. + hashes[i] = HashUtil::zlib_crc_hash(data_ref.data, static_cast(data_ref.size), + hashes[i]); } } else { for (size_t i = 0; i < s; i++) { if (null_data[i] == 0) { auto data_ref = get_data_at(i); - hashes[i] = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hashes[i]); + hashes[i] = HashUtil::zlib_crc_hash( + data_ref.data, static_cast(data_ref.size), hashes[i]); } } } @@ -391,8 +398,9 @@ ColumnPtr ColumnStr::permute(const IColumn::Permutation& perm, size_t limit) template StringRef ColumnStr::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const { - uint32_t string_size(size_at(n)); - uint32_t offset(offset_at(n)); + // Use uint32 instead of size_t to reduce agg key's length. + auto string_size(static_cast(size_at(n))); + auto offset(static_cast(offset_at(n))); StringRef res; res.size = sizeof(string_size) + string_size; @@ -421,7 +429,7 @@ const char* ColumnStr::deserialize_and_insert_from_arena(const char* pos) { template size_t ColumnStr::get_max_row_byte_size() const { - size_t max_size = 0; + T max_size = 0; size_t num_rows = offsets.size(); for (size_t i = 0; i < num_rows; ++i) { max_size = std::max(max_size, size_at(i)); @@ -434,8 +442,9 @@ template void ColumnStr::serialize_vec(std::vector& keys, size_t num_rows, size_t max_row_byte_size) const { for (size_t i = 0; i < num_rows; ++i) { - uint32_t offset(offset_at(i)); - uint32_t string_size(size_at(i)); + // Use uint32 instead of size_t to reduce agg key's length. + auto offset(static_cast(offset_at(i))); + auto string_size(static_cast(size_at(i))); auto* ptr = const_cast(keys[i].data + keys[i].size); memcpy_fixed(ptr, (char*)&string_size); @@ -458,8 +467,8 @@ void ColumnStr::serialize_vec_with_null_map(std::vector& keys, siz memcpy(dest, null_map + i, sizeof(uint8_t)); if (null_map[i] == 0) { - UInt32 offset(offset_at(i)); - UInt32 string_size(size_at(i)); + auto offset(offset_at(i)); + auto string_size(size_at(i)); memcpy_fixed(dest + 1, (char*)&string_size); memcpy(dest + 1 + sizeof(string_size), &chars[offset], string_size); @@ -475,8 +484,8 @@ void ColumnStr::serialize_vec_with_null_map(std::vector& keys, siz // serialize null first memcpy(dest, null_map + i, sizeof(uint8_t)); - UInt32 offset(offset_at(i)); - UInt32 string_size(size_at(i)); + auto offset(offset_at(i)); + auto string_size(size_at(i)); memcpy_fixed(dest + 1, (char*)&string_size); memcpy(dest + 1 + sizeof(string_size), &chars[offset], string_size); @@ -559,8 +568,8 @@ ColumnPtr ColumnStr::replicate(const IColumn::Offsets& replicate_offsets) con T current_new_offset = 0; for (size_t i = 0; i < col_size; ++i) { - size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; - size_t string_size = offsets[i] - prev_string_offset; + T size_to_replicate = replicate_offsets[i] - prev_replicate_offset; + T string_size = offsets[i] - prev_string_offset; for (size_t j = 0; j < size_to_replicate; ++j) { current_new_offset += string_size; diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index f116d4ce1f17cf..1674fd90933dbe 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -47,6 +47,7 @@ #include "vec/core/types.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" class Arena; class ColumnSorter; @@ -86,10 +87,10 @@ class ColumnStr final : public COWHelper> { Chars chars; // Start position of i-th element. - size_t ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; } + T ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; } /// Size of i-th element, including terminating zero. - size_t ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - offsets[i - 1]; } + T ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - offsets[i - 1]; } template struct less; @@ -220,7 +221,7 @@ class ColumnStr final : public COWHelper> { const char* ptr = strings[0].data; for (size_t i = 0; i != num; i++) { - uint32_t len = strings[i].size; + size_t len = strings[i].size; length += len; offset += len; offsets.push_back(offset); @@ -282,7 +283,7 @@ class ColumnStr final : public COWHelper> { Char* data = chars.data(); size_t offset = old_size; for (size_t i = 0; i < num; i++) { - uint32_t len = strings[i].size; + size_t len = strings[i].size; if (len) { memcpy(data + offset, strings[i].data, len); offset += len; @@ -305,7 +306,7 @@ class ColumnStr final : public COWHelper> { Char* data = chars.data(); size_t offset = old_size; for (size_t i = 0; i < num; i++) { - uint32_t len = strings[i].size; + size_t len = strings[i].size; if (len) { memcpy(data + offset, strings[i].data, copy_length); offset += len; @@ -342,9 +343,15 @@ class ColumnStr final : public COWHelper> { for (size_t i = 0; i < num; i++) { int32_t codeword = data_array[i + start_index]; new_size += dict[codeword].size; - offsets[offset_size + i] = new_size; + offsets[offset_size + i] = static_cast(new_size); } + if (new_size > std::numeric_limits::max()) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "ColumnString insert size out of range type {} [{},{}]", + typeid(T).name(), std::numeric_limits::min(), + std::numeric_limits::max()); + } check_chars_length(new_size, offsets.size()); chars.resize(new_size); @@ -406,13 +413,16 @@ class ColumnStr final : public COWHelper> { for (size_t i = start; i < end; ++i) { if (null_data[i] == 0) { auto data_ref = get_data_at(i); - hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hash); + // If offset is uint32, size will not exceed, check the size when inserting data into ColumnStr. + hash = HashUtil::zlib_crc_hash(data_ref.data, + static_cast(data_ref.size), hash); } } } else { for (size_t i = start; i < end; ++i) { auto data_ref = get_data_at(i); - hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hash); + hash = HashUtil::zlib_crc_hash(data_ref.data, static_cast(data_ref.size), + hash); } } } @@ -473,7 +483,7 @@ class ColumnStr final : public COWHelper> { void insert_default() override { offsets.push_back(chars.size()); } void insert_many_defaults(size_t length) override { - offsets.resize_fill(offsets.size() + length, chars.size()); + offsets.resize_fill(offsets.size() + length, static_cast(chars.size())); } int compare_at(size_t n, size_t m, const IColumn& rhs_, @@ -525,3 +535,4 @@ class ColumnStr final : public COWHelper> { using ColumnString = ColumnStr; using ColumnString64 = ColumnStr; } // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp index 0e24446a5cdb92..f0f0bec8b99bb1 100644 --- a/be/src/vec/columns/column_vector.cpp +++ b/be/src/vec/columns/column_vector.cpp @@ -42,6 +42,7 @@ #include "vec/data_types/data_type.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" template StringRef ColumnVector::serialize_value_into_arena(size_t n, Arena& arena, @@ -242,7 +243,7 @@ void ColumnVector::get_permutation(bool reverse, size_t limit, int nan_direct if (s == 0) return; // std::partial_sort need limit << s can get performance benefit - if (limit > (s / 8.0)) limit = 0; + if (limit > (s / 8.0L)) limit = 0; if (limit) { for (size_t i = 0; i < s; ++i) res[i] = i; diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h index 2676d6d344468b..cf26bc14b73325 100644 --- a/be/src/vec/columns/column_vector.h +++ b/be/src/vec/columns/column_vector.h @@ -61,6 +61,7 @@ class ColumnSorter; } // namespace doris namespace doris::vectorized { +#include "common/compile_check_begin.h" /** Stuff for comparing numbers. * Integer values are compared as usual. @@ -178,10 +179,9 @@ class ColumnVector final : public COWHelper> { void insert_range_of_integer(T begin, T end) { if constexpr (std::is_integral_v) { auto old_size = data.size(); - data.resize(old_size + (end - begin)); - for (int i = 0; i < end - begin; i++) { - data[old_size + i] = begin + i; - } + auto new_size = old_size + static_cast(end - begin); + data.resize(new_size); + std::iota(data.begin() + old_size, data.begin() + new_size, begin); } else { throw doris::Exception(ErrorCode::INTERNAL_ERROR, "double column not support insert_range_of_integer"); @@ -411,3 +411,4 @@ class ColumnVector final : public COWHelper> { }; } // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h index 8113dc602fbd4e..341f65e075ed11 100644 --- a/be/src/vec/core/field.h +++ b/be/src/vec/core/field.h @@ -165,7 +165,7 @@ class JsonbField { public: JsonbField() = default; - JsonbField(const char* ptr, uint32_t len) : size(len) { + JsonbField(const char* ptr, size_t len) : size(len) { data = new char[size]; if (!data) { LOG(FATAL) << "new data buffer failed, size: " << size; @@ -213,7 +213,7 @@ class JsonbField { } const char* get_value() const { return data; } - uint32_t get_size() const { return size; } + size_t get_size() const { return size; } bool operator<(const JsonbField& r) const { LOG(FATAL) << "comparing between JsonbField is not supported"; @@ -252,7 +252,7 @@ class JsonbField { private: char* data = nullptr; - uint32_t size = 0; + size_t size = 0; }; template @@ -498,6 +498,9 @@ class Field { bool is_null() const { return which == Types::Null; } + // The template parameter T needs to be consistent with `which`. + // If not, use NearestFieldType<> externally. + // Maybe modify this in the future, reference: https://github.com/ClickHouse/ClickHouse/pull/22003 template T& get() { using TWithoutRef = std::remove_reference_t; @@ -520,6 +523,8 @@ class Field { return true; } + // The template parameter T needs to be consistent with `which`. + // If not, use NearestFieldType<> externally. template bool try_get(T& result) const { const Types::Which requested = TypeToEnum>::value;