Skip to content

Commit

Permalink
1
Browse files Browse the repository at this point in the history
  • Loading branch information
xinyiZzz committed Nov 21, 2024
1 parent 65ac745 commit e7935fb
Show file tree
Hide file tree
Showing 15 changed files with 135 additions and 77 deletions.
16 changes: 8 additions & 8 deletions be/src/util/hash_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class HashUtil {
return std::hash<T>()(value);
}

// Hashes `bytes` bytes starting at `data` by delegating to crc32(), passing `hash`
// as crc32's first argument, and returns crc32's result unchanged.
// `data` is cast to `const unsigned char*` for the call.
// NOTE(review): crc32 is presumably zlib's crc32 (per this function's name) — confirm.
// NOTE(review): this span is a rendered diff hunk, so the signature appears twice; the
// first line looks like the removed version (`int32_t bytes`) and the second like the
// added version (`uint32_t bytes`) — confirm against the commit.
static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t hash) {
static uint32_t zlib_crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
return crc32(hash, (const unsigned char*)data, bytes);
}

Expand All @@ -66,7 +66,7 @@ class HashUtil {
// NOTE: Any changes made to this function need to be reflected in Codegen::GetHashFn.
// TODO: crc32 hashes with different seeds do not result in different hash functions.
// The resulting hashes are correlated.
static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) {
static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) {
return zlib_crc_hash(data, bytes, hash);
}
Expand All @@ -93,7 +93,7 @@ class HashUtil {
return hash;
}

static uint64_t crc_hash64(const void* data, int32_t bytes, uint64_t hash) {
static uint64_t crc_hash64(const void* data, uint32_t bytes, uint64_t hash) {
uint32_t words = bytes / sizeof(uint32_t);
bytes = bytes % sizeof(uint32_t);

Expand Down Expand Up @@ -125,7 +125,7 @@ class HashUtil {
return converter.u64;
}
#else
// Definition of crc_hash used in the #else branch of the enclosing preprocessor
// conditional: forwards `data`, `bytes` and `hash` unchanged to zlib_crc_hash and
// returns its result.
// NOTE(review): the #if condition is outside this hunk; presumably it selects the
// SSE4.2 build — confirm.
// NOTE(review): rendered diff hunk — the first signature line looks like the removed
// version (`int32_t bytes`), the second like the added version (`uint32_t bytes`).
static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) {
static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
return zlib_crc_hash(data, bytes, hash);
}
#endif
Expand Down Expand Up @@ -202,7 +202,7 @@ class HashUtil {
// For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000
// is taken on the hash, all values will collide to the same bucket.
// For string values, Fnv is slightly faster than boost.
static uint32_t fnv_hash(const void* data, int32_t bytes, uint32_t hash) {
static uint32_t fnv_hash(const void* data, uint32_t bytes, uint32_t hash) {
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);

while (bytes--) {
Expand All @@ -213,7 +213,7 @@ class HashUtil {
return hash;
}

static uint64_t fnv_hash64(const void* data, int32_t bytes, uint64_t hash) {
static uint64_t fnv_hash64(const void* data, uint32_t bytes, uint64_t hash) {
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);

while (bytes--) {
Expand Down Expand Up @@ -291,7 +291,7 @@ class HashUtil {
// depending on hardware capabilities.
// Seed values for different steps of the query execution should use different seeds
// to prevent accidental key collisions. (See IMPALA-219 for more details).
static uint32_t hash(const void* data, int32_t bytes, uint32_t seed) {
static uint32_t hash(const void* data, uint32_t bytes, uint32_t seed) {
#ifdef __SSE4_2__

if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
Expand All @@ -305,7 +305,7 @@ class HashUtil {
#endif
}

static uint64_t hash64(const void* data, int32_t bytes, uint64_t seed) {
static uint64_t hash64(const void* data, uint32_t bytes, uint64_t seed) {
#ifdef _SSE4_2_
if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
return crc_hash64(data, bytes, seed);
Expand Down
5 changes: 4 additions & 1 deletion be/src/vec/columns/column_const.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "vec/core/column_with_type_and_name.h"

namespace doris::vectorized {
#include "common/compile_check_begin.h"

ColumnConst::ColumnConst(const ColumnPtr& data_, size_t s_) : data(data_), s(s_) {
/// Squash Const of Const.
Expand Down Expand Up @@ -66,7 +67,9 @@ ColumnConst::ColumnConst(const ColumnPtr& data_, size_t s_, bool create_with_emp
}

// Builds the result by calling replicate() on the wrapped `data` column with an
// Offsets object constructed as Offsets(1, <size>), where the size is the member `s`.
// NOTE(review): rendered diff hunk — the first `return` looks like the removed line
// (passes the size_t `s` directly) and the second like the added line (narrows `s`
// via cast_set<Offset>); only one of them exists in the real file.
ColumnPtr ColumnConst::convert_to_full_column() const {
return data->replicate(Offsets(1, s));
// Assumes the number of replicated rows will not exceed the range of Offset (UInt32);
// Column::replicate currently only supports UInt32 offsets.
return data->replicate(Offsets(1, cast_set<Offset>(s)));
}

ColumnPtr ColumnConst::remove_low_cardinality() const {
Expand Down
5 changes: 4 additions & 1 deletion be/src/vec/columns/column_const.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
class SipHash;

namespace doris::vectorized {
#include "common/compile_check_begin.h"

class Arena;
class Block;
Expand Down Expand Up @@ -277,7 +278,8 @@ class ColumnConst final : public COWHelper<IColumn, ColumnConst> {

// Returns the column's value as T: obtains the field via get_field() and extracts it
// with safe_get<NearestFieldType<T>>().
// NOTE(review): rendered diff hunk — the first `return` looks like the removed line
// (implicit conversion from NearestFieldType<T> to T) and the second like the added
// line (the same conversion made explicit with static_cast<T>).
template <typename T>
T get_value() const {
return get_field().safe_get<NearestFieldType<T>>();
// Here the cast is correct, relevant code is rather tricky.
return static_cast<T>(get_field().safe_get<NearestFieldType<T>>());
}

void replace_column_data(const IColumn& rhs, size_t row, size_t self_row = 0) override {
Expand All @@ -286,3 +288,4 @@ class ColumnConst final : public COWHelper<IColumn, ColumnConst> {
}
};
} // namespace doris::vectorized
#include "common/compile_check_end.h"
4 changes: 3 additions & 1 deletion be/src/vec/columns/column_decimal.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class ColumnSorter;
} // namespace doris

namespace doris::vectorized {
#include "common/compile_check_begin.h"

/// PaddedPODArray extended by Decimal scale
template <typename T>
Expand Down Expand Up @@ -263,7 +264,7 @@ class ColumnDecimal final : public COWHelper<IColumn, ColumnDecimal<T>> {
for (U i = 0; i < s; ++i) res[i] = i;

auto sort_end = res.end();
if (limit && limit < s / 8.0) {
if (limit && limit < static_cast<double>(s) / 8.0L) {
sort_end = res.begin() + limit;
if (reverse)
std::partial_sort(res.begin(), sort_end, res.end(),
Expand Down Expand Up @@ -307,3 +308,4 @@ template <typename T>
using ColumnVectorOrDecimal = typename ColumnVectorOrDecimalT<T, IsDecimalNumber<T>>::Col;

} // namespace doris::vectorized
#include "common/compile_check_end.h"
24 changes: 15 additions & 9 deletions be/src/vec/columns/column_dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "vec/core/types.h"

namespace doris::vectorized {
#include "common/compile_check_begin.h"

/**
* For low cardinality string columns, using ColumnDictionary can reduce memory
Expand Down Expand Up @@ -269,9 +270,9 @@ class ColumnDictionary final : public COWHelper<IColumn, ColumnDictionary<T>> {
}
}

// Forwards the lookup of `value` to _dict.find_code() and returns its result.
// NOTE(review): rendered diff hunk — the first line looks like the removed version
// (returns int32_t), the second like the added version (returns T).
int32_t find_code(const StringRef& value) const { return _dict.find_code(value); }
T find_code(const StringRef& value) const { return _dict.find_code(value); }

// Forwards `value`, `greater` and `eq` unchanged to _dict.find_code_by_bound() and
// returns its result.
// NOTE(review): rendered diff hunk — the first signature line looks like the removed
// version (returns int32_t), the second like the added version (returns T).
int32_t find_code_by_bound(const StringRef& value, bool greater, bool eq) const {
T find_code_by_bound(const StringRef& value, bool greater, bool eq) const {
return _dict.find_code_by_bound(value, greater, eq);
}

Expand Down Expand Up @@ -350,8 +351,9 @@ class ColumnDictionary final : public COWHelper<IColumn, ColumnDictionary<T>> {
_total_str_len += value.size;
}

int32_t find_code(const StringRef& value) const {
for (size_t i = 0; i < _dict_data->size(); i++) {
T find_code(const StringRef& value) const {
// _dict_data->size will not exceed the range of T.
for (T i = 0; i < _dict_data->size(); i++) {
if ((*_dict_data)[i] == value) {
return i;
}
Expand Down Expand Up @@ -388,11 +390,12 @@ class ColumnDictionary final : public COWHelper<IColumn, ColumnDictionary<T>> {

// For dictionary data of char type, sv.size is the schema length,
// so use strnlen to remove the 0 at the end to get the actual length.
int32_t len = sv.size;
size_t len = sv.size;
if (type == FieldType::OLAP_FIELD_TYPE_CHAR) {
len = strnlen(sv.data, sv.size);
}
uint32_t hash_val = HashUtil::crc_hash(sv.data, len, 0);
// Size of a String line will not exceed 4G, check the size when inserting data into ColumnStr<T>.
uint32_t hash_val = HashUtil::crc_hash(sv.data, cast_set<uint32_t>(len), 0);
_hash_values[code] = hash_val;
_compute_hash_value_flags[code] = 1;
return _hash_values[code];
Expand All @@ -416,13 +419,15 @@ class ColumnDictionary final : public COWHelper<IColumn, ColumnDictionary<T>> {
// so upper_bound is the code 0 of b, then evaluate code < 0 and returns empty
// If the predicate is col <= 'a' and upper_bound-1 is -1,
// then evaluate code <= -1 and returns empty
// Resolves `value` to a code usable in a range comparison.
// If find_code(value) yields a non-negative code, that code is returned as is.
// Otherwise the result is derived from the std::upper_bound position of `value`
// within *_dict_data, adjusted by the `greater` and `eq` flags (see the final return).
// NOTE(review): std::upper_bound requires *_dict_data to be sorted with respect to
// `value` — presumably guaranteed by the caller; confirm.
// NOTE(review): rendered diff hunk — the signature and the `bound` computation each
// appear twice; the first occurrence looks like the removed version (int32_t return,
// unchecked iterator difference) and the second like the added version (T return,
// difference narrowed via cast_set<T>).
int32_t find_code_by_bound(const StringRef& value, bool greater, bool eq) const {
T find_code_by_bound(const StringRef& value, bool greater, bool eq) const {
auto code = find_code(value);
if (code >= 0) {
return code;
}
// Index of the first dictionary entry that compares greater than `value`.
auto bound = std::upper_bound(_dict_data->begin(), _dict_data->end(), value) -
_dict_data->begin();
// The length of code will not exceed the range of T.
auto bound =
cast_set<T>(std::upper_bound(_dict_data->begin(), _dict_data->end(), value) -
_dict_data->begin());
// `greater` and `eq` are bools used arithmetically (false = 0, true = 1):
// greater -> bound - 1 + eq, otherwise -> bound - eq.
return greater ? bound - greater + eq : bound - eq;
}

Expand Down Expand Up @@ -540,3 +545,4 @@ template class ColumnDictionary<int32_t>;
using ColumnDictI32 = vectorized::ColumnDictionary<doris::vectorized::Int32>;

} // namespace doris::vectorized
#include "common/compile_check_end.h"
5 changes: 3 additions & 2 deletions be/src/vec/columns/column_nullable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "vec/utils/util.hpp"

namespace doris::vectorized {
#include "common/compile_check_begin.h"

ColumnNullable::ColumnNullable(MutableColumnPtr&& nested_column_, MutableColumnPtr&& null_map_)
: NullMapProvider(std::move(null_map_)), nested_column(std::move(nested_column_)) {
Expand Down Expand Up @@ -62,7 +63,7 @@ void ColumnNullable::update_xxHash_with_value(size_t start, size_t end, uint64_t
} else {
const auto* __restrict real_null_data =
assert_cast<const ColumnUInt8&>(get_null_map_column()).get_data().data();
for (int i = start; i < end; ++i) {
for (size_t i = start; i < end; ++i) {
if (real_null_data[i] != 0) {
hash = HashUtil::xxHash64NullWithSeed(hash);
}
Expand All @@ -78,7 +79,7 @@ void ColumnNullable::update_crc_with_value(size_t start, size_t end, uint32_t& h
} else {
const auto* __restrict real_null_data =
assert_cast<const ColumnUInt8&>(get_null_map_column()).get_data().data();
for (int i = start; i < end; ++i) {
for (size_t i = start; i < end; ++i) {
if (real_null_data[i] != 0) {
hash = HashUtil::zlib_crc_hash_null(hash);
}
Expand Down
7 changes: 5 additions & 2 deletions be/src/vec/columns/column_nullable.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
class SipHash;

namespace doris::vectorized {
#include "common/compile_check_begin.h"
class Arena;
class ColumnSorter;

Expand Down Expand Up @@ -410,7 +411,8 @@ class ColumnNullable final : public COWHelper<IColumn, ColumnNullable>, public N
}
static constexpr auto MAX_NUMBER_OF_ROWS_FOR_FULL_SEARCH = 1000;
size_t num_rows = size();
size_t num_sampled_rows = std::min(static_cast<size_t>(num_rows * sample_ratio), num_rows);
size_t num_sampled_rows = std::min(
static_cast<size_t>(static_cast<double>(num_rows) * sample_ratio), num_rows);
size_t num_checked_rows = 0;
size_t res = 0;
if (num_sampled_rows == num_rows || num_rows <= MAX_NUMBER_OF_ROWS_FOR_FULL_SEARCH) {
Expand All @@ -429,7 +431,7 @@ class ColumnNullable final : public COWHelper<IColumn, ColumnNullable>, public N
if (num_checked_rows == 0) {
return 0.0;
}
return static_cast<double>(res) / num_checked_rows;
return static_cast<double>(res) / static_cast<double>(num_checked_rows);
}

void convert_dict_codes_if_necessary() override {
Expand Down Expand Up @@ -466,3 +468,4 @@ class ColumnNullable final : public COWHelper<IColumn, ColumnNullable>, public N
ColumnPtr make_nullable(const ColumnPtr& column, bool is_nullable = false);
ColumnPtr remove_nullable(const ColumnPtr& column);
} // namespace doris::vectorized
#include "common/compile_check_end.h"
23 changes: 15 additions & 8 deletions be/src/vec/columns/column_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
#endif

namespace doris::vectorized {
#include "common/compile_check_begin.h"
namespace {

DataTypePtr create_array_of_type(TypeIndex type, size_t num_dimensions, bool is_nullable) {
Expand Down Expand Up @@ -653,7 +654,7 @@ bool ColumnObject::Subcolumn::check_if_sparse_column(size_t num_rows) {
defaults_ratio.push_back(data[i]->get_ratio_of_default_rows());
}
double default_ratio = std::accumulate(defaults_ratio.begin(), defaults_ratio.end(), 0.0) /
defaults_ratio.size();
static_cast<double>(defaults_ratio.size());
return default_ratio >= config::variant_ratio_of_defaults_as_sparse_column;
}

Expand Down Expand Up @@ -1294,7 +1295,11 @@ rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInDat
if (!json.IsObject()) {
return nullptr;
}
rapidjson::Value name(current_key.data(), current_key.size());
/*! RapidJSON uses 32-bit array/string indices even on 64-bit platforms,
instead of using \c size_t. Users may override the SizeType by defining
\ref RAPIDJSON_NO_SIZETYPEDEFINE.
*/
rapidjson::Value name(current_key.data(), cast_set<unsigned>(current_key.size()));
auto it = json.FindMember(name);
if (it == json.MemberEnd()) {
return nullptr;
Expand All @@ -1312,7 +1317,7 @@ rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInDat
// 3. empty root jsonb value(not null)
// 4. type is nothing
bool skip_empty_json(const ColumnNullable* nullable, const DataTypePtr& type,
TypeIndex base_type_id, int row, const PathInData& path) {
TypeIndex base_type_id, size_t row, const PathInData& path) {
// skip nulls
if (nullable && nullable->is_null_at(row)) {
return true;
Expand Down Expand Up @@ -1348,7 +1353,7 @@ Status find_and_set_leave_value(const IColumn* column, const PathInData& path,
const DataTypeSerDeSPtr& type_serde, const DataTypePtr& type,
TypeIndex base_type_index, rapidjson::Value& root,
rapidjson::Document::AllocatorType& allocator, Arena& mem_pool,
int row) {
size_t row) {
#ifndef NDEBUG
// sanitize type and column
if (column->get_name() != type->create_column()->get_name()) {
Expand All @@ -1371,7 +1376,9 @@ Status find_and_set_leave_value(const IColumn* column, const PathInData& path,
<< ", root: " << std::string(buffer.GetString(), buffer.GetSize());
return Status::NotFound("Not found path {}", path.get_path());
}
RETURN_IF_ERROR(type_serde->write_one_cell_to_json(*column, *target, allocator, mem_pool, row));
// TODO: in next pr, the `int row_num` in serde method parameters will be changed to `size_t`
RETURN_IF_ERROR(type_serde->write_one_cell_to_json(*column, *target, allocator, mem_pool,
cast_set<int>(row)));
return Status::OK();
}

Expand Down Expand Up @@ -1416,7 +1423,7 @@ void get_json_by_column_tree(rapidjson::Value& root, rapidjson::Document::Alloca
}
}

Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* output) const {
Status ColumnObject::serialize_one_row_to_string(size_t row, std::string* output) const {
if (!is_finalized()) {
const_cast<ColumnObject*>(this)->finalize(FinalizeMode::READ_MODE);
}
Expand All @@ -1432,7 +1439,7 @@ Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* outpu
return Status::OK();
}

Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& output) const {
Status ColumnObject::serialize_one_row_to_string(size_t row, BufferWritable& output) const {
if (!is_finalized()) {
const_cast<ColumnObject*>(this)->finalize(FinalizeMode::READ_MODE);
}
Expand All @@ -1447,7 +1454,7 @@ Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& ou
return Status::OK();
}

Status ColumnObject::serialize_one_row_to_json_format(int64_t row, rapidjson::StringBuffer* output,
Status ColumnObject::serialize_one_row_to_json_format(size_t row, rapidjson::StringBuffer* output,
bool* is_null) const {
CHECK(is_finalized());
if (subcolumns.empty()) {
Expand Down
6 changes: 3 additions & 3 deletions be/src/vec/columns/column_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,12 +272,12 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {
return subcolumns.get_mutable_root()->data.get_finalized_column_ptr()->assume_mutable();
}

Status serialize_one_row_to_string(int64_t row, std::string* output) const;
Status serialize_one_row_to_string(size_t row, std::string* output) const;

Status serialize_one_row_to_string(int64_t row, BufferWritable& output) const;
Status serialize_one_row_to_string(size_t row, BufferWritable& output) const;

// serialize one row to json format
Status serialize_one_row_to_json_format(int64_t row, rapidjson::StringBuffer* output,
Status serialize_one_row_to_json_format(size_t row, rapidjson::StringBuffer* output,
bool* is_null) const;

// merge multiple sub sparse columns into root
Expand Down
Loading

0 comments on commit e7935fb

Please sign in to comment.