Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[enhance](orc) Optimize ORC Predicate Pushdown for OR-connected Predicate #43255

Merged
merged 29 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
5553608
impl _init_search_argument
suxiaogang223 Nov 5, 2024
4e40525
add profile
suxiaogang223 Nov 5, 2024
9e50296
clean code
suxiaogang223 Nov 5, 2024
7aea987
fix
suxiaogang223 Nov 5, 2024
b1920a8
fix
suxiaogang223 Nov 5, 2024
46ca9ea
fix profile
suxiaogang223 Nov 5, 2024
1d4c712
fix _build_search_argument for "is null" and "is not null"
suxiaogang223 Nov 5, 2024
90066e1
add check_expr_can_push_down
suxiaogang223 Nov 6, 2024
2fb3227
add test
suxiaogang223 Nov 6, 2024
b113a68
add be ut
suxiaogang223 Nov 6, 2024
1580092
pass be ut
suxiaogang223 Nov 6, 2024
146d957
fix be ut
suxiaogang223 Nov 7, 2024
ae06d56
add more be ut
suxiaogang223 Nov 7, 2024
1370bc8
check if slot is partition column
suxiaogang223 Nov 8, 2024
ec4cb76
Correct spelling of _make_orc_literal and improve pushdown logic in O…
suxiaogang223 Nov 11, 2024
de03334
fix bug
suxiaogang223 Nov 11, 2024
2d6aa72
check rest children are all literal
suxiaogang223 Nov 11, 2024
c7c9652
fix renamed orc table bug and upper name table bug
suxiaogang223 Nov 11, 2024
5bcd2c2
refact code
suxiaogang223 Nov 11, 2024
8906271
fix warning
suxiaogang223 Nov 11, 2024
dcdcbd2
refact and fix acid bug
suxiaogang223 Nov 11, 2024
785fc5d
fix be ut
suxiaogang223 Nov 12, 2024
ccfe50d
add more ut for build_search_argument failed
suxiaogang223 Nov 12, 2024
03604c9
fix
suxiaogang223 Nov 12, 2024
5bfe51e
fix be ut for gcc
suxiaogang223 Nov 13, 2024
1e9269b
try to push exprs in and
suxiaogang223 Nov 13, 2024
d9526c2
fix build
suxiaogang223 Nov 13, 2024
4a7f77b
add be ut for and logic
suxiaogang223 Nov 13, 2024
cdfd4bc
Merge branch 'master' into fix_orc_pushdown2
suxiaogang223 Nov 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions be/src/runtime/exec_env.h
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ class ExecEnv {
// Static setter for the memory-tracking flag: writes `tracking_memory` into
// `_s_tracking_memory` using release memory ordering.
// NOTE(review): readers presumably pair this with an acquire load — the read
// side is not visible in this chunk, confirm before relying on ordering.
static void set_tracking_memory(bool tracking_memory) {
_s_tracking_memory.store(tracking_memory, std::memory_order_release);
}
// Replaces the ORC memory pool pointer stored in `_orc_memory_pool` with `pool`.
// Only the raw pointer is assigned here; nothing is allocated or released.
// NOTE(review): ownership and lifetime of `pool` are not visible in this chunk —
// presumably the caller keeps it alive; confirm. This setter sits inside a
// preprocessor-conditional region (closed by the `#endif` that follows) whose
// guarding condition is outside this view.
void set_orc_memory_pool(orc::MemoryPool* pool) { _orc_memory_pool = pool; }
#endif
LoadStreamMapPool* load_stream_map_pool() { return _load_stream_map_pool.get(); }

Expand Down
478 changes: 327 additions & 151 deletions be/src/vec/exec/format/orc/vorc_reader.cpp

Large diffs are not rendered by default.

37 changes: 26 additions & 11 deletions be/src/vec/exec/format/orc/vorc_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
#pragma once

#include <cctz/time_zone.h>
suxiaogang223 marked this conversation as resolved.
Show resolved Hide resolved
#include <stddef.h>
#include <stdint.h>

#include <cstddef>
#include <cstdint>
#include <list>
#include <memory>
#include <orc/OrcFile.hh>
Expand Down Expand Up @@ -51,6 +51,8 @@
#include "vec/exec/format/format_common.h"
#include "vec/exec/format/generic_reader.h"
#include "vec/exec/format/table/transactional_hive_reader.h"
#include "vec/exprs/vliteral.h"
#include "vec/exprs/vslot_ref.h"

namespace doris {
class RuntimeState;
Expand Down Expand Up @@ -80,13 +82,6 @@ namespace doris::vectorized {

class ORCFileInputStream;

struct OrcPredicate {
std::string col_name;
orc::PredicateDataType data_type;
std::vector<orc::Literal> literals;
SQLFilterOp op;
};

struct LazyReadContext {
VExprContextSPtrs conjuncts;
bool can_lazy_read = false;
Expand Down Expand Up @@ -228,6 +223,8 @@ class OrcReader : public GenericReader {
RuntimeProfile::Counter* decode_value_time = nullptr;
RuntimeProfile::Counter* decode_null_map_time = nullptr;
RuntimeProfile::Counter* filter_block_time = nullptr;
RuntimeProfile::Counter* selected_row_group_count = nullptr;
RuntimeProfile::Counter* evaluated_row_group_count = nullptr;
};

class ORCFilterImpl : public orc::ORCFilter {
Expand Down Expand Up @@ -291,8 +288,23 @@ class OrcReader : public GenericReader {
bool* is_hive1_orc);
static bool _check_acid_schema(const orc::Type& type);
static const orc::Type& _remove_acid(const orc::Type& type);
bool _init_search_argument(
std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range);
std::tuple<bool, orc::Literal, orc::PredicateDataType> _make_orc_literal(
const VSlotRef* slot_ref, const VLiteral* literal);
bool _check_slot_can_push_down(const VExprSPtr& expr);
bool _check_rest_children_can_push_down(const VExprSPtr& expr);
bool _check_expr_can_push_down(const VExprSPtr& expr);
bool _build_less_than(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_less_than_equals(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_equals(const VExprSPtr& expr, std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_filter_in(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_is_null(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_search_argument(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _init_search_argument(const VExprContextSPtrs& conjuncts);
void _init_bloom_filter(
std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range);
void _init_system_properties();
Expand Down Expand Up @@ -578,11 +590,14 @@ class OrcReader : public GenericReader {
bool _is_hive1_orc_or_use_idx = false;

std::unordered_map<std::string, std::string> _col_name_to_file_col_name;
// TODO: check if we can remove _col_name_to_file_col_name_low_case
std::unordered_map<std::string, std::string> _col_name_to_file_col_name_low_case;
std::unordered_map<std::string, const orc::Type*> _type_map;
std::vector<const orc::Type*> _col_orc_type;
std::unique_ptr<ORCFileInputStream> _file_input_stream;
Statistics _statistics;
OrcProfile _orc_profile;
orc::ReaderMetrics _reader_metrics;

std::unique_ptr<orc::ColumnVectorBatch> _batch;
std::unique_ptr<orc::Reader> _reader;
Expand Down
Binary file added be/test/exec/test_data/orc_scanner/orders.orc
Binary file not shown.
29 changes: 11 additions & 18 deletions be/test/testutil/desc_tbl_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,9 @@

#include "testutil/desc_tbl_builder.h"
suxiaogang223 marked this conversation as resolved.
Show resolved Hide resolved

#include <glog/logging.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>

#include <vector>

#include "common/object_pool.h"
#include "common/status.h"
#include "gtest/gtest_pred_impl.h"
#include "runtime/define_primitive_type.h"
#include "runtime/descriptors.h"
#include "util/bit_util.h"

using std::vector;

namespace doris {

Expand All @@ -44,7 +33,7 @@ TupleDescBuilder& DescriptorTblBuilder::declare_tuple() {

// item_id of -1 indicates no itemTupleId
static TSlotDescriptor make_slot_descriptor(int id, int parent_id, const TypeDescriptor& type,
int slot_idx, int item_id) {
const std::string& name, int slot_idx, int item_id) {
int null_byte = slot_idx / 8;
int null_bit = slot_idx % 8;
TSlotDescriptor slot_desc;
Expand All @@ -58,6 +47,7 @@ static TSlotDescriptor make_slot_descriptor(int id, int parent_id, const TypeDes
slot_desc.__set_nullIndicatorBit(null_bit);
slot_desc.__set_slotIdx(slot_idx);
slot_desc.__set_isMaterialized(true);
slot_desc.__set_colName(name);
// if (item_id != -1) {
// slot_desc.__set_itemTupleId(item_id);
// }
Expand All @@ -78,24 +68,27 @@ DescriptorTbl* DescriptorTblBuilder::build() {
int tuple_id = 0;
int slot_id = 0;

for (int i = 0; i < _tuples_descs.size(); ++i) {
build_tuple(_tuples_descs[i]->slot_types(), &thrift_desc_tbl, &tuple_id, &slot_id);
for (auto& _tuples_desc : _tuples_descs) {
build_tuple(_tuples_desc->slot_types(), _tuples_desc->slot_names(), &thrift_desc_tbl,
&tuple_id, &slot_id);
}

Status status = DescriptorTbl::create(_obj_pool, thrift_desc_tbl, &desc_tbl);
EXPECT_TRUE(status.ok());
return desc_tbl;
}

TTupleDescriptor DescriptorTblBuilder::build_tuple(const vector<TypeDescriptor>& slot_types,
TTupleDescriptor DescriptorTblBuilder::build_tuple(const std::vector<TypeDescriptor>& slot_types,
const std::vector<std::string>& slot_names,
TDescriptorTable* thrift_desc_tbl,
int* next_tuple_id, int* slot_id) {
// We never materialize struct slots (there's no in-memory representation of structs,
// instead the materialized fields appear directly in the tuple), but array types can
// still have a struct item type. In this case, the array item tuple contains the
// "inlined" struct fields.
if (slot_types.size() == 1 && slot_types[0].type == TYPE_STRUCT) {
return build_tuple(slot_types[0].children, thrift_desc_tbl, next_tuple_id, slot_id);
return build_tuple(slot_types[0].children, slot_types[0].field_names, thrift_desc_tbl,
next_tuple_id, slot_id);
}

int tuple_id = *next_tuple_id;
Expand All @@ -111,7 +104,7 @@ TTupleDescriptor DescriptorTblBuilder::build_tuple(const vector<TypeDescriptor>&
// }

thrift_desc_tbl->slotDescriptors.push_back(
make_slot_descriptor(*slot_id, tuple_id, slot_types[i], i, item_id));
make_slot_descriptor(*slot_id, tuple_id, slot_types[i], slot_names[i], i, item_id));
thrift_desc_tbl->__isset.slotDescriptors = true;
++(*slot_id);
}
Expand Down
17 changes: 15 additions & 2 deletions be/test/testutil/desc_tbl_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,16 @@

#include <gen_cpp/Descriptors_types.h>
suxiaogang223 marked this conversation as resolved.
Show resolved Hide resolved

#include <tuple>
#include <vector>

#include "common/object_pool.h"
#include "runtime/descriptors.h"
#include "runtime/types.h"

namespace doris {

class ObjectPool;
class TupleDescBuilder;
class DescriptorTbl;

// Aids in the construction of a DescriptorTbl by declaring tuples and slots
// associated with those tuples.
Expand All @@ -40,6 +41,7 @@ class DescriptorTbl;
// DescriptorTblBuilder builder;
// builder.declare_tuple() << TYPE_TINYINT << TYPE_TIMESTAMP; // gets TupleId 0
// builder.declare_tuple() << TYPE_FLOAT; // gets TupleId 1
// builder.declare_tuple() << std::make_tuple(TYPE_INT, "col1") << std::make_tuple(TYPE_STRING, "col2"); // declares a tuple whose slots each carry a type and a column name
// DescriptorTbl desc_tbl = builder.build();
class DescriptorTblBuilder {
public:
Expand All @@ -57,20 +59,31 @@ class DescriptorTblBuilder {
std::vector<TupleDescBuilder*> _tuples_descs;

TTupleDescriptor build_tuple(const std::vector<TypeDescriptor>& slot_types,
const std::vector<std::string>& slot_names,
TDescriptorTable* thrift_desc_tbl, int* tuple_id, int* slot_id);
};

class TupleDescBuilder {
public:
using SlotType = std::tuple<TypeDescriptor, std::string>;
TupleDescBuilder& operator<<(const SlotType& slot) {
_slot_types.push_back(std::get<0>(slot));
_slot_names.push_back(std::get<1>(slot));
return *this;
}

TupleDescBuilder& operator<<(const TypeDescriptor& slot_type) {
_slot_types.push_back(slot_type);
_slot_names.emplace_back("");
return *this;
}

std::vector<TypeDescriptor> slot_types() const { return _slot_types; }
std::vector<std::string> slot_names() const { return _slot_names; }

private:
std::vector<TypeDescriptor> _slot_types;
std::vector<std::string> _slot_names;
};

} // end namespace doris
Expand Down
Loading
Loading