Skip to content

Commit

Permalink
[enhance](orc) Optimize ORC Predicate Pushdown for OR-connected Predi…
Browse files Browse the repository at this point in the history
…cate (#43255)

### What problem does this PR solve?

Problem Summary:
This issue addresses a limitation in Apache Doris where only predicates
joined by AND are pushed down to the ORC reader, leaving OR-connected
predicates unoptimized. By extending pushdown functionality to handle
these OR conditions, the aim is to better leverage ORC’s predicate
pushdown capabilities, reducing data reads and improving query
performance.
  • Loading branch information
suxiaogang223 authored and Your Name committed Nov 22, 2024
1 parent cd59bc3 commit f7e0eb1
Show file tree
Hide file tree
Showing 9 changed files with 595 additions and 182 deletions.
1 change: 1 addition & 0 deletions be/src/runtime/exec_env.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ class ExecEnv {
static void set_tracking_memory(bool tracking_memory) {
_s_tracking_memory.store(tracking_memory, std::memory_order_release);
}
void set_orc_memory_pool(orc::MemoryPool* pool) { _orc_memory_pool = pool; }
#endif
LoadStreamMapPool* load_stream_map_pool() { return _load_stream_map_pool.get(); }

Expand Down
478 changes: 327 additions & 151 deletions be/src/vec/exec/format/orc/vorc_reader.cpp

Large diffs are not rendered by default.

37 changes: 26 additions & 11 deletions be/src/vec/exec/format/orc/vorc_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
#pragma once

#include <cctz/time_zone.h>
#include <stddef.h>
#include <stdint.h>

#include <cstddef>
#include <cstdint>
#include <list>
#include <memory>
#include <orc/OrcFile.hh>
Expand Down Expand Up @@ -51,6 +51,8 @@
#include "vec/exec/format/format_common.h"
#include "vec/exec/format/generic_reader.h"
#include "vec/exec/format/table/transactional_hive_reader.h"
#include "vec/exprs/vliteral.h"
#include "vec/exprs/vslot_ref.h"

namespace doris {
class RuntimeState;
Expand Down Expand Up @@ -80,13 +82,6 @@ namespace doris::vectorized {

class ORCFileInputStream;

struct OrcPredicate {
std::string col_name;
orc::PredicateDataType data_type;
std::vector<orc::Literal> literals;
SQLFilterOp op;
};

struct LazyReadContext {
VExprContextSPtrs conjuncts;
bool can_lazy_read = false;
Expand Down Expand Up @@ -228,6 +223,8 @@ class OrcReader : public GenericReader {
RuntimeProfile::Counter* decode_value_time = nullptr;
RuntimeProfile::Counter* decode_null_map_time = nullptr;
RuntimeProfile::Counter* filter_block_time = nullptr;
RuntimeProfile::Counter* selected_row_group_count = nullptr;
RuntimeProfile::Counter* evaluated_row_group_count = nullptr;
};

class ORCFilterImpl : public orc::ORCFilter {
Expand Down Expand Up @@ -291,8 +288,23 @@ class OrcReader : public GenericReader {
bool* is_hive1_orc);
static bool _check_acid_schema(const orc::Type& type);
static const orc::Type& _remove_acid(const orc::Type& type);
bool _init_search_argument(
std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range);
std::tuple<bool, orc::Literal, orc::PredicateDataType> _make_orc_literal(
const VSlotRef* slot_ref, const VLiteral* literal);
bool _check_slot_can_push_down(const VExprSPtr& expr);
bool _check_rest_children_can_push_down(const VExprSPtr& expr);
bool _check_expr_can_push_down(const VExprSPtr& expr);
bool _build_less_than(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_less_than_equals(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_equals(const VExprSPtr& expr, std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_filter_in(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_is_null(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_search_argument(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _init_search_argument(const VExprContextSPtrs& conjuncts);
void _init_bloom_filter(
std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range);
void _init_system_properties();
Expand Down Expand Up @@ -578,11 +590,14 @@ class OrcReader : public GenericReader {
bool _is_hive1_orc_or_use_idx = false;

std::unordered_map<std::string, std::string> _col_name_to_file_col_name;
// TODO: check if we can remove _col_name_to_file_col_name_low_case
std::unordered_map<std::string, std::string> _col_name_to_file_col_name_low_case;
std::unordered_map<std::string, const orc::Type*> _type_map;
std::vector<const orc::Type*> _col_orc_type;
std::unique_ptr<ORCFileInputStream> _file_input_stream;
Statistics _statistics;
OrcProfile _orc_profile;
orc::ReaderMetrics _reader_metrics;

std::unique_ptr<orc::ColumnVectorBatch> _batch;
std::unique_ptr<orc::Reader> _reader;
Expand Down
Binary file added be/test/exec/test_data/orc_scanner/orders.orc
Binary file not shown.
29 changes: 11 additions & 18 deletions be/test/testutil/desc_tbl_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,9 @@

#include "testutil/desc_tbl_builder.h"

#include <glog/logging.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>

#include <vector>

#include "common/object_pool.h"
#include "common/status.h"
#include "gtest/gtest_pred_impl.h"
#include "runtime/define_primitive_type.h"
#include "runtime/descriptors.h"
#include "util/bit_util.h"

using std::vector;

namespace doris {

Expand All @@ -44,7 +33,7 @@ TupleDescBuilder& DescriptorTblBuilder::declare_tuple() {

// item_id of -1 indicates no itemTupleId
static TSlotDescriptor make_slot_descriptor(int id, int parent_id, const TypeDescriptor& type,
int slot_idx, int item_id) {
const std::string& name, int slot_idx, int item_id) {
int null_byte = slot_idx / 8;
int null_bit = slot_idx % 8;
TSlotDescriptor slot_desc;
Expand All @@ -58,6 +47,7 @@ static TSlotDescriptor make_slot_descriptor(int id, int parent_id, const TypeDes
slot_desc.__set_nullIndicatorBit(null_bit);
slot_desc.__set_slotIdx(slot_idx);
slot_desc.__set_isMaterialized(true);
slot_desc.__set_colName(name);
// if (item_id != -1) {
// slot_desc.__set_itemTupleId(item_id);
// }
Expand All @@ -78,24 +68,27 @@ DescriptorTbl* DescriptorTblBuilder::build() {
int tuple_id = 0;
int slot_id = 0;

for (int i = 0; i < _tuples_descs.size(); ++i) {
build_tuple(_tuples_descs[i]->slot_types(), &thrift_desc_tbl, &tuple_id, &slot_id);
for (auto& _tuples_desc : _tuples_descs) {
build_tuple(_tuples_desc->slot_types(), _tuples_desc->slot_names(), &thrift_desc_tbl,
&tuple_id, &slot_id);
}

Status status = DescriptorTbl::create(_obj_pool, thrift_desc_tbl, &desc_tbl);
EXPECT_TRUE(status.ok());
return desc_tbl;
}

TTupleDescriptor DescriptorTblBuilder::build_tuple(const vector<TypeDescriptor>& slot_types,
TTupleDescriptor DescriptorTblBuilder::build_tuple(const std::vector<TypeDescriptor>& slot_types,
const std::vector<std::string>& slot_names,
TDescriptorTable* thrift_desc_tbl,
int* next_tuple_id, int* slot_id) {
// We never materialize struct slots (there's no in-memory representation of structs,
// instead the materialized fields appear directly in the tuple), but array types can
// still have a struct item type. In this case, the array item tuple contains the
// "inlined" struct fields.
if (slot_types.size() == 1 && slot_types[0].type == TYPE_STRUCT) {
return build_tuple(slot_types[0].children, thrift_desc_tbl, next_tuple_id, slot_id);
return build_tuple(slot_types[0].children, slot_types[0].field_names, thrift_desc_tbl,
next_tuple_id, slot_id);
}

int tuple_id = *next_tuple_id;
Expand All @@ -111,7 +104,7 @@ TTupleDescriptor DescriptorTblBuilder::build_tuple(const vector<TypeDescriptor>&
// }

thrift_desc_tbl->slotDescriptors.push_back(
make_slot_descriptor(*slot_id, tuple_id, slot_types[i], i, item_id));
make_slot_descriptor(*slot_id, tuple_id, slot_types[i], slot_names[i], i, item_id));
thrift_desc_tbl->__isset.slotDescriptors = true;
++(*slot_id);
}
Expand Down
17 changes: 15 additions & 2 deletions be/test/testutil/desc_tbl_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,16 @@

#include <gen_cpp/Descriptors_types.h>

#include <tuple>
#include <vector>

#include "common/object_pool.h"
#include "runtime/descriptors.h"
#include "runtime/types.h"

namespace doris {

class ObjectPool;
class TupleDescBuilder;
class DescriptorTbl;

// Aids in the construction of a DescriptorTbl by declaring tuples and slots
// associated with those tuples.
Expand All @@ -40,6 +41,7 @@ class DescriptorTbl;
// DescriptorTblBuilder builder;
// builder.declare_tuple() << TYPE_TINYINT << TYPE_TIMESTAMP; // gets TupleId 0
// builder.declare_tuple() << TYPE_FLOAT; // gets TupleId 1
// builder.declare_tuple() << std::make_tuple(TYPE_INT, "col1") << std::make_tuple(TYPE_STRING, "col2"); // gets Tuple with type and name
// DescriptorTbl desc_tbl = builder.build();
class DescriptorTblBuilder {
public:
Expand All @@ -57,20 +59,31 @@ class DescriptorTblBuilder {
std::vector<TupleDescBuilder*> _tuples_descs;

TTupleDescriptor build_tuple(const std::vector<TypeDescriptor>& slot_types,
const std::vector<std::string>& slot_names,
TDescriptorTable* thrift_desc_tbl, int* tuple_id, int* slot_id);
};

class TupleDescBuilder {
public:
using SlotType = std::tuple<TypeDescriptor, std::string>;
TupleDescBuilder& operator<<(const SlotType& slot) {
_slot_types.push_back(std::get<0>(slot));
_slot_names.push_back(std::get<1>(slot));
return *this;
}

TupleDescBuilder& operator<<(const TypeDescriptor& slot_type) {
_slot_types.push_back(slot_type);
_slot_names.emplace_back("");
return *this;
}

std::vector<TypeDescriptor> slot_types() const { return _slot_types; }
std::vector<std::string> slot_names() const { return _slot_names; }

private:
std::vector<TypeDescriptor> _slot_types;
std::vector<std::string> _slot_names;
};

} // end namespace doris
Expand Down
Loading

0 comments on commit f7e0eb1

Please sign in to comment.