diff --git a/.github/workflows/auto-cherry-pick.yml b/.github/workflows/auto-cherry-pick.yml index 2581de3f31cc4e..55a426f3d3282b 100644 --- a/.github/workflows/auto-cherry-pick.yml +++ b/.github/workflows/auto-cherry-pick.yml @@ -21,6 +21,7 @@ on: pull_request_target: types: - closed + - labeled branches: - master permissions: @@ -30,7 +31,7 @@ permissions: jobs: auto_cherry_pick: runs-on: ubuntu-latest - if: ${{ (contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') || contains(github.event.pull_request.labels.*.name, 'dev/2.1.x')) && github.event.pull_request.merged == true }} + if: ${{(contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') || contains(github.event.pull_request.labels.*.name, 'dev/2.1.x') ||github.event.label.name == 'dev/3.0.x' || github.event.label.name == 'dev/2.1.x') && github.event.pull_request.merged == true }} steps: - name: Checkout repository uses: actions/checkout@v3 @@ -54,7 +55,7 @@ jobs: echo "SHA matches: $calculated_sha" fi - name: Auto cherry-pick to branch-3.0 - if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') }} + if: ${{ ((github.event.action == 'labeled' && github.event.label.name == 'dev/3.0.x'))|| ((github.event_name == 'pull_request_target' && github.event.action == 'closed') && contains(github.event.pull_request.labels.*.name, 'dev/3.0.x')) }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_NAME: ${{ github.repository }} @@ -62,7 +63,7 @@ jobs: run: | python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-3.0 - name: Auto cherry-pick to branch-2.1 - if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/2.1.x') }} + if: ${{ ((github.event.action == 'labeled' && github.event.label.name == 'dev/2.1.x'))|| ((github.event_name == 'pull_request_target' && github.event.action == 'closed') && contains(github.event.pull_request.labels.*.name, 'dev/2.1.x')) }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_NAME: ${{ github.repository }} diff --git 
a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index ebd1fea3dd9fac..c88b073e96494a 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -449,6 +449,12 @@ void CloudTablet::recycle_cached_data(const std::vector& rowset if (config::enable_file_cache) { for (const auto& rs : rowsets) { + if (rs.use_count() >= 1) { + LOG(WARNING) << "Rowset " << rs->rowset_id().to_string() << " has " + << rs.use_count() + << " references. File Cache won't be recycled when query is using it."; + continue; + } for (int seg_id = 0; seg_id < rs->num_segments(); ++seg_id) { // TODO: Segment::file_cache_key auto file_key = Segment::file_cache_key(rs->rowset_id().to_string(), seg_id); diff --git a/be/src/clucene b/be/src/clucene index 7cf6cf410d41d9..48fa9cc4ec32b4 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 7cf6cf410d41d95456edba263cc55b7b6f5ab027 +Subproject commit 48fa9cc4ec32b40bf3b02338d0a1b2cdbc6408cf diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index a37a006acf0b6f..796e9af62ce480 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1004,7 +1004,7 @@ DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false"); DEFINE_Bool(enable_file_cache, "false"); // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240}] // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240},{"path":"/path/to/file_cache2","total_size":21474836480,"query_limit":10737418240}] -// format: {"path": "/path/to/file_cache", "total_size":53687091200, "normal_percent":85, "disposable_percent":10, "index_percent":5} +// format: {"path": "/path/to/file_cache", "total_size":53687091200, "ttl_percent":50, "normal_percent":40, "disposable_percent":5, "index_percent":5} // format: [{"path": "xxx", "total_size":53687091200, "storage": "memory"}] // Note1: storage is "disk" by default // Note2: when the storage is "memory", the path 
is ignored. So you can set xxx to anything you like @@ -1020,7 +1020,7 @@ DEFINE_Int64(file_cache_each_block_size, "1048576"); // 1MB DEFINE_Bool(clear_file_cache, "false"); DEFINE_Bool(enable_file_cache_query_limit, "false"); -DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "90"); +DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "88"); DEFINE_mInt32(file_cache_exit_disk_resource_limit_mode_percent, "80"); DEFINE_mBool(enable_read_cache_file_directly, "false"); DEFINE_mBool(file_cache_enable_evict_from_other_queue_by_size, "true"); @@ -1301,8 +1301,6 @@ DEFINE_Int64(num_buffered_reader_prefetch_thread_pool_max_thread, "64"); DEFINE_Int64(num_s3_file_upload_thread_pool_min_thread, "16"); // The max thread num for S3FileUploadThreadPool DEFINE_Int64(num_s3_file_upload_thread_pool_max_thread, "64"); -// The max ratio for ttl cache's size -DEFINE_mInt64(max_ttl_cache_ratio, "50"); // The maximum jvm heap usage ratio for hdfs write workload DEFINE_mDouble(max_hdfs_wirter_jni_heap_usage_ratio, "0.5"); // The sleep milliseconds duration when hdfs write exceeds the maximum usage @@ -1370,6 +1368,7 @@ DEFINE_Int32(query_cache_size, "512"); DEFINE_mBool(enable_delete_bitmap_merge_on_compaction, "false"); // Enable validation to check the correctness of table size. 
DEFINE_Bool(enable_table_size_correctness_check, "false"); +DEFINE_Bool(force_regenerate_rowsetid_on_start_error, "false"); // clang-format off #ifdef BE_TEST diff --git a/be/src/common/config.h b/be/src/common/config.h index 63d62b219c12f8..c40875728a3623 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1050,7 +1050,7 @@ DECLARE_Int32(pipeline_executor_size); DECLARE_Bool(enable_file_cache); // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240}] // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240},{"path":"/path/to/file_cache2","total_size":21474836480,"query_limit":10737418240}] -// format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240,"normal_percent":85, "disposable_percent":10, "index_percent":5}] +// format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240, "ttl_percent":50, "normal_percent":40, "disposable_percent":5, "index_percent":5}] // format: [{"path": "xxx", "total_size":53687091200, "storage": "memory"}] // Note1: storage is "disk" by default // Note2: when the storage is "memory", the path is ignored. 
So you can set xxx to anything you like @@ -1382,8 +1382,6 @@ DECLARE_Int64(num_buffered_reader_prefetch_thread_pool_max_thread); DECLARE_Int64(num_s3_file_upload_thread_pool_min_thread); // The max thread num for S3FileUploadThreadPool DECLARE_Int64(num_s3_file_upload_thread_pool_max_thread); -// The max ratio for ttl cache's size -DECLARE_mInt64(max_ttl_cache_ratio); // The maximum jvm heap usage ratio for hdfs write workload DECLARE_mDouble(max_hdfs_wirter_jni_heap_usage_ratio); // The sleep milliseconds duration when hdfs write exceeds the maximum usage @@ -1450,6 +1448,7 @@ DECLARE_mInt32(check_score_rounds_num); // MB DECLARE_Int32(query_cache_size); +DECLARE_Bool(force_regenerate_rowsetid_on_start_error); DECLARE_mBool(enable_delete_bitmap_merge_on_compaction); // Enable validation to check the correctness of table size. diff --git a/be/src/common/status.h b/be/src/common/status.h index de029d87ec94db..344f82a81b8e25 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -489,6 +489,7 @@ class [[nodiscard]] Status { ERROR_CTOR_NOSTACK(NeedSendAgain, NEED_SEND_AGAIN) ERROR_CTOR_NOSTACK(CgroupError, CGROUP_ERROR) ERROR_CTOR_NOSTACK(ObtainLockFailed, OBTAIN_LOCK_FAILED) + ERROR_CTOR_NOSTACK(NetworkError, NETWORK_ERROR) #undef ERROR_CTOR template diff --git a/be/src/exec/lzo_decompressor.cpp b/be/src/exec/lzo_decompressor.cpp index b075509202b70f..b240e2995a0414 100644 --- a/be/src/exec/lzo_decompressor.cpp +++ b/be/src/exec/lzo_decompressor.cpp @@ -103,6 +103,7 @@ Status LzopDecompressor::decompress(uint8_t* input, size_t input_len, size_t* in ptr = get_uint32(ptr, &uncompressed_size); left_input_len -= sizeof(uint32_t); if (uncompressed_size == 0) { + *input_bytes_read += sizeof(uint32_t); *stream_end = true; return Status::OK(); } diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index f1c0ad60e06455..acd923741eb73d 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -17,6 +17,7 @@ #include 
"exec/tablet_info.h" +#include #include #include #include @@ -180,6 +181,17 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { auto it = slots_map.find(to_lower(pcolumn_desc.name()) + "+" + data_type_str + is_null_str); if (it == std::end(slots_map)) { + std::string keys {}; + for (const auto& [key, _] : slots_map) { + keys += fmt::format("{},", key); + } + LOG_EVERY_SECOND(WARNING) << fmt::format( + "[OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema)]: " + "unknown index column, column={}, type={}, data_type_str={}, " + "is_null_str={}, slots_map.keys()=[{}], {}\npschema={}", + pcolumn_desc.name(), pcolumn_desc.type(), data_type_str, is_null_str, + keys, debug_string(), pschema.ShortDebugString()); + return Status::InternalError("unknown index column, column={}, type={}", pcolumn_desc.name(), pcolumn_desc.type()); } @@ -286,6 +298,18 @@ Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { auto it = slots_map.find(to_lower(tcolumn_desc.column_name) + "+" + data_type_str + is_null_str); if (it == slots_map.end()) { + std::stringstream ss; + ss << tschema; + std::string keys {}; + for (const auto& [key, _] : slots_map) { + keys += fmt::format("{},", key); + } + LOG_EVERY_SECOND(WARNING) << fmt::format( + "[OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema)]: " + "unknown index column, column={}, type={}, data_type_str={}, " + "is_null_str={}, slots_map.keys()=[{}], {}\ntschema={}", + tcolumn_desc.column_name, tcolumn_desc.column_type.type, data_type_str, + is_null_str, keys, debug_string(), ss.str()); return Status::InternalError("unknown index column, column={}, type={}", tcolumn_desc.column_name, tcolumn_desc.column_type.type); diff --git a/be/src/exprs/bitmapfilter_predicate.h b/be/src/exprs/bitmapfilter_predicate.h index 5cb2b812220b10..8b161bf6213f40 100644 --- a/be/src/exprs/bitmapfilter_predicate.h +++ b/be/src/exprs/bitmapfilter_predicate.h @@ -30,11 +30,7 @@ namespace doris { // only 
used in Runtime Filter class BitmapFilterFuncBase : public RuntimeFilterFuncBase { public: - virtual void insert(const void* data) = 0; virtual void insert_many(const std::vector& bitmaps) = 0; - virtual bool empty() = 0; - virtual Status assign(BitmapValue* bitmap_value) = 0; - virtual void light_copy(BitmapFilterFuncBase* other) { _not_in = other->_not_in; } virtual uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number) = 0; virtual void find_batch(const char* data, const uint8* nullmap, size_t number, @@ -58,8 +54,6 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { ~BitmapFilterFunc() override = default; - void insert(const void* data) override; - void insert_many(const std::vector& bitmaps) override; uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, @@ -68,21 +62,8 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { void find_batch(const char* data, const uint8* nullmap, size_t number, uint8* results) const override; - bool empty() override { return _bitmap_value->empty(); } - - Status assign(BitmapValue* bitmap_value) override { - *_bitmap_value = *bitmap_value; - return Status::OK(); - } - - void light_copy(BitmapFilterFuncBase* bitmapfilter_func) override; - size_t size() const override { return _bitmap_value->cardinality(); } - uint64_t max() { return _bitmap_value->max(nullptr); } - - uint64_t min() { return _bitmap_value->min(nullptr); } - bool contains_any(CppType left, CppType right) { if (right < 0) { return false; @@ -90,23 +71,12 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { return _bitmap_value->contains_any(std::max(left, (CppType)0), right); } - std::shared_ptr get_inner_bitmap() { return _bitmap_value; } - private: std::shared_ptr _bitmap_value; bool find(CppType data) const { return _not_in ^ (data >= 0 && _bitmap_value->contains(data)); } }; -template -void BitmapFilterFunc::insert(const void* data) { - if (data == nullptr) 
{ - return; - } - - *_bitmap_value |= *reinterpret_cast(data); -} - template void BitmapFilterFunc::insert_many(const std::vector& bitmaps) { if (bitmaps.empty()) { @@ -147,12 +117,4 @@ void BitmapFilterFunc::find_batch(const char* data, const uint8* nullmap, } } -template -void BitmapFilterFunc::light_copy(BitmapFilterFuncBase* bitmapfilter_func) { - BitmapFilterFuncBase::light_copy(bitmapfilter_func); - auto other_func = reinterpret_cast(bitmapfilter_func); - _bitmap_value = other_func->_bitmap_value; - set_filter_id(bitmapfilter_func->get_filter_id()); -} - } // namespace doris diff --git a/be/src/exprs/create_predicate_function.h b/be/src/exprs/create_predicate_function.h index 387be1f9f0b11c..44f39fb77f6d85 100644 --- a/be/src/exprs/create_predicate_function.h +++ b/be/src/exprs/create_predicate_function.h @@ -17,6 +17,8 @@ #pragma once +#include "common/exception.h" +#include "common/status.h" #include "exprs/hybrid_set.h" #include "exprs/minmax_predicate.h" #include "function_filter.h" @@ -244,12 +246,9 @@ ColumnPredicate* create_olap_column_predicate(uint32_t column_id, int be_exec_version, const TabletColumn*) { if constexpr (PT == TYPE_TINYINT || PT == TYPE_SMALLINT || PT == TYPE_INT || PT == TYPE_BIGINT) { - std::shared_ptr filter_olap; - filter_olap.reset(create_bitmap_filter(PT)); - filter_olap->light_copy(filter.get()); return new BitmapFilterColumnPredicate(column_id, filter, be_exec_version); } else { - return nullptr; + throw Exception(ErrorCode::INTERNAL_ERROR, "bitmap filter do not support type {}", PT); } } @@ -266,17 +265,14 @@ ColumnPredicate* create_olap_column_predicate(uint32_t column_id, const std::shared_ptr& filter, int, const TabletColumn* column = nullptr) { // currently only support like predicate - if constexpr (PT == TYPE_CHAR || PT == TYPE_VARCHAR || PT == TYPE_STRING) { - if constexpr (PT == TYPE_CHAR) { - return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, - filter->_string_param); - } else { - return 
new LikeColumnPredicate(filter->_opposite, column_id, - filter->_fn_ctx, filter->_string_param); - } - } else { - return nullptr; + if constexpr (PT == TYPE_CHAR) { + return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, + filter->_string_param); + } else if constexpr (PT == TYPE_VARCHAR || PT == TYPE_STRING) { + return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, + filter->_string_param); } + throw Exception(ErrorCode::INTERNAL_ERROR, "function filter do not support type {}", PT); } template diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 85f1c535c7038b..d05bb6fa3cfc44 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -990,14 +990,14 @@ void IRuntimeFilter::insert_batch(const vectorized::ColumnPtr column, size_t sta _wrapper->insert_batch(column, start); } -Status IRuntimeFilter::publish(bool publish_local) { +Status IRuntimeFilter::publish(RuntimeState* state, bool publish_local) { DCHECK(is_producer()); auto send_to_remote = [&](IRuntimeFilter* filter) { TNetworkAddress addr; DCHECK(_state != nullptr); RETURN_IF_ERROR(_state->runtime_filter_mgr->get_merge_addr(&addr)); - return filter->push_to_remote(&addr); + return filter->push_to_remote(state, &addr); }; auto send_to_local = [&](std::shared_ptr wrapper) { std::vector> filters; @@ -1088,8 +1088,10 @@ class SyncSizeClosure : public AutoReleaseClosure req, std::shared_ptr> callback, std::shared_ptr dependency, - RuntimeFilterContextSPtr rf_context) - : Base(req, callback), _dependency(std::move(dependency)), _rf_context(rf_context) {} + RuntimeFilterContextSPtr rf_context, std::weak_ptr context) + : Base(req, callback, context), + _dependency(std::move(dependency)), + _rf_context(rf_context) {} }; Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filter_size) { @@ -1133,8 +1135,10 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt 
auto callback = DummyBrpcCallback::create_shared(); // IRuntimeFilter maybe deconstructed before the rpc finished, so that could not use // a raw pointer in closure. Has to use the context's shared ptr. - auto closure = - SyncSizeClosure::create_unique(request, callback, _dependency, _wrapper->_context); + auto closure = SyncSizeClosure::create_unique( + request, callback, _dependency, _wrapper->_context, + state->query_options().ignore_runtime_filter_error ? std::weak_ptr {} + : state->get_query_ctx_weak()); auto* pquery_id = request->mutable_query_id(); pquery_id->set_hi(_state->query_id.hi()); pquery_id->set_lo(_state->query_id.lo()); @@ -1157,7 +1161,7 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt return Status::OK(); } -Status IRuntimeFilter::push_to_remote(const TNetworkAddress* addr) { +Status IRuntimeFilter::push_to_remote(RuntimeState* state, const TNetworkAddress* addr) { DCHECK(is_producer()); std::shared_ptr stub( _state->exec_env->brpc_internal_client_cache()->get_client(*addr)); @@ -1170,7 +1174,10 @@ Status IRuntimeFilter::push_to_remote(const TNetworkAddress* addr) { auto merge_filter_callback = DummyBrpcCallback::create_shared(); auto merge_filter_closure = AutoReleaseClosure>:: - create_unique(merge_filter_request, merge_filter_callback); + create_unique(merge_filter_request, merge_filter_callback, + state->query_options().ignore_runtime_filter_error + ? 
std::weak_ptr {} + : state->get_query_ctx_weak()); void* data = nullptr; int len = 0; diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index f5a069d9e55f85..84a7f36c8a808c 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -225,7 +225,7 @@ class IRuntimeFilter { // publish filter // push filter to remote node or push down it to scan_node - Status publish(bool publish_local = false); + Status publish(RuntimeState* state, bool publish_local = false); Status send_filter_size(RuntimeState* state, uint64_t local_filter_size); @@ -293,7 +293,7 @@ class IRuntimeFilter { bool need_sync_filter_size(); // async push runtimefilter to remote node - Status push_to_remote(const TNetworkAddress* addr); + Status push_to_remote(RuntimeState* state, const TNetworkAddress* addr); void init_profile(RuntimeProfile* parent_profile); diff --git a/be/src/exprs/runtime_filter_slots.h b/be/src/exprs/runtime_filter_slots.h index 42c5f598633ad9..3c18735e4e82ce 100644 --- a/be/src/exprs/runtime_filter_slots.h +++ b/be/src/exprs/runtime_filter_slots.h @@ -149,10 +149,10 @@ class VRuntimeFilterSlots { } // publish runtime filter - Status publish(bool publish_local) { + Status publish(RuntimeState* state, bool publish_local) { for (auto& pair : _runtime_filters_map) { for (auto& filter : pair.second) { - RETURN_IF_ERROR(filter->publish(publish_local)); + RETURN_IF_ERROR(filter->publish(state, publish_local)); } } return Status::OK(); diff --git a/be/src/exprs/runtime_filter_slots_cross.h b/be/src/exprs/runtime_filter_slots_cross.h index 01ae21a75992de..a49f2928f842a9 100644 --- a/be/src/exprs/runtime_filter_slots_cross.h +++ b/be/src/exprs/runtime_filter_slots_cross.h @@ -72,9 +72,9 @@ class VRuntimeFilterSlotsCross { return Status::OK(); } - Status publish() { + Status publish(RuntimeState* state) { for (auto filter : _runtime_filters) { - RETURN_IF_ERROR(filter->publish()); + RETURN_IF_ERROR(filter->publish(state)); } return Status::OK(); 
} diff --git a/be/src/http/http_client.cpp b/be/src/http/http_client.cpp index c842a4fe2dd4ce..fc4c997fce8397 100644 --- a/be/src/http/http_client.cpp +++ b/be/src/http/http_client.cpp @@ -27,6 +27,7 @@ #include "http/http_headers.h" #include "http/http_status.h" #include "runtime/exec_env.h" +#include "util/security.h" #include "util/stack_util.h" namespace doris { @@ -205,9 +206,11 @@ Status HttpClient::execute(const std::function& callback) { Status status; @@ -293,7 +305,9 @@ Status HttpClient::execute_with_retry(int retry_times, int sleep_time, if (http_status == 200) { return status; } else { - auto error_msg = fmt::format("http status code is not 200, code={}", http_status); + std::string url = mask_token(client._get_url()); + auto error_msg = fmt::format("http status code is not 200, code={}, url={}", + http_status, url); LOG(WARNING) << error_msg; return Status::HttpError(error_msg); } diff --git a/be/src/http/http_client.h b/be/src/http/http_client.h index fb692c50268484..c0c8863a9b06d4 100644 --- a/be/src/http/http_client.h +++ b/be/src/http/http_client.h @@ -164,7 +164,8 @@ class HttpClient { Status _escape_url(const std::string& url, std::string* escaped_url); private: - const char* _to_errmsg(CURLcode code); + const char* _to_errmsg(CURLcode code) const; + const char* _get_url() const; private: CURL* _curl = nullptr; diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp index adea2cd84c95f6..ca0575dc545459 100644 --- a/be/src/index-tools/index_tool.cpp +++ b/be/src/index-tools/index_tool.cpp @@ -170,7 +170,7 @@ void search(lucene::store::Directory* dir, std::string& field, std::string& toke std::vector terms = split(token, '|'); doris::TQueryOptions queryOptions; - ConjunctionQuery conjunct_query(s, queryOptions); + ConjunctionQuery conjunct_query(s, queryOptions, nullptr); conjunct_query.add(field_ws, terms); conjunct_query.search(result); diff --git a/be/src/io/cache/block_file_cache.cpp 
b/be/src/io/cache/block_file_cache.cpp index 4fb3f3e02cb58c..596afb64232b5b 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -86,42 +86,42 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _total_evict_size_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_total_evict_size"); - _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = + _evict_by_time_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_disposable_to_normal"); - _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = + "file_cache_evict_by_time_disposable_to_normal"); + _evict_by_time_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_disposable_to_index"); - _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = + "file_cache_evict_by_time_disposable_to_index"); + _evict_by_time_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_disposable_to_ttl"); - _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = + "file_cache_evict_by_time_disposable_to_ttl"); + _evict_by_time_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_normal_to_disposable"); - _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = + "file_cache_evict_by_time_normal_to_disposable"); + _evict_by_time_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_normal_to_index"); - _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = + "file_cache_evict_by_time_normal_to_index"); + 
_evict_by_time_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_normal_to_ttl"); - _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = + "file_cache_evict_by_time_normal_to_ttl"); + _evict_by_time_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_index_to_disposable"); - _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = + "file_cache_evict_by_time_index_to_disposable"); + _evict_by_time_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_index_to_normal"); - _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = + "file_cache_evict_by_time_index_to_normal"); + _evict_by_time_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_index_to_ttl"); - _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = + "file_cache_evict_by_time_index_to_ttl"); + _evict_by_time_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_ttl_to_disposable"); - _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = + "file_cache_evict_by_time_ttl_to_disposable"); + _evict_by_time_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_ttl_to_normal"); - _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = + "file_cache_evict_by_time_ttl_to_normal"); + _evict_by_time_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_ttl_to_index"); + "file_cache_evict_by_time_ttl_to_index"); 
_evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), @@ -197,8 +197,8 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, "file_cache_hit_ratio_5m", 0.0); _hit_ratio_1h = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_1h", 0.0); - _disk_limit_mode_metrics = - std::make_shared>(_cache_base_path.c_str(), "disk_limit_mode", 0); + _disk_limit_mode_metrics = std::make_shared>( + _cache_base_path.c_str(), "file_cache_disk_limit_mode", 0); _disposable_queue = LRUQueue(cache_settings.disposable_queue_size, cache_settings.disposable_queue_elements, 60 * 60); @@ -970,67 +970,6 @@ void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t } } -bool BlockFileCache::try_reserve_for_ttl_without_lru(size_t size, - std::lock_guard& cache_lock) { - size_t removed_size = 0; - size_t cur_cache_size = _cur_cache_size; - auto limit = config::max_ttl_cache_ratio * _capacity; - - TEST_INJECTION_POINT_CALLBACK("BlockFileCache::change_limit1", &limit); - - if ((_cur_ttl_size + size) * 100 > limit) { - return false; - } - - size_t normal_queue_size = _normal_queue.get_capacity(cache_lock); - size_t disposable_queue_size = _disposable_queue.get_capacity(cache_lock); - size_t index_queue_size = _index_queue.get_capacity(cache_lock); - if (is_overflow(removed_size, size, cur_cache_size) && normal_queue_size == 0 && - disposable_queue_size == 0 && index_queue_size == 0) { - return false; - } - std::vector to_evict; - auto collect_eliminate_fragments = [&](LRUQueue& queue) { - size_t cur_removed_size = 0; - find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - cur_removed_size); - }; - if (disposable_queue_size != 0) { - collect_eliminate_fragments(get_queue(FileCacheType::DISPOSABLE)); - } - if (normal_queue_size != 0) { - collect_eliminate_fragments(get_queue(FileCacheType::NORMAL)); - } - if (index_queue_size != 0) { - 
collect_eliminate_fragments(get_queue(FileCacheType::INDEX)); - } - remove_file_blocks(to_evict, cache_lock); - if (is_overflow(removed_size, size, cur_cache_size)) { - return false; - } - return true; -} - -bool BlockFileCache::try_reserve_for_ttl(size_t size, std::lock_guard& cache_lock) { - if (try_reserve_for_ttl_without_lru(size, cache_lock)) { - return true; - } else if (config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(FileCacheType::TTL); - size_t removed_size = 0; - size_t cur_cache_size = _cur_cache_size; - - std::vector to_evict; - size_t cur_removed_size = 0; - find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - cur_removed_size); - remove_file_blocks_and_clean_time_maps(to_evict, cache_lock); - - return !is_overflow(removed_size, size, cur_cache_size); - } else { - return false; - } -} - // 1. if async load file cache not finish // a. evict from lru queue // 2. if ttl cache @@ -1283,7 +1222,7 @@ void BlockFileCache::reset_range(const UInt128Wrapper& hash, size_t offset, size _cur_cache_size += new_size; } -bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( +bool BlockFileCache::try_reserve_from_other_queue_by_time_interval( FileCacheType cur_type, std::vector other_cache_types, size_t size, int64_t cur_time, std::lock_guard& cache_lock) { size_t removed_size = 0; @@ -1316,7 +1255,7 @@ bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( remove_size_per_type += cell_size; } } - *(_evict_by_heat_metrics_matrix[cache_type][cur_type]) << remove_size_per_type; + *(_evict_by_time_metrics_matrix[cache_type][cur_type]) << remove_size_per_type; } remove_file_blocks(to_evict, cache_lock); @@ -1365,7 +1304,7 @@ bool BlockFileCache::try_reserve_from_other_queue(FileCacheType cur_cache_type, std::lock_guard& cache_lock) { // currently, TTL cache is not considered as a candidate auto other_cache_types = get_other_cache_type_without_ttl(cur_cache_type); - bool reserve_success = 
try_reserve_from_other_queue_by_hot_interval( + bool reserve_success = try_reserve_from_other_queue_by_time_interval( cur_cache_type, other_cache_types, size, cur_time, cache_lock); if (reserve_success || !config::file_cache_enable_evict_from_other_queue_by_size) { return reserve_success; @@ -1777,50 +1716,56 @@ void BlockFileCache::run_background_operation() { break; } } + // report + { + SCOPED_CACHE_LOCK(_mutex); + _cur_cache_size_metrics->set_value(_cur_cache_size); + _cur_ttl_cache_size_metrics->set_value(_cur_cache_size - + _index_queue.get_capacity(cache_lock) - + _normal_queue.get_capacity(cache_lock) - + _disposable_queue.get_capacity(cache_lock)); + _cur_ttl_cache_lru_queue_cache_size_metrics->set_value( + _ttl_queue.get_capacity(cache_lock)); + _cur_ttl_cache_lru_queue_element_count_metrics->set_value( + _ttl_queue.get_elements_num(cache_lock)); + _cur_normal_queue_cache_size_metrics->set_value(_normal_queue.get_capacity(cache_lock)); + _cur_normal_queue_element_count_metrics->set_value( + _normal_queue.get_elements_num(cache_lock)); + _cur_index_queue_cache_size_metrics->set_value(_index_queue.get_capacity(cache_lock)); + _cur_index_queue_element_count_metrics->set_value( + _index_queue.get_elements_num(cache_lock)); + _cur_disposable_queue_cache_size_metrics->set_value( + _disposable_queue.get_capacity(cache_lock)); + _cur_disposable_queue_element_count_metrics->set_value( + _disposable_queue.get_elements_num(cache_lock)); + + if (_num_read_blocks->get_value() > 0) { + _hit_ratio->set_value((double)_num_hit_blocks->get_value() / + _num_read_blocks->get_value()); + } + if (_num_read_blocks_5m->get_value() > 0) { + _hit_ratio_5m->set_value((double)_num_hit_blocks_5m->get_value() / + _num_read_blocks_5m->get_value()); + } + if (_num_read_blocks_1h->get_value() > 0) { + _hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / + _num_read_blocks_1h->get_value()); + } + } + recycle_stale_rowset_async_bottom_half(); recycle_deleted_blocks(); // gc - 
int64_t cur_time = UnixSeconds(); - SCOPED_CACHE_LOCK(_mutex); - while (!_time_to_key.empty()) { - auto begin = _time_to_key.begin(); - if (cur_time < begin->first) { - break; + { + int64_t cur_time = UnixSeconds(); + SCOPED_CACHE_LOCK(_mutex); + while (!_time_to_key.empty()) { + auto begin = _time_to_key.begin(); + if (cur_time < begin->first) { + break; + } + remove_if_ttl_file_unlock(begin->second, false, cache_lock); } - remove_if_ttl_file_unlock(begin->second, false, cache_lock); - } - - // report - _cur_cache_size_metrics->set_value(_cur_cache_size); - _cur_ttl_cache_size_metrics->set_value(_cur_cache_size - - _index_queue.get_capacity(cache_lock) - - _normal_queue.get_capacity(cache_lock) - - _disposable_queue.get_capacity(cache_lock)); - _cur_ttl_cache_lru_queue_cache_size_metrics->set_value(_ttl_queue.get_capacity(cache_lock)); - _cur_ttl_cache_lru_queue_element_count_metrics->set_value( - _ttl_queue.get_elements_num(cache_lock)); - _cur_normal_queue_cache_size_metrics->set_value(_normal_queue.get_capacity(cache_lock)); - _cur_normal_queue_element_count_metrics->set_value( - _normal_queue.get_elements_num(cache_lock)); - _cur_index_queue_cache_size_metrics->set_value(_index_queue.get_capacity(cache_lock)); - _cur_index_queue_element_count_metrics->set_value( - _index_queue.get_elements_num(cache_lock)); - _cur_disposable_queue_cache_size_metrics->set_value( - _disposable_queue.get_capacity(cache_lock)); - _cur_disposable_queue_element_count_metrics->set_value( - _disposable_queue.get_elements_num(cache_lock)); - - if (_num_read_blocks->get_value() > 0) { - _hit_ratio->set_value((double)_num_hit_blocks->get_value() / - _num_read_blocks->get_value()); - } - if (_num_read_blocks_5m->get_value() > 0) { - _hit_ratio_5m->set_value((double)_num_hit_blocks_5m->get_value() / - _num_read_blocks_5m->get_value()); - } - if (_num_read_blocks_1h->get_value() > 0) { - _hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / - _num_read_blocks_1h->get_value()); 
} } } diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index 0de33dadc8249d..f23d5a3799e0cf 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -397,10 +397,6 @@ class BlockFileCache { size_t get_available_cache_size(FileCacheType cache_type) const; - bool try_reserve_for_ttl(size_t size, std::lock_guard& cache_lock); - - bool try_reserve_for_ttl_without_lru(size_t size, std::lock_guard& cache_lock); - FileBlocks split_range_into_cells(const UInt128Wrapper& hash, const CacheContext& context, size_t offset, size_t size, FileBlock::State state, std::lock_guard& cache_lock); @@ -436,10 +432,10 @@ class BlockFileCache { void recycle_deleted_blocks(); - bool try_reserve_from_other_queue_by_hot_interval(FileCacheType cur_type, - std::vector other_cache_types, - size_t size, int64_t cur_time, - std::lock_guard& cache_lock); + bool try_reserve_from_other_queue_by_time_interval(FileCacheType cur_type, + std::vector other_cache_types, + size_t size, int64_t cur_time, + std::lock_guard& cache_lock); bool try_reserve_from_other_queue_by_size(FileCacheType cur_type, std::vector other_cache_types, @@ -515,7 +511,7 @@ class BlockFileCache { std::shared_ptr> _cur_disposable_queue_cache_size_metrics; std::array>, 4> _queue_evict_size_metrics; std::shared_ptr> _total_evict_size_metrics; - std::shared_ptr> _evict_by_heat_metrics_matrix[4][4]; + std::shared_ptr> _evict_by_time_metrics_matrix[4][4]; std::shared_ptr> _evict_by_size_metrics_matrix[4][4]; std::shared_ptr> _evict_by_self_lru_metrics_matrix[4]; std::shared_ptr> _evict_by_try_release; diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index 674879300452df..19041938a08346 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -42,7 +42,8 @@ std::string FileCacheSettings::to_string() const { FileCacheSettings get_file_cache_settings(size_t capacity, size_t 
max_query_cache_size, size_t normal_percent, size_t disposable_percent, - size_t index_percent, const std::string& storage) { + size_t index_percent, size_t ttl_percent, + const std::string& storage) { io::FileCacheSettings settings; if (capacity == 0) return settings; settings.capacity = capacity; @@ -59,12 +60,12 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach std::max(settings.index_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); - settings.ttl_queue_size = per_size * config::max_ttl_cache_ratio; + settings.ttl_queue_size = per_size * ttl_percent; settings.ttl_queue_elements = std::max(settings.ttl_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); - settings.query_queue_size = - settings.capacity - settings.disposable_queue_size - settings.index_queue_size; + settings.query_queue_size = settings.capacity - settings.disposable_queue_size - + settings.index_queue_size - settings.ttl_queue_size; settings.query_queue_elements = std::max(settings.query_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 30579ba7851b28..0d700d9303191f 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -29,6 +29,7 @@ inline static constexpr size_t FILE_CACHE_MAX_FILE_BLOCK_SIZE = 1 * 1024 * 1024; inline static constexpr size_t DEFAULT_NORMAL_PERCENT = 40; inline static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 5; inline static constexpr size_t DEFAULT_INDEX_PERCENT = 5; +inline static constexpr size_t DEFAULT_TTL_PERCENT = 50; using uint128_t = vectorized::UInt128; @@ -107,6 +108,7 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach size_t normal_percent = DEFAULT_NORMAL_PERCENT, size_t disposable_percent = DEFAULT_DISPOSABLE_PERCENT, size_t index_percent = 
DEFAULT_INDEX_PERCENT, + size_t ttl_percent = DEFAULT_TTL_PERCENT, const std::string& storage = "disk"); struct CacheContext { diff --git a/be/src/io/fs/s3_file_writer.cpp b/be/src/io/fs/s3_file_writer.cpp index e40b9e171eb08f..7a06ce22074621 100644 --- a/be/src/io/fs/s3_file_writer.cpp +++ b/be/src/io/fs/s3_file_writer.cpp @@ -379,7 +379,14 @@ Status S3FileWriter::_set_upload_to_remote_less_than_buffer_size() { } void S3FileWriter::_put_object(UploadFileBuffer& buf) { - DCHECK(state() != State::CLOSED) << fmt::format("state is {}", state()); + if (state() == State::CLOSED) { + DCHECK(state() != State::CLOSED) + << "state=" << (int)state() << " path=" << _obj_storage_path_opts.path.native(); + LOG_WARNING("failed to put object because file closed, file path {}", + _obj_storage_path_opts.path.native()); + buf.set_status(Status::InternalError("try to put closed file")); + return; + } const auto& client = _obj_client->get(); if (nullptr == client) { buf.set_status(Status::InternalError("invalid obj storage client")); diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index d9d37d13198bbd..cd4f89b57ec50d 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -65,7 +65,9 @@ class BloomFilterColumnPredicate : public ColumnPredicate { uint16_t evaluate(const vectorized::IColumn& column, const uint8_t* null_map, uint16_t* sel, uint16_t size) const { if constexpr (is_nullable) { - DCHECK(null_map); + if (!null_map) { + throw Exception(ErrorCode::INTERNAL_ERROR, "null_map is nullptr"); + } } uint16_t new_size = 0; @@ -91,7 +93,9 @@ class BloomFilterColumnPredicate : public ColumnPredicate { int get_filter_id() const override { int filter_id = _filter->get_filter_id(); - DCHECK(filter_id != -1); + if (filter_id == 1) { + throw Exception(ErrorCode::INTERNAL_ERROR, "filter_id is -1"); + } return filter_id; } diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 
a40e28669e90cc..738087a702f070 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -613,11 +613,9 @@ Status Compaction::do_inverted_index_compaction() { fs, std::string {InvertedIndexDescriptor::get_index_file_path_prefix(seg_path)}, _cur_tablet_schema->get_inverted_index_storage_format(), rowset->rowset_meta()->inverted_index_file_info(seg_id)); - bool open_idx_file_cache = false; RETURN_NOT_OK_STATUS_WITH_WARN( - inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache), - "inverted_index_file_reader init failed"); + inverted_index_file_reader->init(config::inverted_index_read_buffer_size), + "inverted_index_file_reader init faiqled"); inverted_index_file_readers[m.second] = std::move(inverted_index_file_reader); } @@ -666,9 +664,11 @@ Status Compaction::do_inverted_index_compaction() { DORIS_TRY(inverted_index_file_readers[src_segment_id]->open(index_meta)); } for (int dest_segment_id = 0; dest_segment_id < dest_segment_num; dest_segment_id++) { - auto* dest_dir = + auto dest_dir = DORIS_TRY(inverted_index_file_writers[dest_segment_id]->open(index_meta)); - dest_index_dirs[dest_segment_id] = dest_dir; + // Destination directories in dest_index_dirs do not need to be deconstructed, + // but their lifecycle must be managed by inverted_index_file_writers. 
+ dest_index_dirs[dest_segment_id] = dest_dir.get(); } auto st = compact_column(index_meta->index_id(), src_idx_dirs, dest_index_dirs, index_tmp_path.native(), trans_vec, dest_segment_num_rows); @@ -783,9 +783,8 @@ void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { InvertedIndexDescriptor::get_index_file_path_prefix(*seg_path)}, _cur_tablet_schema->get_inverted_index_storage_format(), rowset->rowset_meta()->inverted_index_file_info(i)); - bool open_idx_file_cache = false; auto st = inverted_index_file_reader->init( - config::inverted_index_read_buffer_size, open_idx_file_cache); + config::inverted_index_read_buffer_size); index_file_path = inverted_index_file_reader->get_index_file_path(index_meta); DBUG_EXECUTE_IF( "Compaction::construct_skip_inverted_index_index_file_reader_init_" @@ -1127,6 +1126,18 @@ Status CloudCompactionMixin::execute_compact_impl(int64_t permits) { RETURN_IF_ERROR(merge_input_rowsets()); + DBUG_EXECUTE_IF("CloudFullCompaction::modify_rowsets.wrong_rowset_id", { + DCHECK(compaction_type() == ReaderType::READER_FULL_COMPACTION); + RowsetId id; + id.version = 2; + id.hi = _output_rowset->rowset_meta()->rowset_id().hi + ((int64_t)(1) << 56); + id.mi = _output_rowset->rowset_meta()->rowset_id().mi; + id.lo = _output_rowset->rowset_meta()->rowset_id().lo; + _output_rowset->rowset_meta()->set_rowset_id(id); + LOG(INFO) << "[Debug wrong rowset id]:" + << _output_rowset->rowset_meta()->rowset_id().to_string(); + }) + RETURN_IF_ERROR(_engine.meta_mgr().commit_rowset(*_output_rowset->rowset_meta().get())); // 4. 
modify rowsets in memory diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index d3bd0f0a3a2436..11249bafb1e3c0 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -35,6 +35,7 @@ #include #include +#include "common/config.h" #include "io/io_common.h" #include "olap/olap_define.h" #include "olap/rowset/rowset_fwd.h" @@ -394,6 +395,8 @@ using ColumnId = uint32_t; using UniqueIdSet = std::set; // Column unique Id -> column id map using UniqueIdToColumnIdMap = std::map; +struct RowsetId; +RowsetId next_rowset_id(); // 8 bit rowset id version // 56 bit, inc number from 1 @@ -412,7 +415,12 @@ struct RowsetId { auto [_, ec] = std::from_chars(rowset_id_str.data(), rowset_id_str.data() + rowset_id_str.length(), high); if (ec != std::errc {}) [[unlikely]] { - LOG(FATAL) << "failed to init rowset id: " << rowset_id_str; + if (config::force_regenerate_rowsetid_on_start_error) { + LOG(WARNING) << "failed to init rowset id: " << rowset_id_str; + high = next_rowset_id().hi; + } else { + LOG(FATAL) << "failed to init rowset id: " << rowset_id_str; + } } init(1, high, 0, 0); } else { diff --git a/be/src/olap/options.cpp b/be/src/olap/options.cpp index 9c500c10993395..8668f8319d10e6 100644 --- a/be/src/olap/options.cpp +++ b/be/src/olap/options.cpp @@ -32,6 +32,7 @@ #include "common/status.h" #include "gutil/strings/split.h" #include "gutil/strings/strip.h" +#include "io/cache/file_cache_common.h" #include "io/fs/local_file_system.h" #include "olap/olap_define.h" #include "olap/utils.h" @@ -56,6 +57,7 @@ static std::string CACHE_QUERY_LIMIT_SIZE = "query_limit"; static std::string CACHE_NORMAL_PERCENT = "normal_percent"; static std::string CACHE_DISPOSABLE_PERCENT = "disposable_percent"; static std::string CACHE_INDEX_PERCENT = "index_percent"; +static std::string CACHE_TTL_PERCENT = "ttl_percent"; static std::string CACHE_STORAGE = "storage"; static std::string CACHE_STORAGE_DISK = "disk"; static std::string CACHE_STORAGE_MEMORY = "memory"; @@ 
-206,7 +208,7 @@ void parse_conf_broken_store_paths(const string& config_path, std::set rlock(_load_index_lock); if (_inverted_index) { - RETURN_IF_ERROR(_inverted_index->new_iterator(read_options.stats, + RETURN_IF_ERROR(_inverted_index->new_iterator(read_options.io_ctx, read_options.stats, read_options.runtime_state, iterator)); } } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp index fb2479517166fc..6e9d61db7fddb4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp @@ -20,8 +20,9 @@ namespace doris::segment_v2 { ConjunctionQuery::ConjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, const io::IOContext* io_ctx) : _searcher(searcher), + _io_ctx(io_ctx), _index_version(_searcher->getReader()->getIndexVersion()), _conjunction_ratio(query_options.inverted_index_conjunction_opt_threshold) {} @@ -48,7 +49,7 @@ void ConjunctionQuery::add(const std::wstring& field_name, const std::vectorgetReader()->termDocs(t); + TermDocs* term_doc = _searcher->getReader()->termDocs(t, _io_ctx); _term_docs.push_back(term_doc); iterators.emplace_back(term_doc); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h index 2571392d5294e9..b9bfee2bfb1f7a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h @@ -27,7 +27,7 @@ namespace doris::segment_v2 { class ConjunctionQuery : public Query { public: ConjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~ConjunctionQuery() override; void 
add(const std::wstring& field_name, const std::vector& terms) override; @@ -41,6 +41,7 @@ class ConjunctionQuery : public Query { public: std::shared_ptr _searcher; + const io::IOContext* _io_ctx = nullptr; IndexVersion _index_version = IndexVersion::kV0; int32_t _conjunction_ratio = 1000; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp index 650a88c064611c..852357073d3b1d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp @@ -20,8 +20,8 @@ namespace doris::segment_v2 { DisjunctionQuery::DisjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) - : _searcher(searcher) {} + const TQueryOptions& query_options, const io::IOContext* io_ctx) + : _searcher(searcher), _io_ctx(io_ctx) {} void DisjunctionQuery::add(const std::wstring& field_name, const std::vector& terms) { if (terms.empty()) { @@ -36,7 +36,7 @@ void DisjunctionQuery::search(roaring::Roaring& roaring) { auto func = [this, &roaring](const std::string& term, bool first) { std::wstring ws_term = StringUtil::string_to_wstring(term); auto* t = _CLNEW Term(_field_name.c_str(), ws_term.c_str()); - auto* term_doc = _searcher->getReader()->termDocs(t); + auto* term_doc = _searcher->getReader()->termDocs(t, _io_ctx); TermIterator iterator(term_doc); DocRange doc_range; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h index 357831461571c7..8d0559ee4b0c98 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h @@ -27,7 +27,7 @@ namespace doris::segment_v2 { class DisjunctionQuery : public Query { public: DisjunctionQuery(const std::shared_ptr& searcher, - 
const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~DisjunctionQuery() override = default; void add(const std::wstring& field_name, const std::vector& terms) override; @@ -35,6 +35,7 @@ class DisjunctionQuery : public Query { private: std::shared_ptr _searcher; + const io::IOContext* _io_ctx = nullptr; std::wstring _field_name; std::vector _terms; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp index ec1b5bdd9e4d35..f82433826e9581 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp @@ -30,7 +30,7 @@ namespace doris::segment_v2 { PhraseEdgeQuery::PhraseEdgeQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, const io::IOContext* io_ctx) : _searcher(searcher), _query(std::make_unique()), _max_expansions(query_options.inverted_index_max_expansions) {} diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h index 5daf382e0d08fa..9eb3bd57c4a916 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h @@ -31,7 +31,7 @@ namespace doris::segment_v2 { class PhraseEdgeQuery : public Query { public: PhraseEdgeQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~PhraseEdgeQuery() override = default; void add(const std::wstring& field_name, const std::vector& terms) override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp index 
407e515dc9212f..88bb3c1171fa30 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp @@ -23,7 +23,8 @@ namespace doris::segment_v2 { PhrasePrefixQuery::PhrasePrefixQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, + const io::IOContext* io_ctx) : _searcher(searcher), _query(std::make_unique()), _max_expansions(query_options.inverted_index_max_expansions) {} diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h index e565c0409cf4cd..5cac597951eac7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h @@ -31,7 +31,7 @@ namespace doris::segment_v2 { class PhrasePrefixQuery : public Query { public: PhrasePrefixQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~PhrasePrefixQuery() override = default; void add(const std::wstring& field_name, const std::vector& terms) override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp index 9a3ecc68f89fa0..38e60b0f089dc0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp @@ -123,8 +123,8 @@ bool OrderedSloppyPhraseMatcher::stretch_to_order(PostingsAndPosition* prev_post } PhraseQuery::PhraseQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) - : _searcher(searcher) {} + const TQueryOptions& query_options, const io::IOContext* io_ctx) + : _searcher(searcher), _io_ctx(io_ctx) {} PhraseQuery::~PhraseQuery() { for (auto& 
term_doc : _term_docs) { @@ -173,7 +173,7 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vectorgetReader()->termDocs(t); + TermDocs* term_doc = _searcher->getReader()->termDocs(t, _io_ctx); _term_docs.push_back(term_doc); _lead1 = TermIterator(term_doc); return; @@ -185,7 +185,7 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vectorgetReader()->termPositions(t); + TermPositions* term_pos = _searcher->getReader()->termPositions(t, _io_ctx); _term_docs.push_back(term_pos); if (is_save_iter) { iterators.emplace_back(term_pos); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h index 35a479ff7f9781..a2c3a7ae91afcc 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h @@ -87,7 +87,7 @@ using Matcher = std::variant; class PhraseQuery : public Query { public: PhraseQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~PhraseQuery() override; void add(const InvertedIndexQueryInfo& query_info) override; @@ -112,6 +112,7 @@ class PhraseQuery : public Query { private: std::shared_ptr _searcher; + const io::IOContext* _io_ctx = nullptr; TermIterator _lead1; TermIterator _lead2; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h index c295765ec63478..c0eac69deaeaf3 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h @@ -27,6 +27,7 @@ #include #include "common/status.h" +#include "io/io_common.h" #include "roaring/roaring.hh" CL_NS_USE(index) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp 
index 007da8289dcdb0..69de4b7818b870 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp @@ -25,10 +25,10 @@ namespace doris::segment_v2 { RegexpQuery::RegexpQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, const io::IOContext* io_ctx) : _searcher(searcher), _max_expansions(query_options.inverted_index_max_expansions), - _query(searcher, query_options) {} + _query(searcher, query_options, io_ctx) {} void RegexpQuery::add(const std::wstring& field_name, const std::vector& patterns) { if (patterns.size() != 1) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h index 336b2d0b6a671d..650ad2bf10b002 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h @@ -28,7 +28,7 @@ namespace doris::segment_v2 { class RegexpQuery : public Query { public: RegexpQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~RegexpQuery() override = default; void add(const std::wstring& field_name, const std::vector& patterns) override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp index 88a8f2417228bc..f988c46c027c26 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp @@ -76,13 +76,6 @@ Status compact_column(int64_t index_id, // when index_writer is destroyed, if closeDir is set, dir will be close // _CLDECDELETE(dir) will try to ref_cnt--, when it decreases to 1, dir will be destroyed. 
_CLDECDELETE(dir) - for (auto* d : dest_index_dirs) { - if (d != nullptr) { - // NOTE: DO NOT close dest dir here, because it will be closed when dest index writer finalize. - //d->close(); - //_CLDELETE(d); - } - } // delete temporary segment_path, only when inverted_index_ram_dir_enable is false if (!config::inverted_index_ram_dir_enable) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp index 7613df112ed9aa..60006ea84550a2 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp @@ -59,6 +59,8 @@ class CSIndexInput : public lucene::store::BufferedIndexInput { CL_NS(store)::IndexInput* base; int64_t fileOffset; int64_t _length; + const io::IOContext* _io_ctx = nullptr; + bool _is_index_file = false; // Indicates if the file is a TII file protected: void readInternal(uint8_t* /*b*/, const int32_t /*len*/) override; @@ -75,6 +77,8 @@ class CSIndexInput : public lucene::store::BufferedIndexInput { const char* getDirectoryType() const override { return DorisCompoundReader::getClassName(); } const char* getObjectName() const override { return getClassName(); } static const char* getClassName() { return "CSIndexInput"; } + void setIoContext(const void* io_ctx) override; + void setIndexFile(bool isIndexFile) override; }; CSIndexInput::CSIndexInput(CL_NS(store)::IndexInput* base, const int64_t fileOffset, @@ -92,9 +96,12 @@ void CSIndexInput::readInternal(uint8_t* b, const int32_t len) { if (start + len > _length) { _CLTHROWA(CL_ERR_IO, "read past EOF"); } + base->setIoContext(_io_ctx); + base->setIndexFile(_is_index_file); base->seek(fileOffset + start); bool read_from_buffer = true; base->readBytes(b, len, read_from_buffer); + base->setIoContext(nullptr); } CSIndexInput::~CSIndexInput() = default; @@ -111,6 +118,14 @@ CSIndexInput::CSIndexInput(const CSIndexInput& clone) : 
BufferedIndexInput(clone void CSIndexInput::close() {} +void CSIndexInput::setIoContext(const void* io_ctx) { + _io_ctx = static_cast(io_ctx); +} + +void CSIndexInput::setIndexFile(bool isIndexFile) { + _is_index_file = isIndexFile; +} + DorisCompoundReader::DorisCompoundReader(CL_NS(store)::IndexInput* stream, int32_t read_buffer_size) : _ram_dir(new lucene::store::RAMDirectory()), _stream(stream), diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp index e0c75922c98bb2..113833d560fd06 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp @@ -27,10 +27,9 @@ namespace doris::segment_v2 { -Status InvertedIndexFileReader::init(int32_t read_buffer_size, bool open_idx_file_cache) { +Status InvertedIndexFileReader::init(int32_t read_buffer_size) { if (!_inited) { _read_buffer_size = read_buffer_size; - _open_idx_file_cache = open_idx_file_cache; if (_storage_format == InvertedIndexStorageFormatPB::V2) { auto st = _init_from_v2(read_buffer_size); if (!st.ok()) { @@ -76,7 +75,6 @@ Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) { "CLuceneError occur when open idx file {}, error msg: {}", index_file_full_path, err.what()); } - index_input->setIdxFileCache(_open_idx_file_cache); _stream = std::unique_ptr(index_input); // 3. read file @@ -198,7 +196,6 @@ Result> InvertedIndexFileReader::_open( } // 3. 
read file in DorisCompoundReader - index_input->setIdxFileCache(_open_idx_file_cache); compound_reader = std::make_unique(index_input, _read_buffer_size); } catch (CLuceneError& err) { return ResultError(Status::Error( diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h index 8bc28b1882f9d8..3b7161c7643cef 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h @@ -58,8 +58,7 @@ class InvertedIndexFileReader { _storage_format(storage_format), _idx_file_info(idx_file_info) {} - Status init(int32_t read_buffer_size = config::inverted_index_read_buffer_size, - bool open_idx_file_cache = false); + Status init(int32_t read_buffer_size = config::inverted_index_read_buffer_size); Result> open(const TabletIndex* index_meta) const; void debug_file_entries(); std::string get_index_file_cache_key(const TabletIndex* index_meta) const; @@ -80,7 +79,6 @@ class InvertedIndexFileReader { const io::FileSystemSPtr _fs; std::string _index_path_prefix; int32_t _read_buffer_size = -1; - bool _open_idx_file_cache = false; InvertedIndexStorageFormatPB _storage_format; mutable std::shared_mutex _mutex; // Use mutable for const read operations bool _inited = false; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp index 5599faa351dfd6..2d50730daffe8a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp @@ -19,17 +19,14 @@ #include +#include #include #include "common/status.h" -#include "io/fs/file_writer.h" -#include "io/fs/local_file_system.h" -#include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" #include "olap/rowset/segment_v2/inverted_index_fs_directory.h" #include 
"olap/rowset/segment_v2/inverted_index_reader.h" #include "olap/tablet_schema.h" -#include "runtime/exec_env.h" namespace doris::segment_v2 { @@ -38,32 +35,11 @@ Status InvertedIndexFileWriter::initialize(InvertedIndexDirectoryMap& indices_di return Status::OK(); } -Result InvertedIndexFileWriter::open(const TabletIndex* index_meta) { - auto tmp_file_dir = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir(); - const auto& local_fs = io::global_local_filesystem(); - auto local_fs_index_path = InvertedIndexDescriptor::get_temporary_index_path( - tmp_file_dir.native(), _rowset_id, _seg_id, index_meta->index_id(), - index_meta->get_index_suffix()); - bool exists = false; - auto st = local_fs->exists(local_fs_index_path, &exists); - DBUG_EXECUTE_IF("InvertedIndexFileWriter::open_local_fs_exists_error", - { st = Status::Error("debug point: no such file error"); }) - if (!st.ok()) { - LOG(ERROR) << "index_path:" << local_fs_index_path << " exists error:" << st; - return ResultError(st); - } - DBUG_EXECUTE_IF("InvertedIndexFileWriter::open_local_fs_exists_true", { exists = true; }) - if (exists) { - LOG(ERROR) << "try to init a directory:" << local_fs_index_path << " already exists"; - return ResultError( - Status::InternalError("InvertedIndexFileWriter::open directory already exists")); - } - - bool can_use_ram_dir = true; - auto* dir = DorisFSDirectoryFactory::getDirectory(local_fs, local_fs_index_path.c_str(), - can_use_ram_dir); - auto key = std::make_pair(index_meta->index_id(), index_meta->get_index_suffix()); - auto [it, inserted] = _indices_dirs.emplace(key, std::unique_ptr(dir)); +Status InvertedIndexFileWriter::_insert_directory_into_map(int64_t index_id, + const std::string& index_suffix, + std::shared_ptr dir) { + auto key = std::make_pair(index_id, index_suffix); + auto [it, inserted] = _indices_dirs.emplace(key, std::move(dir)); if (!inserted) { LOG(ERROR) << "InvertedIndexFileWriter::open attempted to insert a duplicate key: (" << key.first << ", 
" << key.second << ")"; @@ -71,8 +47,23 @@ Result InvertedIndexFileWriter::open(const TabletIndex* index for (const auto& entry : _indices_dirs) { LOG(ERROR) << "Key: (" << entry.first.first << ", " << entry.first.second << ")"; } - return ResultError(Status::InternalError( - "InvertedIndexFileWriter::open attempted to insert a duplicate dir")); + return Status::InternalError( + "InvertedIndexFileWriter::open attempted to insert a duplicate dir"); + } + return Status::OK(); +} + +Result> InvertedIndexFileWriter::open( + const TabletIndex* index_meta) { + auto local_fs_index_path = InvertedIndexDescriptor::get_temporary_index_path( + _tmp_dir, _rowset_id, _seg_id, index_meta->index_id(), index_meta->get_index_suffix()); + bool can_use_ram_dir = true; + auto dir = std::shared_ptr(DorisFSDirectoryFactory::getDirectory( + _local_fs, local_fs_index_path.c_str(), can_use_ram_dir)); + auto st = + _insert_directory_into_map(index_meta->index_id(), index_meta->get_index_suffix(), dir); + if (!st.ok()) { + return ResultError(st); } return dir; @@ -222,7 +213,7 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire int64_t chunk = bufferLength; while (remainder > 0) { - int64_t len = std::min(std::min(chunk, length), remainder); + int64_t len = std::min({chunk, length, remainder}); input->readBytes(buffer, len); output->writeBytes(buffer, len); remainder -= len; @@ -252,244 +243,326 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire Status InvertedIndexFileWriter::write_v1() { int64_t total_size = 0; + std::string err_msg; + lucene::store::Directory* out_dir = nullptr; + std::exception_ptr eptr; + std::unique_ptr output = nullptr; for (const auto& entry : _indices_dirs) { const int64_t index_id = entry.first.first; const auto& index_suffix = entry.first.second; try { - const auto& directory = entry.second; - std::vector files; - directory->list(&files); - // remove write.lock file - auto it = 
std::find(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE); - if (it != files.end()) { - files.erase(it); - } + const auto& directory = entry.second.get(); - std::vector sorted_files; - for (auto file : files) { - FileInfo file_info; - file_info.filename = file; - file_info.filesize = directory->fileLength(file.c_str()); - sorted_files.emplace_back(std::move(file_info)); - } - sort_files(sorted_files); - - int32_t file_count = sorted_files.size(); - - io::Path cfs_path(InvertedIndexDescriptor::get_index_file_path_v1( - _index_path_prefix, index_id, index_suffix)); - auto idx_path = cfs_path.parent_path(); - std::string idx_name = cfs_path.filename(); - // write file entries to ram directory to get header length - lucene::store::RAMDirectory ram_dir; - auto* out_idx = ram_dir.createOutput(idx_name.c_str()); - DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_ram_output_is_nullptr", - { out_idx = nullptr; }) - if (out_idx == nullptr) { - LOG(WARNING) << "Write compound file error: RAMDirectory output is nullptr."; - _CLTHROWA(CL_ERR_IO, "Create RAMDirectory output error"); - } + // Prepare sorted file list + auto sorted_files = prepare_sorted_files(directory); + + // Calculate header length + auto [header_length, header_file_count] = + calculate_header_length(sorted_files, directory); + + // Create output stream + auto result = create_output_stream_v1(index_id, index_suffix); + out_dir = result.first; + output = std::move(result.second); - std::unique_ptr ram_output(out_idx); - ram_output->writeVInt(file_count); - // write file entries in ram directory - // number of files, which data are in header - int header_file_count = 0; - int64_t header_file_length = 0; - const int64_t buffer_length = 16384; - uint8_t ram_buffer[buffer_length]; - for (auto file : sorted_files) { - ram_output->writeString(file.filename); // file name - ram_output->writeLong(0); // data offset - ram_output->writeLong(file.filesize); // file length - header_file_length += 
file.filesize; - if (header_file_length <= DorisFSDirectory::MAX_HEADER_DATA_SIZE) { - copyFile(file.filename.c_str(), directory.get(), ram_output.get(), ram_buffer, - buffer_length); - header_file_count++; - } - } - auto header_len = ram_output->getFilePointer(); - ram_output->close(); - ram_dir.deleteFile(idx_name.c_str()); - ram_dir.close(); - - auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, idx_path.c_str()); - out_dir->set_file_writer_opts(_opts); - - auto* out = out_dir->createOutput(idx_name.c_str()); - DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_out_dir_createOutput_nullptr", - { out = nullptr; }); - if (out == nullptr) { - LOG(WARNING) << "Write compound file error: CompoundDirectory output is nullptr."; - _CLTHROWA(CL_ERR_IO, "Create CompoundDirectory output error"); - } - std::unique_ptr output(out); size_t start = output->getFilePointer(); - output->writeVInt(file_count); - // write file entries - int64_t data_offset = header_len; - uint8_t header_buffer[buffer_length]; - for (int i = 0; i < sorted_files.size(); ++i) { - auto file = sorted_files[i]; - output->writeString(file.filename); // FileName - // DataOffset - if (i < header_file_count) { - // file data write in header, so we set its offset to -1. 
- output->writeLong(-1); - } else { - output->writeLong(data_offset); - } - output->writeLong(file.filesize); // FileLength - if (i < header_file_count) { - // append data - copyFile(file.filename.c_str(), directory.get(), output.get(), header_buffer, - buffer_length); - } else { - data_offset += file.filesize; - } - } - // write rest files' data - uint8_t data_buffer[buffer_length]; - for (int i = header_file_count; i < sorted_files.size(); ++i) { - auto file = sorted_files[i]; - copyFile(file.filename.c_str(), directory.get(), output.get(), data_buffer, - buffer_length); - } - out_dir->close(); - // NOTE: need to decrease ref count, but not to delete here, - // because index cache may get the same directory from DIRECTORIES - _CLDECDELETE(out_dir) + // Write header and data + write_header_and_data_v1(output.get(), sorted_files, directory, header_length, + header_file_count); + + // Collect file information auto compound_file_size = output->getFilePointer() - start; - output->close(); - //LOG(INFO) << (idx_path / idx_name).c_str() << " size:" << compound_file_size; total_size += compound_file_size; - InvertedIndexFileInfo_IndexInfo index_info; - index_info.set_index_id(index_id); - index_info.set_index_suffix(index_suffix); - index_info.set_index_file_size(compound_file_size); - auto* new_index_info = _file_info.add_index_info(); - *new_index_info = index_info; + add_index_info(index_id, index_suffix, compound_file_size); } catch (CLuceneError& err) { + eptr = std::current_exception(); auto index_path = InvertedIndexDescriptor::get_index_file_path_v1( _index_path_prefix, index_id, index_suffix); - LOG(ERROR) << "CLuceneError occur when write_v1 idx file " << index_path - << " error msg: " << err.what(); + err_msg = "CLuceneError occur when write_v1 idx file " + index_path + + " error msg: " + err.what(); + } - return Status::Error( - "CLuceneError occur when write_v1 idx file: {}, error msg: {}", index_path, - err.what()); + // Close and clean up + 
finalize_output_dir(out_dir); + if (output) { + output->close(); + } + + if (eptr) { + LOG(ERROR) << err_msg; + return Status::Error(err_msg); } } + _total_file_size = total_size; return Status::OK(); } Status InvertedIndexFileWriter::write_v2() { - io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; - std::unique_ptr compound_file_output; + std::string err_msg; + lucene::store::Directory* out_dir = nullptr; + std::unique_ptr compound_file_output = nullptr; + std::exception_ptr eptr; try { - // Create the output stream to write the compound file + // Calculate header length and initialize offset int64_t current_offset = headerLength(); + // Prepare file metadata + auto file_metadata = prepare_file_metadata_v2(current_offset); - io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; + // Create output stream + auto result = create_output_stream_v2(); + out_dir = result.first; + compound_file_output = std::move(result.second); - auto* out_dir = - DorisFSDirectoryFactory::getDirectory(_fs, index_path.parent_path().c_str()); - out_dir->set_file_writer_opts(_opts); + // Write version and number of indices + write_version_and_indices_count(compound_file_output.get()); - std::unique_ptr compound_file_output; + // Write index headers and file metadata + write_index_headers_and_metadata(compound_file_output.get(), file_metadata); - DCHECK(_idx_v2_writer != nullptr) << "inverted index file writer v2 is nullptr"; - compound_file_output = std::unique_ptr( - out_dir->createOutputV2(_idx_v2_writer.get())); + // Copy file data + copy_files_data_v2(compound_file_output.get(), file_metadata); + + _total_file_size = compound_file_output->getFilePointer(); + _file_info.set_index_size(_total_file_size); + } catch (CLuceneError& err) { + eptr = std::current_exception(); + auto index_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); + err_msg = "CLuceneError occur when close idx file " + 
index_path + + " error msg: " + err.what(); + } - // Write the version number - compound_file_output->writeInt(InvertedIndexStorageFormatPB::V2); + // Close and clean up + finalize_output_dir(out_dir); + if (compound_file_output) { + compound_file_output->close(); + } - // Write the number of indices - const auto numIndices = static_cast(_indices_dirs.size()); - compound_file_output->writeInt(numIndices); + if (eptr) { + LOG(ERROR) << err_msg; + return Status::Error(err_msg); + } - std::vector> - file_metadata; // Store file name, offset, file length, and corresponding directory + return Status::OK(); +} - // First, write all index information and file metadata - for (const auto& entry : _indices_dirs) { - const int64_t index_id = entry.first.first; - const auto& index_suffix = entry.first.second; - const auto& dir = entry.second; - std::vector files; - dir->list(&files); - - auto it = std::find(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE); - if (it != files.end()) { - files.erase(it); - } - // sort file list by file length - std::vector> sorted_files; - for (const auto& file : files) { - sorted_files.emplace_back(file, dir->fileLength(file.c_str())); - } +// Helper function implementations +std::vector InvertedIndexFileWriter::prepare_sorted_files( + lucene::store::Directory* directory) { + std::vector files; + directory->list(&files); + + // Remove write.lock file + files.erase(std::remove(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE), + files.end()); + + std::vector sorted_files; + for (const auto& file : files) { + FileInfo file_info; + file_info.filename = file; + file_info.filesize = directory->fileLength(file.c_str()); + sorted_files.push_back(std::move(file_info)); + } - std::sort( - sorted_files.begin(), sorted_files.end(), - [](const std::pair& a, - const std::pair& b) { return (a.second < b.second); }); - - int32_t file_count = sorted_files.size(); - - // Write the index ID and the number of files - 
compound_file_output->writeLong(index_id); - compound_file_output->writeInt(static_cast(index_suffix.length())); - compound_file_output->writeBytes(reinterpret_cast(index_suffix.data()), - index_suffix.length()); - compound_file_output->writeInt(file_count); - - // Calculate the offset for each file and write the file metadata - for (const auto& file : sorted_files) { - int64_t file_length = dir->fileLength(file.first.c_str()); - compound_file_output->writeInt(static_cast(file.first.length())); - compound_file_output->writeBytes( - reinterpret_cast(file.first.data()), file.first.length()); - compound_file_output->writeLong(current_offset); - compound_file_output->writeLong(file_length); - - file_metadata.emplace_back(file.first, current_offset, file_length, dir.get()); - current_offset += file_length; // Update the data offset - } + // Sort the files + sort_files(sorted_files); + return sorted_files; +} + +void InvertedIndexFileWriter::finalize_output_dir(lucene::store::Directory* out_dir) { + if (out_dir != nullptr) { + out_dir->close(); + _CLDECDELETE(out_dir) + } +} + +void InvertedIndexFileWriter::add_index_info(int64_t index_id, const std::string& index_suffix, + int64_t compound_file_size) { + InvertedIndexFileInfo_IndexInfo index_info; + index_info.set_index_id(index_id); + index_info.set_index_suffix(index_suffix); + index_info.set_index_file_size(compound_file_size); + auto* new_index_info = _file_info.add_index_info(); + *new_index_info = index_info; +} + +std::pair InvertedIndexFileWriter::calculate_header_length( + const std::vector& sorted_files, lucene::store::Directory* directory) { + // Use RAMDirectory to calculate header length + lucene::store::RAMDirectory ram_dir; + auto* out_idx = ram_dir.createOutput("temp_idx"); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::calculate_header_length_ram_output_is_nullptr", + { out_idx = nullptr; }) + if (out_idx == nullptr) { + LOG(WARNING) << "InvertedIndexFileWriter::calculate_header_length error: RAMDirectory 
" + "output is nullptr."; + _CLTHROWA(CL_ERR_IO, "Create RAMDirectory output error"); + } + std::unique_ptr ram_output(out_idx); + int32_t file_count = sorted_files.size(); + ram_output->writeVInt(file_count); + + int64_t header_file_length = 0; + const int64_t buffer_length = 16384; + uint8_t ram_buffer[buffer_length]; + int32_t header_file_count = 0; + for (const auto& file : sorted_files) { + ram_output->writeString(file.filename); + ram_output->writeLong(0); + ram_output->writeLong(file.filesize); + header_file_length += file.filesize; + + if (header_file_length <= DorisFSDirectory::MAX_HEADER_DATA_SIZE) { + copyFile(file.filename.c_str(), directory, ram_output.get(), ram_buffer, buffer_length); + header_file_count++; } + } - const int64_t buffer_length = 16384; - uint8_t header_buffer[buffer_length]; + int64_t header_length = ram_output->getFilePointer(); + ram_output->close(); + ram_dir.close(); + return {header_length, header_file_count}; +} + +std::pair> +InvertedIndexFileWriter::create_output_stream_v1(int64_t index_id, + const std::string& index_suffix) { + io::Path cfs_path(InvertedIndexDescriptor::get_index_file_path_v1(_index_path_prefix, index_id, + index_suffix)); + auto idx_path = cfs_path.parent_path(); + std::string idx_name = cfs_path.filename(); + + auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, idx_path.c_str()); + out_dir->set_file_writer_opts(_opts); + + auto* out = out_dir->createOutput(idx_name.c_str()); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_out_dir_createOutput_nullptr", + { out = nullptr; }); + if (out == nullptr) { + LOG(WARNING) << "InvertedIndexFileWriter::create_output_stream_v1 error: CompoundDirectory " + "output is nullptr."; + _CLTHROWA(CL_ERR_IO, "Create CompoundDirectory output error"); + } - // Next, write the file data - for (const auto& info : file_metadata) { - const std::string& file = std::get<0>(info); - auto* dir = std::get<3>(info); + std::unique_ptr output(out); + return {out_dir, 
std::move(output)}; +} - // Write the actual file data - copyFile(file.c_str(), dir, compound_file_output.get(), header_buffer, buffer_length); +void InvertedIndexFileWriter::write_header_and_data_v1(lucene::store::IndexOutput* output, + const std::vector& sorted_files, + lucene::store::Directory* directory, + int64_t header_length, + int32_t header_file_count) { + output->writeVInt(sorted_files.size()); + int64_t data_offset = header_length; + const int64_t buffer_length = 16384; + uint8_t buffer[buffer_length]; + + for (int i = 0; i < sorted_files.size(); ++i) { + auto file = sorted_files[i]; + output->writeString(file.filename); + + // DataOffset + if (i < header_file_count) { + // file data write in header, so we set its offset to -1. + output->writeLong(-1); + } else { + output->writeLong(data_offset); + } + output->writeLong(file.filesize); // FileLength + if (i < header_file_count) { + // append data + copyFile(file.filename.c_str(), directory, output, buffer, buffer_length); + } else { + data_offset += file.filesize; } + } - out_dir->close(); - // NOTE: need to decrease ref count, but not to delete here, - // because index cache may get the same directory from DIRECTORIES - _CLDECDELETE(out_dir) - _total_file_size = compound_file_output->getFilePointer(); - compound_file_output->close(); - _file_info.set_index_size(_total_file_size); - } catch (CLuceneError& err) { - LOG(ERROR) << "CLuceneError occur when close idx file " << index_path - << " error msg: " << err.what(); - if (compound_file_output) { - compound_file_output->close(); - compound_file_output.reset(); + for (size_t i = header_file_count; i < sorted_files.size(); ++i) { + copyFile(sorted_files[i].filename.c_str(), directory, output, buffer, buffer_length); + } +} + +std::pair> +InvertedIndexFileWriter::create_output_stream_v2() { + io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; + auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, 
index_path.parent_path().c_str()); + out_dir->set_file_writer_opts(_opts); + DCHECK(_idx_v2_writer != nullptr) << "inverted index file writer v2 is nullptr"; + auto compound_file_output = std::unique_ptr( + out_dir->createOutputV2(_idx_v2_writer.get())); + return std::make_pair(out_dir, std::move(compound_file_output)); +} + +void InvertedIndexFileWriter::write_version_and_indices_count(lucene::store::IndexOutput* output) { + // Write the version number + output->writeInt(InvertedIndexStorageFormatPB::V2); + + // Write the number of indices + const auto num_indices = static_cast(_indices_dirs.size()); + output->writeInt(num_indices); +} + +std::vector +InvertedIndexFileWriter::prepare_file_metadata_v2(int64_t& current_offset) { + std::vector file_metadata; + + for (const auto& entry : _indices_dirs) { + const int64_t index_id = entry.first.first; + const auto& index_suffix = entry.first.second; + auto* dir = entry.second.get(); + + // Get sorted files + auto sorted_files = prepare_sorted_files(dir); + + for (const auto& file : sorted_files) { + file_metadata.emplace_back(index_id, index_suffix, file.filename, current_offset, + file.filesize, dir); + current_offset += file.filesize; // Update the data offset } - return Status::Error( - "CLuceneError occur when close idx file: {}, error msg: {}", index_path.c_str(), - err.what()); } - return Status::OK(); + return file_metadata; +} + +void InvertedIndexFileWriter::write_index_headers_and_metadata( + lucene::store::IndexOutput* output, const std::vector& file_metadata) { + // Group files by index_id and index_suffix + std::map, std::vector> indices; + + for (const auto& meta : file_metadata) { + indices[{meta.index_id, meta.index_suffix}].push_back(meta); + } + + for (const auto& index_entry : indices) { + int64_t index_id = index_entry.first.first; + const std::string& index_suffix = index_entry.first.second; + const auto& files = index_entry.second; + + // Write the index ID and the number of files + 
output->writeLong(index_id); + output->writeInt(static_cast(index_suffix.length())); + output->writeBytes(reinterpret_cast(index_suffix.data()), + index_suffix.length()); + output->writeInt(static_cast(files.size())); + + // Write file metadata + for (const auto& file : files) { + output->writeInt(static_cast(file.filename.length())); + output->writeBytes(reinterpret_cast(file.filename.data()), + file.filename.length()); + output->writeLong(file.offset); + output->writeLong(file.length); + } + } +} + +void InvertedIndexFileWriter::copy_files_data_v2(lucene::store::IndexOutput* output, + const std::vector& file_metadata) { + const int64_t buffer_length = 16384; + uint8_t buffer[buffer_length]; + + for (const auto& meta : file_metadata) { + copyFile(meta.filename.c_str(), meta.directory, output, buffer, buffer_length); + } } } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h index 31e287d6dd3f71..3a2fcc1e6acaa7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h @@ -28,7 +28,9 @@ #include "io/fs/file_system.h" #include "io/fs/file_writer.h" +#include "io/fs/local_file_system.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" +#include "runtime/exec_env.h" namespace doris { class TabletIndex; @@ -36,7 +38,7 @@ class TabletIndex; namespace segment_v2 { class DorisFSDirectory; using InvertedIndexDirectoryMap = - std::map, std::unique_ptr>; + std::map, std::shared_ptr>; class InvertedIndexFileWriter; using InvertedIndexFileWriterPtr = std::unique_ptr; @@ -58,16 +60,19 @@ class InvertedIndexFileWriter { _rowset_id(std::move(rowset_id)), _seg_id(seg_id), _storage_format(storage_format), - _idx_v2_writer(std::move(file_writer)) {} + _local_fs(io::global_local_filesystem()), + _idx_v2_writer(std::move(file_writer)) { + auto tmp_file_dir = 
ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir(); + _tmp_dir = tmp_file_dir.native(); + } - Result open(const TabletIndex* index_meta); + Result> open(const TabletIndex* index_meta); Status delete_index(const TabletIndex* index_meta); Status initialize(InvertedIndexDirectoryMap& indices_dirs); - ~InvertedIndexFileWriter() = default; + virtual ~InvertedIndexFileWriter() = default; Status write_v2(); Status write_v1(); Status close(); - int64_t headerLength(); const InvertedIndexFileInfo* get_index_file_info() const { DCHECK(_closed) << debug_string(); return &_file_info; @@ -77,11 +82,7 @@ class InvertedIndexFileWriter { return _total_file_size; } const io::FileSystemSPtr& get_fs() const { return _fs; } - void sort_files(std::vector& file_infos); - void copyFile(const char* fileName, lucene::store::Directory* dir, - lucene::store::IndexOutput* output, uint8_t* buffer, int64_t bufferLength); InvertedIndexStorageFormatPB get_storage_format() const { return _storage_format; } - void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } std::string debug_string() const { @@ -99,12 +100,61 @@ class InvertedIndexFileWriter { } private: + // Helper functions shared between write_v1 and write_v2 + std::vector prepare_sorted_files(lucene::store::Directory* directory); + void sort_files(std::vector& file_infos); + void copyFile(const char* fileName, lucene::store::Directory* dir, + lucene::store::IndexOutput* output, uint8_t* buffer, int64_t bufferLength); + void finalize_output_dir(lucene::store::Directory* out_dir); + void add_index_info(int64_t index_id, const std::string& index_suffix, + int64_t compound_file_size); + int64_t headerLength(); + // Helper functions specific to write_v1 + std::pair calculate_header_length(const std::vector& sorted_files, + lucene::store::Directory* directory); + std::pair> + create_output_stream_v1(int64_t index_id, const std::string& index_suffix); + virtual void 
write_header_and_data_v1(lucene::store::IndexOutput* output, + const std::vector& sorted_files, + lucene::store::Directory* directory, + int64_t header_length, int32_t header_file_count); + // Helper functions specific to write_v2 + std::pair> + create_output_stream_v2(); + void write_version_and_indices_count(lucene::store::IndexOutput* output); + struct FileMetadata { + int64_t index_id; + std::string index_suffix; + std::string filename; + int64_t offset; + int64_t length; + lucene::store::Directory* directory; + + FileMetadata(int64_t id, const std::string& suffix, const std::string& file, int64_t off, + int64_t len, lucene::store::Directory* dir) + : index_id(id), + index_suffix(suffix), + filename(file), + offset(off), + length(len), + directory(dir) {} + }; + std::vector prepare_file_metadata_v2(int64_t& current_offset); + virtual void write_index_headers_and_metadata(lucene::store::IndexOutput* output, + const std::vector& file_metadata); + void copy_files_data_v2(lucene::store::IndexOutput* output, + const std::vector& file_metadata); + Status _insert_directory_into_map(int64_t index_id, const std::string& index_suffix, + std::shared_ptr dir); + // Member variables... 
InvertedIndexDirectoryMap _indices_dirs; const io::FileSystemSPtr _fs; std::string _index_path_prefix; std::string _rowset_id; int64_t _seg_id; InvertedIndexStorageFormatPB _storage_format; + std::string _tmp_dir; + const std::shared_ptr& _local_fs; // write to disk or stream io::FileWriterPtr _idx_v2_writer = nullptr; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp index ded71c8a6cc73e..29caf29936dddf 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp @@ -219,6 +219,27 @@ void DorisFSDirectory::FSIndexInput::close() { }*/ } +void DorisFSDirectory::FSIndexInput::setIoContext(const void* io_ctx) { + if (io_ctx) { + const auto& ctx = static_cast(io_ctx); + _io_ctx.reader_type = ctx->reader_type; + _io_ctx.query_id = ctx->query_id; + _io_ctx.file_cache_stats = ctx->file_cache_stats; + } else { + _io_ctx.reader_type = ReaderType::UNKNOWN; + _io_ctx.query_id = nullptr; + _io_ctx.file_cache_stats = nullptr; + } +} + +const void* DorisFSDirectory::FSIndexInput::getIoContext() { + return &_io_ctx; +} + +void DorisFSDirectory::FSIndexInput::setIndexFile(bool isIndexFile) { + _io_ctx.is_index_data = isIndexFile; +} + void DorisFSDirectory::FSIndexInput::seekInternal(const int64_t position) { CND_PRECONDITION(position >= 0 && position < _handle->_length, "Seeking out of range"); _pos = position; @@ -239,9 +260,23 @@ void DorisFSDirectory::FSIndexInput::readInternal(uint8_t* b, const int32_t len) _handle->_fpos = _pos; } + DBUG_EXECUTE_IF( + "DorisFSDirectory::FSIndexInput::readInternal", ({ + static thread_local std::unordered_map + thread_file_cache_map; + auto it = thread_file_cache_map.find(_io_ctx.query_id); + if (it != thread_file_cache_map.end()) { + if (_io_ctx.file_cache_stats != it->second) { + _CLTHROWA(CL_ERR_IO, "File cache statistics mismatch"); + } + } else { + 
thread_file_cache_map[_io_ctx.query_id] = _io_ctx.file_cache_stats; + } + })); + Slice result {b, (size_t)len}; size_t bytes_read = 0; - auto st = _handle->_reader->read_at(_pos, result, &bytes_read, &_io_ctx); + Status st = _handle->_reader->read_at(_pos, result, &bytes_read, &_io_ctx); DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexInput::readInternal_reader_read_at_error", { st = Status::InternalError( "debug point: DorisFSDirectory::FSIndexInput::readInternal_reader_read_at_error"); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h index 59ae6db1a9630d..fd92873c9707bf 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h @@ -180,8 +180,6 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput : BufferedIndexInput(buffer_size) { this->_pos = 0; this->_handle = std::move(handle); - this->_io_ctx.reader_type = ReaderType::READER_QUERY; - this->_io_ctx.is_index_data = false; } protected: @@ -199,8 +197,9 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput const char* getDirectoryType() const override { return DorisFSDirectory::getClassName(); } const char* getObjectName() const override { return getClassName(); } static const char* getClassName() { return "FSIndexInput"; } - - void setIdxFileCache(bool index) override { _io_ctx.is_index_data = index; } + void setIoContext(const void* io_ctx) override; + const void* getIoContext() override; + void setIndexFile(bool isIndexFile) override; std::mutex _this_lock; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index b7cfe7dfaffb31..889fee1fc87ef9 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -102,7 +102,8 @@ std::string 
InvertedIndexReader::get_index_file_path() { return _inverted_index_file_reader->get_index_file_path(&_index_meta); } -Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, +Status InvertedIndexReader::read_null_bitmap(const io::IOContext* io_ctx, + OlapReaderStatistics* stats, InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir) { SCOPED_RAW_TIMER(&stats->inverted_index_query_null_bitmap_timer); @@ -120,9 +121,7 @@ Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, if (!dir) { // TODO: ugly code here, try to refact. - bool open_idx_file_cache = true; - auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache); + auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size); if (!st.ok()) { LOG(WARNING) << st; return st; @@ -138,6 +137,7 @@ Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, InvertedIndexDescriptor::get_temporary_null_bitmap_file_name(); if (dir->fileExists(null_bitmap_file_name)) { null_bitmap_in = dir->openInput(null_bitmap_file_name); + null_bitmap_in->setIoContext(io_ctx); size_t null_bitmap_size = null_bitmap_in->length(); faststring buf; buf.resize(null_bitmap_size); @@ -165,7 +165,8 @@ Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, } Status InvertedIndexReader::handle_searcher_cache( - InvertedIndexCacheHandle* inverted_index_cache_handle, OlapReaderStatistics* stats) { + InvertedIndexCacheHandle* inverted_index_cache_handle, const io::IOContext* io_ctx, + OlapReaderStatistics* stats) { auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta); InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); if (InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, @@ -179,9 +180,7 @@ Status InvertedIndexReader::handle_searcher_cache( SCOPED_RAW_TIMER(&stats->inverted_index_searcher_open_timer); 
IndexSearcherPtr searcher; - bool open_idx_file_cache = true; - auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache); + auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size); if (!st.ok()) { LOG(WARNING) << st; return st; } @@ -191,7 +190,7 @@ Status InvertedIndexReader::handle_searcher_cache( // to avoid open directory additionally for null_bitmap // TODO: handle null bitmap procedure in new format. InvertedIndexQueryCacheHandle null_bitmap_cache_handle; - static_cast(read_null_bitmap(stats, &null_bitmap_cache_handle, dir.get())); + static_cast(read_null_bitmap(io_ctx, stats, &null_bitmap_cache_handle, dir.get())); RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, mem_tracker.get(), type())); auto* cache_value = new InvertedIndexSearcherCache::CacheValue( std::move(searcher), mem_tracker->consumption(), UnixMillis()); @@ -211,22 +210,21 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir, auto searcher_result = DORIS_TRY(index_searcher_builder->get_index_searcher(dir)); *searcher = searcher_result; - if (std::string(dir->getObjectName()) == "DorisCompoundReader") { - static_cast(dir)->getDorisIndexInput()->setIdxFileCache(false); - } + // NOTE: before mem_tracker hook becomes active, we calculate reader memory size by hand. 
mem_tracker->consume(index_searcher_builder->get_reader_size()); return Status::OK(); }; Status InvertedIndexReader::match_index_search( - OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, + const io::IOContext* io_ctx, OlapReaderStatistics* stats, RuntimeState* runtime_state, + InvertedIndexQueryType query_type, const InvertedIndexQueryInfo& query_info, + const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr& term_match_bitmap) { TQueryOptions queryOptions = runtime_state->query_options(); try { SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - auto query = QueryFactory::create(query_type, index_searcher, queryOptions); + auto query = QueryFactory::create(query_type, index_searcher, queryOptions, io_ctx); if (!query) { return Status::Error( "query type " + query_type_to_string(query_type) + ", query is nullptr"); @@ -240,15 +238,17 @@ Status InvertedIndexReader::match_index_search( return Status::OK(); } -Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, +Status FullTextIndexReader::new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); + *iterator = + InvertedIndexIterator::create_unique(io_ctx, stats, runtime_state, shared_from_this()); return Status::OK(); } -Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) 
{ SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); @@ -314,12 +314,12 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); + RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); searcher_ptr = std::get_if(&searcher_variant); if (searcher_ptr != nullptr) { term_match_bitmap = std::make_shared(); - RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, + RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, query_type, query_info, *searcher_ptr, term_match_bitmap)); term_match_bitmap->runOptimize(); cache->insert(cache_key, term_match_bitmap, &cache_handler); @@ -337,13 +337,15 @@ InvertedIndexReaderType FullTextIndexReader::type() { } Status StringTypeInvertedIndexReader::new_iterator( - OlapReaderStatistics* stats, RuntimeState* runtime_state, + const io::IOContext& io_ctx, OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); + *iterator = + InvertedIndexIterator::create_unique(io_ctx, stats, runtime_state, shared_from_this()); return Status::OK(); } -Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, +Status StringTypeInvertedIndexReader::query(const io::IOContext* io_ctx, + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, @@ -387,7 +389,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, auto result = std::make_shared(); FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - 
RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); + RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); searcher_ptr = std::get_if(&searcher_variant); if (searcher_ptr != nullptr) { @@ -396,7 +398,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_ANY_QUERY: case InvertedIndexQueryType::MATCH_ALL_QUERY: case InvertedIndexQueryType::EQUAL_QUERY: { - RETURN_IF_ERROR(match_index_search(stats, runtime_state, + RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, InvertedIndexQueryType::MATCH_ANY_QUERY, query_info, *searcher_ptr, result)); break; @@ -404,8 +406,8 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_PHRASE_QUERY: case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { - RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, - *searcher_ptr, result)); + RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, query_type, + query_info, *searcher_ptr, result)); break; } case InvertedIndexQueryType::LESS_THAN_QUERY: @@ -470,9 +472,11 @@ InvertedIndexReaderType StringTypeInvertedIndexReader::type() { return InvertedIndexReaderType::STRING_TYPE; } -Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, +Status BkdIndexReader::new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); + *iterator = + InvertedIndexIterator::create_unique(io_ctx, stats, runtime_state, shared_from_this()); return Status::OK(); } @@ -600,12 +604,12 @@ Status BkdIndexReader::invoke_bkd_query(const void* query_value, InvertedIndexQu 
return Status::OK(); } -Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) { +Status BkdIndexReader::try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) { try { std::shared_ptr r; - auto st = get_bkd_reader(r, stats); + auto st = get_bkd_reader(r, io_ctx, stats); if (!st.ok()) { LOG(WARNING) << "get bkd reader for " << _inverted_index_file_reader->get_index_file_path(&_index_meta) @@ -637,15 +641,15 @@ Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const std::string& return Status::OK(); } -Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status BkdIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) { SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); try { std::shared_ptr r; - auto st = get_bkd_reader(r, stats); + auto st = get_bkd_reader(r, io_ctx, stats); if (!st.ok()) { LOG(WARNING) << "get bkd reader for " << _inverted_index_file_reader->get_index_file_path(&_index_meta) @@ -681,11 +685,11 @@ Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_ } } -Status BkdIndexReader::get_bkd_reader(BKDIndexSearcherPtr& bkd_reader, +Status BkdIndexReader::get_bkd_reader(BKDIndexSearcherPtr& bkd_reader, const io::IOContext* io_ctx, OlapReaderStatistics* stats) { BKDIndexSearcherPtr* bkd_searcher = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); + 
RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); bkd_searcher = std::get_if(&searcher_variant); if (bkd_searcher) { @@ -1115,8 +1119,8 @@ Status InvertedIndexIterator::read_from_inverted_index( } } - RETURN_IF_ERROR( - _reader->query(_stats, _runtime_state, column_name, query_value, query_type, bit_map)); + RETURN_IF_ERROR(_reader->query(&_io_ctx, _stats, _runtime_state, column_name, query_value, + query_type, bit_map)); return Status::OK(); } @@ -1130,7 +1134,8 @@ Status InvertedIndexIterator::try_read_from_inverted_index(const std::string& co query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY || query_type == InvertedIndexQueryType::LESS_THAN_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY) { - RETURN_IF_ERROR(_reader->try_query(_stats, column_name, query_value, query_type, count)); + RETURN_IF_ERROR( + _reader->try_query(&_io_ctx, _stats, column_name, query_value, query_type, count)); } return Status::OK(); } @@ -1148,4 +1153,5 @@ template class InvertedIndexVisitor; template class InvertedIndexVisitor; template class InvertedIndexVisitor; template class InvertedIndexVisitor; + } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index d3a0ff3cf118ba..a1445603286619 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -182,17 +182,18 @@ class InvertedIndexReader : public std::enable_shared_from_this* iterator) = 0; - virtual Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + virtual Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, 
InvertedIndexQueryType query_type, std::shared_ptr& bit_map) = 0; - virtual Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) = 0; + virtual Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) = 0; - Status read_null_bitmap(OlapReaderStatistics* stats, + Status read_null_bitmap(const io::IOContext* io_ctx, OlapReaderStatistics* stats, InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir = nullptr); @@ -223,15 +224,15 @@ class InvertedIndexReader : public std::enable_shared_from_this& term_match_bitmap); @@ -253,15 +254,16 @@ class FullTextIndexReader : public InvertedIndexReader { : InvertedIndexReader(index_meta, inverted_index_file_reader) {} ~FullTextIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + Status new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) override; - Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; - Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) override { + Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) override { return Status::Error( 
"FullTextIndexReader not support try_query"); } @@ -279,15 +281,16 @@ class StringTypeInvertedIndexReader : public InvertedIndexReader { : InvertedIndexReader(index_meta, inverted_index_file_reader) {} ~StringTypeInvertedIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + Status new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) override; - Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; - Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) override { + Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) override { return Status::Error( "StringTypeInvertedIndexReader not support try_query"); } @@ -338,16 +341,17 @@ class BkdIndexReader : public InvertedIndexReader { : InvertedIndexReader(index_meta, inverted_index_file_reader) {} ~BkdIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + Status new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) override; - Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + 
RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; - Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) override; + Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) override; Status invoke_bkd_try_query(const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr r, uint32_t* count); Status invoke_bkd_query(const void* query_value, InvertedIndexQueryType query_type, @@ -359,7 +363,8 @@ class BkdIndexReader : public InvertedIndexReader { InvertedIndexVisitor* visitor); InvertedIndexReaderType type() override; - Status get_bkd_reader(BKDIndexSearcherPtr& reader, OlapReaderStatistics* stats); + Status get_bkd_reader(BKDIndexSearcherPtr& reader, const io::IOContext* io_ctx, + OlapReaderStatistics* stats); private: const TypeInfo* _type_info {}; @@ -447,9 +452,12 @@ class InvertedIndexIterator { ENABLE_FACTORY_CREATOR(InvertedIndexIterator); public: - InvertedIndexIterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, - std::shared_ptr reader) - : _stats(stats), _runtime_state(runtime_state), _reader(std::move(reader)) {} + InvertedIndexIterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::shared_ptr reader) + : _io_ctx(io_ctx), + _stats(stats), + _runtime_state(runtime_state), + _reader(std::move(reader)) {} Status read_from_inverted_index(const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t segment_num_rows, @@ -460,7 +468,7 @@ class InvertedIndexIterator { Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir = nullptr) { - return 
_reader->read_null_bitmap(_stats, cache_handle, dir); + return _reader->read_null_bitmap(&_io_ctx, _stats, cache_handle, dir); } [[nodiscard]] InvertedIndexReaderType get_inverted_index_reader_type() const; @@ -470,6 +478,7 @@ class InvertedIndexIterator { const InvertedIndexReaderPtr& reader() { return _reader; } private: + io::IOContext _io_ctx; OlapReaderStatistics* _stats = nullptr; RuntimeState* _runtime_state = nullptr; std::shared_ptr _reader; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 29fe4609e59e9c..a4f3ca55dd11c0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -197,7 +197,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { bool create_index = true; bool close_dir_on_shutdown = true; auto index_writer = std::make_unique( - _dir, _analyzer.get(), create_index, close_dir_on_shutdown); + _dir.get(), _analyzer.get(), create_index, close_dir_on_shutdown); DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setRAMBufferSizeMB_error", { index_writer->setRAMBufferSizeMB(-100); }) DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setMaxBufferedDocs_error", @@ -708,7 +708,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { std::unique_ptr _char_string_reader = nullptr; std::shared_ptr _bkd_writer = nullptr; InvertedIndexCtxSPtr _inverted_index_ctx = nullptr; - DorisFSDirectory* _dir = nullptr; + std::shared_ptr _dir = nullptr; const KeyCoder* _value_key_coder; const TabletIndex* _index_meta; InvertedIndexParserType _parser_type; diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 09ff3f6ed3be86..fc22c3570e52a2 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -253,10 +253,10 @@ 
Status SegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& co opts.data_page_size = storage_page_size; } DBUG_EXECUTE_IF("VerticalSegmentWriter._create_column_writer.storage_page_size", { - auto table_id = DebugPoints::instance()->get_debug_param_or_default( + auto table_id = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "table_id", INT_MIN); - auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( + auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "storage_page_size", INT_MIN); if (table_id == INT_MIN || target_data_page_size == INT_MIN) { diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index 12028812f0d92b..ce16e2d502b622 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -248,10 +248,10 @@ Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo opts.data_page_size = storage_page_size; } DBUG_EXECUTE_IF("VerticalSegmentWriter._create_column_writer.storage_page_size", { - auto table_id = DebugPoints::instance()->get_debug_param_or_default( + auto table_id = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "table_id", INT_MIN); - auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( + auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "storage_page_size", INT_MIN); if (table_id == INT_MIN || target_data_page_size == INT_MIN) { diff --git a/be/src/olap/rowset/unique_rowset_id_generator.cpp b/be/src/olap/rowset/unique_rowset_id_generator.cpp index 
0ac7f63837a099..49e07e5835957a 100644 --- a/be/src/olap/rowset/unique_rowset_id_generator.cpp +++ b/be/src/olap/rowset/unique_rowset_id_generator.cpp @@ -17,8 +17,17 @@ #include "olap/rowset/unique_rowset_id_generator.h" +#include + +#include "olap/storage_engine.h" +#include "runtime/exec_env.h" + namespace doris { +RowsetId next_rowset_id() { + return ExecEnv::GetInstance()->storage_engine().next_rowset_id(); +} + UniqueRowsetIdGenerator::UniqueRowsetIdGenerator(const UniqueId& backend_uid) : _backend_uid(backend_uid), _inc_id(1) {} diff --git a/be/src/olap/single_replica_compaction.cpp b/be/src/olap/single_replica_compaction.cpp index 7470afe0ef62c7..458f3949b17017 100644 --- a/be/src/olap/single_replica_compaction.cpp +++ b/be/src/olap/single_replica_compaction.cpp @@ -39,6 +39,7 @@ #include "task/engine_clone_task.h" #include "util/brpc_client_cache.h" #include "util/doris_metrics.h" +#include "util/security.h" #include "util/thrift_rpc_helper.h" #include "util/trace.h" @@ -373,7 +374,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, // then it will try to clone from BE 2, but it will find the file 1 already exist, but file 1 with same // name may have different versions. VLOG_DEBUG << "single replica compaction begin to download files, remote path=" - << _mask_token(remote_url_prefix) << " local_path=" << local_path; + << mask_token(remote_url_prefix) << " local_path=" << local_path; RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_path)); RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_path)); @@ -438,10 +439,10 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, std::string local_file_path = local_path + file_name; LOG(INFO) << "single replica compaction begin to download file from: " - << _mask_token(remote_file_url) << " to: " << local_file_path + << mask_token(remote_file_url) << " to: " << local_file_path << ". 
size(B): " << file_size << ", timeout(s): " << estimate_timeout; - auto download_cb = [this, &remote_file_url, estimate_timeout, &local_file_path, + auto download_cb = [&remote_file_url, estimate_timeout, &local_file_path, file_size](HttpClient* client) { RETURN_IF_ERROR(client->init(remote_file_url)); client->set_timeout_ms(estimate_timeout * 1000); @@ -453,7 +454,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, uint64_t local_file_size = std::filesystem::file_size(local_file_path); if (local_file_size != file_size) { LOG(WARNING) << "download file length error" - << ", remote_path=" << _mask_token(remote_file_url) + << ", remote_path=" << mask_token(remote_file_url) << ", file_size=" << file_size << ", local_file_size=" << local_file_size; return Status::InternalError("downloaded file size is not equal"); @@ -585,9 +586,4 @@ Status SingleReplicaCompaction::_finish_clone(const string& clone_dir, return res; } -std::string SingleReplicaCompaction::_mask_token(const std::string& str) { - std::regex pattern("token=[\\w|-]+"); - return regex_replace(str, pattern, "token=******"); -} - } // namespace doris diff --git a/be/src/olap/single_replica_compaction.h b/be/src/olap/single_replica_compaction.h index 67f5527dd7b336..10ec65ec3f0570 100644 --- a/be/src/olap/single_replica_compaction.h +++ b/be/src/olap/single_replica_compaction.h @@ -62,7 +62,6 @@ class SingleReplicaCompaction final : public CompactionMixin { const std::string& local_path); Status _release_snapshot(const std::string& ip, int port, const std::string& snapshot_path); Status _finish_clone(const std::string& clone_dir, const Version& version); - std::string _mask_token(const std::string& str); CompactionType _compaction_type; std::vector _pending_rs_guards; diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 0a5927dab5aad3..2c5cc2fc980104 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -1212,7 +1212,7 @@ void 
DeleteBitmap::remove_stale_delete_bitmap_from_queue(const std::vector std::vector> to_delete; - auto tablet_id = -1; + int64_t tablet_id = -1; for (auto& version_str : vector) { auto it = _stale_delete_bitmap.find(version_str); if (it != _stale_delete_bitmap.end()) { diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index fc3a69fd5cde52..75cbcf68e956c1 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -64,6 +63,7 @@ #include "util/debug_points.h" #include "util/defer_op.h" #include "util/network_util.h" +#include "util/security.h" #include "util/stopwatch.hpp" #include "util/thrift_rpc_helper.h" #include "util/trace.h" @@ -415,7 +415,7 @@ Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir, status = _download_files(&data_dir, remote_url_prefix, local_data_path); if (!status.ok()) [[unlikely]] { LOG_WARNING("failed to download snapshot from remote BE") - .tag("url", _mask_token(remote_url_prefix)) + .tag("url", mask_token(remote_url_prefix)) .error(status); continue; // Try another BE } @@ -552,11 +552,11 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re std::string local_file_path = local_path + "/" + file_name; - LOG(INFO) << "clone begin to download file from: " << _mask_token(remote_file_url) + LOG(INFO) << "clone begin to download file from: " << mask_token(remote_file_url) << " to: " << local_file_path << ". 
size(B): " << file_size << ", timeout(s): " << estimate_timeout; - auto download_cb = [this, &remote_file_url, estimate_timeout, &local_file_path, + auto download_cb = [&remote_file_url, estimate_timeout, &local_file_path, file_size](HttpClient* client) { RETURN_IF_ERROR(client->init(remote_file_url)); client->set_timeout_ms(estimate_timeout * 1000); @@ -572,7 +572,7 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re } if (local_file_size != file_size) { LOG(WARNING) << "download file length error" - << ", remote_path=" << _mask_token(remote_file_url) + << ", remote_path=" << mask_token(remote_file_url) << ", file_size=" << file_size << ", local_file_size=" << local_file_size; return Status::InternalError("downloaded file size is not equal"); @@ -600,7 +600,7 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re /// This method will only be called if tablet already exist in this BE when doing clone. /// This method will do the following things: -/// 1. Linke all files from CLONE dir to tablet dir if file does not exist in tablet dir +/// 1. Link all files from CLONE dir to tablet dir if file does not exist in tablet dir /// 2. Call _finish_xx_clone() to revise the tablet meta. 
Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_dir, int64_t version, bool is_incremental_clone) { @@ -864,9 +864,4 @@ Status EngineCloneTask::_finish_full_clone(Tablet* tablet, // TODO(plat1ko): write cooldown meta to remote if this replica is cooldown replica } -std::string EngineCloneTask::_mask_token(const std::string& str) { - std::regex pattern("token=[\\w|-]+"); - return regex_replace(str, pattern, "token=******"); -} - } // namespace doris diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h index 9290ed9552ecf9..a11d4c742f4bcc 100644 --- a/be/src/olap/task/engine_clone_task.h +++ b/be/src/olap/task/engine_clone_task.h @@ -86,8 +86,6 @@ class EngineCloneTask final : public EngineTask { Status _release_snapshot(const std::string& ip, int port, const std::string& snapshot_path); - std::string _mask_token(const std::string& str); - private: StorageEngine& _engine; const TCloneReq& _clone_req; diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 37de9ac93d839f..74db3a5c06caf0 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -157,7 +157,7 @@ Status HashJoinBuildSinkLocalState::close(RuntimeState* state, Status exec_statu } } SCOPED_TIMER(_publish_runtime_filter_timer); - RETURN_IF_ERROR(_runtime_filter_slots->publish(!_should_build_hash_table)); + RETURN_IF_ERROR(_runtime_filter_slots->publish(state, !_should_build_hash_table)); return Base::close(state, exec_status); } diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp index 83b378e792c3fa..41cd8068dd771e 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp @@ -43,7 +43,7 @@ struct RuntimeFilterBuild { } { SCOPED_TIMER(_parent->publish_runtime_filter_timer()); - 
RETURN_IF_ERROR(runtime_filter_slots.publish()); + RETURN_IF_ERROR(runtime_filter_slots.publish(state)); } return Status::OK(); diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp index afa1a2e59b798c..f4f4ef21ece746 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp @@ -516,23 +516,20 @@ Status NestedLoopJoinProbeOperatorX::pull(RuntimeState* state, vectorized::Block local_state._matched_rows_done : local_state._matched_rows_done); + size_t join_block_column_size = local_state._join_block.columns(); { - vectorized::Block tmp_block = local_state._join_block; - - // Here make _join_block release the columns' ptr - local_state._join_block.set_columns(local_state._join_block.clone_empty_columns()); - - local_state.add_tuple_is_null_column(&tmp_block); + local_state.add_tuple_is_null_column(&local_state._join_block); { SCOPED_TIMER(local_state._join_filter_timer); RETURN_IF_ERROR(vectorized::VExprContext::filter_block( - local_state._conjuncts, &tmp_block, tmp_block.columns())); + local_state._conjuncts, &local_state._join_block, + local_state._join_block.columns())); } - RETURN_IF_ERROR(local_state._build_output_block(&tmp_block, block, false)); + RETURN_IF_ERROR( + local_state._build_output_block(&local_state._join_block, block, false)); local_state._reset_tuple_is_null_column(); } - local_state._join_block.clear_column_data(); - + local_state._join_block.clear_column_data(join_block_column_size); if (!(*eos) and !local_state._need_more_input_data) { auto func = [&](auto&& join_op_variants, auto set_build_side_flag, auto set_probe_side_flag) { diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index a3f1133f00e78e..f8196910021b2c 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -46,14 +46,25 
@@ Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) _wait_for_dependency_timer = ADD_TIMER_WITH_LEVEL(_profile, timer_name, 1); auto fragment_instance_id = state->fragment_instance_id(); + auto& p = _parent->cast(); if (state->query_options().enable_parallel_result_sink) { _sender = _parent->cast()._sender; } else { - auto& p = _parent->cast(); RETURN_IF_ERROR(state->exec_env()->result_mgr()->create_sender( fragment_instance_id, p._result_sink_buffer_size_rows, &_sender, state)); } _sender->set_dependency(fragment_instance_id, _dependency->shared_from_this()); + + _output_vexpr_ctxs.resize(p._output_vexpr_ctxs.size()); + for (size_t i = 0; i < _output_vexpr_ctxs.size(); i++) { + RETURN_IF_ERROR(p._output_vexpr_ctxs[i]->clone(state, _output_vexpr_ctxs[i])); + } + if (p._sink_type == TResultSinkType::ARROW_FLIGHT_PROTOCAL) { + std::shared_ptr arrow_schema; + RETURN_IF_ERROR(get_arrow_schema_from_expr_ctxs(_output_vexpr_ctxs, &arrow_schema, + state->timezone())); + _sender->register_arrow_schema(arrow_schema); + } return Status::OK(); } @@ -62,10 +73,6 @@ Status ResultSinkLocalState::open(RuntimeState* state) { SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(Base::open(state)); auto& p = _parent->cast(); - _output_vexpr_ctxs.resize(p._output_vexpr_ctxs.size()); - for (size_t i = 0; i < _output_vexpr_ctxs.size(); i++) { - RETURN_IF_ERROR(p._output_vexpr_ctxs[i]->clone(state, _output_vexpr_ctxs[i])); - } // create writer based on sink type switch (p._sink_type) { case TResultSinkType::MYSQL_PROTOCAL: { @@ -79,10 +86,6 @@ Status ResultSinkLocalState::open(RuntimeState* state) { break; } case TResultSinkType::ARROW_FLIGHT_PROTOCAL: { - std::shared_ptr arrow_schema; - RETURN_IF_ERROR(get_arrow_schema_from_expr_ctxs(_output_vexpr_ctxs, &arrow_schema, - state->timezone())); - _sender->register_arrow_schema(arrow_schema); _writer.reset(new (std::nothrow) vectorized::VArrowFlightResultWriter( _sender.get(), _output_vexpr_ctxs, _profile)); break; diff 
--git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 95e5f8e2ce14f3..c4f633d84aa278 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -1017,8 +1017,14 @@ void FragmentMgr::cancel_worker() { } } - for (auto it : brpc_stub_with_queries) { - _check_brpc_available(it.first, it.second); + if (config::enable_brpc_connection_check) { + for (auto it : brpc_stub_with_queries) { + if (!it.first) { + LOG(WARNING) << "brpc stub is nullptr, skip it."; + continue; + } + _check_brpc_available(it.first, it.second); + } } if (!queries_lost_coordinator.empty()) { @@ -1265,7 +1271,7 @@ Status FragmentMgr::send_filter_size(const PSendFilterSizeRequest* request) { std::shared_ptr filter_controller; RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); - auto merge_status = filter_controller->send_filter_size(request); + auto merge_status = filter_controller->send_filter_size(query_ctx, request); return merge_status; } @@ -1307,7 +1313,7 @@ Status FragmentMgr::merge_filter(const PMergeFilterRequest* request, SCOPED_ATTACH_TASK(query_ctx.get()); std::shared_ptr filter_controller; RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); - auto merge_status = filter_controller->merge(request, attach_data); + auto merge_status = filter_controller->merge(query_ctx, request, attach_data); return merge_status; } diff --git a/be/src/runtime/query_context.h b/be/src/runtime/query_context.h index 4746553040521b..d557245bf2339d 100644 --- a/be/src/runtime/query_context.h +++ b/be/src/runtime/query_context.h @@ -165,6 +165,12 @@ class QueryContext { return _query_options.__isset.fe_process_uuid ? _query_options.fe_process_uuid : 0; } + bool ignore_runtime_filter_error() const { + return _query_options.__isset.ignore_runtime_filter_error + ? 
_query_options.ignore_runtime_filter_error + : false; + } + // global runtime filter mgr, the runtime filter have remote target or // need local merge should regist here. before publish() or push_to_remote() // the runtime filter should do the local merge work diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index 1a238787207b17..4b4f48801239e3 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -305,7 +305,8 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId query_id, return Status::OK(); } -Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSizeRequest* request) { +Status RuntimeFilterMergeControllerEntity::send_filter_size(std::weak_ptr query_ctx, + const PSendFilterSizeRequest* request) { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); std::shared_ptr cnt_val; @@ -326,6 +327,8 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz Status st = Status::OK(); if (cnt_val->source_addrs.size() == cnt_val->producer_size) { + auto ctx = query_ctx.lock()->ignore_runtime_filter_error() ? 
std::weak_ptr {} + : query_ctx; for (auto addr : cnt_val->source_addrs) { std::shared_ptr stub( ExecEnv::GetInstance()->brpc_internal_client_cache()->get_client(addr)); @@ -339,7 +342,7 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz auto closure = AutoReleaseClosure>:: create_unique(std::make_shared(), - DummyBrpcCallback::create_shared()); + DummyBrpcCallback::create_shared(), ctx); auto* pquery_id = closure->request_->mutable_query_id(); pquery_id->set_hi(_state->query_id.hi()); @@ -377,7 +380,8 @@ Status RuntimeFilterMgr::sync_filter_size(const PSyncFilterSizeRequest* request) } // merge data -Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* request, +Status RuntimeFilterMergeControllerEntity::merge(std::weak_ptr query_ctx, + const PMergeFilterRequest* request, butil::IOBufAsZeroCopyInputStream* attach_data) { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); std::shared_ptr cnt_val; @@ -444,12 +448,14 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ has_attachment = true; } + auto ctx = query_ctx.lock()->ignore_runtime_filter_error() ? 
std::weak_ptr {} + : query_ctx; std::vector& targets = cnt_val->targetv2_info; for (auto& target : targets) { auto closure = AutoReleaseClosure>:: create_unique(std::make_shared(apply_request), - DummyBrpcCallback::create_shared()); + DummyBrpcCallback::create_shared(), ctx); closure->request_->set_filter_id(request->filter_id()); closure->request_->set_merge_time(merge_time); diff --git a/be/src/runtime/runtime_filter_mgr.h b/be/src/runtime/runtime_filter_mgr.h index b0aea7568cff65..bac61d6248a88f 100644 --- a/be/src/runtime/runtime_filter_mgr.h +++ b/be/src/runtime/runtime_filter_mgr.h @@ -156,10 +156,11 @@ class RuntimeFilterMergeControllerEntity { const TQueryOptions& query_options); // handle merge rpc - Status merge(const PMergeFilterRequest* request, + Status merge(std::weak_ptr query_ctx, const PMergeFilterRequest* request, butil::IOBufAsZeroCopyInputStream* attach_data); - Status send_filter_size(const PSendFilterSizeRequest* request); + Status send_filter_size(std::weak_ptr query_ctx, + const PSendFilterSizeRequest* request); UniqueId query_id() const { return _query_id; } diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index e3f9d075c8ffc2..38522f49dc3e13 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -40,6 +40,7 @@ #include "pipeline/exec/operator.h" #include "pipeline/pipeline_task.h" #include "runtime/exec_env.h" +#include "runtime/fragment_mgr.h" #include "runtime/load_path_mgr.h" #include "runtime/memory/mem_tracker_limiter.h" #include "runtime/memory/thread_mem_tracker_mgr.h" @@ -129,7 +130,6 @@ RuntimeState::RuntimeState(pipeline::PipelineFragmentContext*, const TUniqueId& : _profile("Fragment " + print_id(instance_id)), _load_channel_profile(""), _obj_pool(new ObjectPool()), - _runtime_filter_mgr(nullptr), _unreported_error_idx(0), _query_id(query_id), _fragment_id(fragment_id), @@ -294,6 +294,10 @@ Status RuntimeState::init(const TUniqueId& fragment_instance_id, const 
TQueryOpt return Status::OK(); } +std::weak_ptr RuntimeState::get_query_ctx_weak() { + return _exec_env->fragment_mgr()->get_or_erase_query_ctx_with_lock(_query_ctx->query_id()); +} + void RuntimeState::init_mem_trackers(const std::string& name, const TUniqueId& id) { _query_mem_tracker = MemTrackerLimiter::create_shared( MemTrackerLimiter::Type::OTHER, fmt::format("{}#Id={}", name, print_id(id))); diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 88deee491d19c4..73f854896f48fc 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -449,6 +449,8 @@ class RuntimeState { QueryContext* get_query_ctx() { return _query_ctx; } + std::weak_ptr get_query_ctx_weak(); + void set_query_mem_tracker(const std::shared_ptr& tracker) { _query_mem_tracker = tracker; } diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 29eb01bad2aaa8..be99278ab541a3 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -903,6 +903,7 @@ void PInternalService::fetch_arrow_flight_schema(google::protobuf::RpcController auto st = ExecEnv::GetInstance()->result_mgr()->find_arrow_schema( UniqueId(request->finst_id()).to_thrift(), &schema); if (!st.ok()) { + LOG(WARNING) << "fetch arrow flight schema failed, errmsg=" << st; st.to_protobuf(result->mutable_status()); return; } diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index 74dab466340330..ea991e158a1138 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -396,17 +396,6 @@ Status PointQueryExecutor::_lookup_row_key() { specified_rowsets = _tablet->get_rowset_by_ids(nullptr); } std::vector> segment_caches(specified_rowsets.size()); - // init segment_cache - { - SCOPED_TIMER(&_profile_metrics.load_segment_key_stage_ns); - for (size_t i = 0; i < specified_rowsets.size(); i++) { - auto& rs = specified_rowsets[i]; - 
segment_caches[i] = std::make_unique(); - RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( - std::static_pointer_cast(rs), segment_caches[i].get(), true, true, - &_profile_metrics.read_stats)); - } - } for (size_t i = 0; i < _row_read_ctxs.size(); ++i) { RowLocation location; if (!config::disable_storage_row_cache) { diff --git a/be/src/util/ref_count_closure.h b/be/src/util/ref_count_closure.h index 92772a82373fec..560aebb98ee15e 100644 --- a/be/src/util/ref_count_closure.h +++ b/be/src/util/ref_count_closure.h @@ -20,7 +20,9 @@ #include #include +#include +#include "runtime/query_context.h" #include "runtime/thread_context.h" #include "service/brpc.h" #include "util/ref_count_closure.h" @@ -79,8 +81,9 @@ class AutoReleaseClosure : public google::protobuf::Closure { ENABLE_FACTORY_CREATOR(AutoReleaseClosure); public: - AutoReleaseClosure(std::shared_ptr req, std::shared_ptr callback) - : request_(req), callback_(callback) { + AutoReleaseClosure(std::shared_ptr req, std::shared_ptr callback, + std::weak_ptr context = {}) + : request_(req), callback_(callback), context_(std::move(context)) { this->cntl_ = callback->cntl_; this->response_ = callback->response_; } @@ -113,12 +116,22 @@ class AutoReleaseClosure : public google::protobuf::Closure { protected: virtual void _process_if_rpc_failed() { - LOG(WARNING) << "RPC meet failed: " << cntl_->ErrorText(); + std::string error_msg = "RPC meet failed: " + cntl_->ErrorText(); + if (auto ctx = context_.lock(); ctx) { + ctx->cancel(Status::NetworkError(error_msg)); + } else { + LOG(WARNING) << error_msg; + } } virtual void _process_if_meet_error_status(const Status& status) { - // no need to log END_OF_FILE, reduce the unlessful log - if (!status.is()) { + if (status.is()) { + // no need to log END_OF_FILE, reduce the unlessful log + return; + } + if (auto ctx = context_.lock(); ctx) { + ctx->cancel(status); + } else { LOG(WARNING) << "RPC meet error status: " << status; } } @@ -136,6 +149,7 @@ class 
AutoReleaseClosure : public google::protobuf::Closure { // Use a weak ptr to keep the callback, so that the callback can be deleted if the main // thread is freed. Weak callback_; + std::weak_ptr context_; }; } // namespace doris diff --git a/be/src/util/security.h b/be/src/util/security.h new file mode 100644 index 00000000000000..d2201b1b297b70 --- /dev/null +++ b/be/src/util/security.h @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +namespace doris { + +inline std::string mask_token(const std::string& str) { + std::regex pattern("token=[\\w|-]+"); + return std::regex_replace(str, pattern, "token=******"); +} + +inline std::string mask_token(const char* str) { + std::regex pattern("token=[\\w|-]+"); + return std::regex_replace(str, pattern, "token=******"); +} + +} // namespace doris diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index e9148716f99f35..32fc9d5efce771 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -43,7 +43,7 @@ class IDataType; struct AggregateFunctionAttr { bool enable_decimal256 {false}; - std::vector> column_infos; + std::vector column_names; }; template diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h index 7885321bba3e11..399af84f43cf20 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_top.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h @@ -18,12 +18,92 @@ #pragma once #include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_nullable.h" namespace doris::vectorized { class AggregateFunctionApproxTop { public: + AggregateFunctionApproxTop(const std::vector& column_names) + : _column_names(column_names) {} + + static int32_t is_valid_const_columns(const std::vector& is_const_columns) { + int32_t true_count = 0; + bool found_false_after_true = false; + for (int32_t i = is_const_columns.size() - 1; i >= 0; --i) { + if (is_const_columns[i]) { + true_count++; + if (found_false_after_true) { + return false; + } + } else { + if (true_count > 2) { + return false; + } + found_false_after_true = true; + } + } + if (true_count > 2) { + throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid is_const_columns 
configuration"); + } + return true_count; + } + +protected: + void lazy_init(const IColumn** columns, ssize_t row_num, + const DataTypes& argument_types) const { + auto get_param = [](size_t idx, const DataTypes& data_types, + const IColumn** columns) -> uint64_t { + const auto& data_type = data_types.at(idx); + const IColumn* column = columns[idx]; + + const auto* type = data_type.get(); + if (type->is_nullable()) { + type = assert_cast(type) + ->get_nested_type() + .get(); + } + int64_t value = 0; + WhichDataType which(type); + if (which.idx == TypeIndex::Int8) { + value = assert_cast(column) + ->get_element(0); + } else if (which.idx == TypeIndex::Int16) { + value = assert_cast(column) + ->get_element(0); + } else if (which.idx == TypeIndex::Int32) { + value = assert_cast(column) + ->get_element(0); + } + if (value <= 0) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "The parameter cannot be less than or equal to 0."); + } + return value; + }; + + _threshold = + std::min(get_param(_column_names.size(), argument_types, columns), (uint64_t)4096); + _reserved = std::min( + std::max(get_param(_column_names.size() + 1, argument_types, columns), _threshold), + (uint64_t)4096); + + if (_threshold == 0 || _reserved == 0 || _threshold > 4096 || _reserved > 4096) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "approx_top_sum param error, _threshold: {}, _reserved: {}", _threshold, + _reserved); + } + + _init_flag = true; + } + static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; + + mutable std::vector _column_names; + mutable bool _init_flag = false; + mutable uint64_t _threshold = 10; + mutable uint64_t _reserved = 30; }; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp index d6298881a90630..0aa7adc253da0f 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp +++ 
b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp @@ -24,58 +24,16 @@ namespace doris::vectorized { -int32_t is_valid_const_columns(const std::vector& is_const_columns) { - int32_t true_count = 0; - bool found_false_after_true = false; - for (int32_t i = is_const_columns.size() - 1; i >= 0; --i) { - if (is_const_columns[i]) { - true_count++; - if (found_false_after_true) { - return false; - } - } else { - if (true_count > 2) { - return false; - } - found_false_after_true = true; - } - } - if (true_count > 2) { - throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid is_const_columns configuration"); - } - return true_count; -} - AggregateFunctionPtr create_aggregate_function_approx_top_k(const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, const AggregateFunctionAttr& attr) { - if (argument_types.empty()) { + if (argument_types.size() < 3) { return nullptr; } - std::vector is_const_columns; - std::vector column_names; - for (const auto& [name, is_const] : attr.column_infos) { - is_const_columns.push_back(is_const); - if (!is_const) { - column_names.push_back(name); - } - } - - int32_t true_count = is_valid_const_columns(is_const_columns); - if (true_count == 0) { - return creator_without_type::create>( - argument_types, result_is_nullable, column_names); - } else if (true_count == 1) { - return creator_without_type::create>( - argument_types, result_is_nullable, column_names); - } else if (true_count == 2) { - return creator_without_type::create>( - argument_types, result_is_nullable, column_names); - } else { - return nullptr; - } + return creator_without_type::create( + argument_types, result_is_nullable, attr.column_names); } void register_aggregate_function_approx_top_k(AggregateFunctionSimpleFactory& factory) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h index 7253ae8a96e200..93ea3232c311a1 100644 --- 
a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h @@ -45,28 +45,25 @@ namespace doris::vectorized { -inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; - struct AggregateFunctionTopKGenericData { using Set = SpaceSaving; Set value; }; -template class AggregateFunctionApproxTopK final : public IAggregateFunctionDataHelper>, + AggregateFunctionApproxTopK>, AggregateFunctionApproxTop { private: using State = AggregateFunctionTopKGenericData; public: - AggregateFunctionApproxTopK(std::vector column_names, + AggregateFunctionApproxTopK(const std::vector& column_names, const DataTypes& argument_types_) : IAggregateFunctionDataHelper>(argument_types_), - _column_names(std::move(column_names)) {} + AggregateFunctionApproxTopK>(argument_types_), + AggregateFunctionApproxTop(column_names) {} String get_name() const override { return "approx_top_k"; } @@ -88,7 +85,7 @@ class AggregateFunctionApproxTopK final void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, Arena* arena) const override { auto readStringBinaryInto = [](Arena& arena, BufferReadable& buf) { - size_t size = 0; + uint64_t size = 0; read_var_uint(size, buf); if (UNLIKELY(size > DEFAULT_MAX_STRING_SIZE)) { @@ -104,7 +101,7 @@ class AggregateFunctionApproxTopK final auto& set = this->data(place).value; set.clear(); - size_t size = 0; + uint64_t size = 0; read_var_uint(size, buf); if (UNLIKELY(size > TOP_K_MAX_SIZE)) { throw Exception(ErrorCode::INTERNAL_ERROR, @@ -141,7 +138,7 @@ class AggregateFunctionApproxTopK final void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, Arena* arena) const override { if (!_init_flag) { - lazy_init(columns, row_num); + lazy_init(columns, row_num, this->get_argument_types()); } auto& set = this->data(place).value; @@ -227,64 +224,6 @@ class AggregateFunctionApproxTopK final std::string res = buffer.GetString(); data_to.insert_data(res.data(), 
res.size()); } - -private: - void lazy_init(const IColumn** columns, ssize_t row_num) const { - auto get_param = [](size_t idx, const DataTypes& data_types, - const IColumn** columns) -> uint64_t { - const auto& data_type = data_types.at(idx); - const IColumn* column = columns[idx]; - - const auto* type = data_type.get(); - if (type->is_nullable()) { - type = assert_cast(type) - ->get_nested_type() - .get(); - } - int64_t value = 0; - WhichDataType which(type); - if (which.idx == TypeIndex::Int8) { - value = assert_cast(column) - ->get_element(0); - } else if (which.idx == TypeIndex::Int16) { - value = assert_cast(column) - ->get_element(0); - } else if (which.idx == TypeIndex::Int32) { - value = assert_cast(column) - ->get_element(0); - } - if (value <= 0) { - throw Exception(ErrorCode::INVALID_ARGUMENT, - "The parameter cannot be less than or equal to 0."); - } - return value; - }; - - const auto& data_types = this->get_argument_types(); - if (ArgsSize == 1) { - _threshold = - std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000); - } else if (ArgsSize == 2) { - _threshold = - std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000); - _reserved = std::min( - std::max(get_param(_column_names.size() + 1, data_types, columns), _threshold), - (uint64_t)1000); - } - - if (_threshold == 0 || _reserved == 0 || _threshold > 1000 || _reserved > 1000) { - throw Exception(ErrorCode::INTERNAL_ERROR, - "approx_top_k param error, _threshold: {}, _reserved: {}", _threshold, - _reserved); - } - - _init_flag = true; - } - - mutable std::vector _column_names; - mutable bool _init_flag = false; - mutable uint64_t _threshold = 10; - mutable uint64_t _reserved = 300; }; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp new file mode 100644 index 00000000000000..7325651d141c13 
--- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/aggregate_functions/aggregate_function_approx_top_sum.h" + +#include "common/exception.h" +#include "vec/aggregate_functions/aggregate_function_simple_factory.h" +#include "vec/aggregate_functions/helpers.h" +#include "vec/data_types/data_type.h" + +namespace doris::vectorized { + +template +AggregateFunctionPtr create_aggregate_function_multi_top_sum_impl( + const DataTypes& argument_types, const bool result_is_nullable, + const std::vector& column_names) { + if (N == argument_types.size() - 3) { + return creator_with_type_base::template create< + AggregateFunctionApproxTopSumSimple>(argument_types, result_is_nullable, + column_names); + } else { + return create_aggregate_function_multi_top_sum_impl( + argument_types, result_is_nullable, column_names); + } +} + +template <> +AggregateFunctionPtr create_aggregate_function_multi_top_sum_impl<0>( + const DataTypes& argument_types, const bool result_is_nullable, + const std::vector& column_names) { + return creator_with_type_base::template create< + AggregateFunctionApproxTopSumSimple>(argument_types, 
result_is_nullable, column_names); +} + +AggregateFunctionPtr create_aggregate_function_approx_top_sum(const std::string& name, + const DataTypes& argument_types, + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { + if (argument_types.size() < 3) { + return nullptr; + } + + constexpr size_t max_param_value = 10; + if (argument_types.size() > max_param_value) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Argument types size exceeds the supported limit."); + } + + return create_aggregate_function_multi_top_sum_impl( + argument_types, result_is_nullable, attr.column_names); +} + +void register_aggregate_function_approx_top_sum(AggregateFunctionSimpleFactory& factory) { + factory.register_function_both("approx_top_sum", create_aggregate_function_approx_top_sum); +} + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h new file mode 100644 index 00000000000000..9b3ba6a965091a --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/aggregate_functions/aggregate_function_approx_top.h" +#include "vec/columns/column.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_struct.h" +#include "vec/columns/column_vector.h" +#include "vec/columns/columns_number.h" +#include "vec/common/assert_cast.h" +#include "vec/common/space_saving.h" +#include "vec/common/string_ref.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_ipv4.h" +#include "vec/data_types/data_type_struct.h" +#include "vec/io/io_helper.h" + +namespace doris::vectorized { + +struct AggregateFunctionTopKGenericData { + using Set = SpaceSaving; + + Set value; +}; + +template +class AggregateFunctionApproxTopSum final + : public IAggregateFunctionDataHelper>, + AggregateFunctionApproxTop { +private: + using State = AggregateFunctionTopKGenericData; + + using ResultDataType = DataTypeNumber; + using ColVecType = ColumnVector; + using ColVecResult = ColumnVector; + +public: + AggregateFunctionApproxTopSum(const std::vector& column_names, + const DataTypes& argument_types_) + : IAggregateFunctionDataHelper>( + argument_types_), + AggregateFunctionApproxTop(column_names) {} + + String get_name() const override { return "approx_top_sum"; } + + DataTypePtr get_return_type() const override { return std::make_shared(); } + + // Serializes the aggregate function's state (including the SpaceSaving structure and threshold) into a buffer. 
+ void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override { + this->data(place).value.write(buf); + + write_var_uint(_column_names.size(), buf); + for (const auto& column_name : _column_names) { + write_string_binary(column_name, buf); + } + write_var_uint(_threshold, buf); + write_var_uint(_reserved, buf); + } + + // Deserializes the aggregate function's state from a buffer (including the SpaceSaving structure and threshold). + void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, + Arena* arena) const override { + auto readStringBinaryInto = [](Arena& arena, BufferReadable& buf) { + size_t size = 0; + read_var_uint(size, buf); + + if (UNLIKELY(size > DEFAULT_MAX_STRING_SIZE)) { + throw Exception(ErrorCode::INTERNAL_ERROR, "Too large string size."); + } + + char* data = arena.alloc(size); + buf.read(data, size); + + return StringRef(data, size); + }; + + auto& set = this->data(place).value; + set.clear(); + + size_t size = 0; + read_var_uint(size, buf); + if (UNLIKELY(size > TOP_K_MAX_SIZE)) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Too large size ({}) for aggregate function '{}' state (maximum is {})", + size, get_name(), TOP_K_MAX_SIZE); + } + + set.resize(size); + for (size_t i = 0; i < size; ++i) { + auto ref = readStringBinaryInto(*arena, buf); + uint64_t count = 0; + uint64_t error = 0; + read_var_uint(count, buf); + read_var_uint(error, buf); + set.insert(ref, count, error); + arena->rollback(ref.size); + } + + set.read_alpha_map(buf); + + uint64_t column_size = 0; + read_var_uint(column_size, buf); + _column_names.clear(); + for (uint64_t i = 0; i < column_size; i++) { + std::string column_name; + read_string_binary(column_name, buf); + _column_names.emplace_back(std::move(column_name)); + } + read_var_uint(_threshold, buf); + read_var_uint(_reserved, buf); + } + + // Adds a new row of data to the aggregate function (inserts a new value into the SpaceSaving structure). 
+ void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, + Arena* arena) const override { + if (!_init_flag) { + lazy_init(columns, row_num, this->get_argument_types()); + } + + auto& set = this->data(place).value; + if (set.capacity() != _reserved) { + set.resize(_reserved); + } + + auto all_serialize_value_into_arena = + [](size_t i, size_t keys_size, const IColumn** columns, Arena* arena) -> StringRef { + const char* begin = nullptr; + + size_t sum_size = 0; + for (size_t j = 0; j < keys_size; ++j) { + sum_size += columns[j]->serialize_value_into_arena(i, *arena, begin).size; + } + + return {begin, sum_size}; + }; + + StringRef str_serialized = + all_serialize_value_into_arena(row_num, _column_names.size(), columns, arena); + const auto& column = assert_cast( + *columns[_column_names.size() - 1]); + set.insert(str_serialized, TResult(column.get_data()[row_num])); + arena->rollback(str_serialized.size); + } + + void add_many(AggregateDataPtr __restrict place, const IColumn** columns, + std::vector& rows, Arena* arena) const override { + for (auto row : rows) { + add(place, columns, row, arena); + } + } + + void reset(AggregateDataPtr __restrict place) const override { + this->data(place).value.clear(); + } + + // Merges the state of another aggregate function into the current one (merges two SpaceSaving sets). 
+ void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, + Arena*) const override { + auto& rhs_set = this->data(rhs).value; + if (!rhs_set.size()) { + return; + } + + auto& set = this->data(place).value; + if (set.capacity() != _reserved) { + set.resize(_reserved); + } + set.merge(rhs_set); + } + + void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { + auto& data_to = assert_cast(to); + + const typename State::Set& set = this->data(place).value; + auto result_vec = set.top_k(_threshold); + + rapidjson::StringBuffer buffer; + rapidjson::PrettyWriter writer(buffer); + writer.StartArray(); + for (auto& result : result_vec) { + auto argument_types = this->get_argument_types(); + MutableColumns argument_columns(_column_names.size()); + for (size_t i = 0; i < _column_names.size(); ++i) { + argument_columns[i] = argument_types[i]->create_column(); + } + rapidjson::StringBuffer sub_buffer; + rapidjson::Writer sub_writer(sub_buffer); + sub_writer.StartObject(); + const char* begin = result.key.data; + for (size_t i = 0; i < _column_names.size(); i++) { + begin = argument_columns[i]->deserialize_and_insert_from_arena(begin); + std::string row_str = argument_types[i]->to_string(*argument_columns[i], 0); + sub_writer.Key(_column_names[i].data(), _column_names[i].size()); + sub_writer.String(row_str.data(), row_str.size()); + } + sub_writer.Key("sum"); + sub_writer.String(std::to_string(result.count).c_str()); + sub_writer.EndObject(); + writer.RawValue(sub_buffer.GetString(), sub_buffer.GetSize(), rapidjson::kObjectType); + } + writer.EndArray(); + std::string res = buffer.GetString(); + data_to.insert_data(res.data(), res.size()); + } +}; + +template +struct TopSumSimple { + using ResultType = T; + using AggregateDataType = AggregateFunctionTopKGenericData; + using Function = AggregateFunctionApproxTopSum; +}; + +template +using AggregateFunctionApproxTopSumSimple = typename TopSumSimple::Function; + +} // namespace 
doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_collect.h b/be/src/vec/aggregate_functions/aggregate_function_collect.h index 02490be56a0bf1..da310c6e0cc4c2 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_collect.h +++ b/be/src/vec/aggregate_functions/aggregate_function_collect.h @@ -98,7 +98,7 @@ struct AggregateFunctionCollectSetData { } void read(BufferReadable& buf) { - size_t new_size = 0; + uint64_t new_size = 0; read_var_uint(new_size, buf); ElementNativeType x; for (size_t i = 0; i < new_size; ++i) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp index 71d09f61de4302..4c5fe1321952d6 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp @@ -29,20 +29,15 @@ namespace doris::vectorized { -template