diff --git a/.github/workflows/auto-cherry-pick.yml b/.github/workflows/auto-cherry-pick.yml index 2581de3f31cc4e1..df1a44153ac9dd7 100644 --- a/.github/workflows/auto-cherry-pick.yml +++ b/.github/workflows/auto-cherry-pick.yml @@ -21,6 +21,7 @@ on: pull_request_target: types: - closed + - labeled branches: - master permissions: @@ -30,7 +31,7 @@ permissions: jobs: auto_cherry_pick: runs-on: ubuntu-latest - if: ${{ (contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') || contains(github.event.pull_request.labels.*.name, 'dev/2.1.x')) && github.event.pull_request.merged == true }} + if: ${{ (contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') || contains(github.event.pull_request.labels.*.name, 'dev/2.1.x') || github.event.label.name == 'dev/3.0.x' || github.event.label.name == 'dev/2.1.x') && github.event.pull_request.merged == true }} steps: - name: Checkout repository uses: actions/checkout@v3 @@ -54,18 +55,18 @@ jobs: echo "SHA matches: $calculated_sha" fi - name: Auto cherry-pick to branch-3.0 - if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') }} + if: ${{ (github.event.action == 'labeled' && github.event.label.name == 'dev/3.0.x') || (github.event_name == 'pull_request_target' && github.event.action == 'closed' && contains(github.event.pull_request.labels.*.name, 'dev/3.0.x')) }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_NAME: ${{ github.repository }} - CONFLICT_LABEL: cherry-pick-conflict-in-3.0 + CONFLICT_LABEL: dev/3.0.x-conflict run: | python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-3.0 - name: Auto cherry-pick to branch-2.1 - if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/2.1.x') }} + if: ${{ (github.event.action == 'labeled' && github.event.label.name == 'dev/2.1.x') || (github.event_name == 'pull_request_target' && github.event.action == 'closed' && contains(github.event.pull_request.labels.*.name, 'dev/2.1.x')) }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_NAME: ${{ github.repository }} - CONFLICT_LABEL: cherry-pick-conflict-in-2.1.x + CONFLICT_LABEL: dev/2.1.x-conflict run: | python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-2.1 diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 1d79048f96511c5..d476af8e2110df2 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -130,6 +130,8 @@ message(STATUS "THIRDPARTY_DIR is ${THIRDPARTY_DIR}") option(MAKE_TEST "ON for make unit test or OFF for not" OFF) message(STATUS "make test: ${MAKE_TEST}") +option(BUILD_BENCHMARK "ON for make Google Benchmark or OFF for not" OFF) +message(STATUS "make benchmark: ${BUILD_BENCHMARK}") option(WITH_MYSQL "Support access MySQL" ON) @@ -568,7 +570,7 @@ if (OS_MACOSX) ) endif() -if (MAKE_TEST) +if (BUILD_BENCHMARK) set(COMMON_THIRDPARTY ${COMMON_THIRDPARTY} benchmark @@ -708,6 +710,11 @@ if (MAKE_TEST) endif() endif () +# Reuse the BE_TEST symbol to bypass some runtime trackers; no separate define is needed.
+if (BUILD_BENCHMARK) + add_definitions(-DBE_TEST) +endif() + get_directory_property(COMPILER_FLAGS COMPILE_OPTIONS) get_directory_property(COMPILER_DEFINES COMPILE_DEFINITIONS) message(STATUS "Compiler: ${CMAKE_CXX_COMPILER_ID}-${CMAKE_CXX_COMPILER_VERSION}") @@ -754,7 +761,7 @@ add_subdirectory(${SRC_DIR}/http) add_subdirectory(${SRC_DIR}/io) add_subdirectory(${SRC_DIR}/olap) add_subdirectory(${SRC_DIR}/runtime) -add_subdirectory(${SRC_DIR}/service) +add_subdirectory(${SRC_DIR}/service) # this builds doris_be add_subdirectory(${SRC_DIR}/udf) add_subdirectory(${SRC_DIR}/cloud) @@ -772,36 +779,44 @@ add_subdirectory(${SRC_DIR}/util) add_subdirectory(${SRC_DIR}/vec) add_subdirectory(${SRC_DIR}/pipeline) +# this builds doris_be_test if (MAKE_TEST) add_subdirectory(${TEST_DIR}) endif () add_subdirectory(${COMMON_SRC_DIR}/cpp ${BUILD_DIR}/src/common_cpp) -# Install be -install(DIRECTORY DESTINATION ${OUTPUT_DIR}) -install(DIRECTORY DESTINATION ${OUTPUT_DIR}/bin) -install(DIRECTORY DESTINATION ${OUTPUT_DIR}/conf) - -install(FILES - ${BASE_DIR}/../bin/start_be.sh - ${BASE_DIR}/../bin/stop_be.sh - ${BASE_DIR}/../tools/jeprof - PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE - GROUP_READ GROUP_WRITE GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE - DESTINATION ${OUTPUT_DIR}/bin) - -install(FILES - ${BASE_DIR}/../conf/be.conf - ${BASE_DIR}/../conf/odbcinst.ini - ${BASE_DIR}/../conf/asan_suppr.conf - ${BASE_DIR}/../conf/lsan_suppr.conf - DESTINATION ${OUTPUT_DIR}/conf) +if(NOT BUILD_BENCHMARK) + # Install be + install(DIRECTORY DESTINATION ${OUTPUT_DIR}) + install(DIRECTORY DESTINATION ${OUTPUT_DIR}/bin) + install(DIRECTORY DESTINATION ${OUTPUT_DIR}/conf) + + install(FILES + ${BASE_DIR}/../bin/start_be.sh + ${BASE_DIR}/../bin/stop_be.sh + ${BASE_DIR}/../tools/jeprof + PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_WRITE GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE + DESTINATION ${OUTPUT_DIR}/bin) + + install(FILES + ${BASE_DIR}/../conf/be.conf + ${BASE_DIR}/../conf/odbcinst.ini + ${BASE_DIR}/../conf/asan_suppr.conf + ${BASE_DIR}/../conf/lsan_suppr.conf + DESTINATION ${OUTPUT_DIR}/conf) +endif() get_property(dirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) foreach(dir ${dirs}) message(STATUS "dir='${dir}'") endforeach() - +if (BUILD_BENCHMARK) + add_executable(benchmark_test ${BASE_DIR}/benchmark/benchmark_main.cpp) + target_link_libraries(benchmark_test ${DORIS_LINK_LIBS}) + message(STATUS "Add benchmark to build") + install(TARGETS benchmark_test DESTINATION ${OUTPUT_DIR}/lib) +endif() \ No newline at end of file diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp new file mode 100644 index 000000000000000..cad6463e9818521 --- /dev/null +++ b/be/benchmark/benchmark_main.cpp @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <benchmark/benchmark.h> + +#include <memory> + +#include "vec/columns/column_string.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { // adjust the namespace as needed + +static void Example1(benchmark::State& state) { + // setup; code outside the benchmark loop below is not timed + Block block; + DataTypePtr str_type = std::make_shared<DataTypeString>(); + std::vector<std::string> vals {100, "content"}; + + // the measured loop + for (auto _ : state) { + auto str_col = ColumnString::create(); + for (auto& v : vals) { + str_col->insert_data(v.data(), v.size()); + } + block.insert({std::move(str_col), str_type, "col"}); + benchmark::DoNotOptimize(block); // keep the result alive so the compiler cannot optimize it away + } +} +// BENCHMARK can register many functions so their results can be compared in one run. +BENCHMARK(Example1); + +} // namespace doris::vectorized + +BENCHMARK_MAIN(); diff --git a/be/src/agent/cgroup_cpu_ctl.cpp b/be/src/agent/cgroup_cpu_ctl.cpp index e68535a708c49b1..76b72f2c9d00ae7 100644 --- a/be/src/agent/cgroup_cpu_ctl.cpp +++ b/be/src/agent/cgroup_cpu_ctl.cpp @@ -158,11 +158,11 @@ uint64_t CgroupCpuCtl::cpu_soft_limit_default_value() { return _is_enable_cgroup_v2_in_env ? 100 : 1024; } -std::unique_ptr<CgroupCpuCtl> CgroupCpuCtl::create_cgroup_cpu_ctl(uint64_t wg_id) { +std::shared_ptr<CgroupCpuCtl> CgroupCpuCtl::create_cgroup_cpu_ctl(uint64_t wg_id) { if (_is_enable_cgroup_v2_in_env) { - return std::make_unique<CgroupV2CpuCtl>(wg_id); + return std::make_shared<CgroupV2CpuCtl>(wg_id); } else if (_is_enable_cgroup_v1_in_env) { - return std::make_unique<CgroupV1CpuCtl>(wg_id); + return std::make_shared<CgroupV1CpuCtl>(wg_id); } return nullptr; } diff --git a/be/src/agent/cgroup_cpu_ctl.h b/be/src/agent/cgroup_cpu_ctl.h index 84e191159f15f1c..b23f1f4dd9cadba 100644 --- a/be/src/agent/cgroup_cpu_ctl.h +++ b/be/src/agent/cgroup_cpu_ctl.h @@ -52,7 +52,7 @@ class CgroupCpuCtl { static Status delete_unused_cgroup_path(std::set<uint64_t>& used_wg_ids); - static std::unique_ptr<CgroupCpuCtl> create_cgroup_cpu_ctl(uint64_t wg_id); + static std::shared_ptr<CgroupCpuCtl> create_cgroup_cpu_ctl(uint64_t wg_id); static bool is_a_valid_cgroup_path(std::string cg_path); diff --git a/be/src/agent/topic_subscriber.cpp b/be/src/agent/topic_subscriber.cpp index f62bdaef0991c94..b470e1534e1c6ff 100644 --- a/be/src/agent/topic_subscriber.cpp +++ b/be/src/agent/topic_subscriber.cpp @@ -40,14 +40,12 @@ void TopicSubscriber::handle_topic_info(const TPublishTopicRequest& topic_reques // eg, update workload info may delay other listener, then we need add a thread here // to handle_topic_info asynchronous std::shared_lock lock(_listener_mtx); - LOG(INFO) << "[topic_publish]begin handle topic info"; for (auto& listener_pair : _registered_listeners) { if (topic_request.topic_map.find(listener_pair.first) != topic_request.topic_map.end()) { - LOG(INFO) << "[topic_publish]begin handle topic " << listener_pair.first - << ", size=" << topic_request.topic_map.at(listener_pair.first).size(); listener_pair.second->handle_topic_info( topic_request.topic_map.at(listener_pair.first)); - LOG(INFO) << "[topic_publish]finish handle topic " << listener_pair.first; + LOG(INFO) << "[topic_publish]finish handle topic " << listener_pair.first + << ", size=" << topic_request.topic_map.at(listener_pair.first).size(); } } } diff --git a/be/src/agent/workload_group_listener.cpp b/be/src/agent/workload_group_listener.cpp index f0f57869f2545ad..7b688b7dcdf6efc 100644 --- a/be/src/agent/workload_group_listener.cpp +++ b/be/src/agent/workload_group_listener.cpp @@ -59,7
+59,7 @@ void WorkloadGroupListener::handle_topic_info(const std::vector<TopicInfo>& topi workload_group_info.enable_cpu_hard_limit); // 4 create and update task scheduler - wg->upsert_task_scheduler(&workload_group_info, _exec_env); + wg->upsert_task_scheduler(&workload_group_info); // 5 upsert io throttle wg->upsert_scan_io_throttle(&workload_group_info); diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index 896804578d7db9c..0bab742c3ad6e5a 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -363,7 +363,8 @@ Status CloudSchemaChangeJob::_convert_historical_rowsets(const SchemaChangeParam // If there are historical versions of rowsets, we need to recalculate their delete // bitmaps, otherwise we will miss the delete bitmaps of incremental rowsets int64_t start_calc_delete_bitmap_version = - already_exist_any_version ? 0 : sc_job->alter_version() + 1; + // [0-1] is a placeholder rowset, start from 2. + already_exist_any_version ? 2 : sc_job->alter_version() + 1; RETURN_IF_ERROR(_process_delete_bitmap(sc_job->alter_version(), start_calc_delete_bitmap_version, initiator)); sc_job->set_delete_bitmap_lock_initiator(initiator); diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index 5d7b445917aa206..dc6abbac31ba1bf 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -231,7 +231,7 @@ Result<BaseTabletSPtr> CloudStorageEngine::get_tablet(int64_t tablet_id) { }); } -Status CloudStorageEngine::start_bg_threads() { +Status CloudStorageEngine::start_bg_threads(std::shared_ptr<WorkloadGroup> wg_sptr) { RETURN_IF_ERROR(Thread::create( "CloudStorageEngine", "refresh_s3_info_thread", [this]() { this->_refresh_storage_vault_info_thread_callback(); }, @@ -266,14 +266,27 @@ // compaction tasks producer thread int base_thread_num = get_base_thread_num(); int cumu_thread_num = get_cumu_thread_num(); - RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") - .set_min_threads(base_thread_num) - .set_max_threads(base_thread_num) - .build(&_base_compaction_thread_pool)); - RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") - .set_min_threads(cumu_thread_num) - .set_max_threads(cumu_thread_num) - .build(&_cumu_compaction_thread_pool)); + if (wg_sptr && wg_sptr->get_cgroup_cpu_ctl_wptr().lock()) { + RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") + .set_min_threads(base_thread_num) + .set_max_threads(base_thread_num) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_base_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") + .set_min_threads(cumu_thread_num) + .set_max_threads(cumu_thread_num) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_cumu_compaction_thread_pool)); + } else { + RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") + .set_min_threads(base_thread_num) + .set_max_threads(base_thread_num) + .build(&_base_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") + .set_min_threads(cumu_thread_num) + .set_max_threads(cumu_thread_num) + .build(&_cumu_compaction_thread_pool)); + } RETURN_IF_ERROR(Thread::create( "StorageEngine", "compaction_tasks_producer_thread", [this]() { this->_compaction_tasks_producer_callback(); }, diff --git a/be/src/cloud/cloud_storage_engine.h b/be/src/cloud/cloud_storage_engine.h index 92d2917a916f6ac..072b8366542253a
100644 --- a/be/src/cloud/cloud_storage_engine.h +++ b/be/src/cloud/cloud_storage_engine.h @@ -57,7 +57,7 @@ class CloudStorageEngine final : public BaseStorageEngine { Result<BaseTabletSPtr> get_tablet(int64_t tablet_id) override; - Status start_bg_threads() override; + Status start_bg_threads(std::shared_ptr<WorkloadGroup> wg_sptr = nullptr) override; Status set_cluster_id(int32_t cluster_id) override { _effective_cluster_id = cluster_id; diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index ebd1fea3dd9facd..c88b073e96494a8 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -449,6 +449,12 @@ void CloudTablet::recycle_cached_data(const std::vector<RowsetSharedPtr>& rowset if (config::enable_file_cache) { for (const auto& rs : rowsets) { + if (rs.use_count() > 1) { + LOG(WARNING) << "Rowset " << rs->rowset_id().to_string() << " has " + << rs.use_count() + << " references. File cache won't be recycled while a query is still using it."; + continue; + } for (int seg_id = 0; seg_id < rs->num_segments(); ++seg_id) { // TODO: Segment::file_cache_key auto file_key = Segment::file_cache_key(rs->rowset_id().to_string(), seg_id); diff --git a/be/src/clucene b/be/src/clucene index 7cf6cf410d41d95..48fa9cc4ec32b40 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 7cf6cf410d41d95456edba263cc55b7b6f5ab027 +Subproject commit 48fa9cc4ec32b40bf3b02338d0a1b2cdbc6408cf diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index a37a006acf0b6f3..d9b9a02260a4e6b 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -63,8 +63,29 @@ DEFINE_Int32(brpc_port, "8060"); DEFINE_Int32(arrow_flight_sql_port, "-1"); -DEFINE_mString(public_access_ip, ""); -DEFINE_Int32(public_access_port, "-1"); +// If the external client cannot directly access priority_networks, set public_host to be accessible +// to external clients. +// There are usually two usage scenarios: +// 1. in a production environment, it is often inconvenient to expose Doris BE nodes to the external network. +// However, a reverse proxy (such as Nginx) can be added to all Doris BE nodes, and the external client will be +// randomly routed to a Doris BE node when connecting to Nginx. Set public_host to the host of Nginx. +// 2. if priority_networks is an internal network IP and the BE node has its own independent external IP, +// but Doris currently does not support modifying priority_networks, set public_host to the real external IP. +DEFINE_mString(public_host, ""); + +// If the BE node is connected to the external network through a reverse proxy like Nginx +// and needs to use Arrow Flight SQL, a server should be added in Nginx to reverse proxy +// `Nginx:arrow_flight_sql_proxy_port` to `BE_priority_networks:arrow_flight_sql_port`.
For example: +// upstream arrowflight { +// server 10.16.10.8:8069; +// server 10.16.10.8:8068; +// } +// server { +// listen 8167 http2; +// listen [::]:8167 http2; +// server_name doris.arrowflight.com; +// } +DEFINE_Int32(arrow_flight_sql_proxy_port, "-1"); // the number of bthreads for brpc, the default value is set to -1, // which means the number of bthreads is #cpu-cores @@ -1004,7 +1025,7 @@ DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false"); DEFINE_Bool(enable_file_cache, "false"); // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240}] // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240},{"path":"/path/to/file_cache2","total_size":21474836480,"query_limit":10737418240}] -// format: {"path": "/path/to/file_cache", "total_size":53687091200, "normal_percent":85, "disposable_percent":10, "index_percent":5} +// format: {"path": "/path/to/file_cache", "total_size":53687091200, "ttl_percent":50, "normal_percent":40, "disposable_percent":5, "index_percent":5} // format: [{"path": "xxx", "total_size":53687091200, "storage": "memory"}] // Note1: storage is "disk" by default // Note2: when the storage is "memory", the path is ignored. So you can set xxx to anything you like // and doris will just reset the path to "memory" internally. @@ -1020,7 +1041,7 @@ DEFINE_Int64(file_cache_each_block_size, "1048576"); // 1MB DEFINE_Bool(clear_file_cache, "false"); DEFINE_Bool(enable_file_cache_query_limit, "false"); -DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "90"); +DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "88"); DEFINE_mInt32(file_cache_exit_disk_resource_limit_mode_percent, "80"); DEFINE_mBool(enable_read_cache_file_directly, "false"); DEFINE_mBool(file_cache_enable_evict_from_other_queue_by_size, "true"); @@ -1301,8 +1322,6 @@ DEFINE_Int64(num_buffered_reader_prefetch_thread_pool_max_thread, "64"); DEFINE_Int64(num_s3_file_upload_thread_pool_min_thread, "16"); // The max thread num for S3FileUploadThreadPool DEFINE_Int64(num_s3_file_upload_thread_pool_max_thread, "64"); -// The max ratio for ttl cache's size -DEFINE_mInt64(max_ttl_cache_ratio, "50"); // The maximum jvm heap usage ratio for hdfs write workload DEFINE_mDouble(max_hdfs_wirter_jni_heap_usage_ratio, "0.5"); // The sleep milliseconds duration when hdfs write exceeds the maximum usage @@ -1370,6 +1389,7 @@ DEFINE_Int32(query_cache_size, "512"); DEFINE_mBool(enable_delete_bitmap_merge_on_compaction, "false"); // Enable validation to check the correctness of table size. DEFINE_Bool(enable_table_size_correctness_check, "false"); +DEFINE_Bool(force_regenerate_rowsetid_on_start_error, "false"); // clang-format off #ifdef BE_TEST diff --git a/be/src/common/config.h b/be/src/common/config.h index 63d62b219c12f8d..7f18406eeee721b 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -100,11 +100,29 @@ DECLARE_Int32(brpc_port); // Default -1, do not start arrow flight sql server. DECLARE_Int32(arrow_flight_sql_port); -// If priority_networks is incorrect but cannot be modified, set public_access_ip as BE’s real IP. -// For ADBC client fetch result, default is empty, the ADBC client uses the backend ip to fetch the result. -// If ADBC client cannot access the backend ip, can set public_access_ip to modify the fetch result ip. -DECLARE_mString(public_access_ip); -DECLARE_Int32(public_access_port); +// If the external client cannot directly access priority_networks, set public_host to be accessible +// to external clients.
+// There are usually two usage scenarios: +// 1. in a production environment, it is often inconvenient to expose Doris BE nodes to the external network. +// However, a reverse proxy (such as Nginx) can be added to all Doris BE nodes, and the external client will be +// randomly routed to a Doris BE node when connecting to Nginx. Set public_host to the host of Nginx. +// 2. if priority_networks is an internal network IP and the BE node has its own independent external IP, +// but Doris currently does not support modifying priority_networks, set public_host to the real external IP. +DECLARE_mString(public_host); + +// If the BE node is connected to the external network through a reverse proxy like Nginx +// and needs to use Arrow Flight SQL, a server should be added in Nginx to reverse proxy +// `Nginx:arrow_flight_sql_proxy_port` to `BE_priority_networks:arrow_flight_sql_port`. For example: +// upstream arrowflight { +// server 10.16.10.8:8069; +// server 10.16.10.8:8068; +// } +// server { +// listen 8167 http2; +// listen [::]:8167 http2; +// server_name doris.arrowflight.com; +// } +DECLARE_Int32(arrow_flight_sql_proxy_port); // the number of bthreads for brpc, the default value is set to -1, // which means the number of bthreads is #cpu-cores @@ -1050,7 +1068,7 @@ DECLARE_Int32(pipeline_executor_size); DECLARE_Bool(enable_file_cache); // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240}] // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240},{"path":"/path/to/file_cache2","total_size":21474836480,"query_limit":10737418240}] -// format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240,"normal_percent":85, "disposable_percent":10, "index_percent":5}] +// format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240, "ttl_percent":50, "normal_percent":40, "disposable_percent":5, "index_percent":5}] // format: [{"path": "xxx", "total_size":53687091200, "storage": "memory"}] // Note1: storage is "disk" by default // Note2: when the storage is "memory", the path is ignored. So you can set xxx to anything you like @@ -1382,8 +1400,6 @@ DECLARE_Int64(num_buffered_reader_prefetch_thread_pool_max_thread); DECLARE_Int64(num_s3_file_upload_thread_pool_min_thread); // The max thread num for S3FileUploadThreadPool DECLARE_Int64(num_s3_file_upload_thread_pool_max_thread); -// The max ratio for ttl cache's size -DECLARE_mInt64(max_ttl_cache_ratio); // The maximum jvm heap usage ratio for hdfs write workload DECLARE_mDouble(max_hdfs_wirter_jni_heap_usage_ratio); // The sleep milliseconds duration when hdfs write exceeds the maximum usage @@ -1450,6 +1466,7 @@ DECLARE_mInt32(check_score_rounds_num); // MB DECLARE_Int32(query_cache_size); +DECLARE_Bool(force_regenerate_rowsetid_on_start_error); DECLARE_mBool(enable_delete_bitmap_merge_on_compaction); // Enable validation to check the correctness of table size.
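[Note: the two comment blocks above only sketch the Nginx side of the Arrow Flight SQL proxy. For illustration, here is a minimal client-side sketch assuming the Arrow C++ Flight SQL API; the host doris.arrowflight.com and port 8167 are the illustrative values from the Nginx example, and the helper name connect_through_proxy is invented for this note, not part of the patch.]

#include <memory>

#include <arrow/flight/client.h>
#include <arrow/flight/sql/client.h>
#include <arrow/result.h>

// Connect to the Nginx listener that reverse-proxies to a BE's arrow_flight_sql_port.
arrow::Result<std::unique_ptr<arrow::flight::sql::FlightSqlClient>> connect_through_proxy() {
    ARROW_ASSIGN_OR_RAISE(auto location,
                          arrow::flight::Location::ForGrpcTcp("doris.arrowflight.com", 8167));
    ARROW_ASSIGN_OR_RAISE(auto flight_client, arrow::flight::FlightClient::Connect(location));
    return std::make_unique<arrow::flight::sql::FlightSqlClient>(std::move(flight_client));
}

[With public_host and arrow_flight_sql_proxy_port set, the intent per the comments above is that the BE advertises the proxy endpoint rather than its priority_networks address, so result-fetch connections from such a client also go through the proxy.]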
diff --git a/be/src/common/status.h b/be/src/common/status.h index fac63b19f075ff5..344f82a81b8e255 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -77,6 +77,7 @@ namespace ErrorCode { TStatusError(TABLET_MISSING, true); \ TStatusError(NOT_MASTER, true); \ TStatusError(OBTAIN_LOCK_FAILED, false); \ + TStatusError(SNAPSHOT_EXPIRED, false); \ TStatusError(DELETE_BITMAP_LOCK_ERROR, false); // E error_name, error_code, print_stacktrace #define APPLY_FOR_OLAP_ERROR_CODES(E) \ @@ -488,6 +489,7 @@ class [[nodiscard]] Status { ERROR_CTOR_NOSTACK(NeedSendAgain, NEED_SEND_AGAIN) ERROR_CTOR_NOSTACK(CgroupError, CGROUP_ERROR) ERROR_CTOR_NOSTACK(ObtainLockFailed, OBTAIN_LOCK_FAILED) + ERROR_CTOR_NOSTACK(NetworkError, NETWORK_ERROR) #undef ERROR_CTOR template diff --git a/be/src/exec/lzo_decompressor.cpp b/be/src/exec/lzo_decompressor.cpp index b075509202b70fb..b240e2995a0414d 100644 --- a/be/src/exec/lzo_decompressor.cpp +++ b/be/src/exec/lzo_decompressor.cpp @@ -103,6 +103,7 @@ Status LzopDecompressor::decompress(uint8_t* input, size_t input_len, size_t* in ptr = get_uint32(ptr, &uncompressed_size); left_input_len -= sizeof(uint32_t); if (uncompressed_size == 0) { + *input_bytes_read += sizeof(uint32_t); *stream_end = true; return Status::OK(); } diff --git a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp index 43562a8f52cbf10..481360eee905574 100644 --- a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp @@ -39,8 +39,8 @@ std::vector SchemaWorkloadGroupsScanner::_s_tbls_colu {"SCAN_THREAD_NUM", TYPE_BIGINT, sizeof(int64_t), true}, {"MAX_REMOTE_SCAN_THREAD_NUM", TYPE_BIGINT, sizeof(int64_t), true}, {"MIN_REMOTE_SCAN_THREAD_NUM", TYPE_BIGINT, sizeof(int64_t), true}, - {"SPILL_THRESHOLD_LOW_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, - {"SPILL_THRESHOLD_HIGH_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, + {"MEMORY_LOW_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, + {"MEMORY_HIGH_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, {"TAG", TYPE_VARCHAR, sizeof(StringRef), true}, {"READ_BYTES_PER_SECOND", TYPE_BIGINT, sizeof(int64_t), true}, {"REMOTE_READ_BYTES_PER_SECOND", TYPE_BIGINT, sizeof(int64_t), true}, diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index f1c0ad60e06455c..acd923741eb73d5 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -17,6 +17,7 @@ #include "exec/tablet_info.h" +#include #include #include #include @@ -180,6 +181,17 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { auto it = slots_map.find(to_lower(pcolumn_desc.name()) + "+" + data_type_str + is_null_str); if (it == std::end(slots_map)) { + std::string keys {}; + for (const auto& [key, _] : slots_map) { + keys += fmt::format("{},", key); + } + LOG_EVERY_SECOND(WARNING) << fmt::format( + "[OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema)]: " + "unknown index column, column={}, type={}, data_type_str={}, " + "is_null_str={}, slots_map.keys()=[{}], {}\npschema={}", + pcolumn_desc.name(), pcolumn_desc.type(), data_type_str, is_null_str, + keys, debug_string(), pschema.ShortDebugString()); + return Status::InternalError("unknown index column, column={}, type={}", pcolumn_desc.name(), pcolumn_desc.type()); } @@ -286,6 +298,18 @@ Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { auto it = 
slots_map.find(to_lower(tcolumn_desc.column_name) + "+" + data_type_str + is_null_str); if (it == slots_map.end()) { + std::stringstream ss; + ss << tschema; + std::string keys {}; + for (const auto& [key, _] : slots_map) { + keys += fmt::format("{},", key); + } + LOG_EVERY_SECOND(WARNING) << fmt::format( + "[OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema)]: " + "unknown index column, column={}, type={}, data_type_str={}, " + "is_null_str={}, slots_map.keys()=[{}], {}\ntschema={}", + tcolumn_desc.column_name, tcolumn_desc.column_type.type, data_type_str, + is_null_str, keys, debug_string(), ss.str()); return Status::InternalError("unknown index column, column={}, type={}", tcolumn_desc.column_name, tcolumn_desc.column_type.type); diff --git a/be/src/exprs/bitmapfilter_predicate.h b/be/src/exprs/bitmapfilter_predicate.h index 5cb2b812220b10e..8b161bf6213f40a 100644 --- a/be/src/exprs/bitmapfilter_predicate.h +++ b/be/src/exprs/bitmapfilter_predicate.h @@ -30,11 +30,7 @@ namespace doris { // only used in Runtime Filter class BitmapFilterFuncBase : public RuntimeFilterFuncBase { public: - virtual void insert(const void* data) = 0; virtual void insert_many(const std::vector& bitmaps) = 0; - virtual bool empty() = 0; - virtual Status assign(BitmapValue* bitmap_value) = 0; - virtual void light_copy(BitmapFilterFuncBase* other) { _not_in = other->_not_in; } virtual uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number) = 0; virtual void find_batch(const char* data, const uint8* nullmap, size_t number, @@ -58,8 +54,6 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { ~BitmapFilterFunc() override = default; - void insert(const void* data) override; - void insert_many(const std::vector& bitmaps) override; uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, @@ -68,21 +62,8 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { void find_batch(const char* data, const uint8* nullmap, size_t number, uint8* results) const override; - bool empty() override { return _bitmap_value->empty(); } - - Status assign(BitmapValue* bitmap_value) override { - *_bitmap_value = *bitmap_value; - return Status::OK(); - } - - void light_copy(BitmapFilterFuncBase* bitmapfilter_func) override; - size_t size() const override { return _bitmap_value->cardinality(); } - uint64_t max() { return _bitmap_value->max(nullptr); } - - uint64_t min() { return _bitmap_value->min(nullptr); } - bool contains_any(CppType left, CppType right) { if (right < 0) { return false; @@ -90,23 +71,12 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { return _bitmap_value->contains_any(std::max(left, (CppType)0), right); } - std::shared_ptr get_inner_bitmap() { return _bitmap_value; } - private: std::shared_ptr _bitmap_value; bool find(CppType data) const { return _not_in ^ (data >= 0 && _bitmap_value->contains(data)); } }; -template -void BitmapFilterFunc::insert(const void* data) { - if (data == nullptr) { - return; - } - - *_bitmap_value |= *reinterpret_cast(data); -} - template void BitmapFilterFunc::insert_many(const std::vector& bitmaps) { if (bitmaps.empty()) { @@ -147,12 +117,4 @@ void BitmapFilterFunc::find_batch(const char* data, const uint8* nullmap, } } -template -void BitmapFilterFunc::light_copy(BitmapFilterFuncBase* bitmapfilter_func) { - BitmapFilterFuncBase::light_copy(bitmapfilter_func); - auto other_func = reinterpret_cast(bitmapfilter_func); - _bitmap_value = other_func->_bitmap_value; - 
set_filter_id(bitmapfilter_func->get_filter_id()); -} - } // namespace doris diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index b5204fa767d59e8..54ad75028104845 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -17,6 +17,8 @@ #pragma once +#include "common/exception.h" +#include "common/status.h" #include "exprs/block_bloom_filter.hpp" #include "exprs/runtime_filter.h" #include "olap/rowset/segment_v2/bloom_filter.h" // IWYU pragma: keep @@ -205,7 +207,7 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { bool contain_null() const { if (!_bloom_filter) { - throw Status::InternalError("_bloom_filter is nullptr"); + throw Exception(ErrorCode::INTERNAL_ERROR, "_bloom_filter is nullptr"); } return _bloom_filter->contain_null(); } diff --git a/be/src/exprs/create_predicate_function.h b/be/src/exprs/create_predicate_function.h index 387be1f9f0b11c0..44f39fb77f6d857 100644 --- a/be/src/exprs/create_predicate_function.h +++ b/be/src/exprs/create_predicate_function.h @@ -17,6 +17,8 @@ #pragma once +#include "common/exception.h" +#include "common/status.h" #include "exprs/hybrid_set.h" #include "exprs/minmax_predicate.h" #include "function_filter.h" @@ -244,12 +246,9 @@ ColumnPredicate* create_olap_column_predicate(uint32_t column_id, int be_exec_version, const TabletColumn*) { if constexpr (PT == TYPE_TINYINT || PT == TYPE_SMALLINT || PT == TYPE_INT || PT == TYPE_BIGINT) { - std::shared_ptr filter_olap; - filter_olap.reset(create_bitmap_filter(PT)); - filter_olap->light_copy(filter.get()); return new BitmapFilterColumnPredicate(column_id, filter, be_exec_version); } else { - return nullptr; + throw Exception(ErrorCode::INTERNAL_ERROR, "bitmap filter do not support type {}", PT); } } @@ -266,17 +265,14 @@ ColumnPredicate* create_olap_column_predicate(uint32_t column_id, const std::shared_ptr& filter, int, const TabletColumn* column = nullptr) { // currently only support like predicate - if constexpr (PT == TYPE_CHAR || PT == TYPE_VARCHAR || PT == TYPE_STRING) { - if constexpr (PT == TYPE_CHAR) { - return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, - filter->_string_param); - } else { - return new LikeColumnPredicate(filter->_opposite, column_id, - filter->_fn_ctx, filter->_string_param); - } - } else { - return nullptr; + if constexpr (PT == TYPE_CHAR) { + return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, + filter->_string_param); + } else if constexpr (PT == TYPE_VARCHAR || PT == TYPE_STRING) { + return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, + filter->_string_param); } + throw Exception(ErrorCode::INTERNAL_ERROR, "function filter do not support type {}", PT); } template diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 85f1c535c7038b6..24333360ff62540 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -362,8 +362,11 @@ class RuntimePredicateWrapper { } Status init_bloom_filter(const size_t build_bf_cardinality) { - DCHECK(_filter_type == RuntimeFilterType::BLOOM_FILTER || - _filter_type == RuntimeFilterType::IN_OR_BLOOM_FILTER); + if (_filter_type != RuntimeFilterType::BLOOM_FILTER && + _filter_type != RuntimeFilterType::IN_OR_BLOOM_FILTER) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "init_bloom_filter meet invalid input type {}", int(_filter_type)); + } return _context->bloom_filter_func->init_with_cardinality(build_bf_cardinality); } @@ -391,7 +394,9 @@ 
class RuntimePredicateWrapper { BloomFilterFuncBase* get_bloomfilter() const { return _context->bloom_filter_func.get(); } void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) { - DCHECK(!is_ignored()); + if (is_ignored()) { + throw Exception(ErrorCode::INTERNAL_ERROR, "insert_fixed_len meet ignored rf"); + } switch (_filter_type) { case RuntimeFilterType::IN_FILTER: { _context->hybrid_set->insert_fixed_len(column, start); @@ -918,7 +923,10 @@ class RuntimePredicateWrapper { return _context->bloom_filter_func->contain_null(); } if (_context->hybrid_set) { - DCHECK(get_real_type() == RuntimeFilterType::IN_FILTER); + if (get_real_type() != RuntimeFilterType::IN_FILTER) { + throw Exception(ErrorCode::INTERNAL_ERROR, "rf has hybrid_set but real type is {}", + int(get_real_type())); + } return _context->hybrid_set->contain_null(); } if (_context->minmax_func) { @@ -975,8 +983,8 @@ class RuntimePredicateWrapper { Status IRuntimeFilter::create(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, const RuntimeFilterRole role, int node_id, std::shared_ptr* res, - bool build_bf_exactly, bool need_local_merge) { - *res = std::make_shared(state, desc, need_local_merge); + bool build_bf_exactly) { + *res = std::make_shared(state, desc); (*res)->set_role(role); return (*res)->init_with_desc(desc, query_options, node_id, build_bf_exactly); } @@ -990,53 +998,62 @@ void IRuntimeFilter::insert_batch(const vectorized::ColumnPtr column, size_t sta _wrapper->insert_batch(column, start); } -Status IRuntimeFilter::publish(bool publish_local) { +Status IRuntimeFilter::publish(RuntimeState* state, bool publish_local) { DCHECK(is_producer()); - auto send_to_remote = [&](IRuntimeFilter* filter) { + auto send_to_remote_targets = [&](IRuntimeFilter* filter, uint64_t local_merge_time) { TNetworkAddress addr; DCHECK(_state != nullptr); - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_merge_addr(&addr)); - return filter->push_to_remote(&addr); + RETURN_IF_ERROR(_state->global_runtime_filter_mgr()->get_merge_addr(&addr)); + return filter->push_to_remote(state, &addr, local_merge_time); }; - auto send_to_local = [&](std::shared_ptr wrapper) { - std::vector> filters; - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_consume_filters(_filter_id, filters)); - DCHECK(!filters.empty()); - // push down + auto send_to_local_targets = [&](std::shared_ptr wrapper, bool global, + uint64_t local_merge_time = 0) { + std::vector> filters = + global ? 
_state->global_runtime_filter_mgr()->get_consume_filters(_filter_id) + : _state->local_runtime_filter_mgr()->get_consume_filters(_filter_id); for (auto filter : filters) { filter->_wrapper = wrapper; - filter->update_runtime_filter_type_to_profile(); + filter->update_runtime_filter_type_to_profile(local_merge_time); filter->signal(); } return Status::OK(); }; - auto do_local_merge = [&]() { - LocalMergeFilters* local_merge_filters = nullptr; - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_local_merge_producer_filters( - _filter_id, &local_merge_filters)); - std::lock_guard l(*local_merge_filters->lock); - RETURN_IF_ERROR(local_merge_filters->filters[0]->merge_from(_wrapper.get())); - local_merge_filters->merge_time--; - if (local_merge_filters->merge_time == 0) { - if (_has_local_target) { - RETURN_IF_ERROR(send_to_local(local_merge_filters->filters[0]->_wrapper)); - } else { - RETURN_IF_ERROR(send_to_remote(local_merge_filters->filters[0].get())); + auto do_merge = [&]() { + if (!_state->global_runtime_filter_mgr()->get_consume_filters(_filter_id).empty()) { + LocalMergeFilters* local_merge_filters = nullptr; + RETURN_IF_ERROR(_state->global_runtime_filter_mgr()->get_local_merge_producer_filters( + _filter_id, &local_merge_filters)); + local_merge_filters->merge_watcher.start(); + std::lock_guard l(*local_merge_filters->lock); + RETURN_IF_ERROR(local_merge_filters->filters[0]->merge_from(_wrapper.get())); + local_merge_filters->merge_time--; + local_merge_filters->merge_watcher.stop(); + if (local_merge_filters->merge_time == 0) { + if (_has_local_target) { + RETURN_IF_ERROR(send_to_local_targets( + local_merge_filters->filters[0]->_wrapper, true, + local_merge_filters->merge_watcher.elapsed_time())); + } else { + RETURN_IF_ERROR(send_to_remote_targets( + local_merge_filters->filters[0].get(), + local_merge_filters->merge_watcher.elapsed_time())); + } } } return Status::OK(); }; - if (_need_local_merge && _has_local_target) { - RETURN_IF_ERROR(do_local_merge()); - } else if (_has_local_target) { - RETURN_IF_ERROR(send_to_local(_wrapper)); + if (_has_local_target) { + // A runtime filter may have multiple targets and some of those are local-merge RF and others are not. + // So for all runtime filters' producers, `publish` should notify all consumers in global RF mgr which manages local-merge RF and local RF mgr which manages others. 
+ RETURN_IF_ERROR(do_merge()); + RETURN_IF_ERROR(send_to_local_targets(_wrapper, false)); } else if (!publish_local) { - if (_is_broadcast_join || _state->be_exec_version < USE_NEW_SERDE) { - RETURN_IF_ERROR(send_to_remote(this)); + if (_is_broadcast_join || _state->get_query_ctx()->be_exec_version() < USE_NEW_SERDE) { + RETURN_IF_ERROR(send_to_remote_targets(this, 0)); } else { - RETURN_IF_ERROR(do_local_merge()); + RETURN_IF_ERROR(do_merge()); } } else { // remote broadcast join only push onetime in build shared hash table @@ -1088,20 +1105,25 @@ class SyncSizeClosure : public AutoReleaseClosure req, std::shared_ptr> callback, std::shared_ptr dependency, - RuntimeFilterContextSPtr rf_context) - : Base(req, callback), _dependency(std::move(dependency)), _rf_context(rf_context) {} + RuntimeFilterContextSPtr rf_context, std::weak_ptr context) + : Base(req, callback, context), + _dependency(std::move(dependency)), + _rf_context(rf_context) {} }; Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filter_size) { DCHECK(is_producer()); - if (_need_local_merge) { + if (!_state->global_runtime_filter_mgr()->get_consume_filters(_filter_id).empty()) { LocalMergeFilters* local_merge_filters = nullptr; - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_local_merge_producer_filters( + RETURN_IF_ERROR(_state->global_runtime_filter_mgr()->get_local_merge_producer_filters( _filter_id, &local_merge_filters)); std::lock_guard l(*local_merge_filters->lock); local_merge_filters->merge_size_times--; local_merge_filters->local_merged_size += local_filter_size; + if (_has_local_target) { + set_synced_size(local_filter_size); + } if (local_merge_filters->merge_size_times) { return Status::OK(); } else { @@ -1121,9 +1143,9 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt TNetworkAddress addr; DCHECK(_state != nullptr); - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_merge_addr(&addr)); + RETURN_IF_ERROR(_state->global_runtime_filter_mgr()->get_merge_addr(&addr)); std::shared_ptr stub( - _state->exec_env->brpc_internal_client_cache()->get_client(addr)); + _state->get_query_ctx()->exec_env()->brpc_internal_client_cache()->get_client(addr)); if (!stub) { return Status::InternalError("Get rpc stub failed, host={}, port={}", addr.hostname, addr.port); @@ -1133,11 +1155,13 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt auto callback = DummyBrpcCallback::create_shared(); // IRuntimeFilter maybe deconstructed before the rpc finished, so that could not use // a raw pointer in closure. Has to use the context's shared ptr. - auto closure = - SyncSizeClosure::create_unique(request, callback, _dependency, _wrapper->_context); + auto closure = SyncSizeClosure::create_unique( + request, callback, _dependency, _wrapper->_context, + state->query_options().ignore_runtime_filter_error ? 
std::weak_ptr {} + : state->get_query_ctx_weak()); auto* pquery_id = request->mutable_query_id(); - pquery_id->set_hi(_state->query_id.hi()); - pquery_id->set_lo(_state->query_id.lo()); + pquery_id->set_hi(_state->get_query_ctx()->query_id().hi); + pquery_id->set_lo(_state->get_query_ctx()->query_id().lo); auto* source_addr = request->mutable_source_addr(); source_addr->set_hostname(BackendOptions::get_local_backend().host); @@ -1157,10 +1181,11 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt return Status::OK(); } -Status IRuntimeFilter::push_to_remote(const TNetworkAddress* addr) { +Status IRuntimeFilter::push_to_remote(RuntimeState* state, const TNetworkAddress* addr, + uint64_t local_merge_time) { DCHECK(is_producer()); std::shared_ptr stub( - _state->exec_env->brpc_internal_client_cache()->get_client(*addr)); + _state->get_query_ctx()->exec_env()->brpc_internal_client_cache()->get_client(*addr)); if (!stub) { return Status::InternalError( fmt::format("Get rpc stub failed, host={}, port={}", addr->hostname, addr->port)); @@ -1170,24 +1195,28 @@ Status IRuntimeFilter::push_to_remote(const TNetworkAddress* addr) { auto merge_filter_callback = DummyBrpcCallback::create_shared(); auto merge_filter_closure = AutoReleaseClosure>:: - create_unique(merge_filter_request, merge_filter_callback); + create_unique(merge_filter_request, merge_filter_callback, + state->query_options().ignore_runtime_filter_error + ? std::weak_ptr {} + : state->get_query_ctx_weak()); void* data = nullptr; int len = 0; auto* pquery_id = merge_filter_request->mutable_query_id(); - pquery_id->set_hi(_state->query_id.hi()); - pquery_id->set_lo(_state->query_id.lo()); + pquery_id->set_hi(_state->get_query_ctx()->query_id().hi); + pquery_id->set_lo(_state->get_query_ctx()->query_id().lo); auto* pfragment_instance_id = merge_filter_request->mutable_fragment_instance_id(); pfragment_instance_id->set_hi(BackendOptions::get_local_backend().id); pfragment_instance_id->set_lo((int64_t)this); merge_filter_request->set_filter_id(_filter_id); + merge_filter_request->set_local_merge_time(local_merge_time); auto column_type = _wrapper->column_type(); RETURN_IF_CATCH_EXCEPTION(merge_filter_request->set_column_type(to_proto(column_type))); merge_filter_callback->cntl_->set_timeout_ms( - get_execution_rpc_timeout_ms(_state->execution_timeout)); + get_execution_rpc_timeout_ms(_state->get_query_ctx()->execution_timeout())); if (config::execution_ignore_eovercrowded) { merge_filter_callback->cntl_->ignore_eovercrowded(); } @@ -1222,9 +1251,9 @@ Status IRuntimeFilter::get_push_expr_ctxs(std::listadd_info_string("Info", formatted_state()); // The runtime filter is pushed down, adding filtering information. 
- auto* expr_filtered_rows_counter = ADD_COUNTER(_profile, "expr_filtered_rows", TUnit::UNIT); - auto* expr_input_rows_counter = ADD_COUNTER(_profile, "expr_input_rows", TUnit::UNIT); - auto* always_true_counter = ADD_COUNTER(_profile, "always_true_pass_rows", TUnit::UNIT); + auto* expr_filtered_rows_counter = ADD_COUNTER(_profile, "ExprFilteredRows", TUnit::UNIT); + auto* expr_input_rows_counter = ADD_COUNTER(_profile, "ExprInputRows", TUnit::UNIT); + auto* always_true_counter = ADD_COUNTER(_profile, "AlwaysTruePassRows", TUnit::UNIT); for (auto i = origin_size; i < push_exprs.size(); i++) { push_exprs[i]->attach_profile_counter(expr_filtered_rows_counter, expr_input_rows_counter, always_true_counter); @@ -1234,8 +1263,8 @@ Status IRuntimeFilter::get_push_expr_ctxs(std::listexecution_timeout * 1000; - auto runtime_filter_wait_time_ms = _state->runtime_filter_wait_time_ms; + auto execution_timeout = _state->get_query_ctx()->execution_timeout() * 1000; + auto runtime_filter_wait_time_ms = _state->get_query_ctx()->runtime_filter_wait_time_ms(); // bitmap filter is precise filter and only filter once, so it must be applied. int64_t wait_times_ms = _runtime_filter_type == RuntimeFilterType::BITMAP_FILTER ? execution_timeout @@ -1244,6 +1273,7 @@ void IRuntimeFilter::update_state() { // In pipelineX, runtime filters will be ready or timeout before open phase. if (expected == RuntimeFilterState::NOT_READY) { DCHECK(MonotonicMillis() - registration_time_ >= wait_times_ms); + COUNTER_SET(_wait_timer, MonotonicMillis() - registration_time_); _rf_state_atomic = RuntimeFilterState::TIME_OUT; } } @@ -1262,6 +1292,7 @@ PrimitiveType IRuntimeFilter::column_type() const { void IRuntimeFilter::signal() { DCHECK(is_consumer()); + COUNTER_SET(_wait_timer, MonotonicMillis() - registration_time_); _rf_state_atomic.store(RuntimeFilterState::READY); if (!_filter_timer.empty()) { for (auto& timer : _filter_timer) { @@ -1312,10 +1343,10 @@ bool IRuntimeFilter::get_ignored() { std::string IRuntimeFilter::formatted_state() const { return fmt::format( - "[IsPushDown = {}, RuntimeFilterState = {}, HasRemoteTarget = {}, " + "[Id = {}, IsPushDown = {}, RuntimeFilterState = {}, HasRemoteTarget = {}, " "HasLocalTarget = {}, Ignored = {}]", - _is_push_down, _get_explain_state_string(), _has_remote_target, _has_local_target, - _wrapper->_context->ignored); + _filter_id, _is_push_down, _get_explain_state_string(), _has_remote_target, + _has_local_target, _wrapper->_context->ignored); } Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQueryOptions* options, @@ -1341,18 +1372,19 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue params.runtime_bloom_filter_max_size = options->__isset.runtime_bloom_filter_max_size ? options->runtime_bloom_filter_max_size : 0; - // We build runtime filter by exact distinct count iff three conditions are met: + auto sync_filter_size = desc->__isset.sync_filter_size && desc->sync_filter_size; + // We build runtime filter by exact distinct count if all of 3 conditions are met: // 1. Only 1 join key - // 2. Do not have remote target (e.g. do not need to merge), or broadcast join - // 3. Bloom filter + // 2. Bloom filter + // 3. Size of all bloom filters will be same (size will be sync or this is a broadcast join). 
params.build_bf_exactly = build_bf_exactly && (_runtime_filter_type == RuntimeFilterType::BLOOM_FILTER || _runtime_filter_type == RuntimeFilterType::IN_OR_BLOOM_FILTER); params.bloom_filter_size_calculated_by_ndv = desc->bloom_filter_size_calculated_by_ndv; - if (!desc->__isset.sync_filter_size || !desc->sync_filter_size) { - params.build_bf_exactly &= (!_has_remote_target || _is_broadcast_join); + if (!sync_filter_size) { + params.build_bf_exactly &= !_is_broadcast_join; } if (desc->__isset.bloom_filter_size_bytes) { @@ -1500,18 +1532,21 @@ void IRuntimeFilter::init_profile(RuntimeProfile* parent_profile) { _profile_init = true; parent_profile->add_child(_profile.get(), true, nullptr); _profile->add_info_string("Info", formatted_state()); + _wait_timer = ADD_TIMER(_profile, "WaitTime"); } } -void IRuntimeFilter::update_runtime_filter_type_to_profile() { +void IRuntimeFilter::update_runtime_filter_type_to_profile(uint64_t local_merge_time) { _profile->add_info_string("RealRuntimeFilterType", to_string(_wrapper->get_real_type())); + _profile->add_info_string("LocalMergeTime", + std::to_string(local_merge_time / 1000000000.0) + " s"); } std::string IRuntimeFilter::debug_string() const { return fmt::format( - "RuntimeFilter: (id = {}, type = {}, need_local_merge: {}, is_broadcast: {}, " + "RuntimeFilter: (id = {}, type = {}, is_broadcast: {}, " "build_bf_cardinality: {}, error_msg: {}", - _filter_id, to_string(_runtime_filter_type), _need_local_merge, _is_broadcast_join, + _filter_id, to_string(_runtime_filter_type), _is_broadcast_join, _wrapper->get_build_bf_cardinality(), _wrapper->_context->err_msg); } @@ -1841,24 +1876,9 @@ bool IRuntimeFilter::need_sync_filter_size() { _wrapper->get_build_bf_cardinality() && !_is_broadcast_join; } -Status IRuntimeFilter::update_filter(const UpdateRuntimeFilterParams* param) { - _profile->add_info_string("MergeTime", std::to_string(param->request->merge_time()) + " ms"); - - if (param->request->has_ignored() && param->request->ignored()) { - set_ignored(); - } else { - std::unique_ptr wrapper; - RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(param, &wrapper)); - RETURN_IF_ERROR(_wrapper->merge(wrapper.get())); - update_runtime_filter_type_to_profile(); - } - this->signal(); - - return Status::OK(); -} - void IRuntimeFilter::update_filter(std::shared_ptr wrapper, - int64_t merge_time, int64_t start_apply) { + int64_t merge_time, int64_t start_apply, + uint64_t local_merge_time) { _profile->add_info_string("UpdateTime", std::to_string(MonotonicMillis() - start_apply) + " ms"); _profile->add_info_string("MergeTime", std::to_string(merge_time) + " ms"); @@ -1868,7 +1888,7 @@ void IRuntimeFilter::update_filter(std::shared_ptr wrap wrapper->_column_return_type = _wrapper->_column_return_type; } _wrapper = wrapper; - update_runtime_filter_type_to_profile(); + update_runtime_filter_type_to_profile(local_merge_time); signal(); } diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index f5a069d9e55f858..50ee52865be6d6f 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -192,8 +192,7 @@ enum RuntimeFilterState { /// that can be pushed down to node based on the results of the right table. 
class IRuntimeFilter { public: - IRuntimeFilter(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc, - bool need_local_merge = false) + IRuntimeFilter(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc) : _state(state), _filter_id(desc->filter_id), _is_broadcast_join(true), @@ -203,20 +202,19 @@ class IRuntimeFilter { _role(RuntimeFilterRole::PRODUCER), _expr_order(-1), registration_time_(MonotonicMillis()), - _wait_infinitely(_state->runtime_filter_wait_infinitely), - _rf_wait_time_ms(_state->runtime_filter_wait_time_ms), + _wait_infinitely(_state->get_query_ctx()->runtime_filter_wait_infinitely()), + _rf_wait_time_ms(_state->get_query_ctx()->runtime_filter_wait_time_ms()), _runtime_filter_type(get_runtime_filter_type(desc)), - _profile( - new RuntimeProfile(fmt::format("RuntimeFilter: (id = {}, type = {})", - _filter_id, to_string(_runtime_filter_type)))), - _need_local_merge(need_local_merge) {} + _profile(new RuntimeProfile(fmt::format("RuntimeFilter: (id = {}, type = {})", + _filter_id, + to_string(_runtime_filter_type)))) {} ~IRuntimeFilter() = default; static Status create(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, const RuntimeFilterRole role, int node_id, std::shared_ptr* res, - bool build_bf_exactly = false, bool need_local_merge = false); + bool build_bf_exactly = false); RuntimeFilterContextSPtr& get_shared_context_ref(); @@ -225,7 +223,7 @@ class IRuntimeFilter { // publish filter // push filter to remote node or push down it to scan_node - Status publish(bool publish_local = false); + Status publish(RuntimeState* state, bool publish_local = false); Status send_filter_size(RuntimeState* state, uint64_t local_filter_size); @@ -280,9 +278,8 @@ class IRuntimeFilter { std::shared_ptr* wrapper); Status change_to_bloom_filter(); Status init_bloom_filter(const size_t build_bf_cardinality); - Status update_filter(const UpdateRuntimeFilterParams* param); void update_filter(std::shared_ptr filter_wrapper, int64_t merge_time, - int64_t start_apply); + int64_t start_apply, uint64_t local_merge_time); void set_ignored(); @@ -293,13 +290,14 @@ class IRuntimeFilter { bool need_sync_filter_size(); // async push runtimefilter to remote node - Status push_to_remote(const TNetworkAddress* addr); + Status push_to_remote(RuntimeState* state, const TNetworkAddress* addr, + uint64_t local_merge_time); void init_profile(RuntimeProfile* parent_profile); std::string debug_string() const; - void update_runtime_filter_type_to_profile(); + void update_runtime_filter_type_to_profile(uint64_t local_merge_time); int filter_id() const { return _filter_id; } @@ -335,7 +333,7 @@ class IRuntimeFilter { int32_t wait_time_ms() const { int32_t res = 0; if (wait_infinitely()) { - res = _state->execution_timeout; + res = _state->get_query_ctx()->execution_timeout(); // Convert to ms res *= 1000; } else { @@ -417,9 +415,7 @@ class IRuntimeFilter { // parent profile // only effect on consumer std::unique_ptr _profile; - // `_need_local_merge` indicates whether this runtime filter is global on this BE. - // All runtime filters should be merged on each BE before push_to_remote or publish. 
- bool _need_local_merge = false; + RuntimeProfile::Counter* _wait_timer = nullptr; std::vector> _filter_timer; diff --git a/be/src/exprs/runtime_filter_slots.h b/be/src/exprs/runtime_filter_slots.h index 42c5f598633ad9f..3c18735e4e82ce6 100644 --- a/be/src/exprs/runtime_filter_slots.h +++ b/be/src/exprs/runtime_filter_slots.h @@ -149,10 +149,10 @@ class VRuntimeFilterSlots { } // publish runtime filter - Status publish(bool publish_local) { + Status publish(RuntimeState* state, bool publish_local) { for (auto& pair : _runtime_filters_map) { for (auto& filter : pair.second) { - RETURN_IF_ERROR(filter->publish(publish_local)); + RETURN_IF_ERROR(filter->publish(state, publish_local)); } } return Status::OK(); diff --git a/be/src/exprs/runtime_filter_slots_cross.h b/be/src/exprs/runtime_filter_slots_cross.h index 01ae21a75992de0..a49f2928f842a97 100644 --- a/be/src/exprs/runtime_filter_slots_cross.h +++ b/be/src/exprs/runtime_filter_slots_cross.h @@ -72,9 +72,9 @@ class VRuntimeFilterSlotsCross { return Status::OK(); } - Status publish() { + Status publish(RuntimeState* state) { for (auto filter : _runtime_filters) { - RETURN_IF_ERROR(filter->publish()); + RETURN_IF_ERROR(filter->publish(state)); } return Status::OK(); } diff --git a/be/src/http/http_client.cpp b/be/src/http/http_client.cpp index c842a4fe2dd4ced..fc4c997fce8397b 100644 --- a/be/src/http/http_client.cpp +++ b/be/src/http/http_client.cpp @@ -27,6 +27,7 @@ #include "http/http_headers.h" #include "http/http_status.h" #include "runtime/exec_env.h" +#include "util/security.h" #include "util/stack_util.h" namespace doris { @@ -205,9 +206,11 @@ Status HttpClient::execute(const std::function& callback) { Status status; @@ -293,7 +305,9 @@ Status HttpClient::execute_with_retry(int retry_times, int sleep_time, if (http_status == 200) { return status; } else { - auto error_msg = fmt::format("http status code is not 200, code={}", http_status); + std::string url = mask_token(client._get_url()); + auto error_msg = fmt::format("http status code is not 200, code={}, url={}", + http_status, url); LOG(WARNING) << error_msg; return Status::HttpError(error_msg); } diff --git a/be/src/http/http_client.h b/be/src/http/http_client.h index fb692c50268484c..c0c8863a9b06d4b 100644 --- a/be/src/http/http_client.h +++ b/be/src/http/http_client.h @@ -164,7 +164,8 @@ class HttpClient { Status _escape_url(const std::string& url, std::string* escaped_url); private: - const char* _to_errmsg(CURLcode code); + const char* _to_errmsg(CURLcode code) const; + const char* _get_url() const; private: CURL* _curl = nullptr; diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp index adea2cd84c95f63..e45902c0f24df15 100644 --- a/be/src/index-tools/index_tool.cpp +++ b/be/src/index-tools/index_tool.cpp @@ -170,7 +170,7 @@ void search(lucene::store::Directory* dir, std::string& field, std::string& toke std::vector terms = split(token, '|'); doris::TQueryOptions queryOptions; - ConjunctionQuery conjunct_query(s, queryOptions); + ConjunctionQuery conjunct_query(s, queryOptions, nullptr); conjunct_query.add(field_ws, terms); conjunct_query.search(result); @@ -562,7 +562,7 @@ int main(int argc, char** argv) { auto dir = std::forward(st).value(); auto analyzer = _CLNEW lucene::analysis::standard95::StandardAnalyzer(); // auto analyzer = _CLNEW lucene::analysis::SimpleAnalyzer(); - auto indexwriter = _CLNEW lucene::index::IndexWriter(dir, analyzer, true, true); + auto indexwriter = _CLNEW lucene::index::IndexWriter(dir.get(), analyzer, true, 
true); indexwriter->setRAMBufferSizeMB(512); indexwriter->setMaxFieldLength(0x7FFFFFFFL); indexwriter->setMergeFactor(100000000); diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 4fb3f3e02cb58c5..ebcbe9135daa351 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -86,42 +86,42 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _total_evict_size_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_total_evict_size"); - _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = + _evict_by_time_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_disposable_to_normal"); - _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = + "file_cache_evict_by_time_disposable_to_normal"); + _evict_by_time_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_disposable_to_index"); - _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = + "file_cache_evict_by_time_disposable_to_index"); + _evict_by_time_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_disposable_to_ttl"); - _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = + "file_cache_evict_by_time_disposable_to_ttl"); + _evict_by_time_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_normal_to_disposable"); - _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = + "file_cache_evict_by_time_normal_to_disposable"); + _evict_by_time_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_normal_to_index"); - _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = + "file_cache_evict_by_time_normal_to_index"); + _evict_by_time_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_normal_to_ttl"); - _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = + "file_cache_evict_by_time_normal_to_ttl"); + _evict_by_time_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_index_to_disposable"); - _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = + "file_cache_evict_by_time_index_to_disposable"); + _evict_by_time_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_index_to_normal"); - _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = + "file_cache_evict_by_time_index_to_normal"); + _evict_by_time_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_index_to_ttl"); - _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = + "file_cache_evict_by_time_index_to_ttl"); + _evict_by_time_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_ttl_to_disposable"); - 
_evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = + "file_cache_evict_by_time_ttl_to_disposable"); + _evict_by_time_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_ttl_to_normal"); - _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = + "file_cache_evict_by_time_ttl_to_normal"); + _evict_by_time_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_ttl_to_index"); + "file_cache_evict_by_time_ttl_to_index"); _evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), @@ -197,8 +197,8 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, "file_cache_hit_ratio_5m", 0.0); _hit_ratio_1h = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_1h", 0.0); - _disk_limit_mode_metrics = - std::make_shared>(_cache_base_path.c_str(), "disk_limit_mode", 0); + _disk_limit_mode_metrics = std::make_shared>( + _cache_base_path.c_str(), "file_cache_disk_limit_mode", 0); _disposable_queue = LRUQueue(cache_settings.disposable_queue_size, cache_settings.disposable_queue_elements, 60 * 60); @@ -393,6 +393,15 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte auto& file_blocks = it->second; DCHECK(!file_blocks.empty()); + if (file_blocks.empty()) { + LOG(WARNING) << "file_blocks is empty for hash=" << hash.to_string() + << " cache type=" << context.cache_type + << " cache expiration time=" << context.expiration_time + << " cache range=" << range.left << " " << range.right + << " query id=" << context.query_id; + _files.erase(hash); + return {}; + } // change to ttl if the blocks aren't ttl if (context.cache_type == FileCacheType::TTL && _key_to_time.find(hash) == _key_to_time.end()) { for (auto& [_, cell] : file_blocks) { @@ -970,67 +979,6 @@ void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t } } -bool BlockFileCache::try_reserve_for_ttl_without_lru(size_t size, - std::lock_guard& cache_lock) { - size_t removed_size = 0; - size_t cur_cache_size = _cur_cache_size; - auto limit = config::max_ttl_cache_ratio * _capacity; - - TEST_INJECTION_POINT_CALLBACK("BlockFileCache::change_limit1", &limit); - - if ((_cur_ttl_size + size) * 100 > limit) { - return false; - } - - size_t normal_queue_size = _normal_queue.get_capacity(cache_lock); - size_t disposable_queue_size = _disposable_queue.get_capacity(cache_lock); - size_t index_queue_size = _index_queue.get_capacity(cache_lock); - if (is_overflow(removed_size, size, cur_cache_size) && normal_queue_size == 0 && - disposable_queue_size == 0 && index_queue_size == 0) { - return false; - } - std::vector to_evict; - auto collect_eliminate_fragments = [&](LRUQueue& queue) { - size_t cur_removed_size = 0; - find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - cur_removed_size); - }; - if (disposable_queue_size != 0) { - collect_eliminate_fragments(get_queue(FileCacheType::DISPOSABLE)); - } - if (normal_queue_size != 0) { - collect_eliminate_fragments(get_queue(FileCacheType::NORMAL)); - } - if (index_queue_size != 0) { - collect_eliminate_fragments(get_queue(FileCacheType::INDEX)); - } - remove_file_blocks(to_evict, cache_lock); - if (is_overflow(removed_size, size, cur_cache_size)) { - return false; - } - return true; -} - -bool BlockFileCache::try_reserve_for_ttl(size_t size, std::lock_guard& 
cache_lock) { - if (try_reserve_for_ttl_without_lru(size, cache_lock)) { - return true; - } else if (config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(FileCacheType::TTL); - size_t removed_size = 0; - size_t cur_cache_size = _cur_cache_size; - - std::vector to_evict; - size_t cur_removed_size = 0; - find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - cur_removed_size); - remove_file_blocks_and_clean_time_maps(to_evict, cache_lock); - - return !is_overflow(removed_size, size, cur_cache_size); - } else { - return false; - } -} - // 1. if async load file cache not finish // a. evict from lru queue // 2. if ttl cache @@ -1283,7 +1231,7 @@ void BlockFileCache::reset_range(const UInt128Wrapper& hash, size_t offset, size _cur_cache_size += new_size; } -bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( +bool BlockFileCache::try_reserve_from_other_queue_by_time_interval( FileCacheType cur_type, std::vector other_cache_types, size_t size, int64_t cur_time, std::lock_guard& cache_lock) { size_t removed_size = 0; @@ -1316,7 +1264,7 @@ bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( remove_size_per_type += cell_size; } } - *(_evict_by_heat_metrics_matrix[cache_type][cur_type]) << remove_size_per_type; + *(_evict_by_time_metrics_matrix[cache_type][cur_type]) << remove_size_per_type; } remove_file_blocks(to_evict, cache_lock); @@ -1365,7 +1313,7 @@ bool BlockFileCache::try_reserve_from_other_queue(FileCacheType cur_cache_type, std::lock_guard& cache_lock) { // currently, TTL cache is not considered as a candidate auto other_cache_types = get_other_cache_type_without_ttl(cur_cache_type); - bool reserve_success = try_reserve_from_other_queue_by_hot_interval( + bool reserve_success = try_reserve_from_other_queue_by_time_interval( cur_cache_type, other_cache_types, size, cur_time, cache_lock); if (reserve_success || !config::file_cache_enable_evict_from_other_queue_by_size) { return reserve_success; @@ -1777,50 +1725,56 @@ void BlockFileCache::run_background_operation() { break; } } + // report + { + SCOPED_CACHE_LOCK(_mutex); + _cur_cache_size_metrics->set_value(_cur_cache_size); + _cur_ttl_cache_size_metrics->set_value(_cur_cache_size - + _index_queue.get_capacity(cache_lock) - + _normal_queue.get_capacity(cache_lock) - + _disposable_queue.get_capacity(cache_lock)); + _cur_ttl_cache_lru_queue_cache_size_metrics->set_value( + _ttl_queue.get_capacity(cache_lock)); + _cur_ttl_cache_lru_queue_element_count_metrics->set_value( + _ttl_queue.get_elements_num(cache_lock)); + _cur_normal_queue_cache_size_metrics->set_value(_normal_queue.get_capacity(cache_lock)); + _cur_normal_queue_element_count_metrics->set_value( + _normal_queue.get_elements_num(cache_lock)); + _cur_index_queue_cache_size_metrics->set_value(_index_queue.get_capacity(cache_lock)); + _cur_index_queue_element_count_metrics->set_value( + _index_queue.get_elements_num(cache_lock)); + _cur_disposable_queue_cache_size_metrics->set_value( + _disposable_queue.get_capacity(cache_lock)); + _cur_disposable_queue_element_count_metrics->set_value( + _disposable_queue.get_elements_num(cache_lock)); + + if (_num_read_blocks->get_value() > 0) { + _hit_ratio->set_value((double)_num_hit_blocks->get_value() / + _num_read_blocks->get_value()); + } + if (_num_read_blocks_5m->get_value() > 0) { + _hit_ratio_5m->set_value((double)_num_hit_blocks_5m->get_value() / + _num_read_blocks_5m->get_value()); + } + if (_num_read_blocks_1h->get_value() > 0) { + 
_hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / + _num_read_blocks_1h->get_value()); + } + } + recycle_stale_rowset_async_bottom_half(); recycle_deleted_blocks(); // gc - int64_t cur_time = UnixSeconds(); - SCOPED_CACHE_LOCK(_mutex); - while (!_time_to_key.empty()) { - auto begin = _time_to_key.begin(); - if (cur_time < begin->first) { - break; + { + int64_t cur_time = UnixSeconds(); + SCOPED_CACHE_LOCK(_mutex); + while (!_time_to_key.empty()) { + auto begin = _time_to_key.begin(); + if (cur_time < begin->first) { + break; + } + remove_if_ttl_file_unlock(begin->second, false, cache_lock); } - remove_if_ttl_file_unlock(begin->second, false, cache_lock); - } - - // report - _cur_cache_size_metrics->set_value(_cur_cache_size); - _cur_ttl_cache_size_metrics->set_value(_cur_cache_size - - _index_queue.get_capacity(cache_lock) - - _normal_queue.get_capacity(cache_lock) - - _disposable_queue.get_capacity(cache_lock)); - _cur_ttl_cache_lru_queue_cache_size_metrics->set_value(_ttl_queue.get_capacity(cache_lock)); - _cur_ttl_cache_lru_queue_element_count_metrics->set_value( - _ttl_queue.get_elements_num(cache_lock)); - _cur_normal_queue_cache_size_metrics->set_value(_normal_queue.get_capacity(cache_lock)); - _cur_normal_queue_element_count_metrics->set_value( - _normal_queue.get_elements_num(cache_lock)); - _cur_index_queue_cache_size_metrics->set_value(_index_queue.get_capacity(cache_lock)); - _cur_index_queue_element_count_metrics->set_value( - _index_queue.get_elements_num(cache_lock)); - _cur_disposable_queue_cache_size_metrics->set_value( - _disposable_queue.get_capacity(cache_lock)); - _cur_disposable_queue_element_count_metrics->set_value( - _disposable_queue.get_elements_num(cache_lock)); - - if (_num_read_blocks->get_value() > 0) { - _hit_ratio->set_value((double)_num_hit_blocks->get_value() / - _num_read_blocks->get_value()); - } - if (_num_read_blocks_5m->get_value() > 0) { - _hit_ratio_5m->set_value((double)_num_hit_blocks_5m->get_value() / - _num_read_blocks_5m->get_value()); - } - if (_num_read_blocks_1h->get_value() > 0) { - _hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / - _num_read_blocks_1h->get_value()); } } } diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index 0de33dadc8249d0..f23d5a3799e0cfe 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -397,10 +397,6 @@ class BlockFileCache { size_t get_available_cache_size(FileCacheType cache_type) const; - bool try_reserve_for_ttl(size_t size, std::lock_guard& cache_lock); - - bool try_reserve_for_ttl_without_lru(size_t size, std::lock_guard& cache_lock); - FileBlocks split_range_into_cells(const UInt128Wrapper& hash, const CacheContext& context, size_t offset, size_t size, FileBlock::State state, std::lock_guard& cache_lock); @@ -436,10 +432,10 @@ class BlockFileCache { void recycle_deleted_blocks(); - bool try_reserve_from_other_queue_by_hot_interval(FileCacheType cur_type, - std::vector other_cache_types, - size_t size, int64_t cur_time, - std::lock_guard& cache_lock); + bool try_reserve_from_other_queue_by_time_interval(FileCacheType cur_type, + std::vector other_cache_types, + size_t size, int64_t cur_time, + std::lock_guard& cache_lock); bool try_reserve_from_other_queue_by_size(FileCacheType cur_type, std::vector other_cache_types, @@ -515,7 +511,7 @@ class BlockFileCache { std::shared_ptr> _cur_disposable_queue_cache_size_metrics; std::array>, 4> _queue_evict_size_metrics; std::shared_ptr> 
_total_evict_size_metrics; - std::shared_ptr> _evict_by_heat_metrics_matrix[4][4]; + std::shared_ptr> _evict_by_time_metrics_matrix[4][4]; std::shared_ptr> _evict_by_size_metrics_matrix[4][4]; std::shared_ptr> _evict_by_self_lru_metrics_matrix[4]; std::shared_ptr> _evict_by_try_release; diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index 674879300452dfc..19041938a08346d 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -42,7 +42,8 @@ std::string FileCacheSettings::to_string() const { FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cache_size, size_t normal_percent, size_t disposable_percent, - size_t index_percent, const std::string& storage) { + size_t index_percent, size_t ttl_percent, + const std::string& storage) { io::FileCacheSettings settings; if (capacity == 0) return settings; settings.capacity = capacity; @@ -59,12 +60,12 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach std::max(settings.index_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); - settings.ttl_queue_size = per_size * config::max_ttl_cache_ratio; + settings.ttl_queue_size = per_size * ttl_percent; settings.ttl_queue_elements = std::max(settings.ttl_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); - settings.query_queue_size = - settings.capacity - settings.disposable_queue_size - settings.index_queue_size; + settings.query_queue_size = settings.capacity - settings.disposable_queue_size - + settings.index_queue_size - settings.ttl_queue_size; settings.query_queue_elements = std::max(settings.query_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 30579ba7851b28e..0d700d9303191f4 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -29,6 +29,7 @@ inline static constexpr size_t FILE_CACHE_MAX_FILE_BLOCK_SIZE = 1 * 1024 * 1024; inline static constexpr size_t DEFAULT_NORMAL_PERCENT = 40; inline static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 5; inline static constexpr size_t DEFAULT_INDEX_PERCENT = 5; +inline static constexpr size_t DEFAULT_TTL_PERCENT = 50; using uint128_t = vectorized::UInt128; @@ -107,6 +108,7 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach size_t normal_percent = DEFAULT_NORMAL_PERCENT, size_t disposable_percent = DEFAULT_DISPOSABLE_PERCENT, size_t index_percent = DEFAULT_INDEX_PERCENT, + size_t ttl_percent = DEFAULT_TTL_PERCENT, const std::string& storage = "disk"); struct CacheContext { diff --git a/be/src/io/fs/s3_file_writer.cpp b/be/src/io/fs/s3_file_writer.cpp index e40b9e171eb08f6..7a06ce22074621e 100644 --- a/be/src/io/fs/s3_file_writer.cpp +++ b/be/src/io/fs/s3_file_writer.cpp @@ -379,7 +379,14 @@ Status S3FileWriter::_set_upload_to_remote_less_than_buffer_size() { } void S3FileWriter::_put_object(UploadFileBuffer& buf) { - DCHECK(state() != State::CLOSED) << fmt::format("state is {}", state()); + if (state() == State::CLOSED) { + DCHECK(state() != State::CLOSED) + << "state=" << (int)state() << " path=" << _obj_storage_path_opts.path.native(); + LOG_WARNING("failed to put object because file closed, file path {}", + _obj_storage_path_opts.path.native()); + buf.set_status(Status::InternalError("try to put closed file")); + return; + } const auto& client = 
_obj_client->get(); if (nullptr == client) { buf.set_status(Status::InternalError("invalid obj storage client")); diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index e5ec38738155e52..89886ec8b4bea79 100644 --- a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -1566,6 +1566,10 @@ Status BaseTablet::check_rowid_conversion( VLOG_DEBUG << "check_rowid_conversion, location_map is empty"; return Status::OK(); } + if (!tablet_schema()->cluster_key_idxes().empty()) { + VLOG_DEBUG << "skip check_rowid_conversion for mow tables with cluster keys"; + return Status::OK(); + } std::vector dst_segments; RETURN_IF_ERROR( diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index d9d37d13198bbd8..cd4f89b57ec50d9 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -65,7 +65,9 @@ class BloomFilterColumnPredicate : public ColumnPredicate { uint16_t evaluate(const vectorized::IColumn& column, const uint8_t* null_map, uint16_t* sel, uint16_t size) const { if constexpr (is_nullable) { - DCHECK(null_map); + if (!null_map) { + throw Exception(ErrorCode::INTERNAL_ERROR, "null_map is nullptr"); + } } uint16_t new_size = 0; @@ -91,7 +93,9 @@ class BloomFilterColumnPredicate : public ColumnPredicate { int get_filter_id() const override { int filter_id = _filter->get_filter_id(); - DCHECK(filter_id != -1); + if (filter_id == -1) { + throw Exception(ErrorCode::INTERNAL_ERROR, "filter_id is -1"); + } return filter_id; } diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index a40e28669e90cc7..738087a702f0709 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -613,11 +613,9 @@ Status Compaction::do_inverted_index_compaction() { fs, std::string {InvertedIndexDescriptor::get_index_file_path_prefix(seg_path)}, _cur_tablet_schema->get_inverted_index_storage_format(), rowset->rowset_meta()->inverted_index_file_info(seg_id)); - bool open_idx_file_cache = false; RETURN_NOT_OK_STATUS_WITH_WARN( - inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache), - "inverted_index_file_reader init failed"); + inverted_index_file_reader->init(config::inverted_index_read_buffer_size), + "inverted_index_file_reader init failed"); inverted_index_file_readers[m.second] = std::move(inverted_index_file_reader); } @@ -666,9 +664,11 @@ Status Compaction::do_inverted_index_compaction() { DORIS_TRY(inverted_index_file_readers[src_segment_id]->open(index_meta)); } for (int dest_segment_id = 0; dest_segment_id < dest_segment_num; dest_segment_id++) { - auto* dest_dir = + auto dest_dir = DORIS_TRY(inverted_index_file_writers[dest_segment_id]->open(index_meta)); - dest_index_dirs[dest_segment_id] = dest_dir; + // Destination directories in dest_index_dirs do not need to be destructed, + // but their lifecycle must be managed by inverted_index_file_writers.
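
Both predicate changes above replace debug-only DCHECKs with thrown exceptions, so release builds fail loudly instead of continuing on invalid input. A minimal sketch of that hardening pattern, using std::runtime_error as a stand-in for Doris's Exception/ErrorCode types:

#include <stdexcept>

// Sketch only: stands in for BloomFilterColumnPredicate::get_filter_id().
int checked_filter_id(int raw_id) {
    if (raw_id == -1) {
        // A DCHECK would only fire in debug builds; throwing surfaces the
        // invalid id in release builds as well.
        throw std::runtime_error("filter_id is -1");
    }
    return raw_id;
}
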
+ dest_index_dirs[dest_segment_id] = dest_dir.get(); } auto st = compact_column(index_meta->index_id(), src_idx_dirs, dest_index_dirs, index_tmp_path.native(), trans_vec, dest_segment_num_rows); @@ -783,9 +783,8 @@ void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { InvertedIndexDescriptor::get_index_file_path_prefix(*seg_path)}, _cur_tablet_schema->get_inverted_index_storage_format(), rowset->rowset_meta()->inverted_index_file_info(i)); - bool open_idx_file_cache = false; auto st = inverted_index_file_reader->init( - config::inverted_index_read_buffer_size, open_idx_file_cache); + config::inverted_index_read_buffer_size); index_file_path = inverted_index_file_reader->get_index_file_path(index_meta); DBUG_EXECUTE_IF( "Compaction::construct_skip_inverted_index_index_file_reader_init_" @@ -1127,6 +1126,18 @@ Status CloudCompactionMixin::execute_compact_impl(int64_t permits) { RETURN_IF_ERROR(merge_input_rowsets()); + DBUG_EXECUTE_IF("CloudFullCompaction::modify_rowsets.wrong_rowset_id", { + DCHECK(compaction_type() == ReaderType::READER_FULL_COMPACTION); + RowsetId id; + id.version = 2; + id.hi = _output_rowset->rowset_meta()->rowset_id().hi + ((int64_t)(1) << 56); + id.mi = _output_rowset->rowset_meta()->rowset_id().mi; + id.lo = _output_rowset->rowset_meta()->rowset_id().lo; + _output_rowset->rowset_meta()->set_rowset_id(id); + LOG(INFO) << "[Debug wrong rowset id]:" + << _output_rowset->rowset_meta()->rowset_id().to_string(); + }) + RETURN_IF_ERROR(_engine.meta_mgr().commit_rowset(*_output_rowset->rowset_meta().get())); // 4. modify rowsets in memory diff --git a/be/src/olap/delete_bitmap_calculator.cpp b/be/src/olap/delete_bitmap_calculator.cpp index 6f6e0ec88899542..017e3cff3d0489d 100644 --- a/be/src/olap/delete_bitmap_calculator.cpp +++ b/be/src/olap/delete_bitmap_calculator.cpp @@ -90,8 +90,10 @@ bool MergeIndexDeleteBitmapCalculatorContext::Comparator::operator()( // std::priority_queue is a max heap, and function should return the result of `lhs < rhs` // so if the result of the function is true, rhs will be popped before lhs Slice key1, key2; - RETURN_IF_ERROR(lhs->get_current_key(key1)); - RETURN_IF_ERROR(rhs->get_current_key(key2)); + // MergeIndexDeleteBitmapCalculatorContext::get_current_key may return a non-OK status on + // memory allocation failure; we can only throw an exception here to propagate the error in this situation + THROW_IF_ERROR(lhs->get_current_key(key1)); + THROW_IF_ERROR(rhs->get_current_key(key2)); if (_sequence_length == 0 && _rowid_length == 0) { auto cmp_result = key1.compare(key2); // when key1 is the same as key2, @@ -135,28 +137,30 @@ Status MergeIndexDeleteBitmapCalculator::init(RowsetId rowset_id, std::vector const& segments, size_t seq_col_length, size_t rowdid_length, size_t max_batch_size) { - _rowset_id = rowset_id; - _seq_col_length = seq_col_length; - _rowid_length = rowdid_length; - _comparator = - MergeIndexDeleteBitmapCalculatorContext::Comparator(seq_col_length, _rowid_length); - _contexts.reserve(segments.size()); - _heap = std::make_unique(_comparator); + RETURN_IF_CATCH_EXCEPTION({ + _rowset_id = rowset_id; + _seq_col_length = seq_col_length; + _rowid_length = rowdid_length; + _comparator = + MergeIndexDeleteBitmapCalculatorContext::Comparator(seq_col_length, _rowid_length); + _contexts.reserve(segments.size()); + _heap = std::make_unique(_comparator); - for (auto& segment : segments) { - RETURN_IF_ERROR(segment->load_index()); - auto pk_idx = segment->get_primary_key_index(); - std::unique_ptr index;
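
The Comparator above cannot return a Status (std::priority_queue requires a plain bool predicate), so failures are thrown inside it and converted back to a Status at the API boundary by RETURN_IF_CATCH_EXCEPTION. A self-contained sketch of that round trip, with simplified stand-in types:

#include <stdexcept>
#include <string>
#include <utility>

struct Status {
    bool ok = true;
    std::string msg;
    static Status OK() { return {}; }
    static Status Error(std::string m) { return {false, std::move(m)}; }
};

struct KeyIter {
    // Stub: the real iterator may fail, e.g. on memory allocation.
    Status get_current_key(std::string* key) const {
        *key = "k";
        return Status::OK();
    }
};

// Comparator body: errors can only be thrown, not returned.
bool less_than(const KeyIter& lhs, const KeyIter& rhs) {
    std::string k1, k2;
    Status s1 = lhs.get_current_key(&k1);
    Status s2 = rhs.get_current_key(&k2);
    if (!s1.ok || !s2.ok) {
        throw std::runtime_error(!s1.ok ? s1.msg : s2.msg);
    }
    return k1 < k2;
}

// API boundary: turn the exception back into a Status, which is the role
// RETURN_IF_CATCH_EXCEPTION plays in the patch above.
Status compare_safe(const KeyIter& a, const KeyIter& b, bool* out) {
    try {
        *out = less_than(a, b);
        return Status::OK();
    } catch (const std::exception& e) {
        return Status::Error(e.what());
    }
}
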
RETURN_IF_ERROR(pk_idx->new_iterator(&index)); - auto index_type = vectorized::DataTypeFactory::instance().create_data_type( - pk_idx->type_info()->type(), 1, 0); - _contexts.emplace_back(std::move(index), index_type, segment->id(), pk_idx->num_rows()); - _heap->push(&_contexts.back()); - } - if (_rowid_length > 0) { - _rowid_coder = get_key_coder( - get_scalar_type_info()->type()); - } + for (auto& segment : segments) { + RETURN_IF_ERROR(segment->load_index()); + auto pk_idx = segment->get_primary_key_index(); + std::unique_ptr index; + RETURN_IF_ERROR(pk_idx->new_iterator(&index)); + auto index_type = vectorized::DataTypeFactory::instance().create_data_type( + pk_idx->type_info()->type(), 1, 0); + _contexts.emplace_back(std::move(index), index_type, segment->id(), pk_idx->num_rows()); + _heap->push(&_contexts.back()); + } + if (_rowid_length > 0) { + _rowid_coder = get_key_coder( + get_scalar_type_info()->type()); + } + }); return Status::OK(); } @@ -209,16 +213,18 @@ Status MergeIndexDeleteBitmapCalculator::calculate_one(RowLocation& loc) { } Status MergeIndexDeleteBitmapCalculator::calculate_all(DeleteBitmapPtr delete_bitmap) { - RowLocation loc; - while (true) { - auto st = calculate_one(loc); - if (st.is()) { - break; + RETURN_IF_CATCH_EXCEPTION({ + RowLocation loc; + while (true) { + auto st = calculate_one(loc); + if (st.is()) { + break; + } + RETURN_IF_ERROR(st); + delete_bitmap->add({_rowset_id, loc.segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, + loc.row_id); } - RETURN_IF_ERROR(st); - delete_bitmap->add({_rowset_id, loc.segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, - loc.row_id); - } + }); return Status::OK(); } diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index e0f19b1624df5b1..233c59f0910871a 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -385,8 +385,12 @@ Status MemTable::_sort_by_cluster_keys() { for (int i = 0; i < row_in_blocks.size(); i++) { row_pos_vec.emplace_back(row_in_blocks[i]->_row_pos); } + std::vector column_offset; + for (int i = 0; i < _column_offset.size(); ++i) { + column_offset.emplace_back(i); + } return _output_mutable_block.add_rows(&in_block, row_pos_vec.data(), - row_pos_vec.data() + in_block.rows(), &_column_offset); + row_pos_vec.data() + in_block.rows(), &column_offset); } void MemTable::_sort_one_column(std::vector& row_in_blocks, Tie& tie, diff --git a/be/src/olap/memtable_memory_limiter.cpp b/be/src/olap/memtable_memory_limiter.cpp index 1cb6c0c8e2de046..043ce9967fbe5a4 100644 --- a/be/src/olap/memtable_memory_limiter.cpp +++ b/be/src/olap/memtable_memory_limiter.cpp @@ -141,7 +141,7 @@ void MemTableMemoryLimiter::handle_memtable_flush() { << ", flush: " << PrettyPrinter::print_bytes(_flush_mem_usage); _flush_active_memtables(need_flush); } - } while (_hard_limit_reached()); + } while (_hard_limit_reached() && !_load_usage_low()); g_memtable_memory_limit_waiting_threads << -1; timer.stop(); int64_t time_ms = timer.elapsed_time() / 1000 / 1000; diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index a79434551b5cc11..694b7d6db84d8e1 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -260,8 +260,10 @@ Status Merger::vertical_compact_one_group( } reader_params.tablet_schema = merge_tablet_schema; + bool has_cluster_key = false; if (!tablet->tablet_schema()->cluster_key_idxes().empty()) { reader_params.delete_bitmap = &tablet->tablet_meta()->delete_bitmap(); + has_cluster_key = true; } if (is_key && stats_output && stats_output->rowid_conversion) { @@ -290,7 +292,8 @@ Status 
Merger::vertical_compact_one_group( "failed to read next block when merging rowsets of tablet " + std::to_string(tablet->tablet_id())); RETURN_NOT_OK_STATUS_WITH_WARN( - dst_rowset_writer->add_columns(&block, column_group, is_key, max_rows_per_segment), + dst_rowset_writer->add_columns(&block, column_group, is_key, max_rows_per_segment, + has_cluster_key), "failed to write block when merging rowsets of tablet " + std::to_string(tablet->tablet_id())); diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index d3bd0f0a3a24363..11249bafb1e3c07 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -35,6 +35,7 @@ #include #include +#include "common/config.h" #include "io/io_common.h" #include "olap/olap_define.h" #include "olap/rowset/rowset_fwd.h" @@ -394,6 +395,8 @@ using ColumnId = uint32_t; using UniqueIdSet = std::set; // Column unique Id -> column id map using UniqueIdToColumnIdMap = std::map; +struct RowsetId; +RowsetId next_rowset_id(); // 8 bit rowset id version // 56 bit, inc number from 1 @@ -412,7 +415,12 @@ struct RowsetId { auto [_, ec] = std::from_chars(rowset_id_str.data(), rowset_id_str.data() + rowset_id_str.length(), high); if (ec != std::errc {}) [[unlikely]] { - LOG(FATAL) << "failed to init rowset id: " << rowset_id_str; + if (config::force_regenerate_rowsetid_on_start_error) { + LOG(WARNING) << "failed to init rowset id: " << rowset_id_str; + high = next_rowset_id().hi; + } else { + LOG(FATAL) << "failed to init rowset id: " << rowset_id_str; + } } init(1, high, 0, 0); } else { diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp index a0c5a05636bfa26..736bdaa99304d37 100644 --- a/be/src/olap/olap_server.cpp +++ b/be/src/olap/olap_server.cpp @@ -210,7 +210,7 @@ static int32_t get_single_replica_compaction_threads_num(size_t data_dirs_num) { return threads_num; } -Status StorageEngine::start_bg_threads() { +Status StorageEngine::start_bg_threads(std::shared_ptr wg_sptr) { RETURN_IF_ERROR(Thread::create( "StorageEngine", "unused_rowset_monitor_thread", [this]() { this->_unused_rowset_monitor_thread_callback(); }, @@ -243,29 +243,60 @@ Status StorageEngine::start_bg_threads() { auto single_replica_compaction_threads = get_single_replica_compaction_threads_num(data_dirs.size()); - RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") - .set_min_threads(base_compaction_threads) - .set_max_threads(base_compaction_threads) - .build(&_base_compaction_thread_pool)); - RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") - .set_min_threads(cumu_compaction_threads) - .set_max_threads(cumu_compaction_threads) - .build(&_cumu_compaction_thread_pool)); - RETURN_IF_ERROR(ThreadPoolBuilder("SingleReplicaCompactionTaskThreadPool") - .set_min_threads(single_replica_compaction_threads) - .set_max_threads(single_replica_compaction_threads) - .build(&_single_replica_compaction_thread_pool)); - - if (config::enable_segcompaction) { - RETURN_IF_ERROR(ThreadPoolBuilder("SegCompactionTaskThreadPool") - .set_min_threads(config::segcompaction_num_threads) - .set_max_threads(config::segcompaction_num_threads) - .build(&_seg_compaction_thread_pool)); + if (wg_sptr->get_cgroup_cpu_ctl_wptr().lock()) { + RETURN_IF_ERROR(ThreadPoolBuilder("gBaseCompactionTaskThreadPool") + .set_min_threads(base_compaction_threads) + .set_max_threads(base_compaction_threads) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_base_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("gCumuCompactionTaskThreadPool") + 
.set_min_threads(cumu_compaction_threads) + .set_max_threads(cumu_compaction_threads) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_cumu_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("gSingleReplicaCompactionTaskThreadPool") + .set_min_threads(single_replica_compaction_threads) + .set_max_threads(single_replica_compaction_threads) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_single_replica_compaction_thread_pool)); + + if (config::enable_segcompaction) { + RETURN_IF_ERROR(ThreadPoolBuilder("gSegCompactionTaskThreadPool") + .set_min_threads(config::segcompaction_num_threads) + .set_max_threads(config::segcompaction_num_threads) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_seg_compaction_thread_pool)); + } + RETURN_IF_ERROR(ThreadPoolBuilder("gColdDataCompactionTaskThreadPool") + .set_min_threads(config::cold_data_compaction_thread_num) + .set_max_threads(config::cold_data_compaction_thread_num) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_cold_data_compaction_thread_pool)); + } else { + RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") + .set_min_threads(base_compaction_threads) + .set_max_threads(base_compaction_threads) + .build(&_base_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") + .set_min_threads(cumu_compaction_threads) + .set_max_threads(cumu_compaction_threads) + .build(&_cumu_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("SingleReplicaCompactionTaskThreadPool") + .set_min_threads(single_replica_compaction_threads) + .set_max_threads(single_replica_compaction_threads) + .build(&_single_replica_compaction_thread_pool)); + + if (config::enable_segcompaction) { + RETURN_IF_ERROR(ThreadPoolBuilder("SegCompactionTaskThreadPool") + .set_min_threads(config::segcompaction_num_threads) + .set_max_threads(config::segcompaction_num_threads) + .build(&_seg_compaction_thread_pool)); + } + RETURN_IF_ERROR(ThreadPoolBuilder("ColdDataCompactionTaskThreadPool") + .set_min_threads(config::cold_data_compaction_thread_num) + .set_max_threads(config::cold_data_compaction_thread_num) + .build(&_cold_data_compaction_thread_pool)); } - RETURN_IF_ERROR(ThreadPoolBuilder("ColdDataCompactionTaskThreadPool") - .set_min_threads(config::cold_data_compaction_thread_num) - .set_max_threads(config::cold_data_compaction_thread_num) - .build(&_cold_data_compaction_thread_pool)); // compaction tasks producer thread RETURN_IF_ERROR(Thread::create( diff --git a/be/src/olap/options.cpp b/be/src/olap/options.cpp index 9c500c10993395e..6e4cb61e3d01823 100644 --- a/be/src/olap/options.cpp +++ b/be/src/olap/options.cpp @@ -32,6 +32,7 @@ #include "common/status.h" #include "gutil/strings/split.h" #include "gutil/strings/strip.h" +#include "io/cache/file_cache_common.h" #include "io/fs/local_file_system.h" #include "olap/olap_define.h" #include "olap/utils.h" @@ -56,6 +57,7 @@ static std::string CACHE_QUERY_LIMIT_SIZE = "query_limit"; static std::string CACHE_NORMAL_PERCENT = "normal_percent"; static std::string CACHE_DISPOSABLE_PERCENT = "disposable_percent"; static std::string CACHE_INDEX_PERCENT = "index_percent"; +static std::string CACHE_TTL_PERCENT = "ttl_percent"; static std::string CACHE_STORAGE = "storage"; static std::string CACHE_STORAGE_DISK = "disk"; static std::string CACHE_STORAGE_MEMORY = "memory"; @@ -206,7 +208,7 @@ void parse_conf_broken_store_paths(const string& config_path, std::set 0) + << "found duplicate key or key 
is not sorted! current key: " << key + << ", last max key: " << _max_key; _max_key.clear(); _max_key.append(key.get_data(), key.get_size()); _num_rows++; diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index ad42982488b3166..f84ff964ea30516 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -80,7 +80,7 @@ class RowsetWriter { "RowsetWriter not support add_block"); } virtual Status add_columns(const vectorized::Block* block, const std::vector& col_ids, - bool is_key, uint32_t max_rows_per_segment) { + bool is_key, uint32_t max_rows_per_segment, bool has_cluster_key) { return Status::Error( "RowsetWriter not support add_columns"); } diff --git a/be/src/olap/rowset/segment_creator.cpp b/be/src/olap/rowset/segment_creator.cpp index e0eb7534123a860..c2a4469d97f324e 100644 --- a/be/src/olap/rowset/segment_creator.cpp +++ b/be/src/olap/rowset/segment_creator.cpp @@ -115,7 +115,8 @@ Status SegmentFlusher::close() { bool SegmentFlusher::need_buffering() { // buffering variants for schema change return _context.write_type == DataWriteType::TYPE_SCHEMA_CHANGE && - _context.tablet_schema->num_variant_columns() > 0; + (_context.tablet_schema->num_variant_columns() > 0 || + !_context.tablet_schema->cluster_key_idxes().empty()); } Status SegmentFlusher::_add_rows(std::unique_ptr& segment_writer, diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index e043164ef286337..3b3c6ad3feab923 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -266,8 +266,7 @@ class BinaryPlainPageDecoder : public PageDecoder { auto total = *n; size_t read_count = 0; - _len_array.resize(total); - _start_offset_array.resize(total); + _binary_data.resize(total); for (size_t i = 0; i < total; ++i) { ordinal_t ord = rowids[i] - page_first_ordinal; if (UNLIKELY(ord >= _num_elems)) { @@ -275,14 +274,13 @@ class BinaryPlainPageDecoder : public PageDecoder { } const uint32_t start_offset = offset(ord); - _start_offset_array[read_count] = start_offset; - _len_array[read_count] = offset(ord + 1) - start_offset; + _binary_data[read_count].data = _data.mutable_data() + start_offset; + _binary_data[read_count].size = offset(ord + 1) - start_offset; read_count++; } if (LIKELY(read_count > 0)) { - dst->insert_many_binary_data(_data.mutable_data(), _len_array.data(), - _start_offset_array.data(), read_count); + dst->insert_many_strings(_binary_data.data(), read_count); } *n = read_count; @@ -342,13 +340,11 @@ class BinaryPlainPageDecoder : public PageDecoder { if (idx >= _num_elems) { return _offsets_pos; } - const uint8_t* p = - reinterpret_cast(&_data[_offsets_pos + idx * SIZE_OF_INT32]); - return decode_fixed32_le(p); + return guarded_offset(idx); } uint32_t guarded_offset(size_t idx) const { - const uint8_t* p = + const auto* p = reinterpret_cast(&_data[_offsets_pos + idx * SIZE_OF_INT32]); return decode_fixed32_le(p); } @@ -361,8 +357,7 @@ class BinaryPlainPageDecoder : public PageDecoder { uint32_t _offsets_pos; std::vector _offsets; - std::vector _len_array; - std::vector _start_offset_array; + std::vector _binary_data; // Index of the currently seeked element in the page. 
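
The decoder change above drops the two parallel arrays (_len_array/_start_offset_array) in favor of a single array of pointer/length views that the destination column ingests in one insert_many_strings call. A sketch of that batching, with a hypothetical StringView standing in for StringRef:

#include <cstddef>
#include <cstdint>
#include <vector>

struct StringView {
    const char* data;
    size_t size;
};

// Build one {ptr, size} view per value instead of separate offset and
// length arrays; offsets has num_values + 1 entries (end sentinel).
std::vector<StringView> collect_views(const char* page_data,
                                      const uint32_t* offsets,
                                      size_t num_values) {
    std::vector<StringView> views(num_values);
    for (size_t i = 0; i < num_values; ++i) {
        views[i] = {page_data + offsets[i], offsets[i + 1] - offsets[i]};
    }
    return views; // handed to the column as one contiguous batch
}
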
uint32_t _cur_idx; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index aad3725d5a3f6e1..b96cf4f7e6794ee 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -344,7 +344,7 @@ Status ColumnReader::new_inverted_index_iterator( { std::shared_lock rlock(_load_index_lock); if (_inverted_index) { - RETURN_IF_ERROR(_inverted_index->new_iterator(read_options.stats, + RETURN_IF_ERROR(_inverted_index->new_iterator(read_options.io_ctx, read_options.stats, read_options.runtime_state, iterator)); } } @@ -411,7 +411,7 @@ Status ColumnReader::next_batch_of_zone_map(size_t* n, vectorized::MutableColumn } else { if (is_string) { auto sv = (StringRef*)min_value->cell_ptr(); - dst->insert_many_data(sv->data, sv->size, size); + dst->insert_data_repeatedly(sv->data, sv->size, size); } else { // TODO: the work may cause performance problem, opt latter for (int i = 0; i < size; ++i) { @@ -1508,7 +1508,7 @@ void DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, value.cast_to_date(); int64 = binary_cast(value); - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); break; } case FieldType::OLAP_FIELD_TYPE_DATETIME: { @@ -1526,7 +1526,7 @@ void DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, value.to_datetime(); int64 = binary_cast(value); - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); break; } case FieldType::OLAP_FIELD_TYPE_DECIMAL: { @@ -1538,7 +1538,7 @@ void DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, sizeof(FieldTypeTraits::CppType)); //decimal12_t decimal12_t* d = (decimal12_t*)mem_value; int128 = DecimalV2Value(d->integer, d->fraction).value(); - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); break; } case FieldType::OLAP_FIELD_TYPE_STRING: @@ -1548,7 +1548,7 @@ void DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, case FieldType::OLAP_FIELD_TYPE_AGG_STATE: { char* data_ptr = ((Slice*)mem_value)->data; size_t data_len = ((Slice*)mem_value)->size; - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); break; } case FieldType::OLAP_FIELD_TYPE_ARRAY: { @@ -1566,7 +1566,7 @@ void DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, default: { char* data_ptr = (char*)mem_value; size_t data_len = type_size; - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); } } } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp index fb2479517166fc1..6e9d61db7fddb42 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp @@ -20,8 +20,9 @@ namespace doris::segment_v2 { ConjunctionQuery::ConjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, const io::IOContext* io_ctx) : _searcher(searcher), + _io_ctx(io_ctx), _index_version(_searcher->getReader()->getIndexVersion()), _conjunction_ratio(query_options.inverted_index_conjunction_opt_threshold) {} @@ -48,7 +49,7 @@ void ConjunctionQuery::add(const std::wstring& field_name, const 
std::vectorgetReader()->termDocs(t); + TermDocs* term_doc = _searcher->getReader()->termDocs(t, _io_ctx); _term_docs.push_back(term_doc); iterators.emplace_back(term_doc); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h index 2571392d5294e94..b9bfee2bfb1f7ae 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h @@ -27,7 +27,7 @@ namespace doris::segment_v2 { class ConjunctionQuery : public Query { public: ConjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~ConjunctionQuery() override; void add(const std::wstring& field_name, const std::vector& terms) override; @@ -41,6 +41,7 @@ class ConjunctionQuery : public Query { public: std::shared_ptr _searcher; + const io::IOContext* _io_ctx = nullptr; IndexVersion _index_version = IndexVersion::kV0; int32_t _conjunction_ratio = 1000; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp index 650a88c064611ce..852357073d3b1d0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp @@ -20,8 +20,8 @@ namespace doris::segment_v2 { DisjunctionQuery::DisjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) - : _searcher(searcher) {} + const TQueryOptions& query_options, const io::IOContext* io_ctx) + : _searcher(searcher), _io_ctx(io_ctx) {} void DisjunctionQuery::add(const std::wstring& field_name, const std::vector& terms) { if (terms.empty()) { @@ -36,7 +36,7 @@ void DisjunctionQuery::search(roaring::Roaring& roaring) { auto func = [this, &roaring](const std::string& term, bool first) { std::wstring ws_term = StringUtil::string_to_wstring(term); auto* t = _CLNEW Term(_field_name.c_str(), ws_term.c_str()); - auto* term_doc = _searcher->getReader()->termDocs(t); + auto* term_doc = _searcher->getReader()->termDocs(t, _io_ctx); TermIterator iterator(term_doc); DocRange doc_range; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h index 357831461571c7a..8d0559ee4b0c982 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h @@ -27,7 +27,7 @@ namespace doris::segment_v2 { class DisjunctionQuery : public Query { public: DisjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~DisjunctionQuery() override = default; void add(const std::wstring& field_name, const std::vector& terms) override; @@ -35,6 +35,7 @@ class DisjunctionQuery : public Query { private: std::shared_ptr _searcher; + const io::IOContext* _io_ctx = nullptr; std::wstring _field_name; std::vector _terms; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp index ec1b5bdd9e4d35d..f82433826e95815 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp +++ 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp @@ -30,7 +30,7 @@ namespace doris::segment_v2 { PhraseEdgeQuery::PhraseEdgeQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, const io::IOContext* io_ctx) : _searcher(searcher), _query(std::make_unique()), _max_expansions(query_options.inverted_index_max_expansions) {} diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h index 5daf382e0d08fa7..9eb3bd57c4a9163 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h @@ -31,7 +31,7 @@ namespace doris::segment_v2 { class PhraseEdgeQuery : public Query { public: PhraseEdgeQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~PhraseEdgeQuery() override = default; void add(const std::wstring& field_name, const std::vector& terms) override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp index 407e515dc9212f1..88bb3c1171fa307 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp @@ -23,7 +23,8 @@ namespace doris::segment_v2 { PhrasePrefixQuery::PhrasePrefixQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, + const io::IOContext* io_ctx) : _searcher(searcher), _query(std::make_unique()), _max_expansions(query_options.inverted_index_max_expansions) {} diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h index e565c0409cf4cd1..5cac597951eac78 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h @@ -31,7 +31,7 @@ namespace doris::segment_v2 { class PhrasePrefixQuery : public Query { public: PhrasePrefixQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~PhrasePrefixQuery() override = default; void add(const std::wstring& field_name, const std::vector& terms) override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp index 9a3ecc68f89fa0c..38e60b0f089dc0e 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp @@ -123,8 +123,8 @@ bool OrderedSloppyPhraseMatcher::stretch_to_order(PostingsAndPosition* prev_post } PhraseQuery::PhraseQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) - : _searcher(searcher) {} + const TQueryOptions& query_options, const io::IOContext* io_ctx) + : _searcher(searcher), _io_ctx(io_ctx) {} PhraseQuery::~PhraseQuery() { for (auto& term_doc : _term_docs) { @@ -173,7 +173,7 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vectorgetReader()->termDocs(t); + TermDocs* term_doc = _searcher->getReader()->termDocs(t, _io_ctx); _term_docs.push_back(term_doc); 
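
Each of the query classes above gains a non-owning io::IOContext pointer that is forwarded to termDocs/termPositions, so every posting-list read carries the caller's IO context. A minimal sketch of that plumbing; all types here are hypothetical stand-ins for io::IOContext and the CLucene reader API:

struct IOContext {
    bool is_index_data = false;
};

struct TermDocs {};

struct IndexReader {
    // Stub: a real reader opens a posting list for the term here.
    TermDocs* term_docs(const char* /*term*/, const IOContext* /*io_ctx*/) {
        return nullptr;
    }
};

class Query {
public:
    Query(IndexReader* reader, const IOContext* io_ctx)
            : _reader(reader), _io_ctx(io_ctx) {}

    void add(const char* term) {
        // The context rides along on every posting-list open.
        _docs = _reader->term_docs(term, _io_ctx);
    }

private:
    IndexReader* _reader = nullptr;
    const IOContext* _io_ctx = nullptr; // non-owning; caller outlives the query
    TermDocs* _docs = nullptr;
};
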
_lead1 = TermIterator(term_doc); return; @@ -185,7 +185,7 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vectorgetReader()->termPositions(t); + TermPositions* term_pos = _searcher->getReader()->termPositions(t, _io_ctx); _term_docs.push_back(term_pos); if (is_save_iter) { iterators.emplace_back(term_pos); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h index 35a479ff7f9781d..a2c3a7ae91afcc7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h @@ -87,7 +87,7 @@ using Matcher = std::variant; class PhraseQuery : public Query { public: PhraseQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~PhraseQuery() override; void add(const InvertedIndexQueryInfo& query_info) override; @@ -112,6 +112,7 @@ class PhraseQuery : public Query { private: std::shared_ptr _searcher; + const io::IOContext* _io_ctx = nullptr; TermIterator _lead1; TermIterator _lead2; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h index c295765ec63478e..c0eac69deaeaf37 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h @@ -27,6 +27,7 @@ #include #include "common/status.h" +#include "io/io_common.h" #include "roaring/roaring.hh" CL_NS_USE(index) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp index 007da8289dcdb07..69de4b7818b870c 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp @@ -25,10 +25,10 @@ namespace doris::segment_v2 { RegexpQuery::RegexpQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, const io::IOContext* io_ctx) : _searcher(searcher), _max_expansions(query_options.inverted_index_max_expansions), - _query(searcher, query_options) {} + _query(searcher, query_options, io_ctx) {} void RegexpQuery::add(const std::wstring& field_name, const std::vector& patterns) { if (patterns.size() != 1) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h index 336b2d0b6a671da..650ad2bf10b0029 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h @@ -28,7 +28,7 @@ namespace doris::segment_v2 { class RegexpQuery : public Query { public: RegexpQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~RegexpQuery() override = default; void add(const std::wstring& field_name, const std::vector& patterns) override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp index 88a8f2417228bc3..f988c46c027c268 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp @@ -76,13 +76,6 @@ Status compact_column(int64_t index_id, // when index_writer is destroyed, if 
closeDir is set, dir will be close // _CLDECDELETE(dir) will try to ref_cnt--, when it decreases to 1, dir will be destroyed. _CLDECDELETE(dir) - for (auto* d : dest_index_dirs) { - if (d != nullptr) { - // NOTE: DO NOT close dest dir here, because it will be closed when dest index writer finalize. - //d->close(); - //_CLDELETE(d); - } - } // delete temporary segment_path, only when inverted_index_ram_dir_enable is false if (!config::inverted_index_ram_dir_enable) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp index 7613df112ed9aad..60006ea84550a23 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp @@ -59,6 +59,8 @@ class CSIndexInput : public lucene::store::BufferedIndexInput { CL_NS(store)::IndexInput* base; int64_t fileOffset; int64_t _length; + const io::IOContext* _io_ctx = nullptr; + bool _is_index_file = false; // Indicates if the file is a TII file protected: void readInternal(uint8_t* /*b*/, const int32_t /*len*/) override; @@ -75,6 +77,8 @@ class CSIndexInput : public lucene::store::BufferedIndexInput { const char* getDirectoryType() const override { return DorisCompoundReader::getClassName(); } const char* getObjectName() const override { return getClassName(); } static const char* getClassName() { return "CSIndexInput"; } + void setIoContext(const void* io_ctx) override; + void setIndexFile(bool isIndexFile) override; }; CSIndexInput::CSIndexInput(CL_NS(store)::IndexInput* base, const int64_t fileOffset, @@ -92,9 +96,12 @@ void CSIndexInput::readInternal(uint8_t* b, const int32_t len) { if (start + len > _length) { _CLTHROWA(CL_ERR_IO, "read past EOF"); } + base->setIoContext(_io_ctx); + base->setIndexFile(_is_index_file); base->seek(fileOffset + start); bool read_from_buffer = true; base->readBytes(b, len, read_from_buffer); + base->setIoContext(nullptr); } CSIndexInput::~CSIndexInput() = default; @@ -111,6 +118,14 @@ CSIndexInput::CSIndexInput(const CSIndexInput& clone) : BufferedIndexInput(clone void CSIndexInput::close() {} +void CSIndexInput::setIoContext(const void* io_ctx) { + _io_ctx = static_cast(io_ctx); +} + +void CSIndexInput::setIndexFile(bool isIndexFile) { + _is_index_file = isIndexFile; +} + DorisCompoundReader::DorisCompoundReader(CL_NS(store)::IndexInput* stream, int32_t read_buffer_size) : _ram_dir(new lucene::store::RAMDirectory()), _stream(stream), diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp index e0c75922c98bb20..113833d560fd060 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp @@ -27,10 +27,9 @@ namespace doris::segment_v2 { -Status InvertedIndexFileReader::init(int32_t read_buffer_size, bool open_idx_file_cache) { +Status InvertedIndexFileReader::init(int32_t read_buffer_size) { if (!_inited) { _read_buffer_size = read_buffer_size; - _open_idx_file_cache = open_idx_file_cache; if (_storage_format == InvertedIndexStorageFormatPB::V2) { auto st = _init_from_v2(read_buffer_size); if (!st.ok()) { @@ -76,7 +75,6 @@ Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) { "CLuceneError occur when open idx file {}, error msg: {}", index_file_full_path, err.what()); } - index_input->setIdxFileCache(_open_idx_file_cache); _stream = std::unique_ptr(index_input); // 3. 
read file @@ -198,7 +196,6 @@ Result> InvertedIndexFileReader::_open( } // 3. read file in DorisCompoundReader - index_input->setIdxFileCache(_open_idx_file_cache); compound_reader = std::make_unique(index_input, _read_buffer_size); } catch (CLuceneError& err) { return ResultError(Status::Error( diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h index 8bc28b1882f9d85..3b7161c7643cefe 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h @@ -58,8 +58,7 @@ class InvertedIndexFileReader { _storage_format(storage_format), _idx_file_info(idx_file_info) {} - Status init(int32_t read_buffer_size = config::inverted_index_read_buffer_size, - bool open_idx_file_cache = false); + Status init(int32_t read_buffer_size = config::inverted_index_read_buffer_size); Result> open(const TabletIndex* index_meta) const; void debug_file_entries(); std::string get_index_file_cache_key(const TabletIndex* index_meta) const; @@ -80,7 +79,6 @@ class InvertedIndexFileReader { const io::FileSystemSPtr _fs; std::string _index_path_prefix; int32_t _read_buffer_size = -1; - bool _open_idx_file_cache = false; InvertedIndexStorageFormatPB _storage_format; mutable std::shared_mutex _mutex; // Use mutable for const read operations bool _inited = false; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp index 5599faa351dfd6d..2d50730daffe8a8 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp @@ -19,17 +19,14 @@ #include +#include #include #include "common/status.h" -#include "io/fs/file_writer.h" -#include "io/fs/local_file_system.h" -#include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" #include "olap/rowset/segment_v2/inverted_index_fs_directory.h" #include "olap/rowset/segment_v2/inverted_index_reader.h" #include "olap/tablet_schema.h" -#include "runtime/exec_env.h" namespace doris::segment_v2 { @@ -38,32 +35,11 @@ Status InvertedIndexFileWriter::initialize(InvertedIndexDirectoryMap& indices_di return Status::OK(); } -Result InvertedIndexFileWriter::open(const TabletIndex* index_meta) { - auto tmp_file_dir = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir(); - const auto& local_fs = io::global_local_filesystem(); - auto local_fs_index_path = InvertedIndexDescriptor::get_temporary_index_path( - tmp_file_dir.native(), _rowset_id, _seg_id, index_meta->index_id(), - index_meta->get_index_suffix()); - bool exists = false; - auto st = local_fs->exists(local_fs_index_path, &exists); - DBUG_EXECUTE_IF("InvertedIndexFileWriter::open_local_fs_exists_error", - { st = Status::Error("debug point: no such file error"); }) - if (!st.ok()) { - LOG(ERROR) << "index_path:" << local_fs_index_path << " exists error:" << st; - return ResultError(st); - } - DBUG_EXECUTE_IF("InvertedIndexFileWriter::open_local_fs_exists_true", { exists = true; }) - if (exists) { - LOG(ERROR) << "try to init a directory:" << local_fs_index_path << " already exists"; - return ResultError( - Status::InternalError("InvertedIndexFileWriter::open directory already exists")); - } - - bool can_use_ram_dir = true; - auto* dir = DorisFSDirectoryFactory::getDirectory(local_fs, local_fs_index_path.c_str(), - can_use_ram_dir); - auto key = 
- auto key = std::make_pair(index_meta->index_id(), index_meta->get_index_suffix()); - auto [it, inserted] = _indices_dirs.emplace(key, std::unique_ptr<DorisFSDirectory>(dir)); +Status InvertedIndexFileWriter::_insert_directory_into_map(int64_t index_id, + const std::string& index_suffix, + std::shared_ptr<DorisFSDirectory> dir) { + auto key = std::make_pair(index_id, index_suffix); + auto [it, inserted] = _indices_dirs.emplace(key, std::move(dir)); if (!inserted) { LOG(ERROR) << "InvertedIndexFileWriter::open attempted to insert a duplicate key: (" << key.first << ", " << key.second << ")"; @@ -71,8 +47,23 @@ Result<DorisFSDirectory*> InvertedIndexFileWriter::open(const TabletIndex* index for (const auto& entry : _indices_dirs) { LOG(ERROR) << "Key: (" << entry.first.first << ", " << entry.first.second << ")"; } - return ResultError(Status::InternalError( - "InvertedIndexFileWriter::open attempted to insert a duplicate dir")); + return Status::InternalError( + "InvertedIndexFileWriter::open attempted to insert a duplicate dir"); + } + return Status::OK(); +} + +Result<std::shared_ptr<DorisFSDirectory>> InvertedIndexFileWriter::open( + const TabletIndex* index_meta) { + auto local_fs_index_path = InvertedIndexDescriptor::get_temporary_index_path( + _tmp_dir, _rowset_id, _seg_id, index_meta->index_id(), index_meta->get_index_suffix()); + bool can_use_ram_dir = true; + auto dir = std::shared_ptr<DorisFSDirectory>(DorisFSDirectoryFactory::getDirectory( + _local_fs, local_fs_index_path.c_str(), can_use_ram_dir)); + auto st = + _insert_directory_into_map(index_meta->index_id(), index_meta->get_index_suffix(), dir); + if (!st.ok()) { + return ResultError(st); } return dir; @@ -222,7 +213,7 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire int64_t chunk = bufferLength; while (remainder > 0) { - int64_t len = std::min(std::min(chunk, length), remainder); + int64_t len = std::min({chunk, length, remainder}); input->readBytes(buffer, len); output->writeBytes(buffer, len); remainder -= len; @@ -252,244 +243,326 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire Status InvertedIndexFileWriter::write_v1() { int64_t total_size = 0; + std::string err_msg; + lucene::store::Directory* out_dir = nullptr; + std::exception_ptr eptr; + std::unique_ptr<lucene::store::IndexOutput> output = nullptr; for (const auto& entry : _indices_dirs) { const int64_t index_id = entry.first.first; const auto& index_suffix = entry.first.second; try { - const auto& directory = entry.second; - std::vector<std::string> files; - directory->list(&files); - // remove write.lock file - auto it = std::find(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE); - if (it != files.end()) { - files.erase(it); - } + const auto& directory = entry.second.get(); - std::vector<FileInfo> sorted_files; - for (auto file : files) { - FileInfo file_info; - file_info.filename = file; - file_info.filesize = directory->fileLength(file.c_str()); - sorted_files.emplace_back(std::move(file_info)); - } - sort_files(sorted_files); - - int32_t file_count = sorted_files.size(); - - io::Path cfs_path(InvertedIndexDescriptor::get_index_file_path_v1( - _index_path_prefix, index_id, index_suffix)); - auto idx_path = cfs_path.parent_path(); - std::string idx_name = cfs_path.filename(); - // write file entries to ram directory to get header length - lucene::store::RAMDirectory ram_dir; - auto* out_idx = ram_dir.createOutput(idx_name.c_str()); - DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_ram_output_is_nullptr", - { out_idx = nullptr; }) - if (out_idx == nullptr) { - LOG(WARNING) << "Write compound file error: RAMDirectory output is nullptr."; - _CLTHROWA(CL_ERR_IO, "Create RAMDirectory output error"); - }
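The restructured loop body that follows captures a thrown CLuceneError via std::exception_ptr so that the common cleanup (closing the output, dropping the directory reference) always runs before the error is reported. A compact sketch of that capture-cleanup-report shape (simplified stand-in code, not the Doris functions):

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

void finalize_output() {                 // stands in for finalize_output_dir() + output->close()
    std::cout << "cleanup ran\n";
}

int write_index(bool fail) {
    std::exception_ptr eptr;
    std::string err_msg;
    try {
        if (fail) throw std::runtime_error("simulated CLuceneError");
        // ... write header and file data ...
    } catch (const std::exception& e) {
        eptr = std::current_exception();  // remember the failure, do not rethrow yet
        err_msg = std::string("write_v1 failed: ") + e.what();
    }
    finalize_output();                    // runs on success and failure alike
    if (eptr) {
        std::cerr << err_msg << "\n";     // the real code returns Status::Error(...)
        return -1;
    }
    return 0;
}

int main() { return (write_index(false) == 0 && write_index(true) == -1) ? 0 : 1; }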
+ // Prepare sorted file list + auto sorted_files = prepare_sorted_files(directory); + + // Calculate header length + auto [header_length, header_file_count] = + calculate_header_length(sorted_files, directory); + + // Create output stream + auto result = create_output_stream_v1(index_id, index_suffix); + out_dir = result.first; + output = std::move(result.second); - std::unique_ptr<lucene::store::IndexOutput> ram_output(out_idx); - ram_output->writeVInt(file_count); - // write file entries in ram directory - // number of files, which data are in header - int header_file_count = 0; - int64_t header_file_length = 0; - const int64_t buffer_length = 16384; - uint8_t ram_buffer[buffer_length]; - for (auto file : sorted_files) { - ram_output->writeString(file.filename); // file name - ram_output->writeLong(0); // data offset - ram_output->writeLong(file.filesize); // file length - header_file_length += file.filesize; - if (header_file_length <= DorisFSDirectory::MAX_HEADER_DATA_SIZE) { - copyFile(file.filename.c_str(), directory.get(), ram_output.get(), ram_buffer, - buffer_length); - header_file_count++; - } - } - auto header_len = ram_output->getFilePointer(); - ram_output->close(); - ram_dir.deleteFile(idx_name.c_str()); - ram_dir.close(); - - auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, idx_path.c_str()); - out_dir->set_file_writer_opts(_opts); - - auto* out = out_dir->createOutput(idx_name.c_str()); - DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_out_dir_createOutput_nullptr", - { out = nullptr; }); - if (out == nullptr) { - LOG(WARNING) << "Write compound file error: CompoundDirectory output is nullptr."; - _CLTHROWA(CL_ERR_IO, "Create CompoundDirectory output error"); - } - std::unique_ptr<lucene::store::IndexOutput> output(out); size_t start = output->getFilePointer(); - output->writeVInt(file_count); - // write file entries - int64_t data_offset = header_len; - uint8_t header_buffer[buffer_length]; - for (int i = 0; i < sorted_files.size(); ++i) { - auto file = sorted_files[i]; - output->writeString(file.filename); // FileName - // DataOffset - if (i < header_file_count) { - // file data write in header, so we set its offset to -1.
- output->writeLong(-1); - } else { - output->writeLong(data_offset); - } - output->writeLong(file.filesize); // FileLength - if (i < header_file_count) { - // append data - copyFile(file.filename.c_str(), directory.get(), output.get(), header_buffer, - buffer_length); - } else { - data_offset += file.filesize; - } - } - // write rest files' data - uint8_t data_buffer[buffer_length]; - for (int i = header_file_count; i < sorted_files.size(); ++i) { - auto file = sorted_files[i]; - copyFile(file.filename.c_str(), directory.get(), output.get(), data_buffer, - buffer_length); - } - out_dir->close(); - // NOTE: need to decrease ref count, but not to delete here, - // because index cache may get the same directory from DIRECTORIES - _CLDECDELETE(out_dir) + // Write header and data + write_header_and_data_v1(output.get(), sorted_files, directory, header_length, + header_file_count); + + // Collect file information auto compound_file_size = output->getFilePointer() - start; - output->close(); - //LOG(INFO) << (idx_path / idx_name).c_str() << " size:" << compound_file_size; total_size += compound_file_size; - InvertedIndexFileInfo_IndexInfo index_info; - index_info.set_index_id(index_id); - index_info.set_index_suffix(index_suffix); - index_info.set_index_file_size(compound_file_size); - auto* new_index_info = _file_info.add_index_info(); - *new_index_info = index_info; + add_index_info(index_id, index_suffix, compound_file_size); } catch (CLuceneError& err) { + eptr = std::current_exception(); auto index_path = InvertedIndexDescriptor::get_index_file_path_v1( _index_path_prefix, index_id, index_suffix); - LOG(ERROR) << "CLuceneError occur when write_v1 idx file " << index_path - << " error msg: " << err.what(); + err_msg = "CLuceneError occur when write_v1 idx file " + index_path + + " error msg: " + err.what(); + } - return Status::Error( - "CLuceneError occur when write_v1 idx file: {}, error msg: {}", index_path, - err.what()); + // Close and clean up + finalize_output_dir(out_dir); + if (output) { + output->close(); + } + + if (eptr) { + LOG(ERROR) << err_msg; + return Status::Error(err_msg); } } + _total_file_size = total_size; return Status::OK(); } Status InvertedIndexFileWriter::write_v2() { - io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; - std::unique_ptr compound_file_output; + std::string err_msg; + lucene::store::Directory* out_dir = nullptr; + std::unique_ptr compound_file_output = nullptr; + std::exception_ptr eptr; try { - // Create the output stream to write the compound file + // Calculate header length and initialize offset int64_t current_offset = headerLength(); + // Prepare file metadata + auto file_metadata = prepare_file_metadata_v2(current_offset); - io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; + // Create output stream + auto result = create_output_stream_v2(); + out_dir = result.first; + compound_file_output = std::move(result.second); - auto* out_dir = - DorisFSDirectoryFactory::getDirectory(_fs, index_path.parent_path().c_str()); - out_dir->set_file_writer_opts(_opts); + // Write version and number of indices + write_version_and_indices_count(compound_file_output.get()); - std::unique_ptr compound_file_output; + // Write index headers and file metadata + write_index_headers_and_metadata(compound_file_output.get(), file_metadata); - DCHECK(_idx_v2_writer != nullptr) << "inverted index file writer v2 is nullptr"; - compound_file_output = std::unique_ptr( - 
out_dir->createOutputV2(_idx_v2_writer.get())); + // Copy file data + copy_files_data_v2(compound_file_output.get(), file_metadata); + + _total_file_size = compound_file_output->getFilePointer(); + _file_info.set_index_size(_total_file_size); + } catch (CLuceneError& err) { + eptr = std::current_exception(); + auto index_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); + err_msg = "CLuceneError occur when close idx file " + index_path + + " error msg: " + err.what(); + } - // Write the version number - compound_file_output->writeInt(InvertedIndexStorageFormatPB::V2); + // Close and clean up + finalize_output_dir(out_dir); + if (compound_file_output) { + compound_file_output->close(); + } - // Write the number of indices - const auto numIndices = static_cast(_indices_dirs.size()); - compound_file_output->writeInt(numIndices); + if (eptr) { + LOG(ERROR) << err_msg; + return Status::Error(err_msg); + } - std::vector> - file_metadata; // Store file name, offset, file length, and corresponding directory + return Status::OK(); +} - // First, write all index information and file metadata - for (const auto& entry : _indices_dirs) { - const int64_t index_id = entry.first.first; - const auto& index_suffix = entry.first.second; - const auto& dir = entry.second; - std::vector files; - dir->list(&files); - - auto it = std::find(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE); - if (it != files.end()) { - files.erase(it); - } - // sort file list by file length - std::vector> sorted_files; - for (const auto& file : files) { - sorted_files.emplace_back(file, dir->fileLength(file.c_str())); - } +// Helper function implementations +std::vector InvertedIndexFileWriter::prepare_sorted_files( + lucene::store::Directory* directory) { + std::vector files; + directory->list(&files); + + // Remove write.lock file + files.erase(std::remove(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE), + files.end()); + + std::vector sorted_files; + for (const auto& file : files) { + FileInfo file_info; + file_info.filename = file; + file_info.filesize = directory->fileLength(file.c_str()); + sorted_files.push_back(std::move(file_info)); + } - std::sort( - sorted_files.begin(), sorted_files.end(), - [](const std::pair& a, - const std::pair& b) { return (a.second < b.second); }); - - int32_t file_count = sorted_files.size(); - - // Write the index ID and the number of files - compound_file_output->writeLong(index_id); - compound_file_output->writeInt(static_cast(index_suffix.length())); - compound_file_output->writeBytes(reinterpret_cast(index_suffix.data()), - index_suffix.length()); - compound_file_output->writeInt(file_count); - - // Calculate the offset for each file and write the file metadata - for (const auto& file : sorted_files) { - int64_t file_length = dir->fileLength(file.first.c_str()); - compound_file_output->writeInt(static_cast(file.first.length())); - compound_file_output->writeBytes( - reinterpret_cast(file.first.data()), file.first.length()); - compound_file_output->writeLong(current_offset); - compound_file_output->writeLong(file_length); - - file_metadata.emplace_back(file.first, current_offset, file_length, dir.get()); - current_offset += file_length; // Update the data offset - } + // Sort the files + sort_files(sorted_files); + return sorted_files; +} + +void InvertedIndexFileWriter::finalize_output_dir(lucene::store::Directory* out_dir) { + if (out_dir != nullptr) { + out_dir->close(); + _CLDECDELETE(out_dir) + } +} + +void 
InvertedIndexFileWriter::add_index_info(int64_t index_id, const std::string& index_suffix, + int64_t compound_file_size) { + InvertedIndexFileInfo_IndexInfo index_info; + index_info.set_index_id(index_id); + index_info.set_index_suffix(index_suffix); + index_info.set_index_file_size(compound_file_size); + auto* new_index_info = _file_info.add_index_info(); + *new_index_info = index_info; +} + +std::pair InvertedIndexFileWriter::calculate_header_length( + const std::vector& sorted_files, lucene::store::Directory* directory) { + // Use RAMDirectory to calculate header length + lucene::store::RAMDirectory ram_dir; + auto* out_idx = ram_dir.createOutput("temp_idx"); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::calculate_header_length_ram_output_is_nullptr", + { out_idx = nullptr; }) + if (out_idx == nullptr) { + LOG(WARNING) << "InvertedIndexFileWriter::calculate_header_length error: RAMDirectory " + "output is nullptr."; + _CLTHROWA(CL_ERR_IO, "Create RAMDirectory output error"); + } + std::unique_ptr ram_output(out_idx); + int32_t file_count = sorted_files.size(); + ram_output->writeVInt(file_count); + + int64_t header_file_length = 0; + const int64_t buffer_length = 16384; + uint8_t ram_buffer[buffer_length]; + int32_t header_file_count = 0; + for (const auto& file : sorted_files) { + ram_output->writeString(file.filename); + ram_output->writeLong(0); + ram_output->writeLong(file.filesize); + header_file_length += file.filesize; + + if (header_file_length <= DorisFSDirectory::MAX_HEADER_DATA_SIZE) { + copyFile(file.filename.c_str(), directory, ram_output.get(), ram_buffer, buffer_length); + header_file_count++; } + } - const int64_t buffer_length = 16384; - uint8_t header_buffer[buffer_length]; + int64_t header_length = ram_output->getFilePointer(); + ram_output->close(); + ram_dir.close(); + return {header_length, header_file_count}; +} + +std::pair> +InvertedIndexFileWriter::create_output_stream_v1(int64_t index_id, + const std::string& index_suffix) { + io::Path cfs_path(InvertedIndexDescriptor::get_index_file_path_v1(_index_path_prefix, index_id, + index_suffix)); + auto idx_path = cfs_path.parent_path(); + std::string idx_name = cfs_path.filename(); + + auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, idx_path.c_str()); + out_dir->set_file_writer_opts(_opts); + + auto* out = out_dir->createOutput(idx_name.c_str()); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_out_dir_createOutput_nullptr", + { out = nullptr; }); + if (out == nullptr) { + LOG(WARNING) << "InvertedIndexFileWriter::create_output_stream_v1 error: CompoundDirectory " + "output is nullptr."; + _CLTHROWA(CL_ERR_IO, "Create CompoundDirectory output error"); + } - // Next, write the file data - for (const auto& info : file_metadata) { - const std::string& file = std::get<0>(info); - auto* dir = std::get<3>(info); + std::unique_ptr output(out); + return {out_dir, std::move(output)}; +} - // Write the actual file data - copyFile(file.c_str(), dir, compound_file_output.get(), header_buffer, buffer_length); +void InvertedIndexFileWriter::write_header_and_data_v1(lucene::store::IndexOutput* output, + const std::vector& sorted_files, + lucene::store::Directory* directory, + int64_t header_length, + int32_t header_file_count) { + output->writeVInt(sorted_files.size()); + int64_t data_offset = header_length; + const int64_t buffer_length = 16384; + uint8_t buffer[buffer_length]; + + for (int i = 0; i < sorted_files.size(); ++i) { + auto file = sorted_files[i]; + output->writeString(file.filename); + + // DataOffset + 
if (i < header_file_count) { + // file data write in header, so we set its offset to -1. + output->writeLong(-1); + } else { + output->writeLong(data_offset); + } + output->writeLong(file.filesize); // FileLength + if (i < header_file_count) { + // append data + copyFile(file.filename.c_str(), directory, output, buffer, buffer_length); + } else { + data_offset += file.filesize; } + } - out_dir->close(); - // NOTE: need to decrease ref count, but not to delete here, - // because index cache may get the same directory from DIRECTORIES - _CLDECDELETE(out_dir) - _total_file_size = compound_file_output->getFilePointer(); - compound_file_output->close(); - _file_info.set_index_size(_total_file_size); - } catch (CLuceneError& err) { - LOG(ERROR) << "CLuceneError occur when close idx file " << index_path - << " error msg: " << err.what(); - if (compound_file_output) { - compound_file_output->close(); - compound_file_output.reset(); + for (size_t i = header_file_count; i < sorted_files.size(); ++i) { + copyFile(sorted_files[i].filename.c_str(), directory, output, buffer, buffer_length); + } +} + +std::pair> +InvertedIndexFileWriter::create_output_stream_v2() { + io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; + auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, index_path.parent_path().c_str()); + out_dir->set_file_writer_opts(_opts); + DCHECK(_idx_v2_writer != nullptr) << "inverted index file writer v2 is nullptr"; + auto compound_file_output = std::unique_ptr( + out_dir->createOutputV2(_idx_v2_writer.get())); + return std::make_pair(out_dir, std::move(compound_file_output)); +} + +void InvertedIndexFileWriter::write_version_and_indices_count(lucene::store::IndexOutput* output) { + // Write the version number + output->writeInt(InvertedIndexStorageFormatPB::V2); + + // Write the number of indices + const auto num_indices = static_cast(_indices_dirs.size()); + output->writeInt(num_indices); +} + +std::vector +InvertedIndexFileWriter::prepare_file_metadata_v2(int64_t& current_offset) { + std::vector file_metadata; + + for (const auto& entry : _indices_dirs) { + const int64_t index_id = entry.first.first; + const auto& index_suffix = entry.first.second; + auto* dir = entry.second.get(); + + // Get sorted files + auto sorted_files = prepare_sorted_files(dir); + + for (const auto& file : sorted_files) { + file_metadata.emplace_back(index_id, index_suffix, file.filename, current_offset, + file.filesize, dir); + current_offset += file.filesize; // Update the data offset } - return Status::Error( - "CLuceneError occur when close idx file: {}, error msg: {}", index_path.c_str(), - err.what()); } - return Status::OK(); + return file_metadata; +} + +void InvertedIndexFileWriter::write_index_headers_and_metadata( + lucene::store::IndexOutput* output, const std::vector& file_metadata) { + // Group files by index_id and index_suffix + std::map, std::vector> indices; + + for (const auto& meta : file_metadata) { + indices[{meta.index_id, meta.index_suffix}].push_back(meta); + } + + for (const auto& index_entry : indices) { + int64_t index_id = index_entry.first.first; + const std::string& index_suffix = index_entry.first.second; + const auto& files = index_entry.second; + + // Write the index ID and the number of files + output->writeLong(index_id); + output->writeInt(static_cast(index_suffix.length())); + output->writeBytes(reinterpret_cast(index_suffix.data()), + index_suffix.length()); + output->writeInt(static_cast(files.size())); + + // Write file metadata + 
for (const auto& file : files) { + output->writeInt(static_cast(file.filename.length())); + output->writeBytes(reinterpret_cast(file.filename.data()), + file.filename.length()); + output->writeLong(file.offset); + output->writeLong(file.length); + } + } +} + +void InvertedIndexFileWriter::copy_files_data_v2(lucene::store::IndexOutput* output, + const std::vector& file_metadata) { + const int64_t buffer_length = 16384; + uint8_t buffer[buffer_length]; + + for (const auto& meta : file_metadata) { + copyFile(meta.filename.c_str(), meta.directory, output, buffer, buffer_length); + } } } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h index 31e287d6dd3f711..ddb22975d684670 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h @@ -28,7 +28,9 @@ #include "io/fs/file_system.h" #include "io/fs/file_writer.h" +#include "io/fs/local_file_system.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" +#include "runtime/exec_env.h" namespace doris { class TabletIndex; @@ -36,7 +38,7 @@ class TabletIndex; namespace segment_v2 { class DorisFSDirectory; using InvertedIndexDirectoryMap = - std::map, std::unique_ptr>; + std::map, std::shared_ptr>; class InvertedIndexFileWriter; using InvertedIndexFileWriterPtr = std::unique_ptr; @@ -58,16 +60,19 @@ class InvertedIndexFileWriter { _rowset_id(std::move(rowset_id)), _seg_id(seg_id), _storage_format(storage_format), - _idx_v2_writer(std::move(file_writer)) {} + _local_fs(io::global_local_filesystem()), + _idx_v2_writer(std::move(file_writer)) { + auto tmp_file_dir = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir(); + _tmp_dir = tmp_file_dir.native(); + } - Result open(const TabletIndex* index_meta); + Result> open(const TabletIndex* index_meta); Status delete_index(const TabletIndex* index_meta); Status initialize(InvertedIndexDirectoryMap& indices_dirs); - ~InvertedIndexFileWriter() = default; + virtual ~InvertedIndexFileWriter() = default; Status write_v2(); Status write_v1(); Status close(); - int64_t headerLength(); const InvertedIndexFileInfo* get_index_file_info() const { DCHECK(_closed) << debug_string(); return &_file_info; @@ -77,11 +82,7 @@ class InvertedIndexFileWriter { return _total_file_size; } const io::FileSystemSPtr& get_fs() const { return _fs; } - void sort_files(std::vector& file_infos); - void copyFile(const char* fileName, lucene::store::Directory* dir, - lucene::store::IndexOutput* output, uint8_t* buffer, int64_t bufferLength); InvertedIndexStorageFormatPB get_storage_format() const { return _storage_format; } - void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } std::string debug_string() const { @@ -99,12 +100,61 @@ class InvertedIndexFileWriter { } private: + // Helper functions shared between write_v1 and write_v2 + std::vector prepare_sorted_files(lucene::store::Directory* directory); + void sort_files(std::vector& file_infos); + void copyFile(const char* fileName, lucene::store::Directory* dir, + lucene::store::IndexOutput* output, uint8_t* buffer, int64_t bufferLength); + void finalize_output_dir(lucene::store::Directory* out_dir); + void add_index_info(int64_t index_id, const std::string& index_suffix, + int64_t compound_file_size); + int64_t headerLength(); + // Helper functions specific to write_v1 + std::pair calculate_header_length(const std::vector& sorted_files, + 
lucene::store::Directory* directory); + virtual std::pair<lucene::store::Directory*, std::unique_ptr<lucene::store::IndexOutput>> + create_output_stream_v1(int64_t index_id, const std::string& index_suffix); + virtual void write_header_and_data_v1(lucene::store::IndexOutput* output, + const std::vector<FileInfo>& sorted_files, + lucene::store::Directory* directory, + int64_t header_length, int32_t header_file_count); + // Helper functions specific to write_v2 + virtual std::pair<lucene::store::Directory*, std::unique_ptr<lucene::store::IndexOutput>> + create_output_stream_v2(); + void write_version_and_indices_count(lucene::store::IndexOutput* output); + struct FileMetadata { + int64_t index_id; + std::string index_suffix; + std::string filename; + int64_t offset; + int64_t length; + lucene::store::Directory* directory; + + FileMetadata(int64_t id, const std::string& suffix, const std::string& file, int64_t off, + int64_t len, lucene::store::Directory* dir) + : index_id(id), + index_suffix(suffix), + filename(file), + offset(off), + length(len), + directory(dir) {} + }; + std::vector<FileMetadata> prepare_file_metadata_v2(int64_t& current_offset); + virtual void write_index_headers_and_metadata(lucene::store::IndexOutput* output, + const std::vector<FileMetadata>& file_metadata); + void copy_files_data_v2(lucene::store::IndexOutput* output, + const std::vector<FileMetadata>& file_metadata); + Status _insert_directory_into_map(int64_t index_id, const std::string& index_suffix, + std::shared_ptr<DorisFSDirectory> dir); + // Member variables... InvertedIndexDirectoryMap _indices_dirs; const io::FileSystemSPtr _fs; std::string _index_path_prefix; std::string _rowset_id; int64_t _seg_id; InvertedIndexStorageFormatPB _storage_format; + std::string _tmp_dir; + const std::shared_ptr<io::LocalFileSystem>& _local_fs; // write to disk or stream io::FileWriterPtr _idx_v2_writer = nullptr; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp index ded71c8a6cc73e6..fe0a81c41a6970c 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp @@ -83,39 +83,6 @@ namespace doris::segment_v2 { const char* const DorisFSDirectory::WRITE_LOCK_FILE = "write.lock"; -class DorisFSDirectory::FSIndexOutput : public lucene::store::BufferedIndexOutput { -protected: - void flushBuffer(const uint8_t* b, const int32_t size) override; - -public: - FSIndexOutput() = default; - void init(const io::FileSystemSPtr& fs, const char* path); - ~FSIndexOutput() override; - void close() override; - int64_t length() const override; - - void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } - -private: - io::FileWriterPtr _writer; - io::FileWriterOptions _opts; -}; - -class DorisFSDirectory::FSIndexOutputV2 : public lucene::store::BufferedIndexOutput { -private: - io::FileWriter* _index_v2_file_writer = nullptr; - -protected: - void flushBuffer(const uint8_t* b, const int32_t size) override; - -public: - FSIndexOutputV2() = default; - void init(io::FileWriter* file_writer); - ~FSIndexOutputV2() override; - void close() override; - int64_t length() const override; -}; - bool DorisFSDirectory::FSIndexInput::open(const io::FileSystemSPtr& fs, const char* path, IndexInput*& ret, CLuceneError& error, int32_t buffer_size, int64_t file_size) { @@ -219,6 +186,27 @@ void DorisFSDirectory::FSIndexInput::close() { }*/ } +void DorisFSDirectory::FSIndexInput::setIoContext(const void* io_ctx) { + if (io_ctx) { + const auto& ctx = static_cast<const io::IOContext*>(io_ctx); + _io_ctx.reader_type = ctx->reader_type; + _io_ctx.query_id = ctx->query_id; + _io_ctx.file_cache_stats = ctx->file_cache_stats; + } else 
{ + _io_ctx.reader_type = ReaderType::UNKNOWN; + _io_ctx.query_id = nullptr; + _io_ctx.file_cache_stats = nullptr; + } +} + +const void* DorisFSDirectory::FSIndexInput::getIoContext() { + return &_io_ctx; +} + +void DorisFSDirectory::FSIndexInput::setIndexFile(bool isIndexFile) { + _io_ctx.is_index_data = isIndexFile; +} + void DorisFSDirectory::FSIndexInput::seekInternal(const int64_t position) { CND_PRECONDITION(position >= 0 && position < _handle->_length, "Seeking out of range"); _pos = position; @@ -239,9 +227,23 @@ void DorisFSDirectory::FSIndexInput::readInternal(uint8_t* b, const int32_t len) _handle->_fpos = _pos; } + DBUG_EXECUTE_IF( + "DorisFSDirectory::FSIndexInput::readInternal", ({ + static thread_local std::unordered_map + thread_file_cache_map; + auto it = thread_file_cache_map.find(_io_ctx.query_id); + if (it != thread_file_cache_map.end()) { + if (_io_ctx.file_cache_stats != it->second) { + _CLTHROWA(CL_ERR_IO, "File cache statistics mismatch"); + } + } else { + thread_file_cache_map[_io_ctx.query_id] = _io_ctx.file_cache_stats; + } + })); + Slice result {b, (size_t)len}; size_t bytes_read = 0; - auto st = _handle->_reader->read_at(_pos, result, &bytes_read, &_io_ctx); + Status st = _handle->_reader->read_at(_pos, result, &bytes_read, &_io_ctx); DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexInput::readInternal_reader_read_at_error", { st = Status::InternalError( "debug point: DorisFSDirectory::FSIndexInput::readInternal_reader_read_at_error"); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h index 59ae6db1a9630d3..dde436054cd35b4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h @@ -180,8 +180,6 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput : BufferedIndexInput(buffer_size) { this->_pos = 0; this->_handle = std::move(handle); - this->_io_ctx.reader_type = ReaderType::READER_QUERY; - this->_io_ctx.is_index_data = false; } protected: @@ -199,8 +197,9 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput const char* getDirectoryType() const override { return DorisFSDirectory::getClassName(); } const char* getObjectName() const override { return getClassName(); } static const char* getClassName() { return "FSIndexInput"; } - - void setIdxFileCache(bool index) override { _io_ctx.is_index_data = index; } + void setIoContext(const void* io_ctx) override; + const void* getIoContext() override; + void setIndexFile(bool isIndexFile) override; std::mutex _this_lock; @@ -211,6 +210,39 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput void readInternal(uint8_t* b, const int32_t len) override; }; +class DorisFSDirectory::FSIndexOutput : public lucene::store::BufferedIndexOutput { +protected: + void flushBuffer(const uint8_t* b, const int32_t size) override; + +public: + FSIndexOutput() = default; + void init(const io::FileSystemSPtr& fs, const char* path); + ~FSIndexOutput() override; + void close() override; + int64_t length() const override; + + void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } + +private: + io::FileWriterPtr _writer; + io::FileWriterOptions _opts; +}; + +class DorisFSDirectory::FSIndexOutputV2 : public lucene::store::BufferedIndexOutput { +private: + io::FileWriter* _index_v2_file_writer = nullptr; + +protected: + void flushBuffer(const uint8_t* b, const int32_t size) 
override; + +public: + FSIndexOutputV2() = default; + void init(io::FileWriter* file_writer); + ~FSIndexOutputV2() override; + void close() override; + int64_t length() const override; +}; + /** * Factory function to create DorisFSDirectory */ diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index b7cfe7dfaffb318..889fee1fc87ef9f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -102,7 +102,8 @@ std::string InvertedIndexReader::get_index_file_path() { return _inverted_index_file_reader->get_index_file_path(&_index_meta); } -Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, +Status InvertedIndexReader::read_null_bitmap(const io::IOContext* io_ctx, + OlapReaderStatistics* stats, InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir) { SCOPED_RAW_TIMER(&stats->inverted_index_query_null_bitmap_timer); @@ -120,9 +121,7 @@ Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, if (!dir) { // TODO: ugly code here, try to refact. - bool open_idx_file_cache = true; - auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache); + auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size); if (!st.ok()) { LOG(WARNING) << st; return st; @@ -138,6 +137,7 @@ Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, InvertedIndexDescriptor::get_temporary_null_bitmap_file_name(); if (dir->fileExists(null_bitmap_file_name)) { null_bitmap_in = dir->openInput(null_bitmap_file_name); + null_bitmap_in->setIoContext(io_ctx); size_t null_bitmap_size = null_bitmap_in->length(); faststring buf; buf.resize(null_bitmap_size); @@ -165,7 +165,8 @@ Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, } Status InvertedIndexReader::handle_searcher_cache( - InvertedIndexCacheHandle* inverted_index_cache_handle, OlapReaderStatistics* stats) { + InvertedIndexCacheHandle* inverted_index_cache_handle, const io::IOContext* io_ctx, + OlapReaderStatistics* stats) { auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta); InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); if (InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, @@ -179,9 +180,7 @@ Status InvertedIndexReader::handle_searcher_cache( SCOPED_RAW_TIMER(&stats->inverted_index_searcher_open_timer); IndexSearcherPtr searcher; - bool open_idx_file_cache = true; - auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache); + auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size); if (!st.ok()) { LOG(WARNING) << st; return st; @@ -191,7 +190,7 @@ Status InvertedIndexReader::handle_searcher_cache( // to avoid open directory additionally for null_bitmap // TODO: handle null bitmap procedure in new format. 
InvertedIndexQueryCacheHandle null_bitmap_cache_handle; - static_cast<void>(read_null_bitmap(stats, &null_bitmap_cache_handle, dir.get())); + static_cast<void>(read_null_bitmap(io_ctx, stats, &null_bitmap_cache_handle, dir.get())); RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, mem_tracker.get(), type())); auto* cache_value = new InvertedIndexSearcherCache::CacheValue( std::move(searcher), mem_tracker->consumption(), UnixMillis()); @@ -211,22 +210,21 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir, auto searcher_result = DORIS_TRY(index_searcher_builder->get_index_searcher(dir)); *searcher = searcher_result; - if (std::string(dir->getObjectName()) == "DorisCompoundReader") { - static_cast<DorisCompoundReader*>(dir)->getDorisIndexInput()->setIdxFileCache(false); - } + // NOTE: before mem_tracker hook becomes active, we calculate reader memory size by hand. mem_tracker->consume(index_searcher_builder->get_reader_size()); return Status::OK(); }; Status InvertedIndexReader::match_index_search( - OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, + const io::IOContext* io_ctx, OlapReaderStatistics* stats, RuntimeState* runtime_state, + InvertedIndexQueryType query_type, const InvertedIndexQueryInfo& query_info, + const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr<roaring::Roaring>& term_match_bitmap) { TQueryOptions queryOptions = runtime_state->query_options(); try { SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - auto query = QueryFactory::create(query_type, index_searcher, queryOptions); + auto query = QueryFactory::create(query_type, index_searcher, queryOptions, io_ctx); if (!query) { return Status::Error( "query type " + query_type_to_string(query_type) + ", query is nullptr"); @@ -240,15 +238,17 @@ Status InvertedIndexReader::match_index_search( return Status::OK(); } -Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, +Status FullTextIndexReader::new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr<InvertedIndexIterator>* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); + *iterator = + InvertedIndexIterator::create_unique(io_ctx, stats, runtime_state, shared_from_this()); return Status::OK(); } -Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr<roaring::Roaring>& bit_map) { SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); @@ -314,12 +314,12 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); + RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); searcher_ptr = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant); if (searcher_ptr != nullptr) { term_match_bitmap = std::make_shared<roaring::Roaring>(); - RETURN_IF_ERROR(match_index_search(stats, 
runtime_state, query_type, query_info, + RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, query_type, query_info, *searcher_ptr, term_match_bitmap)); term_match_bitmap->runOptimize(); cache->insert(cache_key, term_match_bitmap, &cache_handler); @@ -337,13 +337,15 @@ InvertedIndexReaderType FullTextIndexReader::type() { } Status StringTypeInvertedIndexReader::new_iterator( - OlapReaderStatistics* stats, RuntimeState* runtime_state, + const io::IOContext& io_ctx, OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); + *iterator = + InvertedIndexIterator::create_unique(io_ctx, stats, runtime_state, shared_from_this()); return Status::OK(); } -Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, +Status StringTypeInvertedIndexReader::query(const io::IOContext* io_ctx, + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, @@ -387,7 +389,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, auto result = std::make_shared(); FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); + RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); searcher_ptr = std::get_if(&searcher_variant); if (searcher_ptr != nullptr) { @@ -396,7 +398,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_ANY_QUERY: case InvertedIndexQueryType::MATCH_ALL_QUERY: case InvertedIndexQueryType::EQUAL_QUERY: { - RETURN_IF_ERROR(match_index_search(stats, runtime_state, + RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, InvertedIndexQueryType::MATCH_ANY_QUERY, query_info, *searcher_ptr, result)); break; @@ -404,8 +406,8 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_PHRASE_QUERY: case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { - RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, - *searcher_ptr, result)); + RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, query_type, + query_info, *searcher_ptr, result)); break; } case InvertedIndexQueryType::LESS_THAN_QUERY: @@ -470,9 +472,11 @@ InvertedIndexReaderType StringTypeInvertedIndexReader::type() { return InvertedIndexReaderType::STRING_TYPE; } -Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, +Status BkdIndexReader::new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); + *iterator = + InvertedIndexIterator::create_unique(io_ctx, stats, runtime_state, shared_from_this()); return Status::OK(); } @@ -600,12 +604,12 @@ Status BkdIndexReader::invoke_bkd_query(const void* query_value, InvertedIndexQu return Status::OK(); } -Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) { +Status 
BkdIndexReader::try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) { try { std::shared_ptr r; - auto st = get_bkd_reader(r, stats); + auto st = get_bkd_reader(r, io_ctx, stats); if (!st.ok()) { LOG(WARNING) << "get bkd reader for " << _inverted_index_file_reader->get_index_file_path(&_index_meta) @@ -637,15 +641,15 @@ Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const std::string& return Status::OK(); } -Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status BkdIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) { SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); try { std::shared_ptr r; - auto st = get_bkd_reader(r, stats); + auto st = get_bkd_reader(r, io_ctx, stats); if (!st.ok()) { LOG(WARNING) << "get bkd reader for " << _inverted_index_file_reader->get_index_file_path(&_index_meta) @@ -681,11 +685,11 @@ Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_ } } -Status BkdIndexReader::get_bkd_reader(BKDIndexSearcherPtr& bkd_reader, +Status BkdIndexReader::get_bkd_reader(BKDIndexSearcherPtr& bkd_reader, const io::IOContext* io_ctx, OlapReaderStatistics* stats) { BKDIndexSearcherPtr* bkd_searcher = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); + RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); bkd_searcher = std::get_if(&searcher_variant); if (bkd_searcher) { @@ -1115,8 +1119,8 @@ Status InvertedIndexIterator::read_from_inverted_index( } } - RETURN_IF_ERROR( - _reader->query(_stats, _runtime_state, column_name, query_value, query_type, bit_map)); + RETURN_IF_ERROR(_reader->query(&_io_ctx, _stats, _runtime_state, column_name, query_value, + query_type, bit_map)); return Status::OK(); } @@ -1130,7 +1134,8 @@ Status InvertedIndexIterator::try_read_from_inverted_index(const std::string& co query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY || query_type == InvertedIndexQueryType::LESS_THAN_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY) { - RETURN_IF_ERROR(_reader->try_query(_stats, column_name, query_value, query_type, count)); + RETURN_IF_ERROR( + _reader->try_query(&_io_ctx, _stats, column_name, query_value, query_type, count)); } return Status::OK(); } @@ -1148,4 +1153,5 @@ template class InvertedIndexVisitor; template class InvertedIndexVisitor; template class InvertedIndexVisitor; template class InvertedIndexVisitor; + } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index d3a0ff3cf118ba7..a14456032866191 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -182,17 +182,18 @@ class InvertedIndexReader : public std::enable_shared_from_this* iterator) = 0; - virtual Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - 
InvertedIndexQueryType query_type, + virtual Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) = 0; - virtual Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) = 0; + virtual Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) = 0; - Status read_null_bitmap(OlapReaderStatistics* stats, + Status read_null_bitmap(const io::IOContext* io_ctx, OlapReaderStatistics* stats, InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir = nullptr); @@ -223,15 +224,15 @@ class InvertedIndexReader : public std::enable_shared_from_this& term_match_bitmap); @@ -253,15 +254,16 @@ class FullTextIndexReader : public InvertedIndexReader { : InvertedIndexReader(index_meta, inverted_index_file_reader) {} ~FullTextIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + Status new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) override; - Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; - Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) override { + Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) override { return Status::Error( "FullTextIndexReader not support try_query"); } @@ -279,15 +281,16 @@ class StringTypeInvertedIndexReader : public InvertedIndexReader { : InvertedIndexReader(index_meta, inverted_index_file_reader) {} ~StringTypeInvertedIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + Status new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) override; - Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; - Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) override { + Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) override { return Status::Error( "StringTypeInvertedIndexReader not support try_query"); } @@ -338,16 +341,17 
@@ class BkdIndexReader : public InvertedIndexReader { : InvertedIndexReader(index_meta, inverted_index_file_reader) {} ~BkdIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + Status new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) override; - Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; - Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) override; + Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) override; Status invoke_bkd_try_query(const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr r, uint32_t* count); Status invoke_bkd_query(const void* query_value, InvertedIndexQueryType query_type, @@ -359,7 +363,8 @@ class BkdIndexReader : public InvertedIndexReader { InvertedIndexVisitor* visitor); InvertedIndexReaderType type() override; - Status get_bkd_reader(BKDIndexSearcherPtr& reader, OlapReaderStatistics* stats); + Status get_bkd_reader(BKDIndexSearcherPtr& reader, const io::IOContext* io_ctx, + OlapReaderStatistics* stats); private: const TypeInfo* _type_info {}; @@ -447,9 +452,12 @@ class InvertedIndexIterator { ENABLE_FACTORY_CREATOR(InvertedIndexIterator); public: - InvertedIndexIterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, - std::shared_ptr reader) - : _stats(stats), _runtime_state(runtime_state), _reader(std::move(reader)) {} + InvertedIndexIterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::shared_ptr reader) + : _io_ctx(io_ctx), + _stats(stats), + _runtime_state(runtime_state), + _reader(std::move(reader)) {} Status read_from_inverted_index(const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t segment_num_rows, @@ -460,7 +468,7 @@ class InvertedIndexIterator { Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir = nullptr) { - return _reader->read_null_bitmap(_stats, cache_handle, dir); + return _reader->read_null_bitmap(&_io_ctx, _stats, cache_handle, dir); } [[nodiscard]] InvertedIndexReaderType get_inverted_index_reader_type() const; @@ -470,6 +478,7 @@ class InvertedIndexIterator { const InvertedIndexReaderPtr& reader() { return _reader; } private: + io::IOContext _io_ctx; OlapReaderStatistics* _stats = nullptr; RuntimeState* _runtime_state = nullptr; std::shared_ptr _reader; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp b/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp index de8b494cd8be6de..5dfbd984813fd88 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp @@ -34,6 +34,9 @@ Status FulltextIndexSearcherBuilder::build(lucene::store::Directory* directory, reader = lucene::index::IndexReader::open( directory, 
config::inverted_index_read_buffer_size, close_directory); } catch (const CLuceneError& e) { + std::vector<std::string> file_names; + directory->list(&file_names); + LOG(ERROR) << fmt::format("Directory list: {}", fmt::join(file_names, ", ")); std::string msg = "FulltextIndexSearcherBuilder build error: " + std::string(e.what()); if (e.number() == CL_ERR_EmptyIndexSegment) { return Status::Error(msg); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 29fe4609e59e9ca..a4f3ca55dd11c0b 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -197,7 +197,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { bool create_index = true; bool close_dir_on_shutdown = true; auto index_writer = std::make_unique<lucene::index::IndexWriter>( - _dir, _analyzer.get(), create_index, close_dir_on_shutdown); + _dir.get(), _analyzer.get(), create_index, close_dir_on_shutdown); DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setRAMBufferSizeMB_error", { index_writer->setRAMBufferSizeMB(-100); }) DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setMaxBufferedDocs_error", @@ -708,7 +708,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { std::unique_ptr _char_string_reader = nullptr; std::shared_ptr<lucene::util::bkd::bkd_writer> _bkd_writer = nullptr; InvertedIndexCtxSPtr _inverted_index_ctx = nullptr; - DorisFSDirectory* _dir = nullptr; + std::shared_ptr<DorisFSDirectory> _dir = nullptr; const KeyCoder* _value_key_coder; const TabletIndex* _index_meta; InvertedIndexParserType _parser_type; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 5b1bfaf076279f0..96b0bea2ae82125 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -2205,17 +2205,23 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { _sel_rowid_idx[i] = i; } + // Here we just use col0 as a row-number indicator. When we reach this point, we calculate the predicates first, + // then use the result to reduce the data we read (that is, expr push down). No rows in the block means the first + // column is not in a common expr, so it is safe to replace it temporarily to provide a correct `selected_size`. if (block->rows() == 0) { vectorized::MutableColumnPtr col0 = std::move(*block->get_by_position(0).column).mutate(); - auto res_column = vectorized::ColumnString::create(); - res_column->insert_data("", 0); - auto col_const = - vectorized::ColumnConst::create(std::move(res_column), selected_size); - block->replace_by_position(0, std::move(col_const)); + // temporarily replace the column with a row-number indicator. Using a ColumnConst is more efficient than + // insert_many_default + auto tmp_indicator_col = + block->get_by_position(0).type->create_column_const_with_default_value( + selected_size); + block->replace_by_position(0, std::move(tmp_indicator_col)); + _output_index_result_column_for_expr(_sel_rowid_idx.data(), selected_size, block); block->shrink_char_type_column_suffix_zero(_char_type_idx_no_0); RETURN_IF_ERROR(_execute_common_expr(_sel_rowid_idx.data(), selected_size, block)); + // now recover the original col0 block->replace_by_position(0, std::move(col0)); } else { _output_index_result_column_for_expr(_sel_rowid_idx.data(), selected_size, block); @@ -2258,8 +2264,10 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { size_t rows = block->rows(); for (const auto& entry : *block) { if (entry.column->size() != rows) { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, "unmatched size {}, expected {}", - entry.column->size(), rows); + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "unmatched size {}, expected {}, column: {}, type: {}", + entry.column->size(), rows, entry.column->get_name(), + entry.type->get_name()); } } #endif diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 09ff3f6ed3be868..fc22c3570e52a2c 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -253,10 +253,10 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& co opts.data_page_size = storage_page_size; } DBUG_EXECUTE_IF("VerticalSegmentWriter._create_column_writer.storage_page_size", { - auto table_id = DebugPoints::instance()->get_debug_param_or_default( + auto table_id = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "table_id", INT_MIN); - auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( + auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "storage_page_size", INT_MIN); if (table_id == INT_MIN || target_data_page_size == INT_MIN) { diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index 12028812f0d92b1..ce16e2d502b6225 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -248,10 +248,10 @@ Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo opts.data_page_size = storage_page_size; } DBUG_EXECUTE_IF("VerticalSegmentWriter._create_column_writer.storage_page_size", { - auto table_id = DebugPoints::instance()->get_debug_param_or_default( + auto table_id = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "table_id", INT_MIN); - auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( + auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "storage_page_size", INT_MIN); if (table_id == INT_MIN || target_data_page_size == INT_MIN) { diff --git a/be/src/olap/rowset/unique_rowset_id_generator.cpp b/be/src/olap/rowset/unique_rowset_id_generator.cpp index 0ac7f63837a0993..49e07e5835957af 100644 --- a/be/src/olap/rowset/unique_rowset_id_generator.cpp +++ 
b/be/src/olap/rowset/unique_rowset_id_generator.cpp @@ -17,8 +17,17 @@ #include "olap/rowset/unique_rowset_id_generator.h" +#include + +#include "olap/storage_engine.h" +#include "runtime/exec_env.h" + namespace doris { +RowsetId next_rowset_id() { + return ExecEnv::GetInstance()->storage_engine().next_rowset_id(); +} + UniqueRowsetIdGenerator::UniqueRowsetIdGenerator(const UniqueId& backend_uid) : _backend_uid(backend_uid), _inc_id(1) {} diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp index 46070f8dccd7ce1..ee9bfd97745c9be 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp @@ -50,7 +50,8 @@ template <class T> requires std::is_base_of_v<BaseBetaRowsetWriter, T> Status VerticalBetaRowsetWriter<T>::add_columns(const vectorized::Block* block, const std::vector<uint32_t>& col_ids, bool is_key, - uint32_t max_rows_per_segment) { + uint32_t max_rows_per_segment, + bool has_cluster_key) { auto& context = this->_context; VLOG_NOTICE << "VerticalBetaRowsetWriter::add_columns, columns: " << block->columns(); @@ -71,7 +72,10 @@ Status VerticalBetaRowsetWriter<T>::add_columns(const vectorized::Block* block, _cur_writer_idx = 0; RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block(block, 0, num_rows)); } else if (is_key) { - if (_segment_writers[_cur_writer_idx]->num_rows_written() > max_rows_per_segment) { + // TODO for cluster key, always create a new segment writer because the primary keys are + // sorted in SegmentWriter::_generate_primary_key_index, which will cause too many segments + if (_segment_writers[_cur_writer_idx]->num_rows_written() > max_rows_per_segment || + has_cluster_key) { // segment is full, need flush columns and create new segment writer RETURN_IF_ERROR(_flush_columns(_segment_writers[_cur_writer_idx].get(), true)); diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.h b/be/src/olap/rowset/vertical_beta_rowset_writer.h index dcb4ae5a8b5d16f..ce756334308fcd1 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.h +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.h @@ -41,7 +41,7 @@ class VerticalBetaRowsetWriter final : public T { ~VerticalBetaRowsetWriter() override = default; Status add_columns(const vectorized::Block* block, const std::vector<uint32_t>& col_ids, - bool is_key, uint32_t max_rows_per_segment) override; + bool is_key, uint32_t max_rows_per_segment, bool has_cluster_key) override; // flush last segment's column Status flush_columns(bool is_key) override; diff --git a/be/src/olap/single_replica_compaction.cpp b/be/src/olap/single_replica_compaction.cpp index 7470afe0ef62c72..458f3949b170171 100644 --- a/be/src/olap/single_replica_compaction.cpp +++ b/be/src/olap/single_replica_compaction.cpp @@ -39,6 +39,7 @@ #include "task/engine_clone_task.h" #include "util/brpc_client_cache.h" #include "util/doris_metrics.h" +#include "util/security.h" #include "util/thrift_rpc_helper.h" #include "util/trace.h" @@ -373,7 +374,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, // then it will try to clone from BE 2, but it will find that file 1 already exists, but file 1 with the same // name may have different versions.
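(Aside: the single_replica_compaction.cpp and engine_clone_task.cpp hunks in this patch drop their duplicated private `_mask_token` helpers in favor of the new `util/security.h` include. Judging from the two removed bodies further down, the shared `mask_token` is presumably equivalent to the sketch below; the exact contents of that header are an assumption, not quoted from the patch.)

```cpp
// Hypothetical reconstruction of mask_token() from be/src/util/security.h,
// inferred from the removed _mask_token() bodies; the real header may differ.
#include <regex>
#include <string>

namespace doris {

inline std::string mask_token(const std::string& str) {
    // Replace the token value in URLs like ".../snapshot/?token=abc-123"
    // with a fixed placeholder before the string is logged.
    static const std::regex pattern("token=[\\w|-]+");
    return std::regex_replace(str, pattern, "token=******");
}

} // namespace doris
```

Moving the helper out of the two classes also lets the download callbacks below drop `this` from their capture lists, which is exactly what the lambda changes in both files do.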
VLOG_DEBUG << "single replica compaction begin to download files, remote path=" - << _mask_token(remote_url_prefix) << " local_path=" << local_path; + << mask_token(remote_url_prefix) << " local_path=" << local_path; RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_path)); RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_path)); @@ -438,10 +439,10 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, std::string local_file_path = local_path + file_name; LOG(INFO) << "single replica compaction begin to download file from: " - << _mask_token(remote_file_url) << " to: " << local_file_path + << mask_token(remote_file_url) << " to: " << local_file_path << ". size(B): " << file_size << ", timeout(s): " << estimate_timeout; - auto download_cb = [this, &remote_file_url, estimate_timeout, &local_file_path, + auto download_cb = [&remote_file_url, estimate_timeout, &local_file_path, file_size](HttpClient* client) { RETURN_IF_ERROR(client->init(remote_file_url)); client->set_timeout_ms(estimate_timeout * 1000); @@ -453,7 +454,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, uint64_t local_file_size = std::filesystem::file_size(local_file_path); if (local_file_size != file_size) { LOG(WARNING) << "download file length error" - << ", remote_path=" << _mask_token(remote_file_url) + << ", remote_path=" << mask_token(remote_file_url) << ", file_size=" << file_size << ", local_file_size=" << local_file_size; return Status::InternalError("downloaded file size is not equal"); @@ -585,9 +586,4 @@ Status SingleReplicaCompaction::_finish_clone(const string& clone_dir, return res; } -std::string SingleReplicaCompaction::_mask_token(const std::string& str) { - std::regex pattern("token=[\\w|-]+"); - return regex_replace(str, pattern, "token=******"); -} - } // namespace doris diff --git a/be/src/olap/single_replica_compaction.h b/be/src/olap/single_replica_compaction.h index 67f5527dd7b3368..10ec65ec3f05707 100644 --- a/be/src/olap/single_replica_compaction.h +++ b/be/src/olap/single_replica_compaction.h @@ -62,7 +62,6 @@ class SingleReplicaCompaction final : public CompactionMixin { const std::string& local_path); Status _release_snapshot(const std::string& ip, int port, const std::string& snapshot_path); Status _finish_clone(const std::string& clone_dir, const Version& version); - std::string _mask_token(const std::string& str); CompactionType _compaction_type; std::vector _pending_rs_guards; diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index 421c0eb352d7128..a22015898988b3d 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -72,6 +72,7 @@ class ReportWorker; class CreateTabletRRIdxCache; struct DirInfo; class SnapshotManager; +class WorkloadGroup; using SegCompactionCandidates = std::vector; using SegCompactionCandidatesSharedPtr = std::shared_ptr; @@ -105,7 +106,7 @@ class BaseStorageEngine { virtual bool stopped() = 0; // start all background threads. This should be call after env is ready. 
- virtual Status start_bg_threads() = 0; + virtual Status start_bg_threads(std::shared_ptr<WorkloadGroup> wg_sptr = nullptr) = 0; virtual Result get_tablet(int64_t tablet_id) = 0; @@ -278,7 +279,7 @@ class StorageEngine final : public BaseStorageEngine { return _default_rowset_type; } - Status start_bg_threads() override; + Status start_bg_threads(std::shared_ptr<WorkloadGroup> wg_sptr = nullptr) override; // clear trash and snapshot file // option: update disk usage after sweep diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 13bbdaa9389faae..4005c818bc5023c 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -1198,7 +1198,7 @@ void DeleteBitmap::remove_stale_delete_bitmap_from_queue(const std::vector<std::string> std::vector> to_delete; - auto tablet_id = -1; + int64_t tablet_id = -1; for (auto& version_str : vector) { auto it = _stale_delete_bitmap.find(version_str); if (it != _stale_delete_bitmap.end()) { diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index fc3a69fd5cde52c..75cbcf68e956c19 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -32,7 +32,6 @@ #include #include #include -#include <regex> #include #include #include @@ -64,6 +63,7 @@ #include "util/debug_points.h" #include "util/defer_op.h" #include "util/network_util.h" +#include "util/security.h" #include "util/stopwatch.hpp" #include "util/thrift_rpc_helper.h" #include "util/trace.h" @@ -415,7 +415,7 @@ Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir, status = _download_files(&data_dir, remote_url_prefix, local_data_path); if (!status.ok()) [[unlikely]] { LOG_WARNING("failed to download snapshot from remote BE") - .tag("url", _mask_token(remote_url_prefix)) + .tag("url", mask_token(remote_url_prefix)) .error(status); continue; // Try another BE } @@ -552,11 +552,11 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re std::string local_file_path = local_path + "/" + file_name; - LOG(INFO) << "clone begin to download file from: " << _mask_token(remote_file_url) + LOG(INFO) << "clone begin to download file from: " << mask_token(remote_file_url) << " to: " << local_file_path << ". size(B): " << file_size << ", timeout(s): " << estimate_timeout; - auto download_cb = [this, &remote_file_url, estimate_timeout, &local_file_path, + auto download_cb = [&remote_file_url, estimate_timeout, &local_file_path, file_size](HttpClient* client) { RETURN_IF_ERROR(client->init(remote_file_url)); client->set_timeout_ms(estimate_timeout * 1000); @@ -572,7 +572,7 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re } if (local_file_size != file_size) { LOG(WARNING) << "download file length error" - << ", remote_path=" << _mask_token(remote_file_url) + << ", remote_path=" << mask_token(remote_file_url) << ", file_size=" << file_size << ", local_file_size=" << local_file_size; return Status::InternalError("downloaded file size is not equal"); @@ -600,7 +600,7 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re } /// This method will only be called if tablet already exists in this BE when doing clone. /// This method will do the following things: -/// 1. Linke all files from CLONE dir to tablet dir if file does not exist in tablet dir +/// 1. Link all files from CLONE dir to tablet dir if file does not exist in tablet dir /// 2. Call _finish_xx_clone() to revise the tablet meta.
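(Aside: as a rough illustration of step 1 above — and only under the stated assumption that it amounts to hard-linking whatever the CLONE dir holds while keeping files the tablet dir already has — a `std::filesystem` sketch follows; the real code goes through Doris' `io::global_local_filesystem()` wrapper and returns `Status`.)

```cpp
// Minimal sketch of "link all files from CLONE dir to tablet dir if the file
// does not exist in the tablet dir"; names and error handling are hypothetical.
#include <filesystem>

namespace fs = std::filesystem;

void link_clone_files(const fs::path& clone_dir, const fs::path& tablet_dir) {
    for (const auto& entry : fs::directory_iterator(clone_dir)) {
        const fs::path target = tablet_dir / entry.path().filename();
        if (fs::exists(target)) {
            // Keep the tablet's existing file: the same name may carry a
            // different version, so it must not be overwritten.
            continue;
        }
        fs::create_hard_link(entry.path(), target);
    }
}
```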
Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_dir, int64_t version, bool is_incremental_clone) { @@ -864,9 +864,4 @@ Status EngineCloneTask::_finish_full_clone(Tablet* tablet, // TODO(plat1ko): write cooldown meta to remote if this replica is cooldown replica } -std::string EngineCloneTask::_mask_token(const std::string& str) { - std::regex pattern("token=[\\w|-]+"); - return regex_replace(str, pattern, "token=******"); -} - } // namespace doris diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h index 9290ed9552ecf9b..a11d4c742f4bcca 100644 --- a/be/src/olap/task/engine_clone_task.h +++ b/be/src/olap/task/engine_clone_task.h @@ -86,8 +86,6 @@ class EngineCloneTask final : public EngineTask { Status _release_snapshot(const std::string& ip, int port, const std::string& snapshot_path); - std::string _mask_token(const std::string& str); - private: StorageEngine& _engine; const TCloneReq& _clone_req; diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index 4cc3aceaeebdfae..f7712625d3e9a67 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -571,14 +571,12 @@ struct AnalyticSharedState : public BasicSharedState { int64_t current_row_position = 0; BlockRowPos partition_by_end; - vectorized::VExprContextSPtrs partition_by_eq_expr_ctxs; int64_t input_total_rows = 0; BlockRowPos all_block_end; std::vector input_blocks; bool input_eos = false; BlockRowPos found_partition_end; std::vector origin_cols; - vectorized::VExprContextSPtrs order_by_eq_expr_ctxs; std::vector input_block_first_row_positions; std::vector> agg_input_columns; diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index abde34a1d0255bc..377aeb6fa12be4d 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -60,15 +60,14 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { _agg_expr_ctxs[i][j]->root()->data_type()->create_column(); } } - _shared_state->partition_by_eq_expr_ctxs.resize(p._partition_by_eq_expr_ctxs.size()); - for (size_t i = 0; i < _shared_state->partition_by_eq_expr_ctxs.size(); i++) { - RETURN_IF_ERROR(p._partition_by_eq_expr_ctxs[i]->clone( - state, _shared_state->partition_by_eq_expr_ctxs[i])); - } - _shared_state->order_by_eq_expr_ctxs.resize(p._order_by_eq_expr_ctxs.size()); - for (size_t i = 0; i < _shared_state->order_by_eq_expr_ctxs.size(); i++) { + _partition_by_eq_expr_ctxs.resize(p._partition_by_eq_expr_ctxs.size()); + for (size_t i = 0; i < _partition_by_eq_expr_ctxs.size(); i++) { RETURN_IF_ERROR( - p._order_by_eq_expr_ctxs[i]->clone(state, _shared_state->order_by_eq_expr_ctxs[i])); + p._partition_by_eq_expr_ctxs[i]->clone(state, _partition_by_eq_expr_ctxs[i])); + } + _order_by_eq_expr_ctxs.resize(p._order_by_eq_expr_ctxs.size()); + for (size_t i = 0; i < _order_by_eq_expr_ctxs.size(); i++) { + RETURN_IF_ERROR(p._order_by_eq_expr_ctxs[i]->clone(state, _order_by_eq_expr_ctxs[i])); } return Status::OK(); } @@ -80,11 +79,11 @@ bool AnalyticSinkLocalState::_whether_need_next_partition(BlockRowPos& found_par shared_state.partition_by_end.pos)) { //now still have partition data return false; } - if ((shared_state.partition_by_eq_expr_ctxs.empty() && !shared_state.input_eos) || + if ((_partition_by_eq_expr_ctxs.empty() && !shared_state.input_eos) || (found_partition_end.pos == 0)) { //no partition, get until fetch to EOS return true; } - if 
(!shared_state.partition_by_eq_expr_ctxs.empty() && + if (!_partition_by_eq_expr_ctxs.empty() && found_partition_end.pos == shared_state.all_block_end.pos && !shared_state.input_eos) { //current partition data calculate done return true; @@ -177,13 +176,13 @@ BlockRowPos AnalyticSinkLocalState::_get_partition_by_end() { return shared_state.partition_by_end; } - if (shared_state.partition_by_eq_expr_ctxs.empty() || + if (_partition_by_eq_expr_ctxs.empty() || (shared_state.input_total_rows == 0)) { //no partition_by, the all block is end return shared_state.all_block_end; } BlockRowPos cal_end = shared_state.all_block_end; - for (size_t i = 0; i < shared_state.partition_by_eq_expr_ctxs.size(); + for (size_t i = 0; i < _partition_by_eq_expr_ctxs.size(); ++i) { //have partition_by, binary search the partition end cal_end = _compare_row_to_find_end(shared_state.partition_by_column_idxs[i], shared_state.partition_by_end, cal_end); @@ -303,10 +302,10 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block } { SCOPED_TIMER(local_state._compute_partition_by_timer); - for (size_t i = 0; i < local_state._shared_state->partition_by_eq_expr_ctxs.size(); ++i) { + for (size_t i = 0; i < local_state._partition_by_eq_expr_ctxs.size(); ++i) { int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->partition_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); + RETURN_IF_ERROR(local_state._partition_by_eq_expr_ctxs[i]->execute(input_block, + &result_col_id)); DCHECK_GE(result_col_id, 0); local_state._shared_state->partition_by_column_idxs[i] = result_col_id; } @@ -314,10 +313,10 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block { SCOPED_TIMER(local_state._compute_order_by_timer); - for (size_t i = 0; i < local_state._shared_state->order_by_eq_expr_ctxs.size(); ++i) { + for (size_t i = 0; i < local_state._order_by_eq_expr_ctxs.size(); ++i) { int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->order_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); + RETURN_IF_ERROR( + local_state._order_by_eq_expr_ctxs[i]->execute(input_block, &result_col_id)); DCHECK_GE(result_col_id, 0); local_state._shared_state->ordey_by_column_idxs[i] = result_col_id; } diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index e04b220ee351e7f..0ff7c4e4e047bd4 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -63,6 +63,8 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState<AnalyticSharedState> { std::vector<vectorized::VExprContextSPtrs> _agg_expr_ctxs; + vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; + vectorized::VExprContextSPtrs _order_by_eq_expr_ctxs; }; class AnalyticSinkOperatorX final : public DataSinkOperatorX<AnalyticSinkLocalState> { diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index 2e041ab45d20bf4..3a9156f45b6758a 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -122,13 +122,15 @@ BlockRowPos AnalyticLocalState::_get_partition_by_end() { return shared_state.partition_by_end; } - if (shared_state.partition_by_eq_expr_ctxs.empty() || + const auto partition_exprs_size = + _parent->cast<AnalyticSourceOperatorX>()._partition_exprs_size; + if (partition_exprs_size == 0 || (shared_state.input_total_rows == 0)) { //no partition_by, the all block is end return shared_state.all_block_end; } BlockRowPos cal_end = shared_state.all_block_end; - for (size_t i =
0; i < shared_state.partition_by_eq_expr_ctxs.size(); + for (size_t i = 0; i < partition_exprs_size; ++i) { //have partition_by, binary search the partition end cal_end = _compare_row_to_find_end(shared_state.partition_by_column_idxs[i], shared_state.partition_by_end, cal_end); @@ -144,12 +146,13 @@ bool AnalyticLocalState::_whether_need_next_partition(BlockRowPos& found_partiti shared_state.partition_by_end.pos)) { //now still have partition data return false; } - if ((shared_state.partition_by_eq_expr_ctxs.empty() && !shared_state.input_eos) || + const auto partition_exprs_size = + _parent->cast<AnalyticSourceOperatorX>()._partition_exprs_size; + if ((partition_exprs_size == 0 && !shared_state.input_eos) || (found_partition_end.pos == 0)) { //no partition, get until fetch to EOS return true; } - if (!shared_state.partition_by_eq_expr_ctxs.empty() && - found_partition_end.pos == shared_state.all_block_end.pos && + if (partition_exprs_size != 0 && found_partition_end.pos == shared_state.all_block_end.pos && !shared_state.input_eos) { //current partition data calculate done return true; } @@ -401,7 +404,7 @@ Status AnalyticLocalState::_get_next_for_range(size_t current_block_rows) { void AnalyticLocalState::_update_order_by_range() { _order_by_start = _order_by_end; _order_by_end = _shared_state->partition_by_end; - for (size_t i = 0; i < _shared_state->order_by_eq_expr_ctxs.size(); ++i) { + for (size_t i = 0; i < _parent->cast<AnalyticSourceOperatorX>()._order_by_exprs_size; ++i) { _order_by_end = _compare_row_to_find_end(_shared_state->ordey_by_column_idxs[i], _order_by_start, _order_by_end, true); } @@ -476,7 +479,9 @@ AnalyticSourceOperatorX::AnalyticSourceOperatorX(ObjectPool* pool, const TPlanNo _has_window(tnode.analytic_node.__isset.window), _has_range_window(tnode.analytic_node.window.type == TAnalyticWindowType::RANGE), _has_window_start(tnode.analytic_node.window.__isset.window_start), - _has_window_end(tnode.analytic_node.window.__isset.window_end) { + _has_window_end(tnode.analytic_node.window.__isset.window_end), + _partition_exprs_size(tnode.analytic_node.partition_exprs.size()), + _order_by_exprs_size(tnode.analytic_node.order_by_exprs.size()) { _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; _fn_scope = AnalyticFnScope::PARTITION; if (tnode.analytic_node.__isset.window && diff --git a/be/src/pipeline/exec/analytic_source_operator.h b/be/src/pipeline/exec/analytic_source_operator.h index 8f44b77f567e559..56c664cec6193b1 100644 --- a/be/src/pipeline/exec/analytic_source_operator.h +++ b/be/src/pipeline/exec/analytic_source_operator.h @@ -150,6 +150,8 @@ class AnalyticSourceOperatorX final : public OperatorX<AnalyticLocalState> { size_t _align_aggregate_states = 1; std::vector _change_to_nullable_flags; + const size_t _partition_exprs_size; + const size_t _order_by_exprs_size; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/exchange_sink_operator.cpp b/be/src/pipeline/exec/exchange_sink_operator.cpp index 1f91af01aa1f6bb..85f58417197d52e 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.cpp +++ b/be/src/pipeline/exec/exchange_sink_operator.cpp @@ -59,6 +59,7 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _local_send_timer = ADD_TIMER(_profile, "LocalSendTime"); _split_block_hash_compute_timer = ADD_TIMER(_profile, "SplitBlockHashComputeTime"); _distribute_rows_into_channels_timer = ADD_TIMER(_profile, "DistributeRowsIntoChannelsTime"); + _send_new_partition_timer = ADD_TIMER(_profile, "SendNewPartitionTime"); _blocks_sent_counter =
ADD_COUNTER_WITH_LEVEL(_profile, "BlocksProduced", TUnit::UNIT, 1); _overall_throughput = _profile->add_derived_counter( "OverallThroughput", TUnit::BYTES_PER_SECOND, @@ -275,23 +276,14 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { return Status::OK(); } -Status ExchangeSinkLocalState::_send_new_partition_batch() { - if (_row_distribution.need_deal_batching()) { // maybe try_close more than 1 time - RETURN_IF_ERROR(_row_distribution.automatic_create_partition()); - vectorized::Block tmp_block = - _row_distribution._batching_block->to_block(); // Borrow out, for lval ref - auto& p = _parent->cast<ExchangeSinkOperatorX>(); - // these order is unique. - // 1. clear batching stats(and flag goes true) so that we won't make a new batching process in dealing batched block. - // 2. deal batched block - // 3. now reuse the column of lval block. cuz write doesn't real adjust it. it generate a new block from that. - _row_distribution.clear_batching_stats(); - RETURN_IF_ERROR(p.sink(_state, &tmp_block, false)); - // Recovery back - _row_distribution._batching_block->set_mutable_columns(tmp_block.mutate_columns()); - _row_distribution._batching_block->clear_column_data(); - _row_distribution._deal_batched = false; - } +Status ExchangeSinkLocalState::_send_new_partition_batch(vectorized::Block* input_block) { + RETURN_IF_ERROR(_row_distribution.automatic_create_partition()); + auto& p = _parent->cast<ExchangeSinkOperatorX>(); + // Reset the batching state, then re-sink the input block + _row_distribution.clear_batching_stats(); + _row_distribution._batching_block->clear_column_data(); + _row_distribution._deal_batched = false; + RETURN_IF_ERROR(p.sink(_state, input_block, false)); return Status::OK(); } @@ -521,7 +513,6 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block old_channel_mem_usage += channel->mem_usage(); } // check out of limit - RETURN_IF_ERROR(local_state._send_new_partition_batch()); std::shared_ptr<vectorized::Block> convert_block = std::make_shared<vectorized::Block>(); const auto& num_channels = local_state._partition_count; std::vector<std::vector<uint32_t>> channel2rows; @@ -536,21 +527,21 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block RETURN_IF_ERROR(local_state._row_distribution.generate_rows_distribution( *block, convert_block, filtered_rows, has_filtered_rows, local_state._row_part_tablet_ids, local_state._number_input_rows)); - - const auto& row_ids = local_state._row_part_tablet_ids[0].row_ids; - const auto& tablet_ids = local_state._row_part_tablet_ids[0].tablet_ids; - for (int idx = 0; idx < row_ids.size(); ++idx) { - const auto& row = row_ids[idx]; - const auto& tablet_id_hash = - HashUtil::zlib_crc_hash(&tablet_ids[idx], sizeof(int64), 0); - channel2rows[tablet_id_hash % num_channels].emplace_back(row); + if (local_state._row_distribution.batching_rows() > 0) { + SCOPED_TIMER(local_state._send_new_partition_timer); + RETURN_IF_ERROR(local_state._send_new_partition_batch(block)); + } else { + const auto& row_ids = local_state._row_part_tablet_ids[0].row_ids; + const auto& tablet_ids = local_state._row_part_tablet_ids[0].tablet_ids; + for (int idx = 0; idx < row_ids.size(); ++idx) { + const auto& row = row_ids[idx]; + const auto& tablet_id_hash = + HashUtil::zlib_crc_hash(&tablet_ids[idx], sizeof(int64), 0); + channel2rows[tablet_id_hash % num_channels].emplace_back(row); + } } } - if (eos) { - local_state._row_distribution._deal_batched = true; - RETURN_IF_ERROR(local_state._send_new_partition_batch()); - } { SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); // the convert_block may be different from the block after executing
exprs diff --git a/be/src/pipeline/exec/exchange_sink_operator.h b/be/src/pipeline/exec/exchange_sink_operator.h index 63d502900054703..91ee1bd27a63e79 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.h +++ b/be/src/pipeline/exec/exchange_sink_operator.h @@ -96,7 +96,7 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { static Status empty_callback_function(void* sender, TCreatePartitionResult* result) { return Status::OK(); } - Status _send_new_partition_batch(); + Status _send_new_partition_batch(vectorized::Block* input_block); std::vector> channels; int current_channel_idx {0}; // index of current channel to send to if _random == true bool only_local_exchange {false}; @@ -127,6 +127,7 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { // Used to counter send bytes under local data exchange RuntimeProfile::Counter* _local_bytes_send_counter = nullptr; RuntimeProfile::Counter* _merge_block_timer = nullptr; + RuntimeProfile::Counter* _send_new_partition_timer = nullptr; RuntimeProfile::Counter* _wait_queue_timer = nullptr; RuntimeProfile::Counter* _wait_broadcast_buffer_timer = nullptr; diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 37de9ac93d839f5..bf177742ab35638 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -92,8 +92,7 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _runtime_filters.resize(p._runtime_filter_descs.size()); for (size_t i = 0; i < p._runtime_filter_descs.size(); i++) { RETURN_IF_ERROR(state->register_producer_runtime_filter( - p._runtime_filter_descs[i], p._need_local_merge, &_runtime_filters[i], - _build_expr_ctxs.size() == 1)); + p._runtime_filter_descs[i], &_runtime_filters[i], _build_expr_ctxs.size() == 1)); } _runtime_filter_slots = @@ -157,7 +156,7 @@ Status HashJoinBuildSinkLocalState::close(RuntimeState* state, Status exec_statu } } SCOPED_TIMER(_publish_runtime_filter_timer); - RETURN_IF_ERROR(_runtime_filter_slots->publish(!_should_build_hash_table)); + RETURN_IF_ERROR(_runtime_filter_slots->publish(state, !_should_build_hash_table)); return Base::close(state, exec_status); } @@ -353,8 +352,7 @@ Status HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { HashJoinBuildSinkOperatorX::HashJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, - bool need_local_merge) + const DescriptorTbl& descs) : JoinBuildSinkOperatorX(pool, operator_id, tnode, descs), _join_distribution(tnode.hash_join_node.__isset.dist_type ? tnode.hash_join_node.dist_type : TJoinDistributionType::NONE), @@ -362,8 +360,7 @@ HashJoinBuildSinkOperatorX::HashJoinBuildSinkOperatorX(ObjectPool* pool, int ope tnode.hash_join_node.is_broadcast_join), _partition_exprs(tnode.__isset.distribute_expr_lists && !_is_broadcast_join ? 
tnode.distribute_expr_lists[1] - : std::vector {}), - _need_local_merge(need_local_merge) {} + : std::vector {}) {} Status HashJoinBuildSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(JoinBuildSinkOperatorX::init(tnode, state)); diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index 45aa1e8c8a262dc..d905afa27582fbb 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -106,7 +106,7 @@ class HashJoinBuildSinkOperatorX final : public JoinBuildSinkOperatorX { public: HashJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, bool use_global_rf); + const DescriptorTbl& descs); Status init(const TDataSink& tsink) override { return Status::InternalError("{} should not init with TDataSink", JoinBuildSinkOperatorX::_name); @@ -163,8 +163,6 @@ class HashJoinBuildSinkOperatorX final vectorized::SharedHashTableContextPtr _shared_hash_table_context = nullptr; const std::vector _partition_exprs; - const bool _need_local_merge; - std::vector _hash_output_slot_ids; std::vector _should_keep_column_flags; bool _should_keep_hash_key_column = false; diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp index 83b378e792c3fa3..9e3e8a08ca83a55 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp @@ -43,7 +43,7 @@ struct RuntimeFilterBuild { } { SCOPED_TIMER(_parent->publish_runtime_filter_timer()); - RETURN_IF_ERROR(runtime_filter_slots.publish()); + RETURN_IF_ERROR(runtime_filter_slots.publish(state)); } return Status::OK(); @@ -66,8 +66,8 @@ Status NestedLoopJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkSta _shared_state->join_op_variants = p._join_op_variants; _runtime_filters.resize(p._runtime_filter_descs.size()); for (size_t i = 0; i < p._runtime_filter_descs.size(); i++) { - RETURN_IF_ERROR(state->register_producer_runtime_filter( - p._runtime_filter_descs[i], p._need_local_merge, &_runtime_filters[i], false)); + RETURN_IF_ERROR(state->register_producer_runtime_filter(p._runtime_filter_descs[i], + &_runtime_filters[i], false)); } return Status::OK(); } @@ -87,11 +87,9 @@ Status NestedLoopJoinBuildSinkLocalState::open(RuntimeState* state) { NestedLoopJoinBuildSinkOperatorX::NestedLoopJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, - bool need_local_merge) + const DescriptorTbl& descs) : JoinBuildSinkOperatorX(pool, operator_id, tnode, descs), - _need_local_merge(need_local_merge), _is_output_left_side_only(tnode.nested_loop_join_node.__isset.is_output_left_side_only && tnode.nested_loop_join_node.is_output_left_side_only), _row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples) {} diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.h b/be/src/pipeline/exec/nested_loop_join_build_operator.h index d6e72799f97d92d..5c41088a7059d4a 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.h +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.h @@ -59,7 +59,7 @@ class NestedLoopJoinBuildSinkOperatorX final : public JoinBuildSinkOperatorX { public: NestedLoopJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, bool need_local_merge); + const DescriptorTbl& descs); Status init(const TDataSink& 
tsink) override { return Status::InternalError( "{} should not init with TDataSink", @@ -85,7 +85,6 @@ class NestedLoopJoinBuildSinkOperatorX final vectorized::VExprContextSPtrs _filter_src_expr_ctxs; - bool _need_local_merge; const bool _is_output_left_side_only; RowDescriptor _row_descriptor; }; diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp index afa1a2e59b798ce..f4f4ef21ece7464 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp @@ -516,23 +516,20 @@ Status NestedLoopJoinProbeOperatorX::pull(RuntimeState* state, vectorized::Block local_state._matched_rows_done : local_state._matched_rows_done); + size_t join_block_column_size = local_state._join_block.columns(); { - vectorized::Block tmp_block = local_state._join_block; - - // Here make _join_block release the columns' ptr - local_state._join_block.set_columns(local_state._join_block.clone_empty_columns()); - - local_state.add_tuple_is_null_column(&tmp_block); + local_state.add_tuple_is_null_column(&local_state._join_block); { SCOPED_TIMER(local_state._join_filter_timer); RETURN_IF_ERROR(vectorized::VExprContext::filter_block( - local_state._conjuncts, &tmp_block, tmp_block.columns())); + local_state._conjuncts, &local_state._join_block, + local_state._join_block.columns())); } - RETURN_IF_ERROR(local_state._build_output_block(&tmp_block, block, false)); + RETURN_IF_ERROR( + local_state._build_output_block(&local_state._join_block, block, false)); local_state._reset_tuple_is_null_column(); } - local_state._join_block.clear_column_data(); - + local_state._join_block.clear_column_data(join_block_column_size); if (!(*eos) and !local_state._need_more_input_data) { auto func = [&](auto&& join_op_variants, auto set_build_side_flag, auto set_probe_side_flag) { diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp index 83a205e59c78fbe..d221eaeed0faba4 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp @@ -393,9 +393,11 @@ void PartitionedHashJoinSinkLocalState::_spill_to_disk( } } -PartitionedHashJoinSinkOperatorX::PartitionedHashJoinSinkOperatorX( - ObjectPool* pool, int operator_id, const TPlanNode& tnode, const DescriptorTbl& descs, - bool use_global_rf, uint32_t partition_count) +PartitionedHashJoinSinkOperatorX::PartitionedHashJoinSinkOperatorX(ObjectPool* pool, + int operator_id, + const TPlanNode& tnode, + const DescriptorTbl& descs, + uint32_t partition_count) : JoinBuildSinkOperatorX(pool, operator_id, tnode, descs), _join_distribution(tnode.hash_join_node.__isset.dist_type ? 
tnode.hash_join_node.dist_type diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h index 8e89763b50a9d5a..d1fe30e06f2dd2c 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h @@ -82,8 +82,7 @@ class PartitionedHashJoinSinkOperatorX : public JoinBuildSinkOperatorX { public: PartitionedHashJoinSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, bool use_global_rf, - uint32_t partition_count); + const DescriptorTbl& descs, uint32_t partition_count); Status init(const TDataSink& tsink) override { return Status::InternalError("{} should not init with TDataSink", diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index a3f1133f00e78e3..f8196910021b2c6 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -46,14 +46,25 @@ Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) _wait_for_dependency_timer = ADD_TIMER_WITH_LEVEL(_profile, timer_name, 1); auto fragment_instance_id = state->fragment_instance_id(); + auto& p = _parent->cast<ResultSinkOperatorX>(); if (state->query_options().enable_parallel_result_sink) { _sender = _parent->cast<ResultSinkOperatorX>()._sender; } else { - auto& p = _parent->cast<ResultSinkOperatorX>(); RETURN_IF_ERROR(state->exec_env()->result_mgr()->create_sender( fragment_instance_id, p._result_sink_buffer_size_rows, &_sender, state)); } _sender->set_dependency(fragment_instance_id, _dependency->shared_from_this()); + + _output_vexpr_ctxs.resize(p._output_vexpr_ctxs.size()); + for (size_t i = 0; i < _output_vexpr_ctxs.size(); i++) { + RETURN_IF_ERROR(p._output_vexpr_ctxs[i]->clone(state, _output_vexpr_ctxs[i])); + } + if (p._sink_type == TResultSinkType::ARROW_FLIGHT_PROTOCAL) { + std::shared_ptr<arrow::Schema> arrow_schema; + RETURN_IF_ERROR(get_arrow_schema_from_expr_ctxs(_output_vexpr_ctxs, &arrow_schema, + state->timezone())); + _sender->register_arrow_schema(arrow_schema); + } return Status::OK(); } @@ -62,10 +73,6 @@ Status ResultSinkLocalState::open(RuntimeState* state) { SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(Base::open(state)); auto& p = _parent->cast<ResultSinkOperatorX>(); - _output_vexpr_ctxs.resize(p._output_vexpr_ctxs.size()); - for (size_t i = 0; i < _output_vexpr_ctxs.size(); i++) { - RETURN_IF_ERROR(p._output_vexpr_ctxs[i]->clone(state, _output_vexpr_ctxs[i])); - } // create writer based on sink type switch (p._sink_type) { case TResultSinkType::MYSQL_PROTOCAL: { @@ -79,10 +86,6 @@ Status ResultSinkLocalState::open(RuntimeState* state) { break; } case TResultSinkType::ARROW_FLIGHT_PROTOCAL: { - std::shared_ptr<arrow::Schema> arrow_schema; - RETURN_IF_ERROR(get_arrow_schema_from_expr_ctxs(_output_vexpr_ctxs, &arrow_schema, - state->timezone())); - _sender->register_arrow_schema(arrow_schema); _writer.reset(new (std::nothrow) vectorized::VArrowFlightResultWriter( _sender.get(), _output_vexpr_ctxs, _profile)); break; diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp index a939d25654b4cc8..22007a4b220348b 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp @@ -62,6 +62,7 @@ Status LocalExchangeSinkOperatorX::init(ExchangeType type, const int num_buckets _num_partitions)); RETURN_IF_ERROR(_partitioner->init(_texprs)); } else if
(_type == ExchangeType::BUCKET_HASH_SHUFFLE) { + DCHECK_GT(num_buckets, 0); _partitioner.reset( new vectorized::Crc32HashPartitioner(num_buckets)); RETURN_IF_ERROR(_partitioner->init(_texprs)); @@ -90,6 +91,9 @@ Status LocalExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo "UseGlobalShuffle", std::to_string(_parent->cast()._use_global_shuffle)); } + _profile->add_info_string( + "PartitionExprsSize", + std::to_string(_parent->cast()._partitioned_exprs_num)); _channel_id = info.task_idx; return Status::OK(); } diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h index 4c4a400c2bde3bc..435f7a410a4ca6e 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h @@ -91,6 +91,7 @@ class LocalExchangeSinkOperatorX final : public DataSinkOperatorX& _texprs; + const size_t _partitioned_exprs_num; std::unique_ptr _partitioner; const std::map _bucket_seq_to_instance_idx; std::vector> _shuffle_idx_to_instance_idx; diff --git a/be/src/pipeline/local_exchange/local_exchanger.cpp b/be/src/pipeline/local_exchange/local_exchanger.cpp index c5f99ca5d6a4a54..647988f8b794cb8 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.cpp +++ b/be/src/pipeline/local_exchange/local_exchanger.cpp @@ -226,19 +226,6 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest new_block_wrapper->unref(local_state._shared_state, local_state._channel_id); } } - } else if (_num_senders != _num_sources) { - // In this branch, data just should be distributed equally into all instances. - new_block_wrapper->ref(_num_partitions); - for (size_t i = 0; i < _num_partitions; i++) { - uint32_t start = local_state._partition_rows_histogram[i]; - uint32_t size = local_state._partition_rows_histogram[i + 1] - start; - if (size > 0) { - _enqueue_data_and_set_ready(i % _num_sources, local_state, - {new_block_wrapper, {row_idx, start, size}}); - } else { - new_block_wrapper->unref(local_state._shared_state, local_state._channel_id); - } - } } else { DCHECK(!bucket_seq_to_instance_idx.empty()); new_block_wrapper->ref(_num_partitions); diff --git a/be/src/pipeline/local_exchange/local_exchanger.h b/be/src/pipeline/local_exchange/local_exchanger.h index bf052ac3b924ca5..4912ab3369815ba 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.h +++ b/be/src/pipeline/local_exchange/local_exchanger.h @@ -220,9 +220,7 @@ class ShuffleExchanger : public Exchanger { ShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, int free_block_limit) : Exchanger(running_sink_operators, num_sources, num_partitions, - free_block_limit) { - _data_queue.resize(num_partitions); - } + free_block_limit) {} Status _split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, LocalExchangeSinkLocalState& local_state); }; @@ -232,7 +230,10 @@ class BucketShuffleExchanger final : public ShuffleExchanger { BucketShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, int free_block_limit) : ShuffleExchanger(running_sink_operators, num_sources, num_partitions, - free_block_limit) {} + free_block_limit) { + DCHECK_GT(num_partitions, 0); + _data_queue.resize(std::max(num_partitions, num_sources)); + } ~BucketShuffleExchanger() override = default; ExchangeType get_type() const override { return ExchangeType::BUCKET_HASH_SHUFFLE; } }; diff --git 
a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index d14a0d0c3cd4a7a..7572b20d34112ec 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -296,8 +296,6 @@ Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& re if (local_params.__isset.topn_filter_descs) { _query_ctx->init_runtime_predicates(local_params.topn_filter_descs); } - - _need_local_merge = request.__isset.parallel_instances; } { @@ -369,29 +367,9 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag auto fragment_instance_id = local_params.fragment_instance_id; _fragment_instance_ids[i] = fragment_instance_id; - auto filterparams = std::make_unique<RuntimeFilterParamsContext>(); - - { - filterparams->runtime_filter_wait_infinitely = - _runtime_state->runtime_filter_wait_infinitely(); - filterparams->runtime_filter_wait_time_ms = - _runtime_state->runtime_filter_wait_time_ms(); - filterparams->execution_timeout = _runtime_state->execution_timeout(); - - filterparams->exec_env = ExecEnv::GetInstance(); - filterparams->query_id.set_hi(_runtime_state->query_id().hi); - filterparams->query_id.set_lo(_runtime_state->query_id().lo); - - filterparams->be_exec_version = _runtime_state->be_exec_version(); - filterparams->query_ctx = _query_ctx.get(); - } - - auto runtime_filter_mgr = std::make_unique<RuntimeFilterMgr>( - request.query_id, filterparams.get(), _query_ctx->query_mem_tracker); - - filterparams->runtime_filter_mgr = runtime_filter_mgr.get(); - - _runtime_filter_states[i] = std::move(filterparams); + _runtime_filter_states[i] = RuntimeFilterParamsContext::create(_query_ctx.get()); + std::unique_ptr<RuntimeFilterMgr> runtime_filter_mgr = std::make_unique<RuntimeFilterMgr>( + request.query_id, _runtime_filter_states[i], _query_ctx->query_mem_tracker, false); std::map pipeline_id_to_task; auto get_local_exchange_state = [&](PipelinePtr pipeline) -> std::map, @@ -423,6 +401,7 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag request.fragment_id, request.query_options, _query_ctx->query_globals, _exec_env, _query_ctx.get()); auto& task_runtime_state = _task_runtime_states[pip_idx][i]; + _runtime_filter_states[i]->set_state(task_runtime_state.get()); { // Initialize runtime state for this task task_runtime_state->set_query_mem_tracker(_query_ctx->query_mem_tracker); @@ -454,9 +433,8 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag task_runtime_state->set_load_stream_per_node(request.load_stream_per_node); task_runtime_state->set_total_load_streams(request.total_load_streams); task_runtime_state->set_num_local_sink(request.num_local_sink); - DCHECK(_runtime_filter_states[i]->runtime_filter_mgr); - task_runtime_state->set_runtime_filter_mgr( - _runtime_filter_states[i]->runtime_filter_mgr); + + task_runtime_state->set_runtime_filter_mgr(runtime_filter_mgr.get()); } auto cur_task_id = _total_tasks++; task_runtime_state->set_task_id(cur_task_id); @@ -969,9 +947,9 @@ Status PipelineFragmentContext::_plan_local_exchange( // if 'num_buckets == 0' means the fragment is colocated by exchange node not the // scan node. so here use `_num_instance` to replace the `num_buckets` to prevent dividing 0 // still keep colocate plan after local shuffle - RETURN_IF_ERROR(_plan_local_exchange( - _use_serial_source || num_buckets == 0 ?
_num_instances : num_buckets, pip_idx, - _pipelines[pip_idx], bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); + RETURN_IF_ERROR(_plan_local_exchange(num_buckets, pip_idx, _pipelines[pip_idx], + bucket_seq_to_instance_idx, + shuffle_idx_to_instance_idx)); } return Status::OK(); } @@ -1386,8 +1364,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo const uint32_t partition_count = 32; auto inner_probe_operator = std::make_shared<HashJoinProbeOperatorX>(pool, tnode_, 0, descs); - auto inner_sink_operator = std::make_shared<HashJoinBuildSinkOperatorX>( - pool, 0, tnode_, descs, _need_local_merge); + auto inner_sink_operator = + std::make_shared<HashJoinBuildSinkOperatorX>(pool, 0, tnode_, descs); RETURN_IF_ERROR(inner_probe_operator->init(tnode_, _runtime_state.get())); RETURN_IF_ERROR(inner_sink_operator->init(tnode_, _runtime_state.get())); @@ -1407,8 +1385,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _dag[downstream_pipeline_id].push_back(build_side_pipe->id()); auto sink_operator = std::make_shared<PartitionedHashJoinSinkOperatorX>( - pool, next_sink_operator_id(), tnode_, descs, _need_local_merge, - partition_count); + pool, next_sink_operator_id(), tnode_, descs, partition_count); sink_operator->set_inner_operators(inner_sink_operator, inner_probe_operator); DataSinkOperatorPtr sink = std::move(sink_operator); sink->set_dests_id({op->operator_id()}); @@ -1432,8 +1409,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _dag[downstream_pipeline_id].push_back(build_side_pipe->id()); DataSinkOperatorPtr sink; - sink.reset(new HashJoinBuildSinkOperatorX(pool, next_sink_operator_id(), tnode, descs, - _need_local_merge)); + sink.reset(new HashJoinBuildSinkOperatorX(pool, next_sink_operator_id(), tnode, descs)); sink->set_dests_id({op->operator_id()}); RETURN_IF_ERROR(build_side_pipe->set_sink(sink)); RETURN_IF_ERROR(build_side_pipe->sink()->init(tnode, _runtime_state.get())); @@ -1460,8 +1436,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _dag[downstream_pipeline_id].push_back(build_side_pipe->id()); DataSinkOperatorPtr sink; - sink.reset(new NestedLoopJoinBuildSinkOperatorX(pool, next_sink_operator_id(), tnode, descs, - _need_local_merge)); + sink.reset( + new NestedLoopJoinBuildSinkOperatorX(pool, next_sink_operator_id(), tnode, descs)); sink->set_dests_id({op->operator_id()}); RETURN_IF_ERROR(build_side_pipe->set_sink(sink)); RETURN_IF_ERROR(build_side_pipe->sink()->init(tnode, _runtime_state.get())); diff --git a/be/src/pipeline/pipeline_fragment_context.h b/be/src/pipeline/pipeline_fragment_context.h index 289f5c8236522f9..1674afa886d520d 100644 --- a/be/src/pipeline/pipeline_fragment_context.h +++ b/be/src/pipeline/pipeline_fragment_context.h @@ -228,8 +228,6 @@ class PipelineFragmentContext : public TaskExecutionContext { // this is a [n * m] matrix. n is parallelism of pipeline engine and m is the number of pipelines.
std::vector>> _tasks; - bool _need_local_merge = false; - // TODO: remove the _sink and _multi_cast_stream_sink_senders to set both // of it in pipeline task not the fragment_context #ifdef __clang__ @@ -301,7 +299,7 @@ class PipelineFragmentContext : public TaskExecutionContext { */ std::vector>> _task_runtime_states; - std::vector<std::unique_ptr<RuntimeFilterParamsContext>> _runtime_filter_states; + std::vector<RuntimeFilterParamsContext*> _runtime_filter_states; // Total instance num running on all BEs int _total_instances = -1; diff --git a/be/src/pipeline/task_scheduler.h b/be/src/pipeline/task_scheduler.h index bdb5bec1776f584..3c1b08063dfa61e 100644 --- a/be/src/pipeline/task_scheduler.h +++ b/be/src/pipeline/task_scheduler.h @@ -43,7 +43,7 @@ namespace doris::pipeline { class TaskScheduler { public: - TaskScheduler(int core_num, std::string name, CgroupCpuCtl* cgroup_cpu_ctl) + TaskScheduler(int core_num, std::string name, std::shared_ptr<CgroupCpuCtl> cgroup_cpu_ctl) : _task_queue(core_num), _shutdown(false), _name(std::move(name)), @@ -65,7 +65,7 @@ class TaskScheduler { std::vector _markers; bool _shutdown; std::string _name; - CgroupCpuCtl* _cgroup_cpu_ctl = nullptr; + std::weak_ptr<CgroupCpuCtl> _cgroup_cpu_ctl; void _do_work(int index); }; diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 75ec588aa50c1d8..706fd7efd07d0f9 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -276,6 +276,7 @@ Status ExecEnv::_init(const std::vector<StorePath>& store_paths, _pipeline_tracer_ctx = std::make_unique(); // before query RETURN_IF_ERROR(init_pipeline_task_scheduler()); _workload_group_manager = new WorkloadGroupMgr(); + _workload_group_manager->init_internal_workload_group(); _scanner_scheduler = new doris::vectorized::ScannerScheduler(); _fragment_mgr = new FragmentMgr(this); _result_cache = new ResultCache(config::query_cache_max_size_mb, @@ -364,7 +365,8 @@ Status ExecEnv::_init(const std::vector<StorePath>& store_paths, return st; } _storage_engine->set_heartbeat_flags(this->heartbeat_flags()); - if (st = _storage_engine->start_bg_threads(); !st.ok()) { + WorkloadGroupPtr internal_wg = _workload_group_manager->get_internal_wg(); + if (st = _storage_engine->start_bg_threads(internal_wg); !st.ok()) { LOG(ERROR) << "Failed to start bg threads of storage engine, res=" << st; return st; } @@ -419,8 +421,8 @@ void ExecEnv::init_file_cache_factory(std::vector& cache_paths std::unordered_set cache_path_set; Status rest = doris::parse_conf_cache_paths(doris::config::file_cache_path, cache_paths); if (!rest) { - LOG(FATAL) << "parse config file cache path failed, path=" - << doris::config::file_cache_path; + LOG(FATAL) << "parse config file cache path failed, path=" << doris::config::file_cache_path + << ", reason=" << rest.msg(); exit(-1); } std::vector file_cache_init_threads; diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 95e5f8e2ce14f33..5a8ea2377aa80fc 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -1017,8 +1017,14 @@ void FragmentMgr::cancel_worker() { } } - for (auto it : brpc_stub_with_queries) { - _check_brpc_available(it.first, it.second); + if (config::enable_brpc_connection_check) { + for (auto it : brpc_stub_with_queries) { + if (!it.first) { + LOG(WARNING) << "brpc stub is nullptr, skip it."; + continue; + } + _check_brpc_available(it.first, it.second); + } } if (!queries_lost_coordinator.empty()) { @@ -1239,7 +1245,9 @@ Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(&params,
&filter_wrapper)); std::ranges::for_each(filters, [&](auto& filter) { - filter->update_filter(filter_wrapper, request->merge_time(), start_apply); + filter->update_filter( + filter_wrapper, request->merge_time(), start_apply, + request->has_local_merge_time() ? request->local_merge_time() : 0); }); } @@ -1265,7 +1273,7 @@ Status FragmentMgr::send_filter_size(const PSendFilterSizeRequest* request) { std::shared_ptr<RuntimeFilterMergeControllerEntity> filter_controller; RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); - auto merge_status = filter_controller->send_filter_size(request); + auto merge_status = filter_controller->send_filter_size(query_ctx, request); return merge_status; } @@ -1307,7 +1315,7 @@ Status FragmentMgr::merge_filter(const PMergeFilterRequest* request, SCOPED_ATTACH_TASK(query_ctx.get()); std::shared_ptr<RuntimeFilterMergeControllerEntity> filter_controller; RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); - auto merge_status = filter_controller->merge(request, attach_data); + auto merge_status = filter_controller->merge(query_ctx, request, attach_data); return merge_status; } diff --git a/be/src/runtime/jsonb_value.cpp b/be/src/runtime/jsonb_value.cpp index e88ce3b3d74d1a1..0227281fdd0d0ec 100644 --- a/be/src/runtime/jsonb_value.cpp +++ b/be/src/runtime/jsonb_value.cpp @@ -28,7 +28,7 @@ namespace doris { -Status JsonBinaryValue::from_json_string(const char* s, int length) { +Status JsonBinaryValue::from_json_string(const char* s, size_t length) { JsonbErrType error = JsonbErrType::E_NONE; if (!parser.parse(s, length)) { error = parser.getErrorCode(); diff --git a/be/src/runtime/jsonb_value.h b/be/src/runtime/jsonb_value.h index 1df9469e1720cd6..65f4927759c3047 100644 --- a/be/src/runtime/jsonb_value.h +++ b/be/src/runtime/jsonb_value.h @@ -43,7 +43,7 @@ struct JsonBinaryValue { JsonbParser parser; JsonBinaryValue() : ptr(nullptr), len(0) {} - JsonBinaryValue(char* ptr, int len) { + JsonBinaryValue(char* ptr, size_t len) { static_cast<void>(from_json_string(const_cast<const char*>(ptr), len)); } JsonBinaryValue(const std::string& s) { @@ -115,7 +115,7 @@ struct JsonBinaryValue { __builtin_unreachable(); } - Status from_json_string(const char* s, int len); + Status from_json_string(const char* s, size_t len); std::string to_json_string() const; diff --git a/be/src/runtime/memory/jemalloc_hook.cpp b/be/src/runtime/memory/jemalloc_hook.cpp index 445d60d382c270e..dffc1344b71dbc2 100644 --- a/be/src/runtime/memory/jemalloc_hook.cpp +++ b/be/src/runtime/memory/jemalloc_hook.cpp @@ -60,7 +60,7 @@ void* doris_realloc(void* p, size_t size) __THROW { return nullptr; } -#if USE_MEM_TRACKER +#if defined(USE_MEM_TRACKER) && !defined(BE_TEST) int64_t old_size = jemalloc_usable_size(p); CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( [](size_t size, int64_t old_size) { return jenallocx(size, 0) - old_size; }, size, diff --git a/be/src/runtime/query_context.cpp b/be/src/runtime/query_context.cpp index 811fa6002b5cf53..161964420e9ad6d 100644 --- a/be/src/runtime/query_context.cpp +++ b/be/src/runtime/query_context.cpp @@ -86,7 +86,7 @@ QueryContext::QueryContext(TUniqueId query_id, ExecEnv* exec_env, _shared_hash_table_controller.reset(new vectorized::SharedHashTableController()); _execution_dependency = pipeline::Dependency::create_unique(-1, -1, "ExecutionDependency"); _runtime_filter_mgr = std::make_unique<RuntimeFilterMgr>( - TUniqueId(), RuntimeFilterParamsContext::create(this), query_mem_tracker); + TUniqueId(), RuntimeFilterParamsContext::create(this), query_mem_tracker, true); _timeout_second = query_options.execution_timeout;
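(Aside: taken together, the RuntimeFilterMgr changes in this patch split the managers into one query-global instance — constructed with `true` in the QueryContext hunk just above — and per-fragment local instances — constructed with `false` in pipeline_fragment_context.cpp. A minimal sketch of the invariant that the new `DCHECK(!(_is_global xor need_local_merge))` in the next file enforces, using hypothetical names:)

```cpp
// Sketch only: consumers that need local merge must register with the
// query-global manager; plain consumers live on a fragment-local manager.
#include <cassert>

class RuntimeFilterMgrSketch {
public:
    explicit RuntimeFilterMgrSketch(bool is_global) : _is_global(is_global) {}

    void register_consumer_filter(bool need_local_merge) const {
        // Mirrors DCHECK(!(_is_global xor need_local_merge)) from the diff.
        assert(_is_global == need_local_merge);
    }

private:
    const bool _is_global;
};

int main() {
    RuntimeFilterMgrSketch global_mgr(true);   // owned by QueryContext
    RuntimeFilterMgrSketch local_mgr(false);   // owned per fragment instance
    global_mgr.register_consumer_filter(true); // local-merge consumer: OK
    local_mgr.register_consumer_filter(false); // plain consumer: OK
    return 0;
}
```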
diff --git a/be/src/runtime/query_context.h b/be/src/runtime/query_context.h index 4746553040521b2..d557245bf2339d8 100644 --- a/be/src/runtime/query_context.h +++ b/be/src/runtime/query_context.h @@ -165,6 +165,12 @@ class QueryContext { return _query_options.__isset.fe_process_uuid ? _query_options.fe_process_uuid : 0; } + bool ignore_runtime_filter_error() const { + return _query_options.__isset.ignore_runtime_filter_error + ? _query_options.ignore_runtime_filter_error + : false; + } + // global runtime filter mgr: runtime filters that have a remote target or // need local merge should register here. Before publish() or push_to_remote(), // the runtime filter should do the local merge work diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index 1a238787207b173..bb100fcbb42ec5f 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -45,12 +45,12 @@ namespace doris { RuntimeFilterMgr::RuntimeFilterMgr(const UniqueId& query_id, RuntimeFilterParamsContext* state, - const std::shared_ptr<MemTrackerLimiter>& query_mem_tracker) { - _state = state; - _state->runtime_filter_mgr = this; - _query_mem_tracker = query_mem_tracker; - _tracker = std::make_unique<MemTracker>("RuntimeFilterMgr(experimental)"); -} + const std::shared_ptr<MemTrackerLimiter>& query_mem_tracker, + const bool is_global) + : _is_global(is_global), + _state(state), + _tracker(std::make_unique<MemTracker>("RuntimeFilterMgr(experimental)")), + _query_mem_tracker(query_mem_tracker) {} RuntimeFilterMgr::~RuntimeFilterMgr() { CHECK(_query_mem_tracker != nullptr); @@ -60,6 +60,7 @@ RuntimeFilterMgr::~RuntimeFilterMgr() { Status RuntimeFilterMgr::get_consume_filters( const int filter_id, std::vector<std::shared_ptr<IRuntimeFilter>>& consumer_filters) { + DCHECK(_is_global); std::lock_guard l(_lock); auto iter = _consumer_map.find(filter_id); if (iter == _consumer_map.end()) { @@ -72,6 +73,20 @@ Status RuntimeFilterMgr::get_consume_filters( return Status::OK(); } +std::vector<std::shared_ptr<IRuntimeFilter>> RuntimeFilterMgr::get_consume_filters( + const int filter_id) { + std::lock_guard l(_lock); + auto iter = _consumer_map.find(filter_id); + if (iter == _consumer_map.end()) { + return {}; + } + std::vector<std::shared_ptr<IRuntimeFilter>> consumer_filters; + for (auto& holder : iter->second) { + consumer_filters.emplace_back(holder.filter); + } + return consumer_filters; +} + Status RuntimeFilterMgr::register_consumer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, int node_id, std::shared_ptr<IRuntimeFilter>* consumer_filter, @@ -90,11 +105,12 @@ Status RuntimeFilterMgr::register_consumer_filter(const TRuntimeFilterDesc& desc } } + DCHECK(!(_is_global xor need_local_merge)) + << " _is_global: " << _is_global << " need_local_merge: " << need_local_merge; if (!has_exist) { std::shared_ptr<IRuntimeFilter> filter; RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::CONSUMER, - node_id, &filter, build_bf_exactly, - need_local_merge)); + node_id, &filter, build_bf_exactly)); _consumer_map[key].emplace_back(node_id, filter); *consumer_filter = filter; } else if (!need_local_merge) { @@ -106,7 +122,8 @@ Status RuntimeFilterMgr::register_consumer_filter(const TRuntimeFilterDesc& desc Status RuntimeFilterMgr::register_local_merge_producer_filter( const doris::TRuntimeFilterDesc& desc, const doris::TQueryOptions& options, - std::shared_ptr<IRuntimeFilter>* producer_filter, bool build_bf_exactly) { + std::shared_ptr<IRuntimeFilter> producer_filter, bool build_bf_exactly) { + DCHECK(_is_global); SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); int32_t key = desc.filter_id; @@ -121,27 +138,26 @@ Status
RuntimeFilterMgr::register_local_merge_producer_filter( } DCHECK(_state != nullptr); - RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::PRODUCER, -1, - producer_filter, build_bf_exactly, true)); { std::lock_guard l(*iter->second.lock); if (iter->second.filters.empty()) { std::shared_ptr merge_filter; RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::PRODUCER, -1, &merge_filter, - build_bf_exactly, true)); + build_bf_exactly)); merge_filter->set_ignored(); iter->second.filters.emplace_back(merge_filter); } iter->second.merge_time++; iter->second.merge_size_times++; - iter->second.filters.emplace_back(*producer_filter); + iter->second.filters.emplace_back(producer_filter); } return Status::OK(); } Status RuntimeFilterMgr::get_local_merge_producer_filters( int filter_id, doris::LocalMergeFilters** local_merge_filters) { + DCHECK(_is_global); std::lock_guard l(_lock); auto iter = _local_merge_producer_map.find(filter_id); if (iter == _local_merge_producer_map.end()) { @@ -155,10 +171,21 @@ Status RuntimeFilterMgr::get_local_merge_producer_filters( return Status::OK(); } +doris::LocalMergeFilters* RuntimeFilterMgr::get_local_merge_producer_filters(int filter_id) { + DCHECK(_is_global); + std::lock_guard l(_lock); + auto iter = _local_merge_producer_map.find(filter_id); + if (iter == _local_merge_producer_map.end()) { + return nullptr; + } + return &iter->second; +} + Status RuntimeFilterMgr::register_producer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, std::shared_ptr* producer_filter, bool build_bf_exactly) { + DCHECK(!_is_global); SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); int32_t key = desc.filter_id; std::lock_guard l(_lock); @@ -174,33 +201,6 @@ Status RuntimeFilterMgr::register_producer_filter(const TRuntimeFilterDesc& desc return Status::OK(); } -Status RuntimeFilterMgr::update_filter(const PPublishFilterRequest* request, - butil::IOBufAsZeroCopyInputStream* data) { - SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); - UpdateRuntimeFilterParams params(request, data); - int filter_id = request->filter_id(); - std::vector> filters; - // The code is organized for upgrade compatibility to prevent infinite waiting - // old way update filter the code should be deleted after the upgrade is complete. - { - std::lock_guard l(_lock); - auto iter = _consumer_map.find(filter_id); - if (iter == _consumer_map.end()) { - return Status::InternalError("update_filter meet unknown filter: {}, role: CONSUMER.", - filter_id); - } - for (auto& holder : iter->second) { - filters.emplace_back(holder.filter); - } - iter->second.clear(); - } - for (auto filter : filters) { - RETURN_IF_ERROR(filter->update_filter(¶ms)); - } - - return Status::OK(); -} - void RuntimeFilterMgr::set_runtime_filter_params( const TRuntimeFilterParams& runtime_filter_params) { std::lock_guard l(_lock); @@ -305,7 +305,8 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId query_id, return Status::OK(); } -Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSizeRequest* request) { +Status RuntimeFilterMergeControllerEntity::send_filter_size(std::weak_ptr query_ctx, + const PSendFilterSizeRequest* request) { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); std::shared_ptr cnt_val; @@ -326,6 +327,8 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz Status st = Status::OK(); if (cnt_val->source_addrs.size() == cnt_val->producer_size) { + auto ctx = query_ctx.lock()->ignore_runtime_filter_error() ? 
std::weak_ptr {} + : query_ctx; for (auto addr : cnt_val->source_addrs) { std::shared_ptr stub( ExecEnv::GetInstance()->brpc_internal_client_cache()->get_client(addr)); @@ -339,12 +342,13 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz auto closure = AutoReleaseClosure>:: create_unique(std::make_shared(), - DummyBrpcCallback::create_shared()); + DummyBrpcCallback::create_shared(), ctx); auto* pquery_id = closure->request_->mutable_query_id(); - pquery_id->set_hi(_state->query_id.hi()); - pquery_id->set_lo(_state->query_id.lo()); - closure->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(_state->execution_timeout)); + pquery_id->set_hi(_state->get_query_ctx()->query_id().hi); + pquery_id->set_lo(_state->get_query_ctx()->query_id().lo); + closure->cntl_->set_timeout_ms( + get_execution_rpc_timeout_ms(_state->get_query_ctx()->execution_timeout())); if (config::execution_ignore_eovercrowded) { closure->cntl_->ignore_eovercrowded(); } @@ -361,12 +365,6 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz } Status RuntimeFilterMgr::sync_filter_size(const PSyncFilterSizeRequest* request) { - auto filter = try_get_product_filter(request->filter_id()); - if (filter) { - filter->set_synced_size(request->filter_size()); - return Status::OK(); - } - LocalMergeFilters* local_merge_filters = nullptr; RETURN_IF_ERROR(get_local_merge_producer_filters(request->filter_id(), &local_merge_filters)); // first filter size merged filter @@ -377,7 +375,8 @@ Status RuntimeFilterMgr::sync_filter_size(const PSyncFilterSizeRequest* request) } // merge data -Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* request, +Status RuntimeFilterMergeControllerEntity::merge(std::weak_ptr query_ctx, + const PMergeFilterRequest* request, butil::IOBufAsZeroCopyInputStream* attach_data) { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); std::shared_ptr cnt_val; @@ -416,6 +415,8 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ DCHECK_LE(merged_size, cnt_val->producer_size); cnt_val->merge_time += (MonotonicMillis() - start_merge); merge_time = cnt_val->merge_time; + cnt_val->local_merge_time += + request->has_local_merge_time() ? request->local_merge_time() : 0; } if (merged_size == cnt_val->producer_size) { @@ -444,21 +445,25 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ has_attachment = true; } + auto ctx = query_ctx.lock()->ignore_runtime_filter_error() ? 
std::weak_ptr {} + : query_ctx; std::vector& targets = cnt_val->targetv2_info; for (auto& target : targets) { auto closure = AutoReleaseClosure>:: create_unique(std::make_shared(apply_request), - DummyBrpcCallback::create_shared()); + DummyBrpcCallback::create_shared(), ctx); closure->request_->set_filter_id(request->filter_id()); closure->request_->set_merge_time(merge_time); + closure->request_->set_local_merge_time(cnt_val->local_merge_time); *closure->request_->mutable_query_id() = request->query_id(); if (has_attachment) { closure->cntl_->request_attachment().append(request_attachment); } - closure->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(_state->execution_timeout)); + closure->cntl_->set_timeout_ms( + get_execution_rpc_timeout_ms(_state->get_query_ctx()->execution_timeout())); if (config::execution_ignore_eovercrowded) { closure->cntl_->ignore_eovercrowded(); } @@ -521,31 +526,22 @@ void RuntimeFilterMergeController::remove_entity(UniqueId query_id) { RuntimeFilterParamsContext* RuntimeFilterParamsContext::create(RuntimeState* state) { RuntimeFilterParamsContext* params = state->get_query_ctx()->obj_pool.add(new RuntimeFilterParamsContext()); - params->runtime_filter_wait_infinitely = state->runtime_filter_wait_infinitely(); - params->runtime_filter_wait_time_ms = state->runtime_filter_wait_time_ms(); - params->execution_timeout = state->execution_timeout(); - params->runtime_filter_mgr = state->local_runtime_filter_mgr(); - params->exec_env = state->exec_env(); - params->query_id.set_hi(state->query_id().hi); - params->query_id.set_lo(state->query_id().lo); - - params->be_exec_version = state->be_exec_version(); - params->query_ctx = state->get_query_ctx(); + params->_query_ctx = state->get_query_ctx(); + params->_state = state; return params; } +RuntimeFilterMgr* RuntimeFilterParamsContext::global_runtime_filter_mgr() { + return _query_ctx->runtime_filter_mgr(); +} + +RuntimeFilterMgr* RuntimeFilterParamsContext::local_runtime_filter_mgr() { + return _state->local_runtime_filter_mgr(); +} + RuntimeFilterParamsContext* RuntimeFilterParamsContext::create(QueryContext* query_ctx) { RuntimeFilterParamsContext* params = query_ctx->obj_pool.add(new RuntimeFilterParamsContext()); - params->runtime_filter_wait_infinitely = query_ctx->runtime_filter_wait_infinitely(); - params->runtime_filter_wait_time_ms = query_ctx->runtime_filter_wait_time_ms(); - params->execution_timeout = query_ctx->execution_timeout(); - params->runtime_filter_mgr = query_ctx->runtime_filter_mgr(); - params->exec_env = query_ctx->exec_env(); - params->query_id.set_hi(query_ctx->query_id().hi); - params->query_id.set_lo(query_ctx->query_id().lo); - - params->be_exec_version = query_ctx->be_exec_version(); - params->query_ctx = query_ctx; + params->_query_ctx = query_ctx; return params; } diff --git a/be/src/runtime/runtime_filter_mgr.h b/be/src/runtime/runtime_filter_mgr.h index b0aea7568cff65b..0a6f8318feaba03 100644 --- a/be/src/runtime/runtime_filter_mgr.h +++ b/be/src/runtime/runtime_filter_mgr.h @@ -34,6 +34,7 @@ #include "common/object_pool.h" #include "common/status.h" +#include "util/stopwatch.hpp" #include "util/uid_util.h" namespace butil { @@ -60,6 +61,7 @@ struct LocalMergeFilters { int merge_size_times = 0; uint64_t local_merged_size = 0; std::vector> filters; + MonotonicStopWatch merge_watcher; }; /// producer: @@ -77,12 +79,14 @@ struct LocalMergeFilters { class RuntimeFilterMgr { public: RuntimeFilterMgr(const UniqueId& query_id, RuntimeFilterParamsContext* state, - const std::shared_ptr& 
query_mem_tracker); + const std::shared_ptr& query_mem_tracker, + const bool is_global); ~RuntimeFilterMgr(); Status get_consume_filters(const int filter_id, std::vector>& consumer_filters); + std::vector> get_consume_filters(const int filter_id); std::shared_ptr try_get_product_filter(const int filter_id) { std::lock_guard l(_lock); @@ -100,19 +104,17 @@ class RuntimeFilterMgr { Status register_local_merge_producer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, - std::shared_ptr* producer_filter, + std::shared_ptr producer_filter, bool build_bf_exactly = false); Status get_local_merge_producer_filters(int filter_id, LocalMergeFilters** local_merge_filters); + LocalMergeFilters* get_local_merge_producer_filters(int filter_id); Status register_producer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, std::shared_ptr* producer_filter, bool build_bf_exactly = false); // update filter by remote - Status update_filter(const PPublishFilterRequest* request, - butil::IOBufAsZeroCopyInputStream* data); - void set_runtime_filter_params(const TRuntimeFilterParams& runtime_filter_params); Status get_merge_addr(TNetworkAddress* addr); @@ -124,6 +126,18 @@ class RuntimeFilterMgr { int node_id; std::shared_ptr filter; }; + /** + * `_is_global = true` means this runtime filter manager manages query-level runtime filters. + * If so, all consumers in this query share the same RF with the same ID. For producers, all + * RFs produced should be merged. + * + * If `_is_global` is false, an RF is produced and consumed in single-producer-single-consumer mode. + * This usually happens in a co-located join where the scan operators are not serial. + * + * `_local_merge_producer_map` is used only if `_is_global` is true. That is, RFs produced by + * different producers need to be merged only if the RF is global. + */ + const bool _is_global; // RuntimeFilterMgr is owned by RuntimeState, so we only // use filter_id as key // key: "filter-id" @@ -156,10 +170,11 @@ class RuntimeFilterMergeControllerEntity { const TQueryOptions& query_options); // handle merge rpc - Status merge(const PMergeFilterRequest* request, + Status merge(std::weak_ptr query_ctx, const PMergeFilterRequest* request, butil::IOBufAsZeroCopyInputStream* attach_data); - Status send_filter_size(const PSendFilterSizeRequest* request); + Status send_filter_size(std::weak_ptr query_ctx, + const PSendFilterSizeRequest* request); UniqueId query_id() const { return _query_id; } @@ -173,6 +188,7 @@ class RuntimeFilterMergeControllerEntity { std::unordered_set arrive_id; std::vector source_addrs; std::shared_ptr pool; + uint64_t local_merge_time = 0; }; private: @@ -267,24 +283,22 @@ class RuntimeFilterMergeController { FilterControllerMap _filter_controller_map[kShardNum]; }; -//There are two types of runtime filters: -// one is global, originating from QueryContext, -// and the other is local, originating from RuntimeState. -// In practice, we have already distinguished between them through UpdateRuntimeFilterParamsV2/V1. -// RuntimeState/QueryContext is only used to store runtime_filter_wait_time_ms... +// There are two types of runtime filters: +// 1. Global runtime filter. Managed by QueryContext's RuntimeFilterMgr; it is produced by multiple producers and shared by multiple consumers. +// 2. Local runtime filter. Managed by RuntimeState's RuntimeFilterMgr in 1-producer-1-consumer mode.
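To make the comment above concrete: with this patch a producer filter is first registered in the fragment-local manager and then shared into the query-global manager, where producers for the same filter id are collected for local merging. A rough sketch of that dual registration, with simplified stand-in types (FilterMgr, RuntimeFilter) in place of the real RuntimeFilterMgr:

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <vector>

// Simplified stand-ins; the real classes live in runtime_filter_mgr.h.
struct RuntimeFilter {};
using RuntimeFilterPtr = std::shared_ptr<RuntimeFilter>;

class FilterMgr {
public:
    explicit FilterMgr(bool is_global) : _is_global(is_global) {}

    // Local mode: each producer owns its own filter instance (1-producer-1-consumer).
    RuntimeFilterPtr register_producer(int filter_id) {
        assert(!_is_global);
        auto f = std::make_shared<RuntimeFilter>();
        _producers[filter_id].push_back(f);
        return f;
    }

    // Global mode: filters created locally are also shared here, so producers
    // on the same BE can be merged before the filter is published remotely.
    void share_local_producer(int filter_id, RuntimeFilterPtr f) {
        assert(_is_global);
        _producers[filter_id].push_back(std::move(f));
    }

    size_t producer_count(int filter_id) const {
        auto it = _producers.find(filter_id);
        return it == _producers.end() ? 0 : it->second.size();
    }

private:
    const bool _is_global;
    std::map<int, std::vector<RuntimeFilterPtr>> _producers;
};

int main() {
    FilterMgr local(/*is_global=*/false), global(/*is_global=*/true);
    auto f = local.register_producer(/*filter_id=*/42); // owned by the fragment
    global.share_local_producer(42, f);                 // visible for local merge
    assert(global.producer_count(42) == 1);
}
```

This mirrors the reworked register_producer_runtime_filter in runtime_state.cpp below: the local manager owns the per-instance producer, while the global manager sees every producer of the same filter id so they can be merged.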
struct RuntimeFilterParamsContext { - RuntimeFilterParamsContext() = default; static RuntimeFilterParamsContext* create(RuntimeState* state); static RuntimeFilterParamsContext* create(QueryContext* query_ctx); - bool runtime_filter_wait_infinitely; - int32_t runtime_filter_wait_time_ms; - int32_t execution_timeout; - RuntimeFilterMgr* runtime_filter_mgr; - ExecEnv* exec_env; - PUniqueId query_id; - int be_exec_version; - QueryContext* query_ctx; - QueryContext* get_query_ctx() const { return query_ctx; } + QueryContext* get_query_ctx() const { return _query_ctx; } + void set_state(RuntimeState* state) { _state = state; } + RuntimeFilterMgr* global_runtime_filter_mgr(); + RuntimeFilterMgr* local_runtime_filter_mgr(); + +private: + RuntimeFilterParamsContext() = default; + + QueryContext* _query_ctx; + RuntimeState* _state; }; } // namespace doris diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index e3f9d075c8ffc2a..344180bad771ac5 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -40,6 +40,7 @@ #include "pipeline/exec/operator.h" #include "pipeline/pipeline_task.h" #include "runtime/exec_env.h" +#include "runtime/fragment_mgr.h" #include "runtime/load_path_mgr.h" #include "runtime/memory/mem_tracker_limiter.h" #include "runtime/memory/thread_mem_tracker_mgr.h" @@ -129,7 +130,6 @@ RuntimeState::RuntimeState(pipeline::PipelineFragmentContext*, const TUniqueId& : _profile("Fragment " + print_id(instance_id)), _load_channel_profile(""), _obj_pool(new ObjectPool()), - _runtime_filter_mgr(nullptr), _unreported_error_idx(0), _query_id(query_id), _fragment_id(fragment_id), @@ -294,6 +294,10 @@ Status RuntimeState::init(const TUniqueId& fragment_instance_id, const TQueryOpt return Status::OK(); } +std::weak_ptr RuntimeState::get_query_ctx_weak() { + return _exec_env->fragment_mgr()->get_or_erase_query_ctx_with_lock(_query_ctx->query_id()); +} + void RuntimeState::init_mem_trackers(const std::string& name, const TUniqueId& id) { _query_mem_tracker = MemTrackerLimiter::create_shared( MemTrackerLimiter::Type::OTHER, fmt::format("{}#Id={}", name, print_id(id))); @@ -512,15 +516,15 @@ RuntimeFilterMgr* RuntimeState::global_runtime_filter_mgr() { } Status RuntimeState::register_producer_runtime_filter( - const TRuntimeFilterDesc& desc, bool need_local_merge, - std::shared_ptr* producer_filter, bool build_bf_exactly) { - if (desc.has_remote_targets || need_local_merge) { - return global_runtime_filter_mgr()->register_local_merge_producer_filter( - desc, query_options(), producer_filter, build_bf_exactly); - } else { - return local_runtime_filter_mgr()->register_producer_filter( - desc, query_options(), producer_filter, build_bf_exactly); - } + const TRuntimeFilterDesc& desc, std::shared_ptr* producer_filter, + bool build_bf_exactly) { + // Producers are created by local runtime filter mgr and shared by global runtime filter manager. + // When RF is published, consumers in both global and local RF mgr will be found. 
+ RETURN_IF_ERROR(local_runtime_filter_mgr()->register_producer_filter( + desc, query_options(), producer_filter, build_bf_exactly)); + RETURN_IF_ERROR(global_runtime_filter_mgr()->register_local_merge_producer_filter( + desc, query_options(), *producer_filter, build_bf_exactly)); + return Status::OK(); } Status RuntimeState::register_consumer_runtime_filter( diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 88deee491d19c4d..0bc81bca4d99a1e 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -449,6 +449,8 @@ class RuntimeState { QueryContext* get_query_ctx() { return _query_ctx; } + std::weak_ptr get_query_ctx_weak(); + void set_query_mem_tracker(const std::shared_ptr& tracker) { _query_mem_tracker = tracker; } @@ -559,7 +561,6 @@ class RuntimeState { } Status register_producer_runtime_filter(const doris::TRuntimeFilterDesc& desc, - bool need_local_merge, std::shared_ptr* producer_filter, bool build_bf_exactly); diff --git a/be/src/runtime/workload_group/workload_group.cpp b/be/src/runtime/workload_group/workload_group.cpp index c6a3c07adda1dde..f9405de12737dc6 100644 --- a/be/src/runtime/workload_group/workload_group.cpp +++ b/be/src/runtime/workload_group/workload_group.cpp @@ -47,10 +47,12 @@ namespace doris { const static std::string MEMORY_LIMIT_DEFAULT_VALUE = "0%"; const static bool ENABLE_MEMORY_OVERCOMMIT_DEFAULT_VALUE = true; const static int CPU_HARD_LIMIT_DEFAULT_VALUE = -1; -const static int SPILL_LOW_WATERMARK_DEFAULT_VALUE = 50; -const static int SPILL_HIGH_WATERMARK_DEFAULT_VALUE = 80; +const static int MEMORY_LOW_WATERMARK_DEFAULT_VALUE = 50; +const static int MEMORY_HIGH_WATERMARK_DEFAULT_VALUE = 80; -WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& tg_info) +WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& wg_info) : WorkloadGroup(wg_info, true) {} + +WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& tg_info, bool need_create_query_thread_pool) : _id(tg_info.id), _name(tg_info.name), _version(tg_info.version), @@ -62,10 +64,11 @@ WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& tg_info) _scan_thread_num(tg_info.scan_thread_num), _max_remote_scan_thread_num(tg_info.max_remote_scan_thread_num), _min_remote_scan_thread_num(tg_info.min_remote_scan_thread_num), - _spill_low_watermark(tg_info.spill_low_watermark), - _spill_high_watermark(tg_info.spill_high_watermark), + _memory_low_watermark(tg_info.memory_low_watermark), + _memory_high_watermark(tg_info.memory_high_watermark), _scan_bytes_per_second(tg_info.read_bytes_per_second), - _remote_scan_bytes_per_second(tg_info.remote_read_bytes_per_second) { + _remote_scan_bytes_per_second(tg_info.remote_read_bytes_per_second), + _need_create_query_thread_pool(need_create_query_thread_pool) { std::vector& data_dir_list = io::BeConfDataDirReader::be_config_data_dir_list; for (const auto& data_dir : data_dir_list) { _scan_io_throttle_map[data_dir.path] = @@ -88,12 +91,12 @@ std::string WorkloadGroup::debug_string() const { "TG[id = {}, name = {}, cpu_share = {}, memory_limit = {}, enable_memory_overcommit = " "{}, version = {}, cpu_hard_limit = {}, scan_thread_num = " "{}, max_remote_scan_thread_num = {}, min_remote_scan_thread_num = {}, " - "spill_low_watermark={}, spill_high_watermark={}, is_shutdown={}, query_num={}, " + "memory_low_watermark={}, memory_high_watermark={}, is_shutdown={}, query_num={}, " "read_bytes_per_second={}, remote_read_bytes_per_second={}]", _id, _name, cpu_share(), PrettyPrinter::print(_memory_limit, TUnit::BYTES), 
_enable_memory_overcommit ? "true" : "false", _version, cpu_hard_limit(), _scan_thread_num, _max_remote_scan_thread_num, _min_remote_scan_thread_num, - _spill_low_watermark, _spill_high_watermark, _is_shutdown, _query_ctxs.size(), + _memory_low_watermark, _memory_high_watermark, _is_shutdown, _query_ctxs.size(), _scan_bytes_per_second, _remote_scan_bytes_per_second); } @@ -101,14 +104,14 @@ std::string WorkloadGroup::memory_debug_string() const { return fmt::format( "TG[id = {}, name = {}, memory_limit = {}, enable_memory_overcommit = " "{}, weighted_memory_limit = {}, total_mem_used = {}, " - "wg_refresh_interval_memory_growth = {}, spill_low_watermark = {}, " - "spill_high_watermark = {}, version = {}, is_shutdown = {}, query_num = {}]", + "wg_refresh_interval_memory_growth = {}, memory_low_watermark = {}, " + "memory_high_watermark = {}, version = {}, is_shutdown = {}, query_num = {}]", _id, _name, PrettyPrinter::print(_memory_limit, TUnit::BYTES), _enable_memory_overcommit ? "true" : "false", PrettyPrinter::print(_weighted_memory_limit, TUnit::BYTES), PrettyPrinter::print(_total_mem_used, TUnit::BYTES), PrettyPrinter::print(_wg_refresh_interval_memory_growth, TUnit::BYTES), - _spill_low_watermark, _spill_high_watermark, _version, _is_shutdown, + _memory_low_watermark, _memory_high_watermark, _version, _is_shutdown, _query_ctxs.size()); } @@ -134,8 +137,8 @@ void WorkloadGroup::check_and_update(const WorkloadGroupInfo& tg_info) { _scan_thread_num = tg_info.scan_thread_num; _max_remote_scan_thread_num = tg_info.max_remote_scan_thread_num; _min_remote_scan_thread_num = tg_info.min_remote_scan_thread_num; - _spill_low_watermark = tg_info.spill_low_watermark; - _spill_high_watermark = tg_info.spill_high_watermark; + _memory_low_watermark = tg_info.memory_low_watermark; + _memory_high_watermark = tg_info.memory_high_watermark; _scan_bytes_per_second = tg_info.read_bytes_per_second; _remote_scan_bytes_per_second = tg_info.remote_read_bytes_per_second; } else { @@ -342,19 +345,19 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( // 4 cpu_share uint64_t cpu_share = CgroupCpuCtl::cpu_soft_limit_default_value(); - if (tworkload_group_info.__isset.cpu_share) { + if (tworkload_group_info.__isset.cpu_share && tworkload_group_info.cpu_share > 0) { cpu_share = tworkload_group_info.cpu_share; } // 5 cpu hard limit int cpu_hard_limit = CPU_HARD_LIMIT_DEFAULT_VALUE; - if (tworkload_group_info.__isset.cpu_hard_limit) { + if (tworkload_group_info.__isset.cpu_hard_limit && tworkload_group_info.cpu_hard_limit > 0) { cpu_hard_limit = tworkload_group_info.cpu_hard_limit; } // 6 mem_limit std::string mem_limit_str = MEMORY_LIMIT_DEFAULT_VALUE; - if (tworkload_group_info.__isset.mem_limit) { + if (tworkload_group_info.__isset.mem_limit && tworkload_group_info.mem_limit != "-1") { mem_limit_str = tworkload_group_info.mem_limit; } bool is_percent = true; @@ -393,27 +396,29 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( min_remote_scan_thread_num = tworkload_group_info.min_remote_scan_thread_num; } - // 12 spill low watermark - int spill_low_watermark = SPILL_LOW_WATERMARK_DEFAULT_VALUE; - if (tworkload_group_info.__isset.spill_threshold_low_watermark) { - spill_low_watermark = tworkload_group_info.spill_threshold_low_watermark; + // 12 memory low watermark + int memory_low_watermark = MEMORY_LOW_WATERMARK_DEFAULT_VALUE; + if (tworkload_group_info.__isset.memory_low_watermark) { + memory_low_watermark = tworkload_group_info.memory_low_watermark; } - // 13 spil high watermark - int 
spill_high_watermark = SPILL_HIGH_WATERMARK_DEFAULT_VALUE; - if (tworkload_group_info.__isset.spill_threshold_high_watermark) { - spill_high_watermark = tworkload_group_info.spill_threshold_high_watermark; + // 13 memory high watermark + int memory_high_watermark = MEMORY_HIGH_WATERMARK_DEFAULT_VALUE; + if (tworkload_group_info.__isset.memory_high_watermark) { + memory_high_watermark = tworkload_group_info.memory_high_watermark; } // 14 scan io int read_bytes_per_second = -1; - if (tworkload_group_info.__isset.read_bytes_per_second) { + if (tworkload_group_info.__isset.read_bytes_per_second && + tworkload_group_info.read_bytes_per_second > 0) { read_bytes_per_second = tworkload_group_info.read_bytes_per_second; } // 15 remote scan io int remote_read_bytes_per_second = -1; - if (tworkload_group_info.__isset.remote_read_bytes_per_second) { + if (tworkload_group_info.__isset.remote_read_bytes_per_second && + tworkload_group_info.remote_read_bytes_per_second > 0) { remote_read_bytes_per_second = tworkload_group_info.remote_read_bytes_per_second; } @@ -428,60 +433,66 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( .scan_thread_num = scan_thread_num, .max_remote_scan_thread_num = max_remote_scan_thread_num, .min_remote_scan_thread_num = min_remote_scan_thread_num, - .spill_low_watermark = spill_low_watermark, - .spill_high_watermark = spill_high_watermark, + .memory_low_watermark = memory_low_watermark, + .memory_high_watermark = memory_high_watermark, .read_bytes_per_second = read_bytes_per_second, .remote_read_bytes_per_second = remote_read_bytes_per_second}; } -void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* exec_env) { - uint64_t tg_id = tg_info->id; - std::string tg_name = tg_info->name; - int cpu_hard_limit = tg_info->cpu_hard_limit; - uint64_t cpu_shares = tg_info->cpu_share; - bool enable_cpu_hard_limit = tg_info->enable_cpu_hard_limit; - int scan_thread_num = tg_info->scan_thread_num; - int max_remote_scan_thread_num = tg_info->max_remote_scan_thread_num; - int min_remote_scan_thread_num = tg_info->min_remote_scan_thread_num; +std::weak_ptr WorkloadGroup::get_cgroup_cpu_ctl_wptr() { + std::shared_lock rlock(_task_sched_lock); + return _cgroup_cpu_ctl; +} +void WorkloadGroup::create_cgroup_cpu_ctl() { std::lock_guard wlock(_task_sched_lock); + create_cgroup_cpu_ctl_no_lock(); +} + +void WorkloadGroup::create_cgroup_cpu_ctl_no_lock() { if (config::doris_cgroup_cpu_path != "" && _cgroup_cpu_ctl == nullptr) { - std::unique_ptr cgroup_cpu_ctl = CgroupCpuCtl::create_cgroup_cpu_ctl(tg_id); + std::shared_ptr cgroup_cpu_ctl = CgroupCpuCtl::create_cgroup_cpu_ctl(_id); if (cgroup_cpu_ctl) { Status ret = cgroup_cpu_ctl->init(); if (ret.ok()) { _cgroup_cpu_ctl = std::move(cgroup_cpu_ctl); - LOG(INFO) << "[upsert wg thread pool] cgroup init success, wg_id=" << tg_id; + LOG(INFO) << "[upsert wg thread pool] cgroup init success, wg_id=" << _id; } else { - LOG(INFO) << "[upsert wg thread pool] cgroup init failed, wg_id=" << tg_id + LOG(INFO) << "[upsert wg thread pool] cgroup init failed, wg_id=" << _id << ", reason=" << ret.to_string(); } } else { - LOG(INFO) << "[upsert wg thread pool] create cgroup cpu ctl for " << tg_id << " failed"; + LOG(INFO) << "[upsert wg thread pool] create cgroup cpu ctl wg_id=" << _id << " failed"; } } +} - CgroupCpuCtl* cg_cpu_ctl_ptr = _cgroup_cpu_ctl.get(); - +void WorkloadGroup::upsert_thread_pool_no_lock(WorkloadGroupInfo* wg_info, + std::shared_ptr cg_cpu_ctl_ptr) { + uint64_t wg_id = wg_info->id; + std::string wg_name = 
wg_info->name; + int scan_thread_num = wg_info->scan_thread_num; + int max_remote_scan_thread_num = wg_info->max_remote_scan_thread_num; + int min_remote_scan_thread_num = wg_info->min_remote_scan_thread_num; if (_task_sched == nullptr) { int32_t executors_size = config::pipeline_executor_size; if (executors_size <= 0) { executors_size = CpuInfo::num_cores(); } std::unique_ptr pipeline_task_scheduler = - std::make_unique(executors_size, "Pipe_" + tg_name, + std::make_unique(executors_size, "Pipe_" + wg_name, cg_cpu_ctl_ptr); Status ret = pipeline_task_scheduler->start(); if (ret.ok()) { _task_sched = std::move(pipeline_task_scheduler); } else { - LOG(INFO) << "[upsert wg thread pool] task scheduler start failed, gid= " << tg_id; + LOG(INFO) << "[upsert wg thread pool] task scheduler start failed, gid= " << wg_id; } } if (_scan_task_sched == nullptr) { std::unique_ptr scan_scheduler = - std::make_unique("Scan_" + tg_name, + std::make_unique("Scan_" + wg_name, cg_cpu_ctl_ptr); Status ret = scan_scheduler->start(config::doris_scanner_thread_pool_thread_num, config::doris_scanner_thread_pool_thread_num, @@ -489,7 +500,7 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e if (ret.ok()) { _scan_task_sched = std::move(scan_scheduler); } else { - LOG(INFO) << "[upsert wg thread pool] scan scheduler start failed, gid=" << tg_id; + LOG(INFO) << "[upsert wg thread pool] scan scheduler start failed, gid=" << wg_id; } } if (scan_thread_num > 0 && _scan_task_sched) { @@ -501,7 +512,7 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e int remote_scan_thread_queue_size = vectorized::ScannerScheduler::get_remote_scan_thread_queue_size(); std::unique_ptr remote_scan_scheduler = - std::make_unique("RScan_" + tg_name, + std::make_unique("RScan_" + wg_name, cg_cpu_ctl_ptr); Status ret = remote_scan_scheduler->start(remote_max_thread_num, config::doris_scanner_min_thread_pool_thread_num, @@ -510,7 +521,7 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e _remote_scan_task_sched = std::move(remote_scan_scheduler); } else { LOG(INFO) << "[upsert wg thread pool] remote scan scheduler start failed, gid=" - << tg_id; + << wg_id; } } if (max_remote_scan_thread_num >= min_remote_scan_thread_num && _remote_scan_task_sched) { @@ -532,7 +543,7 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e : std::min(num_disk * min_threads, num_cpus * config::wg_flush_thread_num_per_cpu); - std::string pool_name = "wg_flush_" + tg_name; + std::string pool_name = "wg_flush_" + wg_name; auto ret = ThreadPoolBuilder(pool_name) .set_min_threads(min_threads) .set_max_threads(max_threads) @@ -540,17 +551,24 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e .build(&thread_pool); if (!ret.ok()) { LOG(INFO) << "[upsert wg thread pool] create " + pool_name + " failed, gid=" - << tg_id; + << wg_id; } else { _memtable_flush_pool = std::move(thread_pool); - LOG(INFO) << "[upsert wg thread pool] create " + pool_name + " succ, gid=" << tg_id + LOG(INFO) << "[upsert wg thread pool] create " + pool_name + " succ, gid=" << wg_id << ", max thread num=" << max_threads << ", min thread num=" << min_threads; } } } +} + +void WorkloadGroup::upsert_cgroup_cpu_ctl_no_lock(WorkloadGroupInfo* wg_info) { + uint64_t wg_id = wg_info->id; + int cpu_hard_limit = wg_info->cpu_hard_limit; + uint64_t cpu_shares = wg_info->cpu_share; + bool enable_cpu_hard_limit = wg_info->enable_cpu_hard_limit; + 
create_cgroup_cpu_ctl_no_lock(); - // step 6: update cgroup cpu if needed if (_cgroup_cpu_ctl) { if (enable_cpu_hard_limit) { if (cpu_hard_limit > 0) { @@ -560,15 +578,24 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e } else { LOG(INFO) << "[upsert wg thread pool] enable cpu hard limit but value is " "illegal: " - << cpu_hard_limit << ", gid=" << tg_id; + << cpu_hard_limit << ", gid=" << wg_id; } } else { _cgroup_cpu_ctl->update_cpu_soft_limit(cpu_shares); _cgroup_cpu_ctl->update_cpu_hard_limit( CPU_HARD_LIMIT_DEFAULT_VALUE); // disable cpu hard limit } - _cgroup_cpu_ctl->get_cgroup_cpu_info(&(tg_info->cgroup_cpu_shares), - &(tg_info->cgroup_cpu_hard_limit)); + _cgroup_cpu_ctl->get_cgroup_cpu_info(&(wg_info->cgroup_cpu_shares), + &(wg_info->cgroup_cpu_hard_limit)); + } +} + +void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* wg_info) { + std::lock_guard wlock(_task_sched_lock); + upsert_cgroup_cpu_ctl_no_lock(wg_info); + + if (_need_create_query_thread_pool) { + upsert_thread_pool_no_lock(wg_info, _cgroup_cpu_ctl); } } diff --git a/be/src/runtime/workload_group/workload_group.h b/be/src/runtime/workload_group/workload_group.h index 2ba84ce982b3041..fb89ed8101ad49e 100644 --- a/be/src/runtime/workload_group/workload_group.h +++ b/be/src/runtime/workload_group/workload_group.h @@ -58,6 +58,8 @@ class WorkloadGroup : public std::enable_shared_from_this { public: explicit WorkloadGroup(const WorkloadGroupInfo& tg_info); + explicit WorkloadGroup(const WorkloadGroupInfo& tg_info, bool need_create_query_thread_pool); + int64_t version() const { return _version; } uint64_t cpu_share() const { return _cpu_share.load(); } @@ -92,11 +94,11 @@ class WorkloadGroup : public std::enable_shared_from_this { void do_sweep(); - int spill_threshold_low_water_mark() const { - return _spill_low_watermark.load(std::memory_order_relaxed); + int memory_low_watermark() const { + return _memory_low_watermark.load(std::memory_order_relaxed); } - int spill_threashold_high_water_mark() const { - return _spill_high_watermark.load(std::memory_order_relaxed); + int memory_high_watermark() const { + return _memory_high_watermark.load(std::memory_order_relaxed); } void set_weighted_memory_ratio(double ratio); @@ -105,7 +107,7 @@ class WorkloadGroup : public std::enable_shared_from_this { _total_mem_used + _wg_refresh_interval_memory_growth.load() + size; if ((realtime_total_mem_used > ((double)_weighted_memory_limit * - _spill_high_watermark.load(std::memory_order_relaxed) / 100))) { + _memory_high_watermark.load(std::memory_order_relaxed) / 100))) { return false; } else { _wg_refresh_interval_memory_growth.fetch_add(size); @@ -120,10 +122,10 @@ class WorkloadGroup : public std::enable_shared_from_this { auto realtime_total_mem_used = _total_mem_used + _wg_refresh_interval_memory_growth.load(); *is_low_wartermark = (realtime_total_mem_used > ((double)_weighted_memory_limit * - _spill_low_watermark.load(std::memory_order_relaxed) / 100)); + _memory_low_watermark.load(std::memory_order_relaxed) / 100)); *is_high_wartermark = (realtime_total_mem_used > ((double)_weighted_memory_limit * - _spill_high_watermark.load(std::memory_order_relaxed) / 100)); + _memory_high_watermark.load(std::memory_order_relaxed) / 100)); } std::string debug_string() const; @@ -165,7 +167,7 @@ class WorkloadGroup : public std::enable_shared_from_this { int64_t gc_memory(int64_t need_free_mem, RuntimeProfile* profile, bool is_minor_gc); - void upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* 
exec_env); + void upsert_task_scheduler(WorkloadGroupInfo* tg_info); void get_query_scheduler(doris::pipeline::TaskScheduler** exec_sched, vectorized::SimplifiedScanScheduler** scan_sched, @@ -198,18 +200,21 @@ class WorkloadGroup { } int64_t get_remote_scan_bytes_per_second(); - CgroupCpuCtl* get_cgroup_cpu_ctl_ptr() { - std::shared_lock rlock(_task_sched_lock); - return _cgroup_cpu_ctl.get(); - } - ThreadPool* get_memtable_flush_pool_ptr() { // no lock here because this is called by memtable flush, // to avoid lock competition with the workload thread pool's update return _memtable_flush_pool.get(); } + void create_cgroup_cpu_ctl(); + + std::weak_ptr get_cgroup_cpu_ctl_wptr(); private: + void create_cgroup_cpu_ctl_no_lock(); + void upsert_cgroup_cpu_ctl_no_lock(WorkloadGroupInfo* wg_info); + void upsert_thread_pool_no_lock(WorkloadGroupInfo* wg_info, + std::shared_ptr cg_cpu_ctl_ptr); + mutable std::shared_mutex _mutex; // lock _name, _version, _cpu_share, _memory_limit const uint64_t _id; std::string _name; @@ -228,8 +233,8 @@ class WorkloadGroup { std::atomic _scan_thread_num; std::atomic _max_remote_scan_thread_num; std::atomic _min_remote_scan_thread_num; - std::atomic _spill_low_watermark; - std::atomic _spill_high_watermark; + std::atomic _memory_low_watermark; + std::atomic _memory_high_watermark; std::atomic _scan_bytes_per_second {-1}; std::atomic _remote_scan_bytes_per_second {-1}; @@ -240,7 +245,10 @@ class WorkloadGroup { std::unordered_map> _query_ctxs; std::shared_mutex _task_sched_lock; - std::unique_ptr _cgroup_cpu_ctl {nullptr}; + // _cgroup_cpu_ctl is used not only by the thread pools managed by WorkloadGroup, + // but also by some global background thread pools that are not owned by WorkloadGroup, + // so it should be a shared ptr. + std::shared_ptr _cgroup_cpu_ctl {nullptr}; std::unique_ptr _task_sched {nullptr}; std::unique_ptr _scan_task_sched {nullptr}; std::unique_ptr _remote_scan_task_sched {nullptr}; @@ -249,6 +257,9 @@ class WorkloadGroup { std::map> _scan_io_throttle_map; std::shared_ptr _remote_scan_io_throttle {nullptr}; + // some background workloads do not need to create the query thread pools + const bool _need_create_query_thread_pool; + // bvar metric std::unique_ptr> _mem_used_status; std::unique_ptr> _cpu_usage_adder; @@ -271,8 +282,8 @@ struct WorkloadGroupInfo { const int scan_thread_num = 0; const int max_remote_scan_thread_num = 0; const int min_remote_scan_thread_num = 0; - const int spill_low_watermark = 0; - const int spill_high_watermark = 0; + const int memory_low_watermark = 0; + const int memory_high_watermark = 0; const int read_bytes_per_second = -1; const int remote_read_bytes_per_second = -1; // log cgroup cpu info diff --git a/be/src/runtime/workload_group/workload_group_manager.cpp b/be/src/runtime/workload_group/workload_group_manager.cpp index 927d4d138142672..4d32fc8700eaa5a 100644 --- a/be/src/runtime/workload_group/workload_group_manager.cpp +++ b/be/src/runtime/workload_group/workload_group_manager.cpp @@ -34,6 +34,25 @@ namespace doris { +void WorkloadGroupMgr::init_internal_workload_group() { + WorkloadGroupPtr internal_wg = nullptr; + { + std::lock_guard w_lock(_group_mutex); + if (_workload_groups.find(INTERNAL_WORKLOAD_GROUP_ID) == _workload_groups.end()) { + WorkloadGroupInfo internal_wg_info { + .id = INTERNAL_WORKLOAD_GROUP_ID, + .name = INTERNAL_WORKLOAD_GROUP_NAME, + .cpu_share =
CgroupCpuCtl::cpu_soft_limit_default_value()}; + internal_wg = std::make_shared(internal_wg_info, false); + _workload_groups[internal_wg_info.id] = internal_wg; + } + } + DCHECK(internal_wg != nullptr); + if (internal_wg) { + internal_wg->create_cgroup_cpu_ctl(); + } +} + WorkloadGroupPtr WorkloadGroupMgr::get_or_create_workload_group( const WorkloadGroupInfo& workload_group_info) { { @@ -86,6 +105,10 @@ void WorkloadGroupMgr::delete_workload_group_by_ids(std::set used_wg_i old_wg_size = _workload_groups.size(); for (auto iter = _workload_groups.begin(); iter != _workload_groups.end(); iter++) { uint64_t wg_id = iter->first; + // the internal workload group created by BE cannot be dropped + if (wg_id == INTERNAL_WORKLOAD_GROUP_ID) { + continue; + } auto workload_group_ptr = iter->second; if (used_wg_id.find(wg_id) == used_wg_id.end()) { workload_group_ptr->shutdown(); diff --git a/be/src/runtime/workload_group/workload_group_manager.h b/be/src/runtime/workload_group/workload_group_manager.h index f76e98d26063ba8..18a0687b373325f 100644 --- a/be/src/runtime/workload_group/workload_group_manager.h +++ b/be/src/runtime/workload_group/workload_group_manager.h @@ -36,11 +36,18 @@ class TaskScheduler; class MultiCoreTaskQueue; } // namespace pipeline +// internal_group is used for doris internal workloads, currently mainly compaction +const static uint64_t INTERNAL_WORKLOAD_GROUP_ID = + static_cast(TWorkloadType::type::INTERNAL); +const static std::string INTERNAL_WORKLOAD_GROUP_NAME = "_internal"; + class WorkloadGroupMgr { public: WorkloadGroupMgr() = default; ~WorkloadGroupMgr() = default; + void init_internal_workload_group(); + WorkloadGroupPtr get_or_create_workload_group(const WorkloadGroupInfo& workload_group_info); void get_related_workload_groups(const std::function& pred, @@ -64,6 +71,11 @@ class WorkloadGroupMgr { void get_wg_resource_usage(vectorized::Block* block); + WorkloadGroupPtr get_internal_wg() { + std::shared_lock r_lock(_group_mutex); + return _workload_groups[INTERNAL_WORKLOAD_GROUP_ID]; + } + private: std::shared_mutex _group_mutex; std::unordered_map _workload_groups; diff --git a/be/src/service/CMakeLists.txt b/be/src/service/CMakeLists.txt index 4ce611345840c1e..e44045dffce17ef 100644 --- a/be/src/service/CMakeLists.txt +++ b/be/src/service/CMakeLists.txt @@ -28,7 +28,7 @@ add_library(Service STATIC ${SRC_FILES}) pch_reuse(Service) -if (${MAKE_TEST} STREQUAL "OFF") +if (${MAKE_TEST} STREQUAL "OFF" AND ${BUILD_BENCHMARK} STREQUAL "OFF") add_executable(doris_be doris_main.cpp ) diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 29eb01bad2aaa8e..adcd07e7de74849 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -903,6 +903,7 @@ void PInternalService::fetch_arrow_flight_schema(google::protobuf::RpcController auto st = ExecEnv::GetInstance()->result_mgr()->find_arrow_schema( UniqueId(request->finst_id()).to_thrift(), &schema); if (!st.ok()) { + LOG(WARNING) << "fetch arrow flight schema failed, errmsg=" << st; st.to_protobuf(result->mutable_status()); return; } @@ -911,9 +912,11 @@ void PInternalService::fetch_arrow_flight_schema(google::protobuf::RpcController st = serialize_arrow_schema(&schema, &schema_str); if (st.ok()) { result->set_schema(std::move(schema_str)); - if (!config::public_access_ip.empty() && config::public_access_port != -1) { - result->set_be_arrow_flight_ip(config::public_access_ip); - result->set_be_arrow_flight_port(config::public_access_port); + if
(!config::public_host.empty()) { + result->set_be_arrow_flight_ip(config::public_host); + } + if (config::arrow_flight_sql_proxy_port != -1) { + result->set_be_arrow_flight_port(config::arrow_flight_sql_proxy_port); } } st.to_protobuf(result->mutable_status()); diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index 74dab4663403301..ea991e158a1138b 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -396,17 +396,6 @@ Status PointQueryExecutor::_lookup_row_key() { specified_rowsets = _tablet->get_rowset_by_ids(nullptr); } std::vector> segment_caches(specified_rowsets.size()); - // init segment_cache - { - SCOPED_TIMER(&_profile_metrics.load_segment_key_stage_ns); - for (size_t i = 0; i < specified_rowsets.size(); i++) { - auto& rs = specified_rowsets[i]; - segment_caches[i] = std::make_unique(); - RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( - std::static_pointer_cast(rs), segment_caches[i].get(), true, true, - &_profile_metrics.read_stats)); - } - } for (size_t i = 0; i < _row_read_ctxs.size(); ++i) { RowLocation location; if (!config::disable_storage_row_cache) { diff --git a/be/src/util/arrow/block_convertor.cpp b/be/src/util/arrow/block_convertor.cpp index 817231e02ba03e1..eb2508c8d0cb74a 100644 --- a/be/src/util/arrow/block_convertor.cpp +++ b/be/src/util/arrow/block_convertor.cpp @@ -391,8 +391,9 @@ Status FromBlockConverter::convert(std::shared_ptr* out) { _cur_start, _cur_start + _cur_rows, _timezone_obj); } catch (std::exception& e) { - return Status::InternalError("Fail to convert block data to arrow data, error: {}", - e.what()); + return Status::InternalError( + "Fail to convert block data to arrow data, type: {}, error: {}", + _cur_type->get_name(), e.what()); } arrow_st = _cur_builder->Finish(&_arrays[_cur_field_idx]); if (!arrow_st.ok()) { diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h index 016da3142cd24c2..909ee70742998e2 100644 --- a/be/src/util/jsonb_document.h +++ b/be/src/util/jsonb_document.h @@ -180,7 +180,7 @@ class JsonbDocument { static JsonbDocument* createDocument(const char* pb, size_t size); // create an JsonbValue from JSONB packed bytes - static JsonbValue* createValue(const char* pb, uint32_t size); + static JsonbValue* createValue(const char* pb, size_t size); uint8_t version() { return header_.ver_; } @@ -1160,7 +1160,7 @@ inline void JsonbDocument::setValue(const JsonbValue* value) { memcpy(payload_, value, value->numPackedBytes()); } -inline JsonbValue* JsonbDocument::createValue(const char* pb, uint32_t size) { +inline JsonbValue* JsonbDocument::createValue(const char* pb, size_t size) { if (!pb || size < sizeof(JsonbHeader) + sizeof(JsonbValue)) { return nullptr; } diff --git a/be/src/util/jsonb_parser.h b/be/src/util/jsonb_parser.h index c90012a4fbef303..4192e36ea5cc802 100644 --- a/be/src/util/jsonb_parser.h +++ b/be/src/util/jsonb_parser.h @@ -84,16 +84,16 @@ class JsonbParserT { // parse a UTF-8 JSON string bool parse(const std::string& str, hDictInsert handler = nullptr) { - return parse(str.c_str(), (unsigned int)str.size(), handler); + return parse(str.c_str(), str.size(), handler); } // parse a UTF-8 JSON c-style string (NULL terminated) bool parse(const char* c_str, hDictInsert handler = nullptr) { - return parse(c_str, (unsigned int)strlen(c_str), handler); + return parse(c_str, strlen(c_str), handler); } // parse a UTF-8 JSON string with length - bool parse(const char* pch, unsigned int len,
hDictInsert handler = nullptr) { + bool parse(const char* pch, size_t len, hDictInsert handler = nullptr) { if (!pch || len == 0) { err_ = JsonbErrType::E_EMPTY_DOCUMENT; return false; diff --git a/be/src/util/jsonb_parser_simd.h b/be/src/util/jsonb_parser_simd.h index 6621912a9d04000..96ce866f74e256d 100644 --- a/be/src/util/jsonb_parser_simd.h +++ b/be/src/util/jsonb_parser_simd.h @@ -85,16 +85,16 @@ class JsonbParserTSIMD { // parse a UTF-8 JSON string bool parse(const std::string& str, hDictInsert handler = nullptr) { - return parse(str.c_str(), (unsigned int)str.size(), handler); + return parse(str.c_str(), str.size(), handler); } // parse a UTF-8 JSON c-style string (NULL terminated) bool parse(const char* c_str, hDictInsert handler = nullptr) { - return parse(c_str, (unsigned int)strlen(c_str), handler); + return parse(c_str, strlen(c_str), handler); } // parse a UTF-8 JSON string with length - bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) { + bool parse(const char* pch, size_t len, hDictInsert handler = nullptr) { // reset state before parse reset(); diff --git a/be/src/util/jsonb_stream.h b/be/src/util/jsonb_stream.h index 4567ab8384bd9d8..2ea5d9090c735bc 100644 --- a/be/src/util/jsonb_stream.h +++ b/be/src/util/jsonb_stream.h @@ -72,7 +72,7 @@ class JsonbInBuffer : public std::streambuf { */ class JsonbOutStream : public std::ostream { public: - explicit JsonbOutStream(uint32_t capacity = 1024) + explicit JsonbOutStream(uint64_t capacity = 1024) : std::ostream(nullptr), head_(nullptr), size_(0), capacity_(capacity), alloc_(true) { if (capacity_ == 0) { capacity_ = 1024; @@ -81,7 +81,7 @@ class JsonbOutStream : public std::ostream { head_ = (char*)malloc(capacity_); } - JsonbOutStream(char* buffer, uint32_t capacity) + JsonbOutStream(char* buffer, uint64_t capacity) : std::ostream(nullptr), head_(buffer), size_(0), capacity_(capacity), alloc_(false) { assert(buffer && capacity_ > 0); } @@ -94,10 +94,12 @@ class JsonbOutStream : public std::ostream { void put(char c) { write(&c, 1); } - void write(const char* c_str) { write(c_str, (uint32_t)strlen(c_str)); } + void write(const char* c_str) { write(c_str, strlen(c_str)); } - void write(const char* bytes, uint32_t len) { - if (len == 0) return; + void write(const char* bytes, uint64_t len) { + if (len == 0) { + return; + } if (size_ + len > capacity_) { realloc(len); @@ -156,14 +158,14 @@ class JsonbOutStream : public std::ostream { pos_type tellp() const { return size_; } - void seekp(pos_type pos) { size_ = (uint32_t)pos; } + void seekp(pos_type pos) { size_ = (uint64_t)pos; } const char* getBuffer() const { return head_; } pos_type getSize() const { return tellp(); } private: - void realloc(uint32_t len) { + void realloc(uint64_t len) { assert(capacity_ > 0); capacity_ *= 2; @@ -186,8 +188,8 @@ class JsonbOutStream : public std::ostream { private: char* head_ = nullptr; - uint32_t size_; - uint32_t capacity_; + uint64_t size_; + uint64_t capacity_; bool alloc_; }; diff --git a/be/src/util/jsonb_writer.h b/be/src/util/jsonb_writer.h index 61bd28bb783bd2e..52d912d29d3b6db 100644 --- a/be/src/util/jsonb_writer.h +++ b/be/src/util/jsonb_writer.h @@ -315,7 +315,9 @@ class JsonbWriterT { return false; } - uint32_t writeString(const char* str, uint32_t len) { + // TODO: the length here was changed to uint64_t, and some related APIs need the same change, but the third-party API still uses uint32_t. + // Need to consider a better way to handle this case.
+ uint64_t writeString(const char* str, uint64_t len) { if (kvState_ == WS_String) { os_->write(str, len); return len; @@ -324,9 +326,7 @@ class JsonbWriterT { return 0; } - uint32_t writeString(const std::string& str) { - return writeString(str.c_str(), (uint32_t)str.size()); - } + uint32_t writeString(const std::string& str) { return writeString(str.c_str(), str.size()); } uint32_t writeString(char ch) { if (kvState_ == WS_String) { os_->put(ch); @@ -372,7 +372,7 @@ class JsonbWriterT { return false; } - uint32_t writeBinary(const char* bin, uint32_t len) { + uint64_t writeBinary(const char* bin, uint64_t len) { if (kvState_ == WS_Binary) { os_->write(bin, len); return len; @@ -483,8 +483,7 @@ class JsonbWriterT { } JsonbValue* getValue() { - return JsonbDocument::createValue(getOutput()->getBuffer(), - (uint32_t)getOutput()->getSize()); + return JsonbDocument::createValue(getOutput()->getBuffer(), getOutput()->getSize()); } bool writeEnd() { diff --git a/be/src/util/jvm_metrics.cpp b/be/src/util/jvm_metrics.cpp index 4cb71f5e827878d..b1089ef413628dc 100644 --- a/be/src/util/jvm_metrics.cpp +++ b/be/src/util/jvm_metrics.cpp @@ -485,8 +485,8 @@ Status JvmStats::refresh(JvmMetrics* jvm_metrics) const { jvm_metrics->jvm_thread_count->set_value(threadCount < 0 ? 0 : threadCount); for (int i = 0; i < threadCount; i++) { - JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, threadInfo, env, - GetObjectArrayElement((jobjectArray)threadInfos, i)); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, threadInfo, env, GetObjectArrayElement((jobjectArray)threadInfos, i)); if (threadInfo == nullptr) { continue; diff --git a/be/src/util/ref_count_closure.h b/be/src/util/ref_count_closure.h index 92772a82373fec7..560aebb98ee15e2 100644 --- a/be/src/util/ref_count_closure.h +++ b/be/src/util/ref_count_closure.h @@ -20,7 +20,9 @@ #include #include +#include +#include "runtime/query_context.h" #include "runtime/thread_context.h" #include "service/brpc.h" #include "util/ref_count_closure.h" @@ -79,8 +81,9 @@ class AutoReleaseClosure : public google::protobuf::Closure { ENABLE_FACTORY_CREATOR(AutoReleaseClosure); public: - AutoReleaseClosure(std::shared_ptr req, std::shared_ptr callback) - : request_(req), callback_(callback) { + AutoReleaseClosure(std::shared_ptr req, std::shared_ptr callback, + std::weak_ptr context = {}) + : request_(req), callback_(callback), context_(std::move(context)) { this->cntl_ = callback->cntl_; this->response_ = callback->response_; } @@ -113,12 +116,22 @@ class AutoReleaseClosure : public google::protobuf::Closure { protected: virtual void _process_if_rpc_failed() { - LOG(WARNING) << "RPC meet failed: " << cntl_->ErrorText(); + std::string error_msg = "RPC meet failed: " + cntl_->ErrorText(); + if (auto ctx = context_.lock(); ctx) { + ctx->cancel(Status::NetworkError(error_msg)); + } else { + LOG(WARNING) << error_msg; + } } virtual void _process_if_meet_error_status(const Status& status) { - // no need to log END_OF_FILE, reduce the unlessful log - if (!status.is()) { + if (status.is()) { + // no need to log END_OF_FILE, reduce the useless logging + return; + } + if (auto ctx = context_.lock(); ctx) { + ctx->cancel(status); + } else { LOG(WARNING) << "RPC meet error status: " << status; } } @@ -136,6 +149,7 @@ class AutoReleaseClosure : public google::protobuf::Closure { // Use a weak ptr to keep the callback, so that the callback can be deleted if the main // thread is freed.
Weak callback_; + std::weak_ptr context_; }; } // namespace doris diff --git a/be/src/util/security.h b/be/src/util/security.h new file mode 100644 index 000000000000000..d2201b1b297b700 --- /dev/null +++ b/be/src/util/security.h @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace doris { + +inline std::string mask_token(const std::string& str) { + std::regex pattern("token=[\\w|-]+"); + return std::regex_replace(str, pattern, "token=******"); +} + +inline std::string mask_token(const char* str) { + std::regex pattern("token=[\\w|-]+"); + return std::regex_replace(str, pattern, "token=******"); +} + +} // namespace doris diff --git a/be/src/util/threadpool.cpp b/be/src/util/threadpool.cpp index 15fb36181d4336b..f5ea38515def363 100644 --- a/be/src/util/threadpool.cpp +++ b/be/src/util/threadpool.cpp @@ -75,7 +75,8 @@ ThreadPoolBuilder& ThreadPoolBuilder::set_max_queue_size(int max_queue_size) { return *this; } -ThreadPoolBuilder& ThreadPoolBuilder::set_cgroup_cpu_ctl(CgroupCpuCtl* cgroup_cpu_ctl) { +ThreadPoolBuilder& ThreadPoolBuilder::set_cgroup_cpu_ctl( + std::weak_ptr cgroup_cpu_ctl) { _cgroup_cpu_ctl = cgroup_cpu_ctl; return *this; } @@ -476,8 +477,8 @@ void ThreadPool::dispatch_thread() { _num_threads++; _num_threads_pending_start--; - if (_cgroup_cpu_ctl != nullptr) { - static_cast(_cgroup_cpu_ctl->add_thread_to_cgroup()); + if (std::shared_ptr cg_cpu_ctl_sptr = _cgroup_cpu_ctl.lock()) { + static_cast(cg_cpu_ctl_sptr->add_thread_to_cgroup()); } // Owned by this worker thread and added/removed from _idle_threads as needed. diff --git a/be/src/util/threadpool.h b/be/src/util/threadpool.h index 5ce27e2f27b9a57..9bd4a7246fb0b18 100644 --- a/be/src/util/threadpool.h +++ b/be/src/util/threadpool.h @@ -107,7 +107,7 @@ class ThreadPoolBuilder { ThreadPoolBuilder& set_min_threads(int min_threads); ThreadPoolBuilder& set_max_threads(int max_threads); ThreadPoolBuilder& set_max_queue_size(int max_queue_size); - ThreadPoolBuilder& set_cgroup_cpu_ctl(CgroupCpuCtl* cgroup_cpu_ctl); + ThreadPoolBuilder& set_cgroup_cpu_ctl(std::weak_ptr cgroup_cpu_ctl); template ThreadPoolBuilder& set_idle_timeout(const std::chrono::duration& idle_timeout) { _idle_timeout = std::chrono::duration_cast(idle_timeout); @@ -133,7 +133,7 @@ class ThreadPoolBuilder { int _min_threads; int _max_threads; int _max_queue_size; - CgroupCpuCtl* _cgroup_cpu_ctl = nullptr; + std::weak_ptr _cgroup_cpu_ctl; std::chrono::milliseconds _idle_timeout; ThreadPoolBuilder(const ThreadPoolBuilder&) = delete; @@ -345,7 +345,7 @@ class ThreadPool { // Protected by _lock. int _total_queued_tasks; - CgroupCpuCtl* _cgroup_cpu_ctl = nullptr; + std::weak_ptr _cgroup_cpu_ctl; // All allocated tokens. 
// diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index e9148716f99f359..32fc9d5efce7714 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -43,7 +43,7 @@ class IDataType; struct AggregateFunctionAttr { bool enable_decimal256 {false}; - std::vector> column_infos; + std::vector column_names; }; template diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h index 7885321bba3e11b..399af84f43cf202 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_top.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h @@ -18,12 +18,92 @@ #pragma once #include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_nullable.h" namespace doris::vectorized { class AggregateFunctionApproxTop { public: + AggregateFunctionApproxTop(const std::vector& column_names) + : _column_names(column_names) {} + + static int32_t is_valid_const_columns(const std::vector& is_const_columns) { + int32_t true_count = 0; + bool found_false_after_true = false; + for (int32_t i = is_const_columns.size() - 1; i >= 0; --i) { + if (is_const_columns[i]) { + true_count++; + if (found_false_after_true) { + return false; + } + } else { + if (true_count > 2) { + return false; + } + found_false_after_true = true; + } + } + if (true_count > 2) { + throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid is_const_columns configuration"); + } + return true_count; + } + +protected: + void lazy_init(const IColumn** columns, ssize_t row_num, + const DataTypes& argument_types) const { + auto get_param = [](size_t idx, const DataTypes& data_types, + const IColumn** columns) -> uint64_t { + const auto& data_type = data_types.at(idx); + const IColumn* column = columns[idx]; + + const auto* type = data_type.get(); + if (type->is_nullable()) { + type = assert_cast(type) + ->get_nested_type() + .get(); + } + int64_t value = 0; + WhichDataType which(type); + if (which.idx == TypeIndex::Int8) { + value = assert_cast(column) + ->get_element(0); + } else if (which.idx == TypeIndex::Int16) { + value = assert_cast(column) + ->get_element(0); + } else if (which.idx == TypeIndex::Int32) { + value = assert_cast(column) + ->get_element(0); + } + if (value <= 0) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "The parameter cannot be less than or equal to 0."); + } + return value; + }; + + _threshold = + std::min(get_param(_column_names.size(), argument_types, columns), (uint64_t)4096); + _reserved = std::min( + std::max(get_param(_column_names.size() + 1, argument_types, columns), _threshold), + (uint64_t)4096); + + if (_threshold == 0 || _reserved == 0 || _threshold > 4096 || _reserved > 4096) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "approx_top_sum param error, _threshold: {}, _reserved: {}", _threshold, + _reserved); + } + + _init_flag = true; + } + static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; + + mutable std::vector _column_names; + mutable bool _init_flag = false; + mutable uint64_t _threshold = 10; + mutable uint64_t _reserved = 30; }; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp index d6298881a906308..0aa7adc253da0f5 100644 --- 
diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp
index d6298881a906308..0aa7adc253da0f5 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp
@@ -24,58 +24,16 @@
 
 namespace doris::vectorized {
 
-int32_t is_valid_const_columns(const std::vector<bool>& is_const_columns) {
-    int32_t true_count = 0;
-    bool found_false_after_true = false;
-    for (int32_t i = is_const_columns.size() - 1; i >= 0; --i) {
-        if (is_const_columns[i]) {
-            true_count++;
-            if (found_false_after_true) {
-                return false;
-            }
-        } else {
-            if (true_count > 2) {
-                return false;
-            }
-            found_false_after_true = true;
-        }
-    }
-    if (true_count > 2) {
-        throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid is_const_columns configuration");
-    }
-    return true_count;
-}
-
 AggregateFunctionPtr create_aggregate_function_approx_top_k(const std::string& name,
                                                             const DataTypes& argument_types,
                                                             const bool result_is_nullable,
                                                             const AggregateFunctionAttr& attr) {
-    if (argument_types.empty()) {
+    if (argument_types.size() < 3) {
         return nullptr;
     }
-    std::vector<bool> is_const_columns;
-    std::vector<std::string> column_names;
-    for (const auto& [name, is_const] : attr.column_infos) {
-        is_const_columns.push_back(is_const);
-        if (!is_const) {
-            column_names.push_back(name);
-        }
-    }
-
-    int32_t true_count = is_valid_const_columns(is_const_columns);
-    if (true_count == 0) {
-        return creator_without_type::create<AggregateFunctionApproxTopK<0>>(
-                argument_types, result_is_nullable, column_names);
-    } else if (true_count == 1) {
-        return creator_without_type::create<AggregateFunctionApproxTopK<1>>(
-                argument_types, result_is_nullable, column_names);
-    } else if (true_count == 2) {
-        return creator_without_type::create<AggregateFunctionApproxTopK<2>>(
-                argument_types, result_is_nullable, column_names);
-    } else {
-        return nullptr;
-    }
+    return creator_without_type::create<AggregateFunctionApproxTopK>(
+            argument_types, result_is_nullable, attr.column_names);
 }
 
 void register_aggregate_function_approx_top_k(AggregateFunctionSimpleFactory& factory) {
diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h
index 7253ae8a96e200d..93ea3232c311a10 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h
@@ -45,28 +45,25 @@
 
 namespace doris::vectorized {
 
-inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF;
-
 struct AggregateFunctionTopKGenericData {
     using Set = SpaceSaving<StringRef, StringRefHash>;
 
     Set value;
 };
 
-template <size_t ArgsSize>
 class AggregateFunctionApproxTopK final
         : public IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData,
-                                              AggregateFunctionApproxTopK<ArgsSize>>,
+                                              AggregateFunctionApproxTopK>,
           AggregateFunctionApproxTop {
private:
     using State = AggregateFunctionTopKGenericData;
 
 public:
-    AggregateFunctionApproxTopK(std::vector<std::string> column_names,
+    AggregateFunctionApproxTopK(const std::vector<std::string>& column_names,
                                 const DataTypes& argument_types_)
             : IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData,
-                                           AggregateFunctionApproxTopK<ArgsSize>>(argument_types_),
-              _column_names(std::move(column_names)) {}
+                                           AggregateFunctionApproxTopK>(argument_types_),
+              AggregateFunctionApproxTop(column_names) {}
 
     String get_name() const override { return "approx_top_k"; }
 
@@ -88,7 +85,7 @@ class AggregateFunctionApproxTopK final
     void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf,
                      Arena* arena) const override {
         auto readStringBinaryInto = [](Arena& arena, BufferReadable& buf) {
-            size_t size = 0;
+            uint64_t size = 0;
             read_var_uint(size, buf);
 
             if (UNLIKELY(size > DEFAULT_MAX_STRING_SIZE)) {
@@ -104,7 +101,7 @@ class AggregateFunctionApproxTopK final
         auto& set = this->data(place).value;
         set.clear();
 
-        size_t size = 0;
+        uint64_t size = 0;
         read_var_uint(size, buf);
         if (UNLIKELY(size > TOP_K_MAX_SIZE)) {
             throw Exception(ErrorCode::INTERNAL_ERROR,
@@ -141,7 +138,7 @@ class AggregateFunctionApproxTopK final
     void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
              Arena* arena) const override {
         if (!_init_flag) {
-            lazy_init(columns, row_num);
+            lazy_init(columns, row_num, this->get_argument_types());
         }
 
         auto& set = this->data(place).value;
@@ -227,64 +224,6 @@ class AggregateFunctionApproxTopK final
         std::string res = buffer.GetString();
         data_to.insert_data(res.data(), res.size());
     }
-
-private:
-    void lazy_init(const IColumn** columns, ssize_t row_num) const {
-        auto get_param = [](size_t idx, const DataTypes& data_types,
-                            const IColumn** columns) -> uint64_t {
-            const auto& data_type = data_types.at(idx);
-            const IColumn* column = columns[idx];
-
-            const auto* type = data_type.get();
-            if (type->is_nullable()) {
-                type = assert_cast<const DataTypeNullable*>(type)
-                               ->get_nested_type()
-                               .get();
-            }
-            int64_t value = 0;
-            WhichDataType which(type);
-            if (which.idx == TypeIndex::Int8) {
-                value = assert_cast<const ColumnInt8*>(column)
-                                ->get_element(0);
-            } else if (which.idx == TypeIndex::Int16) {
-                value = assert_cast<const ColumnInt16*>(column)
-                                ->get_element(0);
-            } else if (which.idx == TypeIndex::Int32) {
-                value = assert_cast<const ColumnInt32*>(column)
-                                ->get_element(0);
-            }
-            if (value <= 0) {
-                throw Exception(ErrorCode::INVALID_ARGUMENT,
-                                "The parameter cannot be less than or equal to 0.");
-            }
-            return value;
-        };
-
-        const auto& data_types = this->get_argument_types();
-        if (ArgsSize == 1) {
-            _threshold =
-                    std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000);
-        } else if (ArgsSize == 2) {
-            _threshold =
-                    std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000);
-            _reserved = std::min(
-                    std::max(get_param(_column_names.size() + 1, data_types, columns), _threshold),
-                    (uint64_t)1000);
-        }
-
-        if (_threshold == 0 || _reserved == 0 || _threshold > 1000 || _reserved > 1000) {
-            throw Exception(ErrorCode::INTERNAL_ERROR,
-                            "approx_top_k param error, _threshold: {}, _reserved: {}", _threshold,
-                            _reserved);
-        }
-
-        _init_flag = true;
-    }
-
-    mutable std::vector<std::string> _column_names;
-    mutable bool _init_flag = false;
-    mutable uint64_t _threshold = 10;
-    mutable uint64_t _reserved = 300;
 };
 
 } // namespace doris::vectorized
\ No newline at end of file
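A small fix threaded through these hunks is the change of size_t counters to uint64_t wherever read_var_uint fills them: the on-wire varint is explicitly 64-bit, and size_t only happens to match that width on LP64 platforms. A self-contained LEB128-style decoder, an illustration of the general varint pattern rather than Doris's actual read_var_uint implementation, makes the width requirement concrete:

    #include <cstdint>
    #include <cstdio>

    // Decodes an unsigned LEB128-style varint from buf into out (up to 64 bits).
    // Illustrative only: Doris's read_var_uint uses its own buffer abstraction.
    const uint8_t* decode_var_uint(const uint8_t* buf, uint64_t& out) {
        out = 0;
        for (int shift = 0; shift < 64; shift += 7) {
            uint8_t byte = *buf++;
            out |= (uint64_t)(byte & 0x7F) << shift;
            if ((byte & 0x80) == 0) break; // high bit clear: last byte of the varint
        }
        return buf;
    }

    int main() {
        // 0x96 0x01 encodes 150: low 7 bits are 0x16, then 0x01 << 7.
        const uint8_t wire[] = {0x96, 0x01};
        uint64_t v = 0;
        decode_var_uint(wire, v);
        printf("%llu\n", (unsigned long long)v); // prints 150
        // A 32-bit size_t here would silently truncate values above 2^32 - 1,
        // which is why the patch pins the destination type to uint64_t.
        return 0;
    }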
diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp
new file mode 100644
index 000000000000000..7325651d141c13a
--- /dev/null
+++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/aggregate_functions/aggregate_function_approx_top_sum.h"
+
+#include "common/exception.h"
+#include "vec/aggregate_functions/aggregate_function_simple_factory.h"
+#include "vec/aggregate_functions/helpers.h"
+#include "vec/data_types/data_type.h"
+
+namespace doris::vectorized {
+
+template <size_t N>
+AggregateFunctionPtr create_aggregate_function_multi_top_sum_impl(
+        const DataTypes& argument_types, const bool result_is_nullable,
+        const std::vector<std::string>& column_names) {
+    if (N == argument_types.size() - 3) {
+        return creator_with_type_base::template create<
+                AggregateFunctionApproxTopSumSimple>(argument_types, result_is_nullable,
+                                                     column_names);
+    } else {
+        return create_aggregate_function_multi_top_sum_impl<N - 1>(
+                argument_types, result_is_nullable, column_names);
+    }
+}
+
+template <>
+AggregateFunctionPtr create_aggregate_function_multi_top_sum_impl<0>(
+        const DataTypes& argument_types, const bool result_is_nullable,
+        const std::vector<std::string>& column_names) {
+    return creator_with_type_base::template create<
+            AggregateFunctionApproxTopSumSimple>(argument_types, result_is_nullable, column_names);
+}
+
+AggregateFunctionPtr create_aggregate_function_approx_top_sum(const std::string& name,
+                                                              const DataTypes& argument_types,
+                                                              const bool result_is_nullable,
+                                                              const AggregateFunctionAttr& attr) {
+    if (argument_types.size() < 3) {
+        return nullptr;
+    }
+
+    constexpr size_t max_param_value = 10;
+    if (argument_types.size() > max_param_value) {
+        throw Exception(ErrorCode::INTERNAL_ERROR,
+                        "Argument types size exceeds the supported limit.");
+    }
+
+    return create_aggregate_function_multi_top_sum_impl<max_param_value>(
+            argument_types, result_is_nullable, attr.column_names);
+}
+
+void register_aggregate_function_approx_top_sum(AggregateFunctionSimpleFactory& factory) {
+    factory.register_function_both("approx_top_sum", create_aggregate_function_approx_top_sum);
+}
+
+} // namespace doris::vectorized
\ No newline at end of file
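The factory above maps a runtime argument count onto a compile-time template parameter by recursing from max_param_value down to the explicit <0> specialization. A stripped-down sketch of that dispatch pattern, independent of the Doris creator machinery (which is assumed here):

    #include <cstddef>
    #include <cstdio>

    // Each instantiation handles one possible value of n; the chain is unrolled
    // at compile time, so the "recursion" costs one comparison per level at runtime.
    template <size_t N>
    void dispatch(size_t n) {
        if (n == N) {
            printf("handled with compile-time constant %zu\n", N);
        } else {
            dispatch<N - 1>(n); // try the next smaller constant
        }
    }

    // The base case terminates the instantiation chain, exactly like the <0>
    // specialization of create_aggregate_function_multi_top_sum_impl above.
    template <>
    void dispatch<0>(size_t) {
        printf("handled with compile-time constant 0\n");
    }

    int main() {
        dispatch<10>(7); // walks 10 -> 9 -> 8 -> 7, then runs the N == 7 body
        return 0;
    }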
diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h
new file mode 100644
index 000000000000000..12b89bd02b51fdc
--- /dev/null
+++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h
@@ -0,0 +1,245 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <rapidjson/prettywriter.h>
+#include <rapidjson/stringbuffer.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <memory>
+#include <vector>
+
+#include "vec/aggregate_functions/aggregate_function.h"
+#include "vec/aggregate_functions/aggregate_function_approx_top.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
+#include "vec/columns/column_vector.h"
+#include "vec/columns/columns_number.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/space_saving.h"
+#include "vec/common/string_ref.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_ipv4.h"
+#include "vec/data_types/data_type_struct.h"
+#include "vec/io/io_helper.h"
+
+namespace doris::vectorized {
+
+struct AggregateFunctionTopKGenericData {
+    using Set = SpaceSaving<StringRef, StringRefHash>;
+
+    Set value;
+};
+
+template <typename T, typename TResult, typename TData>
+class AggregateFunctionApproxTopSum final
+        : public IAggregateFunctionDataHelper<TData,
+                                              AggregateFunctionApproxTopSum<T, TResult, TData>>,
+          AggregateFunctionApproxTop {
+private:
+    using State = AggregateFunctionTopKGenericData;
+
+    using ResultDataType = DataTypeNumber<TResult>;
+    using ColVecType = ColumnVector<T>;
+    using ColVecResult = ColumnVector<TResult>;
+
+public:
+    AggregateFunctionApproxTopSum(const std::vector<std::string>& column_names,
+                                  const DataTypes& argument_types_)
+            : IAggregateFunctionDataHelper<TData,
+                                           AggregateFunctionApproxTopSum<T, TResult, TData>>(
+                      argument_types_),
+              AggregateFunctionApproxTop(column_names) {}
+
+    String get_name() const override { return "approx_top_sum"; }
+
+    DataTypePtr get_return_type() const override { return std::make_shared<DataTypeString>(); }
+
+    // Serializes the aggregate function's state (including the SpaceSaving structure and threshold) into a buffer.
+    void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override {
+        this->data(place).value.write(buf);
+
+        write_var_uint(_column_names.size(), buf);
+        for (const auto& column_name : _column_names) {
+            write_string_binary(column_name, buf);
+        }
+        write_var_uint(_threshold, buf);
+        write_var_uint(_reserved, buf);
+    }
+
+    // Deserializes the aggregate function's state from a buffer (including the SpaceSaving structure and threshold).
+    void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf,
+                     Arena* arena) const override {
+        auto readStringBinaryInto = [](Arena& arena, BufferReadable& buf) {
+            uint64_t size = 0;
+            read_var_uint(size, buf);
+
+            if (UNLIKELY(size > DEFAULT_MAX_STRING_SIZE)) {
+                throw Exception(ErrorCode::INTERNAL_ERROR, "Too large string size.");
+            }
+
+            char* data = arena.alloc(size);
+            buf.read(data, size);
+
+            return StringRef(data, size);
+        };
+
+        auto& set = this->data(place).value;
+        set.clear();
+
+        uint64_t size = 0;
+        read_var_uint(size, buf);
+        if (UNLIKELY(size > TOP_K_MAX_SIZE)) {
+            throw Exception(ErrorCode::INTERNAL_ERROR,
+                            "Too large size ({}) for aggregate function '{}' state (maximum is {})",
+                            size, get_name(), TOP_K_MAX_SIZE);
+        }
+
+        set.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            auto ref = readStringBinaryInto(*arena, buf);
+            uint64_t count = 0;
+            uint64_t error = 0;
+            read_var_uint(count, buf);
+            read_var_uint(error, buf);
+            set.insert(ref, count, error);
+            arena->rollback(ref.size);
+        }
+
+        set.read_alpha_map(buf);
+
+        uint64_t column_size = 0;
+        read_var_uint(column_size, buf);
+        _column_names.clear();
+        for (uint64_t i = 0; i < column_size; i++) {
+            std::string column_name;
+            read_string_binary(column_name, buf);
+            _column_names.emplace_back(std::move(column_name));
+        }
+        read_var_uint(_threshold, buf);
+        read_var_uint(_reserved, buf);
+    }
+
+    // Adds a new row of data to the aggregate function (inserts a new value into the SpaceSaving structure).
+    void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
+             Arena* arena) const override {
+        if (!_init_flag) {
+            lazy_init(columns, row_num, this->get_argument_types());
+        }
+
+        auto& set = this->data(place).value;
+        if (set.capacity() != _reserved) {
+            set.resize(_reserved);
+        }
+
+        auto all_serialize_value_into_arena =
+                [](size_t i, size_t keys_size, const IColumn** columns, Arena* arena) -> StringRef {
+            const char* begin = nullptr;
+
+            size_t sum_size = 0;
+            for (size_t j = 0; j < keys_size; ++j) {
+                sum_size += columns[j]->serialize_value_into_arena(i, *arena, begin).size;
+            }
+
+            return {begin, sum_size};
+        };
+
+        StringRef str_serialized =
+                all_serialize_value_into_arena(row_num, _column_names.size(), columns, arena);
+        const auto& column = assert_cast<const ColVecType&>(
+                *columns[_column_names.size() - 1]);
+        set.insert(str_serialized, TResult(column.get_data()[row_num]));
+        arena->rollback(str_serialized.size);
+    }
+
+    void add_many(AggregateDataPtr __restrict place, const IColumn** columns,
+                  std::vector<int>& rows, Arena* arena) const override {
+        for (auto row : rows) {
+            add(place, columns, row, arena);
+        }
+    }
+
+    void reset(AggregateDataPtr __restrict place) const override {
+        this->data(place).value.clear();
+    }
+
+    // Merges the state of another aggregate function into the current one (merges two SpaceSaving sets).
+    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs,
+               Arena*) const override {
+        auto& rhs_set = this->data(rhs).value;
+        if (!rhs_set.size()) {
+            return;
+        }
+
+        auto& set = this->data(place).value;
+        if (set.capacity() != _reserved) {
+            set.resize(_reserved);
+        }
+        set.merge(rhs_set);
+    }
+
+    void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override {
+        auto& data_to = assert_cast<ColumnString&>(to);
+
+        const typename State::Set& set = this->data(place).value;
+        auto result_vec = set.top_k(_threshold);
+
+        rapidjson::StringBuffer buffer;
+        rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(buffer);
+        writer.StartArray();
+        for (auto& result : result_vec) {
+            auto argument_types = this->get_argument_types();
+            MutableColumns argument_columns(_column_names.size());
+            for (size_t i = 0; i < _column_names.size(); ++i) {
+                argument_columns[i] = argument_types[i]->create_column();
+            }
+            rapidjson::StringBuffer sub_buffer;
+            rapidjson::Writer<rapidjson::StringBuffer> sub_writer(sub_buffer);
+            sub_writer.StartObject();
+            const char* begin = result.key.data;
+            for (size_t i = 0; i < _column_names.size(); i++) {
+                begin = argument_columns[i]->deserialize_and_insert_from_arena(begin);
+                std::string row_str = argument_types[i]->to_string(*argument_columns[i], 0);
+                sub_writer.Key(_column_names[i].data(), _column_names[i].size());
+                sub_writer.String(row_str.data(), row_str.size());
+            }
+            sub_writer.Key("sum");
+            sub_writer.String(std::to_string(result.count).c_str());
+            sub_writer.EndObject();
+            writer.RawValue(sub_buffer.GetString(), sub_buffer.GetSize(), rapidjson::kObjectType);
+        }
+        writer.EndArray();
+        std::string res = buffer.GetString();
+        data_to.insert_data(res.data(), res.size());
+    }
+};
+
+template <typename T>
+struct TopSumSimple {
+    using ResultType = T;
+    using AggregateDataType = AggregateFunctionTopKGenericData;
+    using Function = AggregateFunctionApproxTopSum<T, ResultType, AggregateDataType>;
+};
+
+template <typename T>
+using AggregateFunctionApproxTopSumSimple = typename TopSumSimple<T>::Function;
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/aggregate_functions/aggregate_function_collect.h b/be/src/vec/aggregate_functions/aggregate_function_collect.h
index 02490be56a0bf1c..da310c6e0cc4c22 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_collect.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_collect.h
@@ -98,7 +98,7 @@ struct AggregateFunctionCollectSetData {
     }
 
     void read(BufferReadable& buf) {
-        size_t new_size = 0;
+        uint64_t new_size = 0;
         read_var_uint(new_size, buf);
         ElementNativeType x;
         for (size_t i = 0; i < new_size; ++i) {
diff --git a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp
index 71d09f61de4302d..4c5fe1321952d68 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp
@@ -29,20 +29,15 @@
 
 namespace doris::vectorized {
 
-template
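One design note on the new insert_result_into: approx_top_sum returns its result as a single JSON string (hence DataTypeString as the return type) rather than a typed array column. Assuming a hypothetical call grouping on columns k1 and k2, the emitted value would look roughly like the sample below. The values are illustrative; each per-row object is written compactly by the inner Writer and embedded into the pretty-printed outer array via RawValue, and "sum" comes out as a JSON string, not a number, because the code passes std::to_string(result.count) to String().

    [
        {"k1": "a", "k2": "x", "sum": "1024"},
        {"k1": "b", "k2": "y", "sum": "768"}
    ]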