From 6e3751a5c2c5ccff00c342e6701afbd1150eeb93 Mon Sep 17 00:00:00 2001 From: huanghaibin Date: Sun, 17 Nov 2024 21:04:39 +0800 Subject: [PATCH] [fix](cloud-mow) MS should create new Transaction to continue geting delete bitmap when encounter TXN_TOO_OLD (#44074) When the delete bitmap count is large, getting the delete bitmap may encounter TXN_TOO_OLD; MS should create a new transaction to read the remaining data instead of returning the TXN_TOO_OLD code. pick pr:https://github.com/apache/doris/pull/43509 --- cloud/src/common/config.h | 3 + cloud/src/meta-service/meta_service.cpp | 64 ++++++- cloud/src/meta-service/meta_service_helper.h | 3 +- cloud/test/meta_service_test.cpp | 167 ++++++++++++++++++- gensrc/proto/cloud.proto | 1 + 5 files changed, 232 insertions(+), 6 deletions(-) diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h index 9fe98c16510f11..f6e0073b92464f 100644 --- a/cloud/src/common/config.h +++ b/cloud/src/common/config.h @@ -214,6 +214,9 @@ CONF_mInt64(max_s3_client_retry, "10"); // Max aborted txn num for the same label name CONF_mInt64(max_num_aborted_txn, "100"); +// Max byte getting delete bitmap can return, default is 1GB +CONF_mInt64(max_get_delete_bitmap_byte, "1073741824"); + CONF_Bool(enable_cloud_txn_lazy_commit, "true"); CONF_Int32(txn_lazy_commit_rowsets_thresold, "1000"); CONF_Int32(txn_lazy_commit_num_threads, "8"); diff --git a/cloud/src/meta-service/meta_service.cpp b/cloud/src/meta-service/meta_service.cpp index 69740dbf49aec9..acca75b84047e1 100644 --- a/cloud/src/meta-service/meta_service.cpp +++ b/cloud/src/meta-service/meta_service.cpp @@ -1907,6 +1907,12 @@ void MetaServiceImpl::get_delete_bitmap(google::protobuf::RpcController* control return; } + response->set_tablet_id(tablet_id); + int64_t delete_bitmap_num = 0; + int64_t delete_bitmap_byte = 0; + bool test = false; + TEST_SYNC_POINT_CALLBACK("get_delete_bitmap_test", &test); + for (size_t i = 0; i < rowset_ids.size(); i++) { // create a new transaction every time, 
avoid using one transaction that takes too long std::unique_ptr txn; @@ -1931,11 +1937,40 @@ void MetaServiceImpl::get_delete_bitmap(google::protobuf::RpcController* control std::unique_ptr it; int64_t last_ver = -1; int64_t last_seg_id = -1; + int64_t round = 0; do { - err = txn->get(start_key, end_key, &it); + if (test) { + LOG(INFO) << "test"; + err = txn->get(start_key, end_key, &it, false, 2); + } else { + err = txn->get(start_key, end_key, &it); + } + TEST_SYNC_POINT_CALLBACK("get_delete_bitmap_err", &round, &err); + int64_t retry = 0; + while (err == TxnErrorCode::TXN_TOO_OLD && retry < 3) { + txn = nullptr; + err = txn_kv_->create_txn(&txn); + if (err != TxnErrorCode::TXN_OK) { + code = cast_as(err); + ss << "failed to init txn, retry=" << retry << ", internal round=" << round; + msg = ss.str(); + return; + } + if (test) { + err = txn->get(start_key, end_key, &it, false, 2); + } else { + err = txn->get(start_key, end_key, &it); + } + retry++; + LOG(INFO) << "retry get delete bitmap, tablet=" << tablet_id << ", retry=" << retry + << ", internal round=" << round + << ", delete_bitmap_num=" << delete_bitmap_num + << ", delete_bitmap_byte=" << delete_bitmap_byte; + } if (err != TxnErrorCode::TXN_OK) { code = cast_as(err); - ss << "internal error, failed to get delete bitmap, ret=" << err; + ss << "internal error, failed to get delete bitmap, internal round=" << round + << ", ret=" << err; msg = ss.str(); return; } @@ -1960,18 +1995,39 @@ void MetaServiceImpl::get_delete_bitmap(google::protobuf::RpcController* control response->add_segment_delete_bitmaps(std::string(v)); last_ver = ver; last_seg_id = seg_id; + delete_bitmap_num++; + delete_bitmap_byte += v.length(); } else { TEST_SYNC_POINT_CALLBACK("get_delete_bitmap_code", &code); if (code != MetaServiceCode::OK) { - msg = "test get get_delete_bitmap fail,code=" + MetaServiceCode_Name(code); + ss << "test get get_delete_bitmap fail, code=" << MetaServiceCode_Name(code) + << ", internal round=" << round; + msg 
= ss.str(); return; } + delete_bitmap_byte += v.length(); response->mutable_segment_delete_bitmaps()->rbegin()->append(v); } } + if (delete_bitmap_byte > config::max_get_delete_bitmap_byte) { + code = MetaServiceCode::KV_TXN_GET_ERR; + ss << "tablet=" << tablet_id << ", get_delete_bitmap_byte=" << delete_bitmap_byte + << ",exceed max byte"; + msg = ss.str(); + LOG(WARNING) << msg; + return; + } + round++; start_key = it->next_begin_key(); // Update to next smallest key for iteration } while (it->more()); - } + LOG(INFO) << "get delete bitmap for tablet=" << tablet_id << ", rowset=" << rowset_ids[i] + << ", start version=" << begin_versions[i] << ", end version=" << end_versions[i] + << ", internal round=" << round << ", delete_bitmap_num=" << delete_bitmap_num + << ", delete_bitmap_byte=" << delete_bitmap_byte; + } + LOG(INFO) << "finish get delete bitmap for tablet=" << tablet_id + << ", delete_bitmap_num=" << delete_bitmap_num + << ", delete_bitmap_byte=" << delete_bitmap_byte; if (request->has_idx()) { std::unique_ptr txn; diff --git a/cloud/src/meta-service/meta_service_helper.h b/cloud/src/meta-service/meta_service_helper.h index 8a1a53061949e0..4ef98ea2fb7b0c 100644 --- a/cloud/src/meta-service/meta_service_helper.h +++ b/cloud/src/meta-service/meta_service_helper.h @@ -116,7 +116,8 @@ void finish_rpc(std::string_view func_name, brpc::Controller* ctrl, Response* re } LOG(INFO) << "finish " << func_name << " from " << ctrl->remote_side() << " status=" << res->status().ShortDebugString() - << " delete_bitmap_size=" << res->segment_delete_bitmaps_size(); + << " tablet=" << res->tablet_id() + << " delete_bitmap_count=" << res->segment_delete_bitmaps_size(); } else if constexpr (std::is_same_v || std::is_same_v) { std::string debug_string = res->DebugString(); diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp index c67b49aac3f6f7..1f93b228c6de08 100644 --- a/cloud/test/meta_service_test.cpp +++ b/cloud/test/meta_service_test.cpp @@ 
-5140,7 +5140,7 @@ TEST(MetaServiceTest, DeleteBimapCommitTxnTest) { } } -TEST(MetaServiceTest, GetDeleteBitmapWithRetryTest) { +TEST(MetaServiceTest, GetDeleteBitmapWithRetryTest1) { auto meta_service = get_meta_service(); SyncPoint::get_instance()->enable_processing(); size_t index = 0; @@ -5211,6 +5211,171 @@ TEST(MetaServiceTest, GetDeleteBitmapWithRetryTest) { SyncPoint::get_instance()->clear_all_call_backs(); } +TEST(MetaServiceTest, GetDeleteBitmapWithRetryTest2) { + auto meta_service = get_meta_service(); + SyncPoint::get_instance()->enable_processing(); + size_t index = 0; + SyncPoint::get_instance()->set_call_back("get_delete_bitmap_test", [&](auto&& args) { + auto* test = try_any_cast(args[0]); + *test = true; + LOG(INFO) << "GET_DELETE_BITMAP_TEST, test=" << *test; + }); + SyncPoint::get_instance()->set_call_back("get_delete_bitmap_err", [&](auto&& args) { + auto* round = try_any_cast(args[0]); + LOG(INFO) << "GET_DELETE_BITMAP_CODE,index=" << index << ",round=" << *round; + if (*round > 2 && ++index < 2) { + *try_any_cast(args[1]) = TxnErrorCode::TXN_TOO_OLD; + } + }); + + // get delete bitmap update lock + brpc::Controller cntl; + GetDeleteBitmapUpdateLockRequest get_lock_req; + GetDeleteBitmapUpdateLockResponse get_lock_res; + get_lock_req.set_cloud_unique_id("test_cloud_unique_id"); + get_lock_req.set_table_id(100); + get_lock_req.add_partition_ids(123); + get_lock_req.set_expiration(5); + get_lock_req.set_lock_id(888); + get_lock_req.set_initiator(-1); + meta_service->get_delete_bitmap_update_lock( + reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &get_lock_req, + &get_lock_res, nullptr); + ASSERT_EQ(get_lock_res.status().code(), MetaServiceCode::OK); + + //first update new key + UpdateDeleteBitmapRequest update_delete_bitmap_req; + UpdateDeleteBitmapResponse update_delete_bitmap_res; + update_delete_bitmap_req.set_cloud_unique_id("test_cloud_unique_id"); + update_delete_bitmap_req.set_table_id(100); + 
update_delete_bitmap_req.set_partition_id(123); + update_delete_bitmap_req.set_lock_id(888); + update_delete_bitmap_req.set_initiator(-1); + update_delete_bitmap_req.set_tablet_id(333); + std::string rowset_id = "456"; + std::string segment_delete_bitmaps[5]; + for (int i = 0; i < 5; i++) { + segment_delete_bitmaps[i] = generate_random_string(300 * 1000 * 3); + } + int count = 5; + for (int i = 0; i < count; i++) { + update_delete_bitmap_req.add_rowset_ids(rowset_id); + update_delete_bitmap_req.add_segment_ids(i); + update_delete_bitmap_req.add_versions(i + 1); + update_delete_bitmap_req.add_segment_delete_bitmaps(segment_delete_bitmaps[i]); + } + meta_service->update_delete_bitmap(reinterpret_cast(&cntl), + &update_delete_bitmap_req, &update_delete_bitmap_res, + nullptr); + ASSERT_EQ(update_delete_bitmap_res.status().code(), MetaServiceCode::OK); + + GetDeleteBitmapRequest get_delete_bitmap_req; + GetDeleteBitmapResponse get_delete_bitmap_res; + get_delete_bitmap_req.set_cloud_unique_id("test_cloud_unique_id"); + get_delete_bitmap_req.set_tablet_id(333); + + get_delete_bitmap_req.add_rowset_ids(rowset_id); + get_delete_bitmap_req.add_begin_versions(1); + get_delete_bitmap_req.add_end_versions(count); + + meta_service->get_delete_bitmap(reinterpret_cast(&cntl), + &get_delete_bitmap_req, &get_delete_bitmap_res, nullptr); + ASSERT_EQ(get_delete_bitmap_res.status().code(), MetaServiceCode::OK); + ASSERT_EQ(get_delete_bitmap_res.rowset_ids_size(), count); + ASSERT_EQ(get_delete_bitmap_res.segment_ids_size(), count); + ASSERT_EQ(get_delete_bitmap_res.versions_size(), count); + ASSERT_EQ(get_delete_bitmap_res.segment_delete_bitmaps_size(), count); + + for (int i = 0; i < count; i++) { + ASSERT_EQ(get_delete_bitmap_res.rowset_ids(i), rowset_id); + ASSERT_EQ(get_delete_bitmap_res.segment_ids(i), i); + ASSERT_EQ(get_delete_bitmap_res.versions(i), i + 1); + ASSERT_EQ(get_delete_bitmap_res.segment_delete_bitmaps(i), segment_delete_bitmaps[i]); + } + 
SyncPoint::get_instance()->disable_processing(); + SyncPoint::get_instance()->clear_all_call_backs(); +} + +TEST(MetaServiceTest, GetDeleteBitmapWithRetryTest3) { + auto meta_service = get_meta_service(); + SyncPoint::get_instance()->enable_processing(); + size_t index = 0; + SyncPoint::get_instance()->set_call_back("get_delete_bitmap_err", [&](auto&& args) { + auto* round = try_any_cast(args[0]); + LOG(INFO) << "GET_DELETE_BITMAP_CODE,index=" << index << ",round=" << *round; + if (*round > 2 && ++index < 2) { + *try_any_cast(args[1]) = TxnErrorCode::TXN_TOO_OLD; + } + }); + + // get delete bitmap update lock + brpc::Controller cntl; + GetDeleteBitmapUpdateLockRequest get_lock_req; + GetDeleteBitmapUpdateLockResponse get_lock_res; + get_lock_req.set_cloud_unique_id("test_cloud_unique_id"); + get_lock_req.set_table_id(100); + get_lock_req.add_partition_ids(123); + get_lock_req.set_expiration(5); + get_lock_req.set_lock_id(888); + get_lock_req.set_initiator(-1); + meta_service->get_delete_bitmap_update_lock( + reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &get_lock_req, + &get_lock_res, nullptr); + ASSERT_EQ(get_lock_res.status().code(), MetaServiceCode::OK); + + //first update new key + UpdateDeleteBitmapRequest update_delete_bitmap_req; + UpdateDeleteBitmapResponse update_delete_bitmap_res; + update_delete_bitmap_req.set_cloud_unique_id("test_cloud_unique_id"); + update_delete_bitmap_req.set_table_id(100); + update_delete_bitmap_req.set_partition_id(123); + update_delete_bitmap_req.set_lock_id(888); + update_delete_bitmap_req.set_initiator(-1); + update_delete_bitmap_req.set_tablet_id(333); + std::string rowset_id = "456"; + std::string segment_delete_bitmaps[5]; + for (int i = 0; i < 5; i++) { + segment_delete_bitmaps[i] = generate_random_string(300 * 1000 * 3); + } + int count = 5; + for (int i = 0; i < count; i++) { + update_delete_bitmap_req.add_rowset_ids(rowset_id); + update_delete_bitmap_req.add_segment_ids(i); + 
update_delete_bitmap_req.add_versions(i + 1); + update_delete_bitmap_req.add_segment_delete_bitmaps(segment_delete_bitmaps[i]); + } + meta_service->update_delete_bitmap(reinterpret_cast(&cntl), + &update_delete_bitmap_req, &update_delete_bitmap_res, + nullptr); + ASSERT_EQ(update_delete_bitmap_res.status().code(), MetaServiceCode::OK); + + GetDeleteBitmapRequest get_delete_bitmap_req; + GetDeleteBitmapResponse get_delete_bitmap_res; + get_delete_bitmap_req.set_cloud_unique_id("test_cloud_unique_id"); + get_delete_bitmap_req.set_tablet_id(333); + + get_delete_bitmap_req.add_rowset_ids(rowset_id); + get_delete_bitmap_req.add_begin_versions(1); + get_delete_bitmap_req.add_end_versions(count); + + meta_service->get_delete_bitmap(reinterpret_cast(&cntl), + &get_delete_bitmap_req, &get_delete_bitmap_res, nullptr); + ASSERT_EQ(get_delete_bitmap_res.status().code(), MetaServiceCode::OK); + ASSERT_EQ(get_delete_bitmap_res.rowset_ids_size(), count); + ASSERT_EQ(get_delete_bitmap_res.segment_ids_size(), count); + ASSERT_EQ(get_delete_bitmap_res.versions_size(), count); + ASSERT_EQ(get_delete_bitmap_res.segment_delete_bitmaps_size(), count); + + for (int i = 0; i < count; i++) { + ASSERT_EQ(get_delete_bitmap_res.rowset_ids(i), rowset_id); + ASSERT_EQ(get_delete_bitmap_res.segment_ids(i), i); + ASSERT_EQ(get_delete_bitmap_res.versions(i), i + 1); + ASSERT_EQ(get_delete_bitmap_res.segment_delete_bitmaps(i), segment_delete_bitmaps[i]); + } + SyncPoint::get_instance()->disable_processing(); + SyncPoint::get_instance()->clear_all_call_backs(); +} + TEST(MetaServiceTest, GetVersion) { auto service = get_meta_service(); diff --git a/gensrc/proto/cloud.proto b/gensrc/proto/cloud.proto index 8d93b973b96fa2..381075074335c5 100644 --- a/gensrc/proto/cloud.proto +++ b/gensrc/proto/cloud.proto @@ -1421,6 +1421,7 @@ message GetDeleteBitmapResponse { repeated int64 versions = 4; // Serialized roaring bitmaps indexed with {rowset_id, segment_id, version} repeated bytes segment_delete_bitmaps 
= 5; + optional int64 tablet_id = 6; } message RemoveDeleteBitmapRequest {