diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc
index 5c8b6db2ba3..3f7b029572e 100644
--- a/db/db_memtable_test.cc
+++ b/db/db_memtable_test.cc
@@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
   }
 }
 
+TEST_F(DBMemTableTest, IntegrityChecks) {
+  // We insert keys key000000, key000001 and key000002 into skiplist at fixed
+  // height 1 (smallest height). Then we corrupt the second key to aey000001
+  // to make it smaller. With `paranoid_memory_checks` set to true, if the
+  // skip list sees key000000 and then aey000001, then it will report out of
+  // order keys with corruption status. With `paranoid_memory_checks` set
+  // to false, read/scan may return wrong results.
+  for (bool allow_data_in_error : {false, true}) {
+    Options options = CurrentOptions();
+    options.allow_data_in_errors = allow_data_in_error;
+    options.paranoid_memory_checks = true;
+    DestroyAndReopen(options);
+    SyncPoint::GetInstance()->SetCallBack(
+        "InlineSkipList::RandomHeight::height", [](void* h) {
+          auto height_ptr = static_cast<int*>(h);
+          *height_ptr = 1;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_OK(Put(Key(0), "val0"));
+    ASSERT_OK(Put(Key(2), "val2"));
+    // p will point to the buffer for encoded key000001
+    char* p = nullptr;
+    SyncPoint::GetInstance()->SetCallBack(
+        "MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) {
+          p = const_cast<char*>(static_cast<Slice*>(encoded)->data());
+        });
+    ASSERT_OK(Put(Key(1), "val1"));
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    ASSERT_TRUE(p);
+    // Offset 0 is key size, key bytes start at offset 1.
+    // "key000001 -> aey000001"
+    p[1] = 'a';
+
+    ReadOptions rops;
+    std::string val;
+    Status s = db_->Get(rops, Key(1), &val);
+    ASSERT_TRUE(s.IsCorruption());
+    std::string key0 = Slice(Key(0)).ToString(true);
+    ASSERT_EQ(s.ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+    // Without `paranoid_memory_checks`, NotFound will be returned.
+    // This would fail an assertion in InlineSkipList::FindGreaterOrEqual().
+    // If we remove the assertion, this passes.
+    // ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &val).IsNotFound());
+
+    std::vector<std::string> vals;
+    std::vector<Status> statuses = db_->MultiGet(
+        rops, {db_->DefaultColumnFamily()}, {Key(1)}, &vals, nullptr);
+    ASSERT_TRUE(statuses[0].IsCorruption());
+    ASSERT_EQ(statuses[0].ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+
+    std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
+    ASSERT_OK(iter->status());
+    iter->Seek(Key(1));
+    ASSERT_TRUE(iter->status().IsCorruption());
+    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+
+    iter->Seek(Key(0));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    // iterating through skip list at height 1 should catch out-of-order keys
+    iter->Next();
+    ASSERT_TRUE(iter->status().IsCorruption());
+    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+    ASSERT_FALSE(iter->Valid());
+
+    iter->SeekForPrev(Key(2));
+    ASSERT_TRUE(iter->status().IsCorruption());
+    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+
+    // Internally DB Iter will iterate backwards (call Prev()) after
+    // SeekToLast() to find the correct internal key with the last user key.
+    // Prev() will do integrity checks and catch corruption.
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->status().IsCorruption());
+    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+    ASSERT_FALSE(iter->Valid());
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/memtable.cc b/db/memtable.cc
index b1df2ae9c56..ef1184ded48 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -67,9 +67,10 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
       statistics(ioptions.stats),
       merge_operator(ioptions.merge_operator.get()),
       info_log(ioptions.logger),
-      allow_data_in_errors(ioptions.allow_data_in_errors),
       protection_bytes_per_key(
-          mutable_cf_options.memtable_protection_bytes_per_key) {}
+          mutable_cf_options.memtable_protection_bytes_per_key),
+      allow_data_in_errors(ioptions.allow_data_in_errors),
+      paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}
 
 MemTable::MemTable(const InternalKeyComparator& cmp,
                    const ImmutableOptions& ioptions,
@@ -370,15 +371,17 @@ class MemTableIterator : public InternalIterator {
       : bloom_(nullptr),
         prefix_extractor_(mem.prefix_extractor_),
         comparator_(mem.comparator_),
-        valid_(false),
         seqno_to_time_mapping_(seqno_to_time_mapping),
-        arena_mode_(arena != nullptr),
-        value_pinned_(
-            !mem.GetImmutableMemTableOptions()->inplace_update_support),
-        protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
         status_(Status::OK()),
         logger_(mem.moptions_.info_log),
-        ts_sz_(mem.ts_sz_) {
+        ts_sz_(mem.ts_sz_),
+        protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
+        valid_(false),
+        value_pinned_(
+            !mem.GetImmutableMemTableOptions()->inplace_update_support),
+        arena_mode_(arena != nullptr),
+        paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
+        allow_data_in_error(mem.moptions_.allow_data_in_errors) {
     if (use_range_del_table) {
       iter_ = mem.range_del_table_->GetIterator(arena);
     } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
@@ -406,6 +409,7 @@ class MemTableIterator : public InternalIterator {
     } else {
       delete iter_;
     }
+    status_.PermitUncheckedError();
   }
 
 #ifndef NDEBUG
@@ -415,10 +419,16 @@ class MemTableIterator : public InternalIterator {
   PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
 #endif
 
-  bool Valid() const override { return valid_ && status_.ok(); }
+  bool Valid() const override {
+    // If inner iter_ is not valid, then this iter should also not be valid.
+    assert(iter_->Valid() || !(valid_ && status_.ok()));
+    return valid_ && status_.ok();
+  }
+
   void Seek(const Slice& k) override {
     PERF_TIMER_GUARD(seek_on_memtable_time);
     PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+    status_ = Status::OK();
     if (bloom_) {
       // iterator should only use prefix bloom filter
       Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
@@ -433,13 +443,18 @@ class MemTableIterator : public InternalIterator {
         }
       }
     }
-    iter_->Seek(k, nullptr);
+    if (paranoid_memory_checks_) {
+      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
+    } else {
+      iter_->Seek(k, nullptr);
+    }
     valid_ = iter_->Valid();
     VerifyEntryChecksum();
   }
   void SeekForPrev(const Slice& k) override {
     PERF_TIMER_GUARD(seek_on_memtable_time);
     PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+    status_ = Status::OK();
     if (bloom_) {
       Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
       if (prefix_extractor_->InDomain(user_k_without_ts)) {
@@ -453,7 +468,11 @@ class MemTableIterator : public InternalIterator {
         }
       }
     }
-    iter_->Seek(k, nullptr);
+    if (paranoid_memory_checks_) {
+      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
+    } else {
+      iter_->Seek(k, nullptr);
+    }
     valid_ = iter_->Valid();
     VerifyEntryChecksum();
     if (!Valid() && status().ok()) {
@@ -464,11 +483,13 @@ class MemTableIterator : public InternalIterator {
     }
   }
   void SeekToFirst() override {
+    status_ = Status::OK();
     iter_->SeekToFirst();
     valid_ = iter_->Valid();
     VerifyEntryChecksum();
   }
   void SeekToLast() override {
+    status_ = Status::OK();
     iter_->SeekToLast();
     valid_ = iter_->Valid();
     VerifyEntryChecksum();
@@ -476,8 +497,12 @@ class MemTableIterator : public InternalIterator {
   void Next() override {
     PERF_COUNTER_ADD(next_on_memtable_count, 1);
     assert(Valid());
-    iter_->Next();
-    TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
+    if (paranoid_memory_checks_) {
+      status_ = iter_->NextAndValidate(allow_data_in_error);
+    } else {
+      iter_->Next();
+      TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
+    }
     valid_ = iter_->Valid();
     VerifyEntryChecksum();
   }
@@ -494,7 +519,11 @@ class MemTableIterator : public InternalIterator {
   void Prev() override {
     PERF_COUNTER_ADD(prev_on_memtable_count, 1);
     assert(Valid());
-    iter_->Prev();
+    if (paranoid_memory_checks_) {
+      status_ = iter_->PrevAndValidate(allow_data_in_error);
+    } else {
+      iter_->Prev();
+    }
     valid_ = iter_->Valid();
     VerifyEntryChecksum();
   }
@@ -540,15 +569,17 @@ class MemTableIterator : public InternalIterator {
   const SliceTransform* const prefix_extractor_;
   const MemTable::KeyComparator comparator_;
   MemTableRep::Iterator* iter_;
-  bool valid_;
   // The seqno to time mapping is owned by the SuperVersion.
   UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_;
-  bool arena_mode_;
-  bool value_pinned_;
-  uint32_t protection_bytes_per_key_;
   Status status_;
   Logger* logger_;
   size_t ts_sz_;
+  uint32_t protection_bytes_per_key_;
+  bool valid_;
+  bool value_pinned_;
+  bool arena_mode_;
+  const bool paranoid_memory_checks_;
+  const bool allow_data_in_error;
 
   void VerifyEntryChecksum() {
     if (protection_bytes_per_key_ > 0 && Valid()) {
@@ -1355,7 +1386,18 @@ void MemTable::GetFromTable(const LookupKey& key,
   saver.do_merge = do_merge;
   saver.allow_data_in_errors = moptions_.allow_data_in_errors;
   saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
-  table_->Get(key, &saver, SaveValue);
+
+  if (!moptions_.paranoid_memory_checks) {
+    table_->Get(key, &saver, SaveValue);
+  } else {
+    Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
+                                            moptions_.allow_data_in_errors);
+    if (check_s.IsCorruption()) {
+      *(saver.status) = check_s;
+      // Should stop searching the LSM.
+      *(saver.found_final_value) = true;
+    }
+  }
   assert(s->ok() || s->IsMergeInProgress() || *found_final_value);
   *seq = saver.seq;
 }
diff --git a/db/memtable.h b/db/memtable.h
index a2bf354c599..ca0652bc044 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -60,8 +60,9 @@ struct ImmutableMemTableOptions {
   Statistics* statistics;
   MergeOperator* merge_operator;
   Logger* info_log;
-  bool allow_data_in_errors;
   uint32_t protection_bytes_per_key;
+  bool allow_data_in_errors;
+  bool paranoid_memory_checks;
 };
 
 // Batched counters to updated when inserting keys in one write batch.
@@ -266,6 +267,11 @@ class MemTable {
   // If do_merge = false then any Merge Operands encountered for key are simply
   // stored in merge_context.operands_list and never actually merged to get a
   // final value. The raw Merge Operands are eventually returned to the user.
+  // @param value If not null and memtable contains a value for key, `value`
+  // will be set to the result value.
+  // @param column If not null and memtable contains a value/WideColumn for
+  // key, `column` will be set to the result value/WideColumn.
+  // Note: only one of `value` and `column` can be non-nullptr.
   // @param immutable_memtable Whether this memtable is immutable. Used
   // internally by NewRangeTombstoneIterator(). See comment above
   // NewRangeTombstoneIterator() for more detail.
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 08b529d0dae..67f82808fe5 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -274,6 +274,7 @@ DECLARE_bool(verification_only);
 DECLARE_string(last_level_temperature);
 DECLARE_string(default_write_temperature);
 DECLARE_string(default_temperature);
+DECLARE_bool(paranoid_memory_checks);
 
 // Options for transaction dbs.
 // Use TransactionDB (a.k.a. Pessimistic Transaction DB)
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 160095d4769..bb2d9d453ea 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1448,4 +1448,8 @@ DEFINE_uint32(uncache_aggressiveness,
               "obsolete. 0 = disabled, 1 = minimum, 100 = moderate, 10000 = "
               "normal max");
 
+DEFINE_bool(paranoid_memory_checks,
+            ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
+            "Sets CF option paranoid_memory_checks.");
+
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 11998a98daf..b8ab0cc4f58 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4055,6 +4055,7 @@ void InitializeOptionsFromFlags(
   options.memtable_protection_bytes_per_key =
       FLAGS_memtable_protection_bytes_per_key;
   options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
+  options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
 
   // Integrated BlobDB
   options.enable_blob_files = FLAGS_enable_blob_files;
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index cbe1eb52fc0..11f971c2426 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -1090,6 +1090,13 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through the SetOptions() API.
   uint32_t bottommost_file_compaction_delay = 0;
 
+  // Enables additional integrity checks during reads/scans.
+  // Specifically, for skiplist-based memtables, we verify that keys visited
+  // are in order. This is helpful to detect corrupted memtable keys during
+  // reads. Enabling this feature incurs a performance overhead due to an
+  // additional key comparison during memtable lookup.
+  bool paranoid_memory_checks = false;
+
   // Create ColumnFamilyOptions with default values for all fields
   AdvancedColumnFamilyOptions();
   // Create ColumnFamilyOptions from Options
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
index d109a542fe3..fd63f127f46 100644
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -194,6 +194,15 @@ class MemTableRep {
   virtual void Get(const LookupKey& k, void* callback_args,
                    bool (*callback_func)(void* arg, const char* entry));
 
+  // Same as Get() but performs data integrity validation.
+  virtual Status GetAndValidate(const LookupKey& /* k */,
+                                void* /* callback_args */,
+                                bool (* /* callback_func */)(void* arg,
+                                                             const char* entry),
+                                bool /*allow_data_in_error*/) {
+    return Status::NotSupported("GetAndValidate() not implemented.");
+  }
+
   virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
                                          const Slice& /*end_key*/) {
     return 0;
@@ -235,13 +244,38 @@ class MemTableRep {
   // REQUIRES: Valid()
   virtual void Next() = 0;
 
+  // Advances to the next position and performs integrity validations on the
+  // skip list. Iterator becomes invalid and Corruption is returned if a
+  // corruption is found.
+  // REQUIRES: Valid()
+  virtual Status NextAndValidate(bool /* allow_data_in_errors */) {
+    return Status::NotSupported("NextAndValidate() not implemented.");
+  }
+
   // Advances to the previous position.
   // REQUIRES: Valid()
   virtual void Prev() = 0;
 
+  // Advances to the previous position and performs integrity validations on
+  // the skip list. Iterator becomes invalid and Corruption is returned if a
+  // corruption is found.
+  // REQUIRES: Valid()
+  virtual Status PrevAndValidate(bool /* allow_data_in_errors */) {
+    return Status::NotSupported("PrevAndValidate() not implemented.");
+  }
+
   // Advance to the first entry with a key >= target
   virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
 
+  // Seek and perform integrity validations on the skip list.
+ // Iterator becomes invalid and Corruption is returned if a + // corruption is found. + virtual Status SeekAndValidate(const Slice& /* internal_key */, + const char* /* memtable_key */, + bool /* allow_data_in_errors */) { + return Status::NotSupported("SeekAndValidate() not implemented."); + } + // retreat to the first entry with a key <= target virtual void SeekForPrev(const Slice& internal_key, const char* memtable_key) = 0; diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 8e2d548b430..ceaa246ae6d 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -52,6 +52,7 @@ #include "port/likely.h" #include "port/port.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/random.h" @@ -169,13 +170,20 @@ class InlineSkipList { // REQUIRES: Valid() void Next(); + [[nodiscard]] Status NextAndValidate(bool allow_data_in_errors); + // Advances to the previous position. // REQUIRES: Valid() void Prev(); + [[nodiscard]] Status PrevAndValidate(bool allow_data_in_errors); + // Advance to the first entry with a key >= target void Seek(const char* target); + [[nodiscard]] Status SeekAndValidate(const char* target, + bool allow_data_in_errors); + // Retreat to the last entry with a key <= target void SeekForPrev(const char* target); @@ -237,21 +245,20 @@ class InlineSkipList { bool KeyIsAfterNode(const DecodedKey& key, Node* n) const; // Returns the earliest node with a key >= key. - // Return nullptr if there is no such node. - Node* FindGreaterOrEqual(const char* key) const; - - // Return the latest node with a key < key. - // Return head_ if there is no such node. + // Returns nullptr if there is no such node. + // @param out_of_order_node If not null, will validate the order of visited + // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be + // returned and *out_of_order_node will be set to n2. + Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const; + + // Returns the latest node with a key < key. + // Returns head_ if there is no such node. // Fills prev[level] with pointer to previous node at "level" for every // level in [0..max_height_-1], if prev is non-null. - Node* FindLessThan(const char* key, Node** prev = nullptr) const; - - // Return the latest node with a key < key on bottom_level. Start searching - // from root node on the level below top_level. - // Fills prev[level] with pointer to previous node at "level" for every - // level in [bottom_level..top_level-1], if prev is non-null. - Node* FindLessThan(const char* key, Node** prev, Node* root, int top_level, - int bottom_level) const; + // @param out_of_order_node If not null, will validate the order of visited + // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be + // returned and *out_of_order_node will be set to n2. + Node* FindLessThan(const char* key, Node** out_of_order_node) const; // Return the last node in the list. // Return head_ if list is empty. @@ -274,6 +281,8 @@ class InlineSkipList { // lowest_level (inclusive). 
   void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice,
                              int recompute_level);
+
+  static Status Corruption(Node* prev, Node* next, bool allow_data_in_errors);
 };
 
 // Implementation details follow
 
@@ -392,20 +401,68 @@ inline void InlineSkipList<Comparator>::Iterator::Next() {
   node_ = node_->Next(0);
 }
 
+template <class Comparator>
+inline Status InlineSkipList<Comparator>::Iterator::NextAndValidate(
+    bool allow_data_in_errors) {
+  assert(Valid());
+  Node* prev_node = node_;
+  node_ = node_->Next(0);
+  // Verify that keys are increasing.
+  if (prev_node != list_->head_ && node_ != nullptr &&
+      list_->compare_(prev_node->Key(), node_->Key()) >= 0) {
+    Node* node = node_;
+    // invalidates the iterator
+    node_ = nullptr;
+    return Corruption(prev_node, node, allow_data_in_errors);
+  }
+  return Status::OK();
+}
+
 template <class Comparator>
 inline void InlineSkipList<Comparator>::Iterator::Prev() {
   // Instead of using explicit "prev" links, we just search for the
   // last node that falls before key.
   assert(Valid());
-  node_ = list_->FindLessThan(node_->Key());
+  node_ = list_->FindLessThan(node_->Key(), nullptr);
   if (node_ == list_->head_) {
     node_ = nullptr;
   }
 }
 
+template <class Comparator>
+inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
+    const bool allow_data_in_errors) {
+  assert(Valid());
+  // Skip list validation is done in FindLessThan().
+  Node* out_of_order_node = nullptr;
+  node_ = list_->FindLessThan(node_->Key(), &out_of_order_node);
+  if (out_of_order_node) {
+    Node* node = node_;
+    node_ = nullptr;
+    return Corruption(node, out_of_order_node, allow_data_in_errors);
+  }
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+  return Status::OK();
+}
+
 template <class Comparator>
 inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
-  node_ = list_->FindGreaterOrEqual(target);
+  node_ = list_->FindGreaterOrEqual(target, nullptr);
+}
+
+template <class Comparator>
+inline Status InlineSkipList<Comparator>::Iterator::SeekAndValidate(
+    const char* target, const bool allow_data_in_errors) {
+  Node* out_of_order_node = nullptr;
+  node_ = list_->FindGreaterOrEqual(target, &out_of_order_node);
+  if (out_of_order_node) {
+    Node* node = node_;
+    node_ = nullptr;
+    return Corruption(node, out_of_order_node, allow_data_in_errors);
+  }
+  return Status::OK();
 }
 
 template <class Comparator>
@@ -448,6 +505,7 @@ int InlineSkipList<Comparator>::RandomHeight() {
          rnd->Next() < kScaledInverseBranching_) {
     height++;
   }
+  TEST_SYNC_POINT_CALLBACK("InlineSkipList::RandomHeight::height", &height);
  assert(height > 0);
   assert(height <= kMaxHeight_);
   assert(height <= kMaxPossibleHeight);
@@ -472,7 +530,8 @@ bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key,
 
 template <class Comparator>
 typename InlineSkipList<Comparator>::Node*
-InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
+InlineSkipList<Comparator>::FindGreaterOrEqual(
+    const char* key, Node** const out_of_order_node) const {
   // Note: It looks like we could reduce duplication by implementing
   // this function as FindLessThan(key)->Next(0), but we wouldn't be able
   // to exit early on equality and the result wouldn't even be correct.
@@ -486,6 +545,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
     Node* next = x->Next(level);
     if (next != nullptr) {
       PREFETCH(next->Next(level), 0, 1);
+      if (out_of_order_node && x != head_ &&
+          compare_(x->Key(), next->Key()) >= 0) {
+        *out_of_order_node = next;
+        return x;
+      }
     }
     // Make sure the lists are sorted
     assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
@@ -509,18 +573,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
 
 template <class Comparator>
 typename InlineSkipList<Comparator>::Node*
-InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev) const {
-  return FindLessThan(key, prev, head_, GetMaxHeight(), 0);
-}
-
-template <class Comparator>
-typename InlineSkipList<Comparator>::Node*
-InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
-                                         Node* root, int top_level,
-                                         int bottom_level) const {
-  assert(top_level > bottom_level);
-  int level = top_level - 1;
-  Node* x = root;
+InlineSkipList<Comparator>::FindLessThan(const char* key,
+                                         Node** const out_of_order_node) const {
+  int level = GetMaxHeight() - 1;
+  assert(level >= 0);
+  Node* x = head_;
   // KeyIsAfter(key, last_not_after) is definitely false
   Node* last_not_after = nullptr;
   const DecodedKey key_decoded = compare_.decode_key(key);
@@ -529,6 +586,11 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
     Node* next = x->Next(level);
     if (next != nullptr) {
       PREFETCH(next->Next(level), 0, 1);
+      if (out_of_order_node && x != head_ &&
+          compare_(x->Key(), next->Key()) >= 0) {
+        *out_of_order_node = next;
+        return x;
+      }
     }
     assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
     assert(x == head_ || KeyIsAfterNode(key_decoded, x));
@@ -537,10 +599,7 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
       assert(next != nullptr);
       x = next;
     } else {
-      if (prev != nullptr) {
-        prev[level] = x;
-      }
-      if (level == bottom_level) {
+      if (level == 0) {
         return x;
       } else {
         // Switch to next list, reuse KeyIsAfterNode() result
@@ -999,7 +1058,7 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
 
 template <class Comparator>
 bool InlineSkipList<Comparator>::Contains(const char* key) const {
-  Node* x = FindGreaterOrEqual(key);
+  Node* x = FindGreaterOrEqual(key, nullptr);
   if (x != nullptr && Equal(key, x->Key())) {
     return true;
   } else {
@@ -1048,4 +1107,14 @@ void InlineSkipList<Comparator>::TEST_Validate() const {
   }
 }
 
+template <class Comparator>
+Status InlineSkipList<Comparator>::Corruption(Node* prev, Node* next,
+                                              bool allow_data_in_errors) {
+  std::string msg = "Out-of-order keys found in skiplist.";
+  if (allow_data_in_errors) {
+    msg.append(" prev key: " + Slice(prev->Key()).ToString(true));
+    msg.append(" next key: " + Slice(next->Key()).ToString(true));
+  }
+  return Status::Corruption(msg);
+}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc
index e615ef9f68c..3b2f3f4d8d1 100644
--- a/memtable/skiplistrep.cc
+++ b/memtable/skiplistrep.cc
@@ -92,6 +92,20 @@ class SkipListRep : public MemTableRep {
     }
   }
 
+  Status GetAndValidate(const LookupKey& k, void* callback_args,
+                        bool (*callback_func)(void* arg, const char* entry),
+                        bool allow_data_in_errors) override {
+    SkipListRep::Iterator iter(&skip_list_);
+    Slice dummy_slice;
+    Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(),
+                                         allow_data_in_errors);
+    for (; iter.Valid() && status.ok() &&
+           callback_func(callback_args, iter.key());
+         status = iter.NextAndValidate(allow_data_in_errors)) {
+    }
+    return status;
+  }
+
   uint64_t ApproximateNumEntries(const Slice& start_ikey,
                                  const Slice& end_ikey) override {
     std::string tmp;
@@ -181,15 +195,24 @@ class SkipListRep : public MemTableRep {
     // Returns the key at the current position.
     // REQUIRES: Valid()
-    const char* key() const override { return iter_.key(); }
+    const char* key() const override {
+      assert(Valid());
+      return iter_.key();
+    }
 
     // Advances to the next position.
     // REQUIRES: Valid()
-    void Next() override { iter_.Next(); }
+    void Next() override {
+      assert(Valid());
+      iter_.Next();
+    }
 
     // Advances to the previous position.
     // REQUIRES: Valid()
-    void Prev() override { iter_.Prev(); }
+    void Prev() override {
+      assert(Valid());
+      iter_.Prev();
+    }
 
     // Advance to the first entry with a key >= target
     void Seek(const Slice& user_key, const char* memtable_key) override {
@@ -219,6 +242,26 @@ class SkipListRep : public MemTableRep {
     // Final state of iterator is Valid() iff list is not empty.
     void SeekToLast() override { iter_.SeekToLast(); }
 
+    Status NextAndValidate(bool allow_data_in_errors) override {
+      assert(Valid());
+      return iter_.NextAndValidate(allow_data_in_errors);
+    }
+
+    Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
+                           bool allow_data_in_errors) override {
+      if (memtable_key != nullptr) {
+        return iter_.SeekAndValidate(memtable_key, allow_data_in_errors);
+      } else {
+        return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key),
+                                     allow_data_in_errors);
+      }
+    }
+
+    Status PrevAndValidate(bool allow_data_in_error) override {
+      assert(Valid());
+      return iter_.PrevAndValidate(allow_data_in_error);
+    }
+
    protected:
     std::string tmp_;  // For passing to EncodeKey
   };
diff --git a/options/cf_options.cc b/options/cf_options.cc
index cc9e630b9ca..7f2cd031325 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -531,6 +531,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {offsetof(struct MutableCFOptions, block_protection_bytes_per_key),
          OptionType::kUInt8T, OptionVerificationType::kNormal,
          OptionTypeFlags::kMutable}},
+        {"paranoid_memory_checks",
+         {offsetof(struct MutableCFOptions, paranoid_memory_checks),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
        {kOptNameCompOpts,
         OptionTypeInfo::Struct(
             kOptNameCompOpts, &compression_options_type_info,
@@ -1104,6 +1108,8 @@ void MutableCFOptions::Dump(Logger* log) const {
                  ttl);
   ROCKS_LOG_INFO(log, " periodic_compaction_seconds: %" PRIu64,
                  periodic_compaction_seconds);
+  ROCKS_LOG_INFO(log, " paranoid_memory_checks: %d",
+                 paranoid_memory_checks);
   std::string result;
   char buf[10];
   for (const auto m : max_bytes_for_level_multiplier_additional) {
diff --git a/options/cf_options.h b/options/cf_options.h
index 372a0daf544..3a0c3b09a82 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -168,6 +168,7 @@ struct MutableCFOptions {
         memtable_protection_bytes_per_key(
             options.memtable_protection_bytes_per_key),
         block_protection_bytes_per_key(options.block_protection_bytes_per_key),
+        paranoid_memory_checks(options.paranoid_memory_checks),
         sample_for_compression(
             options.sample_for_compression),  // TODO: is 0 fine here?
         compression_per_level(options.compression_per_level),
@@ -317,6 +318,7 @@ struct MutableCFOptions {
   Temperature default_write_temperature;
   uint32_t memtable_protection_bytes_per_key;
   uint8_t block_protection_bytes_per_key;
+  bool paranoid_memory_checks;
   uint64_t sample_for_compression;
   std::vector<CompressionType> compression_per_level;
 
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 5cc13f4fe4b..ec62dd1f5c9 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -213,6 +213,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
       moptions.memtable_protection_bytes_per_key;
   cf_opts->block_protection_bytes_per_key =
       moptions.block_protection_bytes_per_key;
+  cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks;
   cf_opts->bottommost_file_compaction_delay =
       moptions.bottommost_file_compaction_delay;
 
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 31a07373c4f..2bf349b1cbb 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -568,7 +568,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "block_protection_bytes_per_key=1;"
       "memtable_max_range_deletions=999999;"
       "bottommost_file_compaction_delay=7200;"
-      "uncache_aggressiveness=1234;",
+      "uncache_aggressiveness=1234;"
+      "paranoid_memory_checks=1;",
       new_options));
 
   ASSERT_NE(new_options->blob_cache.get(), nullptr);
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index b4554bfacea..d51dbf30ad4 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1280,6 +1280,9 @@ DEFINE_bool(
     auto_readahead_size, false,
     "When set true, RocksDB does auto tuning of readahead size during Scans");
 
+DEFINE_bool(paranoid_memory_checks, false,
+            "Sets CF option paranoid_memory_checks");
+
 static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
     const char* ctype) {
   assert(ctype);
@@ -4739,6 +4742,7 @@ class Benchmark {
           FLAGS_memtable_protection_bytes_per_key;
       options.block_protection_bytes_per_key =
           FLAGS_block_protection_bytes_per_key;
+      options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
     }
 
   void InitializeOptionsGeneral(Options* opts) {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 9d320ad2cf8..c95851310d8 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -340,6 +340,7 @@
     "check_multiget_entity_consistency": lambda: random.choice([0, 0, 0, 1]),
     "use_timed_put_one_in": lambda: random.choice([0] * 7 + [1, 5, 10]),
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
+    "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
 }
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
 # If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR
diff --git a/unreleased_history/new_features/memtable-paranoid-checks.md b/unreleased_history/new_features/memtable-paranoid-checks.md
new file mode 100644
index 00000000000..e2a4510dcfe
--- /dev/null
+++ b/unreleased_history/new_features/memtable-paranoid-checks.md
@@ -0,0 +1 @@
+* Introduce a new mutable CF option `paranoid_memory_checks`. It enables additional validation on data integrity during reads/scans. Currently, skip list based memtables will validate key ordering during lookups and scans.
\ No newline at end of file
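For illustration only (not part of the patch above): a minimal sketch of how the new `paranoid_memory_checks` option introduced by this change might be used from application code. The DB path is a placeholder and error handling is reduced to asserts; since the option is registered as a mutable CF option, it can also be toggled at runtime via `SetOptions()`.

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Opt in to the extra key-ordering validation on memtable reads and scans.
  options.paranoid_memory_checks = true;

  rocksdb::DB* db = nullptr;
  // "/tmp/paranoid_checks_demo" is a placeholder path for this sketch.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/paranoid_checks_demo", &db);
  assert(s.ok());

  // The option is mutable, so it can be turned off (or on) for the default
  // column family without reopening the DB.
  s = db->SetOptions({{"paranoid_memory_checks", "false"}});
  assert(s.ok());

  delete db;
  return 0;
}
```

With the option enabled, a memtable read or scan that encounters out-of-order skip-list keys surfaces a `Status::Corruption` instead of silently returning wrong or missing results, at the cost of one extra key comparison per visited node.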