Add an option to verify memtable key order during reads (#12889)
Summary:
Add a new CF option `paranoid_memory_checks` that enables additional data integrity validations during reads and scans. Currently, the skiplist-based memtable validates the order of keys it visits; further validations can be added in other layers. The option is opt-in due to its performance overhead.
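
As a usage illustration (a minimal sketch, not part of this commit; the DB path and key below are made up), enabling the check is a one-line options change, and callers can watch for a Corruption status on reads:

```
// Sketch: enable the new option and check for corruption on the read path.
// Assumes the option is plumbed through rocksdb::Options as shown in the
// include/rocksdb/advanced_options.h hunk below.
#include <cassert>
#include <string>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.paranoid_memory_checks = true;  // opt-in; extra key comparisons

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/paranoid_demo", &db);
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "some_key", &value);
  if (s.IsCorruption()) {
    // An out-of-order memtable key was detected during the lookup,
    // instead of surfacing later at flush time.
  }
  delete db;
  return 0;
}
```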

The motivation for this feature is services where data correctness is critical and in-memory corruption should be detected early. For a corrupted memtable key, this feature can detect the corruption during reads instead of during flush, where the existing protections apply (OutputValidator, which verifies key order, and per-KV checksums). See internally linked task for more context.

Pull Request resolved: #12889

Test Plan:
* New unit test added for paranoid_memory_checks=true.
* Existing unit tests cover paranoid_memory_checks=false.
* Enabled in stress test.

Performance Benchmark: we check for performance regression in the read path where data is in the memtable only. For each benchmark, the script was run at the same time for main and for this PR:
* Memtable-only randomread ops/sec:
```
(for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --num=250000 --reads=500000  --seed=1723056275 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }';

Main: 608146
PR with paranoid_memory_checks=false: 607727 (-0.07%)
PR with paranoid_memory_checks=true: 521889 (-14.2%)
```

* Memtable-only sequential scan ops/sec:
```
(for I in $(seq 1 50); do ./db_bench --benchmarks=fillseq,readseq[-X10] --write_buffer_size=268435456 --num=1000000  --seed=1723056275 2>&1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }';

Main: 9180077
PR with paranoid_memory_checks=false: 9536241 (+3.8%)
PR with paranoid_memory_checks=true: 7653934 (-16.6%)
```

* Memtable-only reverse scan ops/sec:
```
(for I in $(seq 1 20); do ./db_bench --benchmarks=fillseq,readreverse[-X10] --write_buffer_size=268435456 --num=1000000  --seed=1723056275 2>&1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }';

Main: 1285719
PR with paranoid_memory_checks=false: 1431626 (+11.3%)
PR with paranoid_memory_checks=true: 811031 (-36.9%)
```

The `readrandom` benchmark shows no regression with the feature disabled (-0.07%, within noise). The scanning benchmarks show an improvement with the feature disabled that I can't explain.

Reviewed By: pdillinger

Differential Revision: D60414267

Pulled By: cbi42

fbshipit-source-id: a70b0cbeea131f1a249a5f78f9dc3a62dacfaa91
cbi42 authored and facebook-github-bot committed Aug 19, 2024
1 parent 273b3ea commit defd97b
Showing 17 changed files with 365 additions and 57 deletions.
85 changes: 85 additions & 0 deletions db/db_memtable_test.cc
@@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
}
}

TEST_F(DBMemTableTest, IntegrityChecks) {
// We insert keys key000000, key000001 and key000002 into the skiplist at
// fixed height 1 (the smallest height). Then we corrupt the second key to
// aey000001 to make it sort first. With `paranoid_memory_checks` set to
// true, if the skip list sees key000000 and then aey000001, it reports the
// out-of-order keys with a Corruption status. With `paranoid_memory_checks`
// set to false, reads/scans may return wrong results.
for (bool allow_data_in_error : {false, true}) {
Options options = CurrentOptions();
options.allow_data_in_errors = allow_data_in_error;
options.paranoid_memory_checks = true;
DestroyAndReopen(options);
SyncPoint::GetInstance()->SetCallBack(
"InlineSkipList::RandomHeight::height", [](void* h) {
auto height_ptr = static_cast<int*>(h);
*height_ptr = 1;
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(Put(Key(0), "val0"));
ASSERT_OK(Put(Key(2), "val2"));
// p will point to the buffer for encoded key000001
char* p = nullptr;
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) {
p = const_cast<char*>(static_cast<Slice*>(encoded)->data());
});
ASSERT_OK(Put(Key(1), "val1"));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
ASSERT_TRUE(p);
// Offset 0 is key size, key bytes start at offset 1.
// "key000001 -> aey000001"
p[1] = 'a';

ReadOptions rops;
std::string val;
Status s = db_->Get(rops, Key(1), &val);
ASSERT_TRUE(s.IsCorruption());
std::string key0 = Slice(Key(0)).ToString(true);
ASSERT_EQ(s.ToString().find(key0) != std::string::npos,
allow_data_in_error);
// Without `paranoid_memory_checks`, NotFound will be returned.
// This would fail an assertion in InlineSkipList::FindGreaterOrEqual().
// If we remove the assertion, this passes.
// ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &val).IsNotFound());

std::vector<std::string> vals;
std::vector<Status> statuses = db_->MultiGet(
rops, {db_->DefaultColumnFamily()}, {Key(1)}, &vals, nullptr);
ASSERT_TRUE(statuses[0].IsCorruption());
ASSERT_EQ(statuses[0].ToString().find(key0) != std::string::npos,
allow_data_in_error);

std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
ASSERT_OK(iter->status());
iter->Seek(Key(1));
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);

iter->Seek(Key(0));
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
// Iterating through the skip list at height 1 should catch out-of-order keys
iter->Next();
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
ASSERT_FALSE(iter->Valid());

iter->SeekForPrev(Key(2));
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);

// Internally DB Iter will iterate backwards (call Prev()) after
// SeekToLast() to find the correct internal key with the last user key.
// Prev() will do integrity checks and catch corruption.
iter->SeekToLast();
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
ASSERT_FALSE(iter->Valid());
}
}
} // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
80 changes: 61 additions & 19 deletions db/memtable.cc
@@ -67,9 +67,10 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
statistics(ioptions.stats),
merge_operator(ioptions.merge_operator.get()),
info_log(ioptions.logger),
allow_data_in_errors(ioptions.allow_data_in_errors),
protection_bytes_per_key(
mutable_cf_options.memtable_protection_bytes_per_key) {}
mutable_cf_options.memtable_protection_bytes_per_key),
allow_data_in_errors(ioptions.allow_data_in_errors),
paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}

MemTable::MemTable(const InternalKeyComparator& cmp,
const ImmutableOptions& ioptions,
@@ -370,15 +371,17 @@ class MemTableIterator : public InternalIterator {
: bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_),
comparator_(mem.comparator_),
valid_(false),
seqno_to_time_mapping_(seqno_to_time_mapping),
arena_mode_(arena != nullptr),
value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support),
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
status_(Status::OK()),
logger_(mem.moptions_.info_log),
ts_sz_(mem.ts_sz_) {
ts_sz_(mem.ts_sz_),
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
valid_(false),
value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support),
arena_mode_(arena != nullptr),
paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
allow_data_in_error(mem.moptions_.allow_data_in_errors) {
if (use_range_del_table) {
iter_ = mem.range_del_table_->GetIterator(arena);
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
@@ -406,6 +409,7 @@
} else {
delete iter_;
}
status_.PermitUncheckedError();
}

#ifndef NDEBUG
@@ -415,10 +419,16 @@
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif

bool Valid() const override { return valid_ && status_.ok(); }
bool Valid() const override {
// If inner iter_ is not valid, then this iter should also not be valid.
assert(iter_->Valid() || !(valid_ && status_.ok()));
return valid_ && status_.ok();
}

void Seek(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
status_ = Status::OK();
if (bloom_) {
// iterator should only use prefix bloom filter
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
@@ -433,13 +443,18 @@
}
}
}
iter_->Seek(k, nullptr);
if (paranoid_memory_checks_) {
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
} else {
iter_->Seek(k, nullptr);
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
void SeekForPrev(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
status_ = Status::OK();
if (bloom_) {
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
if (prefix_extractor_->InDomain(user_k_without_ts)) {
@@ -453,7 +468,11 @@
}
}
}
iter_->Seek(k, nullptr);
if (paranoid_memory_checks_) {
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
} else {
iter_->Seek(k, nullptr);
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
if (!Valid() && status().ok()) {
@@ -464,20 +483,26 @@
}
}
void SeekToFirst() override {
status_ = Status::OK();
iter_->SeekToFirst();
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
void SeekToLast() override {
status_ = Status::OK();
iter_->SeekToLast();
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
void Next() override {
PERF_COUNTER_ADD(next_on_memtable_count, 1);
assert(Valid());
iter_->Next();
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
if (paranoid_memory_checks_) {
status_ = iter_->NextAndValidate(allow_data_in_error);
} else {
iter_->Next();
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
@@ -494,7 +519,11 @@
void Prev() override {
PERF_COUNTER_ADD(prev_on_memtable_count, 1);
assert(Valid());
iter_->Prev();
if (paranoid_memory_checks_) {
status_ = iter_->PrevAndValidate(allow_data_in_error);
} else {
iter_->Prev();
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
@@ -540,15 +569,17 @@
const SliceTransform* const prefix_extractor_;
const MemTable::KeyComparator comparator_;
MemTableRep::Iterator* iter_;
bool valid_;
// The seqno to time mapping is owned by the SuperVersion.
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_;
bool arena_mode_;
bool value_pinned_;
uint32_t protection_bytes_per_key_;
Status status_;
Logger* logger_;
size_t ts_sz_;
uint32_t protection_bytes_per_key_;
bool valid_;
bool value_pinned_;
bool arena_mode_;
const bool paranoid_memory_checks_;
const bool allow_data_in_error;

void VerifyEntryChecksum() {
if (protection_bytes_per_key_ > 0 && Valid()) {
@@ -1355,7 +1386,18 @@ void MemTable::GetFromTable(const LookupKey& key,
saver.do_merge = do_merge;
saver.allow_data_in_errors = moptions_.allow_data_in_errors;
saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
table_->Get(key, &saver, SaveValue);

if (!moptions_.paranoid_memory_checks) {
table_->Get(key, &saver, SaveValue);
} else {
Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
moptions_.allow_data_in_errors);
if (check_s.IsCorruption()) {
*(saver.status) = check_s;
// Should stop searching the LSM.
*(saver.found_final_value) = true;
}
}
assert(s->ok() || s->IsMergeInProgress() || *found_final_value);
*seq = saver.seq;
}
8 changes: 7 additions & 1 deletion db/memtable.h
@@ -60,8 +60,9 @@ struct ImmutableMemTableOptions {
Statistics* statistics;
MergeOperator* merge_operator;
Logger* info_log;
bool allow_data_in_errors;
uint32_t protection_bytes_per_key;
bool allow_data_in_errors;
bool paranoid_memory_checks;
};

// Batched counters to updated when inserting keys in one write batch.
@@ -266,6 +267,11 @@ class MemTable {
// If do_merge = false then any Merge Operands encountered for key are simply
// stored in merge_context.operands_list and never actually merged to get a
// final value. The raw Merge Operands are eventually returned to the user.
// @param value If not null and memtable contains a value for key, `value`
// will be set to the result value.
// @param column If not null and memtable contains a value/WideColumn for key,
// `column` will be set to the result value/WideColumn.
// Note: only one of `value` and `column` can be non-nullptr.
// @param immutable_memtable Whether this memtable is immutable. Used
// internally by NewRangeTombstoneIterator(). See comment above
// NewRangeTombstoneIterator() for more detail.
1 change: 1 addition & 0 deletions db_stress_tool/db_stress_common.h
@@ -274,6 +274,7 @@ DECLARE_bool(verification_only);
DECLARE_string(last_level_temperature);
DECLARE_string(default_write_temperature);
DECLARE_string(default_temperature);
DECLARE_bool(paranoid_memory_checks);

// Options for transaction dbs.
// Use TransactionDB (a.k.a. Pessimistic Transaction DB)
4 changes: 4 additions & 0 deletions db_stress_tool/db_stress_gflags.cc
@@ -1448,4 +1448,8 @@ DEFINE_uint32(uncache_aggressiveness,
"obsolete. 0 = disabled, 1 = minimum, 100 = moderate, 10000 = "
"normal max");

DEFINE_bool(paranoid_memory_checks,
ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
"Sets CF option paranoid_memory_checks.");

#endif // GFLAGS
1 change: 1 addition & 0 deletions db_stress_tool/db_stress_test_base.cc
@@ -4055,6 +4055,7 @@ void InitializeOptionsFromFlags(
options.memtable_protection_bytes_per_key =
FLAGS_memtable_protection_bytes_per_key;
options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;

// Integrated BlobDB
options.enable_blob_files = FLAGS_enable_blob_files;
7 changes: 7 additions & 0 deletions include/rocksdb/advanced_options.h
@@ -1090,6 +1090,13 @@ struct AdvancedColumnFamilyOptions {
// Dynamically changeable through the SetOptions() API.
uint32_t bottommost_file_compaction_delay = 0;

// Enables additional integrity checks during reads/scans.
// Specifically, for skiplist-based memtables, we verify that keys visited
// are in order. This is helpful to detect corrupted memtable keys during
// reads. Enabling this feature incurs a performance overhead due to an
// additional key comparison during memtable lookup.
bool paranoid_memory_checks = false;

// Create ColumnFamilyOptions with default values for all fields
AdvancedColumnFamilyOptions();
// Create ColumnFamilyOptions from Options
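
Since the option is read from `mutable_cf_options` (see `ImmutableMemTableOptions` in db/memtable.cc above), it should also be adjustable at runtime. A hedged sketch, assuming the "paranoid_memory_checks" option string is registered with the mutable options framework:

```
// Sketch: toggle the check on a running DB via SetOptions(). Whether the
// string name is registered as a mutable CF option is an assumption here,
// suggested by the option living in MutableCFOptions.
rocksdb::Status s = db->SetOptions(db->DefaultColumnFamily(),
                                   {{"paranoid_memory_checks", "true"}});
```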
34 changes: 34 additions & 0 deletions include/rocksdb/memtablerep.h
@@ -194,6 +194,15 @@ class MemTableRep {
virtual void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry));

// Same as Get() but performs data integrity validation.
virtual Status GetAndValidate(const LookupKey& /* k */,
void* /* callback_args */,
bool (* /* callback_func */)(void* arg,
const char* entry),
bool /*allow_data_in_error*/) {
return Status::NotSupported("GetAndValidate() not implemented.");
}

virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
const Slice& /*end_key*/) {
return 0;
@@ -235,13 +244,38 @@
// REQUIRES: Valid()
virtual void Next() = 0;

// Advances to the next position and performs integrity validations on the
// skip list. Iterator becomes invalid and Corruption is returned if a
// corruption is found.
// REQUIRES: Valid()
virtual Status NextAndValidate(bool /* allow_data_in_errors */) {
return Status::NotSupported("NextAndValidate() not implemented.");
}

// Advances to the previous position.
// REQUIRES: Valid()
virtual void Prev() = 0;

// Advances to the previous position and performs integrity validations on
// the skip list. Iterator becomes invalid and Corruption is returned if a
// corruption is found.
// REQUIRES: Valid()
virtual Status PrevAndValidate(bool /* allow_data_in_errors */) {
return Status::NotSupported("PrevAndValidate() not implemented.");
}

// Advance to the first entry with a key >= target
virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;

// Seek and perform integrity validations on the skip list.
// Iterator becomes invalid and Corruption is returned if a
// corruption is found.
virtual Status SeekAndValidate(const Slice& /* internal_key */,
const char* /* memtable_key */,
bool /* allow_data_in_errors */) {
return Status::NotSupported("SeekAndValidate() not implemented.");
}

// retreat to the first entry with a key <= target
virtual void SeekForPrev(const Slice& internal_key,
const char* memtable_key) = 0;
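
Custom memtable reps that do not override these new virtuals inherit the `Status::NotSupported()` defaults above. A hedged, caller-side sketch (the names `rep`, `lookup_key`, `saver`, and `SaveValue` mirror the `MemTable::GetFromTable()` hunk earlier; the fallback logic is illustrative, not this commit's behavior, since `GetFromTable()` only calls `GetAndValidate()` when the option is on):

```
// Sketch: probe for validation support, then fall back to the plain path.
rocksdb::Status s = rep->GetAndValidate(lookup_key, &saver, SaveValue,
                                        /*allow_data_in_error=*/true);
if (s.IsNotSupported()) {
  rep->Get(lookup_key, &saver, SaveValue);  // unvalidated lookup
}
```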