From 42019712a1b29bc78a29ba6e62eb2f3c573dd5d4 Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Thu, 31 Oct 2024 15:22:16 -0400 Subject: [PATCH] db: verify ingest sst bounds are suffixless or adjacent We now allow sstables to have range key bounds that have suffixes in them, during ingestion. This is to allow for Cockroach-side defragmentation of range keys. This change ensures that such suffixed range keys are adjacent and defrag cleanly with rangekeys in other ingested sstables, to not cause panics later on in the read path. Fixes #3829. --- ingest.go | 88 +++++++++++++++++------- internal/base/comparer.go | 7 ++ iterator_histories_test.go | 10 +++ open.go | 9 ++- testdata/iter_histories/prefix_iteration | 17 +++-- 5 files changed, 102 insertions(+), 29 deletions(-) diff --git a/ingest.go b/ingest.go index 3fe0aae727..63ada836d8 100644 --- a/ingest.go +++ b/ingest.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/cache" "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/overlap" "github.com/cockroachdb/pebble/internal/sstableinternal" @@ -241,6 +242,10 @@ func ingestLoad1External( // ingestLoad1 creates the FileMetadata for one file. This file will be owned // by this store. +// +// lastRangeKey is the last range key from the previous file. It is used to +// ensure that the range keys defragment cleanly across files. These checks +// are disabled if disableRangeKeyChecks is true. func ingestLoad1( ctx context.Context, opts *Options, @@ -248,7 +253,9 @@ func ingestLoad1( readable objstorage.Readable, cacheID cache.ID, fileNum base.FileNum, -) (*fileMetadata, error) { + lastRangeKey keyspan.Span, + disableRangeKeyChecks bool, +) (*fileMetadata, keyspan.Span, error) { o := opts.MakeReaderOptions() o.SetInternalCacheOpts(sstableinternal.CacheOptions{ Cache: opts.Cache, @@ -257,24 +264,24 @@ func ingestLoad1( }) r, err := sstable.NewReader(ctx, readable, o) if err != nil { - return nil, err + return nil, keyspan.Span{}, err } defer r.Close() // Avoid ingesting tables with format versions this DB doesn't support. tf, err := r.TableFormat() if err != nil { - return nil, err + return nil, keyspan.Span{}, err } if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() { - return nil, errors.Newf( + return nil, keyspan.Span{}, errors.Newf( "pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)", tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(), ) } if tf.BlockColumnar() { if _, ok := opts.KeySchemas[r.Properties.KeySchemaName]; !ok { - return nil, errors.Newf( + return nil, keyspan.Span{}, errors.Newf( "pebble: table uses key schema %q unknown to the database", r.Properties.KeySchemaName) } @@ -300,52 +307,52 @@ func ingestLoad1( { iter, err := r.NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) if err != nil { - return nil, err + return nil, keyspan.Span{}, err } defer iter.Close() var smallest InternalKey if kv := iter.First(); kv != nil { if err := ingestValidateKey(opts, &kv.K); err != nil { - return nil, err + return nil, keyspan.Span{}, err } smallest = kv.K.Clone() } if err := iter.Error(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } if kv := iter.Last(); kv != nil { if err := ingestValidateKey(opts, &kv.K); err != nil { - return nil, err + return nil, keyspan.Span{}, err } meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, kv.K.Clone()) } if err := iter.Error(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } } iter, err := r.NewRawRangeDelIter(ctx, sstable.NoFragmentTransforms) if err != nil { - return nil, err + return nil, keyspan.Span{}, err } if iter != nil { defer iter.Close() var smallest InternalKey if s, err := iter.First(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } else if s != nil { key := s.SmallestKey() if err := ingestValidateKey(opts, &key); err != nil { - return nil, err + return nil, keyspan.Span{}, err } smallest = key.Clone() } if s, err := iter.Last(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } else if s != nil { k := s.SmallestKey() if err := ingestValidateKey(opts, &k); err != nil { - return nil, err + return nil, keyspan.Span{}, err } largest := s.LargestKey().Clone() meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest) @@ -356,45 +363,64 @@ func ingestLoad1( { iter, err := r.NewRawRangeKeyIter(ctx, sstable.NoFragmentTransforms) if err != nil { - return nil, err + return nil, keyspan.Span{}, err } if iter != nil { defer iter.Close() var smallest InternalKey if s, err := iter.First(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } else if s != nil { key := s.SmallestKey() if err := ingestValidateKey(opts, &key); err != nil { - return nil, err + return nil, keyspan.Span{}, err } smallest = key.Clone() + // Range keys need some additional validation as we need to ensure they + // defragment cleanly with the lastRangeKey from the previous file. + if !disableRangeKeyChecks { + if lastRangeKey.Valid() { + if opts.Comparer.Split.HasSuffix(lastRangeKey.End) && (!opts.Comparer.Equal(lastRangeKey.End, s.Start) || + !keyspan.DefragmentInternal.ShouldDefragment(opts.Comparer.CompareRangeSuffixes, &lastRangeKey, s)) { + // The last range key has a suffix, and it doesn't defragment cleanly with this range key. + return nil, keyspan.Span{}, errors.AssertionFailedf("pebble: ingest sstable has suffixed range key that won't defragment with previous sstable: %s", + opts.Comparer.FormatKey(lastRangeKey.End)) + } + } else if opts.Comparer.Split.HasSuffix(s.Start) { + return nil, keyspan.Span{}, errors.Newf("pebble: ingest sstable has suffixed range key start that won't defragment: %s", + opts.Comparer.FormatKey(s.Start)) + } + } } + lastRangeKey = keyspan.Span{} if s, err := iter.Last(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } else if s != nil { k := s.SmallestKey() if err := ingestValidateKey(opts, &k); err != nil { - return nil, err + return nil, keyspan.Span{}, err } // As range keys are fragmented, the end key of the last range key in // the table provides the upper bound for the table. largest := s.LargestKey().Clone() meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest) + lastRangeKey = s.Clone() } + } else { + lastRangeKey = keyspan.Span{} } } if !meta.HasPointKeys && !meta.HasRangeKeys { - return nil, nil + return nil, keyspan.Span{}, nil } // Sanity check that the various bounds on the file were set consistently. if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { - return nil, err + return nil, keyspan.Span{}, err } - return meta, nil + return meta, lastRangeKey, nil } type ingestLoadResult struct { @@ -445,6 +471,15 @@ func ingestLoad( var result ingestLoadResult result.local = make([]ingestLocalMeta, 0, len(paths)) + var lastRangeKey keyspan.Span + // NB: we disable range key boundary assertions if we have shared or external files + // present in this ingestion. This is because a suffixed range key in a local file + // can possibly defragment with a suffixed range key in a shared or external file. + // We also disable range key boundary assertions if we have CreateOnShared set to + // true, as that means we could have suffixed RangeKeyDels or Unsets in the local + // files that won't ever be surfaced, even if there are no shared or external files + // in the ingestion. + disableRangeKeyChecks := len(shared) > 0 || len(external) > 0 || opts.Experimental.CreateOnShared != remote.CreateOnSharedNone for i := range paths { f, err := opts.FS.Open(paths[i]) if err != nil { @@ -455,7 +490,8 @@ func ingestLoad( if err != nil { return ingestLoadResult{}, err } - m, err := ingestLoad1(ctx, opts, fmv, readable, cacheID, localFileNums[i]) + var m *fileMetadata + m, lastRangeKey, err = ingestLoad1(ctx, opts, fmv, readable, cacheID, localFileNums[i], lastRangeKey, disableRangeKeyChecks) if err != nil { return ingestLoadResult{}, err } @@ -466,6 +502,10 @@ func ingestLoad( }) } } + if !disableRangeKeyChecks && lastRangeKey.Valid() && opts.Comparer.Split.HasSuffix(lastRangeKey.End) { + return ingestLoadResult{}, errors.AssertionFailedf("pebble: last ingest sstable has suffixed range key end %s", + opts.Comparer.FormatKey(lastRangeKey.End)) + } // Sort the shared files according to level. sort.Sort(sharedByLevel(shared)) diff --git a/internal/base/comparer.go b/internal/base/comparer.go index 78a7aadd37..c734052de0 100644 --- a/internal/base/comparer.go +++ b/internal/base/comparer.go @@ -186,6 +186,13 @@ func (s Split) Prefix(k []byte) []byte { return k[:i:i] } +// HasSuffix returns true if the key k has a suffix remaining after +// Split is called on it. For keys where the entirety of the key is +// returned by Split, HasSuffix will return false. +func (s Split) HasSuffix(k []byte) bool { + return s(k) < len(k) +} + // DefaultSplit is a trivial implementation of Split which always returns the // full key. var DefaultSplit Split = func(key []byte) int { return len(key) } diff --git a/iterator_histories_test.go b/iterator_histories_test.go index e22fa18e0b..6b72c1f13d 100644 --- a/iterator_histories_test.go +++ b/iterator_histories_test.go @@ -221,6 +221,16 @@ func TestIterHistories(t *testing.T) { } } return buf.String() + case "build": + if err := runBuildCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "" + case "ingest-existing": + if err := runIngestCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "" case "ingest": if err := runBuildCmd(td, d, d.opts.FS); err != nil { return err.Error() diff --git a/open.go b/open.go index 2610e3451f..94ac3e78ba 100644 --- a/open.go +++ b/open.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/pebble/internal/cache" "github.com/cockroachdb/pebble/internal/constants" "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/manual" "github.com/cockroachdb/pebble/objstorage" @@ -807,17 +808,23 @@ func (d *DB) replayIngestedFlushable( } meta := make([]*fileMetadata, len(fileNums)) + var lastRangeKey keyspan.Span for i, n := range fileNums { readable, err := d.objProvider.OpenForReading(context.TODO(), fileTypeTable, n, objstorage.OpenOptions{MustExist: true}) if err != nil { return nil, errors.Wrap(err, "pebble: error when opening flushable ingest files") } // NB: ingestLoad1 will close readable. - meta[i], err = ingestLoad1(context.TODO(), d.opts, d.FormatMajorVersion(), readable, d.cacheID, base.PhysicalTableFileNum(n)) + meta[i], lastRangeKey, err = ingestLoad1(context.TODO(), d.opts, d.FormatMajorVersion(), + readable, d.cacheID, base.PhysicalTableFileNum(n), lastRangeKey, false /* disableRangeKeyChecks */) if err != nil { return nil, errors.Wrap(err, "pebble: error when loading flushable ingest files") } } + if lastRangeKey.Valid() && d.opts.Comparer.Split.HasSuffix(lastRangeKey.End) { + return nil, errors.AssertionFailedf("pebble: last ingest sstable has suffixed range key end %s", + d.opts.Comparer.FormatKey(lastRangeKey.End)) + } numFiles := len(meta) if exciseSpan.Valid() { diff --git a/testdata/iter_histories/prefix_iteration b/testdata/iter_histories/prefix_iteration index a5484b51e4..728b504ca8 100644 --- a/testdata/iter_histories/prefix_iteration +++ b/testdata/iter_histories/prefix_iteration @@ -140,8 +140,14 @@ ingest ext1 range-key-set a c@8 @1 bar set c@9 c@9 ---- +pebble: last ingest sstable has suffixed range key end c@8 -ingest ext2 +build ext1 +range-key-set a c@8 @1 bar +set c@9 c@9 +---- + +build ext2 range-key-set c@8 e @1 bar set c@8 c@8 set c@7 c@7 @@ -152,6 +158,9 @@ set c@3 c@3 set c@2 c@2 ---- +ingest-existing ext1 ext2 +---- + ingest ext2 range-key-set y z @1 foo set z z @@ -160,9 +169,9 @@ set z z lsm ---- L6: - 000004:[a#10,RANGEKEYSET-c@8#inf,RANGEKEYSET] - 000005:[c@8#11,RANGEKEYSET-e#inf,RANGEKEYSET] - 000006:[y#12,RANGEKEYSET-z#12,SET] + 000005:[a#10,RANGEKEYSET-c@8#inf,RANGEKEYSET] + 000006:[c@8#11,RANGEKEYSET-e#inf,RANGEKEYSET] + 000007:[y#12,RANGEKEYSET-z#12,SET] # The first seek-prefix-ge y@1 converts the iterator from lazy combined iterator