Skip to content

Commit

Permalink
db: avoid loading later files for a level in SeekPrefixGE
Browse files Browse the repository at this point in the history
Don't load arbitrarily later files during levelIter.SeekPrefixGE. This avoids
unnecessary IO and CPU overhead.

Close #3575.
  • Loading branch information
jbowens authored and itsbilal committed Nov 7, 2024
1 parent 7449865 commit 19b47dc
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 14 deletions.
4 changes: 4 additions & 0 deletions internal/base/comparer.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,10 @@ var DefaultComparer = &Comparer{
return append(dst, a...)
},

Split: func(a []byte) int {
return len(a)
},

ImmediateSuccessor: func(dst, a []byte) (ret []byte) {
return append(append(dst, a...), 0x00)
},
Expand Down
39 changes: 39 additions & 0 deletions level_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ type levelIter struct {
// recent call to SetBounds.
lower []byte
upper []byte
// prefix holds the iteration prefix when the most recent absolute
// positioning method was a SeekPrefixGE.
prefix []byte
// The iterator options for the currently open table. If
// tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary
// does not lie within the table bounds.
Expand Down Expand Up @@ -267,6 +270,7 @@ func (l *levelIter) init(
l.err = nil
l.level = level
l.logger = opts.getLogger()
l.prefix = nil
l.lower = opts.LowerBound
l.upper = opts.UpperBound
l.tableOpts.TableFilter = opts.TableFilter
Expand Down Expand Up @@ -672,6 +676,18 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato
file = l.files.Prev()
continue
}
// If we're in prefix iteration, it's possible this file's smallest
// boundary is large enough to prove the file cannot possibly contain
// any keys within the iteration prefix. Loading the next file is
// unnecessary. This has been observed in practice on slow shared
// storage. See #3575.
if l.prefix != nil && l.cmp(file.SmallestPointKey.UserKey[:l.split(file.SmallestPointKey.UserKey)], l.prefix) > 0 {
// Note that because l.iter is nil, a subsequent call to
// SeekPrefixGE with TrySeekUsingNext()=true will load the file
// (returning newFileLoaded) and disable TrySeekUsingNext before
// performing a seek in the file.
return noFileLoaded
}

var rangeDelIter keyspan.FragmentIterator
var iter internalIterator
Expand Down Expand Up @@ -730,6 +746,7 @@ func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, ba
l.boundaryContext.isSyntheticIterBoundsKey = false
l.boundaryContext.isIgnorableBoundaryKey = false
}
l.prefix = nil
// NB: the top-level Iterator has already adjusted key based on
// IterOptions.LowerBound.
loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1)
Expand All @@ -755,6 +772,7 @@ func (l *levelIter) SeekPrefixGE(
l.boundaryContext.isSyntheticIterBoundsKey = false
l.boundaryContext.isIgnorableBoundaryKey = false
}
l.prefix = prefix

// NB: the top-level Iterator has already adjusted key based on
// IterOptions.LowerBound.
Expand Down Expand Up @@ -826,6 +844,7 @@ func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, ba
l.boundaryContext.isSyntheticIterBoundsKey = false
l.boundaryContext.isIgnorableBoundaryKey = false
}
l.prefix = nil

// NB: the top-level Iterator has already adjusted key based on
// IterOptions.UpperBound.
Expand All @@ -844,6 +863,7 @@ func (l *levelIter) First() (*InternalKey, base.LazyValue) {
l.boundaryContext.isSyntheticIterBoundsKey = false
l.boundaryContext.isIgnorableBoundaryKey = false
}
l.prefix = nil

// NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is
// set.
Expand All @@ -862,6 +882,7 @@ func (l *levelIter) Last() (*InternalKey, base.LazyValue) {
l.boundaryContext.isSyntheticIterBoundsKey = false
l.boundaryContext.isIgnorableBoundaryKey = false
}
l.prefix = nil

// NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is
// set.
Expand Down Expand Up @@ -1090,6 +1111,24 @@ func (l *levelIter) skipEmptyFileForward() (*InternalKey, base.LazyValue) {
return l.largestBoundary, base.LazyValue{}
}
}
// If the iterator is in prefix iteration mode, it's possible that we
// are here because bloom filter matching failed. In that case it is
// likely that all keys matching the prefix are wholly within the
// current file and cannot be in a subsequent file. In that case we
// don't want to go to the next file, since loading and seeking in there
// has some cost.
//
// This is not just an optimization. We must not advance to the next
// file if the current file might possibly contain keys relevant to any
// prefix greater than our current iteration prefix. If we did, a
// subsequent SeekPrefixGE with TrySeekUsingNext could mistakenly skip
// the file's relevant keys.
if l.prefix != nil {
fileLargestPrefix := l.iterFile.LargestPointKey.UserKey[:l.split(l.iterFile.LargestPointKey.UserKey)]
if l.cmp(fileLargestPrefix, l.prefix) > 0 {
return nil, base.LazyValue{}
}
}

// Current file was exhausted. Move to the next file.
if l.loadFile(l.files.Next(), +1) == noFileLoaded {
Expand Down
6 changes: 6 additions & 0 deletions sstable/table_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,12 @@ func build(
}
tmpFileCount++

if comparer == nil {
cmp := *base.DefaultComparer
// runTestFixtures relies on split not being defined.
cmp.Split = nil
comparer = &cmp
}
writerOpts := WriterOptions{
BlockSize: blockSize,
Comparer: comparer,
Expand Down
51 changes: 50 additions & 1 deletion testdata/iter_histories/prefix_iteration
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ c@10: (., [c-"c\x00") @1=bar UPDATED)
# b. In L6, the level iterator seeks to 000004 which contains a key with
# the prefix, returning 'b@4'.
# 2. Next():
# a. Next advances the the L6 iterator to file 000005. This file contains a
# a. Next advances the L6 iterator to file 000005. This file contains a
# range key [e,f)@1=bar, which updates the lazy-combined iterator's
# state, recording the earliest observed range key as 'e'. The L6 level
# iterator then returns the file single point key 'c'.
Expand Down Expand Up @@ -297,6 +297,55 @@ next
b@4: (b@4, .)
.

# Regression test for #3610.
#
# Similar to the above case, this test consists of two SeekPrefixGEs for
# ascending keys, resulting in TrySeekUsingNext()=true for the second seek.
# Previously, during the first SeekPrefixGE the mergingIter could Next the
# levelIter beyond the file containing point keys relevant to both seeks.

reset bloom-bits-per-key=100
----

define bloom-bits-per-key=100
L4
[email protected]:b@0
L5
[email protected]:b@1
[email protected]:c@3
----
4:
000004:[b@0#10,SET-b@0#10,SET]
5:
000005:[b@8#3,RANGEDEL-c@3#0,SET]

combined-iter
seek-prefix-ge b@10
seek-prefix-ge c@10
----
b@0: (b@0, .)
c@3: (c@3, .)

# Test a seek for a prefix that falls entirely in the gap between file
# boundaries. The iterator stats should indicate that no blocks are loaded.

define bloom-bits-per-key=100
L4
[email protected]:b@0
L4
[email protected]:d@3
----
4:
000004:[b@0#10,SET-b@0#10,SET]
000005:[d@3#10,SET-d@3#10,SET]

combined-iter
seek-prefix-ge c@10
stats
----
.
stats: seeked 1 times (1 internal); stepped 0 times (0 internal)

# Regression test for #2151.
#
# This test consists of two SeekPrefixGEs for ascending keys, which results in
Expand Down
14 changes: 7 additions & 7 deletions testdata/iterator_seek_opt
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ seek-prefix-ge d
d: (2, .)
stats: seeked 22 times (21 internal); stepped 2 times (1 fwd/1 rev, internal: 1 fwd/3 rev)
SeekGEs with trySeekUsingNext: 8
SeekPrefixGEs with trySeekUsingNext: 6
SeekPrefixGEs with trySeekUsingNext: 5

# Shifting bounds, with non-overlapping and monotonic bounds, but using
# SetOptions. A set-options sits between every two seeks. We don't call
Expand All @@ -260,7 +260,7 @@ seek-ge b
b: (1, .)
stats: seeked 23 times (22 internal); stepped 2 times (1 fwd/1 rev, internal: 1 fwd/3 rev)
SeekGEs with trySeekUsingNext: 8
SeekPrefixGEs with trySeekUsingNext: 6
SeekPrefixGEs with trySeekUsingNext: 5

iter
set-options lower=c upper=e
Expand All @@ -270,7 +270,7 @@ seek-ge c
c: (1, .)
stats: seeked 24 times (23 internal); stepped 2 times (1 fwd/1 rev, internal: 1 fwd/3 rev)
SeekGEs with trySeekUsingNext: 8
SeekPrefixGEs with trySeekUsingNext: 6
SeekPrefixGEs with trySeekUsingNext: 5

# NB: Equal bounds.

Expand All @@ -282,7 +282,7 @@ seek-ge d
d: (2, .)
stats: seeked 25 times (24 internal); stepped 2 times (1 fwd/1 rev, internal: 1 fwd/3 rev)
SeekGEs with trySeekUsingNext: 10
SeekPrefixGEs with trySeekUsingNext: 6
SeekPrefixGEs with trySeekUsingNext: 5

iter
set-options lower=a upper=c
Expand All @@ -292,7 +292,7 @@ seek-prefix-ge b
b: (1, .)
stats: seeked 26 times (25 internal); stepped 2 times (1 fwd/1 rev, internal: 1 fwd/3 rev)
SeekGEs with trySeekUsingNext: 10
SeekPrefixGEs with trySeekUsingNext: 6
SeekPrefixGEs with trySeekUsingNext: 5

iter
set-options lower=c upper=e
Expand All @@ -302,7 +302,7 @@ seek-prefix-ge c
c: (1, .)
stats: seeked 27 times (26 internal); stepped 2 times (1 fwd/1 rev, internal: 1 fwd/3 rev)
SeekGEs with trySeekUsingNext: 10
SeekPrefixGEs with trySeekUsingNext: 6
SeekPrefixGEs with trySeekUsingNext: 5

# NB: Equal bounds.

Expand All @@ -314,4 +314,4 @@ seek-prefix-ge d
d: (2, .)
stats: seeked 28 times (27 internal); stepped 2 times (1 fwd/1 rev, internal: 1 fwd/3 rev)
SeekGEs with trySeekUsingNext: 10
SeekPrefixGEs with trySeekUsingNext: 8
SeekPrefixGEs with trySeekUsingNext: 6
12 changes: 6 additions & 6 deletions testdata/level_iter
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ next
next
----
d#4,1:4
dd#5,1:5
.
.

iter
Expand All @@ -141,9 +141,9 @@ prev
prev
----
d#4,1:4
dd#5,1:5
d#4,1:4
c#3,1:3
.
.
.

iter
seek-prefix-ge d
Expand Down Expand Up @@ -387,14 +387,14 @@ next
next
----
d#4,1:4
dd#5,1:5
.
.

iter lower=dd
seek-prefix-ge d
next
----
dd#5,1:5
.
.

iter lower=d
Expand Down

0 comments on commit 19b47dc

Please sign in to comment.