diff --git a/level_checker.go b/level_checker.go index 96488f32200..716d6f4727f 100644 --- a/level_checker.go +++ b/level_checker.go @@ -131,7 +131,8 @@ func (m *simpleMergingIter) step() bool { item := &m.heap.items[0] l := &m.levels[item.index] // Sentinels are not relevant for this point checking. - if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { + if !l.isIgnorableBoundaryKey && !item.key.IsExclusiveSentinel() && + item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { m.numPoints++ keyChanged := m.heap.cmp(item.key.UserKey, m.lastKey.UserKey) != 0 if !keyChanged { @@ -223,46 +224,49 @@ func (m *simpleMergingIter) step() bool { m.lastIterMsg = l.iter.String() // Step to the next point. - if l.iterKey, l.iterValue = l.iter.Next(); l.iterKey != nil { - // Check point keys in an sstable are ordered. Although not required, we check - // for memtables as well. A subtle check here is that successive sstables of - // L1 and higher levels are ordered. This happens when levelIter moves to the - // next sstable in the level, in which case item.key is previous sstable's - // last point key. - if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 { - m.err = errors.Errorf("out of order keys %s >= %s in %s", - item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter) - return false + l.iterKey, l.iterValue = l.iter.Next() + if !l.isIgnorableBoundaryKey { + if l.iterKey != nil { + // Check point keys in an sstable are ordered. Although not required, we check + // for memtables as well. A subtle check here is that successive sstables of + // L1 and higher levels are ordered. This happens when levelIter moves to the + // next sstable in the level, in which case item.key is previous sstable's + // last point key. + if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 { + m.err = errors.Errorf("out of order keys %s >= %s in %s", + item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter) + return false + } + item.key.Trailer = l.iterKey.Trailer + item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...) + item.value = l.iterValue + if m.heap.len() > 1 { + m.heap.fix(0) + } + } else { + m.err = l.iter.Close() + l.iter = nil + m.heap.pop() } - item.key.Trailer = l.iterKey.Trailer - item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...) - item.value = l.iterValue - if m.heap.len() > 1 { - m.heap.fix(0) + if m.err != nil { + return false } - } else { - m.err = l.iter.Close() - l.iter = nil - m.heap.pop() - } - if m.err != nil { - return false - } - if m.heap.len() == 0 { - // Last record was a MERGE record. - if m.valueMerger != nil { - var closer io.Closer - _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) - if m.err == nil && closer != nil { - m.err = closer.Close() - } - if m.err != nil { - m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", - item.key.Pretty(m.formatKey), m.lastIterMsg) + if m.heap.len() == 0 { + // Last record was a MERGE record. + if m.valueMerger != nil { + var closer io.Closer + _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) + if m.err == nil && closer != nil { + m.err = closer.Close() + } + if m.err != nil { + m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", + item.key.Pretty(m.formatKey), m.lastIterMsg) + } + m.valueMerger = nil } - m.valueMerger = nil + return false } - return false } m.positionRangeDels() return true diff --git a/level_iter.go b/level_iter.go index eea331e5d0e..c192daadf1e 100644 --- a/level_iter.go +++ b/level_iter.go @@ -1018,37 +1018,29 @@ func (l *levelIter) skipEmptyFileForward() (*InternalKey, base.LazyValue) { // bounds. return nil, base.LazyValue{} } - // If the boundary is a range deletion tombstone, return that key. - if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete { - l.largestBoundary = &l.iterFile.LargestPointKey - if l.boundaryContext != nil { - l.boundaryContext.isIgnorableBoundaryKey = true - } - return l.largestBoundary, base.LazyValue{} - } - // If the last point iterator positioning op might've skipped keys, - // it's possible the file's range deletions are still relevant to - // other levels. Return the largest boundary as a special ignorable - // marker to avoid advancing to the next file. - // - // The sstable iterator cannot guarantee that keys were skipped. A - // SeekGE that lands on a index separator k only knows that the - // block at the index entry contains keys ≤ k. We can't know whether - // there were actually keys between the seek key and the index - // separator key. If the block is then excluded due to block - // property filters, the iterator does not know whether keys were - // actually skipped by the block's exclusion. + // If the boundary is a range deletion tombstone, or the caller is + // accessing range dels through l.rangeDelIterPtr, pause at an + // ignorable boundary key to avoid advancing to the next file until + // other levels are caught up. // - // Since MaybeFilteredKeys cannot guarantee that keys were skipped, - // it's possible l.iterFile.Largest was already returned. Returning - // l.iterFile.Largest again is a violation of the strict + // Note that even if the largest boundary is not a range deletion, + // there may still be range deletions beyong the last point key + // returned. When block-property filters are in use, the sstable + // iterator may have transparently skipped a tail of the point keys + // in the file. If the last point key returned /was/ the largest + // key, then we'll return a key with the same user key and trailer + // twice. Returning it again is a violation of the strict // monotonicity normally provided. The mergingIter's heap can // tolerate this repeat key and in this case will keep the level at // the top of the heap and immediately skip the entry, advancing to // the next file. - if *l.rangeDelIterPtr != nil && l.filteredIter != nil && - l.filteredIter.MaybeFilteredKeys() { - l.largestBoundary = &l.iterFile.Largest + // + // TODO(jackson): We should be able to condition this only on + // *l.rangeDelIterPtr != nil, but the getIter retains tombstones + // returned by the rangeDelIter after it's nil'd the ptr. + if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete || + *l.rangeDelIterPtr != nil { + l.largestBoundary = &l.iterFile.LargestPointKey if l.boundaryContext != nil { l.boundaryContext.isIgnorableBoundaryKey = true } diff --git a/merging_iter.go b/merging_iter.go index 741d7687dd3..a2a5a8fe1b2 100644 --- a/merging_iter.go +++ b/merging_iter.go @@ -952,6 +952,17 @@ func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) { if m.levels[item.index].isSyntheticIterBoundsKey { break } + // Skip ignorable boundary keys. These are not real keys and exist to + // keep sstables open until we've surpassed their end boundaries so that + // their range deletions are visible. + if m.levels[item.index].isIgnorableBoundaryKey { + m.err = m.prevEntry(item) + if m.err != nil { + return nil, base.LazyValue{} + } + continue + } + m.addItemStats(item) if isDeleted, err := m.isPrevEntryDeleted(item); err != nil { m.err = err @@ -960,8 +971,7 @@ func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) { m.stats.PointsCoveredByRangeTombstones++ continue } - if item.iterKey.Visible(m.snapshot, m.batchSnapshot) && - (!m.levels[item.index].isIgnorableBoundaryKey) { + if item.iterKey.Visible(m.snapshot, m.batchSnapshot) { return item.iterKey, item.iterValue } m.err = m.prevEntry(item) diff --git a/testdata/level_iter_boundaries b/testdata/level_iter_boundaries index c6d57ebdfd2..1dcd0395b8f 100644 --- a/testdata/level_iter_boundaries +++ b/testdata/level_iter_boundaries @@ -131,8 +131,10 @@ c.SET.2:c iter first next +next ---- c#2,SET:c +c#2,SET: . iter diff --git a/testdata/level_iter_seek b/testdata/level_iter_seek index 1b00fee0244..495d72e9668 100644 --- a/testdata/level_iter_seek +++ b/testdata/level_iter_seek @@ -105,6 +105,8 @@ next stats next stats +next +stats reset-stats stats ---- @@ -117,6 +119,8 @@ c#7,SET:c {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} f#5,SET:f {BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} +f#5,SET: +{BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} g#4,SET:g {BlockBytes:112 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} h#3,SET:h @@ -380,8 +384,10 @@ j.SET.3:j iter seek-ge f next +next ---- f/#5,SET:f +f#5,SET: i#4,SET:i # The below count should be 2, as we skip over the rangekey-only file. diff --git a/testdata/merging_iter b/testdata/merging_iter index ec61ee1d231..5c93e79e0a6 100644 --- a/testdata/merging_iter +++ b/testdata/merging_iter @@ -40,7 +40,7 @@ c#27,SET:27 e#10,SET:10 g#20,SET:20 . -{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:5 ValueBytes:8 PointCount:5 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} +{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:6 ValueBytes:8 PointCount:6 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} {BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}} # seekGE() should not allow the rangedel to act on points in the lower sstable that are after it.