From 2e503b6575496a7e7e93baa85ed1730a26f7126e Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 29 Mar 2024 10:07:22 +0100 Subject: [PATCH] Make `OneMerge#reorder` preserve blocks. (#13128) This updates `IndexWriter` to only call `OneMerge#reorder` when it has a chance to preserve the block structure, ie. either there are no blocks or blocks are identified by a parent field. Furthermore, `MockRandomMergePolicy` is updated to preserve the block structure when a parent field is configured. --- .../org/apache/lucene/index/IndexWriter.java | 24 ++++++- .../tests/index/MockRandomMergePolicy.java | 60 +++++++++++----- .../index/TestMockRandomMergePolicy.java | 71 +++++++++++++++++++ 3 files changed, 136 insertions(+), 19 deletions(-) create mode 100644 lucene/test-framework/src/test/org/apache/lucene/tests/index/TestMockRandomMergePolicy.java diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 5dcb0e416db3..f97ed2e37591 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -3430,7 +3430,17 @@ public void addIndexesReaderMerge(MergePolicy.OneMerge merge) throws IOException readers.add(reader); } - if (config.getIndexSort() == null && readers.isEmpty() == false) { + // Don't reorder if an explicit sort is configured. + final boolean hasIndexSort = config.getIndexSort() != null; + // Don't reorder if blocks can't be identified using the parent field. + final boolean hasBlocksButNoParentField = + readers.stream().map(LeafReader::getMetaData).anyMatch(LeafMetaData::hasBlocks) + && readers.stream() + .map(CodecReader::getFieldInfos) + .map(FieldInfos::getParentField) + .anyMatch(Objects::isNull); + + if (hasIndexSort == false && hasBlocksButNoParentField == false && readers.isEmpty() == false) { CodecReader mergedReader = SlowCompositeCodecReaderWrapper.wrap(readers); DocMap docMap = merge.reorder(mergedReader, directory); if (docMap != null) { @@ -5208,7 +5218,17 @@ public int length() { } MergeState.DocMap[] reorderDocMaps = null; - if (config.getIndexSort() == null) { + // Don't reorder if an explicit sort is configured. + final boolean hasIndexSort = config.getIndexSort() != null; + // Don't reorder if blocks can't be identified using the parent field. + final boolean hasBlocksButNoParentField = + mergeReaders.stream().map(LeafReader::getMetaData).anyMatch(LeafMetaData::hasBlocks) + && mergeReaders.stream() + .map(CodecReader::getFieldInfos) + .map(FieldInfos::getParentField) + .anyMatch(Objects::isNull); + + if (hasIndexSort == false && hasBlocksButNoParentField == false) { // Create a merged view of the input segments. This effectively does the merge. CodecReader mergedView = SlowCompositeCodecReaderWrapper.wrap(mergeReaders); Sorter.DocMap docMap = merge.reorder(mergedView, directory); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java index 8b0370dfb933..1b509c7c3965 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java @@ -24,6 +24,7 @@ import java.util.Random; import java.util.Set; import org.apache.lucene.index.CodecReader; +import org.apache.lucene.index.DocValues; import org.apache.lucene.index.FilterLeafReader; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MergeTrigger; @@ -34,6 +35,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BitSet; /** MergePolicy that makes random decisions for testing. */ public class MockRandomMergePolicy extends MergePolicy { @@ -245,26 +247,50 @@ public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOExcepti System.out.println("NOTE: MockRandomMergePolicy now reverses reader=" + reader); } // Reverse the doc ID order - final int maxDoc = reader.maxDoc(); - return new Sorter.DocMap() { + return reverse(reader); + } + return null; + } + } - @Override - public int size() { - return maxDoc; - } + static Sorter.DocMap reverse(CodecReader reader) throws IOException { + final int maxDoc = reader.maxDoc(); + final BitSet parents; + if (reader.getFieldInfos().getParentField() == null) { + parents = null; + } else { + parents = + BitSet.of(DocValues.getNumeric(reader, reader.getFieldInfos().getParentField()), maxDoc); + } + return new Sorter.DocMap() { + + @Override + public int size() { + return maxDoc; + } - @Override - public int oldToNew(int docID) { - return maxDoc - 1 - docID; - } + @Override + public int oldToNew(int docID) { + if (parents == null) { + return maxDoc - 1 - docID; + } else { + final int oldBlockStart = docID == 0 ? 0 : parents.prevSetBit(docID - 1) + 1; + final int oldBlockEnd = parents.nextSetBit(docID); + final int newBlockEnd = maxDoc - 1 - oldBlockStart; + return newBlockEnd - (oldBlockEnd - docID); + } + } - @Override - public int newToOld(int docID) { - return maxDoc - 1 - docID; - } - }; + @Override + public int newToOld(int docID) { + if (parents == null) { + return maxDoc - 1 - docID; + } else { + final int oldBlockEnd = parents.nextSetBit(maxDoc - 1 - docID); + final int newBlockEnd = oldToNew(oldBlockEnd); + return oldBlockEnd - (newBlockEnd - docID); + } } - return null; - } + }; } } diff --git a/lucene/test-framework/src/test/org/apache/lucene/tests/index/TestMockRandomMergePolicy.java b/lucene/test-framework/src/test/org/apache/lucene/tests/index/TestMockRandomMergePolicy.java new file mode 100644 index 000000000000..4dd37c48b1c7 --- /dev/null +++ b/lucene/test-framework/src/test/org/apache/lucene/tests/index/TestMockRandomMergePolicy.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.tests.index; + +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.CodecReader; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Sorter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestMockRandomMergePolicy extends LuceneTestCase { + + public void testReverseWithParents() throws Exception { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig().setParentField("_parent")); + List docs = new ArrayList<>(); + for (int i = 0; i < 5; ++i) { + docs.add(new Document()); + } + w.addDocuments(docs.subList(0, 2)); + w.addDocuments(docs.subList(0, 4)); + w.addDocuments(docs.subList(0, 3)); + w.forceMerge(1); + w.close(); + DirectoryReader reader = DirectoryReader.open(dir); + CodecReader codecReader = (CodecReader) getOnlyLeafReader(reader); + Sorter.DocMap docMap = MockRandomMergePolicy.reverse(codecReader); + + assertEquals(7, docMap.oldToNew(0)); + assertEquals(8, docMap.oldToNew(1)); + assertEquals(3, docMap.oldToNew(2)); + assertEquals(4, docMap.oldToNew(3)); + assertEquals(5, docMap.oldToNew(4)); + assertEquals(6, docMap.oldToNew(5)); + assertEquals(0, docMap.oldToNew(6)); + assertEquals(1, docMap.oldToNew(7)); + assertEquals(2, docMap.oldToNew(8)); + + assertEquals(6, docMap.newToOld(0)); + assertEquals(7, docMap.newToOld(1)); + assertEquals(8, docMap.newToOld(2)); + assertEquals(2, docMap.newToOld(3)); + assertEquals(3, docMap.newToOld(4)); + assertEquals(4, docMap.newToOld(5)); + assertEquals(5, docMap.newToOld(6)); + assertEquals(0, docMap.newToOld(7)); + assertEquals(1, docMap.newToOld(8)); + + reader.close(); + dir.close(); + } +}