From 1e3ee1e4e80873018af716a190e541925f09c285 Mon Sep 17 00:00:00 2001 From: Anton Okolnychyi Date: Mon, 28 Oct 2024 22:02:55 +0100 Subject: [PATCH] Core: Add portable Roaring bitmap for row positions (#11372) --- LICENSE | 1 + .../RoaringPositionBitmapBenchmark.java | 162 ++++++ .../deletes/RoaringPositionBitmap.java | 318 +++++++++++ .../deletes/TestRoaringPositionBitmap.java | 515 ++++++++++++++++++ .../apache/iceberg/deletes/64map32bitvals.bin | Bin 0 -> 48 bytes .../org/apache/iceberg/deletes/64mapempty.bin | Bin 0 -> 8 bytes .../apache/iceberg/deletes/64maphighvals.bin | Bin 0 -> 1086 bytes .../iceberg/deletes/64mapspreadvals.bin | Bin 0 -> 408 bytes 8 files changed, 996 insertions(+) create mode 100644 core/src/jmh/java/org/apache/iceberg/deletes/RoaringPositionBitmapBenchmark.java create mode 100644 core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java create mode 100644 core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java create mode 100644 core/src/test/resources/org/apache/iceberg/deletes/64map32bitvals.bin create mode 100644 core/src/test/resources/org/apache/iceberg/deletes/64mapempty.bin create mode 100644 core/src/test/resources/org/apache/iceberg/deletes/64maphighvals.bin create mode 100644 core/src/test/resources/org/apache/iceberg/deletes/64mapspreadvals.bin diff --git a/LICENSE b/LICENSE index efb46dab44da..76f6113d9811 100644 --- a/LICENSE +++ b/LICENSE @@ -298,6 +298,7 @@ License: https://www.apache.org/licenses/LICENSE-2.0 This product includes code from Delta Lake. * AssignmentAlignmentSupport is an independent development but UpdateExpressionsSupport in Delta was used as a reference. +* RoaringPositionBitmap is a Java implementation of RoaringBitmapArray in Delta. Copyright: 2020 The Delta Lake Project Authors. 
Home page: https://delta.io/
diff --git a/core/src/jmh/java/org/apache/iceberg/deletes/RoaringPositionBitmapBenchmark.java b/core/src/jmh/java/org/apache/iceberg/deletes/RoaringPositionBitmapBenchmark.java
new file mode 100644
index 000000000000..1cbc39583fbc
--- /dev/null
+++ b/core/src/jmh/java/org/apache/iceberg/deletes/RoaringPositionBitmapBenchmark.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.deletes;
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Timeout;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+import org.roaringbitmap.longlong.Roaring64Bitmap;
+
+/**
+ * A benchmark that evaluates the performance of {@link RoaringPositionBitmap}.
+ *
+ * <p>To run this benchmark: <code>
+ *   ./gradlew :iceberg-core:jmh
+ *       -PjmhIncludeRegex=RoaringPositionBitmapBenchmark
+ *       -PjmhOutputPath=benchmark/roaring-position-bitmap-benchmark.txt
+ * </code>
+ */
+@Fork(1)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3)
+@Measurement(iterations = 5)
+@BenchmarkMode(Mode.SingleShotTime)
+@Timeout(time = 5, timeUnit = TimeUnit.MINUTES)
+public class RoaringPositionBitmapBenchmark {
+
+  private static final Random RANDOM = new Random();
+  private static final int TOTAL_POSITIONS = 5_000_000;
+  private static final long STEP = 5L;
+
+  private long[] orderedPositions;
+  private long[] shuffledPositions;
+
+  @Setup
+  public void setupBenchmark() {
+    this.orderedPositions = generateOrderedPositions();
+    this.shuffledPositions = generateShuffledPositions();
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void addOrderedPositionsIcebergBitmap(Blackhole blackhole) {
+    RoaringPositionBitmap bitmap = new RoaringPositionBitmap();
+    for (long position : orderedPositions) {
+      bitmap.set(position);
+    }
+    blackhole.consume(bitmap);
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void addOrderedPositionsLibraryBitmap(Blackhole blackhole) {
+    Roaring64Bitmap bitmap = new Roaring64Bitmap();
+    for (long position : orderedPositions) {
+      bitmap.add(position);
+    }
+    blackhole.consume(bitmap);
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void addShuffledPositionsIcebergBitmap(Blackhole blackhole) {
+    RoaringPositionBitmap bitmap = new RoaringPositionBitmap();
+    for (long position : shuffledPositions) {
+      bitmap.set(position);
+    }
+    blackhole.consume(bitmap);
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void addShuffledPositionsLibraryBitmap(Blackhole blackhole) {
+    Roaring64Bitmap bitmap = new Roaring64Bitmap();
+    for (long position : shuffledPositions) {
+      bitmap.add(position);
+    }
+    blackhole.consume(bitmap);
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void addAndCheckPositionsIcebergBitmap(Blackhole blackhole) {
+    RoaringPositionBitmap bitmap = new RoaringPositionBitmap();
+
+    for (long position : shuffledPositions) {
+      bitmap.set(position);
+    }
+
+    for (long position = 0; position <= TOTAL_POSITIONS * STEP; position++) {
+      blackhole.consume(bitmap.contains(position)); // keep the lookup alive (no DCE)
+    }
+
+    blackhole.consume(bitmap);
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void addAndCheckPositionsLibraryBitmap(Blackhole blackhole) {
+    Roaring64Bitmap bitmap = new Roaring64Bitmap();
+
+    for (long position : shuffledPositions) {
+      bitmap.add(position);
+    }
+
+    for (long position = 0; position <= TOTAL_POSITIONS * STEP; position++) {
+      blackhole.consume(bitmap.contains(position)); // keep the lookup alive (no DCE)
+    }
+
+    blackhole.consume(bitmap);
+  }
+
+  private static long[] generateOrderedPositions() {
+    long[] positions = new long[TOTAL_POSITIONS];
+    for (int index = 0; index < TOTAL_POSITIONS; index++) {
+      positions[index] = index * STEP;
+    }
+    return positions;
+  }
+
+  private static long[] generateShuffledPositions() {
+    long[] positions = generateOrderedPositions();
+    shuffle(positions);
+    return positions;
+  }
+
+  private static void shuffle(long[] array) {
+    for (int index = array.length - 1; index > 0; index--) {
+      // swap with an element at a random index between 0 and index
+      int thatIndex = RANDOM.nextInt(index + 1);
+      long temp = array[index];
+      array[index] = array[thatIndex];
+      array[thatIndex] = temp;
+    }
+  }
+}
diff --git a/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java b/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java
new file mode 100644
index 000000000000..eec130743d85
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.deletes;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.List;
+import java.util.function.LongConsumer;
+import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.roaringbitmap.RoaringBitmap;
+
+/**
+ * A bitmap that supports positive 64-bit positions (the most significant bit must be 0), but is
+ * optimized for cases where most positions fit in 32 bits by using an array of 32-bit Roaring
+ * bitmaps. The internal bitmap array is grown as needed to accommodate the largest position.
+ *
+ * <p>Incoming 64-bit positions are divided into a 32-bit "key" using the most significant 4 bytes
+ * and a 32-bit position using the least significant 4 bytes. For each key in the set of positions,
+ * a 32-bit Roaring bitmap is maintained to store a set of 32-bit positions for that key.
+ *
+ * <p>To test whether a certain position is set, its most significant 4 bytes (the key) are used to
+ * find a 32-bit bitmap and the least significant 4 bytes are tested for inclusion in the bitmap. If
+ * a bitmap is not found for the key, then the position is not set.
+ *
+ * <p>Positions must range from 0 (inclusive) to {@link #MAX_POSITION} (inclusive). This class
+ * cannot handle positions with the key equal to Integer.MAX_VALUE because the length of the
+ * internal bitmap array is a signed 32-bit integer, which must be greater than or equal to 0.
+ * Supporting Integer.MAX_VALUE as a key would require allocating a bitmap array with size
+ * Integer.MAX_VALUE + 1, triggering an integer overflow.
+ */
+class RoaringPositionBitmap {
+
+  static final long MAX_POSITION = toPosition(Integer.MAX_VALUE - 1, Integer.MIN_VALUE);
+  private static final RoaringBitmap[] EMPTY_BITMAP_ARRAY = new RoaringBitmap[0];
+  private static final long BITMAP_COUNT_SIZE_BYTES = 8L;
+  private static final long BITMAP_KEY_SIZE_BYTES = 4L;
+
+  private RoaringBitmap[] bitmaps;
+
+  RoaringPositionBitmap() {
+    this.bitmaps = EMPTY_BITMAP_ARRAY;
+  }
+
+  private RoaringPositionBitmap(RoaringBitmap[] bitmaps) {
+    this.bitmaps = bitmaps;
+  }
+
+  /**
+   * Sets a position in the bitmap.
+   *
+   * @param pos the position
+   */
+  public void set(long pos) {
+    validatePosition(pos);
+    int key = key(pos);
+    int pos32Bits = pos32Bits(pos);
+    allocateBitmapsIfNeeded(key + 1 /* required bitmap array length */);
+    bitmaps[key].add(pos32Bits);
+  }
+
+  /**
+   * Sets a range of positions in the bitmap.
+   *
+   * @param posStartInclusive the start position of the range (inclusive)
+   * @param posEndExclusive the end position of the range (exclusive)
+   */
+  public void setRange(long posStartInclusive, long posEndExclusive) {
+    for (long pos = posStartInclusive; pos < posEndExclusive; pos++) {
+      set(pos);
+    }
+  }
+
+  /**
+   * Sets all positions from the other bitmap in this bitmap, modifying this bitmap in place.
+   *
+   * @param that the other bitmap
+   */
+  public void setAll(RoaringPositionBitmap that) {
+    allocateBitmapsIfNeeded(that.bitmaps.length);
+    for (int key = 0; key < that.bitmaps.length; key++) {
+      bitmaps[key].or(that.bitmaps[key]);
+    }
+  }
+
+  /**
+   * Checks if a position is set in the bitmap.
+   *
+   * @param pos the position
+   * @return true if the position is set in this bitmap, false otherwise
+   */
+  public boolean contains(long pos) {
+    validatePosition(pos);
+    int key = key(pos);
+    int pos32Bits = pos32Bits(pos);
+    return key < bitmaps.length && bitmaps[key].contains(pos32Bits);
+  }
+
+  /**
+   * Indicates whether the bitmap has any positions set.
+   *
+   * @return true if the bitmap is empty, false otherwise
+   */
+  public boolean isEmpty() {
+    return cardinality() == 0;
+  }
+
+  /**
+   * Returns the number of set positions in the bitmap.
+   *
+   * @return the number of set positions
+   */
+  public long cardinality() {
+    long cardinality = 0L;
+    for (RoaringBitmap bitmap : bitmaps) {
+      cardinality += bitmap.getLongCardinality();
+    }
+    return cardinality;
+  }
+
+  /**
+   * Applies run-length encoding wherever it is more space efficient.
+   *
+   * @return whether the bitmap was changed
+   */
+  public boolean runLengthEncode() {
+    boolean changed = false;
+    for (RoaringBitmap bitmap : bitmaps) {
+      changed |= bitmap.runOptimize();
+    }
+    return changed;
+  }
+
+  /**
+   * Iterates over all positions in the bitmap.
+   *
+   * @param consumer a consumer for positions
+   */
+  public void forEach(LongConsumer consumer) {
+    for (int key = 0; key < bitmaps.length; key++) {
+      forEach(key, bitmaps[key], consumer);
+    }
+  }
+
+  @VisibleForTesting
+  int allocatedBitmapCount() {
+    return bitmaps.length;
+  }
+
+  private void allocateBitmapsIfNeeded(int requiredLength) {
+    if (bitmaps.length < requiredLength) {
+      if (bitmaps.length == 0 && requiredLength == 1) {
+        this.bitmaps = new RoaringBitmap[] {new RoaringBitmap()};
+      } else {
+        RoaringBitmap[] newBitmaps = new RoaringBitmap[requiredLength];
+        System.arraycopy(bitmaps, 0, newBitmaps, 0, bitmaps.length);
+        for (int key = bitmaps.length; key < requiredLength; key++) {
+          newBitmaps[key] = new RoaringBitmap();
+        }
+        this.bitmaps = newBitmaps;
+      }
+    }
+  }
+
+  /**
+   * Returns the number of bytes required to serialize the bitmap.
+   *
+   * @return the serialized size in bytes
+   */
+  public long serializedSizeInBytes() {
+    long size = BITMAP_COUNT_SIZE_BYTES;
+    for (RoaringBitmap bitmap : bitmaps) {
+      size += BITMAP_KEY_SIZE_BYTES + bitmap.serializedSizeInBytes();
+    }
+    return size;
+  }
+
+  /**
+   * Serializes the bitmap using the portable serialization format described below.
+   *
+   * <ul>
+   *   <li>The number of 32-bit Roaring bitmaps, serialized as 8 bytes
+   *   <li>For each 32-bit Roaring bitmap, ordered by unsigned comparison of the 32-bit keys:
+   *       <ul>
+   *         <li>The key stored as 4 bytes
+   *         <li>Serialized 32-bit Roaring bitmap using the standard format
+   *       </ul>
+   * </ul>
+   *
+   * <p>Note the byte order of the buffer must be little-endian.
+   *
+   * @param buffer the buffer to write to
+   * @see <a href="https://github.com/RoaringBitmap/RoaringFormatSpec">Roaring bitmap spec</a>
+   */
+  public void serialize(ByteBuffer buffer) {
+    validateByteOrder(buffer);
+    buffer.putLong(bitmaps.length);
+    for (int key = 0; key < bitmaps.length; key++) {
+      buffer.putInt(key);
+      bitmaps[key].serialize(buffer);
+    }
+  }
+
+  /**
+   * Deserializes a bitmap from a buffer, assuming the portable serialization format.
+   *
+   * @param buffer the buffer to read from
+   * @return a new bitmap instance with the deserialized data
+   */
+  public static RoaringPositionBitmap deserialize(ByteBuffer buffer) {
+    validateByteOrder(buffer);
+
+    // the bitmap array may be sparse with more elements than the number of read bitmaps
+    int remainingBitmapCount = readBitmapCount(buffer);
+    List<RoaringBitmap> bitmaps = Lists.newArrayListWithExpectedSize(remainingBitmapCount);
+    int lastKey = -1;
+
+    while (remainingBitmapCount > 0) {
+      int key = readKey(buffer, lastKey);
+
+      // fill gaps as the bitmap array may be sparse
+      while (lastKey < key - 1) {
+        bitmaps.add(new RoaringBitmap());
+        lastKey++;
+      }
+
+      RoaringBitmap bitmap = readBitmap(buffer);
+      bitmaps.add(bitmap);
+
+      lastKey = key;
+      remainingBitmapCount--;
+    }
+
+    return new RoaringPositionBitmap(bitmaps.toArray(EMPTY_BITMAP_ARRAY));
+  }
+
+  private static void validateByteOrder(ByteBuffer buffer) {
+    Preconditions.checkArgument(
+        buffer.order() == ByteOrder.LITTLE_ENDIAN,
+        "Roaring bitmap serialization requires little-endian byte order");
+  }
+
+  private static int readBitmapCount(ByteBuffer buffer) {
+    long bitmapCount = buffer.getLong();
+    Preconditions.checkArgument(
+        bitmapCount >= 0 && bitmapCount <= Integer.MAX_VALUE,
+        "Invalid bitmap count: %s",
+        bitmapCount);
+    return (int) bitmapCount;
+  }
+
+  private static int readKey(ByteBuffer buffer, int lastKey) {
+    int key = buffer.getInt();
+    Preconditions.checkArgument(key >= 0, "Invalid unsigned key: %s", key);
+    Preconditions.checkArgument(key <= Integer.MAX_VALUE - 1, "Key is too large: %s", key);
+    Preconditions.checkArgument(key > lastKey, "Keys must be sorted in ascending order");
+    return key;
+  }
+
+  private static RoaringBitmap readBitmap(ByteBuffer buffer) {
+    try {
+      RoaringBitmap bitmap = new RoaringBitmap();
+      bitmap.deserialize(buffer);
+      buffer.position(buffer.position() + bitmap.serializedSizeInBytes());
+      return bitmap;
+    } catch (IOException e) {
+      throw new UncheckedIOException(e);
+    }
+  }
+
+  // extracts high 32 bits from a 64-bit position (i.e. key)
+  private static int key(long pos) {
+    return (int) (pos >> 32);
+  }
+
+  // extracts low 32 bits from a 64-bit position (i.e. 32-bit position)
+  private static int pos32Bits(long pos) {
+    return (int) pos;
+  }
+
+  // combines high and low 32 bits into a 64-bit position
+  // the low 32 bits must be bit-masked to avoid sign extension
+  private static long toPosition(int key, int pos32Bits) {
+    return (((long) key) << 32) | (((long) pos32Bits) & 0xFFFFFFFFL);
+  }
+
+  // iterates over 64-bit positions, reconstructing them from keys and 32-bit positions
+  private static void forEach(int key, RoaringBitmap bitmap, LongConsumer consumer) {
+    bitmap.forEach((int pos32Bits) -> consumer.accept(toPosition(key, pos32Bits)));
+  }
+
+  private static void validatePosition(long pos) {
+    Preconditions.checkArgument(
+        pos >= 0 && pos <= MAX_POSITION,
+        "Bitmap supports positions that are >= 0 and <= %s: %s",
+        MAX_POSITION,
+        pos);
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java b/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java
new file mode 100644
index 000000000000..2daf0382973b
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java
@@ -0,0 +1,515 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.deletes; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import java.util.Random; +import java.util.Set; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.relocated.com.google.common.io.Resources; +import org.apache.iceberg.util.Pair; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestRoaringPositionBitmap { + + private static final long BITMAP_SIZE = 0xFFFFFFFFL; + private static final long BITMAP_OFFSET = BITMAP_SIZE + 1L; + private static final long CONTAINER_SIZE = Character.MAX_VALUE; + private static final long CONTAINER_OFFSET = CONTAINER_SIZE + 1L; + private static final int 
VALIDATION_LOOKUP_COUNT = 20_000; + private static final Set SUPPORTED_OFFICIAL_EXAMPLE_FILES = + ImmutableSet.of("64map32bitvals.bin", "64mapempty.bin", "64mapspreadvals.bin"); + + @Parameters(name = "seed = {0}, validationSeed = {1}") + protected static List parameters() { + List parameters = Lists.newArrayList(); + Random random = new Random(); + long seed = random.nextLong(); + long validationSeed = random.nextLong(); + parameters.add(new Object[] {seed, validationSeed}); + return parameters; + } + + @Parameter(index = 0) + private long seed; + + @Parameter(index = 1) + private long validationSeed; + + @TestTemplate + public void testAdd() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + bitmap.set(10L); + assertThat(bitmap.contains(10L)).isTrue(); + + bitmap.set(0L); + assertThat(bitmap.contains(0L)).isTrue(); + + bitmap.set(10L); + assertThat(bitmap.contains(10L)).isTrue(); + } + + @TestTemplate + public void testAddPositionsRequiringMultipleBitmaps() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + // construct positions that differ in their high 32-bit parts (i.e. 
keys) + long pos1 = ((long) 0 << 32) | 10L; // key = 0, low = 10 + long pos2 = ((long) 1 << 32) | 20L; // key = 1, low = 20 + long pos3 = ((long) 2 << 32) | 30L; // key = 2, low = 30 + long pos4 = ((long) 100 << 32) | 40L; // key = 100, low = 40 + + bitmap.set(pos1); + bitmap.set(pos2); + bitmap.set(pos3); + bitmap.set(pos4); + + assertThat(bitmap.contains(pos1)).isTrue(); + assertThat(bitmap.contains(pos2)).isTrue(); + assertThat(bitmap.contains(pos3)).isTrue(); + assertThat(bitmap.contains(pos4)).isTrue(); + assertThat(bitmap.cardinality()).isEqualTo(4); + assertThat(bitmap.serializedSizeInBytes()).isGreaterThan(4); + assertThat(bitmap.allocatedBitmapCount()).isEqualTo(101 /* max key + 1 */); + } + + @TestTemplate + public void testAddRange() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + long posStartInclusive = 10L; + long posEndExclusive = 20L; + bitmap.setRange(posStartInclusive, posEndExclusive); + + // assert that all positions in the range [10, 20) are added + for (long pos = posStartInclusive; pos < posEndExclusive; pos++) { + assertThat(bitmap.contains(pos)).isTrue(); + } + + // assert that positions outside the range are not present + assertThat(bitmap.contains(9L)).isFalse(); + assertThat(bitmap.contains(20L)).isFalse(); + + // assert that the cardinality is correct (10 positions in range [10, 20)) + assertThat(bitmap.cardinality()).isEqualTo(10); + } + + @TestTemplate + public void testAddRangeAcrossKeys() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + long posStartInclusive = ((long) 1 << 32) - 5L; + long posEndExclusive = ((long) 1 << 32) + 5L; + bitmap.setRange(posStartInclusive, posEndExclusive); + + // assert that all positions in the range are added + for (long pos = posStartInclusive; pos < posEndExclusive; pos++) { + assertThat(bitmap.contains(pos)).isTrue(); + } + + // assert that positions outside the range are not present + assertThat(bitmap.contains(0)).isFalse(); + 
assertThat(bitmap.contains(posEndExclusive)).isFalse(); + + // assert that the cardinality is correct + assertThat(bitmap.cardinality()).isEqualTo(10); + } + + @TestTemplate + public void testAddEmptyRange() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + bitmap.setRange(10, 10); + assertThat(bitmap.isEmpty()).isTrue(); + } + + @TestTemplate + public void testAddAll() { + RoaringPositionBitmap bitmap1 = new RoaringPositionBitmap(); + bitmap1.set(10L); + bitmap1.set(20L); + + RoaringPositionBitmap bitmap2 = new RoaringPositionBitmap(); + bitmap2.set(30L); + bitmap2.set(40L); + bitmap2.set((long) 2 << 32); + + bitmap1.setAll(bitmap2); + + assertThat(bitmap1.contains(10L)).isTrue(); + assertThat(bitmap1.contains(20L)).isTrue(); + assertThat(bitmap1.contains(30L)).isTrue(); + assertThat(bitmap1.contains(40L)).isTrue(); + assertThat(bitmap1.contains((long) 2 << 32)).isTrue(); + assertThat(bitmap1.cardinality()).isEqualTo(5); + + assertThat(bitmap2.contains(10L)).isFalse(); + assertThat(bitmap2.contains(20L)).isFalse(); + assertThat(bitmap2.cardinality()).isEqualTo(3); + } + + @TestTemplate + public void testAddAllWithEmptyBitmap() { + RoaringPositionBitmap bitmap1 = new RoaringPositionBitmap(); + bitmap1.set(10L); + bitmap1.set(20L); + + RoaringPositionBitmap emptyBitmap = new RoaringPositionBitmap(); + + bitmap1.setAll(emptyBitmap); + + assertThat(bitmap1.contains(10L)).isTrue(); + assertThat(bitmap1.contains(20L)).isTrue(); + assertThat(bitmap1.cardinality()).isEqualTo(2); + + assertThat(emptyBitmap.contains(10L)).isFalse(); + assertThat(emptyBitmap.contains(20L)).isFalse(); + assertThat(emptyBitmap.cardinality()).isEqualTo(0); + assertThat(emptyBitmap.isEmpty()).isTrue(); + } + + @TestTemplate + public void testAddAllWithOverlappingBitmap() { + RoaringPositionBitmap bitmap1 = new RoaringPositionBitmap(); + bitmap1.set(10L); + bitmap1.set(20L); + bitmap1.set(30L); + + RoaringPositionBitmap bitmap2 = new RoaringPositionBitmap(); + bitmap2.set(20L); + 
bitmap2.set(40L); + + bitmap1.setAll(bitmap2); + + assertThat(bitmap1.contains(10L)).isTrue(); + assertThat(bitmap1.contains(20L)).isTrue(); + assertThat(bitmap1.contains(30L)).isTrue(); + assertThat(bitmap1.contains(40L)).isTrue(); + assertThat(bitmap1.cardinality()).isEqualTo(4); + + assertThat(bitmap2.contains(10L)).isFalse(); + assertThat(bitmap2.contains(20L)).isTrue(); + assertThat(bitmap2.contains(30L)).isFalse(); + assertThat(bitmap2.contains(40L)).isTrue(); + assertThat(bitmap2.cardinality()).isEqualTo(2); + } + + @TestTemplate + public void testAddAllSparseBitmaps() { + RoaringPositionBitmap bitmap1 = new RoaringPositionBitmap(); + bitmap1.set((long) 0 << 32 | 100L); // key = 0, low = 100 + bitmap1.set((long) 1 << 32 | 200L); // key = 1, low = 200 + + RoaringPositionBitmap bitmap2 = new RoaringPositionBitmap(); + bitmap2.set((long) 2 << 32 | 300L); // key = 2, low = 300 + bitmap2.set((long) 3 << 32 | 400L); // key = 3, low = 400 + + bitmap1.setAll(bitmap2); + + assertThat(bitmap1.contains((long) 0 << 32 | 100L)).isTrue(); + assertThat(bitmap1.contains((long) 1 << 32 | 200L)).isTrue(); + assertThat(bitmap1.contains((long) 2 << 32 | 300L)).isTrue(); + assertThat(bitmap1.contains((long) 3 << 32 | 400L)).isTrue(); + assertThat(bitmap1.cardinality()).isEqualTo(4); + } + + @TestTemplate + public void testCardinality() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + assertThat(bitmap.cardinality()).isEqualTo(0); + + bitmap.set(10L); + bitmap.set(20L); + bitmap.set(30L); + + assertThat(bitmap.cardinality()).isEqualTo(3); + + bitmap.set(10L); // already exists + + assertThat(bitmap.cardinality()).isEqualTo(3); + } + + @TestTemplate + public void testCardinalitySparseBitmaps() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + bitmap.set((long) 0 << 32 | 100L); // key = 0, low = 100 + bitmap.set((long) 0 << 32 | 101L); // key = 0, low = 101 + bitmap.set((long) 0 << 32 | 105L); // key = 0, low = 105 + bitmap.set((long) 1 << 32 | 
200L); // key = 1, low = 200 + bitmap.set((long) 100 << 32 | 300L); // key = 100, low = 300 + + assertThat(bitmap.cardinality()).isEqualTo(5); + } + + @TestTemplate + public void testSerializeDeserializeAllContainerBitmap() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + // bitmap 0, container 0 (array) + bitmap.set(position(0 /* bitmap */, 0 /* container */, 5L)); + bitmap.set(position(0 /* bitmap */, 0 /* container */, 7L)); + + // bitmap 0, container 1 (array that can be compressed) + bitmap.setRange( + position(0 /* bitmap */, 1 /* container */, 1L), + position(0 /* bitmap */, 1 /* container */, 1000L)); + + // bitmap 0, container 2 (bitset) + bitmap.setRange( + position(0 /* bitmap */, 2 /* container */, 1L), + position(0 /* bitmap */, 2 /* container */, CONTAINER_OFFSET - 1L)); + + // bitmap 1, container 0 (array) + bitmap.set(position(1 /* bitmap */, 0 /* container */, 10L)); + bitmap.set(position(1 /* bitmap */, 0 /* container */, 20L)); + + // bitmap 1, container 1 (array that can be compressed) + bitmap.setRange( + position(1 /* bitmap */, 1 /* container */, 10L), + position(1 /* bitmap */, 1 /* container */, 500L)); + + // bitmap 1, container 2 (bitset) + bitmap.setRange( + position(1 /* bitmap */, 2 /* container */, 1L), + position(1 /* bitmap */, 2 /* container */, CONTAINER_OFFSET - 1)); + + assertThat(bitmap.runLengthEncode()).as("Bitmap must be RLE encoded").isTrue(); + + RoaringPositionBitmap bitmapCopy = roundTripSerialize(bitmap); + + assertThat(bitmapCopy.cardinality()).isEqualTo(bitmap.cardinality()); + bitmapCopy.forEach(position -> assertThat(bitmap.contains(position)).isTrue()); + bitmap.forEach(position -> assertThat(bitmapCopy.contains(position)).isTrue()); + } + + @TestTemplate + public void testDeserializeSupportedRoaringExamples() throws IOException { + for (String file : SUPPORTED_OFFICIAL_EXAMPLE_FILES) { + RoaringPositionBitmap bitmap = readBitmap(file); + assertThat(bitmap).isNotNull(); + } + } + + @TestTemplate 
+ public void testDeserializeUnsupportedRoaringExample() { + // this file contains a value that is larger than the max supported value in our impl + assertThatThrownBy(() -> readBitmap("64maphighvals.bin")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Invalid unsigned key"); + } + + /** Verifies that set and contains throw IllegalArgumentException for -1 and for one past MAX_POSITION. */ @TestTemplate + public void testUnsupportedPositions() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + assertThatThrownBy(() -> bitmap.set(-1L)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Bitmap supports positions that are >= 0 and <= %s", + RoaringPositionBitmap.MAX_POSITION); + + assertThatThrownBy(() -> bitmap.contains(-1L)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Bitmap supports positions that are >= 0 and <= %s", + RoaringPositionBitmap.MAX_POSITION); + + assertThatThrownBy(() -> bitmap.set(RoaringPositionBitmap.MAX_POSITION + 1L)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Bitmap supports positions that are >= 0 and <= %s", + RoaringPositionBitmap.MAX_POSITION); + + assertThatThrownBy(() -> bitmap.contains(RoaringPositionBitmap.MAX_POSITION + 1L)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Bitmap supports positions that are >= 0 and <= %s", + RoaringPositionBitmap.MAX_POSITION); + } + + /** Verifies that deserialize rejects a buffer whose order was never switched to little-endian (ByteBuffer.allocate defaults to big-endian). */ @TestTemplate + public void testInvalidSerializationByteOrder() { + assertThatThrownBy(() -> RoaringPositionBitmap.deserialize(ByteBuffer.allocate(4))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("serialization requires little-endian byte order"); + } + + /** Builds a sparse bitmap of 100,000 random positions below 5 << 32 and compares it with the expected position set. */ @TestTemplate + public void testRandomSparseBitmap() { + Pair> bitmapAndPositions = + generateSparseBitmap( + 0L /* min position */, + (long) 5 << 32 /* max position must not need more than 5 bitmaps */, + 100_000 /* cardinality */); + RoaringPositionBitmap bitmap = bitmapAndPositions.first(); + Set positions = bitmapAndPositions.second(); 
+ assertEqual(bitmap, positions); + assertRandomPositions(bitmap, positions); + } + + /** Builds a dense bitmap via generateDenseBitmap(7) and compares it with the expected position set. */ @TestTemplate + public void testRandomDenseBitmap() { + Pair> bitmapAndPositions = generateDenseBitmap(7); + RoaringPositionBitmap bitmap = bitmapAndPositions.first(); + Set positions = bitmapAndPositions.second(); + assertEqual(bitmap, positions); + assertRandomPositions(bitmap, positions); + } + + /** Starts from a sparse bitmap, merges in two dense and two more sparse generated bitmaps via setAll, and compares the result with the combined expected positions. */ @TestTemplate + public void testRandomMixedBitmap() { + Pair> bitmapAndPositions = + generateSparseBitmap( + (long) 3 << 32 /* min position must need at least 3 bitmaps */, + (long) 5 << 32 /* max position must not need more than 5 bitmaps */, + 100_000 /* cardinality */); + RoaringPositionBitmap bitmap = bitmapAndPositions.first(); + Set positions = bitmapAndPositions.second(); + + Pair> pair1 = generateDenseBitmap(9); + bitmap.setAll(pair1.first()); + positions.addAll(pair1.second()); + + Pair> pair2 = + generateSparseBitmap( + 0 /* min position */, + (long) 3 << 32 /* max position must not need more than 3 bitmaps */, + 25_000 /* cardinality */); + bitmap.setAll(pair2.first()); + positions.addAll(pair2.second()); + + Pair> pair3 = generateDenseBitmap(3); + bitmap.setAll(pair3.first()); + positions.addAll(pair3.second()); + + Pair> pair4 = + generateSparseBitmap( + 0 /* min position */, + (long) 1 << 32 /* max position must not need more than 1 bitmap */, + 5_000 /* cardinality */); + bitmap.setAll(pair4.first()); + positions.addAll(pair4.second()); + + assertEqual(bitmap, positions); + assertRandomPositions(bitmap, positions); + } + + /** Sets random positions from [minInclusive, maxExclusive) until the expected set holds size distinct values; returns the bitmap paired with that set. */ private Pair> generateSparseBitmap( + long minInclusive, long maxExclusive, int size) { + Random random = new Random(seed); + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + Set positions = Sets.newHashSet(); + + while (positions.size() < size) { + long position = nextLong(random, minInclusive, maxExclusive); + positions.add(position); + bitmap.set(position); + } + + return Pair.of(bitmap, positions); + } + + /** Sets contiguous runs of positions separated by random gaps until allocatedBitmapCount() exceeds requiredBitmapCount; returns the bitmap paired with every position that was set. */ private Pair> 
generateDenseBitmap(int requiredBitmapCount) { + Random random = new Random(seed); + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + Set positions = Sets.newHashSet(); + long currentPosition = 0; + + while (bitmap.allocatedBitmapCount() <= requiredBitmapCount) { + long maxRunPosition = currentPosition + nextLong(random, 1000, 2 * CONTAINER_SIZE); /* inclusive end of the current run */ + for (long position = currentPosition; position <= maxRunPosition; position++) { + bitmap.set(position); + positions.add(position); + } + long shift = nextLong(random, (long) (0.1 * BITMAP_SIZE), (long) (0.25 * BITMAP_SIZE)); /* gap between the end of this run and the start of the next */ + currentPosition = maxRunPosition + shift; + } + + return Pair.of(bitmap, positions); + } + + /** Looks up VALIDATION_LOOKUP_COUNT random positions below MAX_POSITION and checks that contains matches the expected set for each one. */ private void assertRandomPositions(RoaringPositionBitmap bitmap, Set positions) { + Random random = new Random(validationSeed); + for (int ordinal = 0; ordinal < VALIDATION_LOOKUP_COUNT; ordinal++) { + long position = nextLong(random, 0, RoaringPositionBitmap.MAX_POSITION); + assertThat(bitmap.contains(position)).isEqualTo(positions.contains(position)); + } + } + + /** Returns a value in [minInclusive, maxExclusive) by scaling random.nextDouble() over the range and truncating the result to long. */ private static long nextLong(Random random, long minInclusive, long maxExclusive) { + return minInclusive + (long) (random.nextDouble() * (maxExclusive - minInclusive)); + } + + /** Combines the bitmap index scaled by BITMAP_OFFSET, the container index scaled by CONTAINER_OFFSET, and value into one position. NOTE(review): the products avoid int overflow only if both offsets are declared long; confirm. */ private static long position(int bitmapIndex, int containerIndex, long value) { + return bitmapIndex * BITMAP_OFFSET + containerIndex * CONTAINER_OFFSET + value; + } + + /** Serializes the bitmap into a little-endian heap buffer sized by serializedSizeInBytes(), flips it, and returns the deserialized copy. NOTE(review): the int cast assumes the serialized size fits in an int; confirm for large inputs. */ private static RoaringPositionBitmap roundTripSerialize(RoaringPositionBitmap bitmap) { + ByteBuffer buffer = ByteBuffer.allocate((int) bitmap.serializedSizeInBytes()); + buffer.order(ByteOrder.LITTLE_ENDIAN); + bitmap.serialize(buffer); + buffer.flip(); + return RoaringPositionBitmap.deserialize(buffer); + } + + /** Wraps the bytes of the named test resource in a little-endian buffer and deserializes a bitmap from it. */ private static RoaringPositionBitmap readBitmap(String resourceName) throws IOException { + byte[] bytes = readTestResource(resourceName); + ByteBuffer buffer = ByteBuffer.wrap(bytes); + buffer.order(ByteOrder.LITTLE_ENDIAN); + return RoaringPositionBitmap.deserialize(buffer); + } + 
+ /** Looks up resourceName through Resources.getResource with TestRoaringPositionBitmap as the context class and returns its bytes. */ private static byte[] readTestResource(String resourceName) throws IOException { + URL resource = Resources.getResource(TestRoaringPositionBitmap.class, resourceName); + return Resources.toByteArray(resource); + } + + /** Compares the bitmap with the expected positions three times: as given, after a serialization round trip, and after runLengthEncode() followed by another round trip. */ private static void assertEqual(RoaringPositionBitmap bitmap, Set positions) { + assertEqualContent(bitmap, positions); + + RoaringPositionBitmap bitmapCopy1 = roundTripSerialize(bitmap); + assertEqualContent(bitmapCopy1, positions); + + bitmap.runLengthEncode(); + RoaringPositionBitmap bitmapCopy2 = roundTripSerialize(bitmap); + assertEqualContent(bitmapCopy2, positions); + } + + /** Checks that cardinality equals the set size, that every expected position is contained, and that every position visited by forEach is expected. */ private static void assertEqualContent(RoaringPositionBitmap bitmap, Set positions) { + assertThat(bitmap.cardinality()).isEqualTo(positions.size()); + positions.forEach(position -> assertThat(bitmap.contains(position)).isTrue()); + bitmap.forEach(position -> assertThat(positions.contains(position)).isTrue()); + } +} diff --git a/core/src/test/resources/org/apache/iceberg/deletes/64map32bitvals.bin b/core/src/test/resources/org/apache/iceberg/deletes/64map32bitvals.bin new file mode 100644 index 0000000000000000000000000000000000000000..475b894417e44cff61d8810057fc1530cef05718 GIT binary patch literal 48 ocmZQ%KmaQP1_nkjmy9 literal 0 HcmV?d00001 diff --git a/core/src/test/resources/org/apache/iceberg/deletes/64maphighvals.bin b/core/src/test/resources/org/apache/iceberg/deletes/64maphighvals.bin new file mode 100644 index 0000000000000000000000000000000000000000..d4312b8d22713991026a36d5d1293cf1960d89ed GIT binary patch literal 1086 zcmd;PfPnY=_rj5t0RsagP#7Y>#UKD@!S*SERgUMnOxf@r{zi~~v PF5QrBO1Grj(uH&%!J7vn literal 0 HcmV?d00001