From 82f77319fd9e89d0aac4d712e5ddc27a263abdfc Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 19 Nov 2024 16:39:48 +0800 Subject: [PATCH 01/35] upgrade hudi to 0.15.0 --- fe/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/pom.xml b/fe/pom.xml index d78cfd50b819b4..f1bcc0f7210bb4 100644 --- a/fe/pom.xml +++ b/fe/pom.xml @@ -317,7 +317,7 @@ under the License. 1.11.4 17.0.0 - 0.14.1 + 0.15.0 2.7.4-11 3.0.0-8 From 415c6de66190ffd5bf37c88aa13f4ff41ed946c7 Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 19 Nov 2024 22:50:48 +0800 Subject: [PATCH 02/35] copy HoodieLocalEngineContext --- .../hudi/source/HudiLocalEngineContext.java | 89 ++++++++----------- 1 file changed, 38 insertions(+), 51 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiLocalEngineContext.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiLocalEngineContext.java index 26ef6fdfef7086..e3207a6ee83810 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiLocalEngineContext.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiLocalEngineContext.java @@ -17,20 +17,11 @@ package org.apache.doris.datasource.hudi.source; -import org.apache.doris.datasource.hive.HiveMetaStoreClientHelper; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.data.HoodieAccumulator; import org.apache.hudi.common.data.HoodieAtomicLongAccumulator; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.data.HoodieData.HoodieDataCacheKey; import org.apache.hudi.common.data.HoodieListData; -import org.apache.hudi.common.engine.EngineProperty; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.engine.LocalTaskContextSupplier; -import org.apache.hudi.common.engine.TaskContextSupplier; -import org.apache.hudi.common.function.FunctionWrapper; import org.apache.hudi.common.function.SerializableBiFunction; import org.apache.hudi.common.function.SerializableConsumer; import org.apache.hudi.common.function.SerializableFunction; @@ -39,7 +30,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.storage.StorageConfiguration; import java.util.Collections; import java.util.Iterator; @@ -49,19 +40,29 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static java.util.stream.Collectors.toList; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingReduceWrapper; + /** - * This file is copied from org.apache.hudi.common.engine.HoodieLocalEngineContext. + * This file is copied from + * org.apache.hudi.common.engine.HoodieLocalEngineContext. * Because we need set ugi in thread pool - * A java based engine context, use this implementation on the query engine integrations if needed. 
+ * A java based engine context, use this implementation on the query engine + * integrations if needed. */ -public final class HudiLocalEngineContext extends HoodieEngineContext { +public final class HoodieLocalEngineContext extends HoodieEngineContext { - public HudiLocalEngineContext(Configuration conf) { + public HoodieLocalEngineContext(StorageConfiguration conf) { this(conf, new LocalTaskContextSupplier()); } - public HudiLocalEngineContext(Configuration conf, TaskContextSupplier taskContextSupplier) { - super(new SerializableConfiguration(conf), taskContextSupplier); + public HoodieLocalEngineContext(StorageConfiguration conf, TaskContextSupplier taskContextSupplier) { + super(conf, taskContextSupplier); } @Override @@ -81,69 +82,55 @@ public HoodieData parallelize(List data, int parallelism) { @Override public List map(List data, SerializableFunction func, int parallelism) { - return data.stream().parallel().map(v1 -> { - try { - return HiveMetaStoreClientHelper.ugiDoAs(getHadoopConf().get(), () -> func.apply(v1)); - } catch (Exception e) { - throw new HoodieException("Error occurs when executing map", e); - } - }).collect(Collectors.toList()); + return data.stream().parallel().map(throwingMapWrapper(func)).collect(toList()); } @Override public List mapToPairAndReduceByKey( - List data, - SerializablePairFunction mapToPairFunc, - SerializableBiFunction reduceFunc, int parallelism) { - return data.stream().parallel().map(FunctionWrapper.throwingMapToPairWrapper(mapToPairFunc)) - .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() - .map(list -> - list.stream() - .map(e -> e.getValue()) - .reduce(FunctionWrapper.throwingReduceWrapper(reduceFunc)).get()) - .collect(Collectors.toList()); + List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, + int parallelism) { + return data.stream().parallel().map(throwingMapToPairWrapper(mapToPairFunc)) + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).get()) + .collect(Collectors.toList()); } @Override public Stream> mapPartitionsToPairAndReduceByKey( Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { - return FunctionWrapper.throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) - .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream() - .map(entry -> new ImmutablePair<>(entry.getKey(), entry.getValue().stream().map( - Pair::getValue).reduce(FunctionWrapper.throwingReduceWrapper(reduceFunc)).orElse(null))) - .filter(Objects::nonNull); + return throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) + .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream() + .map(entry -> new ImmutablePair<>(entry.getKey(), entry.getValue().stream().map( + Pair::getValue).reduce(throwingReduceWrapper(reduceFunc)).orElse(null))) + .filter(Objects::nonNull); } @Override public List reduceByKey( List> data, SerializableBiFunction reduceFunc, int parallelism) { return data.stream().parallel() - .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() - .map(list -> - list.stream() - .map(e -> e.getValue()) - .reduce(FunctionWrapper.throwingReduceWrapper(reduceFunc)).orElse(null)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> 
e.getValue()).reduce(throwingReduceWrapper(reduceFunc)) + .orElse(null)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); } @Override public List flatMap(List data, SerializableFunction> func, int parallelism) { - return - data.stream().parallel().flatMap(FunctionWrapper.throwingFlatMapWrapper(func)).collect(Collectors.toList()); + return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(toList()); } @Override public void foreach(List data, SerializableConsumer consumer, int parallelism) { - data.stream().forEach(FunctionWrapper.throwingForeachWrapper(consumer)); + data.stream().forEach(throwingForeachWrapper(consumer)); } @Override public Map mapToPair(List data, SerializablePairFunction func, Integer parallelism) { - return data.stream().map(FunctionWrapper.throwingMapToPairWrapper(func)).collect( - Collectors.toMap(Pair::getLeft, Pair::getRight, (oldVal, newVal) -> newVal) - ); + return data.stream().map(throwingMapToPairWrapper(func)).collect( + Collectors.toMap(Pair::getLeft, Pair::getRight, (oldVal, newVal) -> newVal)); } @Override From 06cb47e2a89f05206fff9eb43b7c9add3d9ab8c3 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 20 Nov 2024 00:16:51 +0800 Subject: [PATCH 03/35] fix checkstyle --- .../hudi/source/HudiLocalEngineContext.java | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiLocalEngineContext.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiLocalEngineContext.java index e3207a6ee83810..fecc026cf8d046 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiLocalEngineContext.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiLocalEngineContext.java @@ -22,6 +22,11 @@ import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.data.HoodieData.HoodieDataCacheKey; import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.LocalTaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.function.FunctionWrapper; import org.apache.hudi.common.function.SerializableBiFunction; import org.apache.hudi.common.function.SerializableConsumer; import org.apache.hudi.common.function.SerializableFunction; @@ -40,28 +45,20 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static java.util.stream.Collectors.toList; -import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapToPairWrapper; -import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper; -import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper; -import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; -import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; -import static org.apache.hudi.common.function.FunctionWrapper.throwingReduceWrapper; - /** * This file is copied from - * org.apache.hudi.common.engine.HoodieLocalEngineContext. + * org.apache.hudi.common.engine.HudiLocalEngineContext. * Because we need set ugi in thread pool * A java based engine context, use this implementation on the query engine * integrations if needed. 
*/ -public final class HoodieLocalEngineContext extends HoodieEngineContext { +public final class HudiLocalEngineContext extends HoodieEngineContext { - public HoodieLocalEngineContext(StorageConfiguration conf) { + public HudiLocalEngineContext(StorageConfiguration conf) { this(conf, new LocalTaskContextSupplier()); } - public HoodieLocalEngineContext(StorageConfiguration conf, TaskContextSupplier taskContextSupplier) { + public HudiLocalEngineContext(StorageConfiguration conf, TaskContextSupplier taskContextSupplier) { super(conf, taskContextSupplier); } @@ -82,16 +79,17 @@ public HoodieData parallelize(List data, int parallelism) { @Override public List map(List data, SerializableFunction func, int parallelism) { - return data.stream().parallel().map(throwingMapWrapper(func)).collect(toList()); + return data.stream().parallel().map(FunctionWrapper.throwingMapWrapper(func)).collect(Collectors.toList()); } @Override public List mapToPairAndReduceByKey( List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { - return data.stream().parallel().map(throwingMapToPairWrapper(mapToPairFunc)) + return data.stream().parallel().map(FunctionWrapper.throwingMapToPairWrapper(mapToPairFunc)) .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() - .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).get()) + .map(list -> list.stream().map(e -> e.getValue()) + .reduce(FunctionWrapper.throwingReduceWrapper(reduceFunc)).get()) .collect(Collectors.toList()); } @@ -99,10 +97,10 @@ public List mapToPairAndReduceByKey( public Stream> mapPartitionsToPairAndReduceByKey( Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { - return throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) + return FunctionWrapper.throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream() .map(entry -> new ImmutablePair<>(entry.getKey(), entry.getValue().stream().map( - Pair::getValue).reduce(throwingReduceWrapper(reduceFunc)).orElse(null))) + Pair::getValue).reduce(FunctionWrapper.throwingReduceWrapper(reduceFunc)).orElse(null))) .filter(Objects::nonNull); } @@ -111,7 +109,8 @@ public List reduceByKey( List> data, SerializableBiFunction reduceFunc, int parallelism) { return data.stream().parallel() .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() - .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)) + .map(list -> list.stream().map(e -> e.getValue()) + .reduce(FunctionWrapper.throwingReduceWrapper(reduceFunc)) .orElse(null)) .filter(Objects::nonNull) .collect(Collectors.toList()); @@ -119,17 +118,18 @@ public List reduceByKey( @Override public List flatMap(List data, SerializableFunction> func, int parallelism) { - return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(toList()); + return data.stream().parallel().flatMap(FunctionWrapper.throwingFlatMapWrapper(func)) + .collect(Collectors.toList()); } @Override public void foreach(List data, SerializableConsumer consumer, int parallelism) { - data.stream().forEach(throwingForeachWrapper(consumer)); + data.stream().forEach(FunctionWrapper.throwingForeachWrapper(consumer)); } @Override public Map mapToPair(List data, SerializablePairFunction func, Integer parallelism) { - return 
data.stream().map(throwingMapToPairWrapper(func)).collect( + return data.stream().map(FunctionWrapper.throwingMapToPairWrapper(func)).collect( Collectors.toMap(Pair::getLeft, Pair::getRight, (oldVal, newVal) -> newVal)); } From 5dfdd0d737b2f49cefe9131ae27d31bd098fea01 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 20 Nov 2024 10:45:49 +0800 Subject: [PATCH 04/35] fix --- .../hudi/source/HudiPartitionProcessor.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiPartitionProcessor.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiPartitionProcessor.java index 738b2638588e03..0ab9fef951a378 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiPartitionProcessor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiPartitionProcessor.java @@ -21,7 +21,6 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineUtils; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.metadata.HoodieTableMetadataUtil; @@ -49,14 +48,15 @@ public List getAllPartitionNames(HoodieTableMetaClient tableMetaClient) .build(); HoodieTableMetadata newTableMetadata = HoodieTableMetadata.create( - new HudiLocalEngineContext(tableMetaClient.getHadoopConf()), metadataConfig, + new HudiLocalEngineContext(tableMetaClient.getStorageConf()), tableMetaClient.getStorage(), + metadataConfig, tableMetaClient.getBasePathV2().toString(), true); return newTableMetadata.getAllPartitionPaths(); } public List getPartitionNamesBeforeOrEquals(HoodieTimeline timeline, String timestamp) { - return new ArrayList<>(HoodieInputFormatUtils.getWritePartitionPaths( + return new ArrayList<>(HoodieTableMetadataUtil.getWritePartitionPaths( timeline.findInstantsBeforeOrEquals(timestamp).getInstants().stream().map(instant -> { try { return TimelineUtils.getCommitMetadata(instant, timeline); @@ -67,7 +67,7 @@ public List getPartitionNamesBeforeOrEquals(HoodieTimeline timeline, Str } public List getPartitionNamesInRange(HoodieTimeline timeline, String startTimestamp, String endTimestamp) { - return new ArrayList<>(HoodieInputFormatUtils.getWritePartitionPaths( + return new ArrayList<>(HoodieTableMetadataUtil.getWritePartitionPaths( timeline.findInstantsInRange(startTimestamp, endTimestamp).getInstants().stream().map(instant -> { try { return TimelineUtils.getCommitMetadata(instant, timeline); @@ -101,8 +101,10 @@ public static List parsePartitionValues(List partitionColumns, S } else { // If the partition column size is not equal to the partition fragments size // and the partition column size > 1, we do not know how to map the partition - // fragments to the partition columns and therefore return an empty tuple. We don't - // fail outright so that in some cases we can fallback to reading the table as non-partitioned + // fragments to the partition columns and therefore return an empty tuple. 
We + // don't + // fail outright so that in some cases we can fallback to reading the table as + // non-partitioned // one throw new RuntimeException("Failed to parse partition values of path: " + partitionPath); } From 6704722179694d94b4ed48aca6b2771692d875a5 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 20 Nov 2024 11:24:28 +0800 Subject: [PATCH 05/35] fix --- .../hudi/source/COWIncrementalRelation.java | 11 ++++++----- .../doris/datasource/hudi/source/HudiScanNode.java | 12 +++++++----- .../hudi/source/MORIncrementalRelation.java | 14 +++++++------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/COWIncrementalRelation.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/COWIncrementalRelation.java index 7981a0b4f261ff..843dded27969ad 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/COWIncrementalRelation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/COWIncrementalRelation.java @@ -37,6 +37,7 @@ import org.apache.hudi.common.table.timeline.TimelineUtils.HollowCommitHandling; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.storage.StoragePath; import java.io.IOException; import java.util.ArrayList; @@ -105,7 +106,7 @@ public COWIncrementalRelation(Map optParams, Configuration confi List commitsToReturn = commitsTimelineToReturn.getInstants(); // todo: support configuration hoodie.datasource.read.incr.filters - Path basePath = metaClient.getBasePathV2(); + StoragePath basePath = metaClient.getBasePathV2(); Map regularFileIdToFullPath = new HashMap<>(); Map metaBootstrapFileIdToFullPath = new HashMap<>(); HoodieTimeline replacedTimeline = commitsTimelineToReturn.getCompletedReplaceTimeline(); @@ -113,8 +114,8 @@ public COWIncrementalRelation(Map optParams, Configuration confi for (HoodieInstant instant : replacedTimeline.getInstants()) { HoodieReplaceCommitMetadata.fromBytes(metaClient.getActiveTimeline().getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class).getPartitionToReplaceFileIds().forEach( - (key, value) -> value.forEach( - e -> replacedFile.put(e, FSUtils.getPartitionPath(basePath, key).toString()))); + (key, value) -> value.forEach( + e -> replacedFile.put(e, FSUtils.constructAbsolutePath(basePath, key).toString()))); } fileToWriteStat = new HashMap<>(); @@ -123,7 +124,7 @@ public COWIncrementalRelation(Map optParams, Configuration confi commitTimeline.getInstantDetails(commit).get(), HoodieCommitMetadata.class); metadata.getPartitionToWriteStats().forEach((partition, stats) -> { for (HoodieWriteStat stat : stats) { - fileToWriteStat.put(FSUtils.getPartitionPath(basePath, stat.getPath()).toString(), stat); + fileToWriteStat.put(FSUtils.constructAbsolutePath(basePath, stat.getPath()).toString(), stat); } }); if (HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS.equals(commit.getTimestamp())) { @@ -158,7 +159,7 @@ public COWIncrementalRelation(Map optParams, Configuration confi } - fs = basePath.getFileSystem(configuration); + fs = new Path(basePath.toUri().getPath()).getFileSystem(configuration); fullTableScan = shouldFullTableScan(); includeStartTime = !fullTableScan; if (fullTableScan || commitsToReturn.isEmpty()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index a8f2a362bfde8d..1a3872419a0b4d 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -62,6 +62,8 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.Option; +import org.apache.hudi.storage.StoragePath; +import org.apache.hudi.storage.StoragePathInfo; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -336,13 +338,13 @@ private void getPartitionSplits(HivePartition partition, List splits) thr globPath = hudiClient.getBasePathV2().toString() + "/*"; } else { partitionName = FSUtils.getRelativePartitionPath(hudiClient.getBasePathV2(), - new Path(partition.getPath())); + new StoragePath(partition.getPath())); globPath = String.format("%s/%s/*", hudiClient.getBasePathV2().toString(), partitionName); } - List statuses = FSUtils.getGlobStatusExcludingMetaFolder( - hudiClient.getRawFs(), new Path(globPath)); + List statuses = FSUtils.getGlobStatusExcludingMetaFolder( + hudiClient.getRawHoodieStorage(), new StoragePath(globPath)); HoodieTableFileSystemView fileSystemView = new HoodieTableFileSystemView(hudiClient, - timeline, statuses.toArray(new FileStatus[0])); + timeline, statuses); if (isCowOrRoTable) { fileSystemView.getLatestBaseFilesBeforeOrOn(partitionName, queryInstant).forEach(baseFile -> { @@ -473,7 +475,7 @@ private HudiSplit generateHudiSplit(FileSlice fileSlice, List partitionV fileSlice.getPartitionPath(); List logs = fileSlice.getLogFiles().map(HoodieLogFile::getPath) - .map(Path::toString) + .map(StoragePath::toString) .collect(Collectors.toList()); if (logs.isEmpty()) { noLogsSplitNum.incrementAndGet(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/MORIncrementalRelation.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/MORIncrementalRelation.java index c06fcc2a578d43..7df013599229fb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/MORIncrementalRelation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/MORIncrementalRelation.java @@ -20,9 +20,7 @@ import org.apache.doris.spi.Split; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.GlobPattern; -import org.apache.hadoop.fs.Path; import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -34,6 +32,8 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; +import org.apache.hudi.storage.StoragePathInfo; import java.io.IOException; import java.util.ArrayList; @@ -54,7 +54,7 @@ public class MORIncrementalRelation implements IncrementalRelation { private final boolean endInstantArchived; private final List includedCommits; private final List commitsMetadata; - private final FileStatus[] affectedFilesInCommits; + private final List affectedFilesInCommits; private final boolean fullTableScan; private final String globPattern; private final boolean includeStartTime; @@ -96,7 +96,7 @@ public MORIncrementalRelation(Map optParams, Configuration confi includedCommits = getIncludedCommits(); commitsMetadata = getCommitsMetadata(); affectedFilesInCommits = 
HoodieInputFormatUtils.listAffectedFilesForCommits(configuration, - new Path(metaClient.getBasePath()), commitsMetadata); + metaClient.getBasePathV2(), commitsMetadata); fullTableScan = shouldFullTableScan(); if (hollowCommitHandling == HollowCommitHandling.USE_TRANSITION_TIME && fullTableScan) { throw new HoodieException("Cannot use stateTransitionTime while enables full table scan"); @@ -152,8 +152,8 @@ private boolean shouldFullTableScan() throws IOException { if (should) { return true; } - for (FileStatus fileStatus : affectedFilesInCommits) { - if (!metaClient.getFs().exists(fileStatus.getPath())) { + for (StoragePathInfo fileStatus : affectedFilesInCommits) { + if (!metaClient.getRawHoodieStorage().exists(fileStatus.getPath())) { return true; } } @@ -199,7 +199,7 @@ public List collectFileSlices() throws HoodieException { String latestCommit = includedCommits.get(includedCommits.size() - 1).getTimestamp(); HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, scanTimeline, affectedFilesInCommits); - Stream fileSlices = HoodieInputFormatUtils.getWritePartitionPaths(commitsMetadata) + Stream fileSlices = HoodieTableMetadataUtil.getWritePartitionPaths(commitsMetadata) .stream().flatMap(relativePartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relativePartitionPath, latestCommit)); if ("".equals(globPattern)) { From 9c76c4ae36b21919ad6db67be33ba2be42634ab5 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 20 Nov 2024 14:45:21 +0800 Subject: [PATCH 06/35] fix --- .../doris/datasource/hive/HiveMetaStoreClientHelper.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index 0f839d238b2b1e..f9ed2cf200fc6e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -64,6 +64,8 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.storage.StorageConfiguration; +import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -831,8 +833,10 @@ public static T ugiDoAs(Configuration conf, PrivilegedExceptionAction act public static HoodieTableMetaClient getHudiClient(HMSExternalTable table) { String hudiBasePath = table.getRemoteTable().getSd().getLocation(); Configuration conf = getConfiguration(table); + HadoopStorageConfiguration hadoopStorageConfiguration = new HadoopStorageConfiguration(conf); return HadoopUGI.ugiDoAs(AuthenticationConfig.getKerberosConfig(conf), - () -> HoodieTableMetaClient.builder().setConf(conf).setBasePath(hudiBasePath).build()); + () -> HoodieTableMetaClient.builder().setConf(hadoopStorageConfiguration).setBasePath(hudiBasePath) + .build()); } public static Configuration getConfiguration(HMSExternalTable table) { From a5574452db17eff18f70d62239c6a8086aa1a7cc Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 20 Nov 2024 15:15:25 +0800 Subject: [PATCH 07/35] build fe-core success --- .../apache/doris/datasource/hive/HiveMetaStoreClientHelper.java | 1 - .../org/apache/doris/datasource/hudi/source/HudiScanNode.java | 2 -- 2 files changed, 3 deletions(-) diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index f9ed2cf200fc6e..97f86612a495da 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -64,7 +64,6 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; -import org.apache.hudi.storage.StorageConfiguration; import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index 1a3872419a0b4d..ab32ee45993c01 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -48,8 +48,6 @@ import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.BaseFile; From 6991b8b28617bc391d0439094c386d8e0131e7a5 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 20 Nov 2024 22:34:23 +0800 Subject: [PATCH 08/35] fix hudi-scanner --- .../org/apache/doris/hudi/BaseSplitReader.scala | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala b/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala index dcc068ad7006d8..592ba66c0968d0 100644 --- a/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala +++ b/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala @@ -36,13 +36,15 @@ import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, T import org.apache.hudi.common.util.ValidationUtils.checkState import org.apache.hudi.common.util.{ConfigUtils, StringUtils} import org.apache.hudi.config.HoodieWriteConfig -import org.apache.hudi.hadoop.CachingPath +import org.apache.hudi.hadoop.fs.CachingPath import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper} import org.apache.hudi.internal.schema.{HoodieSchemaException, InternalSchema} -import org.apache.hudi.io.storage.HoodieAvroHFileReader +import org.apache.hudi.io.hadoop.HoodieHBaseAvroHFileReader import org.apache.hudi.metadata.HoodieTableMetadataUtil import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieSparkConfUtils, HoodieTableSchema, HoodieTableState} +import org.apache.hudi.storage.StoragePath; +import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration; import org.apache.log4j.Logger import org.apache.spark.sql.adapter.Spark3_4Adapter import org.apache.spark.sql.avro.{HoodieAvroSchemaConverters, HoodieSparkAvroSchemaConverters} @@ -430,7 +432,7 @@ abstract class BaseSplitReader(val split: 
HoodieSplit) { try { if (shouldExtractPartitionValuesFromPartitionPath) { val filePath = new Path(split.dataFilePath) - val tablePathWithoutScheme = CachingPath.getPathWithoutSchemeAndAuthority(tableInformation.metaClient.getBasePathV2) + val tablePathWithoutScheme = CachingPath.getPathWithoutSchemeAndAuthority(new Path(tableInformation.metaClient.getBasePathV2.toUri)) val partitionPathWithoutScheme = CachingPath.getPathWithoutSchemeAndAuthority(filePath.getParent) val relativePath = new URI(tablePathWithoutScheme.toString).relativize(new URI(partitionPathWithoutScheme.toString)).toString val hiveStylePartitioningEnabled = tableConfig.getHiveStylePartitioningEnable.toBoolean @@ -497,8 +499,10 @@ abstract class BaseSplitReader(val split: HoodieSplit) { options: Map[String, String], hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { partitionedFile => { - val reader = new HoodieAvroHFileReader( - hadoopConf, partitionedFile.filePath.toPath, new CacheConfig(hadoopConf)) + var hadoopStorageConfiguration = new HadoopStorageConfiguration(hadoopConf); + var sotragePath = new StoragePath(partitionedFile.toPath().toUri().getPath()); + val reader = new HoodieHBaseAvroHFileReader( + hadoopStorageConfiguration, sotragePath, new CacheConfig(hadoopConf)) val requiredRowSchema = requiredDataSchema.structTypeSchema // NOTE: Schema has to be parsed at this point, since Avro's [[Schema]] aren't serializable From 4558a77cef294db7ff738cd6b00e675b43b0afc6 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 20 Nov 2024 23:26:46 +0800 Subject: [PATCH 09/35] build hudi-scanner success --- .../src/main/java/org/apache/doris/hudi/Utils.java | 4 +++- .../main/scala/org/apache/doris/hudi/BaseSplitReader.scala | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fe/be-java-extensions/hudi-scanner/src/main/java/org/apache/doris/hudi/Utils.java b/fe/be-java-extensions/hudi-scanner/src/main/java/org/apache/doris/hudi/Utils.java index 5614f8bcc96eb1..3e07c8917905a3 100644 --- a/fe/be-java-extensions/hudi-scanner/src/main/java/org/apache/doris/hudi/Utils.java +++ b/fe/be-java-extensions/hudi-scanner/src/main/java/org/apache/doris/hudi/Utils.java @@ -23,6 +23,7 @@ import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration; import java.io.BufferedReader; import java.io.File; @@ -75,7 +76,8 @@ public static void killProcess(long pid) { } public static HoodieTableMetaClient getMetaClient(Configuration conf, String basePath) { + HadoopStorageConfiguration hadoopStorageConfiguration = new HadoopStorageConfiguration(conf); return HadoopUGI.ugiDoAs(AuthenticationConfig.getKerberosConfig(conf), () -> HoodieTableMetaClient.builder() - .setConf(conf).setBasePath(basePath).build()); + .setConf(hadoopStorageConfiguration).setBasePath(basePath).build()); } } diff --git a/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala b/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala index 592ba66c0968d0..5d69ab5d1b1fdf 100644 --- a/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala +++ b/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala @@ -500,9 +500,10 @@ abstract class BaseSplitReader(val split: HoodieSplit) { hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { 
partitionedFile => { var hadoopStorageConfiguration = new HadoopStorageConfiguration(hadoopConf); - var sotragePath = new StoragePath(partitionedFile.toPath().toUri().getPath()); + var sotragePath = new StoragePath(partitionedFile.toPath.toUri.getPath); + var emptySchema = org.apache.hudi.common.util.Option.empty[org.apache.avro.Schema]() val reader = new HoodieHBaseAvroHFileReader( - hadoopStorageConfiguration, sotragePath, new CacheConfig(hadoopConf)) + hadoopStorageConfiguration, sotragePath, emptySchema) val requiredRowSchema = requiredDataSchema.structTypeSchema // NOTE: Schema has to be parsed at this point, since Avro's [[Schema]] aren't serializable From 7f395c65cd92fc4c250c1eb49cb96288bc376d02 Mon Sep 17 00:00:00 2001 From: morningman Date: Sat, 9 Nov 2024 18:56:08 +0800 Subject: [PATCH 10/35] new hudi jni scanner using hudi-hadoop-mr --- .../vec/exec/format/table/hudi_jni_reader.cpp | 3 +- build.sh | 2 + conf/be.conf | 2 +- .../hadoop-hudi-scanner/pom.xml | 209 ++++++++++++++ .../doris/hudi/HadoopHudiColumnValue.java | 219 ++++++++++++++ .../doris/hudi/HadoopHudiJniScanner.java | 272 ++++++++++++++++++ .../src/main/resources/package.xml | 41 +++ fe/be-java-extensions/pom.xml | 1 + fe/pom.xml | 4 +- 9 files changed, 749 insertions(+), 4 deletions(-) create mode 100644 fe/be-java-extensions/hadoop-hudi-scanner/pom.xml create mode 100644 fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiColumnValue.java create mode 100644 fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java create mode 100644 fe/be-java-extensions/hadoop-hudi-scanner/src/main/resources/package.xml diff --git a/be/src/vec/exec/format/table/hudi_jni_reader.cpp b/be/src/vec/exec/format/table/hudi_jni_reader.cpp index 33ba92b540a497..750adecf0481d7 100644 --- a/be/src/vec/exec/format/table/hudi_jni_reader.cpp +++ b/be/src/vec/exec/format/table/hudi_jni_reader.cpp @@ -73,7 +73,8 @@ HudiJniReader::HudiJniReader(const TFileScanRangeParams& scan_params, } } - _jni_connector = std::make_unique("org/apache/doris/hudi/HudiJniScanner", params, + // _jni_connector = std::make_unique("org/apache/doris/hudi/HudiJniScanner", params, + _jni_connector = std::make_unique("org/apache/doris/hudi/HadoopHudiJniScanner", params, required_fields); } diff --git a/build.sh b/build.sh index c90f6b14144600..da4e701f42146a 100755 --- a/build.sh +++ b/build.sh @@ -538,6 +538,7 @@ fi if [[ "${BUILD_BE_JAVA_EXTENSIONS}" -eq 1 ]]; then modules+=("fe-common") modules+=("be-java-extensions/hudi-scanner") + modules+=("be-java-extensions/hadoop-hudi-scanner") modules+=("be-java-extensions/java-common") modules+=("be-java-extensions/java-udf") modules+=("be-java-extensions/jdbc-scanner") @@ -825,6 +826,7 @@ EOF extensions_modules=("java-udf") extensions_modules+=("jdbc-scanner") extensions_modules+=("hudi-scanner") + extensions_modules+=("hadoop-hudi-scanner") extensions_modules+=("paimon-scanner") extensions_modules+=("trino-connector-scanner") extensions_modules+=("max-compute-scanner") diff --git a/conf/be.conf b/conf/be.conf index 5ad5e07176d545..ae9524313d860a 100644 --- a/conf/be.conf +++ b/conf/be.conf @@ -24,7 +24,7 @@ LOG_DIR="${DORIS_HOME}/log/" JAVA_OPTS="-Dfile.encoding=UTF-8 -Xmx2048m -DlogPath=$LOG_DIR/jni.log -Xloggc:$LOG_DIR/be.gc.log.$CUR_DATE -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=50M -Djavax.security.auth.useSubjectCredsOnly=false -Dsun.security.krb5.debug=true -Dsun.java.command=DorisBE -XX:-CriticalJNINatives" # For 
jdk 17, this JAVA_OPTS will be used as default JVM options -JAVA_OPTS_FOR_JDK_17="-Dfile.encoding=UTF-8 -Xmx2048m -DlogPath=$LOG_DIR/jni.log -Xlog:gc*:$LOG_DIR/be.gc.log.$CUR_DATE:time,uptime:filecount=10,filesize=50M -Djavax.security.auth.useSubjectCredsOnly=false -Dsun.security.krb5.debug=true -Dsun.java.command=DorisBE -XX:-CriticalJNINatives -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED --add-opens=java.management/sun.management=ALL-UNNAMED" +JAVA_OPTS_FOR_JDK_17="-Dfile.encoding=UTF-8 -Djol.skipHotspotSAAttach=true -Xmx2048m -DlogPath=$LOG_DIR/jni.log -Xlog:gc*:$LOG_DIR/be.gc.log.$CUR_DATE:time,uptime:filecount=10,filesize=50M -Djavax.security.auth.useSubjectCredsOnly=false -Dsun.security.krb5.debug=true -Dsun.java.command=DorisBE -XX:-CriticalJNINatives -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED --add-opens=java.management/sun.management=ALL-UNNAMED" # Set your own JAVA_HOME # JAVA_HOME=/path/to/jdk/ diff --git a/fe/be-java-extensions/hadoop-hudi-scanner/pom.xml b/fe/be-java-extensions/hadoop-hudi-scanner/pom.xml new file mode 100644 index 00000000000000..288106e42fc863 --- /dev/null +++ b/fe/be-java-extensions/hadoop-hudi-scanner/pom.xml @@ -0,0 +1,209 @@ + + + + + be-java-extensions + org.apache.doris + ${revision} + + 4.0.0 + hadoop-hudi-scanner + + + ${basedir}/../../ + 1 + 0.15.0 + 1.11.3 + 1.5.4-2 + 3.1.2-22 + + + + + org.apache.doris + java-common + ${project.version} + + + org.apache.thrift + libthrift + + + + + + + org.apache.hadoop + hadoop-hdfs-client + ${hadoop.version} + + + + + org.apache.hadoop + hadoop-common + + + + org.apache.hadoop + hadoop-aws + + + + org.apache.hadoop + hadoop-mapreduce-client-core + + + + org.junit.jupiter + junit-jupiter + + + + + org.apache.hudi + hudi-common + ${hudi.version} + + + org.apache.hbase + hbase-client + + + org.apache.hbase + hbase-server + + + org.apache.thrift + libthrift + + + com.fasterxml.jackson.core + jackson-databind + + + + + + + org.apache.hudi + hudi-io + ${hudi.version} + + + + + org.apache.hudi + hudi-hadoop-mr + ${hudi.version} + + + + + org.apache.parquet + parquet-hadoop-bundle + ${parquet.version} + + + + + org.apache.parquet + parquet-avro + ${parquet.version} + + + + + 
org.apache.avro + avro + ${avro.version} + + + org.apache.commons + commons-compress + + + + + + io.airlift + concurrent + 202 + + + + + io.airlift + aircompressor + ${aircompressor.version} + + + + com.github.luben + zstd-jni + ${luben.zstd.jni.version} + + + + com.esotericsoftware + kryo-shaded + 4.0.2 + + + + + io.trino.hive + hive-apache + ${hive-apache.version} + + + org.apache.thrift + libthrift + + + org.apache.parquet + * + + + org.apache.avro + * + + + io.airlift + aircompressor + + + + + + + hadoop-hudi-scanner + + + org.apache.maven.plugins + maven-assembly-plugin + + + src/main/resources/package.xml + + + + + + + + + + make-assembly + package + + single + + + + + + + diff --git a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiColumnValue.java b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiColumnValue.java new file mode 100644 index 00000000000000..ae0199d07d27c5 --- /dev/null +++ b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiColumnValue.java @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.hudi; + +import org.apache.doris.common.jni.vec.ColumnType; +import org.apache.doris.common.jni.vec.ColumnValue; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; +import org.apache.hadoop.io.LongWritable; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.sql.Timestamp; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.util.List; +import java.util.Map; + +public class HadoopHudiColumnValue implements ColumnValue { + private ColumnType dorisType; + private ObjectInspector fieldInspector; + private Object fieldData; + private final ZoneId zoneId; + + public HadoopHudiColumnValue(ZoneId zoneId) { + this.zoneId = zoneId; + } + + public void setRow(Object record) { + this.fieldData = record; + } + + public void setField(ColumnType dorisType, ObjectInspector fieldInspector) { + this.dorisType = dorisType; + this.fieldInspector = fieldInspector; + } + + private Object inspectObject() { + return ((PrimitiveObjectInspector) fieldInspector).getPrimitiveJavaObject(fieldData); + } + + @Override + public boolean getBoolean() { + return (boolean) inspectObject(); + } + + @Override + public short getShort() { + return (short) inspectObject(); + } + + @Override + public int getInt() { + return (int) inspectObject(); + } + + @Override + public float getFloat() { + return (float) inspectObject(); + } + + @Override + public long getLong() { + return (long) inspectObject(); + } + + @Override + public double getDouble() { + return (double) inspectObject(); + } + + @Override + public String getString() { + return inspectObject().toString(); + } + + @Override + public byte[] getBytes() { + return (byte[]) inspectObject(); + } + + + @Override + public byte getByte() { + throw new UnsupportedOperationException("Hoodie type does not support tinyint"); + } + + @Override + public BigDecimal getDecimal() { + return ((HiveDecimal) inspectObject()).bigDecimalValue(); + } + + @Override + public LocalDate getDate() { + return LocalDate.ofEpochDay((((DateObjectInspector) fieldInspector).getPrimitiveJavaObject(fieldData)) + .toEpochDay()); + } + + @Override + public LocalDateTime getDateTime() { + if (fieldData instanceof Timestamp) { + return ((Timestamp) fieldData).toLocalDateTime(); + } else if (fieldData instanceof TimestampWritableV2) { + return LocalDateTime.ofInstant(Instant.ofEpochSecond((((TimestampObjectInspector) fieldInspector) + .getPrimitiveJavaObject(fieldData)).toEpochSecond()), zoneId); + } else { + long datetime = ((LongWritable) fieldData).get(); + long seconds; + long nanoseconds; + if (dorisType.getPrecision() == 3) { + seconds = datetime / 1000; + nanoseconds = (datetime % 1000) * 1000000; + } else if (dorisType.getPrecision() == 6) { + seconds = datetime / 1000000; + nanoseconds = (datetime % 
1000000) * 1000; + } else { + throw new RuntimeException("Hoodie timestamp only support milliseconds and microseconds, " + + "wrong precision = " + dorisType.getPrecision()); + } + return LocalDateTime.ofInstant(Instant.ofEpochSecond(seconds, nanoseconds), zoneId); + } + } + + @Override + public boolean canGetStringAsBytes() { + return false; + } + + @Override + public boolean isNull() { + return fieldData == null; + } + + @Override + public BigInteger getBigInteger() { + throw new UnsupportedOperationException("Hoodie type does not support largeint"); + } + + @Override + public byte[] getStringAsBytes() { + throw new UnsupportedOperationException("Hoodie type does not support getStringAsBytes"); + } + + @Override + public void unpackArray(List values) { + ListObjectInspector inspector = (ListObjectInspector) fieldInspector; + List items = inspector.getList(fieldData); + ObjectInspector itemInspector = inspector.getListElementObjectInspector(); + for (int i = 0; i < items.size(); i++) { + Object item = items.get(i); + HadoopHudiColumnValue childValue = new HadoopHudiColumnValue(zoneId); + childValue.setRow(item); + childValue.setField(dorisType.getChildTypes().get(0), itemInspector); + values.add(childValue); + } + } + + @Override + public void unpackMap(List keys, List values) { + MapObjectInspector inspector = (MapObjectInspector) fieldInspector; + ObjectInspector keyObjectInspector = inspector.getMapKeyObjectInspector(); + ObjectInspector valueObjectInspector = inspector.getMapValueObjectInspector(); + for (Map.Entry kv : inspector.getMap(fieldData).entrySet()) { + HadoopHudiColumnValue key = new HadoopHudiColumnValue(zoneId); + key.setRow(kv.getKey()); + key.setField(dorisType.getChildTypes().get(0), keyObjectInspector); + keys.add(key); + + HadoopHudiColumnValue value = new HadoopHudiColumnValue(zoneId); + value.setRow(kv.getValue()); + value.setField(dorisType.getChildTypes().get(1), valueObjectInspector); + values.add(value); + } + } + + @Override + public void unpackStruct(List structFieldIndex, List values) { + StructObjectInspector inspector = (StructObjectInspector) fieldInspector; + List fields = inspector.getAllStructFieldRefs(); + for (int i = 0; i < structFieldIndex.size(); i++) { + Integer idx = structFieldIndex.get(i); + HadoopHudiColumnValue value = new HadoopHudiColumnValue(zoneId); + Object obj = null; + if (idx != null) { + StructField sf = fields.get(idx); + obj = inspector.getStructFieldData(fieldData, sf); + } + value.setRow(obj); + value.setField(dorisType.getChildTypes().get(i), fields.get(i).getFieldObjectInspector()); + values.add(value); + } + } +} diff --git a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java new file mode 100644 index 00000000000000..5ab9c5ba029c7f --- /dev/null +++ b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java @@ -0,0 +1,272 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.hudi; + +import org.apache.doris.common.classloader.ThreadClassLoaderContext; +import org.apache.doris.common.jni.JniScanner; +import org.apache.doris.common.jni.vec.ColumnType; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.base.Strings; +import com.google.common.collect.Maps; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.JavaUtils; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.util.ReflectionUtils; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.ZoneId; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * HadoopHudiJniScanner is a JniScanner implementation that reads Hudi data using hudi-hadoop-mr. 
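+ * It builds a HoodieRealtimeFileSplit from the split's base file and delta (log) files,
+ * instantiates the table's InputFormat and SerDe by reflection, and then iterates the
+ * merged records through the Hive RecordReader, converting each field with
+ * HadoopHudiColumnValue.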
+ */ +public class HadoopHudiJniScanner extends JniScanner { + private static final Logger LOG = LoggerFactory.getLogger(HadoopHudiJniScanner.class); + + private static final String HADOOP_CONF_PREFIX = "hadoop_conf."; + + // Hudi data info + private final String basePath; + private final String dataFilePath; + private final long dataFileLength; + private final String[] deltaFilePaths; + private final String instantTime; + private final String serde; + private final String inputFormat; + + // schema info + private final String hudiColumnNames; + private final String[] hudiColumnTypes; + private final String[] requiredFields; + private List requiredColumnIds; + private ColumnType[] requiredTypes; + // private final String[] nestedFields; + + // Hadoop info + private RecordReader reader; + private StructObjectInspector rowInspector; + private final ObjectInspector[] fieldInspectors; + private final StructField[] structFields; + private Deserializer deserializer; + private final Map fsOptionsProps; + + // scanner info + private final HadoopHudiColumnValue columnValue; + private final int fetchSize; + private final ClassLoader classLoader; + + public HadoopHudiJniScanner(int fetchSize, Map params) { + this.basePath = params.get("base_path"); + this.dataFilePath = params.get("data_file_path"); + this.dataFileLength = Long.parseLong(params.get("data_file_length")); + if (Strings.isNullOrEmpty(params.get("delta_file_paths"))) { + this.deltaFilePaths = new String[0]; + } else { + this.deltaFilePaths = params.get("delta_file_paths").split(","); + } + this.instantTime = params.get("instant_time"); + this.serde = params.get("serde"); + this.inputFormat = params.get("input_format"); + + this.hudiColumnNames = params.get("hudi_column_names"); + this.hudiColumnTypes = params.get("hudi_column_types").split("#"); + this.requiredFields = params.get("required_fields").split(","); + + this.fieldInspectors = new ObjectInspector[requiredFields.length]; + this.structFields = new StructField[requiredFields.length]; + this.fsOptionsProps = Maps.newHashMap(); + for (Map.Entry entry : params.entrySet()) { + if (entry.getKey().startsWith(HADOOP_CONF_PREFIX)) { + fsOptionsProps.put(entry.getKey().substring(HADOOP_CONF_PREFIX.length()), entry.getValue()); + } + if (LOG.isDebugEnabled()) { + LOG.debug("get hudi params {}: {}", entry.getKey(), entry.getValue()); + } + } + + ZoneId zoneId; + if (Strings.isNullOrEmpty(params.get("time_zone"))) { + zoneId = ZoneId.systemDefault(); + } else { + zoneId = ZoneId.of(params.get("time_zone")); + } + this.columnValue = new HadoopHudiColumnValue(zoneId); + this.fetchSize = fetchSize; + this.classLoader = this.getClass().getClassLoader(); + } + + @Override + public void open() throws IOException { + try (ThreadClassLoaderContext ignored = new ThreadClassLoaderContext(classLoader)) { + initRequiredColumnsAndTypes(); + initTableInfo(requiredTypes, requiredFields, fetchSize); + Properties properties = getReaderProperties(); + initReader(properties); + } catch (Exception e) { + close(); + LOG.warn("failed to open hadoop hudi jni scanner", e); + throw new IOException("failed to open hadoop hudi jni scanner: " + e.getMessage(), e); + } + } + + @Override + public int getNext() throws IOException { + try (ThreadClassLoaderContext ignored = new ThreadClassLoaderContext(classLoader)) { + NullWritable key = reader.createKey(); + ArrayWritable value = reader.createValue(); + int numRows = 0; + for (; numRows < fetchSize; numRows++) { + if (!reader.next(key, value)) { + break; + } + Object 
rowData = deserializer.deserialize(value); + for (int i = 0; i < fields.length; i++) { + Object fieldData = rowInspector.getStructFieldData(rowData, structFields[i]); + columnValue.setRow(fieldData); + // LOG.info("rows: {}, column: {}, col name: {}, col type: {}, inspector: {}", + // numRows, i, types[i].getName(), types[i].getType().name(), + // fieldInspectors[i].getTypeName()); + columnValue.setField(types[i], fieldInspectors[i]); + appendData(i, columnValue); + } + } + return numRows; + } catch (Exception e) { + close(); + LOG.warn("failed to get next in hadoop hudi jni scanner", e); + throw new IOException("failed to get next in hadoop hudi jni scanner: " + e.getMessage(), e); + } + } + + @Override + public void close() throws IOException { + try (ThreadClassLoaderContext ignored = new ThreadClassLoaderContext(classLoader)) { + if (reader != null) { + reader.close(); + } + } catch (IOException e) { + LOG.warn("failed to close hadoop hudi jni scanner", e); + throw new IOException("failed to close hadoop hudi jni scanner: " + e.getMessage(), e); + } + } + + private void initRequiredColumnsAndTypes() { + String[] splitHudiColumnNames = hudiColumnNames.split(","); + + Map hudiColNameToIdx = + IntStream.range(0, splitHudiColumnNames.length) + .boxed() + .collect(Collectors.toMap(i -> splitHudiColumnNames[i], i -> i)); + + Map hudiColNameToType = + IntStream.range(0, splitHudiColumnNames.length) + .boxed() + .collect(Collectors.toMap(i -> splitHudiColumnNames[i], i -> hudiColumnTypes[i])); + + requiredTypes = Arrays.stream(requiredFields) + .map(field -> ColumnType.parseType(field, hudiColNameToType.get(field))) + .toArray(ColumnType[]::new); + + requiredColumnIds = Arrays.stream(requiredFields) + .mapToInt(hudiColNameToIdx::get) + .boxed().collect(Collectors.toList()); + } + + private Properties getReaderProperties() { + Properties properties = new Properties(); + properties.setProperty("hive.io.file.readcolumn.ids", Joiner.on(",").join(requiredColumnIds)); + properties.setProperty("hive.io.file.readcolumn.names", Joiner.on(",").join(this.requiredFields)); + properties.setProperty("columns", this.hudiColumnNames); + properties.setProperty("columns.types", Joiner.on(",").join(hudiColumnTypes)); + properties.setProperty("serialization.lib", this.serde); + properties.setProperty("hive.io.file.read.all.columns", "false"); + fsOptionsProps.forEach(properties::setProperty); + return properties; + } + + private void initReader(Properties properties) throws Exception { + String realtimePath = dataFileLength != -1 ? dataFilePath : deltaFilePaths[0]; + long realtimeLength = dataFileLength != -1 ? 
diff --git a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/resources/package.xml b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/resources/package.xml
new file mode 100644
index 00000000000000..4bbb2610603363
--- /dev/null
+++ b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/resources/package.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<assembly>
+    <id>jar-with-dependencies</id>
+    <formats>
+        <format>jar</format>
+    </formats>
+    <includeBaseDirectory>false</includeBaseDirectory>
+    <dependencySets>
+        <dependencySet>
+            <outputDirectory>/</outputDirectory>
+            <useProjectArtifact>true</useProjectArtifact>
+            <unpack>true</unpack>
+            <scope>runtime</scope>
+            <unpackOptions>
+                <excludes>
+                    <exclude>**/Log4j2Plugins.dat</exclude>
+                </excludes>
+            </unpackOptions>
+        </dependencySet>
+    </dependencySets>
+</assembly>
diff --git a/fe/be-java-extensions/pom.xml b/fe/be-java-extensions/pom.xml
index bbe056739d51ec..5d56ef76e7c3ef 100644
--- a/fe/be-java-extensions/pom.xml
+++ b/fe/be-java-extensions/pom.xml
@@ -22,6 +22,7 @@ under the License.
     <modelVersion>4.0.0</modelVersion>
         <module>hudi-scanner</module>
+        <module>hadoop-hudi-scanner</module>
         <module>java-common</module>
         <module>java-udf</module>
         <module>jdbc-scanner</module>
diff --git a/fe/pom.xml b/fe/pom.xml
index f1bcc0f7210bb4..0d2e3f70aa6e6a 100644
--- a/fe/pom.xml
+++ b/fe/pom.xml
@@ -371,7 +371,7 @@ under the License.
         435
         2.1.1
         9.4
-        <airlift.version>202</airlift.version>
+        <airlift.concurrent.version>202</airlift.concurrent.version>
         1.2.27
         12.22.0
         5.3.0
@@ -1649,7 +1649,7 @@ under the License. 
io.airlift concurrent - ${airlift.version} + ${airlift.concurrent.version} com.azure From 76eec2b6a1a5ada275e627c40dcc2f96b3007c82 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 21 Nov 2024 17:18:23 +0800 Subject: [PATCH 11/35] fix format --- be/src/vec/exec/format/table/hudi_jni_reader.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/vec/exec/format/table/hudi_jni_reader.cpp b/be/src/vec/exec/format/table/hudi_jni_reader.cpp index 750adecf0481d7..ea801b798f377d 100644 --- a/be/src/vec/exec/format/table/hudi_jni_reader.cpp +++ b/be/src/vec/exec/format/table/hudi_jni_reader.cpp @@ -74,8 +74,8 @@ HudiJniReader::HudiJniReader(const TFileScanRangeParams& scan_params, } // _jni_connector = std::make_unique("org/apache/doris/hudi/HudiJniScanner", params, - _jni_connector = std::make_unique("org/apache/doris/hudi/HadoopHudiJniScanner", params, - required_fields); + _jni_connector = std::make_unique("org/apache/doris/hudi/HadoopHudiJniScanner", + params, required_fields); } Status HudiJniReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { From 40f3fc704b86b34b45c26d038e6b12352afe17ca Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 21 Nov 2024 17:29:01 +0800 Subject: [PATCH 12/35] add license --- .../hadoop-hudi-scanner/pom.xml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fe/be-java-extensions/hadoop-hudi-scanner/pom.xml b/fe/be-java-extensions/hadoop-hudi-scanner/pom.xml index 288106e42fc863..4b80d49de17527 100644 --- a/fe/be-java-extensions/hadoop-hudi-scanner/pom.xml +++ b/fe/be-java-extensions/hadoop-hudi-scanner/pom.xml @@ -1,4 +1,22 @@ + From 88b131c97fb445d16238f76c077c58720bfb7089 Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 22 Nov 2024 17:48:21 +0800 Subject: [PATCH 13/35] test new hudi scanner --- .../hudi/test_hudi_incremental.out | 348 ++++++++++++++++++ .../hudi/test_hudi_schema_evolution.out | 32 ++ .../hudi/test_hudi_snapshot.out | Bin 348526 -> 697561 bytes .../hudi/test_hudi_incremental.groovy | 13 +- .../hudi/test_hudi_schema_evolution.groovy | 13 + .../hudi/test_hudi_snapshot.groovy | 11 +- .../hudi/test_hudi_timetravel.groovy | 7 +- 7 files changed, 416 insertions(+), 8 deletions(-) diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out b/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out index b1bdad85013bfc..852aebe48961b5 100644 --- a/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out +++ b/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out @@ -347,3 +347,351 @@ -- !incremental_9_10 -- 1000 +-- !incremental_1_end -- +9000 + +-- !incremental_earliest_1 -- +1000 + +-- !incremental_2_end -- +8000 + +-- !incremental_earliest_2 -- +2000 + +-- !incremental_1_2 -- +1000 + +-- !incremental_3_end -- +7000 + +-- !incremental_earliest_3 -- +3000 + +-- !incremental_2_3 -- +1000 + +-- !incremental_4_end -- +6000 + +-- !incremental_earliest_4 -- +4000 + +-- !incremental_3_4 -- +1000 + +-- !incremental_5_end -- +5000 + +-- !incremental_earliest_5 -- +5000 + +-- !incremental_4_5 -- +1000 + +-- !incremental_6_end -- +4000 + +-- !incremental_earliest_6 -- +6000 + +-- !incremental_5_6 -- +1000 + +-- !incremental_7_end -- +3000 + +-- !incremental_earliest_7 -- +7000 + +-- !incremental_6_7 -- +1000 + +-- !incremental_8_end -- +2000 + +-- !incremental_earliest_8 -- +8000 + +-- !incremental_7_8 -- +1000 + +-- !incremental_9_end -- +1000 + +-- !incremental_earliest_9 -- +9000 + +-- !incremental_8_9 -- +1000 + +-- 
!incremental_10_end -- +0 + +-- !incremental_earliest_10 -- +10000 + +-- !incremental_9_10 -- +1000 + +-- !incremental_1_end -- +9000 + +-- !incremental_earliest_1 -- +1000 + +-- !incremental_2_end -- +8000 + +-- !incremental_earliest_2 -- +2000 + +-- !incremental_1_2 -- +1000 + +-- !incremental_3_end -- +7000 + +-- !incremental_earliest_3 -- +3000 + +-- !incremental_2_3 -- +1000 + +-- !incremental_4_end -- +6000 + +-- !incremental_earliest_4 -- +4000 + +-- !incremental_3_4 -- +1000 + +-- !incremental_5_end -- +5000 + +-- !incremental_earliest_5 -- +5000 + +-- !incremental_4_5 -- +1000 + +-- !incremental_6_end -- +4000 + +-- !incremental_earliest_6 -- +6000 + +-- !incremental_5_6 -- +1000 + +-- !incremental_7_end -- +3000 + +-- !incremental_earliest_7 -- +7000 + +-- !incremental_6_7 -- +1000 + +-- !incremental_8_end -- +2000 + +-- !incremental_earliest_8 -- +8000 + +-- !incremental_7_8 -- +1000 + +-- !incremental_9_end -- +1000 + +-- !incremental_earliest_9 -- +9000 + +-- !incremental_8_9 -- +1000 + +-- !incremental_10_end -- +0 + +-- !incremental_earliest_10 -- +10000 + +-- !incremental_9_10 -- +1000 + +-- !incremental_1_end -- +9000 + +-- !incremental_earliest_1 -- +1000 + +-- !incremental_2_end -- +8000 + +-- !incremental_earliest_2 -- +2000 + +-- !incremental_1_2 -- +1000 + +-- !incremental_3_end -- +7000 + +-- !incremental_earliest_3 -- +3000 + +-- !incremental_2_3 -- +1000 + +-- !incremental_4_end -- +6000 + +-- !incremental_earliest_4 -- +4000 + +-- !incremental_3_4 -- +1000 + +-- !incremental_5_end -- +5000 + +-- !incremental_earliest_5 -- +5000 + +-- !incremental_4_5 -- +1000 + +-- !incremental_6_end -- +4000 + +-- !incremental_earliest_6 -- +6000 + +-- !incremental_5_6 -- +1000 + +-- !incremental_7_end -- +3000 + +-- !incremental_earliest_7 -- +7000 + +-- !incremental_6_7 -- +1000 + +-- !incremental_8_end -- +2000 + +-- !incremental_earliest_8 -- +8000 + +-- !incremental_7_8 -- +1000 + +-- !incremental_9_end -- +1000 + +-- !incremental_earliest_9 -- +9000 + +-- !incremental_8_9 -- +1000 + +-- !incremental_10_end -- +0 + +-- !incremental_earliest_10 -- +10000 + +-- !incremental_9_10 -- +1000 + +-- !incremental_1_end -- +9000 + +-- !incremental_earliest_1 -- +1000 + +-- !incremental_2_end -- +8000 + +-- !incremental_earliest_2 -- +2000 + +-- !incremental_1_2 -- +1000 + +-- !incremental_3_end -- +7000 + +-- !incremental_earliest_3 -- +3000 + +-- !incremental_2_3 -- +1000 + +-- !incremental_4_end -- +6000 + +-- !incremental_earliest_4 -- +4000 + +-- !incremental_3_4 -- +1000 + +-- !incremental_5_end -- +5000 + +-- !incremental_earliest_5 -- +5000 + +-- !incremental_4_5 -- +1000 + +-- !incremental_6_end -- +4000 + +-- !incremental_earliest_6 -- +6000 + +-- !incremental_5_6 -- +1000 + +-- !incremental_7_end -- +3000 + +-- !incremental_earliest_7 -- +7000 + +-- !incremental_6_7 -- +1000 + +-- !incremental_8_end -- +2000 + +-- !incremental_earliest_8 -- +8000 + +-- !incremental_7_8 -- +1000 + +-- !incremental_9_end -- +1000 + +-- !incremental_earliest_9 -- +9000 + +-- !incremental_8_9 -- +1000 + +-- !incremental_10_end -- +0 + +-- !incremental_earliest_10 -- +10000 + +-- !incremental_9_10 -- +1000 + diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_schema_evolution.out b/regression-test/data/external_table_p2/hudi/test_hudi_schema_evolution.out index 12dd0cf086d3f0..da7273d4c14ef9 100644 --- a/regression-test/data/external_table_p2/hudi/test_hudi_schema_evolution.out +++ b/regression-test/data/external_table_p2/hudi/test_hudi_schema_evolution.out @@ -31,3 +31,35 @@ 
20241118012149007 20241118012149007_0_4 5 185d101f-a484-45ce-b236-03ccd33c521b-0_0-208-622_20241118012149007.parquet 5 Eva {"age":31.5, "address":"Chengdu"} 20241118012149007 20241118012149007_0_5 6 185d101f-a484-45ce-b236-03ccd33c521b-0_0-208-622_20241118012149007.parquet 6 Frank {"age":29.2, "address":"Wuhan"} +-- !adding_simple_columns_table -- +20241118012126237 20241118012126237_0_1 1 5166112a-90d8-4ba8-8646-337fbeb2a375-0_0-35-121_20241118012132306.parquet 1 Alice \N +20241118012126237 20241118012126237_0_0 2 5166112a-90d8-4ba8-8646-337fbeb2a375-0_0-35-121_20241118012132306.parquet 2 Bob \N +20241118012126237 20241118012126237_0_2 3 5166112a-90d8-4ba8-8646-337fbeb2a375-0_0-35-121_20241118012132306.parquet 3 Cathy \N +20241118012132306 20241118012132306_0_3 4 5166112a-90d8-4ba8-8646-337fbeb2a375-0_0-35-121_20241118012132306.parquet 4 David 25 +20241118012132306 20241118012132306_0_4 5 5166112a-90d8-4ba8-8646-337fbeb2a375-0_0-35-121_20241118012132306.parquet 5 Eva 30 +20241118012132306 20241118012132306_0_5 6 5166112a-90d8-4ba8-8646-337fbeb2a375-0_0-35-121_20241118012132306.parquet 6 Frank 28 + +-- !altering_simple_columns_table -- +20241118012136512 20241118012136512_0_0 1 203f0f43-ae9d-4c17-8d5d-834f0dbc62c9-0_0-78-246_20241118012138287.parquet 1 Alice 25.0 +20241118012136512 20241118012136512_0_2 2 203f0f43-ae9d-4c17-8d5d-834f0dbc62c9-0_0-78-246_20241118012138287.parquet 2 Bob 30.0 +20241118012136512 20241118012136512_0_1 3 203f0f43-ae9d-4c17-8d5d-834f0dbc62c9-0_0-78-246_20241118012138287.parquet 3 Cathy 28.0 +20241118012138287 20241118012138287_0_3 4 203f0f43-ae9d-4c17-8d5d-834f0dbc62c9-0_0-78-246_20241118012138287.parquet 4 David 26.0 +20241118012138287 20241118012138287_0_4 5 203f0f43-ae9d-4c17-8d5d-834f0dbc62c9-0_0-78-246_20241118012138287.parquet 5 Eva 31.5 +20241118012138287 20241118012138287_0_5 6 203f0f43-ae9d-4c17-8d5d-834f0dbc62c9-0_0-78-246_20241118012138287.parquet 6 Frank 29.2 + +-- !adding_complex_columns_table -- +20241118012144831 20241118012144831_0_1 1 3c038df9-a652-4878-9b8a-221ae443448e-0_0-165-497_20241118012146150.parquet 1 Alice {"age":25, "address":"Guangzhou", "email":null} +20241118012144831 20241118012144831_0_0 2 3c038df9-a652-4878-9b8a-221ae443448e-0_0-165-497_20241118012146150.parquet 2 Bob {"age":30, "address":"Shanghai", "email":null} +20241118012144831 20241118012144831_0_2 3 3c038df9-a652-4878-9b8a-221ae443448e-0_0-165-497_20241118012146150.parquet 3 Cathy {"age":28, "address":"Beijing", "email":null} +20241118012146150 20241118012146150_0_3 4 3c038df9-a652-4878-9b8a-221ae443448e-0_0-165-497_20241118012146150.parquet 4 David {"age":25, "address":"Shenzhen", "email":"david@example.com"} +20241118012146150 20241118012146150_0_4 5 3c038df9-a652-4878-9b8a-221ae443448e-0_0-165-497_20241118012146150.parquet 5 Eva {"age":30, "address":"Chengdu", "email":"eva@example.com"} +20241118012146150 20241118012146150_0_5 6 3c038df9-a652-4878-9b8a-221ae443448e-0_0-165-497_20241118012146150.parquet 6 Frank {"age":28, "address":"Wuhan", "email":"frank@example.com"} + +-- !altering_complex_columns_table -- +20241118012147879 20241118012147879_0_0 1 185d101f-a484-45ce-b236-03ccd33c521b-0_0-208-622_20241118012149007.parquet 1 Alice {"age":25, "address":"Guangzhou"} +20241118012147879 20241118012147879_0_2 2 185d101f-a484-45ce-b236-03ccd33c521b-0_0-208-622_20241118012149007.parquet 2 Bob {"age":30, "address":"Shanghai"} +20241118012147879 20241118012147879_0_1 3 185d101f-a484-45ce-b236-03ccd33c521b-0_0-208-622_20241118012149007.parquet 3 Cathy {"age":28, 
"address":"Beijing"} +20241118012149007 20241118012149007_0_3 4 185d101f-a484-45ce-b236-03ccd33c521b-0_0-208-622_20241118012149007.parquet 4 David {"age":26, "address":"Shenzhen"} +20241118012149007 20241118012149007_0_4 5 185d101f-a484-45ce-b236-03ccd33c521b-0_0-208-622_20241118012149007.parquet 5 Eva {"age":31.5, "address":"Chengdu"} +20241118012149007 20241118012149007_0_5 6 185d101f-a484-45ce-b236-03ccd33c521b-0_0-208-622_20241118012149007.parquet 6 Frank {"age":29.2, "address":"Wuhan"} + diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_snapshot.out b/regression-test/data/external_table_p2/hudi/test_hudi_snapshot.out index efad67ffbfa8c407fa42d0dd28f569f492bd1d12..41f34255f40a5249cc790acd1c805f7917621a7b 100644 GIT binary patch delta 37933 zcmeHQdyL%1c?Xv~%WI{IrPJ$l5AE&Adf8mfki+5dEJsjfyS4m~PnKWR#g`#DyjyE; zcg5Y)6U5kkr<0VxVb_A}tcYuL42Q zCI$MDTq!Flh1WJDrSbrrA(zKn4u{|T=9}MskGKC|#rMCn`SCx~CYIc0A>?=t$y>-G z7-NE+&Y3ARMVU+T93jsR{Lpl9z)j-1p6S`PXEM&P?ReP76t%To_q2MrE>CRAJTFgB zdOs6m6O%m%joa=x@=DCM9m~o$<2jKhC5H0j+nje!9M85}?t3&lD0NOf`#age&I9Sb zm0Y*J@Aj?EKl}w8ZGf^23MxP~4rR*(RIq&#N_KZdO^fFD7R6`l6<)3uKb`;0qAzC3 z)v!2ToS!LIMDeCVvC*81A~?~Qi$(Fb;F6%+T!7EF*BVU`*K2VzyGlNA#3&ey5iEJ6 zjf|Oul!%KNVeloDT6MbIoC`&9+{Dx^Q3pX!$Z!9VG2nW{atIDg>XFEV{try%1j3B` zhy^S{#KJ7OL%Attr>cAwzM@!5U&8RV)WAa~7>I9Ry77N7Sr?gfb|?5xdH<$HJK zR^~j?-hW_Jo)G$o@Iwo^u5XIS@=f9}W^$Kf)AoFu3mBP}mA=((NtqQq6QS2-9{}e&-}y~k%zWwTjZzi(DSQy#6s*#uI+T!Jz~CaH;?7fZ)vM_;^Fr8 zm&fGeKhc-AcMJ}1pq7&lk~w-E z>!92XWio#o)X9ag?MP6gVQn7GK2F_qEgWouBA~1t_OsI?1rrV)gVJZ9>=e}ebJ<}B zXlxU@WpA-sOGgT?rXv;20_Zf)0#CCxUrkU(91zuRnE4GJ%IADt5<#YiIwo4QdM;?CNSo4VQMiKX2Kst*#CJ93fg3JOl;EwNM#v(ho zNPMG8trn+mylY6$Rm5sK zf;M0<*K#Fak}I~4{LhdOt`7nL^9Zvo6Cf}&S>#cZxj}?o581*?CA3?Zc_S0Vu4Xl7 z;@W(2!}(fV3A;tvwQvc`!#j;Nnl1O^9D3%j3}ciR?whFrdUmz(HV)(ezNVAK z`Hxl^aDO84ZF(UYcf;#^At(YIG=tE`07}d^JxVAbITt>LVd(^@jlQ;FQ!_5tlW&g2 z*?ECiW?`P_ZkI)Ii6+Rjs(t!1*T_S+=1TdUJlH=cD&?wNnm3jkzs@Tax&K{bm5yvn zUQcoxW!=1sJD2Rs`}q*5!m`yJiK#Bb_n(0J11Jlw`3nzPgZR> zS`Q^)P$goxf)c0~&(%lv!ak9`3%2bKCBpdqFFyL%JnW6^ui?3il8NW0&eXRo5%KyR z6Whi*^AI+Xo!?6VpMo05*uhMwqH<*xWGbpvDuAV3D(-`zDVF(cqgDl>T98l0xs8VF zQY`T{ZDiE8N|Yc*?F(eep(UGH)B(NgW__R)f=cOzzR7HWO+sv8GAu-*3R+mWcIfT{6(t&buQDGS0KM(DdG7Mt^(BYeO473*|Ydlp_(EG(ym1ACxW61(+of@odHe zK^RR}JUv{Ow_v)au)NAvSm}xfW{T+br&JYcj;>j80LEf3rKbm$iY&;azP2$rVZQ`MImU*HZvVd&_p>Ursu$G z4lzg~yo|uMXmXG4IW$akv!u~ove=yAO{p<`r4AE%qB9S)k9_YM`5*g1wSTUqEm{qF zeYe`2Y8m(NY9Qa7(w6J+)_;GruORDR(S{lEBM#G{fYAwBGx2C>a>sT}${g2riN_;K zE^L!1H)rIT-!}SJ*5>*7z)At`sXX(9zDD!p#H5Z#`TV|eHN(^NIkFY(eXsCxPpt-c*v0pXq*1xb4>H-h$rw&_I?EYK{+aUy(Sd&F$LQdF%a^M1JefwPCsIuz}?deyA<#djQ0^(LVE!n{~^vrS?O8?Dw{< zfl0)1e2ft^CDCkbgiX%^9oDizgu>AEUH2l*bruw;wCB4dsLtzIJDNZ_?CWj`B_&Yb z6U~*%{p&RJiC^RKL0;HVj)RJjhu8KkmtR=Z*KdGu(6dssNS=IM|CD5ha_HE~E!ydI z@~@ZVZqSIFu>01723$K~tk*1gW+0b-8ipN~Cx51|e)8lV?byjZxqn?M4?LDzbt546 z6hKlhw2wUeu+}f%KBMhv^>Xq9#XfN2b|Z~6!`lZ9>wD_+;~feu7dm@tz<0djy+$#>?4 zTklIVT}t{W(c?iix?=RvEH5egNYO`E6@9c!KJu0}a3c)yDQxvbA<3h94i-x^FMQ)eBQ~2^pwNb-8j3)ctn? ziYW%_BQQ`zzBZB@*lc4qMS3Ek~`X(Jcbv5 zYpHzrhQ5IhZU}YFK|>irDMKjb1(!J~ek@s^W%8}RF$O-QArwxi=MY>C8MrQx2!na6 z=Wvr*t{-8C`VM6sLnsE$T(-wtPcGeK47J|h;qH=JLS-flSI!b@i*lq>j&xU-Bi%At z`fuaISwb-sfOjVtK~c*CQ+WUy3hz^gm8gvwyV1Mis8lA$c2lqJ+T z$EgqJi$}8A??YHZaZEx76R?DGBAENY{K;hCTMK^Ej1W#xNQl+3gi3^#pv1#D6>vxTTO&{103W4Jj2Tvm~go3CO1k)t`gyn#*k8ei|hZZ;ib_}5yID~@( z4sqpUF9UcgL#Qk24X6yElrx}m2JE<$riM`8>xJ}6T>#sEa{us)rw&XR@~idSb>Ig* zd}wvwvDJN=msjzjRD7tSiVvmYLuKK_RD39KmQnGcRD38EA40+{#fMVyp)LpuuHr+TjSjBjL#g24o!IRkOS32yA1Vn7d|AVbtN2hVy!h1? 
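A note on the snapshot-suite hunk that follows: q10 and q15 previously ordered by rating and purchase_count, both of which carry duplicates, and SQL leaves the relative order of tied rows unspecified, so a LIMIT 5 could legally return different rows from the native and JNI scanners. Ordering by a per-row-unique key makes the expected output deterministic, for example:

    -- stable: user_id is unique per row, so the first five rows are well defined
    SELECT user_id, array_size(purchases) AS purchase_count FROM user_activity_log_cow_partition ORDER BY user_id LIMIT 5;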
[the remainder of the binary payload and the PATCH 13 hunks for test_hudi_incremental.groovy and test_hudi_schema_evolution.groovy were lost in extraction; the visible portion of the test_hudi_snapshot.groovy diff resumes below]

diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_snapshot.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_snapshot.groovy
-    qt_q10 """SELECT * FROM ${table_name} WHERE rating > 4.5 ORDER BY rating DESC LIMIT 5;"""
+    qt_q10 """SELECT * FROM ${table_name} WHERE rating > 4.5 ORDER BY event_time DESC LIMIT 5;"""
 
     // Query all users' signup dates and limit output
     qt_q11 """SELECT user_id, signup_date FROM ${table_name} ORDER BY signup_date DESC LIMIT 10;"""
@@ -79,9 +79,16 @@ suite("test_hudi_snapshot", "p2,external,hudi,external_remote,external_remote_hu
     qt_q14 """SELECT * FROM ${table_name} WHERE signup_date = '2024-01-15' ORDER BY user_id LIMIT 5;"""
 
     // Query the total count of purchases for each user and limit output
-    qt_q15 """SELECT user_id, array_size(purchases) AS purchase_count FROM ${table_name} ORDER BY purchase_count DESC LIMIT 5;"""
+    qt_q15 """SELECT user_id, array_size(purchases) AS purchase_count FROM ${table_name} ORDER BY user_id LIMIT 5;"""
     }
 
+    sql """set force_jni_scanner=true;"""
+    test_hudi_snapshot_querys("user_activity_log_cow_non_partition")
+    test_hudi_snapshot_querys("user_activity_log_cow_partition")
+    test_hudi_snapshot_querys("user_activity_log_mor_non_partition")
+    test_hudi_snapshot_querys("user_activity_log_mor_partition")
+
+    sql """set force_jni_scanner=false;"""
     test_hudi_snapshot_querys("user_activity_log_cow_non_partition")
     test_hudi_snapshot_querys("user_activity_log_cow_partition")
     test_hudi_snapshot_querys("user_activity_log_mor_non_partition")
diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_timetravel.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_timetravel.groovy
index 4d458dc4381dcf..111b3d528a42e1 100644
--- a/regression-test/suites/external_table_p2/hudi/test_hudi_timetravel.groovy
+++ b/regression-test/suites/external_table_p2/hudi/test_hudi_timetravel.groovy
@@ -54,7 +54,6 @@ suite("test_hudi_timetravel", "p2,external,hudi,external_remote,external_remote_
         "20241114152009764",
         "20241114152011901",
     ]
-    test_hudi_timetravel_querys("user_activity_log_cow_non_partition", timestamps_cow_non_partition)
 
     // spark-sql "select distinct _hoodie_commit_time from user_activity_log_cow_partition order by _hoodie_commit_time;"
     def timestamps_cow_partition = [
@@ -69,7 +68,6 @@ suite("test_hudi_timetravel", "p2,external,hudi,external_remote,external_remote_
         "20241114152147114",
         "20241114152156417",
     ]
-    test_hudi_timetravel_querys("user_activity_log_cow_partition", timestamps_cow_partition)
 
     // spark-sql "select distinct _hoodie_commit_time from user_activity_log_mor_non_partition order by _hoodie_commit_time;"
     def timestamps_mor_non_partition = [
@@ -84,7 +82,6 @@ suite("test_hudi_timetravel", "p2,external,hudi,external_remote,external_remote_
         "20241114152028770",
         "20241114152030746",
     ]
-    test_hudi_timetravel_querys("user_activity_log_mor_non_partition", timestamps_mor_non_partition)
 
     // spark-sql "select distinct _hoodie_commit_time from 
user_activity_log_mor_partition order by _hoodie_commit_time;" def timestamps_mor_partition = [ @@ -99,6 +96,10 @@ suite("test_hudi_timetravel", "p2,external,hudi,external_remote,external_remote_ "20241114152323587", "20241114152334111", ] + + test_hudi_timetravel_querys("user_activity_log_cow_non_partition", timestamps_cow_non_partition) + test_hudi_timetravel_querys("user_activity_log_cow_partition", timestamps_cow_partition) + test_hudi_timetravel_querys("user_activity_log_mor_non_partition", timestamps_mor_non_partition) test_hudi_timetravel_querys("user_activity_log_mor_partition", timestamps_mor_partition) sql """drop catalog if exists ${catalog_name};""" From 667191a0739de1556f6830e7d204fa12a7c81708 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 25 Nov 2024 15:45:02 +0800 Subject: [PATCH 14/35] support force_jni_reader and use_old_hudi_jni_reader for hudi --- be/src/common/config.cpp | 2 ++ be/src/common/config.h | 3 +++ be/src/vec/exec/format/table/hudi_jni_reader.cpp | 14 +++++++++----- be/src/vec/exec/format/table/hudi_jni_reader.h | 4 +--- be/src/vec/exec/scan/vfile_scanner.cpp | 3 ++- .../apache/doris/datasource/FileQueryScanNode.java | 1 + .../doris/datasource/hudi/source/HudiScanNode.java | 9 ++++++--- gensrc/thrift/PlanNodes.thrift | 3 ++- 8 files changed, 26 insertions(+), 13 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 63989a76261bb6..f4877dbe0e0ac6 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1396,6 +1396,8 @@ DEFINE_mBool(enable_delete_bitmap_merge_on_compaction, "false"); DEFINE_Bool(enable_table_size_correctness_check, "false"); DEFINE_Bool(force_regenerate_rowsetid_on_start_error, "false"); +DEFINE_Bool(use_old_hudi_jni_reader, "false"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 29e55e6406390e..03cb3e44d202d0 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1482,6 +1482,9 @@ DECLARE_mBool(enable_delete_bitmap_merge_on_compaction); // Enable validation to check the correctness of table size. 
DECLARE_Bool(enable_table_size_correctness_check); +// Use old hudi jni reader +DECLARE_mBool(use_old_hudi_jni_reader); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/vec/exec/format/table/hudi_jni_reader.cpp b/be/src/vec/exec/format/table/hudi_jni_reader.cpp index ea801b798f377d..f6cda63b6b5b4a 100644 --- a/be/src/vec/exec/format/table/hudi_jni_reader.cpp +++ b/be/src/vec/exec/format/table/hudi_jni_reader.cpp @@ -18,8 +18,8 @@ #include "hudi_jni_reader.h" #include -#include +#include "common/config.h" #include "runtime/descriptors.h" #include "runtime/runtime_state.h" #include "runtime/types.h" @@ -65,7 +65,7 @@ HudiJniReader::HudiJniReader(const TFileScanRangeParams& scan_params, {"input_format", _hudi_params.input_format}}; // Use compatible hadoop client to read data - for (auto& kv : _scan_params.properties) { + for (const auto& kv : _scan_params.properties) { if (kv.first.starts_with(HOODIE_CONF_PREFIX)) { params[kv.first] = kv.second; } else { @@ -73,9 +73,13 @@ HudiJniReader::HudiJniReader(const TFileScanRangeParams& scan_params, } } - // _jni_connector = std::make_unique("org/apache/doris/hudi/HudiJniScanner", params, - _jni_connector = std::make_unique("org/apache/doris/hudi/HadoopHudiJniScanner", - params, required_fields); + if (config::use_old_hudi_jni_reader) [[unlikely]] { + _jni_connector = std::make_unique("org/apache/doris/hudi/HudiJniScanner", + params, required_fields); + } else { + _jni_connector = std::make_unique( + "org/apache/doris/hudi/HadoopHudiJniScanner", params, required_fields); + } } Status HudiJniReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { diff --git a/be/src/vec/exec/format/table/hudi_jni_reader.h b/be/src/vec/exec/format/table/hudi_jni_reader.h index e9bb55a69a77e7..bfa0291a61035c 100644 --- a/be/src/vec/exec/format/table/hudi_jni_reader.h +++ b/be/src/vec/exec/format/table/hudi_jni_reader.h @@ -17,9 +17,7 @@ #pragma once -#include - -#include +#include #include #include #include diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 3053adebbb521e..e5f00fb8525d2c 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -758,7 +758,8 @@ Status VFileScanner::_get_next_reader() { // JNI reader can only push down column value range bool push_down_predicates = !_is_load && _params->format_type != TFileFormatType::FORMAT_JNI; - if (format_type == TFileFormatType::FORMAT_JNI && range.__isset.table_format_params) { + if (!_params->force_jni_reader && format_type == TFileFormatType::FORMAT_JNI && + range.__isset.table_format_params) { if (range.table_format_params.table_format_type == "hudi" && range.table_format_params.hudi_params.delta_logs.empty()) { // fall back to native reader if there is no log file diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java index 3107b4ed5d7b85..f60c6016264f47 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java @@ -374,6 +374,7 @@ public void createScanRangeLocations() throws UserException { scanBackendIds.add(backend.getId()); } } + params.setForceJniReader(ConnectContext.get().getSessionVariable().isForceJniScanner()); getSerializedTable().ifPresent(params::setSerializedTable); diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index ab32ee45993c01..3115a964066cfb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -116,9 +116,11 @@ public class HudiScanNode extends HiveScanNode { /** * External file scan node for Query Hudi table - * needCheckColumnPriv: Some of ExternalFileScanNode do not need to check column priv + * needCheckColumnPriv: Some of ExternalFileScanNode do not need to check column + * priv * eg: s3 tvf - * These scan nodes do not have corresponding catalog/database/table info, so no need to do priv check + * These scan nodes do not have corresponding catalog/database/table info, so no + * need to do priv check */ public HudiScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumnPriv, Optional scanParams, Optional incrementalRelation) { @@ -304,7 +306,8 @@ private List getPrunedPartitions( } } } - // unpartitioned table, create a dummy partition to save location and inputformat, + // unpartitioned table, create a dummy partition to save location and + // inputformat, // so that we can unify the interface. HivePartition dummyPartition = new HivePartition(hmsTable.getDbName(), hmsTable.getName(), true, hmsTable.getRemoteTable().getSd().getInputFormat(), diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 0bbd364fda1c2a..165fdc598eca60 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -353,7 +353,6 @@ struct TMaxComputeFileDesc { 1: optional string partition_spec // deprecated 2: optional string session_id 3: optional string table_batch_read_session - } struct THudiFileDesc { @@ -453,6 +452,8 @@ struct TFileScanRangeParams { // 1. Reduce the access to HMS and HDFS on the JNI side. // 2. There will be no inconsistency between the fe and be tables. 
24: optional string serialized_table + // if set true, be will be forced to use jni reader + 25: bool force_jni_reader; } struct TFileRangeDesc { From 61643f3ffb3a37b02a9247bc07ff00a263205a3c Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 25 Nov 2024 19:13:58 +0800 Subject: [PATCH 15/35] support force_jni_scanner for hudi in fe --- .../apache/doris/datasource/hudi/source/HudiScanNode.java | 8 ++++++-- .../nereids/glue/translator/PhysicalPlanTranslator.java | 2 +- .../java/org/apache/doris/planner/SingleNodePlanner.java | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index 3115a964066cfb..69316a31c9356b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -37,6 +37,7 @@ import org.apache.doris.planner.ListPartitionPrunerV2; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.SessionVariable; import org.apache.doris.spi.Split; import org.apache.doris.statistics.StatisticalType; import org.apache.doris.thrift.TExplainLevel; @@ -113,6 +114,7 @@ public class HudiScanNode extends HiveScanNode { private boolean incrementalRead = false; private TableScanParams scanParams; private IncrementalRelation incrementalRelation; + private SessionVariable sessionVariable; /** * External file scan node for Query Hudi table @@ -123,7 +125,8 @@ public class HudiScanNode extends HiveScanNode { * need to do priv check */ public HudiScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumnPriv, - Optional scanParams, Optional incrementalRelation) { + Optional scanParams, Optional incrementalRelation, + SessionVariable sessionVariable) { super(id, desc, "HUDI_SCAN_NODE", StatisticalType.HUDI_SCAN_NODE, needCheckColumnPriv); isCowOrRoTable = hmsTable.isHoodieCowTable(); if (LOG.isDebugEnabled()) { @@ -138,11 +141,12 @@ public HudiScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumn this.scanParams = scanParams.orElse(null); this.incrementalRelation = incrementalRelation.orElse(null); this.incrementalRead = (this.scanParams != null && this.scanParams.incrementalRead()); + this.sessionVariable = sessionVariable; } @Override public TFileFormatType getFileFormatType() throws UserException { - if (isCowOrRoTable) { + if (!sessionVariable.isForceJniScanner() && isCowOrRoTable) { return super.getFileFormatType(); } else { // Use jni to read hudi table in BE diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 56ae65ec722941..6e6f775592729a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -648,7 +648,7 @@ public PlanFragment visitPhysicalHudiScan(PhysicalHudiScan fileScan, PlanTransla + " for Hudi table"); PhysicalHudiScan hudiScan = (PhysicalHudiScan) fileScan; ScanNode scanNode = new HudiScanNode(context.nextPlanNodeId(), tupleDescriptor, false, - hudiScan.getScanParams(), hudiScan.getIncrementalRelation()); + hudiScan.getScanParams(), hudiScan.getIncrementalRelation(), 
ConnectContext.get().getSessionVariable()); if (fileScan.getTableSnapshot().isPresent()) { ((FileQueryScanNode) scanNode).setQueryTableSnapshot(fileScan.getTableSnapshot().get()); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java b/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java index d94ad0a2552240..df898c69ebcbfc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java @@ -1969,7 +1969,7 @@ private PlanNode createScanNode(Analyzer analyzer, TableRef tblRef, SelectStmt s + "please set enable_nereids_planner = true to enable new optimizer"); } scanNode = new HudiScanNode(ctx.getNextNodeId(), tblRef.getDesc(), true, - Optional.empty(), Optional.empty()); + Optional.empty(), Optional.empty(), ConnectContext.get().getSessionVariable()); break; case ICEBERG: scanNode = new IcebergScanNode(ctx.getNextNodeId(), tblRef.getDesc(), true); From bb4012375b713b9bb32655c81573eba61272c241 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 25 Nov 2024 23:45:27 +0800 Subject: [PATCH 16/35] fix force_jni_reader for HudiScanNode --- .../doris/datasource/hudi/source/HudiScanNode.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index 69316a31c9356b..07649fa7b41f54 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -146,7 +146,7 @@ public HudiScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumn @Override public TFileFormatType getFileFormatType() throws UserException { - if (!sessionVariable.isForceJniScanner() && isCowOrRoTable) { + if (canUseNativeReader()) { return super.getFileFormatType(); } else { // Use jni to read hudi table in BE @@ -265,6 +265,10 @@ private void setHudiParams(TFileRangeDesc rangeDesc, HudiSplit hudiSplit) { rangeDesc.setTableFormatParams(tableFormatFileDesc); } + private boolean canUseNativeReader() { + return !sessionVariable.isForceJniScanner() && isCowOrRoTable; + } + private List getPrunedPartitions( HoodieTableMetaClient metaClient, Option snapshotTimestamp) throws AnalysisException { List partitionColumnTypes = hmsTable.getPartitionColumnTypes(); @@ -322,7 +326,7 @@ private List getPrunedPartitions( } private List getIncrementalSplits() { - if (isCowOrRoTable) { + if (canUseNativeReader()) { List splits = incrementalRelation.collectSplits(); noLogsSplitNum.addAndGet(splits.size()); return splits; @@ -351,7 +355,7 @@ private void getPartitionSplits(HivePartition partition, List splits) thr HoodieTableFileSystemView fileSystemView = new HoodieTableFileSystemView(hudiClient, timeline, statuses); - if (isCowOrRoTable) { + if (canUseNativeReader()) { fileSystemView.getLatestBaseFilesBeforeOrOn(partitionName, queryInstant).forEach(baseFile -> { noLogsSplitNum.incrementAndGet(); String filePath = baseFile.getPath(); From b5343f1dea8471e559bd545dbc68bedd0d05096c Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 26 Nov 2024 01:17:52 +0800 Subject: [PATCH 17/35] fix hudi jni scanner --- .../main/java/org/apache/doris/datasource/hudi/HudiUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/HudiUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/HudiUtils.java index d7803b1a516f9e..c98d994a28a08f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/HudiUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/HudiUtils.java @@ -86,7 +86,7 @@ public static String convertAvroToHiveType(Schema schema) { case LONG: if (logicalType instanceof LogicalTypes.TimestampMillis || logicalType instanceof LogicalTypes.TimestampMicros) { - return logicalType.getName(); + return "timestamp"; } if (logicalType instanceof LogicalTypes.TimeMicros) { return handleUnsupportedType(schema); From dd70da2c3fdf027c87e0c99dc166c011c8f70c58 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 Nov 2024 15:48:13 +0800 Subject: [PATCH 18/35] disable fall back to native reader for hudi in be --- be/src/vec/exec/scan/vfile_scanner.cpp | 11 +++-------- .../org/apache/doris/hudi/HadoopHudiJniScanner.java | 1 - .../apache/doris/datasource/FileQueryScanNode.java | 1 - 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index e5f00fb8525d2c..91da916b55d349 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -758,14 +758,9 @@ Status VFileScanner::_get_next_reader() { // JNI reader can only push down column value range bool push_down_predicates = !_is_load && _params->format_type != TFileFormatType::FORMAT_JNI; - if (!_params->force_jni_reader && format_type == TFileFormatType::FORMAT_JNI && - range.__isset.table_format_params) { - if (range.table_format_params.table_format_type == "hudi" && - range.table_format_params.hudi_params.delta_logs.empty()) { - // fall back to native reader if there is no log file - format_type = TFileFormatType::FORMAT_PARQUET; - } else if (range.table_format_params.table_format_type == "paimon" && - !range.table_format_params.paimon_params.__isset.paimon_split) { + if (format_type == TFileFormatType::FORMAT_JNI && range.__isset.table_format_params) { + if (range.table_format_params.table_format_type == "paimon" && + !range.table_format_params.paimon_params.__isset.paimon_split) { // use native reader auto format = range.table_format_params.paimon_params.file_format; if (format == "orc") { diff --git a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java index 5ab9c5ba029c7f..f2b38815a366fe 100644 --- a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java +++ b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java @@ -78,7 +78,6 @@ public class HadoopHudiJniScanner extends JniScanner { private final String[] requiredFields; private List requiredColumnIds; private ColumnType[] requiredTypes; - // private final String[] nestedFields; // Hadoop info private RecordReader reader; diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java index f60c6016264f47..3107b4ed5d7b85 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java @@ -374,7 +374,6 @@ public void 
createScanRangeLocations() throws UserException { scanBackendIds.add(backend.getId()); } } - params.setForceJniReader(ConnectContext.get().getSessionVariable().isForceJniScanner()); getSerializedTable().ifPresent(params::setSerializedTable); From 1e0e700b57ebc1d35874718b7f78f42e8ae8ac7d Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 Nov 2024 17:44:33 +0800 Subject: [PATCH 19/35] fix --- be/src/vec/exec/scan/vfile_scanner.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 91da916b55d349..f52c6d172bf40b 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -23,18 +23,15 @@ #include #include -#include #include #include #include -#include #include #include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" -#include "common/object_pool.h" #include "io/cache/block_file_cache_profile.h" #include "runtime/descriptors.h" #include "runtime/runtime_state.h" @@ -47,7 +44,6 @@ #include "vec/common/string_ref.h" #include "vec/core/column_with_type_and_name.h" #include "vec/core/columns_with_type_and_name.h" -#include "vec/core/field.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_factory.hpp" #include "vec/data_types/data_type_nullable.h" From 572e9a677994f51e4fc140b73b334b965f7176bd Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 Nov 2024 16:40:14 +0800 Subject: [PATCH 20/35] Revert "disable fall back to native reader for hudi in be" This reverts commit 101ab44c3d718e0b09d90398417a1ef4e34f189c. --- be/src/vec/exec/scan/vfile_scanner.cpp | 11 ++++++++--- .../org/apache/doris/hudi/HadoopHudiJniScanner.java | 1 + .../apache/doris/datasource/FileQueryScanNode.java | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index f52c6d172bf40b..9df752156ef8fe 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -754,9 +754,14 @@ Status VFileScanner::_get_next_reader() { // JNI reader can only push down column value range bool push_down_predicates = !_is_load && _params->format_type != TFileFormatType::FORMAT_JNI; - if (format_type == TFileFormatType::FORMAT_JNI && range.__isset.table_format_params) { - if (range.table_format_params.table_format_type == "paimon" && - !range.table_format_params.paimon_params.__isset.paimon_split) { + if (!_params->force_jni_reader && format_type == TFileFormatType::FORMAT_JNI && + range.__isset.table_format_params) { + if (range.table_format_params.table_format_type == "hudi" && + range.table_format_params.hudi_params.delta_logs.empty()) { + // fall back to native reader if there is no log file + format_type = TFileFormatType::FORMAT_PARQUET; + } else if (range.table_format_params.table_format_type == "paimon" && + !range.table_format_params.paimon_params.__isset.paimon_split) { // use native reader auto format = range.table_format_params.paimon_params.file_format; if (format == "orc") { diff --git a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java index f2b38815a366fe..5ab9c5ba029c7f 100644 --- a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java +++ 
b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java @@ -78,6 +78,7 @@ public class HadoopHudiJniScanner extends JniScanner { private final String[] requiredFields; private List requiredColumnIds; private ColumnType[] requiredTypes; + // private final String[] nestedFields; // Hadoop info private RecordReader reader; diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java index 3107b4ed5d7b85..f60c6016264f47 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java @@ -374,6 +374,7 @@ public void createScanRangeLocations() throws UserException { scanBackendIds.add(backend.getId()); } } + params.setForceJniReader(ConnectContext.get().getSessionVariable().isForceJniScanner()); getSerializedTable().ifPresent(params::setSerializedTable); From 952d2c65396cdb93da08dbb5c012cdab92dbd558 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 Nov 2024 16:40:32 +0800 Subject: [PATCH 21/35] fix --- .../main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java | 1 - 1 file changed, 1 deletion(-) diff --git a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java index 5ab9c5ba029c7f..f2b38815a366fe 100644 --- a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java +++ b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java @@ -78,7 +78,6 @@ public class HadoopHudiJniScanner extends JniScanner { private final String[] requiredFields; private List requiredColumnIds; private ColumnType[] requiredTypes; - // private final String[] nestedFields; // Hadoop info private RecordReader reader; From fae66fcf4808733e0feee1cd6a5c6f1541066429 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 Nov 2024 20:35:29 +0800 Subject: [PATCH 22/35] fix p2 case --- .../hudi/test_hudi_incremental.out | 348 ------------------ .../hudi/test_hudi_snapshot.out | Bin 697561 -> 696105 bytes .../hudi/test_hudi_timetravel.out | 120 ++++++ .../hudi/test_hudi_incremental.groovy | 12 +- .../hudi/test_hudi_schema_evolution.groovy | 9 +- .../hudi/test_hudi_snapshot.groovy | 10 +- .../hudi/test_hudi_timetravel.groovy | 6 + 7 files changed, 141 insertions(+), 364 deletions(-) diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out b/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out index 852aebe48961b5..b1bdad85013bfc 100644 --- a/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out +++ b/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out @@ -347,351 +347,3 @@ -- !incremental_9_10 -- 1000 --- !incremental_1_end -- -9000 - --- !incremental_earliest_1 -- -1000 - --- !incremental_2_end -- -8000 - --- !incremental_earliest_2 -- -2000 - --- !incremental_1_2 -- -1000 - --- !incremental_3_end -- -7000 - --- !incremental_earliest_3 -- -3000 - --- !incremental_2_3 -- -1000 - --- !incremental_4_end -- -6000 - --- !incremental_earliest_4 -- -4000 - --- !incremental_3_4 -- -1000 - --- !incremental_5_end -- -5000 - --- !incremental_earliest_5 -- -5000 - --- !incremental_4_5 -- -1000 - --- !incremental_6_end -- -4000 - --- !incremental_earliest_6 -- 
-6000 - --- !incremental_5_6 -- -1000 - --- !incremental_7_end -- -3000 - --- !incremental_earliest_7 -- -7000 - --- !incremental_6_7 -- -1000 - --- !incremental_8_end -- -2000 - --- !incremental_earliest_8 -- -8000 - --- !incremental_7_8 -- -1000 - --- !incremental_9_end -- -1000 - --- !incremental_earliest_9 -- -9000 - --- !incremental_8_9 -- -1000 - --- !incremental_10_end -- -0 - --- !incremental_earliest_10 -- -10000 - --- !incremental_9_10 -- -1000 - --- !incremental_1_end -- -9000 - --- !incremental_earliest_1 -- -1000 - --- !incremental_2_end -- -8000 - --- !incremental_earliest_2 -- -2000 - --- !incremental_1_2 -- -1000 - --- !incremental_3_end -- -7000 - --- !incremental_earliest_3 -- -3000 - --- !incremental_2_3 -- -1000 - --- !incremental_4_end -- -6000 - --- !incremental_earliest_4 -- -4000 - --- !incremental_3_4 -- -1000 - --- !incremental_5_end -- -5000 - --- !incremental_earliest_5 -- -5000 - --- !incremental_4_5 -- -1000 - --- !incremental_6_end -- -4000 - --- !incremental_earliest_6 -- -6000 - --- !incremental_5_6 -- -1000 - --- !incremental_7_end -- -3000 - --- !incremental_earliest_7 -- -7000 - --- !incremental_6_7 -- -1000 - --- !incremental_8_end -- -2000 - --- !incremental_earliest_8 -- -8000 - --- !incremental_7_8 -- -1000 - --- !incremental_9_end -- -1000 - --- !incremental_earliest_9 -- -9000 - --- !incremental_8_9 -- -1000 - --- !incremental_10_end -- -0 - --- !incremental_earliest_10 -- -10000 - --- !incremental_9_10 -- -1000 - --- !incremental_1_end -- -9000 - --- !incremental_earliest_1 -- -1000 - --- !incremental_2_end -- -8000 - --- !incremental_earliest_2 -- -2000 - --- !incremental_1_2 -- -1000 - --- !incremental_3_end -- -7000 - --- !incremental_earliest_3 -- -3000 - --- !incremental_2_3 -- -1000 - --- !incremental_4_end -- -6000 - --- !incremental_earliest_4 -- -4000 - --- !incremental_3_4 -- -1000 - --- !incremental_5_end -- -5000 - --- !incremental_earliest_5 -- -5000 - --- !incremental_4_5 -- -1000 - --- !incremental_6_end -- -4000 - --- !incremental_earliest_6 -- -6000 - --- !incremental_5_6 -- -1000 - --- !incremental_7_end -- -3000 - --- !incremental_earliest_7 -- -7000 - --- !incremental_6_7 -- -1000 - --- !incremental_8_end -- -2000 - --- !incremental_earliest_8 -- -8000 - --- !incremental_7_8 -- -1000 - --- !incremental_9_end -- -1000 - --- !incremental_earliest_9 -- -9000 - --- !incremental_8_9 -- -1000 - --- !incremental_10_end -- -0 - --- !incremental_earliest_10 -- -10000 - --- !incremental_9_10 -- -1000 - --- !incremental_1_end -- -9000 - --- !incremental_earliest_1 -- -1000 - --- !incremental_2_end -- -8000 - --- !incremental_earliest_2 -- -2000 - --- !incremental_1_2 -- -1000 - --- !incremental_3_end -- -7000 - --- !incremental_earliest_3 -- -3000 - --- !incremental_2_3 -- -1000 - --- !incremental_4_end -- -6000 - --- !incremental_earliest_4 -- -4000 - --- !incremental_3_4 -- -1000 - --- !incremental_5_end -- -5000 - --- !incremental_earliest_5 -- -5000 - --- !incremental_4_5 -- -1000 - --- !incremental_6_end -- -4000 - --- !incremental_earliest_6 -- -6000 - --- !incremental_5_6 -- -1000 - --- !incremental_7_end -- -3000 - --- !incremental_earliest_7 -- -7000 - --- !incremental_6_7 -- -1000 - --- !incremental_8_end -- -2000 - --- !incremental_earliest_8 -- -8000 - --- !incremental_7_8 -- -1000 - --- !incremental_9_end -- -1000 - --- !incremental_earliest_9 -- -9000 - --- !incremental_8_9 -- -1000 - --- !incremental_10_end -- -0 - --- !incremental_earliest_10 -- -10000 - --- !incremental_9_10 -- -1000 - diff --git 
a/regression-test/data/external_table_p2/hudi/test_hudi_snapshot.out b/regression-test/data/external_table_p2/hudi/test_hudi_snapshot.out
index 41f34255f40a5249cc790acd1c805f7917621a7b..1e151c2a86fa200668caaaf943a01f9e4c11f5a3 100644
GIT binary patch
[base85-encoded binary delta (delta 2549) omitted; the remaining PATCH 22 hunks, covering test_hudi_timetravel.out and the four .groovy suites listed in its diffstat, were lost at the same point]

From: Socrates
Date: Thu, 28 Nov 2024 23:29:04 +0800
Subject: [PATCH 23/35] fix case

---
 .../hudi/test_hudi_incremental.out    | 174 ++++++++++++++++++
 .../hudi/test_hudi_incremental.groovy |   7 +-
 2 files changed, 178 insertions(+), 3 deletions(-)

diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out b/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out
index b1bdad85013bfc..50644f34961942 100644
--- a/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out
+++ b/regression-test/data/external_table_p2/hudi/test_hudi_incremental.out
@@ -347,3 +347,177 @@
 -- !incremental_9_10 --
 1000
 
+-- !incremental_1_end --
+9000
+
+-- !incremental_earliest_1 --
+1000
+
+-- !incremental_2_end --
+8000
+
+-- !incremental_earliest_2 --
+2000
+
+-- !incremental_1_2 --
+1000
+
+-- !incremental_3_end --
+7000
+
+-- !incremental_earliest_3 --
+3000
+
+-- !incremental_2_3 --
+1000
+
+-- !incremental_4_end --
+6000
+
+-- !incremental_earliest_4 --
+4000
+
+-- !incremental_3_4 --
+1000
+
+-- !incremental_5_end --
+5000
+
+-- !incremental_earliest_5 --
+5000
+
+-- !incremental_4_5 --
+1000
+
+-- !incremental_6_end --
+4000
+
+-- !incremental_earliest_6 --
+6000
+
+-- !incremental_5_6 --
+1000
+
+-- 
!incremental_7_end -- +3000 + +-- !incremental_earliest_7 -- +7000 + +-- !incremental_6_7 -- +1000 + +-- !incremental_8_end -- +2000 + +-- !incremental_earliest_8 -- +8000 + +-- !incremental_7_8 -- +1000 + +-- !incremental_9_end -- +1000 + +-- !incremental_earliest_9 -- +9000 + +-- !incremental_8_9 -- +1000 + +-- !incremental_10_end -- +0 + +-- !incremental_earliest_10 -- +10000 + +-- !incremental_9_10 -- +1000 + diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_incremental.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_incremental.groovy index f12bfe8b2a1af6..61fdd08c4960a7 100644 --- a/regression-test/suites/external_table_p2/hudi/test_hudi_incremental.groovy +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_incremental.groovy @@ -107,11 +107,12 @@ suite("test_hudi_incremental", "p2,external,hudi,external_remote,external_remote test_hudi_incremental_querys("user_activity_log_cow_partition", timestamps_cow_partition) test_hudi_incremental_querys("user_activity_log_mor_non_partition", timestamps_mor_non_partition) test_hudi_incremental_querys("user_activity_log_mor_partition", timestamps_mor_partition) - // sql """set force_jni_scanner=true;""" + sql """set force_jni_scanner=true;""" + // don't support incremental query for cow table by jni reader // test_hudi_incremental_querys("user_activity_log_cow_non_partition", timestamps_cow_non_partition) // test_hudi_incremental_querys("user_activity_log_cow_partition", timestamps_cow_partition) - // test_hudi_incremental_querys("user_activity_log_mor_non_partition", timestamps_mor_non_partition) - // test_hudi_incremental_querys("user_activity_log_mor_partition", timestamps_mor_partition) + test_hudi_incremental_querys("user_activity_log_mor_non_partition", timestamps_mor_non_partition) + test_hudi_incremental_querys("user_activity_log_mor_partition", timestamps_mor_partition) // sql """set force_jni_scanner=false;""" sql """drop catalog if exists ${catalog_name};""" From eb1b8b622e37c7f242206a7fbf30f216eccf8db5 Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 29 Nov 2024 01:35:53 +0800 Subject: [PATCH 24/35] add SessionVariable hudi_jni_scanner --- be/src/common/config.cpp | 2 -- be/src/common/config.h | 3 --- be/src/vec/exec/format/table/hudi_jni_reader.cpp | 8 +++++--- .../doris/datasource/hudi/source/HudiScanNode.java | 2 ++ .../doris/datasource/hudi/source/HudiSplit.java | 3 +-- .../java/org/apache/doris/qe/SessionVariable.java | 14 ++++++++++++++ gensrc/thrift/PlanNodes.thrift | 1 + 7 files changed, 23 insertions(+), 10 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index f4877dbe0e0ac6..63989a76261bb6 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1396,8 +1396,6 @@ DEFINE_mBool(enable_delete_bitmap_merge_on_compaction, "false"); DEFINE_Bool(enable_table_size_correctness_check, "false"); DEFINE_Bool(force_regenerate_rowsetid_on_start_error, "false"); -DEFINE_Bool(use_old_hudi_jni_reader, "false"); - // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 03cb3e44d202d0..29e55e6406390e 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1482,9 +1482,6 @@ DECLARE_mBool(enable_delete_bitmap_merge_on_compaction); // Enable validation to check the correctness of table size. 
DECLARE_Bool(enable_table_size_correctness_check); -// Use old hudi jni reader -DECLARE_mBool(use_old_hudi_jni_reader); - #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/vec/exec/format/table/hudi_jni_reader.cpp b/be/src/vec/exec/format/table/hudi_jni_reader.cpp index f6cda63b6b5b4a..524e3d9eb50c78 100644 --- a/be/src/vec/exec/format/table/hudi_jni_reader.cpp +++ b/be/src/vec/exec/format/table/hudi_jni_reader.cpp @@ -73,12 +73,14 @@ HudiJniReader::HudiJniReader(const TFileScanRangeParams& scan_params, } } - if (config::use_old_hudi_jni_reader) [[unlikely]] { + if (_hudi_params.hudi_jni_scanner == "hadoop") { + _jni_connector = std::make_unique( + "org/apache/doris/hudi/HadoopHudiJniScanner", params, required_fields); + } else if (_hudi_params.hudi_jni_scanner == "spark") { _jni_connector = std::make_unique("org/apache/doris/hudi/HudiJniScanner", params, required_fields); } else { - _jni_connector = std::make_unique( - "org/apache/doris/hudi/HadoopHudiJniScanner", params, required_fields); + DCHECK(false) << "Unsupported hudi jni scanner: " << _hudi_params.hudi_jni_scanner; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index 07649fa7b41f54..533b44d7791eb1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -261,6 +261,7 @@ private void setHudiParams(TFileRangeDesc rangeDesc, HudiSplit hudiSplit) { fileDesc.setColumnTypes(hudiSplit.getHudiColumnTypes()); // TODO(gaoxin): support complex types // fileDesc.setNestedFields(hudiSplit.getNestedFields()); + fileDesc.setHudiJniScanner(hudiSplit.getHudiJniScanner()); tableFormatFileDesc.setHudiParams(fileDesc); rangeDesc.setTableFormatParams(tableFormatFileDesc); } @@ -503,6 +504,7 @@ private HudiSplit generateHudiSplit(FileSlice fileSlice, List partitionV split.setHudiColumnNames(columnNames); split.setHudiColumnTypes(columnTypes); split.setInstantTime(queryInstant); + split.setHudiJniScanner(sessionVariable.getHudiJniScanner()); return split; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java index c72f7621feaa55..2270d2017937da 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java @@ -40,6 +40,5 @@ public HudiSplit(LocationPath file, long start, long length, long fileLength, St private List hudiColumnNames; private List hudiColumnTypes; private List nestedFields; + private String hudiJniScanner; } - - diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 3c180be9d42802..374100598c8b75 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -635,6 +635,8 @@ public class SessionVariable implements Serializable, Writable { public static final String FORCE_JNI_SCANNER = "force_jni_scanner"; + public static final String HUDI_JNI_SCANNER = "hudi_jni_scanner"; + public static final String ENABLE_COUNT_PUSH_DOWN_FOR_EXTERNAL_TABLE = "enable_count_push_down_for_external_table"; public static final String 
SHOW_ALL_FE_CONNECTION = "show_all_fe_connection"; @@ -2077,6 +2079,10 @@ public void setEnableLeftZigZag(boolean enableLeftZigZag) { description = {"强制使用jni方式读取外表", "Force the use of jni mode to read external table"}) private boolean forceJniScanner = false; + @VariableMgr.VarAttr(name = HUDI_JNI_SCANNER, description = { "使用哪种hudi jni scanner, 'hadoop' 或 'spark'", + "Which hudi jni scanner to use, 'hadoop' or 'spark'" }) + private String hudiJniScanner = "hadoop"; + @VariableMgr.VarAttr(name = ENABLE_COUNT_PUSH_DOWN_FOR_EXTERNAL_TABLE, description = {"对外表启用 count(*) 下推优化", "enable count(*) pushdown optimization for external table"}) private boolean enableCountPushDownForExternalTable = true; @@ -4510,6 +4516,10 @@ public boolean isForceJniScanner() { return forceJniScanner; } + public String getHudiJniScanner() { + return hudiJniScanner; + } + public String getIgnoreSplitType() { return ignoreSplitType; } @@ -4530,6 +4540,10 @@ public void setForceJniScanner(boolean force) { forceJniScanner = force; } + public void setHudiJniScanner(String hudiJniScanner) { + this.hudiJniScanner = hudiJniScanner; + } + public boolean isEnableCountPushDownForExternalTable() { return enableCountPushDownForExternalTable; } diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 165fdc598eca60..8a59ad1fde1885 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -366,6 +366,7 @@ struct THudiFileDesc { 8: optional list column_names; 9: optional list column_types; 10: optional list nested_fields; + 11: optional string hudi_jni_scanner; } struct TLakeSoulFileDesc { From b1d46693ee6d59f851f18ad679586e5bd558d2d5 Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 29 Nov 2024 11:40:22 +0800 Subject: [PATCH 25/35] fix --- be/src/vec/exec/format/table/hudi_jni_reader.cpp | 1 - .../suites/external_table_p2/hudi/test_hudi_timetravel.groovy | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/be/src/vec/exec/format/table/hudi_jni_reader.cpp b/be/src/vec/exec/format/table/hudi_jni_reader.cpp index 524e3d9eb50c78..cb109bf05a2393 100644 --- a/be/src/vec/exec/format/table/hudi_jni_reader.cpp +++ b/be/src/vec/exec/format/table/hudi_jni_reader.cpp @@ -19,7 +19,6 @@ #include -#include "common/config.h" #include "runtime/descriptors.h" #include "runtime/runtime_state.h" #include "runtime/types.h" diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_timetravel.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_timetravel.groovy index e0156fbcd12693..cceeaa412202c6 100644 --- a/regression-test/suites/external_table_p2/hudi/test_hudi_timetravel.groovy +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_timetravel.groovy @@ -109,4 +109,4 @@ suite("test_hudi_timetravel", "p2,external,hudi,external_remote,external_remote_ sql """set force_jni_scanner=false;""" sql """drop catalog if exists ${catalog_name};""" -} \ No newline at end of file +} From 99d8d5bd5d494b5b0ee723f91b88df3528a2320c Mon Sep 17 00:00:00 2001 From: Socrates Date: Sun, 1 Dec 2024 17:04:39 +0800 Subject: [PATCH 26/35] move the logic about falling back to native reader from be to fe --- be/src/vec/exec/scan/vfile_scanner.cpp | 19 ---------- .../doris/datasource/FileQueryScanNode.java | 3 +- .../datasource/hudi/source/HudiScanNode.java | 24 +++++++----- .../datasource/hudi/source/HudiSplit.java | 1 - .../paimon/source/PaimonScanNode.java | 37 +++++++++++++------ gensrc/thrift/PlanNodes.thrift | 4 +- 6 files changed, 45 insertions(+), 43 deletions(-) diff
--git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 9df752156ef8fe..e60fcd83c27fe3 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -754,25 +754,6 @@ Status VFileScanner::_get_next_reader() { // JNI reader can only push down column value range bool push_down_predicates = !_is_load && _params->format_type != TFileFormatType::FORMAT_JNI; - if (!_params->force_jni_reader && format_type == TFileFormatType::FORMAT_JNI && - range.__isset.table_format_params) { - if (range.table_format_params.table_format_type == "hudi" && - range.table_format_params.hudi_params.delta_logs.empty()) { - // fall back to native reader if there is no log file - format_type = TFileFormatType::FORMAT_PARQUET; - } else if (range.table_format_params.table_format_type == "paimon" && - !range.table_format_params.paimon_params.__isset.paimon_split) { - // use native reader - auto format = range.table_format_params.paimon_params.file_format; - if (format == "orc") { - format_type = TFileFormatType::FORMAT_ORC; - } else if (format == "parquet") { - format_type = TFileFormatType::FORMAT_PARQUET; - } else { - return Status::InternalError("Not supported paimon file format: {}", format); - } - } - } bool need_to_get_parsed_schema = false; switch (format_type) { case TFileFormatType::FORMAT_JNI: { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java index f60c6016264f47..2672cc016eb528 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java @@ -374,7 +374,6 @@ public void createScanRangeLocations() throws UserException { scanBackendIds.add(backend.getId()); } } - params.setForceJniReader(ConnectContext.get().getSessionVariable().isForceJniScanner()); getSerializedTable().ifPresent(params::setSerializedTable); @@ -434,6 +433,8 @@ private TScanRangeLocations splitToScanRange( } } + // set file format type, and the type might fall back to native format in setScanParams + rangeDesc.setFormatType(getFileFormatType()); setScanParams(rangeDesc, fileSplit); curLocations.getScanRange().getExtScanRange().getFileScanRange().addToRanges(rangeDesc); TScanRangeLocation location = new TScanRangeLocation(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index 533b44d7791eb1..7e551f7a43159b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -88,7 +88,7 @@ public class HudiScanNode extends HiveScanNode { private static final Logger LOG = LogManager.getLogger(HudiScanNode.class); - private boolean isCowOrRoTable; + private boolean isCowTable; private final AtomicLong noLogsSplitNum = new AtomicLong(0); @@ -128,9 +128,9 @@ public HudiScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumn Optional scanParams, Optional incrementalRelation, SessionVariable sessionVariable) { super(id, desc, "HUDI_SCAN_NODE", StatisticalType.HUDI_SCAN_NODE, needCheckColumnPriv); - isCowOrRoTable = hmsTable.isHoodieCowTable(); + isCowTable = hmsTable.isHoodieCowTable(); if (LOG.isDebugEnabled()) { - if (isCowOrRoTable) { + if (isCowTable) { LOG.debug("Hudi table {} can read 
as cow/read optimize table", hmsTable.getFullQualifiers()); } else { LOG.debug("Hudi table {} is a mor table, and will use JNI to read data in BE", @@ -191,13 +191,13 @@ protected void doInitialize() throws UserException { throw new UserException("Not support function '" + scanParams.getParamType() + "' in hudi table"); } if (incrementalRead) { - if (isCowOrRoTable) { + if (isCowTable) { try { Map serd = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters(); if ("true".equals(serd.get("hoodie.query.as.ro.table")) && hmsTable.getRemoteTable().getTableName().endsWith("_ro")) { // Incremental read RO table as RT table, I don't know why? - isCowOrRoTable = false; + isCowTable = false; LOG.warn("Execute incremental read on RO table: {}", hmsTable.getFullQualifiers()); } } catch (Exception e) { @@ -242,7 +242,15 @@ protected Map getLocationProperties() throws UserException { @Override protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { if (split instanceof HudiSplit) { - setHudiParams(rangeDesc, (HudiSplit) split); + HudiSplit hudiSplit = (HudiSplit) split; + if (rangeDesc.getFormatType() == TFileFormatType.FORMAT_JNI + && !sessionVariable.isForceJniScanner() + && hudiSplit.getHudiDeltaLogs().isEmpty()) { + // no logs, is read optimize table, fallback to use native reader + // TODO: hudi only support parquet now? + rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET); + } + setHudiParams(rangeDesc, hudiSplit); } } @@ -261,13 +269,12 @@ private void setHudiParams(TFileRangeDesc rangeDesc, HudiSplit hudiSplit) { fileDesc.setColumnTypes(hudiSplit.getHudiColumnTypes()); // TODO(gaoxin): support complex types // fileDesc.setNestedFields(hudiSplit.getNestedFields()); - fileDesc.setHudiJniScanner(hudiSplit.getHudiJniScanner()); tableFormatFileDesc.setHudiParams(fileDesc); rangeDesc.setTableFormatParams(tableFormatFileDesc); } private boolean canUseNativeReader() { - return !sessionVariable.isForceJniScanner() && isCowOrRoTable; + return !sessionVariable.isForceJniScanner() && isCowTable; } private List getPrunedPartitions( @@ -504,7 +511,6 @@ private HudiSplit generateHudiSplit(FileSlice fileSlice, List partitionV split.setHudiColumnNames(columnNames); split.setHudiColumnTypes(columnTypes); split.setInstantTime(queryInstant); - split.setHudiJniScanner(sessionVariable.getHudiJniScanner()); return split; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java index 2270d2017937da..2c3cbdb7fbac5c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java @@ -40,5 +40,4 @@ public HudiSplit(LocationPath file, long start, long length, long fileLength, St private List hudiColumnNames; private List hudiColumnTypes; private List nestedFields; - private String hudiJniScanner; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java index 59f51c8425c7f2..4026358817db34 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java @@ -105,9 +105,9 @@ public String toString() { private String serializedTable; public PaimonScanNode(PlanNodeId id, - TupleDescriptor desc, - boolean 
needCheckColumnPriv, - SessionVariable sessionVariable) { + TupleDescriptor desc, + boolean needCheckColumnPriv, + SessionVariable sessionVariable) { super(id, desc, "PAIMON_SCAN_NODE", StatisticalType.PAIMON_SCAN_NODE, needCheckColumnPriv); this.sessionVariable = sessionVariable; } @@ -127,8 +127,7 @@ protected void convertPredicate() { predicates = paimonPredicateConverter.convertToPaimonExpr(conjuncts); } - private static final Base64.Encoder BASE64_ENCODER = - java.util.Base64.getUrlEncoder().withoutPadding(); + private static final Base64.Encoder BASE64_ENCODER = java.util.Base64.getUrlEncoder().withoutPadding(); public static String encodeObjectToString(T t) { try { @@ -142,6 +141,19 @@ public static String encodeObjectToString(T t) { @Override protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { if (split instanceof PaimonSplit) { + PaimonSplit paimonSplit = (PaimonSplit) split; + if (rangeDesc.getFormatType() == TFileFormatType.FORMAT_JNI + && paimonSplit.getSplit() != null) { + // fall back to JNI reader + String fileFormat = getFileFormat(paimonSplit.getPathString()); + if (fileFormat.equals("orc")) { + rangeDesc.setFormatType(TFileFormatType.FORMAT_ORC); + } else if (fileFormat.equals("parquet")) { + rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET); + } else { + throw new RuntimeException("Unsupported file format: " + fileFormat); + } + } setPaimonParams(rangeDesc, (PaimonSplit) split); } } @@ -172,7 +184,8 @@ private void setPaimonParams(TFileRangeDesc rangeDesc, PaimonSplit paimonSplit) fileDesc.setTblId(source.getTargetTable().getId()); fileDesc.setLastUpdateTime(source.getTargetTable().getUpdateTime()); fileDesc.setPaimonTable(encodeObjectToString(source.getPaimonTable())); - // The hadoop conf should be same with PaimonExternalCatalog.createCatalog()#getConfiguration() + // The hadoop conf should be same with + // PaimonExternalCatalog.createCatalog()#getConfiguration() fileDesc.setHadoopConf(source.getCatalog().getCatalogProperty().getHadoopProperties()); Optional optDeletionFile = paimonSplit.getDeletionFile(); if (optDeletionFile.isPresent()) { @@ -190,8 +203,8 @@ private void setPaimonParams(TFileRangeDesc rangeDesc, PaimonSplit paimonSplit) @Override public List getSplits() throws UserException { boolean forceJniScanner = sessionVariable.isForceJniScanner(); - SessionVariable.IgnoreSplitType ignoreSplitType = - SessionVariable.IgnoreSplitType.valueOf(sessionVariable.getIgnoreSplitType()); + SessionVariable.IgnoreSplitType ignoreSplitType = SessionVariable.IgnoreSplitType + .valueOf(sessionVariable.getIgnoreSplitType()); List splits = new ArrayList<>(); int[] projected = desc.getSlots().stream().mapToInt( slot -> (source.getPaimonTable().rowType().getFieldNames().indexOf(slot.getColumn().getName()))) @@ -288,7 +301,8 @@ public List getSplits() throws UserException { } this.selectedPartitionNum = selectedPartitionValues.size(); // TODO: get total partition number - // We should set fileSplitSize at the end because fileSplitSize may be modified in splitFile. + // We should set fileSplitSize at the end because fileSplitSize may be modified + // in splitFile. 
splits.forEach(s -> s.setTargetSplitSize(fileSplitSize)); return splits; } @@ -318,8 +332,9 @@ public TFileFormatType getFileFormatType() throws DdlException, MetaNotFoundExce @Override public List getPathPartitionKeys() throws DdlException, MetaNotFoundException { - // return new ArrayList<>(source.getPaimonTable().partitionKeys()); - //Paymon is not aware of partitions and bypasses some existing logic by returning an empty list + // return new ArrayList<>(source.getPaimonTable().partitionKeys()); + // Paimon is not aware of partitions and bypasses some existing logic by + // returning an empty list return new ArrayList<>(); } diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 8a59ad1fde1885..3a0a995ca459e2 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -406,6 +406,7 @@ enum TTextSerdeType { struct TFileScanRangeParams { // deprecated, move to TFileScanRange 1: optional Types.TFileType file_type; + // deprecated, move to TFileScanRange 2: optional TFileFormatType format_type; // deprecated, move to TFileScanRange 3: optional TFileCompressType compress_type; @@ -453,8 +454,6 @@ struct TFileScanRangeParams { // 1. Reduce the access to HMS and HDFS on the JNI side. // 2. There will be no inconsistency between the fe and be tables. 24: optional string serialized_table - // if set true, be will be forced to use jni reader - 25: bool force_jni_reader; } struct TFileRangeDesc { @@ -482,6 +481,7 @@ struct TFileRangeDesc { // for hive table, different files may have different fs, // so fs_name should be with TFileRangeDesc 12: optional string fs_name + 13: optional TFileFormatType format_type; } struct TSplitSource { From 838c91a2379ad42c7418bb27f2f246a53859bb55 Mon Sep 17 00:00:00 2001 From: Socrates Date: Sun, 1 Dec 2024 22:47:28 +0800 Subject: [PATCH 27/35] fix --- .../org/apache/doris/datasource/hudi/source/HudiScanNode.java | 2 ++ .../java/org/apache/doris/datasource/hudi/source/HudiSplit.java | 1 + 2 files changed, 3 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index 7e551f7a43159b..8aabd52ace3a46 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -269,6 +269,7 @@ private void setHudiParams(TFileRangeDesc rangeDesc, HudiSplit hudiSplit) { fileDesc.setColumnTypes(hudiSplit.getHudiColumnTypes()); // TODO(gaoxin): support complex types // fileDesc.setNestedFields(hudiSplit.getNestedFields()); + fileDesc.setHudiJniScanner(hudiSplit.getHudiJniScanner()); tableFormatFileDesc.setHudiParams(fileDesc); rangeDesc.setTableFormatParams(tableFormatFileDesc); } @@ -511,6 +512,7 @@ private HudiSplit generateHudiSplit(FileSlice fileSlice, List partitionV split.setHudiColumnNames(columnNames); split.setHudiColumnTypes(columnTypes); split.setInstantTime(queryInstant); + split.setHudiJniScanner(sessionVariable.getHudiJniScanner()); return split; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java index 2c3cbdb7fbac5c..2270d2017937da 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiSplit.java @@ -40,4 +40,5 @@ public
HudiSplit(LocationPath file, long start, long length, long fileLength, St private List hudiColumnNames; private List hudiColumnTypes; private List nestedFields; + private String hudiJniScanner; } From 68a9ea1d1a387cc158f225927fac2f422dfc6815 Mon Sep 17 00:00:00 2001 From: Socrates Date: Sun, 1 Dec 2024 23:27:57 +0800 Subject: [PATCH 28/35] fix be --- be/src/vec/exec/scan/vfile_scanner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index e60fcd83c27fe3..21af1bb0518baf 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -750,7 +750,7 @@ Status VFileScanner::_get_next_reader() { // create reader for specific format Status init_status; - TFileFormatType::type format_type = _params->format_type; + TFileFormatType::type format_type = range.format_type; // JNI reader can only push down column value range bool push_down_predicates = !_is_load && _params->format_type != TFileFormatType::FORMAT_JNI; From 07cb8434938075bd10736b2672dbe6f3318b1e74 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 2 Dec 2024 10:35:48 +0800 Subject: [PATCH 29/35] fix paimon --- .../apache/doris/datasource/paimon/source/PaimonScanNode.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java index 4026358817db34..8a62b4aa8f4424 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java @@ -143,7 +143,7 @@ protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { if (split instanceof PaimonSplit) { PaimonSplit paimonSplit = (PaimonSplit) split; if (rangeDesc.getFormatType() == TFileFormatType.FORMAT_JNI - && paimonSplit.getSplit() != null) { + && paimonSplit.getSplit() == null) { // fall back to JNI reader String fileFormat = getFileFormat(paimonSplit.getPathString()); if (fileFormat.equals("orc")) { From 19c4c27a7c82f226197c72fa1ce3af8a41005b37 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 2 Dec 2024 15:20:11 +0800 Subject: [PATCH 30/35] fix be ut --- be/test/vec/exec/vfile_scanner_exception_test.cpp | 2 +- be/test/vec/exec/vwal_scanner_test.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/be/test/vec/exec/vfile_scanner_exception_test.cpp b/be/test/vec/exec/vfile_scanner_exception_test.cpp index 4b6ce46bd88cf3..8506fe624b70eb 100644 --- a/be/test/vec/exec/vfile_scanner_exception_test.cpp +++ b/be/test/vec/exec/vfile_scanner_exception_test.cpp @@ -261,10 +261,10 @@ void VfileScannerExceptionTest::init() { _range_desc.start_offset = 0; _range_desc.size = 1000; + _range_desc.format_type = TFileFormatType::FORMAT_JNI; _ranges.push_back(_range_desc); _scan_range.ranges = _ranges; _scan_range.__isset.params = true; - _scan_range.params.format_type = TFileFormatType::FORMAT_JNI; _kv_cache.reset(new ShardedKVCache(48)); _cluster_info.reset(new ClusterInfo()); diff --git a/be/test/vec/exec/vwal_scanner_test.cpp b/be/test/vec/exec/vwal_scanner_test.cpp index 5c4056a8c24104..fdb5b570d023c0 100644 --- a/be/test/vec/exec/vwal_scanner_test.cpp +++ b/be/test/vec/exec/vwal_scanner_test.cpp @@ -274,10 +274,10 @@ void VWalScannerTest::init() { _range_desc.start_offset = 0; _range_desc.size = 1000; + _range_desc.format_type = 
TFileFormatType::FORMAT_WAL; _ranges.push_back(_range_desc); _scan_range.ranges = _ranges; _scan_range.__isset.params = true; - _scan_range.params.format_type = TFileFormatType::FORMAT_WAL; _kv_cache.reset(new ShardedKVCache(48)); _cluster_info.reset(new ClusterInfo()); From 5feb98dada4e09812839c109acbc235f5b6b6364 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 2 Dec 2024 15:30:06 +0800 Subject: [PATCH 31/35] fix for compatibility --- be/src/vec/exec/scan/vfile_scanner.cpp | 23 ++++++++++++++++++- .../apache/doris/hudi/BaseSplitReader.scala | 4 ++-- .../paimon/source/PaimonScanNode.java | 2 +- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 21af1bb0518baf..b5a7160f702954 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -750,10 +750,31 @@ Status VFileScanner::_get_next_reader() { // create reader for specific format Status init_status; - TFileFormatType::type format_type = range.format_type; + // for compatibility, if format_type is not set in range, use the format type of params + TFileFormatType::type format_type = + range.__isset.format_type ? range.format_type : _params->format_type; // JNI reader can only push down column value range bool push_down_predicates = !_is_load && _params->format_type != TFileFormatType::FORMAT_JNI; + // for compatibility, this logic is deprecated in 3.1 + if (format_type == TFileFormatType::FORMAT_JNI && range.__isset.table_format_params) { + if (range.table_format_params.table_format_type == "hudi" && + range.table_format_params.hudi_params.delta_logs.empty()) { + // fall back to native reader if there is no log file + format_type = TFileFormatType::FORMAT_PARQUET; + } else if (range.table_format_params.table_format_type == "paimon" && + !range.table_format_params.paimon_params.__isset.paimon_split) { + // use native reader + auto format = range.table_format_params.paimon_params.file_format; + if (format == "orc") { + format_type = TFileFormatType::FORMAT_ORC; + } else if (format == "parquet") { + format_type = TFileFormatType::FORMAT_PARQUET; + } else { + return Status::InternalError("Not supported paimon file format: {}", format); + } + } + } bool need_to_get_parsed_schema = false; switch (format_type) { case TFileFormatType::FORMAT_JNI: { diff --git a/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala b/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala index 5d69ab5d1b1fdf..fc8d74f9713c26 100644 --- a/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala +++ b/fe/be-java-extensions/hudi-scanner/src/main/scala/org/apache/doris/hudi/BaseSplitReader.scala @@ -500,10 +500,10 @@ abstract class BaseSplitReader(val split: HoodieSplit) { hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { partitionedFile => { var hadoopStorageConfiguration = new HadoopStorageConfiguration(hadoopConf); - var sotragePath = new StoragePath(partitionedFile.toPath.toUri.getPath); + var storagePath = new StoragePath(partitionedFile.toPath.toUri.getPath); var emptySchema = org.apache.hudi.common.util.Option.empty[org.apache.avro.Schema]() val reader = new HoodieHBaseAvroHFileReader( - hadoopStorageConfiguration, sotragePath, emptySchema) + hadoopStorageConfiguration, storagePath, emptySchema) val requiredRowSchema = requiredDataSchema.structTypeSchema // NOTE: Schema has to be parsed at 
this point, since Avro's [[Schema]] aren't serializable diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java index 8a62b4aa8f4424..a9956549f11b81 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java @@ -144,7 +144,7 @@ protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { PaimonSplit paimonSplit = (PaimonSplit) split; if (rangeDesc.getFormatType() == TFileFormatType.FORMAT_JNI && paimonSplit.getSplit() == null) { - // fall back to JNI reader + // fall back to native reader String fileFormat = getFileFormat(paimonSplit.getPathString()); if (fileFormat.equals("orc")) { rangeDesc.setFormatType(TFileFormatType.FORMAT_ORC); From e4e4340d503b9d5a09bf2be4c6da2985fa7f41c0 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 2 Dec 2024 15:31:08 +0800 Subject: [PATCH 32/35] fix --- .../org/apache/doris/datasource/hudi/source/HudiScanNode.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index 8aabd52ace3a46..28805aae63c1e3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -247,7 +247,7 @@ protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { && !sessionVariable.isForceJniScanner() && hudiSplit.getHudiDeltaLogs().isEmpty()) { // no logs, is read optimize table, fallback to use native reader - // TODO: hudi only support parquet now? 
+ // TODO: support read orc hudi table in native reader rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET); } setHudiParams(rangeDesc, hudiSplit); From 45b432847a6f82cdd301080f845ef7fbb778b79e Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 2 Dec 2024 16:25:48 +0800 Subject: [PATCH 33/35] remove be fall back logic --- be/src/vec/exec/scan/vfile_scanner.cpp | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index b5a7160f702954..ba404466b31922 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -756,25 +756,6 @@ Status VFileScanner::_get_next_reader() { // JNI reader can only push down column value range bool push_down_predicates = !_is_load && _params->format_type != TFileFormatType::FORMAT_JNI; - // for compatibility, this logic is deprecated in 3.1 - if (format_type == TFileFormatType::FORMAT_JNI && range.__isset.table_format_params) { - if (range.table_format_params.table_format_type == "hudi" && - range.table_format_params.hudi_params.delta_logs.empty()) { - // fall back to native reader if there is no log file - format_type = TFileFormatType::FORMAT_PARQUET; - } else if (range.table_format_params.table_format_type == "paimon" && - !range.table_format_params.paimon_params.__isset.paimon_split) { - // use native reader - auto format = range.table_format_params.paimon_params.file_format; - if (format == "orc") { - format_type = TFileFormatType::FORMAT_ORC; - } else if (format == "parquet") { - format_type = TFileFormatType::FORMAT_PARQUET; - } else { - return Status::InternalError("Not supported paimon file format: {}", format); - } - } - } bool need_to_get_parsed_schema = false; switch (format_type) { case TFileFormatType::FORMAT_JNI: { From c2e27cbfe1470f1d0cd6db252b0f46607cce8605 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 2 Dec 2024 20:05:52 +0800 Subject: [PATCH 34/35] fix PaimonScanNode and be ut --- be/src/vec/exec/scan/vfile_scanner.cpp | 15 ++++++++++ .../vec/exec/vfile_scanner_exception_test.cpp | 2 +- be/test/vec/exec/vwal_scanner_test.cpp | 2 +- .../paimon/source/PaimonScanNode.java | 28 +++++++++---------- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index ba404466b31922..246a2edb082c92 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -756,6 +756,21 @@ Status VFileScanner::_get_next_reader() { // JNI reader can only push down column value range bool push_down_predicates = !_is_load && _params->format_type != TFileFormatType::FORMAT_JNI; + // for compatibility, this logic is deprecated in 3.1 + if (format_type == TFileFormatType::FORMAT_JNI && range.__isset.table_format_params) { + if (range.table_format_params.table_format_type == "paimon" && + !range.table_format_params.paimon_params.__isset.paimon_split) { + // use native reader + auto format = range.table_format_params.paimon_params.file_format; + if (format == "orc") { + format_type = TFileFormatType::FORMAT_ORC; + } else if (format == "parquet") { + format_type = TFileFormatType::FORMAT_PARQUET; + } else { + return Status::InternalError("Not supported paimon file format: {}", format); + } + } + } bool need_to_get_parsed_schema = false; switch (format_type) { case TFileFormatType::FORMAT_JNI: { diff --git a/be/test/vec/exec/vfile_scanner_exception_test.cpp b/be/test/vec/exec/vfile_scanner_exception_test.cpp index 
8506fe624b70eb..4b6ce46bd88cf3 100644 --- a/be/test/vec/exec/vfile_scanner_exception_test.cpp +++ b/be/test/vec/exec/vfile_scanner_exception_test.cpp @@ -261,10 +261,10 @@ void VfileScannerExceptionTest::init() { _range_desc.start_offset = 0; _range_desc.size = 1000; - _range_desc.format_type = TFileFormatType::FORMAT_JNI; _ranges.push_back(_range_desc); _scan_range.ranges = _ranges; _scan_range.__isset.params = true; + _scan_range.params.format_type = TFileFormatType::FORMAT_JNI; _kv_cache.reset(new ShardedKVCache(48)); _cluster_info.reset(new ClusterInfo()); diff --git a/be/test/vec/exec/vwal_scanner_test.cpp b/be/test/vec/exec/vwal_scanner_test.cpp index fdb5b570d023c0..5c4056a8c24104 100644 --- a/be/test/vec/exec/vwal_scanner_test.cpp +++ b/be/test/vec/exec/vwal_scanner_test.cpp @@ -274,10 +274,10 @@ void VWalScannerTest::init() { _range_desc.start_offset = 0; _range_desc.size = 1000; - _range_desc.format_type = TFileFormatType::FORMAT_WAL; _ranges.push_back(_range_desc); _scan_range.ranges = _ranges; _scan_range.__isset.params = true; + _scan_range.params.format_type = TFileFormatType::FORMAT_WAL; _kv_cache.reset(new ShardedKVCache(48)); _cluster_info.reset(new ClusterInfo()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java index a9956549f11b81..bf917804fb7b6d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java @@ -141,19 +141,6 @@ public static String encodeObjectToString(T t) { @Override protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { if (split instanceof PaimonSplit) { - PaimonSplit paimonSplit = (PaimonSplit) split; - if (rangeDesc.getFormatType() == TFileFormatType.FORMAT_JNI - && paimonSplit.getSplit() == null) { - // fall back to native reader - String fileFormat = getFileFormat(paimonSplit.getPathString()); - if (fileFormat.equals("orc")) { - rangeDesc.setFormatType(TFileFormatType.FORMAT_ORC); - } else if (fileFormat.equals("parquet")) { - rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET); - } else { - throw new RuntimeException("Unsupported file format: " + fileFormat); - } - } setPaimonParams(rangeDesc, (PaimonSplit) split); } } @@ -168,11 +155,24 @@ private void setPaimonParams(TFileRangeDesc rangeDesc, PaimonSplit paimonSplit) tableFormatFileDesc.setTableFormatType(paimonSplit.getTableFormatType().value()); TPaimonFileDesc fileDesc = new TPaimonFileDesc(); org.apache.paimon.table.source.Split split = paimonSplit.getSplit(); + + String fileFormat = getFileFormat(paimonSplit.getPathString()); if (split != null) { // use jni reader + rangeDesc.setFormatType(TFileFormatType.FORMAT_JNI); fileDesc.setPaimonSplit(encodeObjectToString(split)); + } else { + // use native reader + if (fileFormat.equals("orc")) { + rangeDesc.setFormatType(TFileFormatType.FORMAT_ORC); + } else if (fileFormat.equals("parquet")) { + rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET); + } else { + throw new RuntimeException("Unsupported file format: " + fileFormat); + } } - fileDesc.setFileFormat(getFileFormat(paimonSplit.getPathString())); + + fileDesc.setFileFormat(fileFormat); fileDesc.setPaimonPredicate(encodeObjectToString(predicates)); fileDesc.setPaimonColumnNames(source.getDesc().getSlots().stream().map(slot -> slot.getColumn().getName()) .collect(Collectors.joining(","))); From 
093f49e8380bd1193739fa32d9cf6332f1e61cec Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 2 Dec 2024 20:10:17 +0800 Subject: [PATCH 35/35] format --- .../suites/external_table_p2/hudi/test_hudi_catalog.groovy | 2 +- .../suites/external_table_p2/hudi/test_hudi_incremental.groovy | 2 +- .../external_table_p2/hudi/test_hudi_schema_evolution.groovy | 2 +- .../suites/external_table_p2/hudi/test_hudi_snapshot.groovy | 2 +- .../suites/external_table_p2/hudi/test_hudi_timestamp.groovy | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_catalog.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_catalog.groovy index f2082ef89c7a50..149eecf5817bd4 100644 --- a/regression-test/suites/external_table_p2/hudi/test_hudi_catalog.groovy +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_catalog.groovy @@ -36,4 +36,4 @@ suite("test_hudi_catalog", "p2,external,hudi,external_remote,external_remote_hud def tables = sql """ show tables; """ assertTrue(tables.size() > 0) sql """drop catalog if exists ${catalog_name};""" -} \ No newline at end of file +} diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_incremental.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_incremental.groovy index 61fdd08c4960a7..885903646cc5b5 100644 --- a/regression-test/suites/external_table_p2/hudi/test_hudi_incremental.groovy +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_incremental.groovy @@ -116,4 +116,4 @@ suite("test_hudi_incremental", "p2,external,hudi,external_remote,external_remote // sql """set force_jni_scanner=false;""" sql """drop catalog if exists ${catalog_name};""" -} \ No newline at end of file +} diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_schema_evolution.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_schema_evolution.groovy index 5a7bd3d7da79d9..0da88447cdef15 100644 --- a/regression-test/suites/external_table_p2/hudi/test_hudi_schema_evolution.groovy +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_schema_evolution.groovy @@ -57,4 +57,4 @@ suite("test_hudi_schema_evolution", "p2,external,hudi,external_remote,external_r sql """set force_jni_scanner = false;""" sql """drop catalog if exists ${catalog_name};""" -} \ No newline at end of file +} diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_snapshot.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_snapshot.groovy index 9dbfe5372c4ac6..89d89709b3c822 100644 --- a/regression-test/suites/external_table_p2/hudi/test_hudi_snapshot.groovy +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_snapshot.groovy @@ -95,4 +95,4 @@ suite("test_hudi_snapshot", "p2,external,hudi,external_remote,external_remote_hu sql """set force_jni_scanner=false;""" sql """drop catalog if exists ${catalog_name};""" -} \ No newline at end of file +} diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy index c1ba630e4a7d01..36309322558f52 100644 --- a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy @@ -59,4 +59,4 @@ suite("test_hudi_timestamp", "p2,external,hudi,external_remote,external_remote_h // INSERT OVERWRITE hudi_table_with_timestamp VALUES // ('1', 'Alice', timestamp('2024-10-25 08:00:00')), // ('2', 'Bob', timestamp('2024-10-25 
09:30:00')), -// ('3', 'Charlie', timestamp('2024-10-25 11:00:00')); \ No newline at end of file +// ('3', 'Charlie', timestamp('2024-10-25 11:00:00'));
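
A minimal usage sketch for the two scanner switches this series introduces, written in the style of the regression suites above. The catalog and database names (hudi_example_catalog, regression_hudi) are placeholders and the qt_ result tags assume pre-generated .out data; only the set force_jni_scanner / set hudi_jni_scanner statements, the table names, and the described fallback behavior come directly from the patches.

suite("example_hudi_jni_scanner_usage", "p2,external,hudi") {
    // Placeholder catalog/database; the real suites create the catalog from
    // external hive metastore properties before running any query.
    sql """switch hudi_example_catalog;"""
    sql """use regression_hudi;"""

    // With force_jni_scanner=false (the default), HudiScanNode rewrites a
    // FORMAT_JNI range to FORMAT_PARQUET when the file slice has no delta
    // logs, so a cow/read-optimized split is read by the native reader.
    sql """set force_jni_scanner=false;"""
    qt_native """select count(*) from user_activity_log_cow_partition;"""

    // Forcing the JNI path makes the hudi_jni_scanner choice take effect:
    // 'hadoop' (the default) selects HadoopHudiJniScanner, 'spark' selects
    // the Spark-based HudiJniScanner, and any other value trips the DCHECK
    // in hudi_jni_reader.cpp.
    sql """set force_jni_scanner=true;"""
    sql """set hudi_jni_scanner='spark';"""
    qt_jni """select count(*) from user_activity_log_mor_partition;"""

    // Both variables are session-scoped; reset them so later suites run on
    // the default native-plus-hadoop path.
    sql """set force_jni_scanner=false;"""
    sql """set hudi_jni_scanner='hadoop';"""
}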
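And a sketch of how the counts in test_hudi_incremental.out could be produced, assuming the @incr table hint documented for Doris Hudi catalogs; the helper name, result tags, and timestamp list are illustrative, not taken from this series (the real suites use their own test_hudi_incremental_querys helper and recorded commit instants):

def example_incremental_counts = { table_name, timestamps ->
    // Each commit in the test data inserts 1000 rows. Counting rows committed
    // strictly after instant i therefore yields the 9000, 8000, ... 0 pattern,
    // and counting from the earliest instant up to i yields 1000, 2000, ...
    for (int i = 0; i < timestamps.size(); i++) {
        qt_incremental_end """select count(*) from ${table_name}@incr('beginTime'='${timestamps[i]}');"""
    }
}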