From 7df5c10fc0ff0427fe330c7475e523192edb249a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:34:56 -0800 Subject: [PATCH] branch-3.0: [fix](hudi)Add hudi catalog read partition table partition prune #44669 (#45111) Cherry-picked from #44669 Co-authored-by: daidai --- .../datasource/hive/HMSExternalTable.java | 26 +- .../doris/datasource/hudi/HudiUtils.java | 50 +++ .../datasource/hudi/source/HudiScanNode.java | 71 ++-- .../translator/PhysicalPlanTranslator.java | 2 + .../trees/plans/logical/LogicalHudiScan.java | 2 +- .../hudi/test_hudi_partition_prune.out | 357 ++++++++++++++++++ .../hudi/test_hudi_partition_prune.groovy | 333 ++++++++++++++++ 7 files changed, 792 insertions(+), 49 deletions(-) create mode 100644 regression-test/data/external_table_p2/hudi/test_hudi_partition_prune.out create mode 100644 regression-test/suites/external_table_p2/hudi/test_hudi_partition_prune.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java index 134ad362fa1eed..ad685386ec9e89 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java @@ -17,6 +17,7 @@ package org.apache.doris.datasource.hive; +import org.apache.doris.analysis.TableSnapshot; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.ListPartitionItem; @@ -31,6 +32,7 @@ import org.apache.doris.common.DdlException; import org.apache.doris.datasource.ExternalTable; import org.apache.doris.datasource.SchemaCacheValue; +import org.apache.doris.datasource.TablePartitionValues; import org.apache.doris.datasource.hudi.HudiUtils; import org.apache.doris.datasource.iceberg.IcebergUtils; import org.apache.doris.datasource.mvcc.MvccSnapshot; @@ -41,6 +43,7 @@ import org.apache.doris.mtmv.MTMVSnapshotIf; import org.apache.doris.mtmv.MTMVTimestampSnapshot; import org.apache.doris.nereids.exceptions.NotSupportedException; +import org.apache.doris.nereids.trees.plans.logical.LogicalFileScan.SelectedPartitions; import org.apache.doris.qe.GlobalVariable; import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; @@ -302,7 +305,28 @@ public List getPartitionColumns(Optional snapshot) { @Override public boolean supportInternalPartitionPruned() { - return getDlaType() == DLAType.HIVE; + return getDlaType() == DLAType.HIVE || getDlaType() == DLAType.HUDI; + } + + public SelectedPartitions initHudiSelectedPartitions(Optional tableSnapshot) { + if (getDlaType() != DLAType.HUDI) { + return SelectedPartitions.NOT_PRUNED; + } + + if (getPartitionColumns().isEmpty()) { + return SelectedPartitions.NOT_PRUNED; + } + TablePartitionValues tablePartitionValues = HudiUtils.getPartitionValues(tableSnapshot, this); + + Map idToPartitionItem = tablePartitionValues.getIdToPartitionItem(); + Map idToNameMap = tablePartitionValues.getPartitionIdToNameMap(); + + Map nameToPartitionItems = Maps.newHashMapWithExpectedSize(idToPartitionItem.size()); + for (Entry entry : idToPartitionItem.entrySet()) { + nameToPartitionItems.put(idToNameMap.get(entry.getKey()), entry.getValue()); + } + + return new SelectedPartitions(nameToPartitionItems.size(), nameToPartitionItems, false); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/HudiUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/HudiUtils.java index c98d994a28a08f..0f38abafaa4d98 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/HudiUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/HudiUtils.java @@ -17,24 +17,35 @@ package org.apache.doris.datasource.hudi; +import org.apache.doris.analysis.TableSnapshot; import org.apache.doris.catalog.ArrayType; +import org.apache.doris.catalog.Env; import org.apache.doris.catalog.MapType; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.StructField; import org.apache.doris.catalog.StructType; import org.apache.doris.catalog.Type; +import org.apache.doris.datasource.TablePartitionValues; +import org.apache.doris.datasource.hive.HMSExternalTable; +import org.apache.doris.datasource.hive.HiveMetaStoreClientHelper; +import org.apache.doris.datasource.hudi.source.HudiCachedPartitionProcessor; import org.apache.avro.LogicalType; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.List; +import java.util.Optional; import java.util.stream.Collectors; public class HudiUtils { @@ -231,4 +242,43 @@ private static Type handleUnionType(Schema avroSchema) { } return Type.UNSUPPORTED; } + + public static TablePartitionValues getPartitionValues(Optional tableSnapshot, + HMSExternalTable hmsTable) { + TablePartitionValues partitionValues = new TablePartitionValues(); + if (hmsTable.getPartitionColumns().isEmpty()) { + //isn't partition table. + return partitionValues; + } + + HoodieTableMetaClient hudiClient = HiveMetaStoreClientHelper.getHudiClient(hmsTable); + HudiCachedPartitionProcessor processor = (HudiCachedPartitionProcessor) Env.getCurrentEnv() + .getExtMetaCacheMgr().getHudiPartitionProcess(hmsTable.getCatalog()); + boolean useHiveSyncPartition = hmsTable.useHiveSyncPartition(); + + if (tableSnapshot.isPresent()) { + if (tableSnapshot.get().getType() == TableSnapshot.VersionType.VERSION) { + // Hudi does not support `FOR VERSION AS OF`, please use `FOR TIME AS OF`"; + return partitionValues; + } + String queryInstant = tableSnapshot.get().getTime().replaceAll("[-: ]", ""); + + partitionValues = + HiveMetaStoreClientHelper.ugiDoAs( + HiveMetaStoreClientHelper.getConfiguration(hmsTable), + () -> processor.getSnapshotPartitionValues( + hmsTable, hudiClient, queryInstant, useHiveSyncPartition)); + } else { + HoodieTimeline timeline = hudiClient.getCommitsAndCompactionTimeline().filterCompletedInstants(); + Option snapshotInstant = timeline.lastInstant(); + if (!snapshotInstant.isPresent()) { + return partitionValues; + } + partitionValues = + HiveMetaStoreClientHelper.ugiDoAs( + HiveMetaStoreClientHelper.getConfiguration(hmsTable), + () -> processor.getPartitionValues(hmsTable, hudiClient, useHiveSyncPartition)); + } + return partitionValues; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index a73a2065d0ffaf..b2cad8ab710178 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -21,6 +21,7 @@ import org.apache.doris.analysis.TableSnapshot; import org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.ListPartitionItem; import org.apache.doris.catalog.PartitionItem; import org.apache.doris.catalog.Type; import org.apache.doris.common.AnalysisException; @@ -30,12 +31,10 @@ import org.apache.doris.datasource.ExternalTable; import org.apache.doris.datasource.FileSplit; import org.apache.doris.datasource.TableFormatType; -import org.apache.doris.datasource.TablePartitionValues; import org.apache.doris.datasource.hive.HiveMetaStoreClientHelper; import org.apache.doris.datasource.hive.HivePartition; import org.apache.doris.datasource.hive.source.HiveScanNode; import org.apache.doris.datasource.hudi.HudiUtils; -import org.apache.doris.planner.ListPartitionPrunerV2; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.SessionVariable; @@ -70,7 +69,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Locale; @@ -286,50 +284,29 @@ private boolean canUseNativeReader() { return !sessionVariable.isForceJniScanner() && isCowTable; } - private List getPrunedPartitions( - HoodieTableMetaClient metaClient, Option snapshotTimestamp) throws AnalysisException { + private List getPrunedPartitions(HoodieTableMetaClient metaClient) { List partitionColumnTypes = hmsTable.getPartitionColumnTypes(); if (!partitionColumnTypes.isEmpty()) { - HudiCachedPartitionProcessor processor = (HudiCachedPartitionProcessor) Env.getCurrentEnv() - .getExtMetaCacheMgr().getHudiPartitionProcess(hmsTable.getCatalog()); - TablePartitionValues partitionValues; - if (snapshotTimestamp.isPresent()) { - partitionValues = processor.getSnapshotPartitionValues( - hmsTable, metaClient, snapshotTimestamp.get(), useHiveSyncPartition); - } else { - partitionValues = processor.getPartitionValues(hmsTable, metaClient, useHiveSyncPartition); - } - if (partitionValues != null) { - // 2. prune partitions by expr - partitionValues.readLock().lock(); - try { - Map idToPartitionItem = partitionValues.getIdToPartitionItem(); - this.totalPartitionNum = idToPartitionItem.size(); - ListPartitionPrunerV2 pruner = new ListPartitionPrunerV2(idToPartitionItem, - hmsTable.getPartitionColumns(), columnNameToRange, - partitionValues.getUidToPartitionRange(), - partitionValues.getRangeToId(), - partitionValues.getSingleColumnRangeMap(), - true); - Collection filteredPartitionIds = pruner.prune(); - this.selectedPartitionNum = filteredPartitionIds.size(); - // 3. get partitions from cache - String dbName = hmsTable.getDbName(); - String tblName = hmsTable.getName(); - String inputFormat = hmsTable.getRemoteTable().getSd().getInputFormat(); - String basePath = metaClient.getBasePathV2().toString(); - Map partitionIdToNameMap = partitionValues.getPartitionIdToNameMap(); - Map> partitionValuesMap = partitionValues.getPartitionValuesMap(); - return filteredPartitionIds.stream().map(id -> { - String path = basePath + "/" + partitionIdToNameMap.get(id); - return new HivePartition( - dbName, tblName, false, inputFormat, path, partitionValuesMap.get(id), - Maps.newHashMap()); - }).collect(Collectors.toList()); - } finally { - partitionValues.readLock().unlock(); - } - } + this.totalPartitionNum = selectedPartitions.totalPartitionNum; + Map prunedPartitions = selectedPartitions.selectedPartitions; + this.selectedPartitionNum = prunedPartitions.size(); + + String dbName = hmsTable.getDbName(); + String tblName = hmsTable.getName(); + String inputFormat = hmsTable.getRemoteTable().getSd().getInputFormat(); + String basePath = metaClient.getBasePathV2().toString(); + + List hivePartitions = Lists.newArrayList(); + prunedPartitions.forEach( + (key, value) -> { + String path = basePath + "/" + key; + hivePartitions.add(new HivePartition( + dbName, tblName, false, inputFormat, path, + ((ListPartitionItem) value).getItems().get(0).getPartitionValuesAsStringList(), + Maps.newHashMap())); + } + ); + return hivePartitions; } // unpartitioned table, create a dummy partition to save location and // inputformat, @@ -420,7 +397,7 @@ public List getSplits() throws UserException { if (!partitionInit) { prunedPartitions = HiveMetaStoreClientHelper.ugiDoAs( HiveMetaStoreClientHelper.getConfiguration(hmsTable), - () -> getPrunedPartitions(hudiClient, snapshotTimestamp)); + () -> getPrunedPartitions(hudiClient)); partitionInit = true; } List splits = Collections.synchronizedList(new ArrayList<>()); @@ -482,7 +459,7 @@ public boolean isBatchMode() { // Non partition table will get one dummy partition prunedPartitions = HiveMetaStoreClientHelper.ugiDoAs( HiveMetaStoreClientHelper.getConfiguration(hmsTable), - () -> getPrunedPartitions(hudiClient, snapshotTimestamp)); + () -> getPrunedPartitions(hudiClient)); partitionInit = true; } int numPartitions = ConnectContext.get().getSessionVariable().getNumPartitionsInBatchMode(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 61d07ef6bbc8dc..7f4d23c61303b3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -656,6 +656,8 @@ public PlanFragment visitPhysicalHudiScan(PhysicalHudiScan fileScan, PlanTransla if (fileScan.getTableSnapshot().isPresent()) { ((FileQueryScanNode) scanNode).setQueryTableSnapshot(fileScan.getTableSnapshot().get()); } + HudiScanNode hudiScanNode = (HudiScanNode) scanNode; + hudiScanNode.setSelectedPartitions(fileScan.getSelectedPartitions()); return getPlanFragmentForPhysicalFileScan(fileScan, context, scanNode, table, tupleDescriptor); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalHudiScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalHudiScan.java index 629690889432b3..51e68eb07631ae 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalHudiScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalHudiScan.java @@ -84,7 +84,7 @@ protected LogicalHudiScan(RelationId id, ExternalTable table, List quali public LogicalHudiScan(RelationId id, ExternalTable table, List qualifier, Optional tableSample, Optional tableSnapshot) { this(id, table, qualifier, Optional.empty(), Optional.empty(), - SelectedPartitions.NOT_PRUNED, tableSample, tableSnapshot, + ((HMSExternalTable) table).initHudiSelectedPartitions(tableSnapshot), tableSample, tableSnapshot, Optional.empty(), Optional.empty()); } diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_partition_prune.out b/regression-test/data/external_table_p2/hudi/test_hudi_partition_prune.out new file mode 100644 index 00000000000000..fd3eafa0255722 --- /dev/null +++ b/regression-test/data/external_table_p2/hudi/test_hudi_partition_prune.out @@ -0,0 +1,357 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !one_partition_1_1 -- +1 Alice 2024 +2 Bob 2024 +3 Charlie 2024 + +-- !one_partition_2_1 -- +4 David 2025 +5 Eva 2025 + +-- !one_partition_3_all -- +1 Alice 2024 +2 Bob 2024 +3 Charlie 2024 +4 David 2025 +5 Eva 2025 + +-- !one_partition_4_all -- +5 Eva 2025 + +-- !one_partition_5_1 -- +3 Charlie 2024 + +-- !two_partition_1_1 -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 + +-- !two_partition_2_1 -- +8 Hannah EU 2 +9 Ivy EU 2 +10 Jack EU 2 + +-- !two_partition_3_2 -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 +4 David US 2 +5 Eva US 2 + +-- !two_partition_4_all -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 +4 David US 2 +5 Eva US 2 +6 Frank EU 1 +7 Grace EU 1 +8 Hannah EU 2 +9 Ivy EU 2 +10 Jack EU 2 + +-- !two_partition_5_1 -- + +-- !two_partition_6_1 -- +8 Hannah EU 2 +9 Ivy EU 2 +10 Jack EU 2 + +-- !three_partition_1_1 -- +1 Alice US 2024 Q1 +2 Bob US 2024 Q1 +3 Charlie US 2024 Q1 + +-- !three_partition_2_1 -- +10 Jack EU 2025 Q2 +11 Leo EU 2025 Q2 + +-- !three_partition_3_3 -- +13 Nina AS 2025 Q1 +14 Oscar AS 2025 Q2 +15 Paul AS 2025 Q3 + +-- !three_partition_4_2 -- +1 Alice US 2024 Q1 +2 Bob US 2024 Q1 +3 Charlie US 2024 Q1 +6 Frank US 2025 Q1 + +-- !three_partition_5_all -- +1 Alice US 2024 Q1 +2 Bob US 2024 Q1 +3 Charlie US 2024 Q1 +4 David US 2024 Q2 +5 Eva US 2024 Q2 +6 Frank US 2025 Q1 +7 Grace US 2025 Q2 +8 Hannah EU 2024 Q1 +9 Ivy EU 2024 Q1 +10 Jack EU 2025 Q2 +11 Leo EU 2025 Q2 +12 Mia EU 2025 Q3 +13 Nina AS 2025 Q1 +14 Oscar AS 2025 Q2 +15 Paul AS 2025 Q3 + +-- !three_partition_6_1 -- +8 Hannah EU 2024 Q1 +9 Ivy EU 2024 Q1 + +-- !three_partition_7_7 -- +6 Frank US 2025 Q1 +7 Grace US 2025 Q2 +10 Jack EU 2025 Q2 +11 Leo EU 2025 Q2 +12 Mia EU 2025 Q3 +13 Nina AS 2025 Q1 +14 Oscar AS 2025 Q2 +15 Paul AS 2025 Q3 + +-- !three_partition_8_2 -- +7 Grace US 2025 Q2 + +-- !one_partition_6_0 -- + +-- !two_partition_7_0 -- + +-- !two_partition_8_0 -- + +-- !three_partition_9_0 -- + +-- !three_partition_10_0 -- + +-- !three_partition_11_0 -- + +-- !time_travel_two_partition_1_3 -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 +4 David US 2 +5 Eva US 2 +6 Frank EU 1 + +-- !time_travel_two_partition_2_2 -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 +4 David US 2 +5 Eva US 2 + +-- !time_travel_two_partition_3_1 -- +4 David US 2 +5 Eva US 2 + +-- !time_travel_two_partition_4_0 -- + +-- !time_travel_two_partition_5_0 -- + +-- !time_travel_two_partition_6_1 -- +1 Alice US 1 + +-- !one_partition_boolean -- +1 Alice true +2 Bob true + +-- !one_partition_tinyint -- +1 Alice 1 +2 Bob 1 + +-- !one_partition_smallint -- +1 Alice 10 +2 Bob 10 + +-- !one_partition_int -- +1 Alice 100 +2 Bob 100 + +-- !one_partition_bigint -- +1 Alice 1234567890 +2 Bob 1234567890 + +-- !one_partition_string -- +1 Alice RegionA +2 Bob RegionA + +-- !one_partition_date -- +1 Alice 2023-12-01 +2 Bob 2023-12-01 + +-- !one_partition_1_1 -- +1 Alice 2024 +2 Bob 2024 +3 Charlie 2024 + +-- !one_partition_2_1 -- +4 David 2025 +5 Eva 2025 + +-- !one_partition_3_all -- +1 Alice 2024 +2 Bob 2024 +3 Charlie 2024 +4 David 2025 +5 Eva 2025 + +-- !one_partition_4_all -- +5 Eva 2025 + +-- !one_partition_5_1 -- +3 Charlie 2024 + +-- !two_partition_1_1 -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 + +-- !two_partition_2_1 -- +8 Hannah EU 2 +9 Ivy EU 2 +10 Jack EU 2 + +-- !two_partition_3_2 -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 +4 David US 2 +5 Eva US 2 + +-- !two_partition_4_all -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 +4 David US 2 +5 Eva US 2 +6 Frank EU 1 +7 Grace EU 1 +8 Hannah EU 2 +9 Ivy EU 2 +10 Jack EU 2 + +-- !two_partition_5_1 -- + +-- !two_partition_6_1 -- +8 Hannah EU 2 +9 Ivy EU 2 +10 Jack EU 2 + +-- !three_partition_1_1 -- +1 Alice US 2024 Q1 +2 Bob US 2024 Q1 +3 Charlie US 2024 Q1 + +-- !three_partition_2_1 -- +10 Jack EU 2025 Q2 +11 Leo EU 2025 Q2 + +-- !three_partition_3_3 -- +13 Nina AS 2025 Q1 +14 Oscar AS 2025 Q2 +15 Paul AS 2025 Q3 + +-- !three_partition_4_2 -- +1 Alice US 2024 Q1 +2 Bob US 2024 Q1 +3 Charlie US 2024 Q1 +6 Frank US 2025 Q1 + +-- !three_partition_5_all -- +1 Alice US 2024 Q1 +2 Bob US 2024 Q1 +3 Charlie US 2024 Q1 +4 David US 2024 Q2 +5 Eva US 2024 Q2 +6 Frank US 2025 Q1 +7 Grace US 2025 Q2 +8 Hannah EU 2024 Q1 +9 Ivy EU 2024 Q1 +10 Jack EU 2025 Q2 +11 Leo EU 2025 Q2 +12 Mia EU 2025 Q3 +13 Nina AS 2025 Q1 +14 Oscar AS 2025 Q2 +15 Paul AS 2025 Q3 + +-- !three_partition_6_1 -- +8 Hannah EU 2024 Q1 +9 Ivy EU 2024 Q1 + +-- !three_partition_7_7 -- +6 Frank US 2025 Q1 +7 Grace US 2025 Q2 +10 Jack EU 2025 Q2 +11 Leo EU 2025 Q2 +12 Mia EU 2025 Q3 +13 Nina AS 2025 Q1 +14 Oscar AS 2025 Q2 +15 Paul AS 2025 Q3 + +-- !three_partition_8_2 -- +7 Grace US 2025 Q2 + +-- !one_partition_6_0 -- + +-- !two_partition_7_0 -- + +-- !two_partition_8_0 -- + +-- !three_partition_9_0 -- + +-- !three_partition_10_0 -- + +-- !three_partition_11_0 -- + +-- !time_travel_two_partition_1_3 -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 +4 David US 2 +5 Eva US 2 +6 Frank EU 1 + +-- !time_travel_two_partition_2_2 -- +1 Alice US 1 +2 Bob US 1 +3 Charlie US 1 +4 David US 2 +5 Eva US 2 + +-- !time_travel_two_partition_3_1 -- +4 David US 2 +5 Eva US 2 + +-- !time_travel_two_partition_4_0 -- + +-- !time_travel_two_partition_5_0 -- + +-- !time_travel_two_partition_6_1 -- +1 Alice US 1 + +-- !one_partition_boolean -- +1 Alice true +2 Bob true + +-- !one_partition_tinyint -- +1 Alice 1 +2 Bob 1 + +-- !one_partition_smallint -- +1 Alice 10 +2 Bob 10 + +-- !one_partition_int -- +1 Alice 100 +2 Bob 100 + +-- !one_partition_bigint -- +1 Alice 1234567890 +2 Bob 1234567890 + +-- !one_partition_string -- +1 Alice RegionA +2 Bob RegionA + +-- !one_partition_date -- +1 Alice 2023-12-01 +2 Bob 2023-12-01 + diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_partition_prune.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_partition_prune.groovy new file mode 100644 index 00000000000000..eea17b01711b44 --- /dev/null +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_partition_prune.groovy @@ -0,0 +1,333 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hudi_partition_prune", "p2,external,hudi,external_remote,external_remote_hudi") { + String enabled = context.config.otherConfigs.get("enableExternalHudiTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("disable hudi test") + } + + String catalog_name = "test_hudi_partition_prune" + String props = context.config.otherConfigs.get("hudiEmrCatalog") + sql """drop catalog if exists ${catalog_name};""" + + for (String use_hive_sync_partition : ['true','false']) { + + sql """ + create catalog if not exists ${catalog_name} properties ( + ${props} + ,"use_hive_sync_partition"="${use_hive_sync_partition}" + ); + """ + + sql """ switch ${catalog_name};""" + sql """ use regression_hudi;""" + sql """ set enable_fallback_to_original_planner=false """ + + + + def one_partition_1_1 = """SELECT id,name,part1 FROM one_partition_tb WHERE part1 = 2024 ORDER BY id;""" + def one_partition_2_1 = """SELECT id,name,part1 FROM one_partition_tb WHERE part1 = 2025 ORDER BY id;""" + def one_partition_3_all = """SELECT id,name,part1 FROM one_partition_tb ORDER BY id;""" + def one_partition_4_all = """SELECT id,name,part1 FROM one_partition_tb WHERE id = 5 ORDER BY id;""" + def one_partition_5_1 = """SELECT id,name,part1 FROM one_partition_tb WHERE part1 = 2024 AND id >= 3 ORDER BY id;""" + + def two_partition_1_1 = """SELECT id,name,part1,part2 FROM two_partition_tb WHERE part1 = 'US' AND part2 = 1 ORDER BY id;""" + def two_partition_2_1 = """SELECT id,name,part1,part2 FROM two_partition_tb WHERE part1 = 'EU' AND part2 = 2 ORDER BY id;""" + def two_partition_3_2 = """SELECT id,name,part1,part2 FROM two_partition_tb WHERE part1 = 'US' ORDER BY id;""" + def two_partition_4_all = """SELECT id,name,part1,part2 FROM two_partition_tb ORDER BY id;""" + def two_partition_5_1 = """SELECT id,name,part1,part2 FROM two_partition_tb WHERE part1 = 'US' AND part2 = 2 AND id > 5 ORDER BY id;""" + def two_partition_6_1 = """SELECT id,name,part1,part2 FROM two_partition_tb WHERE part1 = 'EU' AND part2 = 2 ORDER BY id;""" + + def three_partition_1_1 = """SELECT id,name,part1,part2,part3 FROM three_partition_tb WHERE part1 = 'US' AND part2 = 2024 AND part3 = 'Q1' ORDER BY id;""" + def three_partition_2_1 = """SELECT id,name,part1,part2,part3 FROM three_partition_tb WHERE part1 = 'EU' AND part2 = 2025 AND part3 = 'Q2' ORDER BY id;""" + def three_partition_3_3 = """SELECT id,name,part1,part2,part3 FROM three_partition_tb WHERE part1 = 'AS' AND part2 = 2025 ORDER BY id;""" + def three_partition_4_2 = """SELECT id,name,part1,part2,part3 FROM three_partition_tb WHERE part1 = 'US' AND part3 = 'Q1' ORDER BY id;""" + def three_partition_5_all = """SELECT id,name,part1,part2,part3 FROM three_partition_tb ORDER BY id;""" + def three_partition_6_1 = """SELECT id,name,part1,part2,part3 FROM three_partition_tb WHERE part1 = 'EU' AND part2 = 2024 AND part3 = 'Q1' ORDER BY id;""" + def three_partition_7_7 = """SELECT id,name,part1,part2,part3 FROM three_partition_tb WHERE part2 = 2025 ORDER BY id;""" + def three_partition_8_2 = """SELECT id,name,part1,part2,part3 FROM three_partition_tb WHERE part1 = 'US' AND part3 = 'Q2' AND id BETWEEN 6 AND 10 ORDER BY id;""" + + def one_partition_boolean = """SELECT id,name,part1 FROM boolean_partition_tb WHERE part1 = true ORDER BY id;""" + def one_partition_tinyint = """SELECT id,name,part1 FROM tinyint_partition_tb WHERE part1 = 1 ORDER BY id;""" + def one_partition_smallint = """SELECT id,name,part1 FROM smallint_partition_tb WHERE part1 = 10 ORDER BY id;""" + def one_partition_int = """SELECT id,name,part1 FROM int_partition_tb WHERE part1 = 100 ORDER BY id;""" + def one_partition_bigint = """SELECT id,name,part1 FROM bigint_partition_tb WHERE part1 = 1234567890 ORDER BY id;""" + def one_partition_string = """SELECT id,name,part1 FROM string_partition_tb WHERE part1 = 'RegionA' ORDER BY id;""" + def one_partition_date = """SELECT id,name,part1 FROM date_partition_tb WHERE part1 = '2023-12-01' ORDER BY id;""" + def one_partition_timestamp = """SELECT id,name,part1 FROM timestamp_partition_tb WHERE part1 = '2023-12-01 08:00:00' ORDER BY id;""" + + + + qt_one_partition_1_1 one_partition_1_1 + explain { + sql("${one_partition_1_1}") + contains "partition=1/2" + } + + qt_one_partition_2_1 one_partition_2_1 + explain { + sql("${one_partition_2_1}") + contains "partition=1/2" + } + + qt_one_partition_3_all one_partition_3_all + explain { + sql("${one_partition_3_all}") + contains "partition=2/2" + } + + qt_one_partition_4_all one_partition_4_all + explain { + sql("${one_partition_4_all}") + contains "partition=2/2" + } + + qt_one_partition_5_1 one_partition_5_1 + explain { + sql("${one_partition_5_1}") + contains "partition=1/2" + } + + + qt_two_partition_1_1 two_partition_1_1 + explain { + sql("${two_partition_1_1}") + contains "partition=1/4" + } + + qt_two_partition_2_1 two_partition_2_1 + explain { + sql("${two_partition_2_1}") + contains "partition=1/4" + } + + qt_two_partition_3_2 two_partition_3_2 + explain { + sql("${two_partition_3_2}") + contains "partition=2/4" + } + + qt_two_partition_4_all two_partition_4_all + explain { + sql("${two_partition_4_all}") + contains "partition=4/4" + } + + qt_two_partition_5_1 two_partition_5_1 + explain { + sql("${two_partition_5_1}") + contains "partition=1/4" + } + + qt_two_partition_6_1 two_partition_6_1 + explain { + sql("${two_partition_6_1}") + contains "partition=1/4" + } + + + + qt_three_partition_1_1 three_partition_1_1 + explain { + sql("${three_partition_1_1}") + contains "partition=1/10" + } + + qt_three_partition_2_1 three_partition_2_1 + explain { + sql("${three_partition_2_1}") + contains "partition=1/10" + } + + qt_three_partition_3_3 three_partition_3_3 + explain { + sql("${three_partition_3_3}") + contains "partition=3/10" + } + + qt_three_partition_4_2 three_partition_4_2 + explain { + sql("${three_partition_4_2}") + contains "partition=2/10" + } + + qt_three_partition_5_all three_partition_5_all + explain { + sql("${three_partition_5_all}") + contains "partition=10/10" + } + + qt_three_partition_6_1 three_partition_6_1 + explain { + sql("${three_partition_6_1}") + contains "partition=1/10" + } + + qt_three_partition_7_7 three_partition_7_7 + explain { + sql("${three_partition_7_7}") + contains "partition=7/10" + } + + qt_three_partition_8_2 three_partition_8_2 + explain { + sql("${three_partition_8_2}") + contains "partition=2/10" + } + + + // 0 partitions + def one_partition_6_0 = """SELECT id,name,part1 FROM one_partition_tb WHERE part1 = 2023 ORDER BY id;""" + qt_one_partition_6_0 one_partition_6_0 + explain { + sql("${one_partition_6_0}") + contains "partition=0/2" + } + + def two_partition_7_0 = """SELECT id,name,part1 FROM two_partition_tb WHERE part1 = 'CN' AND part2 = 1 ORDER BY id;""" + qt_two_partition_7_0 two_partition_7_0 + explain { + sql("${two_partition_7_0}") + contains "partition=0/4" + } + + def two_partition_8_0 = """SELECT id,name,part1 FROM two_partition_tb WHERE part1 = 'US' AND part2 = 3 ORDER BY id;""" + qt_two_partition_8_0 two_partition_8_0 + explain { + sql("${two_partition_8_0}") + contains "partition=0/4" + } + + def three_partition_9_0 = """SELECT id,name,part1 FROM three_partition_tb WHERE part1 = 'US' AND part2 = 2023 AND part3 = 'Q1' ORDER BY id;""" + qt_three_partition_9_0 three_partition_9_0 + explain { + sql("${three_partition_9_0}") + contains "partition=0/10" + } + + def three_partition_10_0 = """SELECT id,name,part1 FROM three_partition_tb WHERE part1 = 'EU' AND part2 = 2024 AND part3 = 'Q4' ORDER BY id;""" + qt_three_partition_10_0 three_partition_10_0 + explain { + sql("${three_partition_10_0}") + contains "partition=0/10" + } + + def three_partition_11_0 = """SELECT id,name,part1 FROM three_partition_tb WHERE part1 = 'AS' AND part2 = 2025 AND part3 = 'Q4' ORDER BY id;""" + qt_three_partition_11_0 three_partition_11_0 + explain { + sql("${three_partition_11_0}") + contains "partition=0/10" + } + + + //time travel + def time_travel_two_partition_1_3 = "select id,name,part1,part2 from two_partition_tb FOR TIME AS OF '20241202171226401' order by id;" + def time_travel_two_partition_2_2 = "select id,name,part1,part2 from two_partition_tb FOR TIME AS OF '20241202171226401' where part1='US' order by id;" + def time_travel_two_partition_3_1 = "select id,name,part1,part2 from two_partition_tb FOR TIME AS OF '20241202171226401' where part2=2 order by id;" + def time_travel_two_partition_4_0 = "select id,name,part1,part2 from two_partition_tb FOR TIME AS OF '20241202171226401' where part2=10 order by id;" + + qt_time_travel_two_partition_1_3 time_travel_two_partition_1_3 + explain { + sql("${time_travel_two_partition_1_3}") + contains "partition=3/3" + } + + + qt_time_travel_two_partition_2_2 time_travel_two_partition_2_2 + explain { + sql("${time_travel_two_partition_2_2}") + contains "partition=2/3" + } + + qt_time_travel_two_partition_3_1 time_travel_two_partition_3_1 + explain { + sql("${time_travel_two_partition_3_1}") + contains "partition=1/3" + } + + qt_time_travel_two_partition_4_0 time_travel_two_partition_4_0 + explain { + sql("${time_travel_two_partition_4_0}") + contains "partition=0/3" + } + + + + + def time_travel_two_partition_5_0 = "select id,name,part1,part2 from two_partition_tb FOR TIME AS OF '20231126012025218' order by id;" + qt_time_travel_two_partition_5_0 time_travel_two_partition_5_0 + explain { + sql("${time_travel_two_partition_5_0}") + contains "partition=0/0" + } + + def time_travel_two_partition_6_1 = "select id,name,part1,part2 from two_partition_tb FOR TIME AS OF '20241202171214902' order by id;" + qt_time_travel_two_partition_6_1 time_travel_two_partition_6_1 + explain { + sql("${time_travel_two_partition_6_1}") + contains "partition=1/1" + } + + // all types as partition + qt_one_partition_boolean one_partition_boolean + explain { + sql("${one_partition_boolean}") + contains "partition=1/2" + } + qt_one_partition_tinyint one_partition_tinyint + explain { + sql("${one_partition_tinyint}") + contains "partition=1/2" + } + qt_one_partition_smallint one_partition_smallint + explain { + sql("${one_partition_smallint}") + contains "partition=1/2" + } + qt_one_partition_int one_partition_int + explain { + sql("${one_partition_int}") + contains "partition=1/2" + } + qt_one_partition_bigint one_partition_bigint + explain { + sql("${one_partition_bigint}") + contains "partition=1/2" + } + qt_one_partition_string one_partition_string + explain { + sql("${one_partition_string}") + contains "partition=1/2" + } + qt_one_partition_date one_partition_date + explain { + sql("${one_partition_date}") + contains "partition=1/2" + } + // qt_one_partition_timestamp one_partition_timestamp + // explain { + // sql("${one_partition_timestamp}") + // contains "partition=1/2" + // } + + sql """drop catalog if exists ${catalog_name};""" + + + } + +} \ No newline at end of file