From 866021d7d34f274349ce7de1f29d113395e7f28c Mon Sep 17 00:00:00 2001 From: Hanzhi Wang Date: Tue, 23 Apr 2024 03:21:20 -0700 Subject: [PATCH] Hive: turn off the stats gathering when iceberg.hive.keep.stats is false (#10148) --- .../iceberg/hive/HiveTableOperations.java | 1 + .../TestHiveIcebergStorageHandlerNoScan.java | 4 +- ...stHiveIcebergWithHiveAutogatherEnable.java | 185 ++++++++++++++++++ 3 files changed, 188 insertions(+), 2 deletions(-) create mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergWithHiveAutogatherEnable.java diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java index 75d59de75d4d..5293f915407e 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java @@ -235,6 +235,7 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { if (!keepHiveStats) { tbl.getParameters().remove(StatsSetupConst.COLUMN_STATS_ACCURATE); + tbl.getParameters().put(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE); } lock.ensureActive(); diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java index 534cc7d7476c..328b9f3b5b95 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java @@ -776,7 +776,7 @@ public void testIcebergAndHmsTableProperties() throws Exception { if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { assertThat(hmsParams) - .hasSize(14) + .hasSize(15) .containsEntry("custom_property", "initial_val") .containsEntry(InputFormatConfig.EXTERNAL_TABLE_PURGE, "TRUE") .containsEntry("EXTERNAL", "TRUE") @@ -819,7 +819,7 @@ public void testIcebergAndHmsTableProperties() throws Exception { if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { assertThat(hmsParams) - .hasSize(17) + .hasSize(18) .containsEntry("new_prop_1", "true") .containsEntry("new_prop_2", "false") .containsEntry("custom_property", "new_val"); diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergWithHiveAutogatherEnable.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergWithHiveAutogatherEnable.java new file mode 100644 index 000000000000..6b3bddd637c2 --- /dev/null +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergWithHiveAutogatherEnable.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.mr.hive; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Collection; +import org.apache.hadoop.hive.common.StatsSetupConst; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.ConfigProperties; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestHiveIcebergWithHiveAutogatherEnable { + + @Parameters(name = "fileFormat={0}, catalog={1}") + public static Collection parameters() { + Collection testParams = Lists.newArrayList(); + // Run tests with every FileFormat for a single Catalog (HiveCatalog) + for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) { + testParams.add(new Object[] {fileFormat, TestTables.TestTableType.HIVE_CATALOG}); + } + return testParams; + } + + private static TestHiveShell shell; + + private TestTables testTables; + + @Parameter(index = 0) + private FileFormat fileFormat; + + @Parameter(index = 1) + private TestTables.TestTableType testTableType; + + @TempDir private Path temp; + + @BeforeAll + public static void beforeClass() { + // The hive configuration HIVESTATSAUTOGATHER must be set to true from hive engine + shell = + HiveIcebergStorageHandlerTestUtils.shell( + ImmutableMap.of(HiveConf.ConfVars.HIVESTATSAUTOGATHER.varname, "true")); + } + + @AfterAll + public static void afterClass() throws Exception { + shell.stop(); + } + + @BeforeEach + public void before() throws IOException { + testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); + HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "mr"); + } + + @AfterEach + public void after() throws Exception { + HiveIcebergStorageHandlerTestUtils.close(shell); + } + + @TestTemplate + public void testHiveStatsAutogatherWhenCreateNewTable() throws Exception { + // Create a Catalog where the KEEP_HIVE_STATS is false + shell.metastore().hiveConf().set(ConfigProperties.KEEP_HIVE_STATS, StatsSetupConst.FALSE); + TestTables hiveStatsDisabledTestTables = + HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); + + TableIdentifier identifierWithoutStats = + TableIdentifier.of("default", "customers_without_stats"); + + // To validate the stats augother is disabled from Hive engine, the creation of iceberg table + // cannot have any records. Otherwise, the table parameters TOTAL_SIZE and NUM_FILES are + // added by Iceberg when inserting records. + hiveStatsDisabledTestTables.createTable( + shell, + identifierWithoutStats.name(), + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + ImmutableList.of()); + + // The table parameter TOTAL_SIZE is removed from hive engine + String totalSize = + shell + .metastore() + .getTable(identifierWithoutStats) + .getParameters() + .get(StatsSetupConst.TOTAL_SIZE); + assertThat(totalSize).isNull(); + + // The table parameter NUM_FILES is removed from hive engine + String numFiles = + shell + .metastore() + .getTable(identifierWithoutStats) + .getParameters() + .get(StatsSetupConst.NUM_FILES); + assertThat(numFiles).isNull(); + + // The table parameter DO_NOT_UPDATE_STATS is removed from hive engine + String stats = + shell + .metastore() + .getTable(identifierWithoutStats) + .getParameters() + .get(StatsSetupConst.DO_NOT_UPDATE_STATS); + assertThat(stats).isNull(); + + // Create a Catalog where the KEEP_HIVE_STATS is true + shell.metastore().hiveConf().set(ConfigProperties.KEEP_HIVE_STATS, StatsSetupConst.TRUE); + TestTables keepHiveStatsTestTables = + HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); + + TableIdentifier identifierWithStats = TableIdentifier.of("default", "customers_with_stats"); + + // To validate the stats augother is enabled from Hive engine, the creation of iceberg table + // cannot have any records. Otherwise, the table parameters TOTAL_SIZE and NUM_FILES are + // added by Iceberg when inserting records. + keepHiveStatsTestTables.createTable( + shell, + identifierWithStats.name(), + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + ImmutableList.of()); + + // The table parameter DO_NOT_UPDATE_STATS doesn't exist + stats = + shell + .metastore() + .getTable(identifierWithStats) + .getParameters() + .get(StatsSetupConst.DO_NOT_UPDATE_STATS); + assertThat(stats).isNull(); + + // The table parameter NUM_FILES is gathered from hive engine + numFiles = + shell + .metastore() + .getTable(identifierWithStats) + .getParameters() + .get(StatsSetupConst.NUM_FILES); + assertThat(numFiles).isEqualTo("1"); + + // The table parameter TOTAL_SIZE is gathered from hive engine + numFiles = + shell + .metastore() + .getTable(identifierWithStats) + .getParameters() + .get(StatsSetupConst.TOTAL_SIZE); + assertThat(numFiles).isNotNull(); + } +}