Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GOBBLIN-2159] Adding support for partition level copy in Iceberg distcp #4058

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
Expand All @@ -43,7 +44,10 @@ protected BaseIcebergCatalog(String catalogName, Class<? extends Catalog> compan
@Override
public IcebergTable openTable(String dbName, String tableName) {
TableIdentifier tableId = TableIdentifier.of(dbName, tableName);
return new IcebergTable(tableId, calcDatasetDescriptorName(tableId), getDatasetDescriptorPlatform(), createTableOperations(tableId), this.getCatalogUri());
return new IcebergTable(tableId, calcDatasetDescriptorName(tableId), getDatasetDescriptorPlatform(),
createTableOperations(tableId),
this.getCatalogUri(),
loadTableInstance(tableId));
}

protected Catalog createCompanionCatalog(Map<String, String> properties, Configuration configuration) {
Expand All @@ -67,4 +71,6 @@ protected String getDatasetDescriptorPlatform() {
}

protected abstract TableOperations createTableOperations(TableIdentifier tableId);

protected abstract Table loadTableInstance(TableIdentifier tableId);
}
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public String getConfigPrefix() {
}

protected final FileSystem sourceFs;
private final Properties properties;
protected final Properties properties;

/**
* Finds all {@link IcebergDataset}s in the file system using the Iceberg Catalog.
Expand Down Expand Up @@ -153,7 +153,7 @@ protected IcebergDataset createIcebergDataset(IcebergCatalog sourceIcebergCatalo
IcebergTable destIcebergTable = destinationIcebergCatalog.openTable(destDbName, destTableName);
// TODO: Rethink strategy to enforce dest iceberg table
Preconditions.checkArgument(destinationIcebergCatalog.tableAlreadyExists(destIcebergTable), String.format("Missing Destination Iceberg Table: {%s}.{%s}", destDbName, destTableName));
return new IcebergDataset(srcIcebergTable, destIcebergTable, properties, fs, getConfigShouldCopyMetadataPath(properties));
return createSpecificDataset(srcIcebergTable, destIcebergTable, properties, fs, getConfigShouldCopyMetadataPath(properties));
}

protected static IcebergCatalog createIcebergCatalog(Properties properties, CatalogLocation location) throws IOException {
Expand All @@ -165,6 +165,11 @@ protected static IcebergCatalog createIcebergCatalog(Properties properties, Cata
return IcebergCatalogFactory.create(icebergCatalogClassName, catalogProperties, configuration);
}

protected IcebergDataset createSpecificDataset(IcebergTable srcIcebergTable, IcebergTable destIcebergTable, Properties properties, FileSystem fs, boolean shouldIncludeMetadataPath)
throws IcebergTable.TableNotFoundException {
return new IcebergDataset(srcIcebergTable, destIcebergTable, properties, fs, shouldIncludeMetadataPath);
}

protected static boolean getConfigShouldCopyMetadataPath(Properties properties) {
return Boolean.valueOf(properties.getProperty(ICEBERG_DATASET_SHOULD_COPY_METADATA_PATH, DEFAULT_ICEBERG_DATASET_SHOULD_COPY_METADATA_PATH));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.hive.HiveCatalog;
Expand Down Expand Up @@ -61,4 +62,9 @@ protected TableOperations createTableOperations(TableIdentifier tableId) {
public boolean tableAlreadyExists(IcebergTable icebergTable) {
return hc.tableExists(icebergTable.getTableId());
}

@Override
protected Table loadTableInstance(TableIdentifier tableId) {
return hc.loadTable(tableId);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.gobblin.data.management.copy.iceberg;

import java.io.IOException;
import java.time.Duration;
import java.util.List;
import java.util.Optional;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.ExecutionException;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.util.SerializationUtil;

import com.github.rholder.retry.Attempt;
import com.github.rholder.retry.RetryException;
import com.github.rholder.retry.RetryListener;
import com.github.rholder.retry.Retryer;
import com.google.common.collect.ImmutableMap;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

import lombok.extern.slf4j.Slf4j;

import org.apache.gobblin.commit.CommitStep;
import org.apache.gobblin.util.retry.RetryerFactory;

import static org.apache.gobblin.util.retry.RetryerFactory.RETRY_INTERVAL_MS;
import static org.apache.gobblin.util.retry.RetryerFactory.RETRY_TIMES;
import static org.apache.gobblin.util.retry.RetryerFactory.RETRY_TYPE;
import static org.apache.gobblin.util.retry.RetryerFactory.RetryType;

/**
* Commit step for overwriting partitions in an Iceberg table.
* <p>
* This class implements the {@link CommitStep} interface and provides functionality to overwrite
* partitions in the destination Iceberg table using serialized data files.
* </p>
*/
@Slf4j
public class IcebergOverwritePartitionsStep implements CommitStep {
private final String destTableIdStr;
private final Properties properties;
private final byte[] serializedDataFiles;
private final String partitionColName;
private final String partitionValue;
public static final String OVERWRITE_PARTITIONS_RETRYER_CONFIG_PREFIX = IcebergDatasetFinder.ICEBERG_DATASET_PREFIX +
".catalog.overwrite.partitions.retries";
private static final Config RETRYER_FALLBACK_CONFIG = ConfigFactory.parseMap(ImmutableMap.of(
RETRY_INTERVAL_MS, TimeUnit.SECONDS.toMillis(3L),
RETRY_TIMES, 3,
RETRY_TYPE, RetryType.FIXED_ATTEMPT.name()));

/**
* Constructs an {@code IcebergReplacePartitionsStep} with the specified parameters.
*
* @param destTableIdStr the identifier of the destination table as a string
* @param serializedDataFiles the serialized data files to be used for replacing partitions
* @param properties the properties containing configuration
*/
public IcebergOverwritePartitionsStep(String destTableIdStr, String partitionColName, String partitionValue, byte[] serializedDataFiles, Properties properties) {
this.destTableIdStr = destTableIdStr;
this.partitionColName = partitionColName;
this.partitionValue = partitionValue;
this.serializedDataFiles = serializedDataFiles;
this.properties = properties;
}

@Override
public boolean isCompleted() {
return false;
}

/**
* Executes the partition replacement in the destination Iceberg table.
* Also, have retry mechanism as done in {@link IcebergRegisterStep#execute()}
*
* @throws IOException if an I/O error occurs during execution
*/
@Override
public void execute() throws IOException {
IcebergTable destTable = createDestinationCatalog().openTable(TableIdentifier.parse(this.destTableIdStr));
List<DataFile> dataFiles = SerializationUtil.deserializeFromBytes(this.serializedDataFiles);
try {
log.info("Overwriting Data files of partition {} with value {} for destination table : {} ",
this.partitionColName,
this.partitionValue,
this.destTableIdStr
);
Retryer<Void> overwritePartitionsRetryer = createOverwritePartitionsRetryer();
overwritePartitionsRetryer.call(() -> {
destTable.overwritePartitions(dataFiles, this.partitionColName, this.partitionValue);
return null;
});
log.info("Overwriting Data files completed for partition {} with value {} for destination table : {} ",
this.partitionColName,
this.partitionValue,
this.destTableIdStr
);
} catch (ExecutionException executionException) {
String msg = String.format("Failed to overwrite partitions for destination iceberg table : {%s}", this.destTableIdStr);
log.error(msg, executionException);
throw new RuntimeException(msg, executionException.getCause());
} catch (RetryException retryException) {
String interruptedNote = Thread.currentThread().isInterrupted() ? "... then interrupted" : "";
String msg = String.format("Failed to overwrite partition for destination table : {%s} : (retried %d times) %s ",
this.destTableIdStr,
retryException.getNumberOfFailedAttempts(),
interruptedNote);
Throwable informativeException = retryException.getLastFailedAttempt().hasException()
? retryException.getLastFailedAttempt().getExceptionCause()
: retryException;
log.error(msg, informativeException);
throw new RuntimeException(msg, informativeException);
}
}

protected IcebergCatalog createDestinationCatalog() throws IOException {
return IcebergDatasetFinder.createIcebergCatalog(this.properties, IcebergDatasetFinder.CatalogLocation.DESTINATION);
}

private Retryer<Void> createOverwritePartitionsRetryer() {
Config config = ConfigFactory.parseProperties(this.properties);
Config retryerOverridesConfig = config.hasPath(IcebergOverwritePartitionsStep.OVERWRITE_PARTITIONS_RETRYER_CONFIG_PREFIX)
? config.getConfig(IcebergOverwritePartitionsStep.OVERWRITE_PARTITIONS_RETRYER_CONFIG_PREFIX)
: ConfigFactory.empty();

return RetryerFactory.newInstance(retryerOverridesConfig.withFallback(RETRYER_FALLBACK_CONFIG), Optional.of(new RetryListener() {
@Override
public <V> void onRetry(Attempt<V> attempt) {
if (attempt.hasException()) {
String msg = String.format("Exception caught while overwriting partitions for destination table : {%s} : [attempt: %d; %s after start]",
destTableIdStr,
attempt.getAttemptNumber(),
Duration.ofMillis(attempt.getDelaySinceFirstAttempt()).toString());
log.warn(msg, attempt.getExceptionCause());
}
}
}));
}
}
Loading