-
Notifications
You must be signed in to change notification settings - Fork 751
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GOBBLIN-2159] Adding support for partition level copy in Iceberg distcp #4058
Changes from 15 commits
02ae2fc
981357c
7cd9353
82d10d3
c43d3e1
0cf7638
63bb9aa
6e1cf6b
065cde3
a13220d
e1d812f
4364044
66d81a3
24b4823
d8356e1
4dcc88b
46bd976
e1e6f57
b6163ba
6c73a25
9c35733
cdc863a
1dbe929
383ed91
942ad8d
6a4cf78
2adaa8b
c948854
a55ee61
1afc37a
bb35070
eeb8d25
675e8bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.iceberg.CatalogUtil; | ||
import org.apache.iceberg.Table; | ||
import org.apache.iceberg.TableOperations; | ||
import org.apache.iceberg.catalog.Catalog; | ||
import org.apache.iceberg.catalog.TableIdentifier; | ||
|
@@ -43,7 +44,10 @@ protected BaseIcebergCatalog(String catalogName, Class<? extends Catalog> compan | |
@Override | ||
public IcebergTable openTable(String dbName, String tableName) { | ||
TableIdentifier tableId = TableIdentifier.of(dbName, tableName); | ||
return new IcebergTable(tableId, calcDatasetDescriptorName(tableId), getDatasetDescriptorPlatform(), createTableOperations(tableId), this.getCatalogUri()); | ||
return new IcebergTable(tableId, calcDatasetDescriptorName(tableId), getDatasetDescriptorPlatform(), | ||
createTableOperations(tableId), | ||
this.getCatalogUri(), | ||
loadTableInstance(tableId)); | ||
} | ||
|
||
protected Catalog createCompanionCatalog(Map<String, String> properties, Configuration configuration) { | ||
|
@@ -67,4 +71,6 @@ protected String getDatasetDescriptorPlatform() { | |
} | ||
|
||
protected abstract TableOperations createTableOperations(TableIdentifier tableId); | ||
|
||
protected abstract Table loadTableInstance(TableIdentifier tableId); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for good measure you could also make I'm tempted to re-situate the exception as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed not throwing here instead catching NoSuchTableException in BaseIcebergCatalog::openTable and throwing IcebergTable.TableNotFoundException from there. |
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.gobblin.data.management.copy.iceberg; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.List; | ||
import java.util.Properties; | ||
import java.util.UUID; | ||
import java.util.function.Predicate; | ||
|
||
import org.apache.commons.collections.CollectionUtils; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.fs.FileStatus; | ||
import org.apache.hadoop.fs.FileSystem; | ||
import org.apache.hadoop.fs.Path; | ||
import org.apache.iceberg.DataFile; | ||
import org.apache.iceberg.DataFiles; | ||
import org.apache.iceberg.PartitionSpec; | ||
import org.apache.iceberg.StructLike; | ||
import org.apache.iceberg.TableMetadata; | ||
import org.apache.iceberg.TableProperties; | ||
import org.apache.iceberg.util.SerializationUtil; | ||
|
||
import com.google.common.collect.Lists; | ||
import com.google.common.collect.Maps; | ||
import com.google.common.base.Preconditions; | ||
|
||
import lombok.Data; | ||
import lombok.extern.slf4j.Slf4j; | ||
|
||
import org.apache.gobblin.data.management.copy.CopyConfiguration; | ||
import org.apache.gobblin.data.management.copy.CopyEntity; | ||
import org.apache.gobblin.data.management.copy.CopyableFile; | ||
import org.apache.gobblin.data.management.copy.entities.PostPublishStep; | ||
import org.apache.gobblin.data.management.copy.CopyableDataset; | ||
import org.apache.gobblin.data.management.copy.iceberg.predicates.IcebergPartitionFilterPredicateFactory; | ||
|
||
/** | ||
* Iceberg Partition dataset implementing {@link CopyableDataset} | ||
* <p> | ||
* This class extends {@link IcebergDataset} and provides functionality to filter partitions | ||
* and generate copy entities for partition based data movement. | ||
* </p> | ||
*/ | ||
@Slf4j | ||
public class IcebergPartitionDataset extends IcebergDataset { | ||
|
||
private static final String ICEBERG_PARTITION_NAME_KEY = "partition.name"; | ||
private final Predicate<StructLike> partitionFilterPredicate; | ||
|
||
public IcebergPartitionDataset(IcebergTable srcIcebergTable, IcebergTable destIcebergTable, Properties properties, | ||
FileSystem sourceFs, boolean shouldIncludeMetadataPath) throws IcebergTable.TableNotFoundException { | ||
super(srcIcebergTable, destIcebergTable, properties, sourceFs, shouldIncludeMetadataPath); | ||
|
||
String partitionColumnName = | ||
IcebergDatasetFinder.getLocationQualifiedProperty(properties, IcebergDatasetFinder.CatalogLocation.SOURCE, | ||
ICEBERG_PARTITION_NAME_KEY); | ||
Blazer-007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Preconditions.checkArgument(StringUtils.isNotEmpty(partitionColumnName), | ||
"Partition column name cannot be empty"); | ||
|
||
TableMetadata srcTableMetadata = getSrcIcebergTable().accessTableMetadata(); | ||
this.partitionFilterPredicate = IcebergPartitionFilterPredicateFactory.getFilterPredicate(partitionColumnName, | ||
srcTableMetadata, properties); | ||
} | ||
|
||
/** | ||
* Represents the destination file paths and the corresponding file status in source file system. | ||
* These both properties are used in creating {@link CopyEntity} | ||
*/ | ||
@Data | ||
protected static final class FilePathsWithStatus { | ||
private final Path destPath; | ||
private final FileStatus srcFileStatus; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the base class uses There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one could have been same, the reason i will mention in the reply of this comment - #4058 (comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AFAICT the so that still seems worth following here, rather than inventing a new one-off type to convey equivalent info. I'll leave the choice to you, but consistency is my recommendation. |
||
|
||
/** | ||
* Generates copy entities for partition based data movement. | ||
* It finds files specific to the partition and create destination data files based on the source data files. | ||
* Also updates the destination data files with destination table write data location and add UUID to the file path | ||
* to avoid conflicts. | ||
* | ||
* @param targetFs the target file system | ||
* @param copyConfig the copy configuration | ||
* @return a collection of copy entities | ||
* @throws IOException if an I/O error occurs | ||
*/ | ||
@Override | ||
Collection<CopyEntity> generateCopyEntities(FileSystem targetFs, CopyConfiguration copyConfig) throws IOException { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this impl is really, really similar to the one it's based on in its base class. deriving from a class and then overriding methods w/ only small changes is pretty nearly cut-and-paste code. sometimes it's inevitable, but let's avoid when we can. in this case, could we NOT override this method, but only
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will list down my reason here -
To conclude it - |
||
String fileSet = this.getFileSetId(); | ||
List<CopyEntity> copyEntities = Lists.newArrayList(); | ||
IcebergTable srcIcebergTable = getSrcIcebergTable(); | ||
List<DataFile> srcDataFiles = srcIcebergTable.getPartitionSpecificDataFiles(this.partitionFilterPredicate); | ||
List<DataFile> destDataFiles = getDestDataFiles(srcDataFiles); | ||
Configuration defaultHadoopConfiguration = new Configuration(); | ||
|
||
for (FilePathsWithStatus filePathsWithStatus : getFilePathsStatus(srcDataFiles, destDataFiles, this.sourceFs)) { | ||
Path destPath = filePathsWithStatus.getDestPath(); | ||
FileStatus srcFileStatus = filePathsWithStatus.getSrcFileStatus(); | ||
FileSystem actualSourceFs = getSourceFileSystemFromFileStatus(srcFileStatus, defaultHadoopConfiguration); | ||
Blazer-007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
CopyableFile fileEntity = CopyableFile.fromOriginAndDestination( | ||
actualSourceFs, srcFileStatus, targetFs.makeQualified(destPath), copyConfig) | ||
.fileSet(fileSet) | ||
.datasetOutputPath(targetFs.getUri().getPath()) | ||
.build(); | ||
Comment on lines
+115
to
+119
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you skip first doing this, like in
is that intentional? do you feel it's not necessary or actually contra-indicated? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the IcebergDataset the path of tables are exactly since table UUID are same on source and destination here it can be different, so copying permissions atleast in first draft is not necessary I believe. Even if there is need that we need to make sure ancestor path, parent path are ones we want, that's why I have removed it for now. |
||
|
||
fileEntity.setSourceData(getSourceDataset(this.sourceFs)); | ||
fileEntity.setDestinationData(getDestinationDataset(targetFs)); | ||
copyEntities.add(fileEntity); | ||
} | ||
|
||
// Adding this check to avoid adding post publish step when there are no files to copy. | ||
if (CollectionUtils.isNotEmpty(destDataFiles)) { | ||
copyEntities.add(createPostPublishStep(destDataFiles)); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree this is one difference with |
||
|
||
log.info("~{}~ generated {} copy--entities", fileSet, copyEntities.size()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the two dashes between |
||
return copyEntities; | ||
} | ||
|
||
private List<DataFile> getDestDataFiles(List<DataFile> srcDataFiles) throws IcebergTable.TableNotFoundException { | ||
List<DataFile> destDataFiles = new ArrayList<>(); | ||
if (srcDataFiles.isEmpty()) { | ||
return destDataFiles; | ||
} | ||
TableMetadata srcTableMetadata = getSrcIcebergTable().accessTableMetadata(); | ||
TableMetadata destTableMetadata = getDestIcebergTable().accessTableMetadata(); | ||
PartitionSpec partitionSpec = destTableMetadata.spec(); | ||
String srcWriteDataLocation = srcTableMetadata.property(TableProperties.WRITE_DATA_LOCATION, ""); | ||
String destWriteDataLocation = destTableMetadata.property(TableProperties.WRITE_DATA_LOCATION, ""); | ||
if (StringUtils.isEmpty(srcWriteDataLocation) || StringUtils.isEmpty(destWriteDataLocation)) { | ||
log.warn( | ||
String.format("Either source or destination table does not have write data location : source table write data location : {%s} , destination table write data location : {%s}", | ||
srcWriteDataLocation, | ||
destWriteDataLocation | ||
) | ||
); | ||
Blazer-007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
// tableMetadata.property(TableProperties.WRITE_DATA_LOCATION, "") returns null if the property is not set and | ||
// doesn't respect passed default value, so to avoid NPE in .replace() we are setting it to empty string. | ||
String prefixToBeReplaced = (srcWriteDataLocation != null) ? srcWriteDataLocation : ""; | ||
String prefixToReplaceWith = (destWriteDataLocation != null) ? destWriteDataLocation : ""; | ||
Blazer-007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
srcDataFiles.forEach(dataFile -> { | ||
String curDestFilePath = dataFile.path().toString(); | ||
String newDestFilePath = curDestFilePath.replace(prefixToBeReplaced, prefixToReplaceWith); | ||
String updatedDestFilePath = addUUIDToPath(newDestFilePath); | ||
Blazer-007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
destDataFiles.add(DataFiles.builder(partitionSpec) | ||
.copy(dataFile) | ||
.withPath(updatedDestFilePath) | ||
.build()); | ||
}); | ||
return destDataFiles; | ||
} | ||
|
||
private String addUUIDToPath(String filePathStr) { | ||
Path filePath = new Path(filePathStr); | ||
String fileDir = filePath.getParent().toString(); | ||
String fileName = filePath.getName(); | ||
String newFileName = UUID.randomUUID() + "-" + fileName; | ||
return String.join("/", fileDir, newFileName); | ||
Blazer-007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
private List<FilePathsWithStatus> getFilePathsStatus(List<DataFile> srcDataFiles, List<DataFile> destDataFiles, FileSystem fs) throws IOException { | ||
List<FilePathsWithStatus> filePathsStatus = new ArrayList<>(); | ||
for (int i = 0; i < srcDataFiles.size(); i++) { | ||
Path srcPath = new Path(srcDataFiles.get(i).path().toString()); | ||
Path destPath = new Path(destDataFiles.get(i).path().toString()); | ||
FileStatus srcFileStatus = fs.getFileStatus(srcPath); | ||
filePathsStatus.add(new FilePathsWithStatus(destPath, srcFileStatus)); | ||
} | ||
return filePathsStatus; | ||
} | ||
|
||
private PostPublishStep createPostPublishStep(List<DataFile> destDataFiles) { | ||
|
||
byte[] serializedDataFiles = SerializationUtil.serializeToBytes(destDataFiles); | ||
|
||
IcebergReplacePartitionsStep icebergReplacePartitionsStep = new IcebergReplacePartitionsStep( | ||
this.getDestIcebergTable().getTableId().toString(), | ||
serializedDataFiles, | ||
this.properties); | ||
|
||
return new PostPublishStep(this.getFileSetId(), Maps.newHashMap(), icebergReplacePartitionsStep, 0); | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.gobblin.data.management.copy.iceberg; | ||
|
||
import java.io.IOException; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Properties; | ||
|
||
import org.apache.commons.lang3.StringUtils; | ||
import org.apache.hadoop.fs.FileSystem; | ||
|
||
import com.google.common.base.Preconditions; | ||
|
||
import lombok.extern.slf4j.Slf4j; | ||
|
||
/** | ||
* Finder class for locating and creating partitioned Iceberg datasets. | ||
* <p> | ||
* This class extends {@link IcebergDatasetFinder} and provides functionality to create | ||
* {@link IcebergPartitionDataset} instances based on the specified source and destination Iceberg catalogs. | ||
* </p> | ||
*/ | ||
@Slf4j | ||
public class IcebergPartitionDatasetFinder extends IcebergDatasetFinder { | ||
public IcebergPartitionDatasetFinder(FileSystem sourceFs, Properties properties) { | ||
super(sourceFs, properties); | ||
} | ||
|
||
/** | ||
* Finds the {@link IcebergPartitionDataset}s in the file system using the Iceberg Catalog. Both Iceberg database name and table | ||
* name are mandatory based on current implementation. | ||
* <p> | ||
* Overriding this method to put a check whether source and destination db & table names are passed in the properties as separate values | ||
* </p> | ||
* @return List of {@link IcebergPartitionDataset}s in the file system. | ||
* @throws IOException if there is an error while finding the datasets. | ||
*/ | ||
@Override | ||
public List<IcebergDataset> findDatasets() throws IOException { | ||
String srcDbName = getLocationQualifiedProperty(this.properties, CatalogLocation.SOURCE, ICEBERG_DB_NAME_KEY); | ||
String destDbName = getLocationQualifiedProperty(this.properties, CatalogLocation.DESTINATION, ICEBERG_DB_NAME_KEY); | ||
String srcTableName = getLocationQualifiedProperty(this.properties, CatalogLocation.SOURCE, ICEBERG_TABLE_NAME_KEY); | ||
String destTableName = getLocationQualifiedProperty(this.properties, CatalogLocation.DESTINATION, ICEBERG_TABLE_NAME_KEY); | ||
Blazer-007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if (StringUtils.isBlank(srcDbName) || StringUtils.isBlank(destDbName) || StringUtils.isBlank(srcTableName) | ||
|| StringUtils.isBlank(destTableName)) { | ||
String errorMsg = String.format("Missing (at least some) IcebergDataset properties - source: ('%s' and '%s') and destination: ('%s' and '%s') ", | ||
srcDbName, srcTableName, destDbName, destTableName); | ||
log.error(errorMsg); | ||
throw new IllegalArgumentException(errorMsg); | ||
} | ||
|
||
IcebergCatalog srcIcebergCatalog = createIcebergCatalog(this.properties, CatalogLocation.SOURCE); | ||
IcebergCatalog destIcebergCatalog = createIcebergCatalog(this.properties, CatalogLocation.DESTINATION); | ||
|
||
return Collections.singletonList(createIcebergDataset( | ||
srcIcebergCatalog, srcDbName, srcTableName, | ||
destIcebergCatalog, destDbName, destTableName, | ||
this.properties, this.sourceFs | ||
)); | ||
} | ||
|
||
/** | ||
* Creates an {@link IcebergPartitionDataset} instance for the specified source and destination Iceberg tables. | ||
*/ | ||
@Override | ||
protected IcebergDataset createIcebergDataset(IcebergCatalog sourceIcebergCatalog, String srcDbName, String srcTableName, IcebergCatalog destinationIcebergCatalog, String destDbName, String destTableName, Properties properties, FileSystem fs) throws IOException { | ||
IcebergTable srcIcebergTable = sourceIcebergCatalog.openTable(srcDbName, srcTableName); | ||
Preconditions.checkArgument(sourceIcebergCatalog.tableAlreadyExists(srcIcebergTable), | ||
String.format("Missing Source Iceberg Table: {%s}.{%s}", srcDbName, srcTableName)); | ||
IcebergTable destIcebergTable = destinationIcebergCatalog.openTable(destDbName, destTableName); | ||
Preconditions.checkArgument(destinationIcebergCatalog.tableAlreadyExists(destIcebergTable), | ||
String.format("Missing Destination Iceberg Table: {%s}.{%s}", destDbName, destTableName)); | ||
// TODO: Add Validator for source and destination tables later | ||
// TableMetadata srcTableMetadata = srcIcebergTable.accessTableMetadata(); | ||
// TableMetadata destTableMetadata = destIcebergTable.accessTableMetadata(); | ||
// IcebergTableMetadataValidator.validateSourceAndDestinationTablesMetadata(srcTableMetadata, destTableMetadata); | ||
Blazer-007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return new IcebergPartitionDataset(srcIcebergTable, destIcebergTable, properties, fs, getConfigShouldCopyMetadataPath(properties)); | ||
Blazer-007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I noticed a test now validating a throw of
org.apache.iceberg.exceptions.NoSuchTableException
. does that arise from this call toloadTableInstance
?wherever it is, let's catch it, so it exceptions from
org.apache.iceberg
don't bleed through. locate where and re-wrap with (our own)IcebergTable.TableNotFoundException
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes it arise from
loadTableInstance
or more specifically fromcatalog.loadTable(tableIdentifier)
and i dont believe it will be easy to catch it and rethrow asIcebergTable.TableNotFoundException
because this exception arises before creating IcebergTable instance itself as opposite to catalog.newTableOps(tableIdentifier) which doesn't throw this exception while initializing rather it throws when used for first time inside IcebergTable.Please let me know your thoughts on this if we really want to throw
IcebergTable.TableNotFoundException
itself in catalog onlyThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it's OK to raise the exception at creation time rather than upon first-use. in fact, that's arguably even better
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
e.g.