Support for REPLACE TABLE operation #433
base: main
Changes from 27 commits
@@ -39,16 +39,25 @@
from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, NotInstalledError, TableAlreadyExistsError
from pyiceberg.io import FileIO, load_file_io
from pyiceberg.manifest import ManifestFile
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec, assign_fresh_partition_spec_ids
from pyiceberg.schema import Schema, assign_fresh_schema_ids
from pyiceberg.serializers import ToOutputFile
from pyiceberg.table import (
    AddPartitionSpecUpdate,
    AddSchemaUpdate,
    AddSortOrderUpdate,
    AssertTableUUID,
    CommitTableRequest,
    CommitTableResponse,
    SetCurrentSchemaUpdate,
    SetDefaultSortOrderUpdate,
    SetDefaultSpecUpdate,
    Table,
    TableMetadata,
    TableRequirement,
    TableUpdate,
)
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder, assign_fresh_sort_order_ids
from pyiceberg.typedef import (
    EMPTY_DICT,
    Identifier,
@@ -646,6 +655,47 @@ def purge_table(self, identifier: Union[str, Identifier]) -> None:
        delete_files(io, prev_metadata_files, PREVIOUS_METADATA)
        delete_files(io, {table.metadata_location}, METADATA)

    def create_or_replace_table(
        self,
        identifier: Union[str, Identifier],
        schema: Union[Schema, "pa.Schema"],
        location: Optional[str] = None,
        partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
        sort_order: SortOrder = UNSORTED_SORT_ORDER,
        properties: Properties = EMPTY_DICT,
    ) -> Table:
        """Create a new table or replace an existing one. Replacing the table retains the table metadata history.

        Args:
            identifier (str | Identifier): Table identifier.
            schema (Schema): Table's schema.
            location (str | None): Location for the table. Optional argument.
            partition_spec (PartitionSpec): PartitionSpec for the table.
            sort_order (SortOrder): SortOrder for the table.
            properties (Properties): Table properties that can be a string based dictionary.

        Returns:
            Table: the new table instance.
        """
        try:
            return self._replace_table(
                identifier=identifier,
                new_schema=schema,
                new_location=location,
                new_partition_spec=partition_spec,
                new_sort_order=sort_order,
                new_properties=properties,
            )
        except NoSuchTableError:
            return self.create_table(
                identifier=identifier,
                schema=schema,
                location=location,
                partition_spec=partition_spec,
                sort_order=sort_order,
                properties=properties,
            )

    @staticmethod
    def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> None:
        ToOutputFile.table_metadata(metadata, io.new_output(metadata_path))
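For reference, a minimal usage sketch of the new API. The catalog name, namespace, table name, and schema below are illustrative only and not taken from this PR:

from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

# Illustrative setup: assumes a configured catalog named "default" and an
# existing namespace "examples".
catalog = load_catalog("default")

schema = Schema(
    NestedField(field_id=1, name="name", field_type=StringType(), required=False),
    NestedField(field_id=2, name="age", field_type=LongType(), required=False),
)

# Creates examples.users if it does not exist; otherwise replaces its schema,
# partition spec, sort order, and properties while keeping the metadata history.
table = catalog.create_or_replace_table(identifier="examples.users", schema=schema)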
@@ -710,6 +760,45 @@ def _get_updated_props_and_update_summary(

        return properties_update_summary, updated_properties

    def _replace_table(
        self,
        identifier: Union[str, Identifier],
        new_schema: Union[Schema, "pa.Schema"],
        new_partition_spec: PartitionSpec,
        new_sort_order: SortOrder,
        new_properties: Properties,
        new_location: Optional[str] = None,
    ) -> Table:
        table = self.load_table(identifier)
        with table.transaction() as tx:
            base_schema = table.schema()
            new_schema = assign_fresh_schema_ids(schema_or_type=new_schema, base_schema=base_schema)
            new_sort_order = assign_fresh_sort_order_ids(
                sort_order=new_sort_order,
                old_schema=base_schema,
                fresh_schema=new_schema,
                sort_order_id=table.sort_order().order_id + 1,
            )
            new_partition_spec = assign_fresh_partition_spec_ids(
                spec=new_partition_spec, old_schema=base_schema, fresh_schema=new_schema, spec_id=table.spec().spec_id + 1
            )

            requirements: Tuple[TableRequirement, ...] = (AssertTableUUID(uuid=table.metadata.table_uuid),)
            updates: Tuple[TableUpdate, ...] = (
Review comment: We need to clear the snapshots here as well:

CREATE TABLE default.t3 AS SELECT 'Fokko' as name

{
"format-version" : 2,
"table-uuid" : "c61404c8-2211-46c7-866f-2eb87022b728",
"location" : "s3://warehouse/default/t3",
"last-sequence-number" : 1,
"last-updated-ms" : 1710409653861,
"last-column-id" : 1,
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "name",
"required" : false,
"type" : "string"
} ]
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "root",
"created-at" : "2024-03-14T09:47:10.455199504Z",
"write.parquet.compression-codec" : "zstd"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ {
"sequence-number" : 1,
"snapshot-id" : 3622792816294171432,
"timestamp-ms" : 1710409631964,
"summary" : {
"operation" : "append",
"spark.app.id" : "local-1710405058122",
"added-data-files" : "1",
"added-records" : "1",
"added-files-size" : "416",
"changed-partition-count" : "1",
"total-records" : "1",
"total-files-size" : "416",
"total-data-files" : "1",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "s3://warehouse/default/t3/metadata/snap-3622792816294171432-1-e457c732-62e5-41eb-998b-abbb8f021ed5.avro",
"schema-id" : 0
} ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ {
"timestamp-ms" : 1710409631964,
"metadata-file" : "s3://warehouse/default/t3/metadata/00000-c82191f3-e6e2-4001-8e85-8623e3915ff7.metadata.json"
} ]
}

CREATE OR REPLACE TABLE default.t3 (name string);

{
"format-version" : 2,
"table-uuid" : "c61404c8-2211-46c7-866f-2eb87022b728",
"location" : "s3://warehouse/default/t3",
"last-sequence-number" : 1,
"last-updated-ms" : 1710411760623,
"last-column-id" : 1,
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "name",
"required" : false,
"type" : "string"
} ]
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "root",
"created-at" : "2024-03-14T09:47:10.455199504Z",
"write.parquet.compression-codec" : "zstd"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ {
"sequence-number" : 1,
"snapshot-id" : 3622792816294171432,
"timestamp-ms" : 1710409631964,
"summary" : {
"operation" : "append",
"spark.app.id" : "local-1710405058122",
"added-data-files" : "1",
"added-records" : "1",
"added-files-size" : "416",
"changed-partition-count" : "1",
"total-records" : "1",
"total-files-size" : "416",
"total-data-files" : "1",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "s3://warehouse/default/t3/metadata/snap-3622792816294171432-1-e457c732-62e5-41eb-998b-abbb8f021ed5.avro",
"schema-id" : 0
} ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ {
"timestamp-ms" : 1710409631964,
"metadata-file" : "s3://warehouse/default/t3/metadata/00000-c82191f3-e6e2-4001-8e85-8623e3915ff7.metadata.json"
}, {
"timestamp-ms" : 1710409653861,
"metadata-file" : "s3://warehouse/default/t3/metadata/00001-0297d4e7-2468-4c0d-b4ed-ea717df8c3e6.metadata.json"
} ]
}
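For illustration only, and not something this diff does: once the updates tuple is built, the replace path could also drop the current snapshot reference so the replaced table no longer points at the previous snapshot. Treating RemoveSnapshotRefUpdate as the right mechanism here is an assumption:

# Hypothetical addition after building the updates tuple: drop the "main"
# branch reference so the replaced table has no current snapshot.
if table.metadata.current_snapshot_id is not None:
    updates += (RemoveSnapshotRefUpdate(ref_name="main"),)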
                AddSchemaUpdate(schema=new_schema, last_column_id=new_schema.highest_field_id),
                SetCurrentSchemaUpdate(schema_id=-1),
                AddSortOrderUpdate(sort_order=new_sort_order),
                SetDefaultSortOrderUpdate(sort_order_id=-1),
                AddPartitionSpecUpdate(spec=new_partition_spec),
                SetDefaultSpecUpdate(spec_id=-1),
Comment on lines +799 to +800
Review comment: Same goes here, the spec is being re-used:

CREATE TABLE default.t2 (name string, age int) PARTITIONED BY (name);

{
"format-version" : 2,
"table-uuid" : "27f1ab29-7a9f-4324-bb64-c10c5bd2be53",
"location" : "s3://warehouse/default/t2",
"last-sequence-number" : 0,
"last-updated-ms" : 1710409060360,
"last-column-id" : 2,
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "name",
"required" : false,
"type" : "string"
}, {
"id" : 2,
"name" : "age",
"required" : false,
"type" : "int"
} ]
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ {
"name" : "name",
"transform" : "identity",
"source-id" : 1,
"field-id" : 1000
} ]
} ],
"last-partition-id" : 1000,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "root",
"write.parquet.compression-codec" : "zstd"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ ]
}

CREATE OR REPLACE TABLE default.t2 (name string, age int) PARTITIONED BY (name, age);

{
"format-version" : 2,
"table-uuid" : "27f1ab29-7a9f-4324-bb64-c10c5bd2be53",
"location" : "s3://warehouse/default/t2",
"last-sequence-number" : 0,
"last-updated-ms" : 1710409079414,
"last-column-id" : 2,
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "name",
"required" : false,
"type" : "string"
}, {
"id" : 2,
"name" : "age",
"required" : false,
"type" : "int"
} ]
} ],
"default-spec-id" : 1,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ {
"name" : "name",
"transform" : "identity",
"source-id" : 1,
"field-id" : 1000
} ]
}, {
"spec-id" : 1,
"fields" : [ {
"name" : "name",
"transform" : "identity",
"source-id" : 1,
"field-id" : 1000
}, {
"name" : "age",
"transform" : "identity",
"source-id" : 2,
"field-id" : 1001
} ]
} ],
"last-partition-id" : 1001,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "root",
"write.parquet.compression-codec" : "zstd"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ {
"timestamp-ms" : 1710409060360,
"metadata-file" : "s3://warehouse/default/t2/metadata/00000-bc21fe27-d037-4b98-a8ca-cc1502eb19ec.metadata.json"
} ]
}

CREATE OR REPLACE TABLE default.t2 (name string, age int) PARTITIONED BY (name);

{
"format-version" : 2,
"table-uuid" : "27f1ab29-7a9f-4324-bb64-c10c5bd2be53",
"location" : "s3://warehouse/default/t2",
"last-sequence-number" : 0,
"last-updated-ms" : 1710409086268,
"last-column-id" : 2,
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "name",
"required" : false,
"type" : "string"
}, {
"id" : 2,
"name" : "age",
"required" : false,
"type" : "int"
} ]
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ {
"name" : "name",
"transform" : "identity",
"source-id" : 1,
"field-id" : 1000
} ]
}, {
"spec-id" : 1,
"fields" : [ {
"name" : "name",
"transform" : "identity",
"source-id" : 1,
"field-id" : 1000
}, {
"name" : "age",
"transform" : "identity",
"source-id" : 2,
"field-id" : 1001
} ]
} ],
"last-partition-id" : 1001,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "root",
"write.parquet.compression-codec" : "zstd"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ {
"timestamp-ms" : 1710409060360,
"metadata-file" : "s3://warehouse/default/t2/metadata/00000-bc21fe27-d037-4b98-a8ca-cc1502eb19ec.metadata.json"
}, {
"timestamp-ms" : 1710409079414,
"metadata-file" : "s3://warehouse/default/t2/metadata/00001-8062294f-a8d6-493d-905f-b82dfe01cb29.metadata.json"
} ]
}

Ideally, we also want to re-use the
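A rough sketch of that idea (hypothetical, not part of this PR): before emitting AddPartitionSpecUpdate, look for an existing spec whose partition fields already match and simply make it the default again. The helper name below is made up:

# Hypothetical helper: reuse an existing partition spec with identical fields
# instead of always adding a new spec with a bumped spec-id.
def _find_matching_spec(table: Table, fresh_spec: PartitionSpec) -> Optional[PartitionSpec]:
    for spec in table.specs().values():
        if spec.fields == fresh_spec.fields:
            return spec
    return None

matching_spec = _find_matching_spec(table, new_partition_spec)
if matching_spec is not None:
    spec_updates = (SetDefaultSpecUpdate(spec_id=matching_spec.spec_id),)
else:
    spec_updates = (
        AddPartitionSpecUpdate(spec=new_partition_spec),
        SetDefaultSpecUpdate(spec_id=-1),
    )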
            )
            tx._apply(updates, requirements)
Review comment: With #471 in, this is not necessary anymore! 🥳
            tx.set_properties(**new_properties)
            if new_location is not None:
                tx.update_location(new_location)
        return table

    def __repr__(self) -> str:
        """Return the string representation of the Catalog class."""
        return f"{self.name} ({self.__class__})"
Review comment: Let's consider the following:

Results in:

The second schema is added:

And then go back to the original schema:

You'll see that no new schema is being added:

What do you think of re-using the update_schema() class:

^ Pseudocode, could be cleaner. Ideally, the removal should be done with a visit_with_partner (that's the opposite of the union_by_name).
Thanks @Fokko for this detailed explanation.

But we also need to cover step 2 of this example, where we add a new schema, right?

So from my understanding: if the schema fields match an old schema in the metadata, we do union_by_name with the old schema and set it as the current one; else, we add the new schema.

Is this a correct assessment?
That's correct!
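To make the agreed approach concrete, here is a rough, hypothetical sketch, not code from this PR: when the requested fields can be reconciled with the current schema, evolve it through the public UpdateSchema API instead of always appending a new schema. Here new_schema is the schema passed to the replace call:

# Hypothetical sketch: reuse the current schema where possible by evolving it
# with union_by_name, instead of always emitting AddSchemaUpdate.
requested_names = {field.name for field in new_schema.fields}

with table.update_schema(allow_incompatible_changes=True) as update:
    # Add or widen columns coming from the requested schema.
    update.union_by_name(new_schema)
    # Drop top-level columns that are no longer requested; nested fields would
    # need a visitor that walks both schemas, as discussed above.
    for field in table.schema().fields:
        if field.name not in requested_names:
            update.delete_column(field.name)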