Skip to content

Commit

Permalink
Merge branch 'main' of github.com:apache/iceberg-python into fd-hive
Browse files Browse the repository at this point in the history
  • Loading branch information
Fokko committed Jan 11, 2024
2 parents 2e77cb7 + 5085d28 commit a25c1b7
Show file tree
Hide file tree
Showing 51 changed files with 2,206 additions and 981 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:
if: startsWith(matrix.os, 'ubuntu')
run: ls -lah dist/* && cp dist/* wheelhouse/

- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: "release-${{ github.event.inputs.version }}"
path: ./wheelhouse/*
11 changes: 4 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,12 @@ repos:
- id: check-ast
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version (Used for linting)
rev: v0.1.0
rev: v0.1.8
hooks:
- id: ruff
args: [ --fix, --exit-non-zero-on-fix ]
- repo: https://github.com/ambv/black
rev: 23.10.0
hooks:
- id: black
args: [--skip-string-normalization]
args: [ --fix, --exit-non-zero-on-fix, --preview ]
- id: ruff-format
args: [ --preview ]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.6.1
hooks:
Expand Down
27 changes: 27 additions & 0 deletions dev/provision.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,30 @@
('123')
"""
)

# Seed the sanitized-character table (created earlier in this script) with a
# single row. NOTE(review): assumes `spark` is the SparkSession configured
# above — confirm against the top of dev/provision.py.
spark.sql(
"""
INSERT INTO default.test_table_sanitized_character
VALUES
('123')
"""
)

# Table for the add-column integration test: starts with one string column `a`.
spark.sql(
"""
CREATE TABLE default.test_table_add_column (
a string
)
USING iceberg
"""
)

# Row written BEFORE the schema change — only column `a` has a value.
spark.sql("INSERT INTO default.test_table_add_column VALUES ('1')")

# Evolve the schema: append a second string column `b`.
spark.sql(
"""
ALTER TABLE default.test_table_add_column ADD COLUMN b string
"""
)

# Row written AFTER the schema change — both `a` and `b` are populated, so
# readers must handle rows from both schema versions.
spark.sql("INSERT INTO default.test_table_add_column VALUES ('2', '2')")
1 change: 1 addition & 0 deletions mkdocs/docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ For the FileIO there are several configuration options available:
| s3.signer | bearer | Configure the signature version of the FileIO. |
| s3.region | us-west-2 | Sets the region of the bucket |
| s3.proxy-uri | http://my.proxy.com:8080 | Configure the proxy server to be used by the FileIO. |
| s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. |

### HDFS

Expand Down
4 changes: 2 additions & 2 deletions mkdocs/docs/feature-support.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ The goal is that the python library will provide a functional, performant subset
| Create Table | X | X |
| Rename Table | X | X |
| Drop Table | X | X |
| Alter Table | X | |
| Set Table Properties | X | |
| Alter Table | X | X |
| Set Table Properties | X | X |
| Create Namespace | X | X |
| Drop Namespace | X | X |
| Set Namespace Properties | X | X |
Expand Down
6 changes: 3 additions & 3 deletions mkdocs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@

mkdocs==1.5.3
griffe==0.38.1
jinja2==3.1.2
jinja2==3.1.3
mkdocstrings==0.24.0
mkdocstrings-python==1.7.5
mkdocstrings-python==1.8.0
mkdocs-literate-nav==0.6.1
mkdocs-autorefs==0.5.0
mkdocs-gen-files==0.5.0
mkdocs-material==9.5.2
mkdocs-material==9.5.3
mkdocs-material-extensions==1.3.1
mkdocs-section-index==0.3.8
1,538 changes: 1,069 additions & 469 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyiceberg/avro/codecs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
so don't confuse it with the Python's "codecs", which is a package mainly for
converting character sets (https://docs.python.org/3/library/codecs.html).
"""

from __future__ import annotations

from typing import Dict, Optional, Type
Expand Down
6 changes: 2 additions & 4 deletions pyiceberg/avro/codecs/codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,8 @@ class Codec(ABC):

@staticmethod
@abstractmethod
def compress(data: bytes) -> tuple[bytes, int]:
...
def compress(data: bytes) -> tuple[bytes, int]: ...

@staticmethod
@abstractmethod
def decompress(data: bytes) -> bytes:
...
def decompress(data: bytes) -> bytes: ...
1 change: 1 addition & 0 deletions pyiceberg/avro/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.
# pylint: disable=W0621
"""Avro reader for reading Avro files."""

from __future__ import annotations

import io
Expand Down
7 changes: 3 additions & 4 deletions pyiceberg/avro/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
The reader tree can be changed in such a way that the
read schema is different, while respecting the read schema.
"""

from __future__ import annotations

from abc import abstractmethod
Expand Down Expand Up @@ -85,12 +86,10 @@ def _skip_map_array(decoder: BinaryDecoder, skip_entry: Callable[[], None]) -> N

class Reader(Singleton):
@abstractmethod
def read(self, decoder: BinaryDecoder) -> Any:
...
def read(self, decoder: BinaryDecoder) -> Any: ...

@abstractmethod
def skip(self, decoder: BinaryDecoder) -> None:
...
def skip(self, decoder: BinaryDecoder) -> None: ...

def __repr__(self) -> str:
"""Return the string representation of the Reader class."""
Expand Down
4 changes: 1 addition & 3 deletions pyiceberg/avro/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,7 @@ def resolve_reader(
Raises:
NotImplementedError: If attempting to resolve an unrecognized object type.
"""
return visit_with_partner(
file_schema, read_schema, ReadSchemaResolver(read_types, read_enums), SchemaPartnerAccessor()
) # type: ignore
return visit_with_partner(file_schema, read_schema, ReadSchemaResolver(read_types, read_enums), SchemaPartnerAccessor()) # type: ignore


class EnumReader(Reader):
Expand Down
4 changes: 2 additions & 2 deletions pyiceberg/avro/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Constructing a writer tree from the schema makes it easy
to decouple the writing implementation from the schema.
"""

from __future__ import annotations

from abc import abstractmethod
Expand All @@ -43,8 +44,7 @@
@dataclass(frozen=True)
class Writer(Singleton):
@abstractmethod
def write(self, encoder: BinaryEncoder, val: Any) -> Any:
...
def write(self, encoder: BinaryEncoder, val: Any) -> Any: ...

def __repr__(self) -> str:
"""Return string representation of this object."""
Expand Down
46 changes: 44 additions & 2 deletions pyiceberg/catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from __future__ import annotations

import logging
import re
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass
Expand Down Expand Up @@ -74,6 +75,17 @@
LOCATION = "location"
EXTERNAL_TABLE = "EXTERNAL_TABLE"

# Matches Iceberg table-metadata file names such as
# "00001-6c97e413-d51b-4538-ac70-12fe2a85cb83.metadata.json" or the
# codec-suffixed variant "00001-<uuid>.gzip.metadata.json".
TABLE_METADATA_FILE_NAME_REGEX = re.compile(
    r"(\d+)"  # version number
    r"-"  # separator
    r"([\w-]{36})"  # UUID (36 characters, including hyphens)
    r"(?:\.\w+)?"  # optional codec name
    r"\.metadata\.json"  # file extension
)


class CatalogType(Enum):
REST = "rest"
Expand Down Expand Up @@ -587,8 +599,38 @@ def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) ->
ToOutputFile.table_metadata(metadata, io.new_output(metadata_path))

@staticmethod
def _get_metadata_location(location: str) -> str:
return f"{location}/metadata/00000-{uuid.uuid4()}.metadata.json"
def _get_metadata_location(location: str, new_version: int = 0) -> str:
if new_version < 0:
raise ValueError(f"Table metadata version: `{new_version}` must be a non-negative integer")
version_str = f"{new_version:05d}"
return f"{location}/metadata/{version_str}-{uuid.uuid4()}.metadata.json"

@staticmethod
def _parse_metadata_version(metadata_location: str) -> int:
    """Extract the version number from a metadata file location.

    The file name is expected to look like
    ``00001-6c97e413-d51b-4538-ac70-12fe2a85cb83.metadata.json``, where the
    leading integer is the version. Locations that do not follow this
    pattern — or whose UUID part is malformed — yield -1, which ensures the
    next metadata file written is treated as version 0.

    Args:
        metadata_location (str): The location of the metadata file.

    Returns:
        int: The version of the metadata file, or -1 when the file name does
            not contain a valid version string.
    """
    file_name = metadata_location.rsplit("/", 1)[-1]
    match = TABLE_METADATA_FILE_NAME_REGEX.fullmatch(file_name)
    if match is None:
        return -1
    try:
        # The regex only guarantees 36 word/hyphen characters; this rejects
        # right-length strings that are not actually UUIDs.
        uuid.UUID(match.group(2))
    except ValueError:
        return -1
    return int(match.group(1))

def _get_updated_props_and_update_summary(
self, current_properties: Properties, removals: Optional[Set[str]], updates: Properties
Expand Down
Loading

0 comments on commit a25c1b7

Please sign in to comment.