
Commit

Merge branch 'main' of github.com:apache/iceberg-python into fd-snapshots
Fokko committed Dec 7, 2023
2 parents 49687fc + 2493789 commit 2ff5c03
Showing 36 changed files with 1,920 additions and 920 deletions.
2 changes: 1 addition & 1 deletion .asf.yaml
@@ -45,7 +45,7 @@ github:
collaborators: # Note: the number of collaborators is limited to 10
- ajantha-bhat
ghp_branch: gh-pages
-  ghp_path: ~
+  ghp_path: /

notifications:
commits: [email protected]
2 changes: 1 addition & 1 deletion .github/workflows/python-ci-docs.yml
@@ -31,7 +31,7 @@ jobs:

steps:
- uses: actions/checkout@v4
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
- name: Install
2 changes: 1 addition & 1 deletion .github/workflows/python-ci.yml
@@ -40,7 +40,7 @@ jobs:
- uses: actions/checkout@v4
- name: Install poetry
run: make install-poetry
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
cache: poetry
2 changes: 1 addition & 1 deletion .github/workflows/python-release.yml
@@ -41,7 +41,7 @@ jobs:
with:
fetch-depth: 0

-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
with:
python-version: '3.8'

2 changes: 1 addition & 1 deletion Makefile
@@ -16,7 +16,7 @@
# under the License.

install-poetry:
-	pip install poetry==1.6.1
+	pip install poetry==1.7.1

install-dependencies:
poetry install -E pyarrow -E hive -E s3fs -E glue -E adlfs -E duckdb -E ray -E sql-postgres -E gcsfs
2 changes: 1 addition & 1 deletion dev/Dockerfile
@@ -36,7 +36,7 @@ ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$
RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events
WORKDIR ${SPARK_HOME}

-ENV SPARK_VERSION=3.4.1
+ENV SPARK_VERSION=3.4.2
ENV ICEBERG_SPARK_RUNTIME_VERSION=3.4_2.12
ENV ICEBERG_VERSION=1.4.0
ENV AWS_SDK_VERSION=2.20.18
41 changes: 40 additions & 1 deletion mkdocs/docs/api.md
@@ -318,7 +318,7 @@ In this case it is up to the engine itself to filter the file itself. Below, `to
<!-- prettier-ignore-start -->

!!! note "Requirements"
-    This requires [PyArrow to be installed](index.md).
+    This requires [`pyarrow` to be installed](index.md).

<!-- prettier-ignore-end -->

@@ -346,6 +346,45 @@ tpep_dropoff_datetime: [[2021-04-01 00:47:59.000000,...,2021-05-01 00:14:47.0000

This will only pull in the files that might contain matching rows.

### Pandas

<!-- prettier-ignore-start -->

!!! note "Requirements"
This requires [`pandas` to be installed](index.md).

<!-- prettier-ignore-end -->

PyIceberg makes it easy to filter out data from a huge table and pull it into a Pandas dataframe locally. This will only fetch the relevant Parquet files for the query and apply the filter. This will reduce IO and therefore improve performance and reduce cost.

```python
table.scan(
    row_filter="trip_distance >= 10.0",
    selected_fields=("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"),
).to_pandas()
```

This will return a Pandas dataframe:

```
        VendorID      tpep_pickup_datetime     tpep_dropoff_datetime
0              2 2021-04-01 00:28:05+00:00 2021-04-01 00:47:59+00:00
1              1 2021-04-01 00:39:01+00:00 2021-04-01 00:57:39+00:00
2              2 2021-04-01 00:14:42+00:00 2021-04-01 00:42:59+00:00
3              1 2021-04-01 00:17:17+00:00 2021-04-01 00:43:38+00:00
4              1 2021-04-01 00:24:04+00:00 2021-04-01 00:56:20+00:00
...          ...                       ...                       ...
116976         2 2021-04-30 23:56:18+00:00 2021-05-01 00:29:13+00:00
116977         2 2021-04-30 23:07:41+00:00 2021-04-30 23:37:18+00:00
116978         2 2021-04-30 23:38:28+00:00 2021-05-01 00:12:04+00:00
116979         2 2021-04-30 23:33:00+00:00 2021-04-30 23:59:00+00:00
116980         2 2021-04-30 23:44:25+00:00 2021-05-01 00:14:47+00:00

[116981 rows x 3 columns]
```

It is recommended to use Pandas 2 or later, because it can store the data in an [Apache Arrow backend](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i), which avoids copies of data.

### DuckDB

<!-- prettier-ignore-start -->
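The new Pandas section above recommends Pandas 2 for its Arrow backing. Below is a minimal sketch of how one might keep the columns Arrow-backed end to end, by going through `to_arrow()` explicitly and passing `types_mapper=pd.ArrowDtype` to Arrow's `to_pandas()`. The catalog name `default` and the table identifier `nyc.taxis` are assumptions borrowed from the docs examples, not part of this diff:

```python
import pandas as pd

from pyiceberg.catalog import load_catalog

# Assumed setup: a catalog named "default" is configured, and the NYC taxi
# table from the docs example exists under "nyc.taxis" (hypothetical names).
catalog = load_catalog("default")
table = catalog.load_table("nyc.taxis")

# scan() plans only the Parquet files whose metadata can match the filter;
# to_arrow() materializes them as a pyarrow.Table. Passing
# types_mapper=pd.ArrowDtype to Arrow's to_pandas() keeps the pandas 2
# columns Arrow-backed instead of copying them into NumPy dtypes.
df = (
    table.scan(
        row_filter="trip_distance >= 10.0",
        selected_fields=("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"),
    )
    .to_arrow()
    .to_pandas(types_mapper=pd.ArrowDtype)
)
print(df.dtypes)  # e.g. int64[pyarrow], timestamp[us, tz=UTC][pyarrow]
```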
12 changes: 8 additions & 4 deletions mkdocs/docs/configuration.md
@@ -32,13 +32,19 @@ There are three ways to pass in configuration:
- Through environment variables
- By passing in credentials through the CLI or the Python API

-The configuration file is recommended since that's the most transparent way. If you prefer environment configuration:
+The configuration file is recommended since that's the easiest way to manage the credentials.
+
+Another option is through environment variables:

```sh
export PYICEBERG_CATALOG__DEFAULT__URI=thrift://localhost:9083
export PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID=username
export PYICEBERG_CATALOG__DEFAULT__S3__SECRET_ACCESS_KEY=password
```

-The environment variable picked up by Iceberg starts with `PYICEBERG_` and then follows the yaml structure below, where a double underscore `__` represents a nested field.
+The environment variable picked up by Iceberg starts with `PYICEBERG_` and then follows the yaml structure below, where a double underscore `__` represents a nested field, and the underscore `_` is converted into a dash `-`.

+For example, `PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID` sets `s3.access-key-id` on the `default` catalog.

## FileIO

Expand Down Expand Up @@ -80,8 +86,6 @@ For the FileIO there are several configuration options available:

### Azure Data lake

-### Azure Data lake
-
| Key | Example | Description |
| ----------------------- | ----------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| adlfs.connection-string | AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqF...;BlobEndpoint=http://localhost/ | A [connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string). This could be used to use FileIO with any adlfs-compatible object storage service that has a different endpoint (like [azurite](https://github.com/azure/azurite)). |
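The configuration diff above maps environment variables onto nested config keys. Since the docs also list passing credentials through the Python API as a third option, here is a minimal sketch of that route; the property values are placeholders mirroring the env-var example, not real credentials:

```python
from pyiceberg.catalog import load_catalog

# Equivalent to the three PYICEBERG_CATALOG__DEFAULT__* exports above:
# each double underscore becomes a nesting level, and single underscores
# in key names become dashes (ACCESS_KEY_ID -> access-key-id).
catalog = load_catalog(
    "default",
    **{
        "uri": "thrift://localhost:9083",
        "s3.access-key-id": "username",
        "s3.secret-access-key": "password",
    },
)
```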
10 changes: 5 additions & 5 deletions mkdocs/requirements.txt
@@ -16,13 +16,13 @@
# under the License.

mkdocs==1.5.3
-griffe==0.36.9
+griffe==0.38.1
jinja2==3.1.2
-mkdocstrings==0.23.0
-mkdocstrings-python==1.7.3
+mkdocstrings==0.24.0
+mkdocstrings-python==1.7.5
mkdocs-literate-nav==0.6.1
mkdocs-autorefs==0.5.0
mkdocs-gen-files==0.5.0
-mkdocs-material==9.4.7
-mkdocs-material-extensions==1.3
+mkdocs-material==9.4.14
+mkdocs-material-extensions==1.3.1
mkdocs-section-index==0.3.8
