feat(pyspark): add official support and ci testing with spark connect #24613
Workflow file for this run
name: Backends
on:
  push:
    # Skip the backend suite if all changes are docs
    paths-ignore:
      - "docs/**"
      - "**/*.md"
      - "**/*.qmd"
      - "codecov.yml"
      - ".envrc"
      - ".codespellrc"
    branches:
      - main
      - "*.x.x"
  pull_request:
    # Skip the backend suite if all changes are docs
    paths-ignore:
      - "docs/**"
      - "**/*.md"
      - "**/*.qmd"
      - "codecov.yml"
      - ".envrc"
      - ".codespellrc"
    branches:
      - main
      - "*.x.x"
  merge_group:
permissions:
  # this allows extractions/setup-just to list releases for `just` at a higher
  # rate limit while restricting GITHUB_TOKEN permissions elsewhere
  contents: read
concurrency:
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true
env:
  FORCE_COLOR: "1"
  ODBCSYSINI: "${{ github.workspace }}/ci/odbc"
  HYPOTHESIS_PROFILE: "ci"
jobs:
  test_bigquery_lite:
    name: BigQuery ${{ matrix.os }} python-${{ matrix.python-version }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest
          - windows-latest
        python-version:
          - "3.10"
          - "3.12"
    steps:
      - name: checkout
        uses: actions/checkout@v4
      - name: install python
        uses: actions/setup-python@v5
        id: install_python
        with:
          python-version: ${{ matrix.python-version }}
      - name: install poetry
        run: pip install 'poetry==1.8.3'
      - name: install ibis
        run: poetry install --without dev --without docs --extras bigquery
      - uses: extractions/setup-just@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: run simple bigquery unit tests
        run: just ci-check ibis/backends/bigquery/tests/unit
      - name: upload code coverage
        if: success()
        continue-on-error: true
        uses: codecov/codecov-action@v4
        with:
          flags: backend,bigquery,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
          token: ${{ secrets.CODECOV_TOKEN }}
  test_backends:
    name: ${{ matrix.backend.title }} ${{ matrix.os }} python-${{ matrix.python-version }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest
          - windows-latest
        python-version:
          - "3.10"
          - "3.12"
        backend:
          - name: duckdb
            title: DuckDB
            serial: true
            extras:
              - duckdb
              - deltalake
              - geospatial
              - examples
              - decompiler
              - polars
            additional_deps:
              - torch
          - name: clickhouse
            title: ClickHouse
            services:
              - clickhouse
            extras:
              - clickhouse
              - examples
          - name: sqlite
            title: SQLite
            extras:
              - sqlite
          - name: datafusion
            title: DataFusion
            serial: true
            extras:
              - datafusion
          - name: polars
            title: Polars
            extras:
              - polars
              - deltalake
          - name: mysql
            title: MySQL
            services:
              - mysql
            extras:
              - mysql
              - geospatial
              - polars
            sys-deps:
              - libgeos-dev
              - default-libmysqlclient-dev
          - name: postgres
            title: PostgreSQL
            extras:
              - postgres
              - geospatial
            services:
              - postgres
            sys-deps:
              - libgeos-dev
          - name: postgres
            title: PostgreSQL + Torch
            extras:
              - postgres
              - geospatial
              - polars
            additional_deps:
              - torch
            services:
              - postgres
            sys-deps:
              - libgeos-dev
          - name: risingwave
            title: RisingWave
            serial: true
            services:
              - risingwave
            extras:
              - risingwave
          - name: impala
            title: Impala
            serial: true
            extras:
              - impala
            services:
              - impala
              - kudu
            sys-deps:
              - cmake
              - ninja-build
          - name: mssql
            title: MS SQL Server
            extras:
              - mssql
              - polars
            services:
              - mssql
            sys-deps:
              - freetds-dev
              - unixodbc-dev
              - tdsodbc
          - name: trino
            title: Trino
            extras:
              - trino
            services:
              - trino
          - name: druid
            title: Druid
            extras:
              - druid
            services:
              - druid
          - name: exasol
            title: Exasol
            serial: true
            extras:
              - exasol
            services:
              - exasol
          - name: oracle
            title: Oracle
            serial: true
            extras:
              - oracle
              - polars
            services:
              - oracle
          - name: flink
            title: Flink
            serial: true
            extras:
              - flink
            additional_deps:
              - "'apache-flink==1.20.0'"
              - "'pandas<2.2'"
              - setuptools
            services:
              - flink
        include:
          - os: ubuntu-latest
            python-version: "3.11"
            backend:
              name: flink
              title: Flink
              serial: true
              extras:
                - flink
              additional_deps:
                - "'apache-flink==1.20.0'"
                - "'pandas<2.2'"
                - setuptools
              services:
                - flink
          - os: ubuntu-latest
            python-version: "3.11"
            backend:
              name: impala
              title: Impala
              serial: true
              extras:
                - impala
              services:
                - impala
                - kudu
              sys-deps:
                - cmake
                - ninja-build
        exclude:
          - os: windows-latest
            backend:
              name: mysql
              title: MySQL
              extras:
                - mysql
                - geospatial
                - polars
              services:
                - mysql
              sys-deps:
                - libgeos-dev
                - default-libmysqlclient-dev
          - os: windows-latest
            backend:
              name: clickhouse
              title: ClickHouse
              extras:
                - clickhouse
                - examples
              services:
                - clickhouse
          - os: windows-latest
            backend:
              name: postgres
              title: PostgreSQL
              extras:
                - postgres
                - geospatial
              services:
                - postgres
              sys-deps:
                - libgeos-dev
          - os: windows-latest
            backend:
              name: risingwave
              title: RisingWave
              serial: true
              services:
                - risingwave
              extras:
                - risingwave
          - os: windows-latest
            backend:
              name: postgres
              title: PostgreSQL + Torch
              extras:
                - postgres
                - geospatial
                - polars
              additional_deps:
                - torch
              services:
                - postgres
              sys-deps:
                - libgeos-dev
          # TODO(deepyaman): Test whether this works upon releasing https://github.com/cloudera/impyla/commit/bf1f94c3c4106ded6267d2485c1e939775a6a87f
          - os: ubuntu-latest
            python-version: "3.12"
            backend:
              name: impala
              title: Impala
              serial: true
              extras:
                - impala
              services:
                - impala
                - kudu
              sys-deps:
                - cmake
                - ninja-build
          - os: windows-latest
            backend:
              name: impala
              title: Impala
              serial: true
              extras:
                - impala
              services:
                - impala
                - kudu
              sys-deps:
                - cmake
                - ninja-build
          - os: windows-latest
            backend:
              name: mssql
              title: MS SQL Server
              extras:
                - mssql
                - polars
              services:
                - mssql
              sys-deps:
                - freetds-dev
                - unixodbc-dev
                - tdsodbc
          - os: windows-latest
            backend:
              name: trino
              title: Trino
              services:
                - trino
              extras:
                - trino
          - os: windows-latest
            backend:
              name: druid
              title: Druid
              extras:
                - druid
              services:
                - druid
          - os: windows-latest
            backend:
              name: oracle
              title: Oracle
              serial: true
              extras:
                - oracle
                - polars
              services:
                - oracle
          - os: ubuntu-latest
            python-version: "3.12"
            backend:
              name: flink
              title: Flink
              serial: true
              extras:
                - flink
              additional_deps:
                - "'apache-flink==1.20.0'"
                - "'pandas<2.2'"
                - setuptools
              services:
                - flink
          - os: windows-latest
            backend:
              name: flink
              title: Flink
              serial: true
              extras:
                - flink
              additional_deps:
                - "'apache-flink==1.20.0'"
                - "'pandas<2.2'"
                - setuptools
              services:
                - flink
          - os: windows-latest
            backend:
              name: exasol
              title: Exasol
              serial: true
              extras:
                - exasol
              services:
                - exasol
    steps:
      - name: update and install system dependencies
        if: matrix.os == 'ubuntu-latest' && matrix.backend.sys-deps != null
        run: |
          set -euo pipefail
          sudo apt-get update -qq -y
          sudo apt-get install -qq -y build-essential ${{ join(matrix.backend.sys-deps, ' ') }}
      - name: install sqlite
        if: matrix.os == 'windows-latest' && matrix.backend.name == 'sqlite'
        run: choco install sqlite
      - name: checkout
        uses: actions/checkout@v4
      - uses: extractions/setup-just@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: download backend data
        run: just download-data
      - name: show docker compose version
        if: matrix.backend.services != null
        run: docker compose version
      - name: start services
        if: matrix.backend.services != null
        run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}
      - name: install python
        uses: actions/setup-python@v5
        id: install_python
        with:
          python-version: ${{ matrix.python-version }}
      - name: install poetry
        run: pip install 'poetry==1.8.3'
      - name: install ibis
        run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }} examples"
      - name: install deps for broken avro-python setup
        if: matrix.backend.name == 'flink'
        run: poetry run pip install wheel
      - name: install other deps
        if: matrix.backend.additional_deps != null
        run: poetry run pip install ${{ join(matrix.backend.additional_deps, ' ') }}
      - name: show installed deps
        run: poetry run pip list
      - name: show version of python-linked sqlite
        if: matrix.backend.name == 'sqlite'
        run: poetry run python -c 'import sqlite3; print(sqlite3.sqlite_version)'
      - name: "run parallel tests: ${{ matrix.backend.name }}"
        if: ${{ !matrix.backend.serial }}
        run: just ci-check -m ${{ matrix.backend.name }} --numprocesses auto --dist=loadgroup
        env:
          IBIS_TEST_IMPALA_HOST: localhost
          IBIS_TEST_IMPALA_PORT: 21050
          IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
      - name: "run serial tests: ${{ matrix.backend.name }}"
        if: matrix.backend.serial
        run: just ci-check -m ${{ matrix.backend.name }}
        env:
          FLINK_REMOTE_CLUSTER_ADDR: localhost
          FLINK_REMOTE_CLUSTER_PORT: "8081"
          IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
      - name: "run backend doctests: ${{ matrix.backend.name }}"
        if: matrix.os == 'ubuntu-latest'
        run: just backend-doctests ${{ matrix.backend.name }}
        env:
          FLINK_REMOTE_CLUSTER_ADDR: localhost
          FLINK_REMOTE_CLUSTER_PORT: "8081"
          IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
      - name: check that no untracked files were produced
        shell: bash
        run: |
          ! git status --porcelain | tee /dev/stderr | grep .
      - name: upload code coverage
        if: success()
        continue-on-error: true
        uses: codecov/codecov-action@v4
        with:
          flags: backend,${{ matrix.backend.name }},${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
          token: ${{ secrets.CODECOV_TOKEN }}
      - name: Show docker compose logs on fail
        if: matrix.backend.services != null && failure()
        run: docker compose logs
  test_backends_min_version:
    name: ${{ matrix.backend.title }} Min Version ${{ matrix.os }} python-${{ matrix.python-version }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest
          - windows-latest
        python-version:
          - "3.10"
          - "3.12"
        backend:
          - name: postgres
            title: PostgreSQL
            deps:
              required:
                - "[email protected]"
                - "[email protected]"
              optional:
                - "[email protected]"
                - "[email protected]"
                - "Shapely@2"
            services:
              - postgres
            extras:
              - postgres
              - geospatial
        exclude:
          - os: windows-latest
            backend:
              name: postgres
              title: PostgreSQL
              deps:
                required:
                  - "[email protected]"
                  - "[email protected]"
                optional:
                  - "[email protected]"
                  - "[email protected]"
                  - "Shapely@2"
              services:
                - postgres
              extras:
                - postgres
                - geospatial
          - python-version: "3.12"
            backend:
              name: postgres
              title: PostgreSQL
              deps:
                required:
                  - "[email protected]"
                  - "[email protected]"
                optional:
                  - "[email protected]"
                  - "[email protected]"
                  - "Shapely@2"
              services:
                - postgres
              extras:
                - postgres
                - geospatial
    steps:
      - name: checkout
        uses: actions/checkout@v4
      - name: install libgeos for shapely
        if: matrix.backend.name == 'postgres'
        run: |
          sudo apt-get update -y -qq
          sudo apt-get install -qq -y build-essential libgeos-dev
      - uses: extractions/setup-just@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: download backend data
        run: just download-data
      - name: start services
        if: matrix.backend.services != null
        run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}
      - name: install python
        uses: actions/setup-python@v5
        id: install_python
        with:
          python-version: ${{ matrix.python-version }}
      - name: install poetry
        run: python -m pip install --upgrade pip 'poetry==1.8.3'
      - name: remove incompatible deps
        # lonboard and deltalake require a version of pandas that the minimum versions are not compatible with
        run: poetry remove lonboard deltalake
      - name: install minimum versions of required deps
        run: poetry add --lock ${{ join(matrix.backend.deps.required, ' ') }} --python="==${{ steps.install_python.outputs.python-version }}"
      - name: install minimum versions of optional deps
        run: poetry add --lock --optional ${{ join(matrix.backend.deps.optional, ' ') }} --python="==${{ steps.install_python.outputs.python-version }}"
      - name: checkout the lock file
        run: git checkout poetry.lock
      - name: lock with no updates
        # poetry add is aggressive and will update other dependencies like
        # numpy and pandas so we keep the pyproject.toml edits and then relock
        # without updating anything except the requested versions
        run: poetry lock --no-update
      - name: install ibis
        run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }} examples"
      - name: run tests
        run: just ci-check -m ${{ matrix.backend.name }} --numprocesses auto --dist=loadgroup
      - name: check that no untracked files were produced
        shell: bash
        run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .
      - name: upload code coverage
        if: success()
        continue-on-error: true
        uses: codecov/codecov-action@v4
        with:
          flags: backend,${{ matrix.backend.name }},${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
          token: ${{ secrets.CODECOV_TOKEN }}
      - name: Show docker compose logs on fail
        if: matrix.backend.services != null && failure()
        run: docker compose logs
  test_pyspark:
    name: PySpark ${{ matrix.tag }} ${{ matrix.pyspark-minor-version }} ubuntu-latest python-${{ matrix.python-version }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - python-version: "3.10"
            pyspark-version: "3.3.3"
            pyspark-minor-version: "3.3"
            deps:
              - "'pandas@<2'"
              - "'numpy@<1.24'"
            tag: local
          - python-version: "3.11"
            pyspark-version: "3.5.2"
            pyspark-minor-version: "3.5"
            deps:
              - "'pandas@>2'"
              - "'numpy@>1.24'"
            tag: local
          - python-version: "3.12"
            pyspark-version: "3.5.2"
            pyspark-minor-version: "3.5"
            deps:
              - "'pandas@>2'"
              - "'numpy@>1.24'"
              - setuptools
            tag: local
          - python-version: "3.12"
            pyspark-version: "3.5.2"
            pyspark-minor-version: "3.5"
            deps:
              - setuptools
            tag: remote
            SPARK_REMOTE: "sc://localhost:15002"
    steps:
      - name: checkout
        uses: actions/checkout@v4
      - uses: actions/setup-java@v4
        with:
          distribution: microsoft
          java-version: 17
      - uses: extractions/setup-just@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: start services
        if: matrix.tag == 'remote'
        run: just up spark-connect
      - name: download backend data
        run: just download-data
      - name: install python
        uses: actions/setup-python@v5
        id: install_python
        with:
          python-version: ${{ matrix.python-version }}
      - name: install poetry
        run: python -m pip install --upgrade pip 'poetry==1.8.3'
      - name: remove lonboard
        # it requires a version of pandas that pyspark is not compatible with
        run: poetry remove lonboard
      - name: install exact versions of pyspark, pandas and numpy
        run: poetry add --lock 'pyspark@${{ matrix.pyspark-version }}' ${{ join(matrix.deps, ' ') }}
      - name: checkout the lock file
        run: git checkout poetry.lock
      - name: lock with no updates
        # poetry add is aggressive and will update other dependencies like
        # numpy and pandas so we keep the pyproject.toml edits and then relock
        # without updating anything except the requested versions
        run: poetry lock --no-update
      - name: install ibis
        run: poetry install --without dev --without docs --extras "pyspark examples"
      - name: install delta-spark
        if: matrix.pyspark-minor-version == '3.5'
        run: poetry run pip install delta-spark
      - name: install iceberg
        shell: bash
        run: just download-iceberg-jar ${{ matrix.pyspark-minor-version }}
      - name: run spark connect tests
        if: matrix.tag == 'remote'
        run: just ci-check -m pyspark
        env:
          SPARK_REMOTE: ${{ matrix.SPARK_REMOTE }}
      - name: run spark tests
        if: matrix.tag == 'local'
        run: just ci-check -m pyspark
      - name: check that no untracked files were produced
        shell: bash
        run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .
      - name: upload code coverage
        # only upload coverage for jobs that aren't mostly xfails
        if: success()
        continue-on-error: true
        uses: codecov/codecov-action@v4
        with:
          flags: backend,pyspark,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
          token: ${{ secrets.CODECOV_TOKEN }}
  backends:
    # this job exists so that we can use a single job from this workflow to gate merging
    runs-on: ubuntu-latest
    needs:
      - test_bigquery_lite
      - test_backends_min_version
      - test_backends
      - test_pyspark
    steps:
      - run: exit 0
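
For context, the only difference between the `local` and `remote` entries in the `test_pyspark` matrix is that the `remote` job starts the `spark-connect` compose service and exports `SPARK_REMOTE`, so the suite runs against Spark over Spark Connect instead of an in-process session. The following is a minimal sketch of that connection path using the public PySpark and Ibis APIs; it is illustrative only, and the PR's actual test fixtures may wire this up differently.

# Illustrative sketch only -- not the PR's fixture code.
# Assumes a Spark Connect server is reachable at SPARK_REMOTE, e.g. the
# "sc://localhost:15002" address started by `just up spark-connect`.
import os

from pyspark.sql import SparkSession

import ibis

# SparkSession.builder.remote() is the standard Spark Connect client entry
# point in PySpark >= 3.4; it returns a session backed by the remote server.
remote_url = os.environ.get("SPARK_REMOTE", "sc://localhost:15002")
spark = SparkSession.builder.remote(remote_url).getOrCreate()

# Hand the Connect-backed session to the Ibis PySpark backend; queries then
# execute on the remote Spark cluster rather than in a local JVM.
con = ibis.pyspark.connect(session=spark)
print(con.list_tables())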