Skip to content

docs(blog-post): pydata performance part 2; polars and datafusion #17163

docs(blog-post): pydata performance part 2; polars and datafusion

docs(blog-post): pydata performance part 2; polars and datafusion #17163

Workflow file for this run

name: Backends
on:
push:
# Skip the backend suite if all changes are docs
paths-ignore:
- "docs/**"
- "**/*.md"
- "**/*.qmd"
- "codecov.yml"
- ".envrc"
branches:
- master
- "*.x.x"
pull_request:
# Skip the backend suite if all changes are docs
paths-ignore:
- "docs/**"
- "**/*.md"
- "**/*.qmd"
- "codecov.yml"
- ".envrc"
branches:
- master
- "*.x.x"
merge_group:
permissions:
# this allows extractions/setup-just to list releases for `just` at a higher
# rate limit while restricting GITHUB_TOKEN permissions elsewhere
contents: read
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
env:
FORCE_COLOR: "1"
jobs:
test_backends:
name: ${{ matrix.backend.title }} ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
env:
SQLALCHEMY_WARN_20: "1"
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
- windows-latest
python-version:
- "3.9"
- "3.11"
backend:
- name: dask
title: Dask
extras:
- dask
- name: duckdb
title: DuckDB
extras:
- duckdb
- deltalake
- geospatial
additional_deps:
- torch
- name: pandas
title: Pandas
extras:
- pandas
- name: sqlite
title: SQLite
extras:
- sqlite
- name: datafusion
title: Datafusion
extras:
- datafusion
- name: polars
title: Polars
extras:
- polars
- deltalake
- name: mysql
title: MySQL
services:
- mysql
extras:
- mysql
- geospatial
sys-deps:
- libgeos-dev
- name: clickhouse
title: ClickHouse
services:
- clickhouse
extras:
- clickhouse
- name: postgres
title: PostgreSQL
extras:
- postgres
- geospatial
services:
- postgres
sys-deps:
- libgeos-dev
- name: postgres
title: PostgreSQL + Torch
extras:
- postgres
- geospatial
additional_deps:
- torch
services:
- postgres
sys-deps:
- libgeos-dev
- name: impala
title: Impala
serial: true
extras:
- impala
services:
- impala
- kudu
sys-deps:
- cmake
- ninja-build
- name: mssql
title: MS SQL Server
serial: true
extras:
- mssql
services:
- mssql
sys-deps:
- libkrb5-dev
- krb5-config
- freetds-dev
- name: trino
title: Trino
extras:
- trino
- postgres
services:
- trino
- name: druid
title: Druid
extras:
- druid
services:
- druid
- name: oracle
title: Oracle
serial: true
extras:
- oracle
services:
- oracle
- name: flink
title: Flink
serial: true
extras:
- flink
additional_deps:
- apache-flink
- pytest-split
services:
- flink
exclude:
- os: windows-latest
backend:
name: mysql
title: MySQL
extras:
- mysql
- geospatial
services:
- mysql
sys-deps:
- libgeos-dev
- os: windows-latest
backend:
name: clickhouse
title: ClickHouse
extras:
- clickhouse
services:
- clickhouse
- os: windows-latest
backend:
name: postgres
title: PostgreSQL
extras:
- postgres
- geospatial
services:
- postgres
sys-deps:
- libgeos-dev
- os: windows-latest
backend:
name: postgres
title: PostgreSQL + Torch
extras:
- postgres
- geospatial
additional_deps:
- torch
services:
- postgres
sys-deps:
- libgeos-dev
- os: windows-latest
backend:
name: impala
title: Impala
serial: true
extras:
- impala
services:
- impala
- kudu
sys-deps:
- cmake
- ninja-build
- os: windows-latest
backend:
name: mssql
title: MS SQL Server
serial: true
extras:
- mssql
services:
- mssql
sys-deps:
- libkrb5-dev
- krb5-config
- freetds-dev
- os: windows-latest
backend:
name: trino
title: Trino
services:
- trino
extras:
- trino
- postgres
- os: windows-latest
backend:
name: druid
title: Druid
extras:
- druid
services:
- druid
- os: windows-latest
backend:
name: oracle
title: Oracle
serial: true
extras:
- oracle
services:
- oracle
- os: windows-latest
backend:
name: flink
title: Flink
serial: true
extras:
- flink
services:
- flink
- python-version: "3.11"
backend:
name: flink
title: Flink
serial: true
extras:
- flink
services:
- flink
steps:
- name: update and install system dependencies
if: matrix.os == 'ubuntu-latest' && matrix.backend.sys-deps != null
run: |
set -euo pipefail
sudo apt-get update -qq -y
sudo apt-get install -qq -y build-essential ${{ join(matrix.backend.sys-deps, ' ') }}
- name: install sqlite
if: matrix.os == 'windows-latest' && matrix.backend.name == 'sqlite'
run: choco install sqlite
- name: checkout
uses: actions/checkout@v4
- uses: extractions/setup-just@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: download backend data
run: just download-data
- name: start services
if: matrix.backend.services != null
run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}
- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.7.1'
- uses: syphar/restore-virtualenv@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ matrix.backend.name }}-${{ steps.install_python.outputs.python-version }}
- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"
- name: install deps for broken avro-python setup
if: matrix.backend.name == 'flink'
run: poetry run pip install wheel
- name: install other deps
if: matrix.backend.additional_deps != null
run: poetry run pip install ${{ join(matrix.backend.additional_deps, ' ') }}
- name: show installed deps
run: poetry run pip list
- name: "run parallel tests: ${{ matrix.backend.name }}"
if: ${{ !matrix.backend.serial }}
run: just ci-check -m ${{ matrix.backend.name }} --numprocesses auto --dist=loadgroup
- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial && matrix.backend.name == 'impala'
run: just ci-check -m ${{ matrix.backend.name }} --randomly-dont-reorganize
env:
IBIS_TEST_NN_HOST: localhost
IBIS_TEST_IMPALA_HOST: localhost
IBIS_TEST_IMPALA_PORT: 21050
IBIS_TEST_WEBHDFS_PORT: 50070
IBIS_TEST_WEBHDFS_USER: hdfs
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
# FIXME(deepyaman): If some backend-specific test, in test_ddl.py,
# executes before common tests, they will fail with:
# org.apache.flink.table.api.ValidationException: Table `default_catalog`.`default_database`.`functional_alltypes` was not found.
# Therefore, we run backend-specific tests second to avoid this.
- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial && matrix.backend.name == 'flink'
run: |
just ci-check -m ${{ matrix.backend.name }} ibis/backends/tests
just ci-check -m ${{ matrix.backend.name }} ibis/backends/flink/tests
env:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
FLINK_REMOTE_CLUSTER_ADDR: localhost
FLINK_REMOTE_CLUSTER_PORT: "8081"
- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial && matrix.backend.name != 'impala' && matrix.backend.name != 'flink'
run: just ci-check -m ${{ matrix.backend.name }}
env:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
- name: check that no untracked files were produced
shell: bash
run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .
- name: upload code coverage
if: success()
uses: codecov/codecov-action@v3
with:
flags: backend,${{ matrix.backend.name }},${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
- name: Show docker compose logs on fail
if: matrix.backend.services != null && failure()
run: docker compose logs
test_backends_min_version:
name: ${{ matrix.backend.title }} Min Version ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
env:
SQLALCHEMY_WARN_20: "1"
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
- windows-latest
python-version:
- "3.9"
- "3.11"
backend:
- name: dask
title: Dask
deps:
- "dask[array,dataframe]@2022.9.1"
- "[email protected]"
extras:
- dask
- name: postgres
title: PostgreSQL
deps:
- "[email protected]"
- "[email protected]"
- "[email protected]"
- "Shapely@2"
services:
- postgres
extras:
- postgres
- geospatial
exclude:
- os: windows-latest
backend:
name: postgres
title: PostgreSQL
deps:
- "[email protected]"
- "[email protected]"
- "[email protected]"
- "Shapely@2"
services:
- postgres
extras:
- postgres
- geospatial
- python-version: "3.11"
backend:
name: postgres
title: PostgreSQL
deps:
- "[email protected]"
- "[email protected]"
- "[email protected]"
- "Shapely@2"
services:
- postgres
extras:
- postgres
- geospatial
steps:
- name: checkout
uses: actions/checkout@v4
- name: install libgeos for shapely
if: matrix.backend.name == 'postgres'
run: |
sudo apt-get update -y -qq
sudo apt-get install -qq -y build-essential libgeos-dev
- uses: extractions/setup-just@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: download backend data
run: just download-data
- name: start services
if: matrix.backend.services != null
run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}
- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.7.1'
- name: remove lonboard
# it requires a version of pandas that min versions are not compatible with
run: poetry remove lonboard
- name: install minimum versions
run: poetry add --lock --optional ${{ join(matrix.backend.deps, ' ') }}
- name: checkout the lock file
run: git checkout poetry.lock
- name: lock with no updates
# poetry add is aggressive and will update other dependencies like
# numpy and pandas so we keep the pyproject.toml edits and then relock
# without updating anything except the requested versions
run: poetry lock --no-update
- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"
- name: run tests
run: just ci-check -m ${{ matrix.backend.name }} --numprocesses auto --dist=loadgroup
- name: check that no untracked files were produced
shell: bash
run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .
- name: upload code coverage
if: success()
uses: codecov/codecov-action@v3
with:
flags: backend,${{ matrix.backend.name }},${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
- name: Show docker compose logs on fail
if: matrix.backend.services != null && failure()
run: docker compose logs
test_pyspark:
name: PySpark ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
python-version:
- "3.10"
steps:
- name: checkout
uses: actions/checkout@v4
- uses: actions/setup-java@v4
with:
distribution: microsoft
java-version: 17
- uses: extractions/setup-just@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: download backend data
run: just download-data
- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.7.1'
- name: remove lonboard
# it requires a version of pandas that pyspark is not compatible with
run: poetry remove lonboard
- name: install maximum versions of pandas and numpy
run: poetry add --lock 'pandas@<2' 'numpy<1.24'
- name: checkout the lock file
run: git checkout poetry.lock
- name: lock with no updates
# poetry add is aggressive and will update other dependencies like
# numpy and pandas so we keep the pyproject.toml edits and then relock
# without updating anything except the requested versions
run: poetry lock --no-update
- name: install ibis
run: poetry install --without dev --without docs --extras pyspark
- name: run tests
run: just ci-check -m pyspark
- name: check that no untracked files were produced
shell: bash
run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .
- name: upload code coverage
# only upload coverage for jobs that aren't mostly xfails
if: success() && matrix.python-version != '3.11'
uses: codecov/codecov-action@v3
with:
flags: backend,pyspark,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
gen_lockfile_sqlalchemy2:
name: Generate Poetry Lockfile for SQLAlchemy 2
runs-on: ubuntu-latest
steps:
- name: checkout
uses: actions/checkout@v4
- name: install python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- run: python -m pip install --upgrade pip 'poetry==1.7.1'
- name: remove deps that are not compatible with sqlalchemy 2
run: poetry remove snowflake-sqlalchemy
- name: add sqlalchemy 2
run: poetry add --lock --optional 'sqlalchemy>=2,<3'
- name: checkout the lock file
run: git checkout poetry.lock
- name: lock with no updates
# poetry add is aggressive and will update other dependencies like
# numpy and pandas so we keep the pyproject.toml edits and then relock
# without updating anything except the requested versions
run: poetry lock --no-update
- name: check the sqlalchemy version
run: poetry show sqlalchemy --no-ansi | grep version | cut -d ':' -f2- | sed 's/ //g' | grep -P '^2\.'
- name: upload deps file
uses: actions/upload-artifact@v3
with:
name: deps
path: |
pyproject.toml
poetry.lock
test_backends_sqlalchemy2:
name: SQLAlchemy 2 ${{ matrix.backend.title }} ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
needs:
- gen_lockfile_sqlalchemy2
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
python-version:
- "3.11"
backend:
- name: mssql
title: MS SQL Server
services:
- mssql
extras:
- mssql
- name: mysql
title: MySQL
services:
- mysql
extras:
- geospatial
- mysql
- name: postgres
title: PostgreSQL
services:
- postgres
extras:
- geospatial
- postgres
- name: sqlite
title: SQLite
extras:
- sqlite
- name: trino
title: Trino
services:
- trino
extras:
- trino
- postgres
- name: duckdb
title: DuckDB
extras:
- duckdb
- name: oracle
title: Oracle
serial: true
extras:
- oracle
services:
- oracle
steps:
- name: checkout
uses: actions/checkout@v4
- name: install libgeos for shapely
if: ${{ matrix.backend.name == 'postgres' }}
run: |
sudo apt-get update -y -qq
sudo apt-get install -qq -y build-essential libgeos-dev
- name: install freetds-dev for mssql
if: ${{ matrix.backend.name == 'mssql' }}
run: |
sudo apt-get update -y -qq
sudo apt-get install -qq -y build-essential libkrb5-dev krb5-config freetds-dev
- uses: extractions/setup-just@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: download backend data
run: just download-data
- name: start services
if: matrix.backend.services != null
run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}
- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
- name: download poetry lockfile
uses: actions/download-artifact@v3
with:
name: deps
path: deps
- name: pull out lockfile
run: |
set -euo pipefail
mv -f deps/* .
rm -r deps
- uses: syphar/restore-virtualenv@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ matrix.backend.name }}-${{ steps.install_python.outputs.python-version }}
- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ steps.install_python.outputs.python-version }}
- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.7.1'
- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"
- name: run tests
run: just ci-check -m ${{ matrix.backend.name }} --numprocesses auto --dist=loadgroup
- name: check that no untracked files were produced
shell: bash
run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .
- name: upload code coverage
if: success()
uses: codecov/codecov-action@v3
with:
flags: backend,${{ matrix.backend.name }},${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
backends:
# this job exists so that we can use a single job from this workflow to gate merging
runs-on: ubuntu-latest
needs:
- test_backends_min_version
- test_backends
- test_backends_sqlalchemy2
- test_pyspark
steps:
- run: exit 0