diff --git a/ibis/backends/datafusion/__init__.py b/ibis/backends/datafusion/__init__.py index 8572ceadfe22..0570f163e9f9 100644 --- a/ibis/backends/datafusion/__init__.py +++ b/ibis/backends/datafusion/__init__.py @@ -421,7 +421,7 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: # self.con.register_table is broken, so we do this roundabout thing # of constructing a datafusion DataFrame, which has a side effect # of registering the table - self.con.from_arrow_table(op.data.to_pyarrow(op.schema), op.name) + self.con.from_arrow(op.data.to_pyarrow(op.schema), op.name) def read_csv( self, path: str | Path, table_name: str | None = None, **kwargs: Any @@ -757,14 +757,14 @@ def _polars(source, table_name, _conn, overwrite: bool = False): def _pyarrow_table(source, table_name, _conn, overwrite: bool = False): tmp_name = gen_name("pyarrow") with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite): - _conn.con.from_arrow_table(source, name=tmp_name) + _conn.con.from_arrow(source, name=tmp_name) @_read_in_memory.register("pyarrow.RecordBatchReader") def _pyarrow_rbr(source, table_name, _conn, overwrite: bool = False): tmp_name = gen_name("pyarrow") with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite): - _conn.con.from_arrow_table(source.read_all(), name=tmp_name) + _conn.con.from_arrow(source.read_all(), name=tmp_name) @_read_in_memory.register("pyarrow.RecordBatch") diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py index b0706acf0080..1db71e2eccc9 100644 --- a/ibis/backends/tests/test_aggregation.py +++ b/ibis/backends/tests/test_aggregation.py @@ -1586,7 +1586,9 @@ def test_grouped_case(backend, con): @pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl( - ["datafusion"], raises=Exception, reason="not supported in datafusion" + ["datafusion"], + raises=BaseException, + reason="because pyo3 panic exception is raised", ) @pytest.mark.notyet(["flink"], raises=Py4JJavaError) @pytest.mark.notyet(["impala"], raises=ImpalaHiveServer2Error) diff --git a/ibis/backends/tests/test_map.py b/ibis/backends/tests/test_map.py index 851c0810732a..e8a80757cff9 100644 --- a/ibis/backends/tests/test_map.py +++ b/ibis/backends/tests/test_map.py @@ -21,7 +21,7 @@ ["bigquery", "impala"], reason="Backend doesn't yet implement map types" ), pytest.mark.notimpl( - ["datafusion", "exasol", "polars", "druid", "oracle"], + ["exasol", "polars", "druid", "oracle"], reason="Not yet implemented in ibis", ), ] @@ -39,6 +39,10 @@ reason="function hstore(character varying[], character varying[]) does not exist", ) +mark_notyet_datafusion = pytest.mark.notyet( + ["datafusion"], raises=Exception, reason="only map and make_map are available" +) + @pytest.mark.notyet("clickhouse", reason="nested types can't be NULL") @pytest.mark.notimpl( @@ -54,6 +58,7 @@ param(None, None, id="null_both"), ], ) +@mark_notyet_datafusion def test_map_nulls(con, k, v): k = ibis.literal(k, type="array") v = ibis.literal(v, type="array") @@ -74,6 +79,7 @@ def test_map_nulls(con, k, v): param(None, None, id="null_both"), ], ) +@mark_notyet_datafusion def test_map_keys_nulls(con, k, v): k = ibis.literal(k, type="array") v = ibis.literal(v, type="array") @@ -106,6 +112,7 @@ def test_map_keys_nulls(con, k, v): param(ibis.literal(None, type="map"), id="null_map"), ], ) +@mark_notyet_datafusion def test_map_values_nulls(con, map): assert con.execute(map.values()) is None @@ -174,6 +181,7 @@ def test_map_values_nulls(con, map): ], ) @pytest.mark.parametrize("method", ["get", "contains"]) +@mark_notyet_datafusion def test_map_get_contains_nulls(con, map, key, method): expr = getattr(map, method) assert con.execute(expr(key)) is None @@ -205,11 +213,13 @@ def test_map_get_contains_nulls(con, map, key, method): ), ], ) +@mark_notyet_datafusion def test_map_merge_nulls(con, m1, m2): concatted = m1 + m2 assert con.execute(concatted) is None +@mark_notyet_datafusion def test_map_table(backend): table = backend.map assert table.kv.type().is_map() @@ -217,6 +227,7 @@ def test_map_table(backend): @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_column_map_values(backend): table = backend.map expr = table.select("idx", vals=table.kv.values()).order_by("idx") @@ -225,6 +236,7 @@ def test_column_map_values(backend): backend.assert_series_equal(result, expected) +@mark_notyet_datafusion def test_column_map_merge(backend): table = backend.map expr = table.select( @@ -239,6 +251,7 @@ def test_column_map_merge(backend): @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_literal_map_keys(con): mapping = ibis.literal({"1": "a", "2": "b"}) expr = mapping.keys().name("tmp") @@ -250,6 +263,7 @@ def test_literal_map_keys(con): @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_literal_map_values(con): mapping = ibis.literal({"1": "a", "2": "b"}) expr = mapping.values().name("tmp") @@ -260,6 +274,7 @@ def test_literal_map_values(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres +@mark_notyet_datafusion def test_scalar_isin_literal_map_keys(con): mapping = ibis.literal({"a": 1, "b": 2}) a = ibis.literal("a") @@ -272,6 +287,7 @@ def test_scalar_isin_literal_map_keys(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres +@mark_notyet_datafusion def test_map_scalar_contains_key_scalar(con): mapping = ibis.literal({"a": 1, "b": 2}) a = ibis.literal("a") @@ -283,6 +299,7 @@ def test_map_scalar_contains_key_scalar(con): @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_map_scalar_contains_key_column(backend, alltypes, df): value = {"1": "a", "3": "c"} mapping = ibis.literal(value) @@ -294,6 +311,7 @@ def test_map_scalar_contains_key_column(backend, alltypes, df): @mark_notimpl_risingwave_hstore @mark_notyet_postgres +@mark_notyet_datafusion def test_map_column_contains_key_scalar(backend, alltypes, df): expr = ibis.map(ibis.array([alltypes.string_col]), ibis.array([alltypes.int_col])) series = df.apply(lambda row: {row["string_col"]: row["int_col"]}, axis=1) @@ -306,6 +324,7 @@ def test_map_column_contains_key_scalar(backend, alltypes, df): @mark_notimpl_risingwave_hstore @mark_notyet_postgres +@mark_notyet_datafusion def test_map_column_contains_key_column(alltypes): map_expr = ibis.map( ibis.array([alltypes.string_col]), ibis.array([alltypes.int_col]) @@ -317,6 +336,7 @@ def test_map_column_contains_key_column(alltypes): @mark_notimpl_risingwave_hstore @mark_notyet_postgres +@mark_notyet_datafusion def test_literal_map_merge(con): a = ibis.literal({"a": 0, "b": 2}) b = ibis.literal({"a": 1, "c": 3}) @@ -326,6 +346,7 @@ def test_literal_map_merge(con): @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_literal_map_getitem_broadcast(backend, alltypes, df): value = {"1": "a", "2": "b"} @@ -472,6 +493,7 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): @values @keys @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_map_get_all_types(con, keys, values): m = ibis.map(ibis.array(keys), ibis.array(values)) for key, val in zip(keys, values): @@ -482,6 +504,7 @@ def test_map_get_all_types(con, keys, values): @keys @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_map_contains_all_types(con, keys): a = ibis.array(keys) m = ibis.map(a, a) @@ -490,6 +513,7 @@ def test_map_contains_all_types(con, keys): @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_literal_map_get_broadcast(backend, alltypes, df): value = {"1": "a", "2": "b"} @@ -524,13 +548,14 @@ def test_map_construct_dict(con, keys, values): assert result == dict(zip(keys, values)) -@mark_notimpl_risingwave_hstore -@mark_notyet_postgres @pytest.mark.notimpl( ["flink"], raises=pa.lib.ArrowInvalid, reason="Map array child array should have no nulls", ) +@mark_notimpl_risingwave_hstore +@mark_notyet_postgres +@mark_notyet_datafusion def test_map_construct_array_column(con, alltypes, df): expr = ibis.map(ibis.array([alltypes.string_col]), ibis.array([alltypes.int_col])) result = con.execute(expr) @@ -541,6 +566,7 @@ def test_map_construct_array_column(con, alltypes, df): @mark_notimpl_risingwave_hstore @mark_notyet_postgres +@mark_notyet_datafusion def test_map_get_with_compatible_value_smaller(con): value = ibis.literal({"A": 1000, "B": 2000}) expr = value.get("C", 3) @@ -549,6 +575,7 @@ def test_map_get_with_compatible_value_smaller(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres +@mark_notyet_datafusion def test_map_get_with_compatible_value_bigger(con): value = ibis.literal({"A": 1, "B": 2}) expr = value.get("C", 3000) @@ -557,6 +584,7 @@ def test_map_get_with_compatible_value_bigger(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres +@mark_notyet_datafusion def test_map_get_with_incompatible_value_different_kind(con): value = ibis.literal({"A": 1000, "B": 2000}) expr = value.get("C", 3.0) @@ -565,6 +593,7 @@ def test_map_get_with_incompatible_value_different_kind(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres +@mark_notyet_datafusion @pytest.mark.parametrize("null_value", [None, ibis.null()]) def test_map_get_with_null_on_not_nullable(con, null_value): map_type = dt.Map(dt.string, dt.Int16(nullable=False)) @@ -579,6 +608,7 @@ def test_map_get_with_null_on_not_nullable(con, null_value): ["flink"], raises=Py4JJavaError, reason="Flink cannot handle typeless nulls" ) @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_map_get_with_null_on_null_type_with_null(con, null_value): value = ibis.literal({"A": None, "B": None}) expr = value.get("C", null_value) @@ -586,11 +616,12 @@ def test_map_get_with_null_on_null_type_with_null(con, null_value): assert pd.isna(result) -@mark_notimpl_risingwave_hstore -@mark_notyet_postgres @pytest.mark.notyet( ["flink"], raises=Py4JJavaError, reason="Flink cannot handle typeless nulls" ) +@mark_notimpl_risingwave_hstore +@mark_notyet_postgres +@mark_notyet_datafusion def test_map_get_with_null_on_null_type_with_non_null(con): value = ibis.literal({"A": None, "B": None}) expr = value.get("C", 1) @@ -603,6 +634,7 @@ def test_map_get_with_null_on_null_type_with_non_null(con): reason="`tbl_properties` is required when creating table with schema", ) @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_map_create_table(con, temp_table): t = con.create_table( temp_table, @@ -617,11 +649,13 @@ def test_map_create_table(con, temp_table): reason="No translation rule for ", ) @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_map_length(con): expr = ibis.literal(dict(a="A", b="B")).length() assert con.execute(expr) == 2 +@mark_notyet_datafusion def test_map_keys_unnest(backend): expr = backend.map.kv.keys().unnest() result = expr.to_pandas() @@ -629,6 +663,7 @@ def test_map_keys_unnest(backend): @mark_notimpl_risingwave_hstore +@mark_notyet_datafusion def test_map_contains_null(con): expr = ibis.map(["a"], ibis.literal([None], type="array")) assert con.execute(expr.contains("a")) diff --git a/ibis/backends/tests/tpc/ds/test_queries.py b/ibis/backends/tests/tpc/ds/test_queries.py index e6fd2b3272c3..04a77f894e2e 100644 --- a/ibis/backends/tests/tpc/ds/test_queries.py +++ b/ibis/backends/tests/tpc/ds/test_queries.py @@ -9,11 +9,7 @@ from ibis import _, coalesce, cumulative_window, date, ifelse, null, rank, union from ibis import literal as lit from ibis import selectors as s -from ibis.backends.tests.errors import ( - ArrowNotImplementedError, - ClickHouseDatabaseError, - TrinoUserError, -) +from ibis.backends.tests.errors import ClickHouseDatabaseError, TrinoUserError from ibis.backends.tests.tpc.conftest import tpc_test from ibis.common.exceptions import OperationNotDefinedError @@ -1416,7 +1412,6 @@ def test_26(catalog_sales, customer_demographics, date_dim, item, promotion): @tpc_test("ds") -@pytest.mark.notyet(["datafusion"], reason="Failed to plan") def test_27(store_sales, customer_demographics, date_dim, store, item): results = ( store_sales.join(customer_demographics, [("ss_cdemo_sk", "cd_demo_sk")]) @@ -1999,11 +1994,6 @@ def test_38(store_sales, catalog_sales, web_sales, date_dim, customer): @tpc_test("ds") -@pytest.mark.notyet( - ["datafusion"], - raises=ArrowNotImplementedError, - reason="Unsupported cast from double to null using function cast_null", -) def test_39(inventory, item, warehouse, date_dim): inv = ( inventory.join(item, [("inv_item_sk", "i_item_sk")]) @@ -4894,11 +4884,6 @@ def test_89(item, store_sales, date_dim, store): ).limit(100) -@pytest.mark.notyet( - ["datafusion"], - raises=ArrowNotImplementedError, - reason="Unsupported cast from double to null using function cast_null", -) @tpc_test("ds") def test_90(web_sales, household_demographics, time_dim, web_page): def am_pm(*, hour: int, name: str): diff --git a/poetry.lock b/poetry.lock index 46f34f9e74c5..96e4b251d42a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1341,16 +1341,17 @@ tests = ["pytest", "pytest-cov", "pytest-xdist"] [[package]] name = "datafusion" -version = "40.1.0" +version = "41.0.0" description = "Build and run queries against data" optional = true python-versions = ">=3.6" files = [ - {file = "datafusion-40.1.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:983b5b89a3aaaf2789f0423564cc24dbe5eb3f4f0a7daa8e87b35ce4d2920d6b"}, - {file = "datafusion-40.1.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be44d24971e73d324a3f41503bb091f48d171d50d1d2415b469ca5e3953b5a0e"}, - {file = "datafusion-40.1.0-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:fd8c3689e9d195a9603a00e399b995a6343587d0763358f2eee65b85d5f56a37"}, - {file = "datafusion-40.1.0-cp38-abi3-win_amd64.whl", hash = "sha256:2960871ce31ee489ef3b210e77c4048278e9ee873517eff9f46ca52c82eb166d"}, - {file = "datafusion-40.1.0.tar.gz", hash = "sha256:7c7cfd2bcf491adcf6580f0ff5882ca9fc658dbdab30802ad46889e417965cc6"}, + {file = "datafusion-41.0.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:4b484035765a4f239737d6313af3cc3822448dfa86738ec44db02dfc4e08057f"}, + {file = "datafusion-41.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:fe324048a63bf462d49cca3b046821fcb546cdab3a13b1fe860aab038c4e4ad4"}, + {file = "datafusion-41.0.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7c6987ad20b238a555fac09851f2329cd8b7e829de98446159ea27a172a5f1f"}, + {file = "datafusion-41.0.0-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:a10179610c8d211d215ff3533bbd8f5faf3b47b00c0e8371ca9656e98c420380"}, + {file = "datafusion-41.0.0-cp38-abi3-win_amd64.whl", hash = "sha256:a862f6667979a367c30ae58d8770bba044bab09d1da9012ee37cb3bb86fccdf2"}, + {file = "datafusion-41.0.0.tar.gz", hash = "sha256:b2124bcd976520a8dbcb456c200f2cb8b1343285e9329fe757aa628bbd0b08f7"}, ] [package.dependencies] @@ -7893,4 +7894,4 @@ visualization = ["graphviz"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "c3905989eab1b3ab585b029a685e38ac6fa10febb022f15f3b1aea5e449e23fd" +content-hash = "82384564308f7be107e844339bbcaaf90643f677702f39cf19231f121d2bc7f5" diff --git a/pyproject.toml b/pyproject.toml index c70a7682dfed..f2fa45d43ac9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ clickhouse-connect = { version = ">=0.5.23,<1", optional = true, extras = [ "numpy", "pandas", ] } -datafusion = { version = ">=0.6,<41", optional = true } +datafusion = { version = ">=0.6,<42", optional = true } db-dtypes = { version = ">=0.3,<2", optional = true } deltalake = { version = ">=0.9.0,<1", optional = true } duckdb = { version = ">=0.8.1,<1.2", optional = true } diff --git a/requirements-dev.txt b/requirements-dev.txt index 0fbab1bb81f4..28f9348419d3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -42,7 +42,7 @@ coverage[toml]==7.6.1 ; python_version >= "3.10" and python_version < "4.0" crashtest==0.4.1 ; python_version >= "3.10" and python_version < "4.0" cryptography==43.0.1 ; python_version >= "3.10" and python_version < "4.0" cycler==0.12.1 ; python_version >= "3.10" and python_version < "3.13" -datafusion==40.1.0 ; python_version >= "3.10" and python_version < "4.0" +datafusion==41.0.0 ; python_version >= "3.10" and python_version < "4.0" db-dtypes==1.3.0 ; python_version >= "3.10" and python_version < "4.0" debugpy==1.8.5 ; python_version >= "3.10" and python_version < "3.13" decorator==5.1.1 ; python_version >= "3.10" and python_version < "4.0"