Skip to content

Commit

Permalink
fix(duckdb): support version 1.1.0 (#10037)
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud authored Sep 9, 2024
1 parent 3533827 commit 3a37626
Show file tree
Hide file tree
Showing 13 changed files with 108 additions and 69 deletions.
6 changes: 6 additions & 0 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import pyarrow_hotfix # noqa: F401
import sqlglot as sg
import sqlglot.expressions as sge
from packaging.version import parse as vparse

import ibis
import ibis.backends.sql.compilers as sc
Expand Down Expand Up @@ -461,6 +462,11 @@ def _post_connect(self, extensions: Sequence[str] | None = None) -> None:
# Default timezone, can't be set with `config`
self.settings["timezone"] = "UTC"

# setting this to false disables magic variables-as-tables discovery,
# hopefully eliminating large classes of bugs
if vparse(self.version) > vparse("1"):
self.settings["python_enable_replacements"] = False

self._record_batch_readers_consumed = {}

def _load_extensions(
Expand Down
3 changes: 2 additions & 1 deletion ibis/backends/duckdb/tests/test_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ def favg(x: float, where: bool = True) -> float: ...
def test_builtin_agg(con, func):
import ibis

raw_data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
start, stop = 1, 11
raw_data = list(map(float, range(start, stop)))
data = ibis.memtable({"a": raw_data})
expr = func(data.a)

Expand Down
14 changes: 12 additions & 2 deletions ibis/backends/sql/compilers/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ class DuckDBCompiler(SQLGlotCompiler):

SIMPLE_OPS = {
ops.Arbitrary: "any_value",
ops.ArrayPosition: "list_indexof",
ops.ArrayMin: "list_min",
ops.ArrayMax: "list_max",
ops.ArrayAny: "list_bool_or",
Expand Down Expand Up @@ -150,6 +149,13 @@ def visit_ArrayDistinct(self, op, *, arg):
),
)

def visit_ArrayPosition(self, op, *, arg, other):
    """Compile ``ops.ArrayPosition`` into a NULL-aware ``list_indexof`` call.

    The compiled expression is NULL when either ``arg`` or ``other`` is
    NULL; otherwise it is ``list_indexof(arg, other)`` with a NULL result
    coalesced to 0.
    """
    either_is_null = arg.is_(NULL) | other.is_(NULL)
    # coalesce maps a NULL coming back from list_indexof to 0
    position = self.f.coalesce(self.f.list_indexof(arg, other), 0)
    return self.if_(either_is_null, NULL, position)

def visit_ArrayCollect(self, op, *, arg, where, order_by, include_null):
if not include_null:
cond = arg.is_(sg.not_(NULL, copy=False))
Expand Down Expand Up @@ -352,7 +358,11 @@ def visit_IntervalFromInteger(self, op, *, arg, unit):
return self.f[f"to_{unit.plural}"](arg)

def visit_FindInSet(self, op, *, needle, values):
return self.f.list_indexof(self.f.array(*values), needle)
return self.if_(
needle.is_(NULL),
NULL,
self.f.coalesce(self.f.list_indexof(self.f.array(*values), needle), 0),
)

def visit_CountDistinctStar(self, op, *, where, arg):
# use a tuple because duckdb doesn't accept COUNT(DISTINCT a, b, c, ...)
Expand Down
6 changes: 6 additions & 0 deletions ibis/backends/tests/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,12 @@ def mean_and_std(v):
lambda t, where: t.string_col.approx_nunique(where=where),
lambda t, where: t.string_col[where].nunique(),
id="approx_nunique",
marks=pytest.mark.xfail_version(
duckdb=["duckdb>=1.1"],
raises=AssertionError,
reason="not exact, even at this tiny scale",
strict=False,
),
),
param(
lambda t, where: t.bigint_col.bit_and(where=where),
Expand Down
8 changes: 5 additions & 3 deletions ibis/backends/tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,15 +296,17 @@ def test_roundtrip_partitioned_parquet(tmp_path, con, backend, awards_players):

# Reingest and compare schema
reingest = con.read_parquet(outparquet / "*" / "*")
reingest = reingest.cast({"yearID": "int64"})

# avoid type comparison to appease duckdb: as of 0.8.0 it returns large_string
assert reingest.schema().names == awards_players.schema().names
assert reingest.schema().keys() == awards_players.schema().keys()

reingest = reingest.order_by(["yearID", "playerID", "awardID", "lgID"])
awards_players = awards_players.order_by(["yearID", "playerID", "awardID", "lgID"])

backend.assert_frame_equal(reingest.to_pandas(), awards_players.to_pandas())
# reorder columns to match the partitioning
backend.assert_frame_equal(
reingest.to_pandas(), awards_players[reingest.columns].to_pandas()
)


@pytest.mark.parametrize("ftype", ["csv", "parquet"])
Expand Down
5 changes: 2 additions & 3 deletions ibis/backends/tests/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,14 +1290,13 @@ def test_floating_mod(backend, alltypes, df):
)
@pytest.mark.notyet(["mysql", "pyspark"], raises=AssertionError)
@pytest.mark.notyet(
["duckdb", "sqlite"],
raises=AssertionError,
reason="returns NULL when dividing by zero",
["sqlite"], raises=AssertionError, reason="returns NULL when dividing by zero"
)
@pytest.mark.notyet(["mssql"], raises=PyODBCDataError)
@pytest.mark.notyet(["snowflake"], raises=SnowflakeProgrammingError)
@pytest.mark.notyet(["postgres"], raises=PsycoPg2DivisionByZero)
@pytest.mark.notimpl(["exasol"], raises=ExaQueryError)
@pytest.mark.xfail_version(duckdb=["duckdb<1.1"])
def test_divide_by_zero(backend, alltypes, df, column, denominator):
expr = alltypes[column] / denominator
result = expr.name("tmp").execute()
Expand Down
7 changes: 6 additions & 1 deletion ibis/backends/tests/test_temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1900,10 +1900,15 @@ def test_large_timestamp(con):
id="ns",
marks=[
pytest.mark.notyet(
["duckdb", "impala", "pyspark", "trino"],
["impala", "pyspark", "trino"],
reason="drivers appear to truncate nanos",
raises=AssertionError,
),
pytest.mark.xfail_version(
duckdb=["duckdb<1.1"],
reason="not implemented until 1.1",
raises=AssertionError,
),
pytest.mark.notimpl(
["druid"],
reason="ibis normalization truncates nanos",
Expand Down
4 changes: 2 additions & 2 deletions ibis/expr/types/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1630,11 +1630,11 @@ def approx_nunique(self, where: ir.BooleanValue | None = None) -> ir.IntegerScal
>>> t = ibis.examples.penguins.fetch()
>>> t.body_mass_g.approx_nunique()
┌────┐
│ 94 │
│ 92 │
└────┘
>>> t.body_mass_g.approx_nunique(where=t.species == "Adelie")
┌────┐
│ 55 │
│ 61 │
└────┘
"""
return ops.ApproxCountDistinct(
Expand Down
10 changes: 5 additions & 5 deletions ibis/expr/types/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3277,11 +3277,11 @@ def join(
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ int64 │ string │ int64 │ string │
├─────────┼───────────────────┼───────────────┼───────────────────┤
│ 1732 │ funny │ 60756 │ funny │
│ 1732 │ Highly quotable │ 60756 │ Highly quotable │
│ 1732 │ drugs │ 106782 │ drugs │
│ 5989 │ Leonardo DiCaprio │ 106782 │ Leonardo DiCaprio │
│ 139385 │ tom hardy │ 89774 │ Tom Hardy │
│ 60756 │ funny │ 1732 │ funny │
│ 60756 │ Highly quotable │ 1732 │ Highly quotable │
│ 89774 │ Tom Hardy │ 139385 │ tom hardy │
│ 106782 │ drugs │ 1732 │ drugs │
│ 106782 │ Leonardo DiCaprio │ 5989 │ Leonardo DiCaprio │
└─────────┴───────────────────┴───────────────┴───────────────────┘
"""
from ibis.expr.types.joins import Join
Expand Down
2 changes: 2 additions & 0 deletions ibis/formats/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,8 @@ def convert(value):
return value
elif isinstance(value, UUID):
return value
elif isinstance(value, bytes):
return UUID(bytes=value)
return UUID(value)

return convert
Expand Down
Loading

0 comments on commit 3a37626

Please sign in to comment.