fix(duckdb): support version 1.1.0 (#10037)

ibis-project · Sep 9, 2024
commit 3a37626 (1 parent: 3533827)

Showing 13 changed files with 108 additions and 69 deletions.
diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py
@@ -16,6 +16,7 @@
 import pyarrow_hotfix  # noqa: F401
 import sqlglot as sg
 import sqlglot.expressions as sge
+from packaging.version import parse as vparse
 
 import ibis
 import ibis.backends.sql.compilers as sc
@@ -461,6 +462,11 @@ def _post_connect(self, extensions: Sequence[str] | None = None) -> None:
         # Default timezone, can't be set with `config`
         self.settings["timezone"] = "UTC"
 
+        # setting this to false disables magic variables-as-tables discovery,
+        # hopefully eliminating large classes of bugs
+        if vparse(self.version) > vparse("1"):
+            self.settings["python_enable_replacements"] = False
+
         self._record_batch_readers_consumed = {}
 
     def _load_extensions(

diff --git a/ibis/backends/duckdb/tests/test_udf.py b/ibis/backends/duckdb/tests/test_udf.py
@@ -73,7 +73,8 @@ def favg(x: float, where: bool = True) -> float: ...
 def test_builtin_agg(con, func):
     import ibis
 
-    raw_data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
+    start, stop = 1, 11
+    raw_data = list(map(float, range(start, stop)))
     data = ibis.memtable({"a": raw_data})
     expr = func(data.a)
 

diff --git a/ibis/backends/sql/compilers/duckdb.py b/ibis/backends/sql/compilers/duckdb.py
@@ -51,7 +51,6 @@ class DuckDBCompiler(SQLGlotCompiler):
 
     SIMPLE_OPS = {
         ops.Arbitrary: "any_value",
-        ops.ArrayPosition: "list_indexof",
         ops.ArrayMin: "list_min",
         ops.ArrayMax: "list_max",
         ops.ArrayAny: "list_bool_or",
@@ -150,6 +149,13 @@ def visit_ArrayDistinct(self, op, *, arg):
             ),
         )
 
+    def visit_ArrayPosition(self, op, *, arg, other):
+        return self.if_(
+            arg.is_(NULL) | other.is_(NULL),
+            NULL,
+            self.f.coalesce(self.f.list_indexof(arg, other), 0),
+        )
+
     def visit_ArrayCollect(self, op, *, arg, where, order_by, include_null):
         if not include_null:
             cond = arg.is_(sg.not_(NULL, copy=False))
@@ -352,7 +358,11 @@ def visit_IntervalFromInteger(self, op, *, arg, unit):
         return self.f[f"to_{unit.plural}"](arg)
 
     def visit_FindInSet(self, op, *, needle, values):
-        return self.f.list_indexof(self.f.array(*values), needle)
+        return self.if_(
+            needle.is_(NULL),
+            NULL,
+            self.f.coalesce(self.f.list_indexof(self.f.array(*values), needle), 0),
+        )
 
     def visit_CountDistinctStar(self, op, *, where, arg):
         # use a tuple because duckdb doesn't accept COUNT(DISTINCT a, b, c, ...)

diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py
@@ -466,6 +466,12 @@ def mean_and_std(v):
             lambda t, where: t.string_col.approx_nunique(where=where),
             lambda t, where: t.string_col[where].nunique(),
             id="approx_nunique",
+            marks=pytest.mark.xfail_version(
+                duckdb=["duckdb>=1.1"],
+                raises=AssertionError,
+                reason="not exact, even at this tiny scale",
+                strict=False,
+            ),
         ),
         param(
             lambda t, where: t.bigint_col.bit_and(where=where),

diff --git a/ibis/backends/tests/test_export.py b/ibis/backends/tests/test_export.py
@@ -296,15 +296,17 @@ def test_roundtrip_partitioned_parquet(tmp_path, con, backend, awards_players):
 
     # Reingest and compare schema
     reingest = con.read_parquet(outparquet / "*" / "*")
-    reingest = reingest.cast({"yearID": "int64"})
 
     # avoid type comparison to appease duckdb: as of 0.8.0 it returns large_string
-    assert reingest.schema().names == awards_players.schema().names
+    assert reingest.schema().keys() == awards_players.schema().keys()
 
     reingest = reingest.order_by(["yearID", "playerID", "awardID", "lgID"])
     awards_players = awards_players.order_by(["yearID", "playerID", "awardID", "lgID"])
 
-    backend.assert_frame_equal(reingest.to_pandas(), awards_players.to_pandas())
+    # reorder columns to match the partitioning
+    backend.assert_frame_equal(
+        reingest.to_pandas(), awards_players[reingest.columns].to_pandas()
+    )
 
 
 @pytest.mark.parametrize("ftype", ["csv", "parquet"])

diff --git a/ibis/backends/tests/test_numeric.py b/ibis/backends/tests/test_numeric.py
@@ -1290,14 +1290,13 @@ def test_floating_mod(backend, alltypes, df):
 )
 @pytest.mark.notyet(["mysql", "pyspark"], raises=AssertionError)
 @pytest.mark.notyet(
-    ["duckdb", "sqlite"],
-    raises=AssertionError,
-    reason="returns NULL when dividing by zero",
+    ["sqlite"], raises=AssertionError, reason="returns NULL when dividing by zero"
 )
 @pytest.mark.notyet(["mssql"], raises=PyODBCDataError)
 @pytest.mark.notyet(["snowflake"], raises=SnowflakeProgrammingError)
 @pytest.mark.notyet(["postgres"], raises=PsycoPg2DivisionByZero)
 @pytest.mark.notimpl(["exasol"], raises=ExaQueryError)
+@pytest.mark.xfail_version(duckdb=["duckdb<1.1"])
 def test_divide_by_zero(backend, alltypes, df, column, denominator):
     expr = alltypes[column] / denominator
     result = expr.name("tmp").execute()

diff --git a/ibis/backends/tests/test_temporal.py b/ibis/backends/tests/test_temporal.py
@@ -1900,10 +1900,15 @@ def test_large_timestamp(con):
             id="ns",
             marks=[
                 pytest.mark.notyet(
-                    ["duckdb", "impala", "pyspark", "trino"],
+                    ["impala", "pyspark", "trino"],
                     reason="drivers appear to truncate nanos",
                     raises=AssertionError,
                 ),
+                pytest.mark.xfail_version(
+                    duckdb=["duckdb<1.1"],
+                    reason="not implemented until 1.1",
+                    raises=AssertionError,
+                ),
                 pytest.mark.notimpl(
                     ["druid"],
                     reason="ibis normalization truncates nanos",

diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py
@@ -1630,11 +1630,11 @@ def approx_nunique(self, where: ir.BooleanValue | None = None) -> ir.IntegerScal
         >>> t = ibis.examples.penguins.fetch()
         >>> t.body_mass_g.approx_nunique()
         ┌────┐
-        │ 94 │
+        │ 92 │
         └────┘
         >>> t.body_mass_g.approx_nunique(where=t.species == "Adelie")
         ┌────┐
-        │ 55 │
+        │ 61 │
         └────┘
         """
         return ops.ApproxCountDistinct(

diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py
@@ -3277,11 +3277,11 @@ def join(
         ┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
         │ int64   │ string            │ int64         │ string            │
         ├─────────┼───────────────────┼───────────────┼───────────────────┤
-        │    1732 │ funny             │         60756 │ funny             │
-        │    1732 │ Highly quotable   │         60756 │ Highly quotable   │
-        │    1732 │ drugs             │        106782 │ drugs             │
-        │    5989 │ Leonardo DiCaprio │        106782 │ Leonardo DiCaprio │
-        │  139385 │ tom hardy         │         89774 │ Tom Hardy         │
+        │   60756 │ funny             │          1732 │ funny             │
+        │   60756 │ Highly quotable   │          1732 │ Highly quotable   │
+        │   89774 │ Tom Hardy         │        139385 │ tom hardy         │
+        │  106782 │ drugs             │          1732 │ drugs             │
+        │  106782 │ Leonardo DiCaprio │          5989 │ Leonardo DiCaprio │
         └─────────┴───────────────────┴───────────────┴───────────────────┘
         """
         from ibis.expr.types.joins import Join

diff --git a/ibis/formats/pandas.py b/ibis/formats/pandas.py
@@ -383,6 +383,8 @@ def convert(value):
                 return value
             elif isinstance(value, UUID):
                 return value
+            elif isinstance(value, bytes):
+                return UUID(bytes=value)
             return UUID(value)
 
         return convert