Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: add regression case to duckdb integration test #3098

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 54 additions & 1 deletion python/python/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,68 @@ def test_duckdb_filter_on_rowid(tmp_path):
assert actual.to_pydict() == expected.to_pydict()


class DuckDataset(pa.dataset.Dataset):
    """
    Hacky way to wrap a lance dataset to hide extension types.

    The wrapper exposes a ``schema`` whose fields have had their metadata
    removed (Arrow extension types are announced through field metadata such
    as ``ARROW:extension:name``). Every other attribute lookup is forwarded
    to the wrapped dataset.

    Usage:
    >>> scanner = DuckDataset(lance.dataset("my_dataset.lance"))
    >>> duckdb.sql("SELECT uid FROM scanner LIMIT 10;")
    """

    def __init__(self, ds):
        """Wrap *ds* and precompute its metadata-free schema.

        Args:
            ds: The dataset to wrap; must expose an iterable ``schema`` of
                fields that support ``remove_metadata()``.
        """
        # NOTE(review): the base-class initializer is deliberately not called,
        # matching the original; presumably pa.dataset.Dataset is not meant to
        # be constructed directly -- TODO confirm.
        self._ds = ds
        # Iterating a schema yields its fields in order, so no index loop is
        # needed to collect them.
        self.pruned_schema = pa.schema(
            [field.remove_metadata() for field in ds.schema]
        )

    def __getattribute__(self, attr):
        """Return the pruned schema for ``schema``; delegate everything else.

        Args:
            attr: Name of the attribute being looked up on the wrapper.

        Returns:
            The metadata-free schema when ``attr`` is ``"schema"``, otherwise
            the attribute of the same name looked up on the wrapped dataset.
        """
        if attr == "schema":
            # Use the raw lookup so we do not recurse back into this method.
            return object.__getattribute__(self, "pruned_schema")
        # Everything except ``schema`` is answered by the wrapped dataset,
        # including names that also exist on the wrapper itself.
        ds = super().__getattribute__("_ds")
        return object.__getattribute__(ds, attr)


def test_duckdb_pushdown_extension_types(tmp_path):
# large_binary is reported by pyarrow as a substrait extension type. Datafusion
# does not currently handle these extension types. This should be ok as long
# as the filter isn't accessing the column with the extension type.
#
# Lance works around this by removing any columns with extension types from the
# schema it gives to duckdb.
#
# image is an extension type. DuckDB currently rejects anything that's an extension
# type. We can clumsily work around this by pretending it's not an extension type.
tab = pa.table(
{
"filterme": [1, 2, 3],
"largebin": pa.array([b"123", b"456", b"789"], pa.large_binary()),
"othercol": [4, 5, 6],
}
"image": pa.array(
[b"123", b"456", b"789"],
pa.binary(),
),
},
schema=pa.schema(
[
pa.field("filterme", pa.int64()),
pa.field("largebin", pa.large_binary()),
pa.field("othercol", pa.int64()),
pa.field(
"image",
pa.binary(),
metadata={
b"ARROW:extension:metadata": b"",
b"ARROW:extension:name": b"lance.arrow.encoded_image",
},
),
]
),
)
ds = lance.write_dataset(tab, str(tmp_path)) # noqa: F841
ds = DuckDataset(ds)

expected = tab.slice(1, 1)
actual = duckdb.query("SELECT * FROM ds WHERE filterme = 2").fetch_arrow_table()
assert actual.to_pydict() == expected.to_pydict()
Expand All @@ -44,6 +91,10 @@ def test_duckdb_pushdown_extension_types(tmp_path):
actual = duckdb.query("SELECT * FROM ds WHERE othercol = 4").fetch_arrow_table()
assert actual.to_pydict() == expected.to_pydict()

expected = pa.table({"max(image)": [b"789"]})
actual = duckdb.query("SELECT MAX(image) FROM ds").fetch_arrow_table()
assert actual.to_pydict() == expected.to_pydict()

# Not the best error message but hopefully this is short lived until datafusion
# supports substrait extension types.
with pytest.raises(
Expand All @@ -55,6 +106,8 @@ def test_duckdb_pushdown_extension_types(tmp_path):
):
duckdb.query("SELECT * FROM ds WHERE largebin = '456'").fetchall()

# Need to subtract extension types from tab to use as our expected results
tab = pa.table(tab.columns, schema=ds.schema)
# Unclear if all of these result in pushdown or not but they shouldn't error if
# they do.
for filt in [
Expand Down
Loading