From aef0faca131b4f309c6af1a75602ced7720b24f9 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 15 Apr 2024 15:56:30 +0200 Subject: [PATCH 1/7] Adding ADBC - Substrait Test in the Python layer --- duckdb | 2 +- duckdb-r | 2 +- src/to_substrait.cpp | 4 +- substrait | 2 +- test/python/data/somefile.parquet | Bin 0 -> 292 bytes test/python/test_adbc.py | 167 ++++++++++++++++++++++++++++++ 6 files changed, 172 insertions(+), 5 deletions(-) create mode 100644 test/python/data/somefile.parquet create mode 100644 test/python/test_adbc.py diff --git a/duckdb b/duckdb index 0e78476..eb9f63a 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 0e784765f6f87bd1ce9034afcce1e7f89fcd8777 +Subproject commit eb9f63a1cd506afe1fad8125d6f123868e979b5d diff --git a/duckdb-r b/duckdb-r index a487572..83d1d9b 160000 --- a/duckdb-r +++ b/duckdb-r @@ -1 +1 @@ -Subproject commit a4875722a9f6b4172ff99a0499a8b684a82fa31c +Subproject commit 83d1d9b997f6671ed771a60ae590a5fd90947451 diff --git a/src/to_substrait.cpp b/src/to_substrait.cpp index 1796eab..7b5a1f4 100644 --- a/src/to_substrait.cpp +++ b/src/to_substrait.cpp @@ -780,7 +780,7 @@ substrait::Rel *DuckDBToSubstrait::TransformLimit(LogicalOperator &dop) { idx_t limit_val; idx_t offset_val; - switch(dlimit.limit_val.Type()) { + switch (dlimit.limit_val.Type()) { case LimitNodeType::CONSTANT_VALUE: limit_val = dlimit.limit_val.GetConstantValue(); break; @@ -790,7 +790,7 @@ substrait::Rel *DuckDBToSubstrait::TransformLimit(LogicalOperator &dop) { default: throw InternalException("Unsupported limit value type"); } - switch(dlimit.offset_val.Type()) { + switch (dlimit.offset_val.Type()) { case LimitNodeType::CONSTANT_VALUE: offset_val = dlimit.offset_val.GetConstantValue(); break; diff --git a/substrait b/substrait index d9b9672..1a51b3d 160000 --- a/substrait +++ b/substrait @@ -1 +1 @@ -Subproject commit d9b9672fd3c24285afdee9344fc2f4f7fcd70afb +Subproject commit 1a51b3d49ba8323134314ef94acf49c910ff520d diff --git a/test/python/data/somefile.parquet b/test/python/data/somefile.parquet new file mode 100644 index 0000000000000000000000000000000000000000..37b9d4091de5936e305d3fc747b97874dfbc0e94 GIT binary patch literal 292 zcmZ{gJqyA>42IKCN~Lv7xDyAL9z}}aAUJeUIyw|_mwtgos$lEx=LFF*~;YjG^| zeOnx=CL0(m4uAEaUpaPc|x*=hfGuXpC^#pHrxOJ literal 0 HcmV?d00001 diff --git a/test/python/test_adbc.py b/test/python/test_adbc.py new file mode 100644 index 0000000..290fdab --- /dev/null +++ b/test/python/test_adbc.py @@ -0,0 +1,167 @@ +import duckdb +import pytest +import sys +import datetime +import os +from os.path import abspath, join, dirname, normpath +import glob + + +adbc_driver_manager = pytest.importorskip("adbc_driver_manager.dbapi") +adbc_driver_manager_lib = pytest.importorskip("adbc_driver_manager._lib") +json_format = pytest.importorskip("google.protobuf.json_format") +plan_pb2 = pytest.importorskip("substrait.gen.proto.plan_pb2") +pyarrow = pytest.importorskip("pyarrow") + +# When testing local, if you build via BUILD_PYTHON=1 make, you need to manually set up the +# dylib duckdb path. +driver_path = duckdb.duckdb.__file__ + +def find_substrait(): + # Paths to search for extensions + build = normpath(join(dirname(__file__), "../../duckdb/build/")) + extension = "extension/*/*.duckdb_extension" + + extension_search_patterns = [ + join(build, "release", extension), + join(build, "debug", extension), + ] + + # DUCKDB_PYTHON_TEST_EXTENSION_PATH can be used to add a path for the extension test to search for extensions + if 'DUCKDB_PYTHON_TEST_EXTENSION_PATH' in os.environ: + env_extension_path = os.getenv('DUCKDB_PYTHON_TEST_EXTENSION_PATH') + env_extension_path = env_extension_path.rstrip('/') + extension_search_patterns.append(env_extension_path + '/*/*.duckdb_extension') + extension_search_patterns.append(env_extension_path + '/*.duckdb_extension') + + extension_paths_found = [] + for pattern in extension_search_patterns: + extension_pattern_abs = abspath(pattern) + for path in glob.glob(extension_pattern_abs): + extension_paths_found.append(path) + + for path in extension_paths_found: + if path.endswith("substrait.duckdb_extension"): + return path + pytest.skip(f'could not load substrait') + + return "Fail" + + +@pytest.fixture +def duck_conn(): + with adbc_driver_manager.connect(driver=driver_path, entrypoint="duckdb_adbc_init", db_kwargs={"allow_unsigned_extensions": "true"}) as conn: + yield conn + +# file_path = os.path.dirname(os.path.abspath(__file__)) +# file_path = os.path.join(file_path,data,'somefile.parquet') + +PLAN_PROTOTEXT = '''{ + "relations":[ + { + "root":{ + "input":{ + "project":{ + "input":{ + "read":{ + "baseSchema":{ + "names":[ + "mbid", + "artist_mb" + ], + "struct":{ + "types":[ + { + "string":{ + "nullability":"NULLABILITY_NULLABLE" + } + }, + { + "string":{ + "nullability":"NULLABILITY_NULLABLE" + } + } + ], + "nullability":"NULLABILITY_REQUIRED" + } + }, + "projection":{ + "select":{ + "structItems":[ + { + + }, + { + "field":1 + } + ] + }, + "maintainSingularStruct":true + }, + "localFiles":{ + "items":[ + { + "uriFile":" data/somefile.parquet", + "parquet":{ + + } + } + ] + } + } + }, + "expressions":[ + { + "selection":{ + "directReference":{ + "structField":{ + + } + }, + "rootReference":{ + + } + } + }, + { + "selection":{ + "directReference":{ + "structField":{ + "field":1 + } + }, + "rootReference":{ + + } + } + } + ] + } + }, + "names":[ + "mbid", + "artist_mb" + ] + } + } + ], + "version":{ + "minorNumber":39, + "producer":"DuckDB" + } +}''' + +def test_substrait_over_adbc(duck_conn): + plan = json_format.Parse(PLAN_PROTOTEXT, plan_pb2.Plan()) + cur = duck_conn.cursor() + substrait_path = find_substrait() + cur.execute("LOAD '"+ substrait_path + "'") + + plan_data = plan.SerializeToString() + cur.execute(plan_data) + result_table = cur.fetch_arrow_table() + correct_table = pyarrow.Table.from_pydict({ + 'mbid': pyarrow.array(["1"], type=pyarrow.string()), + 'artist_mb': pyarrow.array(["Tenacious D"], type=pyarrow.string()) + }) + assert result_table.equals(correct_table) From 4a1146480c4296f39993f011f98d6aa7dc3a4117 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 16 Apr 2024 12:12:40 +0200 Subject: [PATCH 2/7] Install subtrait via pip --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 61206d3..7ef5999 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -39,7 +39,7 @@ jobs: - name: Install Python Dependencies shell: bash run: | - pip install pytest pandas "ibis-framework[duckdb]==3.2.0" "ibis-substrait==2.21.1" "substrait-validator==0.0.11" + pip install pytest pandas substrait "ibis-framework[duckdb]==3.2.0" "ibis-substrait==2.21.1" "substrait-validator==0.0.11" pip uninstall protobuf -y pip install --no-binary protobuf protobuf From 7352b143b36438ef16abc23e1554b6363ee8c732 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 16 Apr 2024 12:19:55 +0200 Subject: [PATCH 3/7] just define duckdb_path? --- .github/workflows/r.yml | 2 ++ duckdb-r | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 6dc5b80..6f65768 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -10,6 +10,8 @@ jobs: runs-on: ubuntu-20.04 env: GEN: ninja + DUCKDB_PATH: ./duckdb/ + steps: - uses: r-lib/actions/setup-r@v2 with: diff --git a/duckdb-r b/duckdb-r index 83d1d9b..59f1673 160000 --- a/duckdb-r +++ b/duckdb-r @@ -1 +1 @@ -Subproject commit 83d1d9b997f6671ed771a60ae590a5fd90947451 +Subproject commit 59f1673ba4dcdf327782f6b7dfcf44f7d3ec7666 From 6f1efc24883ce019ad5eb5f7ca33fd40b776505a Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 16 Apr 2024 12:36:12 +0200 Subject: [PATCH 4/7] I think this is with duckdb-r as a reference --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 6f65768..24a06da 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-20.04 env: GEN: ninja - DUCKDB_PATH: ./duckdb/ + DUCKDB_PATH: ../duckdb/ steps: - uses: r-lib/actions/setup-r@v2 From 415c4508eccf87783a716a10bd0e35043b6ea7cc Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 16 Apr 2024 13:12:55 +0200 Subject: [PATCH 5/7] we also need the adbc_driver_manager library --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 7ef5999..719b121 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -39,7 +39,7 @@ jobs: - name: Install Python Dependencies shell: bash run: | - pip install pytest pandas substrait "ibis-framework[duckdb]==3.2.0" "ibis-substrait==2.21.1" "substrait-validator==0.0.11" + pip install pytest pandas substrait adbc_driver_manager "ibis-framework[duckdb]==3.2.0" "ibis-substrait==2.21.1" "substrait-validator==0.0.11" pip uninstall protobuf -y pip install --no-binary protobuf protobuf From 9354b474b9ca8e83eaf05b5ededa3ce5c5ab7155 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 16 Apr 2024 13:58:56 +0200 Subject: [PATCH 6/7] Use script path to figure out file path --- test/python/test_adbc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/python/test_adbc.py b/test/python/test_adbc.py index 290fdab..d35ca5e 100644 --- a/test/python/test_adbc.py +++ b/test/python/test_adbc.py @@ -53,8 +53,8 @@ def duck_conn(): with adbc_driver_manager.connect(driver=driver_path, entrypoint="duckdb_adbc_init", db_kwargs={"allow_unsigned_extensions": "true"}) as conn: yield conn -# file_path = os.path.dirname(os.path.abspath(__file__)) -# file_path = os.path.join(file_path,data,'somefile.parquet') +file_path = os.path.dirname(os.path.abspath(__file__)) +file_path = os.path.join(file_path,'data','somefile.parquet') PLAN_PROTOTEXT = '''{ "relations":[ @@ -101,7 +101,7 @@ def duck_conn(): "localFiles":{ "items":[ { - "uriFile":" data/somefile.parquet", + "uriFile":" ''' + file_path + '''", "parquet":{ } From 5ce4f6ee975eb0c452f7479f071a7de7824a0a17 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 16 Apr 2024 14:33:39 +0200 Subject: [PATCH 7/7] extra sneaky space --- test/python/test_adbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/python/test_adbc.py b/test/python/test_adbc.py index d35ca5e..27ccf46 100644 --- a/test/python/test_adbc.py +++ b/test/python/test_adbc.py @@ -101,7 +101,7 @@ def duck_conn(): "localFiles":{ "items":[ { - "uriFile":" ''' + file_path + '''", + "uriFile":"''' + file_path + '''", "parquet":{ }