From 883d2d3f064a75ae59660ee5027c2adfa2483913 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Mon, 9 Sep 2024 10:27:58 -0400
Subject: [PATCH] fix(duckdb): allow setting `auto_detect` to `False` by
 fixing translation of columns argument (#10065)

---
 ibis/backends/duckdb/__init__.py            | 23 ++++++++++++++++++++-
 ibis/backends/duckdb/tests/test_register.py | 13 ++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py
index fe1216bfda04..45e83b9fb16f 100644
--- a/ibis/backends/duckdb/__init__.py
+++ b/ibis/backends/duckdb/__init__.py
@@ -600,6 +600,7 @@ def read_json(
         self,
         source_list: str | list[str] | tuple[str],
         table_name: str | None = None,
+        columns: Mapping[str, str] | None = None,
         **kwargs,
     ) -> ir.Table:
         """Read newline-delimited JSON into an ibis table.
@@ -614,8 +615,13 @@ def read_json(
             File or list of files
         table_name
             Optional table name
+        columns
+            Optional mapping from string column name to duckdb type string.
         **kwargs
-            Additional keyword arguments passed to DuckDB's `read_json_auto` function
+            Additional keyword arguments passed to DuckDB's `read_json_auto` function.
+
+            See https://duckdb.org/docs/data/json/overview.html#json-loading
+            for parameters and more information about reading JSON.
 
         Returns
         -------
@@ -630,6 +636,21 @@ def read_json(
             sg.to_identifier(key).eq(sge.convert(val)) for key, val in kwargs.items()
         ]
 
+        if columns:
+            options.append(
+                sg.to_identifier("columns").eq(
+                    sge.Struct.from_arg_list(
+                        [
+                            sge.PropertyEQ(
+                                this=sg.to_identifier(key),
+                                expression=sge.convert(value),
+                            )
+                            for key, value in columns.items()
+                        ]
+                    )
+                )
+            )
+
         self._create_temp_view(
             table_name,
             sg.select(STAR).from_(
diff --git a/ibis/backends/duckdb/tests/test_register.py b/ibis/backends/duckdb/tests/test_register.py
index e8d892ca26ef..68ac9cbae9d5 100644
--- a/ibis/backends/duckdb/tests/test_register.py
+++ b/ibis/backends/duckdb/tests/test_register.py
@@ -505,3 +505,16 @@ def test_memtable_null_column_parquet_dtype_roundtrip(con, tmp_path):
     after = con.read_parquet(tmp_path / "tmp.parquet")
 
     assert before.a.type() == after.a.type()
+
+
+def test_read_json_no_auto_detection(con, tmp_path):
+    ndjson_data = """
+    {"year": 2007}
+    {"year": 2008}
+    {"year": 2009}
+    """
+    path = tmp_path.joinpath("test.ndjson")
+    path.write_text(ndjson_data)
+
+    t = con.read_json(path, auto_detect=False, columns={"year": "varchar"})
+    assert t.year.type() == dt.string
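
A minimal usage sketch of the `columns` parameter introduced by this patch, mirroring the
new test above. The connection and the local file path are illustrative assumptions, not
part of the patch itself.

    import ibis

    # Illustrative in-memory DuckDB connection.
    con = ibis.duckdb.connect()

    # Newline-delimited JSON matching the test data above (hypothetical file name).
    with open("example.ndjson", "w") as f:
        f.write('{"year": 2007}\n{"year": 2008}\n{"year": 2009}\n')

    # With auto_detect=False, DuckDB does not infer the schema, so the columns
    # mapping (column name -> DuckDB type string) supplies it explicitly.
    t = con.read_json(
        "example.ndjson", auto_detect=False, columns={"year": "varchar"}
    )

    print(t.year.type())  # string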