From d5332940db0966ba8c10ebca416a23c4c1cc3192 Mon Sep 17 00:00:00 2001 From: tim_stephenson <38039215+tim-stephenson@users.noreply.github.com> Date: Sat, 24 Aug 2024 14:49:08 -0400 Subject: [PATCH 1/6] initial commit, everything seems to be working, may still want to add more tests --- py-polars/polars/expr/list.py | 36 +++++++++++++++++++++++++++++++++ py-polars/polars/series/list.py | 19 +++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 5655b58c86cb..d91152104f8b 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -1358,3 +1358,39 @@ def set_symmetric_difference(self, other: IntoExpr) -> Expr: """ # noqa: W505. other = parse_into_expression(other, str_as_lit=False) return wrap_expr(self._pyexpr.list_set_operation(other, "symmetric_difference")) + + def json_encode(self) -> Expr: + r""" + Convert this list to a string column with json values. + + Examples + -------- + >>> pl.DataFrame( + ... {"a": [[1, 2], [45], [9, 1, 3], None]} + ... ).with_columns(pl.col("a").list.json_encode().alias("encoded")) + shape: (4, 2) + ┌───────────┬───────────┐ + │ a ┆ encoded │ + │ --- ┆ --- │ + │ list[i64] ┆ str │ + ╞═══════════╪═══════════╡ + │ [1, 2] ┆ [1, 2] │ + │ [45] ┆ [45] │ + │ [9, 1, 3] ┆ [9, 1, 3] │ + │ null ┆ null │ + └───────────┴───────────┘ + + >>> pl.DataFrame( + ... {"a": [["\\", "\\foo"], ["\a\"'", "{\" bar}"]]} + ... ).with_columns(pl.col("a").list.json_encode().alias("encoded")) + shape: (2, 2) + ┌────────────────────┬───────────────────────────┐ + │ a ┆ encoded │ + │ --- ┆ --- │ + │ list[str] ┆ str │ + ╞════════════════════╪═══════════════════════════╡ + │ ["\", "\foo"] ┆ ['\', '\bar'] │ + │ [""'", "{" bar}"] ┆ ["\u0007\"'", "{\" bar}"] │ + └────────────────────┴───────────────────────────┘ + """ + return wrap_expr(self._pyexpr.list_json_encode()) diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 53bcb22ce6f3..09a60daffcd0 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -1052,3 +1052,22 @@ def set_symmetric_difference(self, other: Series) -> Series: [5, 7, 8] ] """ # noqa: W505 + + def json_encode(self) -> Series: + """ + Convert this list Series into a string Series with json values. + + Examples + -------- + >>> a = pl.Series([[1, 2, 3], [], [None, 3], [5, 6, 7]]) + >>> a.list.json_encode(b) + shape: (4,) + Series: '' [String] + [ + "[1, 2, 3]" + "[]" + "[null, 3]" + "[5, 6, 7]" + ] + """ + From f15a8e83ab895f5a01b379260baf98a5f9970f87 Mon Sep 17 00:00:00 2001 From: tim_stephenson <38039215+tim-stephenson@users.noreply.github.com> Date: Sat, 24 Aug 2024 14:53:21 -0400 Subject: [PATCH 2/6] changes which did not make it into the first commit --- .../polars-plan/src/dsl/function_expr/list.rs | 23 +++++++++++++++++++ crates/polars-plan/src/dsl/list.rs | 7 ++++++ crates/polars-python/src/expr/list.rs | 5 ++++ 3 files changed, 35 insertions(+) diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs b/crates/polars-plan/src/dsl/function_expr/list.rs index e68b080d17f1..f4cb1d8366ef 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -56,6 +56,8 @@ pub enum ListFunction { Join(bool), #[cfg(feature = "dtype-array")] ToArray(usize), + #[cfg(feature = "json")] + JsonEncode, } impl ListFunction { @@ -103,6 +105,8 @@ impl ListFunction { #[cfg(feature = "dtype-array")] ToArray(width) => mapper.try_map_dtype(|dt| map_list_dtype_to_array_dtype(dt, *width)), NUnique => mapper.with_dtype(IDX_DTYPE), + #[cfg(feature = "json")] + JsonEncode => mapper.with_dtype(DataType::String), } } } @@ -174,6 +178,8 @@ impl Display for ListFunction { Join(_) => "join", #[cfg(feature = "dtype-array")] ToArray(_) => "to_array", + #[cfg(feature = "json")] + JsonEncode => "to_json", }; write!(f, "list.{name}") } @@ -235,6 +241,8 @@ impl From for SpecialEq> { #[cfg(feature = "dtype-array")] ToArray(width) => map!(to_array, width), NUnique => map!(n_unique), + #[cfg(feature = "json")] + JsonEncode => map!(to_json), } } } @@ -641,3 +649,18 @@ pub(super) fn to_array(s: &Series, width: usize) -> PolarsResult { pub(super) fn n_unique(s: &Series) -> PolarsResult { Ok(s.list()?.lst_n_unique()?.into_series()) } + +#[cfg(feature = "json")] +pub(super) fn to_json(s: &Series) -> PolarsResult { + let ca = s.list()?; + + + let dtype = ca.dtype().to_arrow(CompatLevel::newest()); + + let iter = ca.chunks().iter().map(|arr| { + let arr = arrow::compute::cast::cast_unchecked(arr.as_ref(), &dtype).unwrap(); + polars_json::json::write::serialize_to_utf8(arr.as_ref()) + }); + + Ok(StringChunked::from_chunk_iter(ca.name(), iter).into_series()) +} \ No newline at end of file diff --git a/crates/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs index 3762e0102432..e61ab9aef7fa 100644 --- a/crates/polars-plan/src/dsl/list.rs +++ b/crates/polars-plan/src/dsl/list.rs @@ -406,4 +406,11 @@ impl ListNameSpace { let other = other.into(); self.set_operation(other, SetOperation::SymmetricDifference) } + + #[cfg(feature = "json")] + pub fn json_encode(self) -> Expr { + self.0 + .map_private(FunctionExpr::ListExpr(ListFunction::JsonEncode)) + } + } diff --git a/crates/polars-python/src/expr/list.rs b/crates/polars-python/src/expr/list.rs index 9ab917918b83..29a6e25116b3 100644 --- a/crates/polars-python/src/expr/list.rs +++ b/crates/polars-python/src/expr/list.rs @@ -253,4 +253,9 @@ impl PyExpr { } .into() } + + #[cfg(feature = "json")] + fn list_json_encode(&self) -> Self { + self.inner.clone().list().json_encode().into() + } } From 014c9315e422d8adbec2d68b894a586a176cd308 Mon Sep 17 00:00:00 2001 From: tim_stephenson <38039215+tim-stephenson@users.noreply.github.com> Date: Sat, 24 Aug 2024 15:11:25 -0400 Subject: [PATCH 3/6] ran pre commit --- crates/polars-plan/src/dsl/function_expr/list.rs | 3 +-- crates/polars-plan/src/dsl/list.rs | 1 - py-polars/polars/expr/list.py | 12 ++++++------ py-polars/polars/series/list.py | 1 - 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs b/crates/polars-plan/src/dsl/function_expr/list.rs index f4cb1d8366ef..7027a193a869 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -654,7 +654,6 @@ pub(super) fn n_unique(s: &Series) -> PolarsResult { pub(super) fn to_json(s: &Series) -> PolarsResult { let ca = s.list()?; - let dtype = ca.dtype().to_arrow(CompatLevel::newest()); let iter = ca.chunks().iter().map(|arr| { @@ -663,4 +662,4 @@ pub(super) fn to_json(s: &Series) -> PolarsResult { }); Ok(StringChunked::from_chunk_iter(ca.name(), iter).into_series()) -} \ No newline at end of file +} diff --git a/crates/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs index e61ab9aef7fa..b9af2ca9831e 100644 --- a/crates/polars-plan/src/dsl/list.rs +++ b/crates/polars-plan/src/dsl/list.rs @@ -412,5 +412,4 @@ impl ListNameSpace { self.0 .map_private(FunctionExpr::ListExpr(ListFunction::JsonEncode)) } - } diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index d91152104f8b..f6c7e59749eb 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -1365,9 +1365,9 @@ def json_encode(self) -> Expr: Examples -------- - >>> pl.DataFrame( - ... {"a": [[1, 2], [45], [9, 1, 3], None]} - ... ).with_columns(pl.col("a").list.json_encode().alias("encoded")) + >>> pl.DataFrame({"a": [[1, 2], [45], [9, 1, 3], None]}).with_columns( + ... pl.col("a").list.json_encode().alias("encoded") + ... ) shape: (4, 2) ┌───────────┬───────────┐ │ a ┆ encoded │ @@ -1380,9 +1380,9 @@ def json_encode(self) -> Expr: │ null ┆ null │ └───────────┴───────────┘ - >>> pl.DataFrame( - ... {"a": [["\\", "\\foo"], ["\a\"'", "{\" bar}"]]} - ... ).with_columns(pl.col("a").list.json_encode().alias("encoded")) + >>> pl.DataFrame({"a": [["\\", "\\foo"], ["\a\"'", '{" bar}']]}).with_columns( + ... pl.col("a").list.json_encode().alias("encoded") + ... ) shape: (2, 2) ┌────────────────────┬───────────────────────────┐ │ a ┆ encoded │ diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 09a60daffcd0..7cc82301b8fc 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -1070,4 +1070,3 @@ def json_encode(self) -> Series: "[5, 6, 7]" ] """ - From 92d7ccdd89d0c35efc253e15ff8ef3126d1f2dce Mon Sep 17 00:00:00 2001 From: tim_stephenson <38039215+tim-stephenson@users.noreply.github.com> Date: Sat, 24 Aug 2024 23:17:02 -0400 Subject: [PATCH 4/6] fixed doctest --- py-polars/polars/series/list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 7cc82301b8fc..792d2fab5b3f 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -1060,7 +1060,7 @@ def json_encode(self) -> Series: Examples -------- >>> a = pl.Series([[1, 2, 3], [], [None, 3], [5, 6, 7]]) - >>> a.list.json_encode(b) + >>> a.list.json_encode() shape: (4,) Series: '' [String] [ From c7ca0ca5b80acd556065c1461f2bc57447849a54 Mon Sep 17 00:00:00 2001 From: tim_stephenson <38039215+tim-stephenson@users.noreply.github.com> Date: Sat, 24 Aug 2024 23:39:52 -0400 Subject: [PATCH 5/6] added test of list.json_encode() functionality --- .../operations/namespaces/list/test_list.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/py-polars/tests/unit/operations/namespaces/list/test_list.py b/py-polars/tests/unit/operations/namespaces/list/test_list.py index 4e9bea71b792..d6d20e5331c5 100644 --- a/py-polars/tests/unit/operations/namespaces/list/test_list.py +++ b/py-polars/tests/unit/operations/namespaces/list/test_list.py @@ -1,5 +1,6 @@ from __future__ import annotations +import math from datetime import date, datetime import numpy as np @@ -917,3 +918,64 @@ def test_list_eval_type_cast_11188() -> None: assert df.select( pl.col("a").list.eval(pl.element().cast(pl.String)).alias("a_str") ).schema == {"a_str": pl.List(pl.String)} + + +def test_list_json_encode() -> None: + df = pl.DataFrame( + { + "a": [[1, None, 3], [4, 5, 6], None], + "b": [ + [ + { + "foo": 1, + }, + {"foo": 2}, + { + "foo": 3, + }, + ], + [ + { + "foo": 3, + }, + {"foo": 4}, + { + "foo": 5, + }, + ], + [ + { + "foo": 6, + }, + {"foo": 7}, + { + "foo": 8, + }, + ], + ], + "c": [[True, False, True], [False, True, False], [True, True, False]], + "d": [[1.6, 2.7, 3.8], [math.inf, -math.inf, 6.1], [7.2, 8.3, math.nan]], + } + ) + df = df.with_columns( + pl.col("a").list.json_encode(), + pl.col("b").list.json_encode(), + pl.col("c").list.json_encode(), + pl.col("d").list.json_encode(), + ) + assert df.schema == { + "a": pl.String, + "b": pl.String, + "c": pl.String, + "d": pl.String, + } + assert df.to_dict(as_series=False) == { + "a": ["[1,null,3]", "[4,5,6]", "null"], + "b": [ + """[{"foo":1},{"foo":2},{"foo":3}]""", + """[{"foo":3},{"foo":4},{"foo":5}]""", + """[{"foo":6},{"foo":7},{"foo":8}]""", + ], + "c": ["[true,false,true]", "[false,true,false]", "[true,true,false]"], + "d": ["[1.6,2.7,3.8]", "[null,null,6.1]", "[7.2,8.3,null]"], + } From 6ddb616587c2113a889095af2afc85b7369add81 Mon Sep 17 00:00:00 2001 From: tim_stephenson <38039215+tim-stephenson@users.noreply.github.com> Date: Sun, 25 Aug 2024 00:12:27 -0400 Subject: [PATCH 6/6] fixed doctests --- py-polars/polars/expr/list.py | 39 +++++++++++++++++---------------- py-polars/polars/series/list.py | 8 +++---- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index f6c7e59749eb..b709f1badece 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -1369,28 +1369,29 @@ def json_encode(self) -> Expr: ... pl.col("a").list.json_encode().alias("encoded") ... ) shape: (4, 2) - ┌───────────┬───────────┐ - │ a ┆ encoded │ - │ --- ┆ --- │ - │ list[i64] ┆ str │ - ╞═══════════╪═══════════╡ - │ [1, 2] ┆ [1, 2] │ - │ [45] ┆ [45] │ - │ [9, 1, 3] ┆ [9, 1, 3] │ - │ null ┆ null │ - └───────────┴───────────┘ + ┌───────────┬─────────┐ + │ a ┆ encoded │ + │ --- ┆ --- │ + │ list[i64] ┆ str │ + ╞═══════════╪═════════╡ + │ [1, 2] ┆ [1,2] │ + │ [45] ┆ [45] │ + │ [9, 1, 3] ┆ [9,1,3] │ + │ null ┆ null │ + └───────────┴─────────┘ - >>> pl.DataFrame({"a": [["\\", "\\foo"], ["\a\"'", '{" bar}']]}).with_columns( + >>> pl.DataFrame({"a": [["\\", '"foo"'], [None, ""]]}).with_columns( ... pl.col("a").list.json_encode().alias("encoded") ... ) shape: (2, 2) - ┌────────────────────┬───────────────────────────┐ - │ a ┆ encoded │ - │ --- ┆ --- │ - │ list[str] ┆ str │ - ╞════════════════════╪═══════════════════════════╡ - │ ["\", "\foo"] ┆ ['\', '\bar'] │ - │ [""'", "{" bar}"] ┆ ["\u0007\"'", "{\" bar}"] │ - └────────────────────┴───────────────────────────┘ + ┌────────────────┬──────────────────┐ + │ a ┆ encoded │ + │ --- ┆ --- │ + │ list[str] ┆ str │ + ╞════════════════╪══════════════════╡ + │ ["\", ""foo""] ┆ ["\\","\"foo\""] │ + │ [null, ""] ┆ [null,""] │ + └────────────────┴──────────────────┘ + """ return wrap_expr(self._pyexpr.list_json_encode()) diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 792d2fab5b3f..5272c567e42b 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -1062,11 +1062,11 @@ def json_encode(self) -> Series: >>> a = pl.Series([[1, 2, 3], [], [None, 3], [5, 6, 7]]) >>> a.list.json_encode() shape: (4,) - Series: '' [String] + Series: '' [str] [ - "[1, 2, 3]" + "[1,2,3]" "[]" - "[null, 3]" - "[5, 6, 7]" + "[null,3]" + "[5,6,7]" ] """