From 791c336ff0b8ee271559d82143d97e76ed014834 Mon Sep 17 00:00:00 2001 From: barak1412 Date: Tue, 22 Oct 2024 20:27:49 +0300 Subject: [PATCH] feat: Added `escape_regex` operation to the `str` namespace and as a global function (#19257) --- Cargo.lock | 1 + Cargo.toml | 1 + crates/polars-ops/Cargo.toml | 1 + .../src/chunked_array/strings/escape_regex.rs | 21 +++++++++++++++ .../src/chunked_array/strings/mod.rs | 5 +++- .../src/chunked_array/strings/namespace.rs | 6 +++++ .../src/dsl/function_expr/strings.rs | 14 ++++++++++ crates/polars-plan/src/dsl/string.rs | 10 +++++++ crates/polars-python/src/expr/string.rs | 5 ++++ crates/polars-python/src/functions/mod.rs | 2 ++ crates/polars-python/src/functions/strings.rs | 7 +++++ .../src/lazyframe/visitor/expr_nodes.rs | 4 +++ .../source/reference/expressions/string.rst | 1 + py-polars/docs/source/reference/functions.rst | 1 + py-polars/polars/__init__.py | 2 ++ py-polars/polars/expr/string.py | 22 +++++++++++++++ py-polars/polars/functions/__init__.py | 3 +++ py-polars/polars/functions/escape_regex.py | 27 +++++++++++++++++++ py-polars/src/lib.rs | 4 +++ .../tests/unit/functions/test_functions.py | 19 +++++++++++++ .../namespaces/string/test_string.py | 13 +++++++++ 21 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 crates/polars-ops/src/chunked_array/strings/escape_regex.rs create mode 100644 crates/polars-python/src/functions/strings.rs create mode 100644 py-polars/polars/functions/escape_regex.py diff --git a/Cargo.lock b/Cargo.lock index f1b9bd7da534..88e3aca9fe4b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3046,6 +3046,7 @@ dependencies = [ "rand_distr", "rayon", "regex", + "regex-syntax 0.8.5", "serde", "serde_json", "unicode-reverse", diff --git a/Cargo.toml b/Cargo.toml index fe7ec311cc7c..473ab2e49b0f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,6 +73,7 @@ raw-cpuid = "11" rayon = "1.9" recursive = "0.1" regex = "1.9" +regex-syntax = "0.8.5" reqwest = { version = "0.12", default-features = 
false } ryu = "1.0.13" serde = { version = "1.0.188", features = ["derive", "rc"] } diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 027d846b485e..63e4491b25d8 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -34,6 +34,7 @@ rand = { workspace = true, optional = true, features = ["small_rng", "std"] } rand_distr = { workspace = true, optional = true } rayon = { workspace = true } regex = { workspace = true } +regex-syntax = { workspace = true } serde = { workspace = true, optional = true } serde_json = { workspace = true, optional = true } unicode-reverse = { workspace = true, optional = true } diff --git a/crates/polars-ops/src/chunked_array/strings/escape_regex.rs b/crates/polars-ops/src/chunked_array/strings/escape_regex.rs new file mode 100644 index 000000000000..1edb9146e9f4 --- /dev/null +++ b/crates/polars-ops/src/chunked_array/strings/escape_regex.rs @@ -0,0 +1,26 @@ +use polars_core::prelude::{StringChunked, StringChunkedBuilder}; + +/// Escapes every regular expression meta character in `s` and returns the result as a new `String`. +#[inline] +pub fn escape_regex_str(s: &str) -> String { + regex_syntax::escape(s) +} + +/// Escapes the regular expression meta characters of every non-null value in `ca`. +/// Null entries stay null; the output keeps the name and length of `ca`. +pub fn escape_regex(ca: &StringChunked) -> StringChunked { + let mut out = StringChunkedBuilder::new(ca.name().clone(), ca.len()); + // A single scratch buffer is cleared and refilled for each value. + let mut scratch = String::new(); + for value in ca.iter() { + match value { + Some(raw) => { + scratch.clear(); + regex_syntax::escape_into(raw, &mut scratch); + out.append_value(&scratch); + }, + None => out.append_null(), + } + } + out.finish() +} diff --git a/crates/polars-ops/src/chunked_array/strings/mod.rs b/crates/polars-ops/src/chunked_array/strings/mod.rs index b9149983307b..326349c36815 100644 --- a/crates/polars-ops/src/chunked_array/strings/mod.rs +++ b/crates/polars-ops/src/chunked_array/strings/mod.rs @@ -3,6 +3,8 @@ mod case; #[cfg(feature = "strings")] mod concat; #[cfg(feature = "strings")] +mod escape_regex; +#[cfg(feature = "strings")] mod extract; #[cfg(feature = "find_many")] mod find_many; @@ -20,12 +22,13 @@ mod split; mod 
strip; #[cfg(feature = "strings")] mod substring; - #[cfg(all(not(feature = "nightly"), feature = "strings"))] mod unicode_internals; #[cfg(feature = "strings")] pub use concat::*; +#[cfg(feature = "strings")] +pub use escape_regex::*; #[cfg(feature = "find_many")] pub use find_many::*; #[cfg(feature = "extract_jsonpath")] diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 812dfbfcba91..93574e5f3080 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -640,6 +640,11 @@ pub trait StringNameSpaceImpl: AsString { substring::tail(ca, n.i64()?) } + /// Returns a new array in which every regular expression meta character of each value is escaped. + #[cfg(feature = "strings")] + fn str_escape_regex(&self) -> StringChunked { + escape_regex::escape_regex(self.as_string()) + } } impl StringNameSpaceImpl for StringChunked {} diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 66bd3c5c6e73..12e8c6c6e53e 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -130,6 +130,8 @@ pub enum StringFunction { ascii_case_insensitive: bool, overlapping: bool, }, + #[cfg(feature = "regex")] + EscapeRegex, } impl StringFunction { @@ -197,6 +199,8 @@ impl StringFunction { ReplaceMany { .. } => mapper.with_same_dtype(), #[cfg(feature = "find_many")] ExtractMany { .. } => mapper.with_dtype(DataType::List(Box::new(DataType::String))), + #[cfg(feature = "regex")] + EscapeRegex => mapper.with_same_dtype(), } } } @@ -285,6 +289,8 @@ impl Display for StringFunction { ReplaceMany { .. } => "replace_many", #[cfg(feature = "find_many")] ExtractMany { .. 
} => "extract_many", + #[cfg(feature = "regex")] + EscapeRegex => "escape_regex", }; write!(f, "str.{s}") } @@ -400,6 +406,8 @@ impl From for SpecialEq> { } => { map_as_slice!(extract_many, ascii_case_insensitive, overlapping) }, + #[cfg(feature = "regex")] + EscapeRegex => map!(escape_regex), } } } @@ -1033,3 +1041,10 @@ pub(super) fn json_path_match(s: &[Column]) -> PolarsResult { let pat = s[1].str()?; Ok(ca.json_path_match(pat)?.into_column()) } + +/// Applies `str_escape_regex` to the string values of `s` and returns the result as a `Column`. +#[cfg(feature = "regex")] +pub(super) fn escape_regex(s: &Column) -> PolarsResult<Column> { + let ca = s.str()?; + Ok(ca.str_escape_regex().into_column()) +} diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index d392d403d1b6..efa34f59c04c 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -592,4 +592,16 @@ impl StringNameSpace { None, ) } + + /// Escape all regular expression meta characters in the string. + // Gated on `regex`, the same feature that gates `StringFunction::EscapeRegex`. + #[cfg(feature = "regex")] + pub fn escape_regex(self) -> Expr { + self.0.map_many_private( + FunctionExpr::StringExpr(StringFunction::EscapeRegex), + &[], + false, + None, + ) + } } diff --git a/crates/polars-python/src/expr/string.rs b/crates/polars-python/src/expr/string.rs index 6f0836ad8d13..87521a2b7aa1 100644 --- a/crates/polars-python/src/expr/string.rs +++ b/crates/polars-python/src/expr/string.rs @@ -339,4 +339,9 @@ impl PyExpr { .extract_many(patterns.inner, ascii_case_insensitive, overlapping) .into() } + + #[cfg(feature = "regex")] + fn str_escape_regex(&self) -> Self { + self.inner.clone().str().escape_regex().into() + } } diff --git a/crates/polars-python/src/functions/mod.rs b/crates/polars-python/src/functions/mod.rs index 0bb5e55ea23c..ddf58c7acde6 100644 --- a/crates/polars-python/src/functions/mod.rs +++ b/crates/polars-python/src/functions/mod.rs @@ -8,6 +8,7 @@ mod misc; mod random; mod range; mod string_cache; +mod strings; mod whenthen; pub use aggregation::*; @@ -20,4 +21,5 @@ pub use misc::*; pub use random::*; pub use range::*; pub use string_cache::*; +pub use strings::*; 
pub use whenthen::*; diff --git a/crates/polars-python/src/functions/strings.rs b/crates/polars-python/src/functions/strings.rs new file mode 100644 index 000000000000..d75666ecf367 --- /dev/null +++ b/crates/polars-python/src/functions/strings.rs @@ -0,0 +1,8 @@ +use pyo3::prelude::*; + +/// Escapes all regular expression meta characters in `s` and returns the escaped string. +#[pyfunction] +pub fn escape_regex(s: &str) -> PyResult<String> { + let escaped_s = polars_ops::chunked_array::strings::escape_regex_str(s); + Ok(escaped_s) +} diff --git a/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs b/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs index 07d2f872437c..32e99edb3e69 100644 --- a/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs @@ -174,6 +174,7 @@ pub enum PyStringFunction { ZFill, ContainsMany, ReplaceMany, + EscapeRegex, } #[pymethods] @@ -953,6 +954,9 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { StringFunction::ExtractMany { .. } => { return Err(PyNotImplementedError::new_err("extract_many")) }, + StringFunction::EscapeRegex => { + (PyStringFunction::EscapeRegex.into_py(py),).to_object(py) + }, }, FunctionExpr::StructExpr(_) => { return Err(PyNotImplementedError::new_err("struct expr")) diff --git a/py-polars/docs/source/reference/expressions/string.rst b/py-polars/docs/source/reference/expressions/string.rst index a0cde717f0da..7c1358b480f6 100644 --- a/py-polars/docs/source/reference/expressions/string.rst +++ b/py-polars/docs/source/reference/expressions/string.rst @@ -16,6 +16,7 @@ The following methods are available under the `expr.str` attribute. 
Expr.str.decode Expr.str.encode Expr.str.ends_with + Expr.str.escape_regex Expr.str.explode Expr.str.extract Expr.str.extract_all diff --git a/py-polars/docs/source/reference/functions.rst b/py-polars/docs/source/reference/functions.rst index c672aaa77eac..33ee296844db 100644 --- a/py-polars/docs/source/reference/functions.rst +++ b/py-polars/docs/source/reference/functions.rst @@ -25,6 +25,7 @@ Miscellaneous align_frames concat + escape_regex Parallelization ~~~~~~~~~~~~~~~ diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 063f84c91126..83ea52acc822 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -104,6 +104,7 @@ datetime_ranges, duration, element, + escape_regex, exclude, field, first, @@ -303,6 +304,7 @@ "time_range", "time_ranges", "zeros", + "escape_regex", # polars.functions.aggregation "all", "all_horizontal", diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 7582758d5921..e94f995ee700 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -2781,6 +2781,28 @@ def concat( delimiter = "-" return self.join(delimiter, ignore_nulls=ignore_nulls) + def escape_regex(self) -> Expr: + r""" + Returns string values with all regular expression meta characters escaped. 
+ + Examples + -------- + >>> df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]}) + >>> df.with_columns(pl.col("text").str.escape_regex().alias("escaped")) + shape: (4, 2) + ┌──────────┬──────────────┐ + │ text ┆ escaped │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════════╪══════════════╡ + │ abc ┆ abc │ + │ def ┆ def │ + │ null ┆ null │ + │ abc(\w+) ┆ abc\(\\w\+\) │ + └──────────┴──────────────┘ + """ + return wrap_expr(self._pyexpr.str_escape_regex()) + def _validate_format_argument(format: str | None) -> None: if format is not None and ".%f" in format: diff --git a/py-polars/polars/functions/__init__.py b/py-polars/polars/functions/__init__.py index fedd0ac2bff0..32fbe4578059 100644 --- a/py-polars/polars/functions/__init__.py +++ b/py-polars/polars/functions/__init__.py @@ -26,6 +26,7 @@ from polars.functions.business import business_day_count from polars.functions.col import col from polars.functions.eager import align_frames, concat +from polars.functions.escape_regex import escape_regex from polars.functions.lazy import ( approx_n_unique, arctan2, @@ -170,4 +171,6 @@ # polars.functions.whenthen "when", "sql_expr", + # polars.functions.escape_regex + "escape_regex", ] diff --git a/py-polars/polars/functions/escape_regex.py b/py-polars/polars/functions/escape_regex.py new file mode 100644 index 000000000000..1c038347e8af --- /dev/null +++ b/py-polars/polars/functions/escape_regex.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import contextlib + +with contextlib.suppress(ImportError): # Module not available when building docs + import polars.polars as plr +import polars._reexport as pl + + +def escape_regex(s: str) -> str: + r""" + Escape all regular expression meta characters in a string. + + Parameters + ---------- + s + The string whose regular expression meta characters will be escaped. 
+ + """ + if isinstance(s, pl.Expr): + msg = "escape_regex function is unsupported for `Expr`, you may want use `Expr.str.escape_regex` instead" + raise TypeError(msg) + elif not isinstance(s, str): + msg = f"escape_regex function supports only `str` type, got `{type(s)}`" + raise TypeError(msg) + + return plr.escape_regex(s) diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs index c4cbab12f056..859609828d19 100644 --- a/py-polars/src/lib.rs +++ b/py-polars/src/lib.rs @@ -275,6 +275,10 @@ fn polars(py: Python, m: &Bound) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(functions::set_random_seed)) .unwrap(); + // Functions - escape_regex + m.add_wrapped(wrap_pyfunction!(functions::escape_regex)) + .unwrap(); + // Exceptions - Errors m.add( "PolarsError", diff --git a/py-polars/tests/unit/functions/test_functions.py b/py-polars/tests/unit/functions/test_functions.py index de7e49574393..05bd11976fd8 100644 --- a/py-polars/tests/unit/functions/test_functions.py +++ b/py-polars/tests/unit/functions/test_functions.py @@ -538,3 +538,25 @@ def test_head_tail(fruits_cars: pl.DataFrame) -> None: res_expr = fruits_cars.select(pl.tail("A", 2)) expected = pl.Series("A", [4, 5]) assert_series_equal(res_expr.to_series(), expected) + + +def test_escape_regex() -> None: + # A plain `str` argument is escaped and returned directly. + result = pl.escape_regex("abc(\\w+)") + expected = "abc\\(\\\\w\\+\\)" + assert result == expected + + # An `Expr` argument is rejected with a pointer to `Expr.str.escape_regex`. + df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]}) + with pytest.raises( + TypeError, + match="escape_regex function is unsupported for `Expr`, you may want use `Expr.str.escape_regex` instead", + ): + df.with_columns(escaped=pl.escape_regex(pl.col("text"))) # type: ignore[arg-type] + + # Any other non-`str` argument is rejected and the message names the received type. + with pytest.raises( + TypeError, + match="escape_regex function supports only `str` type, got `<class 'int'>`", + ): + pl.escape_regex(3) # type: ignore[arg-type] diff --git a/py-polars/tests/unit/operations/namespaces/string/test_string.py b/py-polars/tests/unit/operations/namespaces/string/test_string.py index 
ab44b4e9603a..3b2637a0f334 100644 --- a/py-polars/tests/unit/operations/namespaces/string/test_string.py +++ b/py-polars/tests/unit/operations/namespaces/string/test_string.py @@ -1793,3 +1793,15 @@ def test_json_decode_struct_schema() -> None: ), pl.Series([{"a": 1}, {"a": 2}]), ) + + +def test_escape_regex() -> None: + # Inputs cover plain text, a null, and a value containing regex meta characters. + texts = ["abc", "def", None, "abc(\\w+)"] + escaped = ["abc", "def", None, "abc\\(\\\\w\\+\\)"] + + actual = pl.DataFrame({"text": texts}).with_columns( + pl.col("text").str.escape_regex().alias("escaped") + ) + + assert_frame_equal(actual, pl.DataFrame({"text": texts, "escaped": escaped}))