diff --git a/R/expr__string.R b/R/expr__string.R index 57569963f..87c0c4025 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -461,7 +461,7 @@ ExprStr_pad_start = function(width, fillchar = " ") { #' ) ExprStr_contains = function(pattern, ..., literal = FALSE, strict = TRUE) { .pr$Expr$str_contains(self, pattern, literal, strict) |> - unwrap("in str$contains():") + unwrap("in $str$contains():") } @@ -655,27 +655,18 @@ ExprStr_extract_all = function(pattern) { #' Count all successive non-overlapping regex matches #' -#' @keywords ExprStr -#' @param pattern A valid regex pattern -#' @param literal Treat pattern as a literal string. -#' -#' @return -#' UInt32 array. Contains null if original value is null or regex capture nothing. -#' +#' @inheritParams ExprStr_contains +#' @return Expression of data type `UInt32`. +#' Returns `null` if the original value is `null`. #' @examples -#' df = pl$DataFrame(foo = c("123 bla 45 asd", "xyz 678 910t")) -#' df$select( -#' pl$col("foo")$str$count_matches(r"{(\d)}")$alias("count digits") -#' ) +#' df = pl$DataFrame(foo = c("12 dbc 3xy", "cat\\w", "1zy3\\d\\d", NA)) #' -#' # we can use Polars expressions as pattern so that it's not necessarily the -#' # same for all rows -#' df2 = pl$DataFrame(foo = c("hello", "hi there"), pat = c("ell", "e")) -#' df2$with_columns( -#' pl$col("foo")$str$count_matches(pl$col("pat"))$alias("reg_count") +#' df$with_columns( +#' count_digits = pl$col("foo")$str$count_matches(r"(\d)"), +#' count_slash_d = pl$col("foo")$str$count_matches(r"(\d)", literal = TRUE) #' ) -ExprStr_count_matches = function(pattern, literal = FALSE) { - .pr$Expr$str_count_matches(self, wrap_e(pattern), literal) |> +ExprStr_count_matches = function(pattern, ..., literal = FALSE) { + .pr$Expr$str_count_matches(self, pattern, literal) |> unwrap("in $str$count_matches():") } @@ -757,46 +748,80 @@ ExprStr_splitn = function(by, n) { #' Replace first matching regex/literal substring with a new string value #' -#' @keywords 
ExprStr -#' @param pattern Regex pattern, can be an Expr. -#' @param value Replacement, can be an Expr. -#' @param literal Treat pattern as a literal string. -#' -#' @return Expr of String Series +#' @inherit ExprStr_contains details +#' @section Capture groups: +#' The dollar sign (`$`) is a special character related to capture groups. +#' To refer to a literal dollar sign, use `$$` instead or set `literal` to `TRUE`. +#' @inheritParams ExprStr_contains +#' @param value A character or an [Expr][Expr_class] of string +#' that will replace the matched substring. +#' @param n A number of matches to replace. +#' @return [Expr][Expr_class] of String type +#' @seealso +#' - [`$str$replace_all()`][ExprStr_replace_all] +#' @examples +#' df = pl$DataFrame(id = 1L:2L, text = c("123abc", "abc456")) +#' df$with_columns(pl$col("text")$str$replace(r"(abc\b)", "ABC")) #' -#' @seealso `$str$replace_all()`: Replace all matching regex/literal substrings. +#' # Capture groups are supported. +#' # Use `${1}` in the value string to refer to the first capture group in the pattern, +#' # `${2}` to refer to the second capture group, and so on. +#' # You can also use named capture groups. +#' df = pl$DataFrame(word = c("hat", "hut")) +#' df$with_columns( +#' positional = pl$col("word")$str$replace("h(.)t", "b${1}d"), +#' named = pl$col("word")$str$replace("h(?<vowel>.)t", "b${vowel}d") +#' ) #' -#' @examples -#' df = pl$DataFrame(id = c(1, 2), text = c("123abc", "abc456")) +#' # Apply case-insensitive string replacement using the `(?i)` flag. 
+#' df = pl$DataFrame( +#' city = "Philadelphia", +#' season = c("Spring", "Summer", "Autumn", "Winter"), +#' weather = c("Rainy", "Sunny", "Cloudy", "Snowy") +#' ) #' df$with_columns( -#' pl$col("text")$str$replace(r"{abc\b}", "ABC") +#' pl$col("weather")$str$replace("(?i)foggy|rainy|cloudy|snowy", "Sunny") #' ) -ExprStr_replace = function(pattern, value, literal = FALSE) { - .pr$Expr$str_replace(self, wrap_e_result(pattern), wrap_e_result(value), result(literal)) |> - unwrap("in str$replace():") +ExprStr_replace = function(pattern, value, ..., literal = FALSE, n = 1L) { + .pr$Expr$str_replace(self, pattern, value, literal, n) |> + unwrap("in $str$replace():") } #' Replace all matching regex/literal substrings with a new string value #' -#' @keywords ExprStr -#' @param pattern Regex pattern, can be an Expr. -#' @param value Replacement, can be an Expr. -#' @param literal Treat pattern as a literal string. -#' -#' @return Expr of String Series +#' @inherit ExprStr_replace details sections params return +#' @seealso +#' - [`$str$replace()`][ExprStr_replace] +#' @examples +#' df = pl$DataFrame(id = 1L:2L, text = c("abcabc", "123a123")) +#' df$with_columns(pl$col("text")$str$replace_all("a", "-")) #' -#' @seealso `$str$replace()`: Replace first matching regex/literal substring. +#' # Capture groups are supported. +#' # Use `${1}` in the value string to refer to the first capture group in the pattern, +#' # `${2}` to refer to the second capture group, and so on. +#' # You can also use named capture groups. +#' df = pl$DataFrame(word = c("hat", "hut")) +#' df$with_columns( +#' positional = pl$col("word")$str$replace_all("h(.)t", "b${1}d"), +#' named = pl$col("word")$str$replace_all("h(?<vowel>.)t", "b${vowel}d") +#' ) #' -#' @examples -#' df = pl$DataFrame(id = c(1, 2), text = c("abcabc", "123a123")) +#' # Apply case-insensitive string replacement using the `(?i)` flag. 
+#' df = pl$DataFrame( +#' city = "Philadelphia", +#' season = c("Spring", "Summer", "Autumn", "Winter"), +#' weather = c("Rainy", "Sunny", "Cloudy", "Snowy") +#' ) #' df$with_columns( -#' pl$col("text")$str$replace_all("a", "-") +#' pl$col("weather")$str$replace_all( +#' "(?i)foggy|rainy|cloudy|snowy", "Sunny" +#' ) #' ) -ExprStr_replace_all = function(pattern, value, literal = FALSE) { - .pr$Expr$str_replace_all(self, wrap_e_result(pattern), wrap_e_result(value), result(literal)) |> - unwrap("in str$replace_all():") +ExprStr_replace_all = function(pattern, value, ..., literal = FALSE) { + .pr$Expr$str_replace_all(self, pattern, value, literal) |> + unwrap("in $str$replace_all():") } diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 252b84cb7..467015dbc 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -994,7 +994,7 @@ RPolarsExpr$str_extract_all <- function(pattern) .Call(wrap__RPolarsExpr__str_ex RPolarsExpr$str_extract_groups <- function(pattern) .Call(wrap__RPolarsExpr__str_extract_groups, self, pattern) -RPolarsExpr$str_count_matches <- function(pattern, literal) .Call(wrap__RPolarsExpr__str_count_matches, self, pattern, literal) +RPolarsExpr$str_count_matches <- function(pat, literal) .Call(wrap__RPolarsExpr__str_count_matches, self, pat, literal) RPolarsExpr$str_to_date <- function(format, strict, exact, cache) .Call(wrap__RPolarsExpr__str_to_date, self, format, strict, exact, cache) @@ -1008,9 +1008,9 @@ RPolarsExpr$str_split_exact <- function(by, n, inclusive) .Call(wrap__RPolarsExp RPolarsExpr$str_splitn <- function(by, n) .Call(wrap__RPolarsExpr__str_splitn, self, by, n) -RPolarsExpr$str_replace <- function(pattern, value, literal) .Call(wrap__RPolarsExpr__str_replace, self, pattern, value, literal) +RPolarsExpr$str_replace <- function(pat, value, literal, n) .Call(wrap__RPolarsExpr__str_replace, self, pat, value, literal, n) -RPolarsExpr$str_replace_all <- function(pattern, value, literal) 
.Call(wrap__RPolarsExpr__str_replace_all, self, pattern, value, literal) +RPolarsExpr$str_replace_all <- function(pat, value, literal) .Call(wrap__RPolarsExpr__str_replace_all, self, pat, value, literal) RPolarsExpr$str_slice <- function(offset, length) .Call(wrap__RPolarsExpr__str_slice, self, offset, length) diff --git a/man/ExprStr_count_matches.Rd b/man/ExprStr_count_matches.Rd index bedf0ed79..885c475aa 100644 --- a/man/ExprStr_count_matches.Rd +++ b/man/ExprStr_count_matches.Rd @@ -4,30 +4,29 @@ \alias{ExprStr_count_matches} \title{Count all successive non-overlapping regex matches} \usage{ -ExprStr_count_matches(pattern, literal = FALSE) +ExprStr_count_matches(pattern, ..., literal = FALSE) } \arguments{ -\item{pattern}{A valid regex pattern} +\item{pattern}{A character or something that can be coerced to a string \link[=Expr_class]{Expr} +of a valid regex pattern, compatible with the \href{https://docs.rs/regex/latest/regex/}{regex crate}.} -\item{literal}{Treat pattern as a literal string.} +\item{...}{Ignored.} + +\item{literal}{Logical. If \code{TRUE}, treat \code{pattern} as a literal string, +not as a regular expression. Defaults to \code{FALSE}.} } \value{ -UInt32 array. Contains null if original value is null or regex capture nothing. +Expression of data type \code{UInt32}. +Returns \code{null} if the original value is \code{null}. 
} \description{ Count all successive non-overlapping regex matches } \examples{ -df = pl$DataFrame(foo = c("123 bla 45 asd", "xyz 678 910t")) -df$select( - pl$col("foo")$str$count_matches(r"{(\d)}")$alias("count digits") -) +df = pl$DataFrame(foo = c("12 dbc 3xy", "cat\\\\w", "1zy3\\\\d\\\\d", NA)) -# we can use Polars expressions as pattern so that it's not necessarily the -# same for all rows -df2 = pl$DataFrame(foo = c("hello", "hi there"), pat = c("ell", "e")) -df2$with_columns( - pl$col("foo")$str$count_matches(pl$col("pat"))$alias("reg_count") +df$with_columns( + count_digits = pl$col("foo")$str$count_matches(r"(\d)"), + count_slash_d = pl$col("foo")$str$count_matches(r"(\d)", literal = TRUE) ) } -\keyword{ExprStr} diff --git a/man/ExprStr_replace.Rd b/man/ExprStr_replace.Rd index d458f2ae6..98237c307 100644 --- a/man/ExprStr_replace.Rd +++ b/man/ExprStr_replace.Rd @@ -4,28 +4,66 @@ \alias{ExprStr_replace} \title{Replace first matching regex/literal substring with a new string value} \usage{ -ExprStr_replace(pattern, value, literal = FALSE) +ExprStr_replace(pattern, value, ..., literal = FALSE, n = 1L) } \arguments{ -\item{pattern}{Regex pattern, can be an Expr.} +\item{pattern}{A character or something that can be coerced to a string \link[=Expr_class]{Expr} +of a valid regex pattern, compatible with the \href{https://docs.rs/regex/latest/regex/}{regex crate}.} -\item{value}{Replacement, can be an Expr.} +\item{value}{A character or an \link[=Expr_class]{Expr} of string +that will replace the matched substring.} -\item{literal}{Treat pattern as a literal string.} +\item{...}{Ignored.} + +\item{literal}{Logical. 
If \code{TRUE}, treat \code{pattern} as a literal string, +not as a regular expression. Defaults to \code{FALSE}.} + +\item{n}{A number of matches to replace.} } \value{ -Expr of String Series +\link[=Expr_class]{Expr} of String type } \description{ Replace first matching regex/literal substring with a new string value } +\details{ +To modify regular expression behaviour (such as case-sensitivity) with flags, +use the inline \code{(?iLmsuxU)} syntax. See the regex crate’s section on +\href{https://docs.rs/regex/latest/regex/#grouping-and-flags}{grouping and flags} +for additional information about the use of inline expression modifiers. +} +\section{Capture groups}{ + +The dollar sign (\code{$}) is a special character related to capture groups. +To refer to a literal dollar sign, use \verb{$$} instead or set \code{literal} to \code{TRUE}. +} + \examples{ -df = pl$DataFrame(id = c(1, 2), text = c("123abc", "abc456")) +df = pl$DataFrame(id = 1L:2L, text = c("123abc", "abc456")) +df$with_columns(pl$col("text")$str$replace(r"(abc\b)", "ABC")) + +# Capture groups are supported. +# Use `${1}` in the value string to refer to the first capture group in the pattern, +# `${2}` to refer to the second capture group, and so on. +# You can also use named capture groups. +df = pl$DataFrame(word = c("hat", "hut")) +df$with_columns( + positional = pl$col("word")$str$replace("h(.)t", "b${1}d"), + named = pl$col("word")$str$replace("h(?<vowel>.)t", "b${vowel}d") +) + +# Apply case-insensitive string replacement using the `(?i)` flag. +df = pl$DataFrame( + city = "Philadelphia", + season = c("Spring", "Summer", "Autumn", "Winter"), + weather = c("Rainy", "Sunny", "Cloudy", "Snowy") +) df$with_columns( - pl$col("text")$str$replace(r"{abc\b}", "ABC") + pl$col("weather")$str$replace("(?i)foggy|rainy|cloudy|snowy", "Sunny") ) } \seealso{ -\verb{$str$replace_all()}: Replace all matching regex/literal substrings. 
+\itemize{ +\item \code{\link[=ExprStr_replace_all]{$str$replace_all()}} +} } -\keyword{ExprStr} diff --git a/man/ExprStr_replace_all.Rd b/man/ExprStr_replace_all.Rd index 36e48d7e0..e6cf77fac 100644 --- a/man/ExprStr_replace_all.Rd +++ b/man/ExprStr_replace_all.Rd @@ -4,28 +4,66 @@ \alias{ExprStr_replace_all} \title{Replace all matching regex/literal substrings with a new string value} \usage{ -ExprStr_replace_all(pattern, value, literal = FALSE) +ExprStr_replace_all(pattern, value, ..., literal = FALSE) } \arguments{ -\item{pattern}{Regex pattern, can be an Expr.} +\item{pattern}{A character or something that can be coerced to a string \link[=Expr_class]{Expr} +of a valid regex pattern, compatible with the \href{https://docs.rs/regex/latest/regex/}{regex crate}.} -\item{value}{Replacement, can be an Expr.} +\item{value}{A character or an \link[=Expr_class]{Expr} of string +that will replace the matched substring.} -\item{literal}{Treat pattern as a literal string.} +\item{...}{Ignored.} + +\item{literal}{Logical. If \code{TRUE}, treat \code{pattern} as a literal string, +not as a regular expression. Defaults to \code{FALSE}.} } \value{ -Expr of String Series +\link[=Expr_class]{Expr} of String type } \description{ Replace all matching regex/literal substrings with a new string value } +\details{ +To modify regular expression behaviour (such as case-sensitivity) with flags, +use the inline \code{(?iLmsuxU)} syntax. See the regex crate’s section on +\href{https://docs.rs/regex/latest/regex/#grouping-and-flags}{grouping and flags} +for additional information about the use of inline expression modifiers. +} +\section{Capture groups}{ + +The dollar sign (\code{$}) is a special character related to capture groups. +To refer to a literal dollar sign, use \verb{$$} instead or set \code{literal} to \code{TRUE}. 
+} + \examples{ -df = pl$DataFrame(id = c(1, 2), text = c("abcabc", "123a123")) +df = pl$DataFrame(id = 1L:2L, text = c("abcabc", "123a123")) +df$with_columns(pl$col("text")$str$replace_all("a", "-")) + +# Capture groups are supported. +# Use `${1}` in the value string to refer to the first capture group in the pattern, +# `${2}` to refer to the second capture group, and so on. +# You can also use named capture groups. +df = pl$DataFrame(word = c("hat", "hut")) df$with_columns( - pl$col("text")$str$replace_all("a", "-") + positional = pl$col("word")$str$replace_all("h(.)t", "b${1}d"), + named = pl$col("word")$str$replace_all("h(?<vowel>.)t", "b${vowel}d") +) + +# Apply case-insensitive string replacement using the `(?i)` flag. +df = pl$DataFrame( + city = "Philadelphia", + season = c("Spring", "Summer", "Autumn", "Winter"), + weather = c("Rainy", "Sunny", "Cloudy", "Snowy") +) +df$with_columns( + pl$col("weather")$str$replace_all( + "(?i)foggy|rainy|cloudy|snowy", "Sunny" + ) ) } \seealso{ -\verb{$str$replace()}: Replace first matching regex/literal substring. +\itemize{ +\item \code{\link[=ExprStr_replace]{$str$replace()}} +} } -\keyword{ExprStr} diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index ad29a8bb7..435a07a5f 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -2150,13 +2150,10 @@ impl RPolarsExpr { Ok(self.0.clone().str().extract_groups(pattern)?.into()) } - pub fn str_count_matches(&self, pattern: Robj, literal: Robj) -> RResult<Self> { - Ok(self - .0 - .clone() - .str() - .count_matches(robj_to!(PLExpr, pattern)?, robj_to!(bool, literal)?) 
- .into()) + pub fn str_count_matches(&self, pat: Robj, literal: Robj) -> RResult<Self> { + let pat = robj_to!(PLExpr, pat)?; + let literal = robj_to!(bool, literal)?; + Ok(self.0.clone().str().count_matches(pat, literal).into()) } pub fn str_to_date( @@ -2260,38 +2257,33 @@ impl RPolarsExpr { pub fn str_replace( &self, - pattern: Robj, + pat: Robj, value: Robj, literal: Robj, + n: Robj, ) -> Result { + let pat = robj_to!(PLExpr, pat)?; + let value = robj_to!(PLExpr, value)?; + let literal = robj_to!(bool, literal)?; + let n = robj_to!(i64, n)?; Ok(self .0 .clone() .str() - .replace( - robj_to!(Expr, pattern)?.0, - robj_to!(Expr, value)?.0, - robj_to!(bool, literal)?, - ) + .replace_n(pat, value, literal, n) .into()) } pub fn str_replace_all( &self, - pattern: Robj, + pat: Robj, value: Robj, literal: Robj, ) -> Result { - Ok(self - .0 - .clone() - .str() - .replace_all( - robj_to!(Expr, pattern)?.0, - robj_to!(Expr, value)?.0, - robj_to!(bool, literal)?, - ) - .into()) + let pat = robj_to!(PLExpr, pat)?; + let value = robj_to!(PLExpr, value)?; + let literal = robj_to!(bool, literal)?; + Ok(self.0.clone().str().replace_all(pat, value, literal).into()) } pub fn str_slice(&self, offset: Robj, length: Robj) -> Result { diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index 6a8146d9e..5a57601a0 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -604,13 +604,13 @@ test_that("str$replace", { ) expect_identical( - pl$lit(c("123abc", "abc456"))$str$replace(r"{abc\b}", "ABC", TRUE)$to_r(), + pl$lit(c("123abc", "abc456"))$str$replace(r"{abc\b}", "ABC", literal = TRUE)$to_r(), c("123abc", "abc456") ) e = pl$lit(r"{(abc\b)}") expect_identical( - pl$lit(c("123abc", "abc456"))$str$replace(e, "ABC", FALSE)$to_r(), + pl$lit(c("123abc", "abc456"))$str$replace(e, "ABC", literal = FALSE)$to_r(), c("123ABC", "abc456") ) @@ -637,7 +637,7 @@ test_that("str$replace_all", { ) expect_identical( - pl$lit(c("abcabc", 
"123a123"))$str$replace_all("^12", "-", TRUE)$to_r(), + pl$lit(c("abcabc", "123a123"))$str$replace_all("^12", "-", literal = TRUE)$to_r(), c("abcabc", "123a123") ) })