diff --git a/NEWS.md b/NEWS.md index bf58082b6..3626dd379 100644 --- a/NEWS.md +++ b/NEWS.md @@ -102,6 +102,7 @@ - Export the `Duration` datatype (#955). - New functions `pl$int_range()` and `pl$int_ranges()` (#968). - New string method `$str$extract_groups()` (#979). +- New string method `$str$find()` (#985). ### Bug fixes diff --git a/R/expr__string.R b/R/expr__string.R index 57569963f..ce6bdae16 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -429,13 +429,13 @@ ExprStr_pad_start = function(width, fillchar = " ") { } -# TODO: Add ExprStr_find to seealso #' Check if string contains a substring that matches a pattern #' -#' @details To modify regular expression behaviour (such as case-sensitivity) with flags, -#' use the inline `(?iLmsuxU)` syntax. See the regex crate’s section on -#' [grouping and flags](https://docs.rs/regex/latest/regex/#grouping-and-flags) +#' @details To modify regular expression behaviour (such as case-sensitivity) +#' with flags, use the inline `(?iLmsuxU)` syntax. See the regex crate’s section +#' on [grouping and flags](https://docs.rs/regex/latest/regex/#grouping-and-flags) #' for additional information about the use of inline expression modifiers. +#' #' @param pattern A character or something can be coerced to a string [Expr][Expr_class] #' of a valid regex pattern, compatible with the [regex crate](https://docs.rs/regex/latest/regex/). #' @param ... Ignored. @@ -443,10 +443,17 @@ ExprStr_pad_start = function(width, fillchar = " ") { #' not as a regular expression. #' @param strict Logical. If `TRUE` (default), raise an error if the underlying pattern is #' not a valid regex, otherwise mask out with a null value. +#' #' @return [Expr][Expr_class] of Boolean data type #' @seealso -#' - [`$str$start_with()`][ExprStr_starts_with]: Check if string values start with a substring. -#' - [`$str$ends_with()`][ExprStr_ends_with]: Check if string values end with a substring. +#' - [`$str$start_with()`][ExprStr_starts_with]: Check if string values +#' start with a substring. +#' - [`$str$ends_with()`][ExprStr_ends_with]: Check if string values end +#' with a substring. +#' - [`$str$find()`][ExprStr_find]: Return the index position of the first +#' substring matching a pattern. +#' +#' #' @examples #' # The inline `(?i)` syntax example #' pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( @@ -966,3 +973,27 @@ ExprStr_extract_groups = function(pattern) { .pr$Expr$str_extract_groups(self, pattern) |> unwrap("in str$extract_groups():") } + +#' Return the index position of the first substring matching a pattern +#' +#' @inherit ExprStr_contains params details +#' +#' @return An Expr of data type UInt32 +#' +#' @seealso +#' - [`$str$start_with()`][ExprStr_starts_with]: Check if string values +#' start with a substring. +#' - [`$str$ends_with()`][ExprStr_ends_with]: Check if string values end +#' with a substring. +#' - [`$str$contains()`][ExprStr_contains]: Check if string contains a substring +#' that matches a pattern. +#' +#' @examples +#' pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( +#' default_match = pl$col("s")$str$find("Aa"), +#' insensitive_match = pl$col("s")$str$find("(?i)Aa") +#' ) +ExprStr_find = function(pattern, ..., literal = FALSE, strict = TRUE) { + .pr$Expr$str_find(self, pattern, literal, strict) |> + unwrap("in str$find():") +} diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index fdcf55022..999975724 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1024,6 +1024,8 @@ RPolarsExpr$str_contains_any <- function(patterns, ascii_case_insensitive) .Call RPolarsExpr$str_replace_many <- function(patterns, replace_with, ascii_case_insensitive) .Call(wrap__RPolarsExpr__str_replace_many, self, patterns, replace_with, ascii_case_insensitive) +RPolarsExpr$str_find <- function(pat, literal, strict) .Call(wrap__RPolarsExpr__str_find, self, pat, literal, strict) + RPolarsExpr$bin_contains <- function(lit) .Call(wrap__RPolarsExpr__bin_contains, self, lit) RPolarsExpr$bin_starts_with <- function(sub) .Call(wrap__RPolarsExpr__bin_starts_with, self, sub) diff --git a/man/ExprStr_contains.Rd b/man/ExprStr_contains.Rd index ccbd56164..d656a9fa1 100644 --- a/man/ExprStr_contains.Rd +++ b/man/ExprStr_contains.Rd @@ -25,9 +25,9 @@ not a valid regex, otherwise mask out with a null value.} Check if string contains a substring that matches a pattern } \details{ -To modify regular expression behaviour (such as case-sensitivity) with flags, -use the inline \code{(?iLmsuxU)} syntax. See the regex crate’s section on -\href{https://docs.rs/regex/latest/regex/#grouping-and-flags}{grouping and flags} +To modify regular expression behaviour (such as case-sensitivity) +with flags, use the inline \code{(?iLmsuxU)} syntax. See the regex crate’s section +on \href{https://docs.rs/regex/latest/regex/#grouping-and-flags}{grouping and flags} for additional information about the use of inline expression modifiers. } \examples{ @@ -45,7 +45,11 @@ df$with_columns( } \seealso{ \itemize{ -\item \code{\link[=ExprStr_starts_with]{$str$start_with()}}: Check if string values start with a substring. -\item \code{\link[=ExprStr_ends_with]{$str$ends_with()}}: Check if string values end with a substring. +\item \code{\link[=ExprStr_starts_with]{$str$start_with()}}: Check if string values +start with a substring. +\item \code{\link[=ExprStr_ends_with]{$str$ends_with()}}: Check if string values end +with a substring. +\item \code{\link[=ExprStr_find]{$str$find()}}: Return the index position of the first +substring matching a pattern. } } diff --git a/man/ExprStr_find.Rd b/man/ExprStr_find.Rd new file mode 100644 index 000000000..00c4082f4 --- /dev/null +++ b/man/ExprStr_find.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expr__string.R +\name{ExprStr_find} +\alias{ExprStr_find} +\title{Return the index position of the first substring matching a pattern} +\usage{ +ExprStr_find(pattern, ..., literal = FALSE, strict = TRUE) +} +\arguments{ +\item{pattern}{A character or something can be coerced to a string \link[=Expr_class]{Expr} +of a valid regex pattern, compatible with the \href{https://docs.rs/regex/latest/regex/}{regex crate}.} + +\item{...}{Ignored.} + +\item{literal}{Logical. If \code{TRUE} (default), treat \code{pattern} as a literal string, +not as a regular expression.} + +\item{strict}{Logical. If \code{TRUE} (default), raise an error if the underlying pattern is +not a valid regex, otherwise mask out with a null value.} +} +\value{ +An Expr of data type UInt32 +} +\description{ +Return the index position of the first substring matching a pattern +} +\details{ +To modify regular expression behaviour (such as case-sensitivity) +with flags, use the inline \code{(?iLmsuxU)} syntax. See the regex crate’s section +on \href{https://docs.rs/regex/latest/regex/#grouping-and-flags}{grouping and flags} +for additional information about the use of inline expression modifiers. +} +\examples{ +pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( + default_match = pl$col("s")$str$find("Aa"), + insensitive_match = pl$col("s")$str$find("(?i)Aa") +) +} +\seealso{ +\itemize{ +\item \code{\link[=ExprStr_starts_with]{$str$start_with()}}: Check if string values +start with a substring. +\item \code{\link[=ExprStr_ends_with]{$str$ends_with()}}: Check if string values end +with a substring. +\item \code{\link[=ExprStr_contains]{$str$contains()}}: Check if string contains a substring +that matches a pattern. +} +} diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 21de1da85..47df3d6ac 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -2354,6 +2354,15 @@ impl RPolarsExpr { .into()) } + pub fn str_find(&self, pat: Robj, literal: Robj, strict: Robj) -> RResult { + let pat = robj_to!(PLExpr, pat)?; + let literal = robj_to!(Option, bool, literal)?; + let strict = robj_to!(bool, strict)?; + match literal { + Some(true) => Ok(self.0.clone().str().find_literal(pat).into()), + _ => Ok(self.0.clone().str().find(pat, strict).into()), + } + } //binary methods pub fn bin_contains(&self, lit: Robj) -> RResult { Ok(self diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index c11a5c996..51f3ca432 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -405,29 +405,30 @@ [267] "str_contains_any" "str_count_matches" [269] "str_ends_with" "str_explode" [271] "str_extract" "str_extract_all" - [273] "str_extract_groups" "str_hex_decode" - [275] "str_hex_encode" "str_json_decode" - [277] "str_json_path_match" "str_len_bytes" - [279] "str_len_chars" "str_pad_end" - [281] "str_pad_start" "str_parse_int" - [283] "str_replace" "str_replace_all" - [285] "str_replace_many" "str_reverse" - [287] "str_slice" "str_split" - [289] "str_split_exact" "str_splitn" - [291] "str_starts_with" "str_strip_chars" - [293] "str_strip_chars_end" "str_strip_chars_start" - [295] "str_to_date" "str_to_datetime" - [297] "str_to_lowercase" "str_to_time" - [299] "str_to_titlecase" "str_to_uppercase" - [301] "str_zfill" "struct_field_by_name" - [303] "struct_rename_fields" "sub" - [305] "sum" "tail" - [307] "tan" "tanh" - [309] "timestamp" "to_physical" - [311] "top_k" "unique" - [313] "unique_counts" "unique_stable" - [315] "upper_bound" "value_counts" - [317] "var" "xor" + [273] "str_extract_groups" "str_find" + [275] "str_hex_decode" "str_hex_encode" + [277] "str_json_decode" "str_json_path_match" + [279] "str_len_bytes" "str_len_chars" + [281] "str_pad_end" "str_pad_start" + [283] "str_parse_int" "str_replace" + [285] "str_replace_all" "str_replace_many" + [287] "str_reverse" "str_slice" + [289] "str_split" "str_split_exact" + [291] "str_splitn" "str_starts_with" + [293] "str_strip_chars" "str_strip_chars_end" + [295] "str_strip_chars_start" "str_to_date" + [297] "str_to_datetime" "str_to_lowercase" + [299] "str_to_time" "str_to_titlecase" + [301] "str_to_uppercase" "str_zfill" + [303] "struct_field_by_name" "struct_rename_fields" + [305] "sub" "sum" + [307] "tail" "tan" + [309] "tanh" "timestamp" + [311] "to_physical" "top_k" + [313] "unique" "unique_counts" + [315] "unique_stable" "upper_bound" + [317] "value_counts" "var" + [319] "xor" # public and private methods of each class When diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index 6a8146d9e..8d470a5dd 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -832,3 +832,38 @@ test_that("str$extract_groups() works", { list(url = NULL) ) }) + +test_that("str$find() works", { + test = pl$DataFrame(s = c("AAA", "aAa", "aaa", "(?i)Aa")) + + expect_identical( + test$select( + default = pl$col("s")$str$find("Aa"), + insensitive = pl$col("s")$str$find("(?i)Aa") + )$to_list(), + list(default = c(NA, 1, NA, 4), insensitive = c(0, 0, 0, 4)) + ) + + # arg "literal" works + expect_identical( + test$select( + lit = pl$col("s")$str$find("(?i)Aa", literal = TRUE) + )$to_list(), + list(lit = c(NA, NA, NA, 0)) + ) + + # arg "strict" works + expect_grepl_error( + test$select(lit = pl$col("s")$str$find("(?iAa")), + "unrecognized flag" + ) + + expect_silent( + test$select(lit = pl$col("s")$str$find("(?iAa", strict = FALSE)) + ) + + # combining "literal" and "strict" + expect_silent( + test$select(lit = pl$col("s")$str$find("(?iAa", strict = TRUE, literal = TRUE)) + ) +})