Skip to content

Commit

Permalink
Add string.replace_re APIs to pylibcudf (rapidsai#17023)
Browse files Browse the repository at this point in the history
Contributes to rapidsai#15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: rapidsai#17023
  • Loading branch information
mroeschke authored Oct 22, 2024
1 parent 637e320 commit 4fe338c
Show file tree
Hide file tree
Showing 11 changed files with 289 additions and 88 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ strings
regex_flags
regex_program
repeat
replace_re
replace
side_type
slice
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
==========
replace_re
==========

.. automodule:: pylibcudf.strings.replace_re
:members:
104 changes: 28 additions & 76 deletions python/cudf/cudf/_lib/strings/replace_re.pyx
Original file line number Diff line number Diff line change
@@ -1,26 +1,11 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pylibcudf.libcudf.types cimport size_type
import pylibcudf as plc

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from pylibcudf.libcudf.strings.regex_program cimport regex_program
from pylibcudf.libcudf.strings.replace_re cimport (
replace_re as cpp_replace_re,
replace_with_backrefs as cpp_replace_with_backrefs,
)
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar


@acquire_spill_lock()
Expand All @@ -34,28 +19,16 @@ def replace_re(Column source_strings,
`n` indicates the number of replacements to be made from
start. (-1 indicates all)
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string pattern_string = <string>str(pattern).encode()
cdef const string_scalar* scalar_repl = \
<const string_scalar*>(repl.get_raw_ptr())
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_replace_re(
source_view,
dereference(c_prog),
scalar_repl[0],
n
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.replace_re.replace_re(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT
),
py_repl.device_value.c_value,
n
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -68,50 +41,29 @@ def replace_with_backrefs(
new string with the extracted elements found using
`pattern` regular expression in `source_strings`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string pattern_string = <string>str(pattern).encode()
cdef string repl_string = <string>str(repl).encode()
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_replace_with_backrefs(
source_view,
dereference(c_prog),
repl_string
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.replace_re.replace_with_backrefs(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT
),
repl
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
def replace_multi_re(Column source_strings,
object patterns,
list patterns,
Column repl_strings):
"""
Returns a Column after replacing occurrences of multiple
regular expressions `patterns` with their corresponding
strings in `repl_strings` in `source_strings`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view repl_view = repl_strings.view()

cdef int pattern_size = len(patterns)
cdef vector[string] patterns_vector
patterns_vector.reserve(pattern_size)

for pattern in patterns:
patterns_vector.push_back(str.encode(pattern))

with nogil:
c_result = move(cpp_replace_re(
source_view,
patterns_vector,
repl_view
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.replace_re.replace_re(
source_strings.to_pylibcudf(mode="read"),
patterns,
repl_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -998,7 +998,7 @@ def replace(
return self._return_or_inplace(
libstrings.replace_multi_re(
self._column,
pat,
list(pat),
column.as_column(repl, dtype="str"),
)
if regex
Expand Down
24 changes: 13 additions & 11 deletions python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from libcpp.vector cimport vector
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from pylibcudf.libcudf.strings.regex_program cimport regex_program
from pylibcudf.libcudf.table.table cimport table
from pylibcudf.libcudf.types cimport size_type
Expand All @@ -14,17 +15,18 @@ from pylibcudf.libcudf.types cimport size_type
cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil:

cdef unique_ptr[column] replace_re(
column_view source_strings,
regex_program,
string_scalar repl,
size_type maxrepl) except +

cdef unique_ptr[column] replace_with_backrefs(
column_view source_strings,
regex_program,
string repl) except +
column_view input,
regex_program prog,
string_scalar replacement,
size_type max_replace_count) except +

cdef unique_ptr[column] replace_re(
column_view source_strings,
column_view input,
vector[string] patterns,
column_view repls) except +
column_view replacements,
regex_flags flags) except +

cdef unique_ptr[column] replace_with_backrefs(
column_view input,
regex_program prog,
string replacement) except +
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ set(cython_sources
regex_program.pyx
repeat.pyx
replace.pyx
replace_re.pyx
side_type.pyx
slice.pyx
strip.pyx
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ from . cimport (
regex_program,
repeat,
replace,
replace_re,
side_type,
slice,
split,
Expand All @@ -42,6 +43,7 @@ __all__ = [
"regex_program",
"repeat",
"replace",
"replace_re",
"slice",
"strip",
"split",
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
regex_program,
repeat,
replace,
replace_re,
side_type,
slice,
split,
Expand All @@ -42,6 +43,7 @@
"regex_program",
"repeat",
"replace",
"replace_re",
"slice",
"strip",
"split",
Expand Down
30 changes: 30 additions & 0 deletions python/pylibcudf/pylibcudf/strings/replace_re.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar
from pylibcudf.strings.regex_flags cimport regex_flags
from pylibcudf.strings.regex_program cimport RegexProgram

# Fused type used for the `replacement` parameter of `replace_re` below:
# callers may pass either a Column or a Scalar.
# NOTE(review): presumably Scalar pairs with a single RegexProgram and Column
# pairs with a list of patterns, mirroring the two libcudf `replace_re`
# overloads -- confirm against replace_re.pyx.
ctypedef fused Replacement:
Column
Scalar

# Fused type used for the `patterns` parameter of `replace_re` below:
# either a single compiled RegexProgram or a Python list.
# NOTE(review): the list is presumably a list of pattern strings (the libcudf
# overload takes vector[string]) -- confirm against replace_re.pyx.
ctypedef fused Patterns:
RegexProgram
list


# Declaration of the regex-replace entry point; returns a new Column.
#   input             -- Column to operate on.
#   patterns          -- a RegexProgram or a list (see the `Patterns` fused type).
#   replacement       -- a Column or a Scalar (see the `Replacement` fused type).
#   max_replace_count -- size_type count of replacements to perform.
#   flags             -- regex_flags value.
# `=*` marks a parameter as optional in this .pxd; the actual default values
# are defined in the matching .pyx implementation, not here.
cpdef Column replace_re(
Column input,
Patterns patterns,
Replacement replacement=*,
size_type max_replace_count=*,
regex_flags flags=*
)

# Declaration of the back-reference replace entry point; returns a new Column.
#   input       -- Column to operate on.
#   prog        -- compiled RegexProgram to match against `input`.
#   replacement -- Python str replacement template.
# All three parameters are required (no `=*` defaults are declared).
cpdef Column replace_with_backrefs(
Column input,
RegexProgram prog,
str replacement
)
Loading

0 comments on commit 4fe338c

Please sign in to comment.