Skip to content

Commit

Permalink
Add: initial UTF-8 support on Windows with stringi #5
Browse files Browse the repository at this point in the history
  • Loading branch information
qinwf committed Apr 23, 2016
1 parent accbed0 commit 91751a1
Show file tree
Hide file tree
Showing 8 changed files with 15 additions and 10 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ Description: RE2 is a primarily DFA based regexp engine from Google that is very
License: BSD_3_clause + file LICENSE
LazyData: TRUE
Imports:
Rcpp (>= 0.12.2)
Rcpp (>= 0.12.2), stringi
LinkingTo: Rcpp
Suggests:
knitr (>= 1.12.3),testthat,
rmarkdown (>= 0.9.5),microbenchmark,rex
Enhances:
directlabels, ggplot2 ,stringi
directlabels, ggplot2
URL: https://github.com/qinwf/re2r/
BugReports: https://github.com/qinwf/re2r/issues
VignetteBuilder: knitr
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ export(re2_extract)
export(re2_match)
export(re2_replace)
import(Rcpp)
importFrom(stringi,stri_enc_toutf8)
useDynLib(re2r)
7 changes: 2 additions & 5 deletions R/compile.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ re2 = function(pattern,
# pattern = enc2utf8(pattern)
# }
regexp = cpp_re2_compile(
pattern,
stri_enc_toutf8(pattern),
log_errors_value = FALSE,
utf_8_value = utf_8,
case_sensitive_value = case_sensitive,
Expand Down Expand Up @@ -178,10 +178,7 @@ get_named_groups = function(regexp) {
#' @return quoted string
#' @export
quote_meta = function(unquoted) {
# if (check_windows_strings(unquoted)) {
# unquoted = enc2utf8(unquoted)
# }
res = cpp_quote_meta(unquoted)
res = cpp_quote_meta(stri_enc_toutf8(unquoted))
# if (update_windows_strings()) {
# Encoding(res) = "UTF-8"
# }
Expand Down
2 changes: 1 addition & 1 deletion R/extract.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,5 @@ re2_extract = function(input, pattern, rewrite = "\\1", ...) {
if (is.character(pattern)) {
pattern = re2(pattern, ...)
}
cpp_extract(input, pattern, rewrite)
cpp_extract(stri_enc_toutf8(input), pattern, stri_enc_toutf8(rewrite))
}
2 changes: 1 addition & 1 deletion R/match.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,5 +69,5 @@ re2_match = function(input,
if (is.character(pattern)) {
pattern = re2(pattern, ...)
}
cpp_match(input, pattern, value, anchor, all)
cpp_match(stri_enc_toutf8(input), pattern, value, anchor, all)
}
2 changes: 1 addition & 1 deletion R/replace.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,5 @@ re2_replace = function(input, pattern, rewrite, all = FALSE, ...) {
if (is.character(pattern)) {
pattern = re2(pattern, ...)
}
cpp_replace(input, pattern, rewrite, all)
cpp_replace(stri_enc_toutf8(input), pattern, stri_enc_toutf8(rewrite), all)
}
1 change: 1 addition & 0 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@

##' @useDynLib re2r
##' @import Rcpp
##' @importFrom stringi stri_enc_toutf8
NULL
6 changes: 6 additions & 0 deletions tests/testthat/test-unicode.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
context("Unicode")

# A string holding U+6587 is re-encoded from UTF-8 to stringi's default
# encoding (empty `to`), then matched against that same character.
test_that("unicode match", {
  native_input <- stringi::stri_conv("a\u6587bc", from = "UTF-8", to = "")
  matched <- re2_match(native_input, "\u6587")
  expect_true(matched)
})

0 comments on commit 91751a1

Please sign in to comment.