From 91751a1d8a75aef1510f9888f5ee6a2396b4db01 Mon Sep 17 00:00:00 2001 From: qinwf Date: Sun, 24 Apr 2016 00:49:38 +0800 Subject: [PATCH] Add: initial UTF-8 support on Windows with stringi #5 --- DESCRIPTION | 4 ++-- NAMESPACE | 1 + R/compile.R | 7 ++----- R/extract.R | 2 +- R/match.R | 2 +- R/replace.R | 2 +- R/zzz.R | 1 + tests/testthat/test-unicode.R | 6 ++++++ 8 files changed, 15 insertions(+), 10 deletions(-) create mode 100644 tests/testthat/test-unicode.R diff --git a/DESCRIPTION b/DESCRIPTION index 7d8c184..58babf0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,13 +9,13 @@ Description: RE2 is a primarily DFA based regexp engine from Google that is very License: BSD_3_clause + file LICENSE LazyData: TRUE Imports: - Rcpp (>= 0.12.2) + Rcpp (>= 0.12.2), stringi LinkingTo: Rcpp Suggests: knitr (>= 1.12.3),testthat, rmarkdown (>= 0.9.5),microbenchmark,rex Enhances: - directlabels, ggplot2 ,stringi + directlabels, ggplot2 URL: https://github.com/qinwf/re2r/ BugReports: https://github.com/qinwf/re2r/issues VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index f08dfc7..769a720 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,4 +15,5 @@ export(re2_extract) export(re2_match) export(re2_replace) import(Rcpp) +importFrom(stringi,stri_enc_toutf8) useDynLib(re2r) diff --git a/R/compile.R b/R/compile.R index ced04ce..97bb1b4 100644 --- a/R/compile.R +++ b/R/compile.R @@ -108,7 +108,7 @@ re2 = function(pattern, # pattern = enc2utf8(pattern) # } regexp = cpp_re2_compile( - pattern, + stri_enc_toutf8(pattern), log_errors_value = FALSE, utf_8_value = utf_8, case_sensitive_value = case_sensitive, @@ -178,10 +178,7 @@ get_named_groups = function(regexp) { #' @return quoted string #' @export quote_meta = function(unquoted) { - # if (check_windows_strings(unquoted)) { - # unquoted = enc2utf8(unquoted) - # } - res = cpp_quote_meta(unquoted) + res = cpp_quote_meta(stri_enc_toutf8(unquoted)) # if (update_windows_strings()) { # Encoding(res) = "UTF-8" # } diff --git a/R/extract.R b/R/extract.R index a08ac20..a40ddc0 100644 --- a/R/extract.R +++ b/R/extract.R @@ -48,5 +48,5 @@ re2_extract = function(input, pattern, rewrite = "\\1", ...) { if (is.character(pattern)) { pattern = re2(pattern, ...) } - cpp_extract(input, pattern, rewrite) + cpp_extract(stri_enc_toutf8(input), pattern, stri_enc_toutf8(rewrite)) } diff --git a/R/match.R b/R/match.R index cfc85fd..a5963a0 100644 --- a/R/match.R +++ b/R/match.R @@ -69,5 +69,5 @@ re2_match = function(input, if (is.character(pattern)) { pattern = re2(pattern, ...) } - cpp_match(input, pattern, value, anchor, all) + cpp_match(stri_enc_toutf8(input), pattern, value, anchor, all) } diff --git a/R/replace.R b/R/replace.R index 140d7ed..9a03db3 100644 --- a/R/replace.R +++ b/R/replace.R @@ -52,5 +52,5 @@ re2_replace = function(input, pattern, rewrite, all = FALSE, ...) { if (is.character(pattern)) { pattern = re2(pattern, ...) } - cpp_replace(input, pattern, rewrite, all) + cpp_replace(stri_enc_toutf8(input), pattern, stri_enc_toutf8(rewrite), all) } diff --git a/R/zzz.R b/R/zzz.R index f6bf6f2..59bfd89 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -32,4 +32,5 @@ ##' @useDynLib re2r ##' @import Rcpp +##' @importFrom stringi stri_enc_toutf8 NULL diff --git a/tests/testthat/test-unicode.R b/tests/testthat/test-unicode.R new file mode 100644 index 0000000..c516026 --- /dev/null +++ b/tests/testthat/test-unicode.R @@ -0,0 +1,6 @@ +context("Unicode") + +test_that("unicode match",{ + x <- stringi::stri_conv("a\u6587bc", "UTF-8", "") + expect_true(re2_match(x,"\u6587")) +})