Skip to content

Commit

Permalink
Bug fix for text with leading/trailing whitespace
Browse files Browse the repository at this point in the history
- Now also handles mixed vectors e.g. `c(1, "two", 3)`
- Update README.Rmd for more clarity
  • Loading branch information
bahadzie committed Feb 25, 2024
1 parent ea61ca2 commit abfc245
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 10 deletions.
18 changes: 15 additions & 3 deletions R/numberize.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,13 @@ digits_from <- function(text, lang = "en") {
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
"twenty", "", "", "", "", "", "", "", "", "",
"twenty", "twenty one", "twenty two", "twenty three", "twenty four",
"twenty five", "twenty six", "twenty seven", "twenty eight",
"twenty nine",
"thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
"hundred", "", "", "", "", "", "", "", "",
"hundred", "two hundred", "three hundred", "four hundred",
"five hundred", "six hundred", "seven hundred", "eight hundred",
"nine hundred",
"thousand", "million", "billion", "trillion"
),
es = c(
Expand Down Expand Up @@ -56,7 +60,8 @@ digits_from <- function(text, lang = "en") {
)

# clean and prep
text <- tolower(text)
text <- tolower(text) # converts to string as a side effect
text <- trimws(text)
text <- gsub("\\sand|-|,|\\bet\\b|\\sy\\s", " ", text) # all lang

# TODO check the words are in the selected lang or return NA
Expand All @@ -73,6 +78,12 @@ digits_from <- function(text, lang = "en") {
# lang=fr one word
text <- gsub("quatre vingt", "quatre-vingt", text, fixed = TRUE)
}
# TODO raise issue in {lintr} repo. False lint generated by next line.
# [nonportable_path_linter] Use file.path() to construct portable file paths.
only_digits <- grepl("^\\d+$", text) # nolint
if (only_digits) {
return(as.double(text))
}

words <- strsplit(text, "\\s+")[[1]]
digits <- numbers[match(words, numbers[[lang]]), "digit"]
Expand Down Expand Up @@ -146,6 +157,7 @@ number_from <- function(digits) {
#'
#' @param text Vector containing spelt numbers in a supported language.
#' @param lang The text's language. Currently one of `"en" | "fr" | "es"`.
#' Defaults to `"en"`.
#'
#' @return A numeric value.
#'
Expand Down
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ knitr::opts_chunk$set(

<!-- badges: end -->

_{{ packagename }}_ is an R package to convert numbers written as English, French or Spanish words from `"zero"` to `"nine hundred and ninety nine trillion, nine hundred and ninety nine billion, nine hundred and ninety nine million, nine hundred and ninety nine thousand, nine hundred and ninety nine"` from a character string to a numeric value.
_{{ packagename }}_ is an R package that converts integers written out as English, French or Spanish words into numeric values. It accepts character strings ranging from `"zero"` to `"nine hundred and ninety nine trillion, nine hundred and ninety nine billion, nine hundred and ninety nine million, nine hundred and ninety nine thousand, nine hundred and ninety nine"`.

<!-- This sentence is optional and can be removed -->
_{{ packagename }}_ is developed at the [{{ department }}]({{ department_url }}) at the {{ institution }} as part of the [Epiverse-TRACE program](https://data.org/initiatives/epiverse/).
Expand Down
28 changes: 22 additions & 6 deletions tests/testthat/test-numberize.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,18 +67,17 @@ test_df <- data.frame(
)
)

test_that("translating English numbers works", {
# English: the whole `en` column of `test_df` is passed to numberize() in one
# call; the result, with names stripped, must equal the `num` column exactly.
test_that("translating vector of English numbers works", {
  expected <- test_df[["num"]]
  converted <- unname(numberize(test_df[["en"]]))
  expect_identical(converted, expected)
})

test_that("translating French numbers works", {
# French: the whole `fr` column of `test_df` is passed to numberize() with
# lang = "fr"; the unnamed result must equal the `num` column exactly.
test_that("translating vector of French numbers works", {
  expected <- test_df[["num"]]
  converted <- unname(numberize(test_df[["fr"]], lang = "fr"))
  expect_identical(converted, expected)
})

test_that("translating Spanish numbers works", {
# Spanish: the whole `es` column of `test_df` is passed to numberize() with
# lang = "es"; the unnamed result must equal the `num` column exactly.
test_that("translating vector of Spanish numbers works", {
  expected <- test_df[["num"]]
  converted <- unname(numberize(test_df[["es"]], lang = "es"))
  expect_identical(converted, expected)
})
Expand All @@ -88,8 +87,25 @@ test_that("translating single french text works", {
expect_identical(unname(res), 1515)
})

test_that("non digit word returns NA", {
# Regression test for padded input: the string literal below starts with a
# space and, because it spans two source lines, ends with an embedded newline.
# The padded French text must still convert to 1515.
test_that("translating text with leading and trailing whitespace works", {
res <- numberize(" mille cinq cent quinze
", lang = "fr")
# unname() drops any names on the result before the exact comparison.
expect_identical(unname(res), 1515)
})


# A word that is not a number word ("epiverse") is expected to yield NA
# rather than a numeric value.
test_that("text with non digit word returns NA", {
  outcome <- numberize("epiverse", lang = "en")
  expect_true(is.na(outcome))
})
# TODO test edge cases in es and fr

# Mixed input: c() coerces 1 and 3 to the strings "1" and "3" alongside "two",
# so numberize() receives a character vector mixing digits and spelt numbers.
test_that("vector contains actual digits and spelt digits", {
  mixed_input <- c(1, "two", 3)
  converted <- unname(numberize(mixed_input))
  expect_identical(converted, c(1, 2, 3))
})

# Whitespace both around and inside the text: the literal below is padded at
# both ends and spans two source lines, so it contains an embedded newline
# between "hundred" and "and". The expected value is 130.
# NOTE(review): the description mentions a tab, but only spaces and a newline
# are visible in this view — confirm the literal still contains a tab.
test_that("text with whitespace at the start and end and tab and newline in between", {
res <- numberize(" one hundred
and thirty ", lang = "en")
# unname() drops any names on the result before the exact comparison.
expect_identical(unname(res), 130)
})

0 comments on commit abfc245

Please sign in to comment.