Bug fix for text with leading/trailing whitespace

- Now also handles mixed vectors, e.g. `c(1, "two", 3)`
- Update README.Rmd for more clarity
epiverse-trace · Feb 25, 2024 · abfc245 · abfc245
1 parent ea61ca2
commit abfc245
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 10 deletions.
diff --git a/R/numberize.R b/R/numberize.R
@@ -26,9 +26,13 @@ digits_from <- function(text, lang = "en") {
       "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
       "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
       "sixteen", "seventeen", "eighteen", "nineteen",
-      "twenty", "", "", "", "", "", "", "", "", "",
+      "twenty", "twenty one", "twenty two", "twenty three", "twenty four",
+      "twenty five", "twenty six", "twenty seven", "twenty eight",
+      "twenty nine",
       "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
-      "hundred", "", "", "", "", "", "", "", "",
+      "hundred", "two hundred", "three hundred", "four hundred",
+      "five hundred", "six hundred", "seven hundred", "eight hundred",
+      "nine hundred",
       "thousand", "million", "billion", "trillion"
     ),
     es = c(
@@ -56,7 +60,8 @@ digits_from <- function(text, lang = "en") {
   )
 
   # clean and prep
-  text <- tolower(text)
+  text <- tolower(text) # converts to string as a side effect
+  text <- trimws(text)
   text <- gsub("\\sand|-|,|\\bet\\b|\\sy\\s", " ", text) # all lang
 
   # TODO check the words are in the selected lang or return NA
@@ -73,6 +78,12 @@ digits_from <- function(text, lang = "en") {
     # lang=fr one word
     text <- gsub("quatre vingt", "quatre-vingt", text, fixed = TRUE)
   }
+  # TODO raise issue in {lintr} repo. False lint generated by next line.
+  # [nonportable_path_linter] Use file.path() to construct portable file paths.
+  only_digits <- grepl("^\\d+$", text) # nolint
+  if (only_digits) {
+    return(as.double(text))
+  }
 
   words <- strsplit(text, "\\s+")[[1]]
   digits <- numbers[match(words, numbers[[lang]]), "digit"]
@@ -146,6 +157,7 @@ number_from <- function(digits) {
 #'
 #' @param text Vector containing spelt numbers in a supported language.
 #' @param lang The text's language. Currently one of `"en" | "fr" | "es"`.
+#' Default is "en"
 #'
 #' @return A numeric value.
 #'

diff --git a/README.Rmd b/README.Rmd
@@ -29,7 +29,7 @@ knitr::opts_chunk$set(
 
 <!-- badges: end -->
 
-_{{ packagename }}_ is an R package to convert numbers written as English, French or Spanish words from `"zero"` to `"nine hundred and ninety nine trillion, nine hundred and ninety nine billion, nine hundred and ninety nine million, nine hundred and ninety nine thousand, nine hundred and ninety nine"` from a character string to a numeric value.
+_{{ packagename }}_ is an R package to convert integers written as English, French or Spanish words from `"zero"` to `"nine hundred and ninety nine trillion, nine hundred and ninety nine billion, nine hundred and ninety nine million, nine hundred and ninety nine thousand, nine hundred and ninety nine"` from a character string to a numeric value.
 
 <!-- This sentence is optional and can be removed -->
 _{{ packagename }}_ is developed at the [{{ department }}]({{ department_url }}) at the {{ institution }} as part of the [Epiverse-TRACE program](https://data.org/initiatives/epiverse/).

diff --git a/tests/testthat/test-numberize.R b/tests/testthat/test-numberize.R
@@ -67,18 +67,17 @@ test_df <- data.frame(
   )
 )
 
-test_that("translating English numbers works", {
+test_that("translating vector of English numbers works", {
   res <- numberize(test_df[["en"]])
-  # res <- sapply(test_df[["en"]], numberize)
   expect_identical(unname(res), test_df[["num"]])
 })
 
-test_that("translating French numbers works", {
+test_that("translating vector of French numbers works", {
   res <- numberize(test_df[["fr"]], lang = "fr")
   expect_identical(unname(res), test_df[["num"]])
 })
 
-test_that("translating Spanish numbers works", {
+test_that("translating vector of Spanish numbers works", {
   res <- numberize(test_df[["es"]], lang = "es")
   expect_identical(unname(res), test_df[["num"]])
 })
@@ -88,8 +87,25 @@ test_that("translating single french text works", {
   expect_identical(unname(res), 1515)
 })
 
-test_that("non digit word returns NA", {
+test_that("translating text with leading and trailing whitespace works", {
+  res <- numberize("  mille cinq  cent quinze
+    ", lang = "fr")
+  expect_identical(unname(res), 1515)
+})
+
+
+test_that("text with non digit word returns NA", {
   res <- numberize("epiverse", lang = "en")
   expect_true(is.na(res))
 })
-# TODO test edge cases in es and fr
+
+test_that("vector contains actual digits and spelt digits", {
+  res <- numberize(c(1, "two", 3))
+  expect_identical(unname(res), c(1, 2, 3))
+})
+
+test_that("text with whitespace at the start and end and tab and newline in between", {
+  res <- numberize("  one  hundred
+  and thirty   ", lang = "en")
+  expect_identical(unname(res), 130)
+})