Skip to content

Commit

Permalink
Detect text that can be interpreted ambiguously
Browse files Browse the repository at this point in the history
- Returns NA for all ambiguities
- Include warning for ambiguities
- Returns NA for R keywords like NA, NaN, Inf, NULL, TRUE, FALSE
  • Loading branch information
bahadzie committed May 13, 2024
1 parent c2baf85 commit ca437eb
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 21 deletions.
111 changes: 90 additions & 21 deletions R/numberize.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,39 @@
#' @keywords internal
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
digits_from <- function(text, lang = "en") {
invalid_structure <- function(positions) {
valid_position <- c(
"units", "tens", "hundreds", "thousand", "million", "billion", "trillion"
)
for (i in seq_along(valid_position)) {
index <- which(positions %in% valid_position[i])
is_adjacent <- any(diff(index) == 1)
if (is_adjacent) {
return(is_adjacent)
}
}
FALSE
}

# data frame that maps numbers to words
numbers <- data.frame(
stringsAsFactors = FALSE,
digit = c(
0:30, # because es is unique to 30
seq(40, 90, by = 10),
seq(40, 70, by = 10),
71:80,
90:99,
seq(100, 900, by = 100), 1000, 1E6, 1E9, 1E12
),
en = c(
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
"twenty", "", "", "", "", "", "", "", "", "",
"thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
"thirty", "forty", "fifty", "sixty",
"seventy", "", "", "", "", "", "", "", "", "",
"eighty",
"ninety", "", "", "", "", "", "", "", "", "",
"hundred", "", "", "", "", "", "", "", "",
"thousand", "million", "billion", "trillion"
),
Expand All @@ -30,9 +49,11 @@ digits_from <- function(text, lang = "en") {
"nueve", "diez", "once", "doce", "trece", "catorce", "quince",
"diecis\u00e9is", "diecisiete", "dieciocho", "diecinueve", "veinte",
"veintiuno", "veintid\u00f3s", "veintitr\u00e9s", "veinticuatro",
"veinticinco", "veintis\u00e9is", "veintisiete", "veintiocho",
"veintinueve", "treinta", "cuarenta", "cincuenta", "sesenta",
"setenta", "ochenta", "noventa",
"veinticinco", "veintis\u00e9is", "veintisiete", "veintiocho", "veintinueve", # nolint
"treinta", "cuarenta", "cincuenta", "sesenta",
"setenta", "", "", "", "", "", "", "", "", "",
"ochenta",
"noventa", "", "", "", "", "", "", "", "", "",
"ciento", "doscientos", "trescientos", "cuatrocientos", "quinientos",
"seiscientos", "setecientos", "ochocientos", "novecientos",
"mil", "mill\u00f3n", "mil-millones", "bill\u00f3n"
Expand All @@ -42,13 +63,37 @@ digits_from <- function(text, lang = "en") {
"huit", "neuf", "dix", "onze", "douze", "treize", "quatorze",
"quinze", "seize", "dix sept", "dix huit", "dix neuf",
"vingt", "", "", "", "", "", "", "", "", "",
"trente", "quarante", "cinquante",
"soixante", "soixante dix", "quatre-vingt", "quatre-vingt dix",
"trente", "quarante", "cinquante", "soixante",
"soixante-dix", "soixante-onze", "soixante-douze", "soixante-treize",
"soixante-quatorze", "soixante-quinze", "soixante-seize",
"soixante-dix-sept", "soixante-dix-huit", "soixante-dix-neuf",
"quatre-vingt",
"quatre-vingt-dix", "quatre-vingt-onze", "quatre-vingt-douze", "quatre-vingt-treize", # nolint
"quatre-vingt-quatorze", "quatre-vingt-quinze", "quatre-vingt-seize",
"quatre-vingt-dix-sept", "quatre-vingt-dix-huit", "quatre-vingt-dix-neuf",
"cent", "", "", "", "", "", "", "", "",
"mille", "million", "milliard", "billion"
),
position = c(
rep("units", 10),
rep("tens", 45),
rep("hundreds", 9),
"thousand", "million", "billion", "trillion"
),
positional_digit = c(
0:9, # units
rep(1, 10), # tens (10-19)
rep(2, 10), # tens (20-29)
3:6, # tens (30-60)
rep(7, 10), # tens (70-79)
8, # tens (80)
rep(9, 10), # tens (90-99)
1:9, # hundreds (100-900)
rep(1, 4) # thousand, million, billion, trillion
)
)

original_text <- text # to report warning if necessary
# clean and prep
text <- tolower(text) # converts to string as a side effect
text <- trimws(text)
Expand All @@ -62,15 +107,31 @@ digits_from <- function(text, lang = "en") {
text <- gsub("\\sun\\s", " uno ", text)
}
if (lang == "fr") {
# lang=fr plural-> singular
# plural to singular
text <- gsub("(cent|mille|million|milliard|billion)s\\b", "\\1", text)
# lang=fr one word
text <- gsub("quatre vingt", "quatre-vingt", text, fixed = TRUE)
# handle 70-79
text <- gsub(
"soixante (dix|onze|douze|treize|quatorze|quinze|seize)",
"soixante-\\1", text
)
text <- gsub("soixante-dix (sept|huit|neuf)", "soixante-dix-\\1", text)
# handle 90-99
text <- gsub(
"quatre vingt (dix|onze|douze|treize|quatorze|quinze|seize)",
"quatre-vingt-\\1", text
)
text <- gsub("quatre-vingt (sept|huit|neuf)", "quatre-vingt-\\1", text)
}

words <- strsplit(text, "\\s+")[[1]]
digits <- numbers[match(words, numbers[[lang]]), "digit"]
digits
positions <- numbers[match(words, numbers[[lang]]), "position"]
if (invalid_structure(positions)) {
warning(
cat("[", original_text, "] can be interpreted in different ways.\n")
)
return(NA)
}
numbers[match(words, numbers[[lang]]), "digit"]
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -107,7 +168,6 @@ number_from <- function(digits) {
summed + total
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' Internal function used in the numberize() call for vectors.
#'
#' @param text Character string in a supported language.
Expand All @@ -118,10 +178,12 @@ number_from <- function(digits) {
#'
#' @keywords internal
#'
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.numberize <- function(text, lang = c("en", "fr", "es")) {
# return NA if the input is NA
if (is.na(text)) {
text <- toString(text)
if (
trimws(text) %in%
c("NA", "TRUE", "FALSE", "nan", "Inf", "") || # check other R keywords
length(text) == 0) { # check for NULL
return(NA)
}

Expand All @@ -140,15 +202,19 @@ number_from <- function(digits) {
}
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' Convert a vector string of spelled numbers in a supported language to
#' its numeric equivalent.
#'
#' @param text Vector containing spelled numbers in a supported language.
#' @param lang The text's language. Currently one of `"en" | "fr" | "es"`.
#' The range of words supported is between \strong{zero} and
#' \strong{nine hundred and ninety nine trillion, nine hundred and}
#' \strong{ninety nine billion, nine hundred and ninety nine million, nine}
#' \strong{hundred and ninety nine thousand, nine hundred and ninety nine}
#'
#' @param text String vector of spelled numbers in a supported language.
#' @param lang The text's language. Currently one of `c("en", "fr", "es")`.
#' Default is "en"
#'
#' @return A vector of numeric values.
#' @return A vector of positive integers.
#'
#' @examples
#' # convert to numbers a scalar
Expand All @@ -158,9 +224,12 @@ number_from <- function(digits) {
#' numberize(c("dix", "soixante-cinq", "deux mille vingt-quatre"), lang = "fr")
#'
#' @export
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
numberize <- function(text, lang = c("en", "fr", "es")) {
lang <- tolower(lang)
lang <- match.arg(lang)
if (is.null(text)) {
return(NA)
}
vapply(
text,
.numberize,
Expand Down
32 changes: 32 additions & 0 deletions tests/testthat/test-numberize.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,35 @@ test_that("text with leading and trailing whitespace works", {
", lang = "fr")
expect_identical(res, 1515)
})

test_that("text with ambigious number conversion returns NA", {
res <- numberize("twenty twenty four", lang = "fr")
expect_true(is.na(res))
})

test_that("NA to return NA", {
res <- numberize(NA, lang = "fr")
expect_true(is.na(res))
})

test_that("NaN to return NA", {
res <- numberize(NaN, lang = "fr")
expect_true(is.na(res))
})

test_that("TRUE to return NA", {
res <- numberize(TRUE, lang = "fr")
expect_true(is.na(res))
})
test_that("FALSE to return NA", {
res <- numberize(FALSE, lang = "fr")
expect_true(is.na(res))
})
test_that("NULL to return NA", {
res <- numberize(NULL, lang = "fr")
expect_true(is.na(res))
})
test_that("Inf to return NA", {
res <- numberize(Inf, lang = "fr")
expect_true(is.na(res))
})

0 comments on commit ca437eb

Please sign in to comment.