From ca437eb06a37650ae88f6cfaff858727202f3aad Mon Sep 17 00:00:00 2001 From: bahadzie Date: Mon, 13 May 2024 22:41:43 +0000 Subject: [PATCH] Detect text that can be interpreted ambiguously - Returns NA for all ambiguities - Include warning for ambiguities - Returns NA for R keywords like NA, NaN, Inf, NULL, TRUE, FALSE --- R/numberize.R | 111 ++++++++++++++++++++++++++------ tests/testthat/test-numberize.R | 32 +++++++++ 2 files changed, 122 insertions(+), 21 deletions(-) diff --git a/R/numberize.R b/R/numberize.R index 721892c..10e8c8d 100644 --- a/R/numberize.R +++ b/R/numberize.R @@ -8,12 +8,28 @@ #' @keywords internal # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ digits_from <- function(text, lang = "en") { + invalid_structure <- function(positions) { + valid_position <- c( + "units", "tens", "hundreds", "thousand", "million", "billion", "trillion" + ) + for (i in seq_along(valid_position)) { + index <- which(positions %in% valid_position[i]) + is_adjacent <- any(diff(index) == 1) + if (is_adjacent) { + return(is_adjacent) + } + } + FALSE + } + # data frame that maps numbers to words numbers <- data.frame( stringsAsFactors = FALSE, digit = c( 0:30, # because es is unique to 30 - seq(40, 90, by = 10), + seq(40, 70, by = 10), + 71:80, + 90:99, seq(100, 900, by = 100), 1000, 1E6, 1E9, 1E12 ), en = c( @@ -21,7 +37,10 @@ digits_from <- function(text, lang = "en") { "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "", "", "", "", "", "", "", "", "", - "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", + "thirty", "forty", "fifty", "sixty", + "seventy", "", "", "", "", "", "", "", "", "", + "eighty", + "ninety", "", "", "", "", "", "", "", "", "", "hundred", "", "", "", "", "", "", "", "", "thousand", "million", "billion", "trillion" ), @@ -30,9 +49,11 @@ digits_from <- function(text, lang = "en") { "nueve", "diez", "once", "doce", "trece", "catorce", "quince", "diecis\u00e9is", "diecisiete", "dieciocho", "diecinueve", "veinte", "veintiuno", "veintid\u00f3s", "veintitr\u00e9s", "veinticuatro", - "veinticinco", "veintis\u00e9is", "veintisiete", "veintiocho", - "veintinueve", "treinta", "cuarenta", "cincuenta", "sesenta", - "setenta", "ochenta", "noventa", + "veinticinco", "veintis\u00e9is", "veintisiete", "veintiocho", "veintinueve", # nolint + "treinta", "cuarenta", "cincuenta", "sesenta", + "setenta", "", "", "", "", "", "", "", "", "", + "ochenta", + "noventa", "", "", "", "", "", "", "", "", "", "ciento", "doscientos", "trescientos", "cuatrocientos", "quinientos", "seiscientos", "setecientos", "ochocientos", "novecientos", "mil", "mill\u00f3n", "mil-millones", "bill\u00f3n" @@ -42,13 +63,37 @@ digits_from <- function(text, lang = "en") { "huit", "neuf", "dix", "onze", "douze", "treize", "quatorze", "quinze", "seize", "dix sept", "dix huit", "dix neuf", "vingt", "", "", "", "", "", "", "", "", "", - "trente", "quarante", "cinquante", - "soixante", "soixante dix", "quatre-vingt", "quatre-vingt dix", + "trente", "quarante", "cinquante", "soixante", + "soixante-dix", "soixante-onze", "soixante-douze", "soixante-treize", + "soixante-quatorze", "soixante-quinze", "soixante-seize", + "soixante-dix-sept", "soixante-dix-huit", "soixante-dix-neuf", + "quatre-vingt", + "quatre-vingt-dix", "quatre-vingt-onze", "quatre-vingt-douze", "quatre-vingt-treize", # nolint + "quatre-vingt-quatorze", "quatre-vingt-quinze", "quatre-vingt-seize", + "quatre-vingt-dix-sept", "quatre-vingt-dix-huit", "quatre-vingt-dix-neuf", "cent", "", "", "", "", "", "", "", "", "mille", "million", "milliard", "billion" + ), + position = c( + rep("units", 10), + rep("tens", 45), + rep("hundreds", 9), + "thousand", "million", "billion", "trillion" + ), + positional_digit = c( + 0:9, # units + rep(1, 10), # tens (10-19) + rep(2, 10), # tens (20-29) + 3:6, # tens (30-60) + rep(7, 10), # tens (70-79) + 8, # tens (80) + rep(9, 10), # tens (90-99) + 1:9, # hundreds (100-900) + rep(1, 4) # thousand, million, billion, trillion ) ) + original_text <- text # to report warning if necessary # clean and prep text <- tolower(text) # converts to string as a side effect text <- trimws(text) @@ -62,15 +107,31 @@ digits_from <- function(text, lang = "en") { text <- gsub("\\sun\\s", " uno ", text) } if (lang == "fr") { - # lang=fr plural-> singular + # plural to singular text <- gsub("(cent|mille|million|milliard|billion)s\\b", "\\1", text) - # lang=fr one word - text <- gsub("quatre vingt", "quatre-vingt", text, fixed = TRUE) + # handle 70-79 + text <- gsub( + "soixante (dix|onze|douze|treize|quatorze|quinze|seize)", + "soixante-\\1", text + ) + text <- gsub("soixante-dix (sept|huit|neuf)", "soixante-dix-\\1", text) + # handle 90-99 + text <- gsub( + "quatre vingt (dix|onze|douze|treize|quatorze|quinze|seize)", + "quatre-vingt-\\1", text + ) + text <- gsub("quatre-vingt (sept|huit|neuf)", "quatre-vingt-\\1", text) } words <- strsplit(text, "\\s+")[[1]] - digits <- numbers[match(words, numbers[[lang]]), "digit"] - digits + positions <- numbers[match(words, numbers[[lang]]), "position"] + if (invalid_structure(positions)) { + warning( + cat("[", original_text, "] can be interpreted in different ways.\n") + ) + return(NA) + } + numbers[match(words, numbers[[lang]]), "digit"] } # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -107,7 +168,6 @@ number_from <- function(digits) { summed + total } -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #' Internal function used in the numberize() call for vectors. #' #' @param text Character string in a supported language. @@ -118,10 +178,12 @@ number_from <- function(digits) { #' #' @keywords internal #' -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .numberize <- function(text, lang = c("en", "fr", "es")) { - # return NA if the input is NA - if (is.na(text)) { + text <- toString(text) + if ( + trimws(text) %in% + c("NA", "TRUE", "FALSE", "nan", "Inf", "") || # check other R keywords + length(text) == 0) { # check for NULL return(NA) } @@ -140,15 +202,19 @@ number_from <- function(digits) { } } -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #' Convert a vector string of spelled numbers in a supported language to #' its numeric equivalent. #' -#' @param text Vector containing spelled numbers in a supported language. -#' @param lang The text's language. Currently one of `"en" | "fr" | "es"`. +#' The range of words supported is between \strong{zero} and +#' \strong{nine hundred and ninety nine trillion, nine hundred and} +#' \strong{ninety nine billion, nine hundred and ninety nine million, nine} +#' \strong{hundred and ninety nine thousand, nine hundred and ninety nine} +#' +#' @param text String vector of spelled numbers in a supported language. +#' @param lang The text's language. Currently one of `c("en", "fr", "es")`. #' Default is "en" #' -#' @return A vector of numeric values. +#' @return A vector of positive integers. #' #' @examples #' # convert to numbers a scalar @@ -158,9 +224,12 @@ number_from <- function(digits) { #' numberize(c("dix", "soixante-cinq", "deux mille vingt-quatre"), lang = "fr") #' #' @export -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ numberize <- function(text, lang = c("en", "fr", "es")) { + lang <- tolower(lang) lang <- match.arg(lang) + if (is.null(text)) { + return(NA) + } vapply( text, .numberize, diff --git a/tests/testthat/test-numberize.R b/tests/testthat/test-numberize.R index dcf525d..100ab34 100644 --- a/tests/testthat/test-numberize.R +++ b/tests/testthat/test-numberize.R @@ -104,3 +104,35 @@ test_that("text with leading and trailing whitespace works", { ", lang = "fr") expect_identical(res, 1515) }) + +test_that("text with ambigious number conversion returns NA", { + res <- numberize("twenty twenty four", lang = "fr") + expect_true(is.na(res)) +}) + +test_that("NA to return NA", { + res <- numberize(NA, lang = "fr") + expect_true(is.na(res)) +}) + +test_that("NaN to return NA", { + res <- numberize(NaN, lang = "fr") + expect_true(is.na(res)) +}) + +test_that("TRUE to return NA", { + res <- numberize(TRUE, lang = "fr") + expect_true(is.na(res)) +}) +test_that("FALSE to return NA", { + res <- numberize(FALSE, lang = "fr") + expect_true(is.na(res)) +}) +test_that("NULL to return NA", { + res <- numberize(NULL, lang = "fr") + expect_true(is.na(res)) +}) +test_that("Inf to return NA", { + res <- numberize(Inf, lang = "fr") + expect_true(is.na(res)) +})