From 10edbf0d495a7f8cd7acd35a932224775639b5b0 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Mar 2024 08:52:51 +0900 Subject: [PATCH] Fix another two corner cases --- .../values/corpus/trainingdata1.tei.xml | 1 + .../grobid/core/utilities/WordsToNumber.java | 4 +-- .../core/utilities/WordsToNumberTest.kt | 26 +++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/resources/dataset/values/corpus/trainingdata1.tei.xml b/resources/dataset/values/corpus/trainingdata1.tei.xml index 3cdfbfe4..e2deda33 100644 --- a/resources/dataset/values/corpus/trainingdata1.tei.xml +++ b/resources/dataset/values/corpus/trainingdata1.tei.xml @@ -11,5 +11,6 @@ 10-1 10 + Six, 12 diff --git a/src/main/java/org/grobid/core/utilities/WordsToNumber.java b/src/main/java/org/grobid/core/utilities/WordsToNumber.java index 71193b46..2c8f9285 100644 --- a/src/main/java/org/grobid/core/utilities/WordsToNumber.java +++ b/src/main/java/org/grobid/core/utilities/WordsToNumber.java @@ -33,9 +33,9 @@ public class WordsToNumber { private final String VALUES_PATH = "lexicon/en/values.json"; - private final Pattern NUMERIC_PATTERN = Pattern.compile("[0-9.,]+", Pattern.CASE_INSENSITIVE); + private final Pattern NUMERIC_PATTERN = Pattern.compile("\\b(?:\\d+(?:[.,]\\d+)*|\\d+[.,]?\\d*\\b)\\b", Pattern.CASE_INSENSITIVE); private final Pattern OUT_OF_PATTERN_NUMBERS = Pattern.compile("([0-9.,]+)( out)? of (the )?([0-9.,]+)", Pattern.CASE_INSENSITIVE); - private final Pattern OUT_OF_PATTERN_ALPHABETIC = Pattern.compile("([A-Za-z ]+) out of (the )?([A-Za-z]+)", Pattern.CASE_INSENSITIVE); + private final Pattern OUT_OF_PATTERN_ALPHABETIC = Pattern.compile("([A-Za-z ]+) out of ([a-z]+ )?([A-Za-z]+)", Pattern.CASE_INSENSITIVE); private static List bases = null; private static List tens = null; diff --git a/src/test/kotlin/org/grobid/core/utilities/WordsToNumberTest.kt b/src/test/kotlin/org/grobid/core/utilities/WordsToNumberTest.kt index ced664c2..96420fd2 100644 --- a/src/test/kotlin/org/grobid/core/utilities/WordsToNumberTest.kt +++ b/src/test/kotlin/org/grobid/core/utilities/WordsToNumberTest.kt @@ -164,6 +164,22 @@ class WordsToNumberTest { MatcherAssert.assertThat(number, Is.`is`(BigDecimal("0.75"))) } + @Test + @Throws(Exception::class) + fun testConvertFractions6_1() { + val input = "three out of these four" + val number = target.normalize(input, Locale.ENGLISH) + MatcherAssert.assertThat(number, Is.`is`(BigDecimal("0.75"))) + } + + @Test + @Throws(Exception::class) + fun testConvertFractions6_2() { + val input = "three out of that four" + val number = target.normalize(input, Locale.ENGLISH) + MatcherAssert.assertThat(number, Is.`is`(BigDecimal("0.75"))) + } + @Test @Throws(Exception::class) fun testConvertFractions4Numeric() { @@ -186,4 +202,14 @@ class WordsToNumberTest { val input = "a temperature of 20" target.normalize(input, Locale.ENGLISH) } + + + @Throws(Exception::class) + fun testErrorCase_1() { + val input = "six, 12" + val output = target.normalize(input, Locale.ENGLISH) + + + } + } \ No newline at end of file