fix MycroftAI#88: multiple decimal places

* extract_number(), extract_numbers(), and all helper functions gain a keyword parameter `decimal_places` (for helpers, just `places`) which does what it sounds like, using builtin round(). * avoid capturing non-adjacent numbers as decimal places * avoid capturing already-used decimal places as separate numbers in extract_numbers() * add a few tests for the above
ChanceNCounter · Mar 16, 2020 · a374d79 · a374d79
1 parent e6837eb
commit a374d79
Show file tree

Hide file tree

Showing 3 changed files with 89 additions and 35 deletions.
diff --git a/lingua_franca/lang/parse_en.py b/lingua_franca/lang/parse_en.py
@@ -77,7 +77,7 @@ def generate_plurals_en(originals):
 _STRING_LONG_ORDINAL_EN = invert_dict(_LONG_ORDINAL_EN)
 
 
-def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
+def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False, places=None):
     """
     Convert words in a string into their equivalent numbers.
     Args:
@@ -94,7 +94,8 @@ def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
     text = text.lower()
     tokens = tokenize(text)
     numbers_to_replace = \
-        _extract_numbers_with_text_en(tokens, short_scale, ordinals)
+        _extract_numbers_with_text_en(
+            tokens, short_scale, ordinals, places=places)
     numbers_to_replace.sort(key=lambda number: number.start_index)
 
     results = []
@@ -114,7 +115,8 @@ def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
 
 
 def _extract_numbers_with_text_en(tokens, short_scale=True,
-                                  ordinals=False, fractional_numbers=True):
+                                  ordinals=False, fractional_numbers=True,
+                                  places=None):
     """
     Extract all numbers from a list of Tokens, with the words that
     represent them.
@@ -138,7 +140,8 @@ def _extract_numbers_with_text_en(tokens, short_scale=True,
     while True:
         to_replace = \
             _extract_number_with_text_en(tokens, short_scale,
-                                         ordinals, fractional_numbers)
+                                         ordinals, fractional_numbers,
+                                         places=places)
 
         if not to_replace:
             break
@@ -156,7 +159,8 @@ def _extract_numbers_with_text_en(tokens, short_scale=True,
 
 
 def _extract_number_with_text_en(tokens, short_scale=True,
-                                 ordinals=False, fractional_numbers=True):
+                                 ordinals=False, fractional_numbers=True,
+                                 places=None):
     """
     This function extracts a number from a list of Tokens.
 
@@ -172,15 +176,17 @@ def _extract_number_with_text_en(tokens, short_scale=True,
     """
     number, tokens = \
         _extract_number_with_text_en_helper(tokens, short_scale,
-                                            ordinals, fractional_numbers)
+                                            ordinals, fractional_numbers,
+                                            places=places)
     while tokens and tokens[0].word in _ARTICLES_EN:
         tokens.pop(0)
     return ReplaceableNumber(number, tokens)
 
 
 def _extract_number_with_text_en_helper(tokens,
                                         short_scale=True, ordinals=False,
-                                        fractional_numbers=True):
+                                        fractional_numbers=True,
+                                        places=None):
     """
     Helper for _extract_number_with_text_en.
 
@@ -205,7 +211,8 @@ def _extract_number_with_text_en_helper(tokens,
             return fraction, fraction_text
 
         decimal, decimal_text = \
-            _extract_decimal_with_text_en(tokens, short_scale, ordinals)
+            _extract_decimal_with_text_en(
+                tokens, short_scale, ordinals, places=places)
         if decimal:
             return decimal, decimal_text
 
@@ -254,7 +261,7 @@ def _extract_fraction_with_text_en(tokens, short_scale, ordinals):
     return None, None
 
 
-def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
+def _extract_decimal_with_text_en(tokens, short_scale, ordinals, places=None):
     """
     Extract decimal numbers from a string.
 
@@ -271,6 +278,7 @@ def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
         tokens [Token]: The text to parse.
         short_scale boolean:
         ordinals boolean:
+        places [int]: Number of decimal places to return
 
     Returns:
         (float, [Token])
@@ -284,21 +292,46 @@ def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
         if len(partitions) == 3:
             numbers1 = \
                 _extract_numbers_with_text_en(partitions[0], short_scale,
-                                              ordinals, fractional_numbers=False)
+                                              ordinals, fractional_numbers=False,
+                                              places=places)
             numbers2 = \
                 _extract_numbers_with_text_en(partitions[2], short_scale,
-                                              ordinals, fractional_numbers=False)
-
+                                              ordinals, fractional_numbers=False,
+                                              places=places)
             if not numbers1 or not numbers2:
                 return None, None
 
+            token_idx = numbers2[0].tokens[0].index
+            idx = 1
+            stop = False
+            while idx < len(numbers2) and not stop:
+                if numbers2[idx].tokens[0].index != numbers2[idx-1].tokens[0].index + 1 or \
+                        numbers2[idx].value is None:
+                    stop = True
+                else:
+                    idx += 1
+            numbers2 = numbers2[:idx]
+
             number = numbers1[-1]
-            decimal = numbers2[0]
+            # decimal = numbers2[0]
 
             # TODO handle number dot number number number
-            if "." not in str(decimal.text):
-                return number.value + float('0.' + str(decimal.value)), \
-                    number.tokens + partitions[1] + decimal.tokens
+            if "." not in str(numbers2[0].text):
+                return_value = float('0.' + "".join([str(
+                    decimal.value) for decimal in numbers2]))
+                return_value = number.value + return_value
+                if return_value == int(return_value):
+                    return_value = int(return_value)
+
+                # out_part2 = partitions[2]
+                # for n in numbers2:
+                #     out_part2[n.index] = n.value
+
+                return_tokens = number.tokens + partitions[1]
+                for n in numbers2:
+                    return_tokens += n.tokens
+
+                return (round(return_value, places) if places else return_value), return_tokens
     return None, None
 
 
@@ -319,8 +352,8 @@ def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):
         The value parsed, and tokens that it corresponds to.
 
     """
-    multiplies, string_num_ordinal, string_num_scale = \
-        _initialize_number_data(short_scale)
+    multiplies, string_num_ordinal, string_num_scale = _initialize_number_data(
+        short_scale)
 
     number_words = []  # type: [Token]
     val = False
@@ -445,10 +478,10 @@ def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):
 
         else:
             if all([
-                prev_word in _SUMS,
-                word not in _SUMS,
-                word not in multiplies,
-                current_val >= 10]):
+                    prev_word in _SUMS,
+                    word not in _SUMS,
+                    word not in multiplies,
+                    current_val >= 10]):
                 # Backtrack - we've got numbers we can't sum.
                 number_words.pop()
                 val = prev_val
@@ -495,7 +528,7 @@ def _initialize_number_data(short_scale):
     return multiplies, string_num_ordinal_en, string_num_scale_en
 
 
-def extractnumber_en(text, short_scale=True, ordinals=False):
+def extractnumber_en(text, short_scale=True, ordinals=False, decimal_places=None):
     """
     This function extracts a number from a text string,
     handles pronunciations in long scale and short scale
@@ -506,13 +539,15 @@ def extractnumber_en(text, short_scale=True, ordinals=False):
         text (str): the string to normalize
         short_scale (bool): use short scale if True, long scale if False
         ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
+        decimal_places (int or False): rounds to # decimal places. uses builtin round()
     Returns:
         (int) or (float) or False: The extracted number or False if no number
                                    was found
 
     """
     return _extract_number_with_text_en(tokenize(text.lower()),
-                                        short_scale, ordinals).value
+                                        short_scale, ordinals,
+                                        places=decimal_places).value
 
 
 def extract_duration_en(text):
@@ -1411,7 +1446,7 @@ def isFractional_en(input_str, short_scale=True):
     return False
 
 
-def extract_numbers_en(text, short_scale=True, ordinals=False):
+def extract_numbers_en(text, short_scale=True, ordinals=False, decimal_places=None):
     """
         Takes in a string and extracts a list of numbers.
 
@@ -1422,11 +1457,12 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
             is now common in most English speaking countries.
             See https://en.wikipedia.org/wiki/Names_of_large_numbers
         ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
+        decimal_places (int or False): rounds to # decimal places. uses builtin round()
     Returns:
         list: list of extracted numbers as floats
     """
     results = _extract_numbers_with_text_en(tokenize(text),
-                                            short_scale, ordinals)
+                                            short_scale, ordinals, places=decimal_places)
     return [float(result.value) for result in results]
 
 

diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py
@@ -77,8 +77,12 @@ def match_one(query, choices):
     else:
         return best
 
+# TODO update these docstrings when decimal_places has been implemented
+#     in all parsers
 
-def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
+
+def extract_numbers(text, short_scale=True, ordinals=False, lang=None,
+                    decimal_places=False):
     """
         Takes in a string and extracts a list of numbers.
 
@@ -90,12 +94,14 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
             See https://en.wikipedia.org/wiki/Names_of_large_numbers
         ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
         lang (str): the BCP-47 code for the language to use, None uses default
+        decimal_places (int or False): rounds to # decimal places. Not yet implemented
+            in all languages. False performs no rounding. Uses builtin round()
     Returns:
         list: list of extracted numbers as floats, or empty list if none found
     """
     lang_code = get_primary_lang_code(lang)
     if lang_code == "en":
-        return extract_numbers_en(text, short_scale, ordinals)
+        return extract_numbers_en(text, short_scale, ordinals, decimal_places)
     elif lang_code == "de":
         return extract_numbers_de(text, short_scale, ordinals)
     elif lang_code == "fr":
@@ -112,7 +118,8 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
     return []
 
 
-def extract_number(text, short_scale=True, ordinals=False, lang=None):
+def extract_number(text, short_scale=True, ordinals=False, lang=None,
+                   decimal_places=False):
     """Takes in a string and extracts a number.
 
     Args:
@@ -123,14 +130,16 @@ def extract_number(text, short_scale=True, ordinals=False, lang=None):
             See https://en.wikipedia.org/wiki/Names_of_large_numbers
         ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
         lang (str): the BCP-47 code for the language to use, None uses default
+        decimal_places (int or False): rounds to # decimal places. Not yet implemented
+            in all languages. False performs no rounding. Uses builtin round()
     Returns:
         (int, float or False): The number extracted or False if the input
                                text contains no numbers
     """
     lang_code = get_primary_lang_code(lang)
     if lang_code == "en":
         return extractnumber_en(text, short_scale=short_scale,
-                                ordinals=ordinals)
+                                ordinals=ordinals, decimal_places=decimal_places)
     elif lang_code == "es":
         return extractnumber_es(text)
     elif lang_code == "pt":

diff --git a/test/test_parse.py b/test/test_parse.py
@@ -136,16 +136,18 @@ def test_extract_number(self):
         self.assertEqual(extract_number("twenty thousand"), 20000)
         self.assertEqual(extract_number("fifty million"), 50000000)
 
-        # This test fails due to 
+        # This test fails due to
         # self.assertEqual(extract_number("twenty billion three hundred million \
         #                                 nine hundred fifty thousand six hundred \
         #                                 seventy five point eight six"),
         #                                 20300950675.86)
 
-        # TODO handle this case
-        # self.assertEqual(
-        #    extract_number("6 dot six six six"),
-        #    6.666)
+        self.assertEqual(extract_number("6 dot six six six"), 6.666)
+        self.assertEqual(extract_number(
+            "6 dot six six six", decimal_places=2), round(6.666, 2))
+        self.assertEqual(extract_number(
+            "6 point seventy", decimal_places=2), 6.7)
+
         self.assertTrue(extract_number("The tennis player is fast") is False)
         self.assertTrue(extract_number("fraggle") is False)
 
@@ -726,6 +728,13 @@ def test_multiple_numbers(self):
         self.assertEqual(extract_numbers("this is a seven eight nine and a"
                                          " half test"),
                          [7.0, 8.0, 9.5])
+        self.assertEqual(extract_numbers("this is a six point five seven nine"
+                                         " bingo ten nancy forty six test"),
+                         [6.579, 10.0, 46.0])
+        self.assertEqual(extract_numbers("this is a six point five seven nine"
+                                         " bingo ten nancy forty six test"
+                                         " with decimal rounding", decimal_places=2),
+                         [round(6.579, 2), 10, 46])
 
     def test_contractions(self):
         self.assertEqual(normalize("ain't"), "is not")