Skip to content

Commit

Permalink
fix MycroftAI#88: multiple decimal places
Browse files Browse the repository at this point in the history
 * extract_number(), extract_numbers(), and all
   helper functions gain a keyword parameter
   `decimal_places` (for helpers, just `places`)
   which rounds the extracted value to that many
   decimal places, using builtin round().

 * avoid capturing non-adjacent numbers as decimal
   places

 * avoid capturing already-used decimal places as
   separate numbers in extract_numbers()

 * add a few tests for the above
  • Loading branch information
ChanceNCounter committed Mar 16, 2020
1 parent e6837eb commit a374d79
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 35 deletions.
88 changes: 62 additions & 26 deletions lingua_franca/lang/parse_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def generate_plurals_en(originals):
_STRING_LONG_ORDINAL_EN = invert_dict(_LONG_ORDINAL_EN)


def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False, places=None):
"""
Convert words in a string into their equivalent numbers.
Args:
Expand All @@ -94,7 +94,8 @@ def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
text = text.lower()
tokens = tokenize(text)
numbers_to_replace = \
_extract_numbers_with_text_en(tokens, short_scale, ordinals)
_extract_numbers_with_text_en(
tokens, short_scale, ordinals, places=places)
numbers_to_replace.sort(key=lambda number: number.start_index)

results = []
Expand All @@ -114,7 +115,8 @@ def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):


def _extract_numbers_with_text_en(tokens, short_scale=True,
ordinals=False, fractional_numbers=True):
ordinals=False, fractional_numbers=True,
places=None):
"""
Extract all numbers from a list of Tokens, with the words that
represent them.
Expand All @@ -138,7 +140,8 @@ def _extract_numbers_with_text_en(tokens, short_scale=True,
while True:
to_replace = \
_extract_number_with_text_en(tokens, short_scale,
ordinals, fractional_numbers)
ordinals, fractional_numbers,
places=places)

if not to_replace:
break
Expand All @@ -156,7 +159,8 @@ def _extract_numbers_with_text_en(tokens, short_scale=True,


def _extract_number_with_text_en(tokens, short_scale=True,
ordinals=False, fractional_numbers=True):
ordinals=False, fractional_numbers=True,
places=None):
"""
This function extracts a number from a list of Tokens.
Expand All @@ -172,15 +176,17 @@ def _extract_number_with_text_en(tokens, short_scale=True,
"""
number, tokens = \
_extract_number_with_text_en_helper(tokens, short_scale,
ordinals, fractional_numbers)
ordinals, fractional_numbers,
places=places)
while tokens and tokens[0].word in _ARTICLES_EN:
tokens.pop(0)
return ReplaceableNumber(number, tokens)


def _extract_number_with_text_en_helper(tokens,
short_scale=True, ordinals=False,
fractional_numbers=True):
fractional_numbers=True,
places=None):
"""
Helper for _extract_number_with_text_en.
Expand All @@ -205,7 +211,8 @@ def _extract_number_with_text_en_helper(tokens,
return fraction, fraction_text

decimal, decimal_text = \
_extract_decimal_with_text_en(tokens, short_scale, ordinals)
_extract_decimal_with_text_en(
tokens, short_scale, ordinals, places=places)
if decimal:
return decimal, decimal_text

Expand Down Expand Up @@ -254,7 +261,7 @@ def _extract_fraction_with_text_en(tokens, short_scale, ordinals):
return None, None


def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
def _extract_decimal_with_text_en(tokens, short_scale, ordinals, places=None):
"""
Extract decimal numbers from a string.
Expand All @@ -271,6 +278,7 @@ def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
tokens [Token]: The text to parse.
short_scale boolean:
ordinals boolean:
places [int]: Number of decimal places to return
Returns:
(float, [Token])
Expand All @@ -284,21 +292,46 @@ def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
if len(partitions) == 3:
numbers1 = \
_extract_numbers_with_text_en(partitions[0], short_scale,
ordinals, fractional_numbers=False)
ordinals, fractional_numbers=False,
places=places)
numbers2 = \
_extract_numbers_with_text_en(partitions[2], short_scale,
ordinals, fractional_numbers=False)

ordinals, fractional_numbers=False,
places=places)
if not numbers1 or not numbers2:
return None, None

token_idx = numbers2[0].tokens[0].index
idx = 1
stop = False
while idx < len(numbers2) and not stop:
if numbers2[idx].tokens[0].index != numbers2[idx-1].tokens[0].index + 1 or \
numbers2[idx].value is None:
stop = True
else:
idx += 1
numbers2 = numbers2[:idx]

number = numbers1[-1]
decimal = numbers2[0]
# decimal = numbers2[0]

# TODO handle number dot number number number
if "." not in str(decimal.text):
return number.value + float('0.' + str(decimal.value)), \
number.tokens + partitions[1] + decimal.tokens
if "." not in str(numbers2[0].text):
return_value = float('0.' + "".join([str(
decimal.value) for decimal in numbers2]))
return_value = number.value + return_value
if return_value == int(return_value):
return_value = int(return_value)

# out_part2 = partitions[2]
# for n in numbers2:
# out_part2[n.index] = n.value

return_tokens = number.tokens + partitions[1]
for n in numbers2:
return_tokens += n.tokens

return (round(return_value, places) if places else return_value), return_tokens
return None, None


Expand All @@ -319,8 +352,8 @@ def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):
The value parsed, and tokens that it corresponds to.
"""
multiplies, string_num_ordinal, string_num_scale = \
_initialize_number_data(short_scale)
multiplies, string_num_ordinal, string_num_scale = _initialize_number_data(
short_scale)

number_words = [] # type: [Token]
val = False
Expand Down Expand Up @@ -445,10 +478,10 @@ def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):

else:
if all([
prev_word in _SUMS,
word not in _SUMS,
word not in multiplies,
current_val >= 10]):
prev_word in _SUMS,
word not in _SUMS,
word not in multiplies,
current_val >= 10]):
# Backtrack - we've got numbers we can't sum.
number_words.pop()
val = prev_val
Expand Down Expand Up @@ -495,7 +528,7 @@ def _initialize_number_data(short_scale):
return multiplies, string_num_ordinal_en, string_num_scale_en


def extractnumber_en(text, short_scale=True, ordinals=False):
def extractnumber_en(text, short_scale=True, ordinals=False, decimal_places=None):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
Expand All @@ -506,13 +539,15 @@ def extractnumber_en(text, short_scale=True, ordinals=False):
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal_places (int, None or False): number of decimal places to round to,
using builtin round(). None or False performs no rounding.
Returns:
(int) or (float) or False: The extracted number or False if no number
was found
"""
return _extract_number_with_text_en(tokenize(text.lower()),
short_scale, ordinals).value
short_scale, ordinals,
places=decimal_places).value


def extract_duration_en(text):
Expand Down Expand Up @@ -1411,7 +1446,7 @@ def isFractional_en(input_str, short_scale=True):
return False


def extract_numbers_en(text, short_scale=True, ordinals=False):
def extract_numbers_en(text, short_scale=True, ordinals=False, decimal_places=None):
"""
Takes in a string and extracts a list of numbers.
Expand All @@ -1422,11 +1457,12 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal_places (int, None or False): number of decimal places to round to,
using builtin round(). None or False performs no rounding.
Returns:
list: list of extracted numbers as floats
"""
results = _extract_numbers_with_text_en(tokenize(text),
short_scale, ordinals)
short_scale, ordinals, places=decimal_places)
return [float(result.value) for result in results]


Expand Down
17 changes: 13 additions & 4 deletions lingua_franca/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ def match_one(query, choices):
else:
return best

# TODO update these docstrings when decimal_places has been implemented
# in all parsers

def extract_numbers(text, short_scale=True, ordinals=False, lang=None):

def extract_numbers(text, short_scale=True, ordinals=False, lang=None,
decimal_places=False):
"""
Takes in a string and extracts a list of numbers.
Expand All @@ -90,12 +94,14 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
lang (str): the BCP-47 code for the language to use, None uses default
decimal_places (int or False): rounds to # decimal places. Not yet implemented
in all languages. False performs no rounding. Uses builtin round()
Returns:
list: list of extracted numbers as floats, or empty list if none found
"""
lang_code = get_primary_lang_code(lang)
if lang_code == "en":
return extract_numbers_en(text, short_scale, ordinals)
return extract_numbers_en(text, short_scale, ordinals, decimal_places)
elif lang_code == "de":
return extract_numbers_de(text, short_scale, ordinals)
elif lang_code == "fr":
Expand All @@ -112,7 +118,8 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
return []


def extract_number(text, short_scale=True, ordinals=False, lang=None):
def extract_number(text, short_scale=True, ordinals=False, lang=None,
decimal_places=False):
"""Takes in a string and extracts a number.
Args:
Expand All @@ -123,14 +130,16 @@ def extract_number(text, short_scale=True, ordinals=False, lang=None):
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
lang (str): the BCP-47 code for the language to use, None uses default
decimal_places (int or False): rounds to # decimal places. Not yet implemented
in all languages. False performs no rounding. Uses builtin round()
Returns:
(int, float or False): The number extracted or False if the input
text contains no numbers
"""
lang_code = get_primary_lang_code(lang)
if lang_code == "en":
return extractnumber_en(text, short_scale=short_scale,
ordinals=ordinals)
ordinals=ordinals, decimal_places=decimal_places)
elif lang_code == "es":
return extractnumber_es(text)
elif lang_code == "pt":
Expand Down
19 changes: 14 additions & 5 deletions test/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,16 +136,18 @@ def test_extract_number(self):
self.assertEqual(extract_number("twenty thousand"), 20000)
self.assertEqual(extract_number("fifty million"), 50000000)

# This test fails due to
# This test fails due to
# self.assertEqual(extract_number("twenty billion three hundred million \
# nine hundred fifty thousand six hundred \
# seventy five point eight six"),
# 20300950675.86)

# TODO handle this case
# self.assertEqual(
# extract_number("6 dot six six six"),
# 6.666)
self.assertEqual(extract_number("6 dot six six six"), 6.666)
self.assertEqual(extract_number(
"6 dot six six six", decimal_places=2), round(6.666, 2))
self.assertEqual(extract_number(
"6 point seventy", decimal_places=2), 6.7)

self.assertTrue(extract_number("The tennis player is fast") is False)
self.assertTrue(extract_number("fraggle") is False)

Expand Down Expand Up @@ -726,6 +728,13 @@ def test_multiple_numbers(self):
self.assertEqual(extract_numbers("this is a seven eight nine and a"
" half test"),
[7.0, 8.0, 9.5])
self.assertEqual(extract_numbers("this is a six point five seven nine"
" bingo ten nancy forty six test"),
[6.579, 10.0, 46.0])
self.assertEqual(extract_numbers("this is a six point five seven nine"
" bingo ten nancy forty six test"
" with decimal rounding", decimal_places=2),
[round(6.579, 2), 10, 46])

def test_contractions(self):
self.assertEqual(normalize("ain't"), "is not")
Expand Down

0 comments on commit a374d79

Please sign in to comment.