# NOTE(review): this is a unified git diff whose line breaks appear to have been
# collapsed into spaces; restore the original line structure before trying to
# apply it.
#
# Summary of the change, as shown by the hunks below:
#   mathparse/mathparse.py
#     - moves `import re` below `from . import mathwords`;
#     - adds find_word_groups(string, words): joins `words` with '|' into a
#       regex and returns regex.findall(string), targeting groups such as
#       "3 thousand 6 hundred 2";
#     - replace_word_tokens: iterates over list(...keys()) copies of the
#       operator/number/scale tables, wraps every find_word_groups match in
#       parentheses, locates each scale with string.find() instead of
#       re.finditer(), inserts ' + ' (previously ' +') after a scaled number,
#       and finally rewrites ') (' to ') + ('.
#   tests/test_binary_operations.py
#     - adds test_division_words_large, expecting str(result) == '12.04'.
#   tests/test_replace_word_tokens.py
#     - updates expected strings to the doubled parentheses, e.g.
#       '((5 * 1000)) + 30' and '((50 * 1000)) + 1'.
#
# NOTE(review): find_word_groups interpolates `words` into the pattern without
# re.escape -- confirm the scale words never contain regex metacharacters.
# NOTE(review): end_index_characters is bound directly to
# mathwords.BINARY_OPERATORS and then .add('(') is called on it, so the
# module-level object looks like it is mutated on every call -- confirm
# whether a copy was intended.
diff --git a/mathparse/mathparse.py b/mathparse/mathparse.py index 2db24b3..d09cda9 100644 --- a/mathparse/mathparse.py +++ b/mathparse/mathparse.py @@ -3,8 +3,8 @@ """ from __future__ import division from decimal import Decimal -import re from . import mathwords +import re class PostfixTokenEvaluationException(Exception): """ @@ -75,6 +75,17 @@ def is_word(word, language): return word in words +def find_word_groups(string, words): + """ + Find matches for words in the format "3 thousand 6 hundred 2". + The words parameter should be the list of words to check for such as "hundred". + """ + scale_pattern = '|'.join(words) + # For example: (?:(?:\d+)\s+(?:hundred|thousand|million)*\s*)+(?:\d+|hundred|thousand|million)+ + regex = re.compile(r'(?:(?:\d+)\s+(?:' + scale_pattern + r')*\s*)+(?:\d+|' + scale_pattern + r')+') + result = regex.findall(string) + return result + def replace_word_tokens(string, language): """ Given a string and an ISO 639-2 language code, @@ -86,13 +97,13 @@ def replace_word_tokens(string, language): # Replace operator words with numeric operators operators = words['binary_operators'].copy() operators.update(words['unary_operators']) - for operator in operators: + for operator in list(operators.keys()): if operator in string: string = string.replace(operator, operators[operator]) # Replace number words with numeric values numbers = words['numbers'] - for number in numbers: + for number in list(numbers.keys()): if number in string: string = string.replace(number, str(numbers[number])) @@ -100,11 +111,16 @@ def replace_word_tokens(string, language): scales = words['scales'] end_index_characters = mathwords.BINARY_OPERATORS end_index_characters.add('(') - for scale in scales: - for _ in range(string.count(scale)): - matches = list(re.finditer(scale, string)) - start_index = matches[0].start() - 1 + word_matches = find_word_groups(string, list(scales.keys())) + + for match in word_matches: + string = string.replace(match, '(' + match + ')') + + for 
scale in list(scales.keys()): + for _ in range(0, string.count(scale)): + start_index = string.find(scale) - 1 + end_index = len(string) while is_int(string[start_index - 1]) and start_index > 0: start_index -= 1 @@ -112,12 +128,15 @@ def replace_word_tokens(string, language): end_index = string.find(' ', start_index) + 1 end_index = string.find(' ', end_index) + 1 - add = ' +' + add = ' + ' if string[end_index] in end_index_characters: add = '' string = string[:start_index] + '(' + string[start_index:] string = string.replace(scale, '* ' + str(scales[scale]) + ')' + add, 1) + + string = string.replace(') (', ') + (') + return string diff --git a/tests/test_binary_operations.py b/tests/test_binary_operations.py index 149dbb8..d88becd 100644 --- a/tests/test_binary_operations.py +++ b/tests/test_binary_operations.py @@ -64,6 +64,11 @@ def test_division_by_zero_words(self): self.assertEqual(result, 'undefined') + def test_division_words_large(self): + result = mathparse.parse('one thousand two hundred four divided by one hundred', language='ENG') + + self.assertEqual(str(result), '12.04') + class PositiveFloatTestCase(TestCase): diff --git a/tests/test_replace_word_tokens.py b/tests/test_replace_word_tokens.py index d5e93c2..d50c160 100644 --- a/tests/test_replace_word_tokens.py +++ b/tests/test_replace_word_tokens.py @@ -17,9 +17,11 @@ def test_thirty(self): def test_thousand(self): result = mathparse.replace_word_tokens('five thousand + 30', language='ENG') - self.assertEqual(result, '(5 * 1000) + 30') + # Note: this ends up with double parentheses because it is both a + # scaled number ("thousand") and a word group ("five thousand") + self.assertEqual(result, '((5 * 1000)) + 30') def test_double_digit_multiplier_for_scale(self): result = mathparse.replace_word_tokens('fifty thousand + 1', language='ENG') - self.assertEqual(result, '(50 * 1000) + 1') + self.assertEqual(result, '((50 * 1000)) + 1')