From e18ddcb73cbeee8f62b25e7a36144022ae5ac225 Mon Sep 17 00:00:00 2001 From: jarbasai Date: Sun, 9 May 2021 13:20:35 +0100 Subject: [PATCH] support decimal markers (#20) rebase of https://github.com/MycroftAI/lingua-franca/pull/69 Co-authored-by: jarbasal --- lingua_franca/lang/parse_cs.py | 31 +++++++++++------ lingua_franca/lang/parse_da.py | 48 +++++++++++++++++---------- lingua_franca/lang/parse_de.py | 59 +++++++++++++++++++++------------ lingua_franca/lang/parse_en.py | 15 ++++++++- lingua_franca/lang/parse_es.py | 44 ++++++++++++++++-------- lingua_franca/lang/parse_fa.py | 32 ++++++++++++------ lingua_franca/lang/parse_fr.py | 47 ++++++++++++++++++-------- lingua_franca/lang/parse_it.py | 32 ++++++++++++------ lingua_franca/lang/parse_nl.py | 37 ++++++++++++++------- lingua_franca/lang/parse_pl.py | 36 ++++++++++++++------ lingua_franca/lang/parse_pt.py | 22 +++++++++--- lingua_franca/lang/parse_sv.py | 11 +++++- lingua_franca/parse.py | 19 +++++++++++ test/unittests/test_parse_en.py | 11 ++++++ 14 files changed, 322 insertions(+), 122 deletions(-) diff --git a/lingua_franca/lang/parse_cs.py b/lingua_franca/lang/parse_cs.py index e0144b02..2de89c75 100644 --- a/lingua_franca/lang/parse_cs.py +++ b/lingua_franca/lang/parse_cs.py @@ -23,7 +23,7 @@ _LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \ _FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \ _ORDINAL_BASE_CS # _ARTICLES_CS - +from lingua_franca.parse import normalize_decimals import re import json from lingua_franca import resolve_resource_file @@ -579,7 +579,7 @@ def _initialize_number_data(short_scale): return multiplies, string_num_ordinal_cs, string_num_scale_cs -def extract_number_cs(text, short_scale=True, ordinals=False): +def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_cs(tokenize(text.lower()), short_scale, ordinals).value @@ -1560,20 +1566,25 @@ def isFractional_cs(input_str, short_scale=True): return False -def extract_numbers_cs(text, short_scale=True, ordinals=False): +def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_cs(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_da.py b/lingua_franca/lang/parse_da.py index 14b18132..377f3ec0 100644 --- a/lingua_franca/lang/parse_da.py +++ b/lingua_franca/lang/parse_da.py @@ -20,22 +20,31 @@ from lingua_franca.lang.common_data_da import _DA_NUMBERS from lingua_franca.lang.format_da import pronounce_number_da from lingua_franca.time import now_local +from lingua_franca.parse import normalize_decimals -def extract_number_da(text, short_scale=True, ordinals=False): +def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -869,20 +878,25 @@ def normalize_da(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_da(text, short_scale=True, ordinals=False): +def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_da, extract_number_da, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_de.py b/lingua_franca/lang/parse_de.py index 95fda48e..b2409960 100644 --- a/lingua_franca/lang/parse_de.py +++ b/lingua_franca/lang/parse_de.py @@ -21,6 +21,7 @@ from lingua_franca.lang.common_data_de import _DE_NUMBERS from lingua_franca.lang.format_de import pronounce_number_de from lingua_franca.time import now_local +from lingua_franca.parse import normalize_decimals de_numbers = { @@ -143,20 +144,28 @@ def repl(match): return (duration, text) -def extract_number_de(text, short_scale=True, ordinals=False): +def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -1003,20 +1012,28 @@ def normalize_de(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_de(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats +def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'): """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' + Returns: + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_de, extract_number_de, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_en.py b/lingua_franca/lang/parse_en.py index 1c4887f1..39028e76 100644 --- a/lingua_franca/lang/parse_en.py +++ b/lingua_franca/lang/parse_en.py @@ -30,6 +30,7 @@ from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer from lingua_franca.time import now_local +from lingua_franca.parse import normalize_decimals def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False): @@ -765,11 +766,17 @@ def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_en(tokenize(text.lower()), short_scale, ordinals).value @@ -1880,7 +1887,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True): return False -def extract_numbers_en(text, short_scale=True, ordinals=False): +def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -1891,9 +1898,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_en(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_es.py b/lingua_franca/lang/parse_es.py index 0a810cc4..be19730f 100644 --- a/lingua_franca/lang/parse_es.py +++ b/lingua_franca/lang/parse_es.py @@ -20,6 +20,7 @@ from lingua_franca.lang.format_es import pronounce_number_es from lingua_franca.lang.parse_common import * from lingua_franca.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES +from lingua_franca.parse import normalize_decimals def is_fractional_es(input_str, short_scale=True): @@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True): return False -def extract_number_es(text, short_scale=True, ordinals=False): +def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -268,20 +281,25 @@ def es_number(i): return es_number(i) -def extract_numbers_es(text, short_scale=True, ordinals=False): +def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_es, extract_number_es, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_fa.py b/lingua_franca/lang/parse_fa.py index 753ac8eb..9e8874a4 100644 --- a/lingua_franca/lang/parse_fa.py +++ b/lingua_franca/lang/parse_fa.py @@ -19,6 +19,7 @@ _FARSI_ONES, _FARSI_TENS, _FORMAL_VARIANT) from lingua_franca.time import now_local +from lingua_franca.parse import normalize_decimals def _is_number(s): @@ -307,20 +308,25 @@ def extract_datetime_fa(text, anchorDate=None, default_time=None): return (result, " ".join(remainder)) -def extract_numbers_fa(text, short_scale=True, ordinals=False): +def extract_numbers_fa(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) ar = _parse_sentence(text) result = [] @@ -330,7 +336,7 @@ def extract_numbers_fa(text, short_scale=True, ordinals=False): return result -def extract_number_fa(text, ordinals=False): +def extract_number_fa(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -341,11 +347,17 @@ def extract_number_fa(text, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) x = extract_numbers_fa(text, ordinals=ordinals) if (len(x) == 0): return False diff --git a/lingua_franca/lang/parse_fr.py b/lingua_franca/lang/parse_fr.py index 9728653f..fa9d2e18 100644 --- a/lingua_franca/lang/parse_fr.py +++ b/lingua_franca/lang/parse_fr.py @@ -23,6 +23,7 @@ from lingua_franca.lang.common_data_fr import _ARTICLES_FR, _NUMBERS_FR, \ _ORDINAL_ENDINGS_FR from lingua_franca.time import now_local +from lingua_franca.parse import normalize_decimals def extract_duration_fr(text): @@ -369,13 +370,28 @@ def _number_ordinal_fr(words, i): return None -def extract_number_fr(text, short_scale=True, ordinals=False): - """Takes in a string and extracts a number. +def extract_number_fr(text, short_scale=True, ordinals=False, decimal='.'): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: - text (str): the string to extract a number from + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (str): The number extracted or the original text. + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -1067,20 +1083,25 @@ def normalize_fr(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_fr(text, short_scale=True, ordinals=False): +def extract_numbers_fr(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_fr, extract_number_fr, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_it.py b/lingua_franca/lang/parse_it.py index 88c7455d..9d512bb3 100644 --- a/lingua_franca/lang/parse_it.py +++ b/lingua_franca/lang/parse_it.py @@ -28,6 +28,7 @@ pronounce_number_it from lingua_franca.lang.common_data_it import _SHORT_ORDINAL_STRING_IT, \ _ARTICLES_IT, _LONG_ORDINAL_STRING_IT, _STRING_NUM_IT +from lingua_franca.parse import normalize_decimals def is_fractional_it(input_str, short_scale=False): @@ -224,7 +225,7 @@ def _extract_number_long_it(word): return value -def extract_number_it(text, short_scale=False, ordinals=False): +def extract_number_it(text, short_scale=False, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -235,11 +236,17 @@ def extract_number_it(text, short_scale=False, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) text = text.lower() string_num_ordinal_it = {} @@ -1148,20 +1155,25 @@ def get_gender_it(word, context=""): return gender -def extract_numbers_it(text, short_scale=False, ordinals=False): +def extract_numbers_it(text, short_scale=False, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_it, extract_number_it, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_nl.py b/lingua_franca/lang/parse_nl.py index ba197704..e9b0a895 100644 --- a/lingua_franca/lang/parse_nl.py +++ b/lingua_franca/lang/parse_nl.py @@ -26,6 +26,7 @@ _STRING_SHORT_ORDINAL_NL, _SUMS_NL from lingua_franca.time import now_local import re +from lingua_franca.parse import normalize_decimals def _convert_words_to_numbers_nl(text, short_scale=True, ordinals=False): @@ -414,10 +415,10 @@ def _initialize_number_data_nl(short_scale): return multiplies, string_num_ordinal_nl, string_num_scale_nl -def extract_number_nl(text, short_scale=True, ordinals=False): - """Extract a number from a text string - - The function handles pronunciations in long scale and short scale +def extract_number_nl(text, short_scale=True, ordinals=False, decimal='.'): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale https://en.wikipedia.org/wiki/Names_of_large_numbers @@ -425,10 +426,17 @@ def extract_number_nl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_nl(tokenize(text.lower()), short_scale, ordinals).value @@ -1294,19 +1302,24 @@ def is_fractional_nl(input_str, short_scale=True): return False -def extract_numbers_nl(text, short_scale=True, ordinals=False): +def extract_numbers_nl(text, short_scale=True, ordinals=False, decimal='.'): """Takes in a string and extracts a list of numbers. Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_nl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_pl.py b/lingua_franca/lang/parse_pl.py index 84f83bc8..3a6bbe3b 100644 --- a/lingua_franca/lang/parse_pl.py +++ b/lingua_franca/lang/parse_pl.py @@ -24,6 +24,8 @@ _TIME_UNITS_NORMALIZATION, _MONTHS_TO_EN, _DAYS_TO_EN, _ORDINAL_BASE_PL, \ _ALT_ORDINALS_PL from lingua_franca.time import now_local +from lingua_franca.parse import normalize_decimals + import re @@ -576,7 +578,7 @@ def _initialize_number_data(short_scale): return multiplies, _STRING_SHORT_ORDINAL_PL, string_num_scale -def extract_number_pl(text, short_scale=True, ordinals=False): +def extract_number_pl(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -587,11 +589,17 @@ def extract_number_pl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_pl(tokenize(text.lower()), True, ordinals).value @@ -1333,20 +1341,28 @@ def isFractional_pl(input_str, short_scale=True): return False -def extract_numbers_pl(text, short_scale=True, ordinals=False): +def extract_numbers_pl(text, short_scale=True, ordinals=False, decimal='.'): """ - Takes in a string and extracts a list of numbers. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_pl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_pt.py b/lingua_franca/lang/parse_pt.py index 356c1e83..9a611ea2 100644 --- a/lingua_franca/lang/parse_pt.py +++ b/lingua_franca/lang/parse_pt.py @@ -29,6 +29,8 @@ from lingua_franca.internal import resolve_resource_file from lingua_franca.lang.parse_common import Normalizer from lingua_franca.time import now_local +from lingua_franca.parse import normalize_decimals + import json import re import unicodedata @@ -77,16 +79,28 @@ def is_fractional_pt(input_str, short_scale=True): return False -def extract_number_pt(text, short_scale=True, ordinals=False): +def extract_number_pt(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. diff --git a/lingua_franca/lang/parse_sv.py b/lingua_franca/lang/parse_sv.py index 02164111..5a0edb32 100644 --- a/lingua_franca/lang/parse_sv.py +++ b/lingua_franca/lang/parse_sv.py @@ -17,6 +17,7 @@ from dateutil.relativedelta import relativedelta from lingua_franca.time import now_local +from lingua_franca.parse import normalize_decimals from .parse_common import (is_numeric, look_for_fractions, Normalizer, tokenize, Token) @@ -156,15 +157,23 @@ def extract_duration_sv(text): return (td, remainder) if valid else None -def extract_number_sv(text, short_scale=True, ordinals=False): +def extract_number_sv(text, short_scale=True, ordinals=False, decimal='.'): """ This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float): The value of extracted number + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API # compatibility reasons. diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index cbaa3e80..cd0960a6 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import re import json from lingua_franca.util import match_one, fuzzy_match, MatchStrategy from lingua_franca.lang.parse_common import match_yes_or_no, is_numeric @@ -36,6 +37,7 @@ "extract_langcode", "extract_number_spans", "normalize", + "normalize_decimals", "get_gender", "yes_or_no", "is_fractional", @@ -44,6 +46,19 @@ populate_localized_function_dict("parse", langs=get_active_langs()) +@localized_function(run_own_code_on=[FunctionNotLocalizedError]) +def normalize_decimals(text, decimal, lang=""): + """ + Replace 'decimal' with decimal periods so Python can floatify them + """ + regex = r"\b\d+" + decimal + r"{1}\d+\b" + sanitize_decimals = re.compile(regex) + for _, match in enumerate(re.finditer(sanitize_decimals, text)): + text = text.replace(match.group( + 0), match.group(0).replace(decimal, '.')) + return text + + @localized_function(run_own_code_on=[FunctionNotLocalizedError]) def extract_number_spans(utterance, short_scale=True, ordinals=False, fractional_numbers=True, decimal=".", lang=''): @@ -112,8 +127,12 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang='', ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats, or empty list if none found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ spans = extract_number_spans(text, short_scale=short_scale, lang=lang, ordinals=ordinals, decimal=decimal, diff --git a/test/unittests/test_parse_en.py b/test/unittests/test_parse_en.py index 3fb038c7..c6077073 100644 --- a/test/unittests/test_parse_en.py +++ b/test/unittests/test_parse_en.py @@ -290,6 +290,17 @@ def test_combinations(self): class TestExtractNumber(unittest.TestCase): + def test_extract_number_decimal_markers(self): + # Test decimal normalization + self.assertEqual(extract_number("4,4", decimal=','), 4.4) + self.assertEqual(extract_number("we have 3,5 kilometers to go", + decimal=','), 3.5) + self.assertEqual(extract_numbers("this is a seven eight 9,5 test", + decimal=','), + [7.0, 8.0, 9.5]) + self.assertEqual(extract_numbers("this is a 7,0 8.0 9,6 test", + decimal=','), [7.0, 8.0, 9.6]) + def test_extract_number_priority(self): # sanity check self.assertEqual(extract_number("third", ordinals=True), 3)